Add DLSite genre scraping

2025-10-18 14:54:00 +08:00
parent 27bee0cfde
commit eacf897f8c
3 changed files with 50 additions and 12 deletions

View File

@@ -1,6 +1,5 @@
 use crate::app;
 use crate::config::types::ApplicationConfig;
-use crate::constants::APP_CONIFG_FILE_PATH;
 use clap::{command, Args, Command, Parser, Subcommand};
 use color_eyre::Result;
 use ratatui::crossterm;
@@ -168,7 +167,7 @@ impl SyncDLSiteCommand {
 impl FolderAddCommand {
     pub async fn handle(&self) -> Result<()> {
-        let mut config = ApplicationConfig::from_file(&APP_CONIFG_FILE_PATH.to_path_buf())?;
+        let mut config = ApplicationConfig::get_config()?;
         let path = PathBuf::from(&self.path);
         let abs_path = path.canonicalize()?;
         if !abs_path.is_dir() {

View File

@@ -15,13 +15,13 @@ impl ApplicationConfig {
         }
     }

-    pub fn from_file(path: &PathBuf) -> Result<Self> {
+    fn from_file(path: &PathBuf) -> Result<Self> {
         let reader = std::fs::File::open(path)?;
         let result = serde_json::from_reader(reader)?;
         Ok(result)
     }

-    pub fn new() -> Self {
+    fn new() -> Self {
         let conf = Self {
             basic_config: BasicConfig {
                 db_path: APP_DATA_DIR
View File

@@ -4,12 +4,14 @@ use color_eyre::eyre::eyre;
 use reqwest::Url;
 use color_eyre::Result;
 use lazy_static::lazy_static;
+use scraper::{Html, Selector};
 use serde::{Deserialize, Serialize};
 use crate::constants::APP_DATA_DIR;
 use crate::crawler::Crawler;

 const DLSITE_URL: &str = "https://www.dlsite.com/";
 const DLSITE_API_ENDPOINT: &str = "/maniax/product/info/ajax";
+const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";

 lazy_static! {
     pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
 }
@@ -25,7 +27,9 @@ pub struct DLSiteManiax {
     #[serde(rename = "work_image")]
     work_image_url: String,
     #[serde(rename = "dl_count")]
-    pub sells_count: u32
+    pub sells_count: u32,
+    #[serde(skip)]
+    pub genre_ids: Vec<u16>
 }

 impl DLSiteCrawler {
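
The #[serde(skip)] attribute matters here because genre_ids is not present in the product-info JSON: serde fills it with the type's default (an empty Vec) and the HTML scrape populates it afterwards. A minimal sketch of that behaviour, using a hypothetical mini-struct rather than the real DLSiteManiax:

    use serde::Deserialize;

    // Hypothetical struct illustrating the same attribute.
    #[derive(Deserialize, Debug)]
    struct Work {
        work_name: String,
        #[serde(skip)]
        genre_ids: Vec<u16>, // not read from JSON; defaults to an empty Vec
    }

    fn main() {
        let w: Work = serde_json::from_str(r#"{"work_name": "demo"}"#).unwrap();
        assert!(w.genre_ids.is_empty());
        println!("{w:?}");
    }
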
@@ -51,18 +55,25 @@ impl DLSiteCrawler {
     pub async fn get_game_info(&self, rj_num: &str) -> Result<DLSiteManiax> {
         if !Self::is_valid_number(rj_num) {
-            return Err(eyre!("Invalid number: {}", rj_num));
+            return Err(eyre!("Invalid number: {rj_num}"));
         }
         let mut api_url = self.crawler.base_url.clone();
         api_url.set_path(DLSITE_API_ENDPOINT);
-        api_url.set_query(Some(&format!("product_id={}", rj_num)));
-        let res = self.crawler.client.get(api_url).send().await?;
-        let maniax_result = match res.json::<HashMap<String, DLSiteManiax>>().await {
+        api_url.set_query(Some(&format!("product_id={rj_num}")));
+        let api_res = self.crawler.client.get(api_url).send().await?;
+        let maniax_result = match api_res.json::<HashMap<String, DLSiteManiax>>().await {
             Ok(maniax_result) => maniax_result,
-            Err(_) => return Err(eyre!("Maniax {} is restricted/removed", rj_num)),
+            Err(_) => return Err(eyre!("Maniax {rj_num} is restricted/removed")),
         };
-        let maniax_info = maniax_result.iter().next().unwrap().1.clone();
+        let mut maniax_info = maniax_result.iter().next().unwrap().1.clone();
         self.save_main_image(&maniax_info, rj_num).await?;
+        let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
+        let (html, _) = self.crawler.get_html(&html_path).await?;
+        let genres = self.get_genres(&html)?;
+        maniax_info.genre_ids = genres;
         Ok(maniax_info)
     }
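
The new lines call Crawler::get_html, which this diff uses but does not define. A guess at its shape, consistent with the (html, _) destructuring above: fetch the path relative to base_url, parse the body with scraper, and return the document plus the final URL after redirects (the second tuple element is an assumption):

    // Assumed sketch of Crawler::get_html; not shown in this diff.
    pub async fn get_html(&self, path: &str) -> Result<(Html, Url)> {
        let mut url = self.base_url.clone();
        url.set_path(path);
        let res = self.client.get(url).send().await?;
        let final_url = res.url().clone();
        let body = res.text().await?;
        Ok((Html::parse_document(&body), final_url))
    }
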
@@ -72,7 +83,35 @@ impl DLSiteCrawler {
         let img_res = self.crawler.client.get(url).send().await?;
         let img_bytes = img_res.bytes().await?;
         let img = image::load_from_memory(&img_bytes)?;
-        img.save(DLSITE_IMG_FOLDER.clone().join(format!("{}.jpg", rj_num)).as_path())?;
+        img.save(DLSITE_IMG_FOLDER.clone().join(format!("{rj_num}.jpg")).as_path())?;
         Ok(())
     }
+
+    fn get_genres(&self, html: &Html) -> Result<Vec<u16>> {
+        // The work-outline table holds one row per metadata field.
+        let selector = Selector::parse("#work_outline > tbody:nth-child(1)").unwrap();
+        let tbody = html.select(&selector).next().unwrap();
+        // Find the row whose header cell reads ジャンル ("genre").
+        let genre_row = tbody
+            .child_elements()
+            .find(|row| row.child_elements().any(|cell| cell.inner_html() == "ジャンル"))
+            .unwrap();
+        // tr > th + td > div > a: the genre links sit two levels under the data cell.
+        let data = genre_row
+            .child_elements().nth(1).unwrap()
+            .child_elements().next().unwrap();
+        let genre_urls = data.child_elements()
+            .map(|a| a.attr("href").unwrap())
+            .map(|href| Url::parse(href).unwrap())
+            .collect::<Vec<_>>();
+        // The numeric genre id is the fifth path segment of each link.
+        let genre_ids = genre_urls.iter()
+            .map(|url| {
+                url.path_segments().unwrap()
+                    .nth(4).unwrap()
+                    .parse::<u16>().unwrap()
+            })
+            .collect::<Vec<_>>();
+        Ok(genre_ids)
+    }
}
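
The selector walk in get_genres can be exercised without hitting DLSite. A self-contained sketch (assuming a scraper version with ElementRef::child_elements and attr, as the diff itself uses, plus the url crate); the markup fragment is a hand-written stand-in shaped like the work-outline table, not DLSite's real page:

    use scraper::{Html, Selector};
    use url::Url;

    // Hypothetical helper mirroring the selector walk in get_genres.
    fn genre_ids(html: &Html) -> Vec<u16> {
        let selector = Selector::parse("#work_outline > tbody:nth-child(1)").unwrap();
        let tbody = html.select(&selector).next().expect("outline table missing");
        // Locate the row whose header cell reads ジャンル ("genre").
        let genre_row = tbody
            .child_elements()
            .find(|row| row.child_elements().any(|cell| cell.inner_html() == "ジャンル"))
            .expect("genre row missing");
        // tr > th + td > div > a: descend to the links under the data cell.
        let links = genre_row
            .child_elements().nth(1).expect("data cell missing")
            .child_elements().next().expect("link wrapper missing");
        links
            .child_elements()
            .filter_map(|a| a.attr("href"))
            .filter_map(|href| Url::parse(href).ok())
            .filter_map(|url| url.path_segments()?.nth(4)?.parse::<u16>().ok())
            .collect()
    }

    fn main() {
        let doc = Html::parse_document(
            r#"<table id="work_outline"><tbody>
                 <tr><th>ジャンル</th><td><div>
                   <a href="https://www.dlsite.com/maniax/fsr/=/genre/156/">A</a>
                   <a href="https://www.dlsite.com/maniax/fsr/=/genre/497/">B</a>
                 </div></td></tr>
               </tbody></table>"#,
        );
        assert_eq!(genre_ids(&doc), vec![156, 497]);
    }

Unlike the committed code, this sketch uses filter_map so a row with unexpected markup is skipped rather than panicking; the committed unwrap-based version fails loudly instead, which is a reasonable choice while the page structure is assumed stable.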