Add DLSite genre scraping

2025-10-18 14:54:00 +08:00
parent 27bee0cfde
commit eacf897f8c
3 changed files with 50 additions and 12 deletions

View File

@@ -1,6 +1,5 @@
 use crate::app;
 use crate::config::types::ApplicationConfig;
-use crate::constants::APP_CONIFG_FILE_PATH;
 use clap::{command, Args, Command, Parser, Subcommand};
 use color_eyre::Result;
 use ratatui::crossterm;
@@ -168,7 +167,7 @@ impl SyncDLSiteCommand {
 impl FolderAddCommand {
     pub async fn handle(&self) -> Result<()> {
-        let mut config = ApplicationConfig::from_file(&APP_CONIFG_FILE_PATH.to_path_buf())?;
+        let mut config = ApplicationConfig::get_config()?;
         let path = PathBuf::from(&self.path);
         let abs_path = path.canonicalize()?;
         if !abs_path.is_dir() {

View File

@@ -15,13 +15,13 @@ impl ApplicationConfig {
         }
     }

-    pub fn from_file(path: &PathBuf) -> Result<Self> {
+    fn from_file(path: &PathBuf) -> Result<Self> {
         let reader = std::fs::File::open(path)?;
         let result = serde_json::from_reader(reader)?;
         Ok(result)
     }

-    pub fn new() -> Self {
+    fn new() -> Self {
         let conf = Self {
             basic_config: BasicConfig {
                 db_path: APP_DATA_DIR
View File

@@ -4,12 +4,14 @@ use color_eyre::eyre::eyre;
 use reqwest::Url;
 use color_eyre::Result;
 use lazy_static::lazy_static;
+use scraper::{Html, Selector};
 use serde::{Deserialize, Serialize};
 use crate::constants::APP_DATA_DIR;
 use crate::crawler::Crawler;

 const DLSITE_URL: &str = "https://www.dlsite.com/";
 const DLSITE_API_ENDPOINT: &str = "/maniax/product/info/ajax";
+const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";

 lazy_static! {
     pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
 }
@@ -25,7 +27,9 @@ pub struct DLSiteManiax {
     #[serde(rename = "work_image")]
     work_image_url: String,
     #[serde(rename = "dl_count")]
-    pub sells_count: u32
+    pub sells_count: u32,
+    #[serde(skip)]
+    pub genre_ids: Vec<u16>
 }

 impl DLSiteCrawler {
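
The #[serde(skip)] attribute matters here because genre_ids is not present in the product-info JSON: serde fills it with the type's default (an empty Vec) and the HTML scrape populates it afterwards. A minimal sketch of that behaviour, using a hypothetical mini-struct rather than the real DLSiteManiax:

    use serde::Deserialize;

    // Hypothetical struct illustrating the same attribute.
    #[derive(Deserialize, Debug)]
    struct Work {
        work_name: String,
        #[serde(skip)]
        genre_ids: Vec<u16>, // not read from JSON; defaults to an empty Vec
    }

    fn main() {
        let w: Work = serde_json::from_str(r#"{"work_name": "demo"}"#).unwrap();
        assert!(w.genre_ids.is_empty());
        println!("{w:?}");
    }
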
@@ -51,18 +55,25 @@ impl DLSiteCrawler {
     pub async fn get_game_info(&self, rj_num: &str) -> Result<DLSiteManiax> {
         if !Self::is_valid_number(rj_num) {
-            return Err(eyre!("Invalid number: {}", rj_num));
+            return Err(eyre!("Invalid number: {rj_num}"));
         }
         let mut api_url = self.crawler.base_url.clone();
         api_url.set_path(DLSITE_API_ENDPOINT);
-        api_url.set_query(Some(&format!("product_id={}", rj_num)));
-        let res = self.crawler.client.get(api_url).send().await?;
-        let maniax_result = match res.json::<HashMap<String, DLSiteManiax>>().await {
+        api_url.set_query(Some(&format!("product_id={rj_num}")));
+        let api_res = self.crawler.client.get(api_url).send().await?;
+        let maniax_result = match api_res.json::<HashMap<String, DLSiteManiax>>().await {
             Ok(maniax_result) => maniax_result,
-            Err(_) => return Err(eyre!("Maniax {} is restricted/removed", rj_num)),
+            Err(_) => return Err(eyre!("Maniax {rj_num} is restricted/removed")),
         };
-        let maniax_info = maniax_result.iter().next().unwrap().1.clone();
+        let mut maniax_info = maniax_result.iter().next().unwrap().1.clone();
         self.save_main_image(&maniax_info, rj_num).await?;
+        let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
+        let (html, _) = self.crawler.get_html(&html_path).await?;
+        let genres = self.get_genres(&html)?;
+        maniax_info.genre_ids = genres;
         Ok(maniax_info)
     }
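
The new lines call Crawler::get_html, which this diff uses but does not define. A guess at its shape, consistent with the (html, _) destructuring above: fetch the path relative to base_url, parse the body with scraper, and return the document plus the final URL after redirects (the second tuple element is an assumption):

    // Assumed sketch of Crawler::get_html; not shown in this diff.
    pub async fn get_html(&self, path: &str) -> Result<(Html, Url)> {
        let mut url = self.base_url.clone();
        url.set_path(path);
        let res = self.client.get(url).send().await?;
        let final_url = res.url().clone();
        let body = res.text().await?;
        Ok((Html::parse_document(&body), final_url))
    }
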
@@ -72,7 +83,35 @@ impl DLSiteCrawler {
         let img_res = self.crawler.client.get(url).send().await?;
         let img_bytes = img_res.bytes().await?;
         let img = image::load_from_memory(&img_bytes)?;
-        img.save(DLSITE_IMG_FOLDER.clone().join(format!("{}.jpg", rj_num)).as_path())?;
+        img.save(DLSITE_IMG_FOLDER.clone().join(format!("{rj_num}.jpg")).as_path())?;
         Ok(())
     }
+
+    fn get_genres(&self, html: &Html) -> Result<Vec<u16>> {
+        // The work-outline table holds one row per metadata field.
+        let selector = Selector::parse("#work_outline > tbody:nth-child(1)").unwrap();
+        let tbody = html.select(&selector).next().unwrap();
+        // Find the row whose header cell reads ジャンル ("genre").
+        let genre_row = tbody
+            .child_elements()
+            .find(|row| row.child_elements().any(|cell| cell.inner_html() == "ジャンル"))
+            .unwrap();
+        // tr > th + td > div > a: the genre links sit two levels under the data cell.
+        let data = genre_row
+            .child_elements().nth(1).unwrap()
+            .child_elements().next().unwrap();
+        let genre_urls = data.child_elements()
+            .map(|a| a.attr("href").unwrap())
+            .map(|href| Url::parse(href).unwrap())
+            .collect::<Vec<_>>();
+        // The numeric genre id is the fifth path segment of each link.
+        let genre_ids = genre_urls.iter()
+            .map(|url| {
+                url.path_segments().unwrap()
+                    .nth(4).unwrap()
+                    .parse::<u16>().unwrap()
+            })
+            .collect::<Vec<_>>();
+        Ok(genre_ids)
+    }
}
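
The selector walk in get_genres can be exercised without hitting DLSite. A self-contained sketch (assuming a scraper version with ElementRef::child_elements and attr, as the diff itself uses, plus the url crate); the markup fragment is a hand-written stand-in shaped like the work-outline table, not DLSite's real page:

    use scraper::{Html, Selector};
    use url::Url;

    // Hypothetical helper mirroring the selector walk in get_genres.
    fn genre_ids(html: &Html) -> Vec<u16> {
        let selector = Selector::parse("#work_outline > tbody:nth-child(1)").unwrap();
        let tbody = html.select(&selector).next().expect("outline table missing");
        // Locate the row whose header cell reads ジャンル ("genre").
        let genre_row = tbody
            .child_elements()
            .find(|row| row.child_elements().any(|cell| cell.inner_html() == "ジャンル"))
            .expect("genre row missing");
        // tr > th + td > div > a: descend to the links under the data cell.
        let links = genre_row
            .child_elements().nth(1).expect("data cell missing")
            .child_elements().next().expect("link wrapper missing");
        links
            .child_elements()
            .filter_map(|a| a.attr("href"))
            .filter_map(|href| Url::parse(href).ok())
            .filter_map(|url| url.path_segments()?.nth(4)?.parse::<u16>().ok())
            .collect()
    }

    fn main() {
        let doc = Html::parse_document(
            r#"<table id="work_outline"><tbody>
                 <tr><th>ジャンル</th><td><div>
                   <a href="https://www.dlsite.com/maniax/fsr/=/genre/156/">A</a>
                   <a href="https://www.dlsite.com/maniax/fsr/=/genre/497/">B</a>
                 </div></td></tr>
               </tbody></table>"#,
        );
        assert_eq!(genre_ids(&doc), vec![156, 497]);
    }

Unlike the committed code, this sketch uses filter_map so a row with unexpected markup is skipped rather than panicking; the committed unwrap-based version fails loudly instead, which is a reasonable choice while the page structure is assumed stable.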