Refactor structure

This commit is contained in:
2025-12-14 21:34:06 +08:00
parent 952f00261b
commit 27cb9fa32f
37 changed files with 712 additions and 486 deletions

194
crawler/src/dlsite.rs Executable file
View File

@@ -0,0 +1,194 @@
use std::collections::{HashMap, HashSet};
use std::path::{PathBuf};
use color_eyre::eyre::eyre;
use color_eyre::owo_colors::OwoColorize;
use reqwest::{Url};
use color_eyre::{Report, Result};
use futures::stream::FuturesUnordered;
use itertools::Itertools;
use language_tags::LanguageTag;
use lazy_static::lazy_static;
use scraper::{Element, Html, Selector};
use serde_json::Value;
use models::APP_DATA_DIR;
use models::dlsite::{matches_primary_language, PrimaryLanguage, JP_LOCALE};
use super::Crawler;
use models::dlsite::crawler::*;
// TODO: override the locale with the user's own.
/// Base URL of DLsite; parsed in `DLSiteCrawler::new` and handed to the inner `Crawler`.
const DLSITE_URL: &str = "https://www.dlsite.com/";
/// Path of the product-info JSON endpoint, queried by `get_game_infos` with
/// `product_id=<ids>&locale=<locale>`.
const DLSITE_PRODUCT_API_ENDPOINT: &str = "/maniax/product/info/ajax";
/// Path of the JSON endpoint queried by `get_all_genres`; its response is read as a
/// `DLSiteFilter`, from which the `genre_all` field is taken.
const DLSITE_FS_ENDPOINT: &str = "/maniax/fs/=/api_access/1/";
/// Path prefix of a work's HTML page; the RJ number is appended directly to it.
const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";
lazy_static! {
/// Folder that `save_main_image` writes `<rj_num>.jpg` files into:
/// `APP_DATA_DIR/dlsite/img`. Computed lazily on first access.
pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
}
/// DLsite-specific crawler built on top of the generic `Crawler`.
///
/// All requests issued by its methods go through the wrapped `crawler`
/// (`get_json`, `get_html`, `get_img`).
#[derive(Clone, Debug)]
pub struct DLSiteCrawler {
// Inner crawler, constructed in `new` from the name "DLSite" and `DLSITE_URL`.
crawler: Crawler
}
impl DLSiteCrawler {
/// Creates a DLsite crawler.
///
/// The inner `Crawler` is constructed from the name `"DLSite"` and the parsed
/// `DLSITE_URL`.
///
/// # Errors
/// Returns an error if `DLSITE_URL` cannot be parsed as a URL.
pub fn new() -> Result<Self> {
    let base_url = Url::parse(DLSITE_URL)?;
    Ok(Self {
        crawler: Crawler::new("DLSite", base_url),
    })
}
pub async fn get_game_infos(&self, rj_nums: Vec<String>, locale: &LanguageTag) -> Result<FuturesUnordered<impl Future<Output=Result<DLSiteManiax, Report>>>>
{
let invalid_nums = rj_nums.iter()
.filter(|&n| !is_valid_rj_number(n))
.map(|n| n.to_string())
.collect::<Vec<String>>();
if !invalid_nums.is_empty() {
return Err(
eyre!("Invalid numbers: {}", invalid_nums.join(", "))
);
}
let primary_language: PrimaryLanguage = locale.try_into()?;
let locale_str = match primary_language {
PrimaryLanguage::EN => "en_US",
PrimaryLanguage::JP => "ja_JP",
};
let query = &format!("product_id={}&locale={}", rj_nums.join(","), locale_str);
let (value, _) = self.crawler
.get_json::<Value>(DLSITE_PRODUCT_API_ENDPOINT, Some(query))
.await?;
// try to catch '[]' empty result from the api
let value_downcast_result: Result<HashMap<String, DLSiteManiax>, _> = serde_json::from_value(value);
let maniax_result = value_downcast_result.unwrap_or(HashMap::new());
Self::verify_all_works_exists(&maniax_result, rj_nums);
let tasks = FuturesUnordered::new();
for (rj_num, mut info) in maniax_result {
tasks.push(async move {
let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
let query = format!("locale={locale_str}");
let (_, html_result) = tokio::join!(
self.save_main_image(&info, &rj_num),
self.crawler.get_html(&html_path, Some(&query))
);
let (html, _) = html_result?;
let genres = self.get_work_genres(&html, locale.try_into()?).await?;
info.genre_ids = genres;
info.rj_num = rj_num;
Ok::<DLSiteManiax, Report>(info)
})
}
Ok(tasks)
}
/// Reports requested works that are missing from the API response.
///
/// Every entry of `rj_nums` that is not a key of `maniax_result` is printed once on
/// stdout (in red), in the order it was requested. Nothing is printed when all works
/// are present. This function only reports; it never fails.
fn verify_all_works_exists(maniax_result: &HashMap<String, DLSiteManiax>, rj_nums: Vec<String>) {
    // `seen` removes duplicate requests while keeping the caller's ordering, so the
    // message is stable between runs.
    let mut seen: HashSet<&str> = HashSet::new();
    let missing = rj_nums.iter()
        .map(String::as_str)
        .filter(|num| !maniax_result.contains_key(*num))
        .filter(|num| seen.insert(*num))
        .collect::<Vec<&str>>();
    if !missing.is_empty() {
        println!("Restricted/Removed Works: {}", missing.join(", ").red());
    }
}
/// Downloads the main image of a work into `DLSITE_IMG_FOLDER` as `<rj_num>.jpg`.
///
/// Does nothing if that file already exists. The image folder is created on demand
/// so that the first download on a fresh installation does not fail.
///
/// NOTE(review): the URL is built as `https:` + `info.work_image_url`, which assumes
/// the API returns a protocol-relative URL (`//host/path`) — confirm against the API.
///
/// # Errors
/// Returns an error if the folder cannot be created, the URL is invalid, the download
/// fails, or the image cannot be written.
async fn save_main_image(&self, info: &DLSiteManiax, rj_num: &str) -> Result<()> {
    let img_save_path = DLSITE_IMG_FOLDER.join(format!("{rj_num}.jpg"));
    if img_save_path.exists() {
        return Ok(());
    }
    // The caller ignores this function's error, so a missing folder would otherwise
    // silently prevent every image from being cached.
    std::fs::create_dir_all(DLSITE_IMG_FOLDER.as_path())?;

    let url = Url::parse(&format!("https:{}", info.work_image_url))?;
    let (img, _) = self.crawler.get_img(&url).await?;
    img.save(img_save_path)?;
    Ok(())
}
async fn get_work_genres(&self, html: &Html, primary_language: PrimaryLanguage) -> Result<Vec<u16>> {
let selector = Result::unwrap(
Selector::parse(
"#work_outline > tbody:nth-child(1)"
)
);
let genre_str = match primary_language {
PrimaryLanguage::EN => "Genre",
PrimaryLanguage::JP => "ジャンル"
};
let result = html.select(&selector).next().unwrap();
let genre_rows = result.child_elements().collect::<Vec<_>>();
let genre_row = genre_rows.iter()
.find(|v| v.first_element_child().unwrap().text().next().unwrap() == genre_str)
.unwrap();
let data = genre_row
.child_elements().skip(1).next().unwrap()
.child_elements().next().unwrap();
let genre_urls = data.child_elements()
.map(|e| e.attr("href").unwrap())
.map(|s| Url::parse(s).unwrap())
.collect::<Vec<_>>();
let genre_ids = genre_urls.iter()
.map(|x| {
x.path_segments().unwrap()
.skip(4).next().unwrap()
.parse::<u16>().unwrap()
})
.collect::<Vec<_>>();
Ok(genre_ids)
}
}
impl DLSiteCrawler {
pub async fn get_all_genres(&self, locale: &LanguageTag) -> Result<Vec<DLSiteGenreCategory>> {
let primary_language: PrimaryLanguage = locale.try_into()?;
let locale_str = match primary_language {
PrimaryLanguage::EN => "en_US",
PrimaryLanguage::JP => "ja_JP",
};
let query = format!("locale={}", locale_str);
let (json, _) = self.crawler.get_json::<DLSiteFilter>(DLSITE_FS_ENDPOINT, Some(&query)).await?;
let values =
if matches_primary_language(&locale, &JP_LOCALE) {
serde_json::from_value::<Vec<DLSiteGenreCategory>>(json.genre_all)?
} else {
// IDK why they are using different object type bruh
serde_json::from_value::<HashMap<u16, DLSiteGenreCategory>>(json.genre_all)?
.into_iter().map(|(_, v)| v)
.collect_vec()
};
let mut categories = Vec::new();
for (i, value) in values.into_iter().enumerate() {
let mut category = value.clone();
category.id = i as u8;
categories.push(category);
}
Ok(categories)
}
}
/// Returns `true` if `rj_num` is a well-formed RJ number.
///
/// A valid number is the literal prefix `RJ` followed by exactly 6 or exactly 8 ASCII
/// digits (`RJ123456` or `RJ01234567`). Only ASCII digits are accepted: other Unicode
/// numeric characters (full-width digits, Arabic-Indic digits, fractions, ...) are
/// rejected, because the value is later placed verbatim into a request query string.
pub fn is_valid_rj_number(rj_num: &str) -> bool {
    match rj_num.strip_prefix("RJ") {
        // Once every byte is known to be an ASCII digit, byte length equals digit count.
        Some(digits) => {
            matches!(digits.len(), 6 | 8) && digits.bytes().all(|byte| byte.is_ascii_digit())
        }
        None => false,
    }
}