use std::collections::{HashMap, HashSet};
use std::future::Future;
use std::path::PathBuf;

use color_eyre::eyre::eyre;
use color_eyre::owo_colors::OwoColorize;
use color_eyre::{Report, Result};
use futures::stream::FuturesUnordered;
use itertools::Itertools;
use language_tags::LanguageTag;
use lazy_static::lazy_static;
use reqwest::{StatusCode, Url};
use scraper::{Element, Html, Selector};
use serde_json::Value;

use models::APP_DATA_DIR;
use models::dlsite::{matches_primary_language, PrimaryLanguage, JP_LOCALE};

use super::Crawler;
use models::dlsite::crawler::*;

// TODO: override locale with the user's one
const DLSITE_URL: &str = "https://www.dlsite.com/";
const DLSITE_PRODUCT_API_ENDPOINT: &str = "/maniax/product/info/ajax";
const DLSITE_FS_ENDPOINT: &str = "/maniax/fs/=/api_access/1/";
const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";

lazy_static! {
    pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.join("dlsite").join("img");
}

#[derive(Clone, Debug)]
pub struct DLSiteCrawler {
    crawler: Crawler,
}

impl DLSiteCrawler {
    pub fn new() -> Result<Self> {
        let url = Url::parse(DLSITE_URL)?;
        Ok(Self {
            crawler: Crawler::new("DLSite", url),
        })
    }

    pub async fn get_game_infos<'a>(
        &'a self,
        rj_nums: Vec<String>,
        locale: &'a LanguageTag,
    ) -> Result<FuturesUnordered<impl Future<Output = Result<Option<DLSiteManiax>, Report>> + 'a>> {
        let invalid_nums = rj_nums
            .iter()
            .filter(|&n| !is_valid_rj_number(n))
            .map(|n| n.to_string())
            .collect::<Vec<String>>();
        if !invalid_nums.is_empty() {
            return Err(eyre!("Invalid numbers: {}", invalid_nums.join(", ")));
        }

        let primary_language: PrimaryLanguage = locale.try_into()?;
        let locale_str = match primary_language {
            PrimaryLanguage::EN => "en_US",
            PrimaryLanguage::JP => "ja_JP",
        };
        let query = format!("product_id={}&locale={}", rj_nums.join(","), locale_str);

        let (value, _) = self
            .crawler
            .get_json::<Value>(DLSITE_PRODUCT_API_ENDPOINT, Some(&query))
            .await?;

        // The API answers with an empty array '[]' instead of an object when no
        // work matches; that fails to deserialize into a map, so fall back to empty.
        let value_downcast_result: Result<HashMap<String, DLSiteManiax>, _> =
            serde_json::from_value(value);
        let maniax_result = value_downcast_result.unwrap_or_default();

        Self::verify_all_works_exists(&maniax_result, rj_nums);

        let tasks = FuturesUnordered::new();
        for (rj_num, mut info) in maniax_result {
            tasks.push(async move {
                let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
                let query = format!("locale={locale_str}");
                // Fetch the cover image and the work page concurrently.
                let (_, html_result) = tokio::join!(
                    self.save_main_image(&info, &rj_num),
                    self.crawler.get_html(&html_path, Some(&query))
                );
                let (html, status) = html_result?;
                if status == StatusCode::NOT_FOUND {
                    println!("{} is no longer available", rj_num);
                    return Ok(None);
                }
                info.genre_ids = self.get_work_genres(&html, locale.try_into()?)?;
                info.rj_num = rj_num;
                Ok::<Option<DLSiteManiax>, Report>(Some(info))
            });
        }
        Ok(tasks)
    }

    fn verify_all_works_exists(
        maniax_result: &HashMap<String, DLSiteManiax>,
        rj_nums: Vec<String>,
    ) {
        let keys_hash: HashSet<String> = maniax_result.keys().cloned().collect();
        let nums_hash: HashSet<String> = HashSet::from_iter(rj_nums);
        let nums_diff = nums_hash
            .difference(&keys_hash)
            .map(|n| n.to_string())
            .collect::<Vec<String>>();
        if !nums_diff.is_empty() {
            println!("Restricted/Removed Works: {}", nums_diff.join(", ").red());
        }
    }

    async fn save_main_image(&self, info: &DLSiteManiax, rj_num: &str) -> Result<()> {
        let img_file_name = format!("{rj_num}.jpg");
        let img_save_path = DLSITE_IMG_FOLDER.join(img_file_name);
        if img_save_path.exists() {
            return Ok(());
        }

        // `work_image_url` is protocol-relative, so prepend the scheme before parsing.
        let url_string = format!("https:{}", info.work_image_url);
        let url = Url::parse(&url_string)?;
        let (img, _) = self.crawler.get_img(&url).await?;
        img.save(img_save_path)?;
        Ok(())
    }

    // `_primary_language` is currently unused in the body.
    fn get_work_genres(&self, html: &Html, _primary_language: PrimaryLanguage) -> Result<Vec<u32>> {
        // ".main_genre" is a valid selector literal, so this unwrap cannot fail.
        let selector = Selector::parse(".main_genre").unwrap();
        let Some(result) = html.select(&selector).next() else {
            return Err(eyre!("Genre is empty"));
        };

        let mut genre_ids = Vec::new();
        for elem in result.child_elements() {
            let Some(genre_href) = elem.attr("href") else {
                return Err(eyre!("Genre url is empty"));
            };
            let genre_url = Url::parse(genre_href)?;
            let Some(mut path_segments) = genre_url.path_segments() else {
                return Err(eyre!("Genre url has no segment: {}", genre_href));
            };
            // The genre id is presumably the fifth path segment (index 4).
            let Some(genre_id) = path_segments.nth(4) else {
                return Err(eyre!("Invalid url: {}", genre_href));
            };
            // NOTE: the integer width was lost in extraction; u32 is assumed here.
            genre_ids.push(genre_id.parse::<u32>()?);
        }
        Ok(genre_ids)
    }
}

impl DLSiteCrawler {
    pub async fn get_all_genres(&self, locale: &LanguageTag) -> Result<Vec<DLSiteGenre>> {
        let primary_language: PrimaryLanguage = locale.try_into()?;
        let locale_str = match primary_language {
            PrimaryLanguage::EN => "en_US",
            PrimaryLanguage::JP => "ja_JP",
        };
        let query = format!("locale={}", locale_str);
        // NOTE: the concrete response/genre type names were lost in extraction;
        // `DLSiteFs` and `DLSiteGenre` (from models::dlsite::crawler) are assumed.
        let (json, _) = self
            .crawler
            .get_json::<DLSiteFs>(DLSITE_FS_ENDPOINT, Some(&query))
            .await?;

        let values = if matches_primary_language(locale, &JP_LOCALE) {
            serde_json::from_value::<Vec<DLSiteGenre>>(json.genre_all)?
        } else {
            // For non-JP locales the API returns an object keyed by genre id
            // instead of an array, so deserialize into a map and keep the values.
            serde_json::from_value::<HashMap<String, DLSiteGenre>>(json.genre_all)?
                .into_iter()
                .map(|(_, v)| v)
                .collect_vec()
        };

        let mut categories = Vec::new();
        for (i, mut category) in values.into_iter().enumerate() {
            category.id = i as u8;
            categories.push(category);
        }
        Ok(categories)
    }
}

pub fn is_valid_rj_number(rj_num: &str) -> bool {
    let len = rj_num.len();
    if len != 8 && len != 10 {
        return false;
    }
    if !rj_num.starts_with("RJ") {
        return false;
    }
    // `is_ascii_digit` (rather than `is_numeric`) rejects non-ASCII digits
    // such as full-width characters.
    if !rj_num.chars().skip(2).all(|c| c.is_ascii_digit()) {
        return false;
    }
    true
}
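
// A minimal, ignored-by-default sketch of how the `FuturesUnordered` returned
// by `get_game_infos` might be drained, not part of the crawler itself. It
// assumes tokio's `macros` feature is enabled, that an "en" tag converts into
// `PrimaryLanguage`, and uses "RJ123456" purely as a placeholder id.
#[cfg(test)]
mod usage_sketch {
    use super::*;
    use futures::StreamExt;

    #[tokio::test]
    #[ignore = "performs live requests against dlsite.com"]
    async fn drains_game_infos() -> Result<()> {
        let crawler = DLSiteCrawler::new()?;
        let locale: LanguageTag = "en".parse()?;
        let mut tasks = crawler
            .get_game_infos(vec!["RJ123456".to_string()], &locale)
            .await?;
        while let Some(result) = tasks.next().await {
            // `Ok(None)` marks works that answered 404 and were skipped.
            if let Some(info) = result? {
                println!("fetched {}", info.rj_num);
            }
        }
        Ok(())
    }
}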
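
// A few offline cases pinning down `is_valid_rj_number`'s rules; a sketch of
// the obvious boundaries, not an exhaustive suite.
#[cfg(test)]
mod rj_number_tests {
    use super::*;

    #[test]
    fn accepts_8_and_10_character_codes() {
        assert!(is_valid_rj_number("RJ123456"));
        assert!(is_valid_rj_number("RJ01234567"));
    }

    #[test]
    fn rejects_bad_prefix_length_and_non_digits() {
        assert!(!is_valid_rj_number("VJ123456")); // wrong prefix
        assert!(!is_valid_rj_number("RJ123456789")); // wrong length (11)
        assert!(!is_valid_rj_number("RJ12E456")); // non-digit tail
    }
}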