Improve genre parsing

Fix game list sorting
This commit is contained in:
2025-12-14 22:42:31 +08:00
parent 12e8bd2b92
commit 111de9a8b0
4 changed files with 52 additions and 38 deletions

View File

@@ -5,7 +5,7 @@ edition = "2024"
[dependencies] [dependencies]
image = "0.25.9" image = "0.25.9"
reqwest = { version = "0.12.25", features = ["json"] } reqwest = { version = "0.12.25", features = ["json", "rustls-tls"] }
scraper = "0.25.0" scraper = "0.25.0"
robotstxt = "0.3.0" robotstxt = "0.3.0"

View File

@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
use std::path::{PathBuf}; use std::path::{PathBuf};
use color_eyre::eyre::eyre; use color_eyre::eyre::eyre;
use color_eyre::owo_colors::OwoColorize; use color_eyre::owo_colors::OwoColorize;
use reqwest::{Url}; use reqwest::{StatusCode, Url};
use color_eyre::{Report, Result}; use color_eyre::{Report, Result};
use futures::stream::FuturesUnordered; use futures::stream::FuturesUnordered;
use itertools::Itertools; use itertools::Itertools;
@@ -42,7 +42,7 @@ impl DLSiteCrawler {
Ok(crawler) Ok(crawler)
} }
pub async fn get_game_infos(&self, rj_nums: Vec<String>, locale: &LanguageTag) -> Result<FuturesUnordered<impl Future<Output=Result<DLSiteManiax, Report>>>> pub async fn get_game_infos(&self, rj_nums: Vec<String>, locale: &LanguageTag) -> Result<FuturesUnordered<impl Future<Output=Result<Option<DLSiteManiax>, Report>>>>
{ {
let invalid_nums = rj_nums.iter() let invalid_nums = rj_nums.iter()
.filter(|&n| !is_valid_rj_number(n)) .filter(|&n| !is_valid_rj_number(n))
@@ -77,11 +77,14 @@ impl DLSiteCrawler {
self.save_main_image(&info, &rj_num), self.save_main_image(&info, &rj_num),
self.crawler.get_html(&html_path, Some(&query)) self.crawler.get_html(&html_path, Some(&query))
); );
let (html, _) = html_result?; let (html, status) = html_result?;
let genres = self.get_work_genres(&html, locale.try_into()?).await?; if StatusCode::NOT_FOUND == status {
info.genre_ids = genres; println!("{} is no longer available", rj_num);
return Ok(None);
}
info.genre_ids = self.get_work_genres(&html, locale.try_into()?).await?;
info.rj_num = rj_num; info.rj_num = rj_num;
Ok::<DLSiteManiax, Report>(info) Ok::<Option<DLSiteManiax>, Report>(Some(info))
}) })
} }
Ok(tasks) Ok(tasks)
@@ -118,33 +121,29 @@ impl DLSiteCrawler {
async fn get_work_genres(&self, html: &Html, primary_language: PrimaryLanguage) -> Result<Vec<u16>> { async fn get_work_genres(&self, html: &Html, primary_language: PrimaryLanguage) -> Result<Vec<u16>> {
let selector = Result::unwrap( let selector = Result::unwrap(
Selector::parse( Selector::parse(
"#work_outline > tbody:nth-child(1)" ".main_genre"
) )
); );
let genre_str = match primary_language { let Some(result) = html.select(&selector).next() else {
PrimaryLanguage::EN => "Genre", return Err(eyre!("Genre is empty"));
PrimaryLanguage::JP => "ジャンル"
}; };
let mut genre_ids = Vec::new();
let result = html.select(&selector).next().unwrap(); for elem in result.child_elements() {
let genre_rows = result.child_elements().collect::<Vec<_>>(); let Some(genre_href) = elem.attr("href") else {
let genre_row = genre_rows.iter() return Err(eyre!("Genre url is empty"));
.find(|v| v.first_element_child().unwrap().text().next().unwrap() == genre_str) };
.unwrap(); let genre_url = Url::parse(genre_href)?;
let data = genre_row let Some(path_segments) = genre_url.path_segments() else {
.child_elements().skip(1).next().unwrap() return Err(eyre!("Genre url has no segment: {}", genre_href));
.child_elements().next().unwrap(); };
let genre_urls = data.child_elements() let Some(genre_id) = genre_url.path_segments().unwrap()
.map(|e| e.attr("href").unwrap()) .into_iter()
.map(|s| Url::parse(s).unwrap()) .skip(4)
.collect::<Vec<_>>(); .next() else {
let genre_ids = genre_urls.iter() return Err(eyre!("Invalid url: {}", genre_href));
.map(|x| { };
x.path_segments().unwrap() genre_ids.push(genre_id.parse::<u16>()?);
.skip(4).next().unwrap() }
.parse::<u16>().unwrap()
})
.collect::<Vec<_>>();
Ok(genre_ids) Ok(genre_ids)
} }
} }

View File

@@ -6,10 +6,9 @@ use crossterm::style::{style, Stylize};
use futures::StreamExt; use futures::StreamExt;
use indicatif::{ProgressBar, ProgressStyle}; use indicatif::{ProgressBar, ProgressStyle};
use itertools::Itertools; use itertools::Itertools;
use tokio::sync::Mutex;
use tokio::time::Instant; use tokio::time::Instant;
use crawler::DLSiteCrawler; use crawler::DLSiteCrawler;
use db::{RocksDBFactory}; use db::RocksDBFactory;
use models::config::ApplicationConfig; use models::config::ApplicationConfig;
use models::dlsite::{DLSiteCategory, DLSiteGenre, DLSiteManiax, DLSiteTranslation}; use models::dlsite::{DLSiteCategory, DLSiteGenre, DLSiteManiax, DLSiteTranslation};
use crate::helpers; use crate::helpers;
@@ -48,7 +47,10 @@ impl DLSiteSyncCommand {
pub async fn handle(&self) -> Result<()> { pub async fn handle(&self) -> Result<()> {
let now = Instant::now(); let now = Instant::now();
let app_conf = ApplicationConfig::get_config()?; let app_conf = ApplicationConfig::get_config()?;
let db_factory = RocksDBFactory::default(); let mut db_factory = RocksDBFactory::default();
db_factory.register::<DLSiteManiax>();
db_factory.register::<DLSiteGenre>();
db_factory.register::<DLSiteCategory>();
let crawler = DLSiteCrawler::new()?; let crawler = DLSiteCrawler::new()?;
if self.do_sync_genre { if self.do_sync_genre {
let genre_now = Instant::now(); let genre_now = Instant::now();
@@ -125,9 +127,10 @@ impl DLSiteSyncCommand {
let progress = ProgressBar::new(game_infos.len() as u64) let progress = ProgressBar::new(game_infos.len() as u64)
.with_style(ProgressStyle::default_bar()); .with_style(ProgressStyle::default_bar());
let shared_progress = Mutex::new(progress);
while let Some(info) = game_infos.next().await { while let Some(info) = game_infos.next().await {
let maniax = info?; let Some(maniax) = info? else {
continue;
};
let existing_maniax = existing_game_infos.iter() let existing_maniax = existing_game_infos.iter()
.find(|v| v.rj_num == maniax.rj_num); .find(|v| v.rj_num == maniax.rj_num);
if let Some(existing_maniax) = existing_maniax { if let Some(existing_maniax) = existing_maniax {
@@ -145,7 +148,6 @@ impl DLSiteSyncCommand {
value.folder_path = maniax_folder; value.folder_path = maniax_folder;
modified_maniaxes.push(value); modified_maniaxes.push(value);
} }
let progress = shared_progress.lock().await;
progress.inc(1); progress.inc(1);
} }
db.set_values(&modified_maniaxes)?; db.set_values(&modified_maniaxes)?;

View File

@@ -39,7 +39,20 @@ enum Status {
impl MainView { impl MainView {
pub fn new(mut db_factory: RocksDBFactory) -> color_eyre::Result<Self> { pub fn new(mut db_factory: RocksDBFactory) -> color_eyre::Result<Self> {
let db = db_factory.get_current_context()?; let db = db_factory.get_current_context()?;
let games = db.get_all_values::<DLSiteManiax>()?; let mut games = db.get_all_values::<DLSiteManiax>()?;
games.sort_by(|a, b| {
let left = a.rj_num
.chars().skip(2)
.collect::<String>()
.parse::<u32>()
.unwrap();
let right = b.rj_num
.chars().skip(2)
.collect::<String>()
.parse::<u32>()
.unwrap();
left.cmp(&right)
});
let dl_game_list = GameList::new(games)?; let dl_game_list = GameList::new(games)?;
let view = Self { let view = Self {
state: MainViewState { state: MainViewState {