Improve genre parsing

Fix game list sorting
This commit is contained in:
2025-12-14 22:42:31 +08:00
parent 12e8bd2b92
commit 111de9a8b0
4 changed files with 52 additions and 38 deletions

View File

@@ -5,7 +5,7 @@ edition = "2024"
[dependencies]
image = "0.25.9"
reqwest = { version = "0.12.25", features = ["json"] }
reqwest = { version = "0.12.25", features = ["json", "rustls-tls"] }
scraper = "0.25.0"
robotstxt = "0.3.0"

View File

@@ -3,7 +3,7 @@ use std::collections::{HashMap, HashSet};
use std::path::{PathBuf};
use color_eyre::eyre::eyre;
use color_eyre::owo_colors::OwoColorize;
use reqwest::{Url};
use reqwest::{StatusCode, Url};
use color_eyre::{Report, Result};
use futures::stream::FuturesUnordered;
use itertools::Itertools;
@@ -42,7 +42,7 @@ impl DLSiteCrawler {
Ok(crawler)
}
pub async fn get_game_infos(&self, rj_nums: Vec<String>, locale: &LanguageTag) -> Result<FuturesUnordered<impl Future<Output=Result<DLSiteManiax, Report>>>>
pub async fn get_game_infos(&self, rj_nums: Vec<String>, locale: &LanguageTag) -> Result<FuturesUnordered<impl Future<Output=Result<Option<DLSiteManiax>, Report>>>>
{
let invalid_nums = rj_nums.iter()
.filter(|&n| !is_valid_rj_number(n))
@@ -77,11 +77,14 @@ impl DLSiteCrawler {
self.save_main_image(&info, &rj_num),
self.crawler.get_html(&html_path, Some(&query))
);
let (html, _) = html_result?;
let genres = self.get_work_genres(&html, locale.try_into()?).await?;
info.genre_ids = genres;
let (html, status) = html_result?;
if StatusCode::NOT_FOUND == status {
println!("{} is no longer available", rj_num);
return Ok(None);
}
info.genre_ids = self.get_work_genres(&html, locale.try_into()?).await?;
info.rj_num = rj_num;
Ok::<DLSiteManiax, Report>(info)
Ok::<Option<DLSiteManiax>, Report>(Some(info))
})
}
Ok(tasks)
@@ -118,33 +121,29 @@ impl DLSiteCrawler {
// NOTE(review): this span looks like a rendered diff hunk whose +/- markers were
// lost, so lines from a removed and an added implementation appear interleaved
// (two selector strings, two `result` bindings, two ways of building `genre_ids`).
// Confirm against the real file before relying on the comments below.
/// Collects the numeric genre ids linked from a work's HTML page.
///
/// * `html` - parsed document that is queried with a CSS selector.
/// * `primary_language` - only read by the `genre_str` match, which picks the
///   row label ("Genre" / "ジャンル"); the loop-based lines never use it.
///
/// Returns the ids as `u16`, each parsed from the path segment at index 4 of a
/// child element's `href`.
///
/// # Errors
///
/// The loop-based lines return an error when nothing matches `.main_genre`,
/// when a child element has no `href`, when the `href` is not a parseable URL,
/// when the URL has no path segments or fewer than five of them, or when the
/// segment is not a valid `u16`. The `unwrap`-based lines panic instead.
async fn get_work_genres(&self, html: &Html, primary_language: PrimaryLanguage) -> Result<Vec<u16>> {
// Both selector strings are literals; the outer `Result::unwrap` panics only if parsing one fails.
let selector = Result::unwrap(
Selector::parse(
// Presumably the previous selector: first `tbody` child under `#work_outline`.
"#work_outline > tbody:nth-child(1)"
// Presumably the current selector: elements carrying the `main_genre` class.
".main_genre"
)
);
// Label text used to locate the genre row, one per supported language.
let genre_str = match primary_language {
PrimaryLanguage::EN => "Genre",
PrimaryLanguage::JP => "ジャンル"
// Let-else form: take the first match or return an error instead of panicking.
let Some(result) = html.select(&selector).next() else {
return Err(eyre!("Genre is empty"));
};
// Unwrap-based form: panics when the selector matches nothing.
let result = html.select(&selector).next().unwrap();
let genre_rows = result.child_elements().collect::<Vec<_>>();
// Find the row whose first child element's first text node equals the label.
let genre_row = genre_rows.iter()
.find(|v| v.first_element_child().unwrap().text().next().unwrap() == genre_str)
.unwrap();
// Second child element of that row, then its first child element.
let data = genre_row
.child_elements().skip(1).next().unwrap()
.child_elements().next().unwrap();
// Every child of `data` must carry an `href` holding a parseable URL.
let genre_urls = data.child_elements()
.map(|e| e.attr("href").unwrap())
.map(|s| Url::parse(s).unwrap())
.collect::<Vec<_>>();
// The id is the path segment at index 4 (the fifth one), parsed as `u16`.
let genre_ids = genre_urls.iter()
.map(|x| {
x.path_segments().unwrap()
.skip(4).next().unwrap()
.parse::<u16>().unwrap()
})
.collect::<Vec<_>>();
// Loop-based form: same extraction, but every failure becomes an `Err`.
let mut genre_ids = Vec::new();
for elem in result.child_elements() {
let Some(genre_href) = elem.attr("href") else {
return Err(eyre!("Genre url is empty"));
};
let genre_url = Url::parse(genre_href)?;
// NOTE(review): `path_segments` is bound here but never read afterwards; the
// statement below calls `path_segments()` again and unwraps it. That unwrap
// cannot fail because this guard has already returned on `None`, but the
// binding could be reused to drop the second call.
let Some(path_segments) = genre_url.path_segments() else {
return Err(eyre!("Genre url has no segment: {}", genre_href));
};
let Some(genre_id) = genre_url.path_segments().unwrap()
.into_iter()
// Skip four segments so `next()` yields the one at index 4.
.skip(4)
.next() else {
return Err(eyre!("Invalid url: {}", genre_href));
};
// A non-numeric or out-of-range segment is propagated as an error.
genre_ids.push(genre_id.parse::<u16>()?);
}
Ok(genre_ids)
}
}