Add dlsite genres scraping
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
use crate::app;
|
||||
use crate::config::types::ApplicationConfig;
|
||||
use crate::constants::APP_CONIFG_FILE_PATH;
|
||||
use clap::{command, Args, Command, Parser, Subcommand};
|
||||
use color_eyre::Result;
|
||||
use ratatui::crossterm;
|
||||
@@ -168,7 +167,7 @@ impl SyncDLSiteCommand {
|
||||
|
||||
impl FolderAddCommand {
|
||||
pub async fn handle(&self) -> Result<()> {
|
||||
let mut config = ApplicationConfig::from_file(&APP_CONIFG_FILE_PATH.to_path_buf())?;
|
||||
let mut config = ApplicationConfig::get_config()?;
|
||||
let path = PathBuf::from(&self.path);
|
||||
let abs_path = path.canonicalize()?;
|
||||
if !abs_path.is_dir() {
|
||||
|
||||
@@ -15,13 +15,13 @@ impl ApplicationConfig {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_file(path: &PathBuf) -> Result<Self> {
|
||||
fn from_file(path: &PathBuf) -> Result<Self> {
|
||||
let reader = std::fs::File::open(path)?;
|
||||
let result = serde_json::from_reader(reader)?;
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
pub fn new() -> Self {
|
||||
fn new() -> Self {
|
||||
let conf = Self {
|
||||
basic_config: BasicConfig {
|
||||
db_path: APP_DATA_DIR
|
||||
|
||||
@@ -4,12 +4,14 @@ use color_eyre::eyre::eyre;
|
||||
use reqwest::Url;
|
||||
use color_eyre::Result;
|
||||
use lazy_static::lazy_static;
|
||||
use scraper::{Html, Selector};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use crate::constants::APP_DATA_DIR;
|
||||
use crate::crawler::Crawler;
|
||||
|
||||
const DLSITE_URL: &str = "https://www.dlsite.com/";
|
||||
const DLSITE_API_ENDPOINT: &str = "/maniax/product/info/ajax";
|
||||
const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";
|
||||
lazy_static! {
|
||||
pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
|
||||
}
|
||||
@@ -25,7 +27,9 @@ pub struct DLSiteManiax {
|
||||
#[serde(rename = "work_image")]
|
||||
work_image_url: String,
|
||||
#[serde(rename = "dl_count")]
|
||||
pub sells_count: u32
|
||||
pub sells_count: u32,
|
||||
#[serde(skip)]
|
||||
pub genre_ids: Vec<u16>
|
||||
}
|
||||
|
||||
impl DLSiteCrawler {
|
||||
@@ -51,18 +55,25 @@ impl DLSiteCrawler {
|
||||
|
||||
pub async fn get_game_info(&self, rj_num: &str) -> Result<DLSiteManiax> {
|
||||
if !Self::is_valid_number(rj_num) {
|
||||
return Err(eyre!("Invalid number: {}", rj_num));
|
||||
return Err(eyre!("Invalid number: {rj_num}"));
|
||||
}
|
||||
let mut api_url = self.crawler.base_url.clone();
|
||||
api_url.set_path(DLSITE_API_ENDPOINT);
|
||||
api_url.set_query(Some(&format!("product_id={}", rj_num)));
|
||||
let res = self.crawler.client.get(api_url).send().await?;
|
||||
let maniax_result = match res.json::<HashMap<String, DLSiteManiax>>().await {
|
||||
api_url.set_query(Some(&format!("product_id={rj_num}")));
|
||||
let api_res = self.crawler.client.get(api_url).send().await?;
|
||||
let maniax_result = match api_res.json::<HashMap<String, DLSiteManiax>>().await {
|
||||
Ok(maniax_result) => maniax_result,
|
||||
Err(_) => return Err(eyre!("Maniax {} is restricted/removed", rj_num)),
|
||||
Err(_) => return Err(eyre!("Maniax {rj_num} is restricted/removed")),
|
||||
};
|
||||
let maniax_info = maniax_result.iter().next().unwrap().1.clone();
|
||||
let mut maniax_info = maniax_result.iter().next().unwrap().1.clone();
|
||||
|
||||
self.save_main_image(&maniax_info, rj_num).await?;
|
||||
|
||||
let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
|
||||
let (html, _) = self.crawler.get_html(&html_path).await?;
|
||||
let genres = self.get_genres(&html)?;
|
||||
maniax_info.genre_ids = genres;
|
||||
|
||||
Ok(maniax_info)
|
||||
}
|
||||
|
||||
@@ -72,7 +83,35 @@ impl DLSiteCrawler {
|
||||
let img_res = self.crawler.client.get(url).send().await?;
|
||||
let img_bytes = img_res.bytes().await?;
|
||||
let img = image::load_from_memory(&img_bytes)?;
|
||||
img.save(DLSITE_IMG_FOLDER.clone().join(format!("{}.jpg", rj_num)).as_path())?;
|
||||
img.save(DLSITE_IMG_FOLDER.clone().join(format!("{rj_num}.jpg")).as_path())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_genres(&self, html: &Html) -> Result<Vec<u16>> {
|
||||
let selector = Result::unwrap(
|
||||
Selector::parse(
|
||||
"#work_outline > tbody:nth-child(1)"
|
||||
)
|
||||
);
|
||||
let result = html.select(&selector).next().unwrap();
|
||||
let genre_row = result.child_elements()
|
||||
.filter(|e|
|
||||
e.child_elements().any(|e| e.inner_html() == "ジャンル")
|
||||
).next().unwrap();
|
||||
let data = genre_row
|
||||
.child_elements().skip(1).next().unwrap()
|
||||
.child_elements().next().unwrap();
|
||||
let genre_urls = data.child_elements()
|
||||
.map(|e| e.attr("href").unwrap())
|
||||
.map(|s| Url::parse(s).unwrap())
|
||||
.collect::<Vec<_>>();
|
||||
let genre_ids = genre_urls.iter()
|
||||
.map(|x| {
|
||||
x.path_segments().unwrap()
|
||||
.skip(4).next().unwrap()
|
||||
.parse::<u16>().unwrap()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
Ok(genre_ids)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user