Add dlsite genres scraping
This commit is contained in:
@@ -1,6 +1,5 @@
|
|||||||
use crate::app;
|
use crate::app;
|
||||||
use crate::config::types::ApplicationConfig;
|
use crate::config::types::ApplicationConfig;
|
||||||
use crate::constants::APP_CONIFG_FILE_PATH;
|
|
||||||
use clap::{command, Args, Command, Parser, Subcommand};
|
use clap::{command, Args, Command, Parser, Subcommand};
|
||||||
use color_eyre::Result;
|
use color_eyre::Result;
|
||||||
use ratatui::crossterm;
|
use ratatui::crossterm;
|
||||||
@@ -168,7 +167,7 @@ impl SyncDLSiteCommand {
|
|||||||
|
|
||||||
impl FolderAddCommand {
|
impl FolderAddCommand {
|
||||||
pub async fn handle(&self) -> Result<()> {
|
pub async fn handle(&self) -> Result<()> {
|
||||||
let mut config = ApplicationConfig::from_file(&APP_CONIFG_FILE_PATH.to_path_buf())?;
|
let mut config = ApplicationConfig::get_config()?;
|
||||||
let path = PathBuf::from(&self.path);
|
let path = PathBuf::from(&self.path);
|
||||||
let abs_path = path.canonicalize()?;
|
let abs_path = path.canonicalize()?;
|
||||||
if !abs_path.is_dir() {
|
if !abs_path.is_dir() {
|
||||||
|
|||||||
@@ -15,13 +15,13 @@ impl ApplicationConfig {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn from_file(path: &PathBuf) -> Result<Self> {
|
fn from_file(path: &PathBuf) -> Result<Self> {
|
||||||
let reader = std::fs::File::open(path)?;
|
let reader = std::fs::File::open(path)?;
|
||||||
let result = serde_json::from_reader(reader)?;
|
let result = serde_json::from_reader(reader)?;
|
||||||
Ok(result)
|
Ok(result)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn new() -> Self {
|
fn new() -> Self {
|
||||||
let conf = Self {
|
let conf = Self {
|
||||||
basic_config: BasicConfig {
|
basic_config: BasicConfig {
|
||||||
db_path: APP_DATA_DIR
|
db_path: APP_DATA_DIR
|
||||||
|
|||||||
@@ -4,12 +4,14 @@ use color_eyre::eyre::eyre;
|
|||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use color_eyre::Result;
|
use color_eyre::Result;
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
|
use scraper::{Html, Selector};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use crate::constants::APP_DATA_DIR;
|
use crate::constants::APP_DATA_DIR;
|
||||||
use crate::crawler::Crawler;
|
use crate::crawler::Crawler;
|
||||||
|
|
||||||
const DLSITE_URL: &str = "https://www.dlsite.com/";
|
const DLSITE_URL: &str = "https://www.dlsite.com/";
|
||||||
const DLSITE_API_ENDPOINT: &str = "/maniax/product/info/ajax";
|
const DLSITE_API_ENDPOINT: &str = "/maniax/product/info/ajax";
|
||||||
|
const DLSITE_MANIAX_PATH: &str = "/maniax/work/=/product_id/";
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
|
pub static ref DLSITE_IMG_FOLDER: PathBuf = APP_DATA_DIR.clone().join("dlsite").join("img");
|
||||||
}
|
}
|
||||||
@@ -25,7 +27,9 @@ pub struct DLSiteManiax {
|
|||||||
#[serde(rename = "work_image")]
|
#[serde(rename = "work_image")]
|
||||||
work_image_url: String,
|
work_image_url: String,
|
||||||
#[serde(rename = "dl_count")]
|
#[serde(rename = "dl_count")]
|
||||||
pub sells_count: u32
|
pub sells_count: u32,
|
||||||
|
#[serde(skip)]
|
||||||
|
pub genre_ids: Vec<u16>
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DLSiteCrawler {
|
impl DLSiteCrawler {
|
||||||
@@ -51,18 +55,25 @@ impl DLSiteCrawler {
|
|||||||
|
|
||||||
pub async fn get_game_info(&self, rj_num: &str) -> Result<DLSiteManiax> {
|
pub async fn get_game_info(&self, rj_num: &str) -> Result<DLSiteManiax> {
|
||||||
if !Self::is_valid_number(rj_num) {
|
if !Self::is_valid_number(rj_num) {
|
||||||
return Err(eyre!("Invalid number: {}", rj_num));
|
return Err(eyre!("Invalid number: {rj_num}"));
|
||||||
}
|
}
|
||||||
let mut api_url = self.crawler.base_url.clone();
|
let mut api_url = self.crawler.base_url.clone();
|
||||||
api_url.set_path(DLSITE_API_ENDPOINT);
|
api_url.set_path(DLSITE_API_ENDPOINT);
|
||||||
api_url.set_query(Some(&format!("product_id={}", rj_num)));
|
api_url.set_query(Some(&format!("product_id={rj_num}")));
|
||||||
let res = self.crawler.client.get(api_url).send().await?;
|
let api_res = self.crawler.client.get(api_url).send().await?;
|
||||||
let maniax_result = match res.json::<HashMap<String, DLSiteManiax>>().await {
|
let maniax_result = match api_res.json::<HashMap<String, DLSiteManiax>>().await {
|
||||||
Ok(maniax_result) => maniax_result,
|
Ok(maniax_result) => maniax_result,
|
||||||
Err(_) => return Err(eyre!("Maniax {} is restricted/removed", rj_num)),
|
Err(_) => return Err(eyre!("Maniax {rj_num} is restricted/removed")),
|
||||||
};
|
};
|
||||||
let maniax_info = maniax_result.iter().next().unwrap().1.clone();
|
let mut maniax_info = maniax_result.iter().next().unwrap().1.clone();
|
||||||
|
|
||||||
self.save_main_image(&maniax_info, rj_num).await?;
|
self.save_main_image(&maniax_info, rj_num).await?;
|
||||||
|
|
||||||
|
let html_path = format!("{DLSITE_MANIAX_PATH}{rj_num}");
|
||||||
|
let (html, _) = self.crawler.get_html(&html_path).await?;
|
||||||
|
let genres = self.get_genres(&html)?;
|
||||||
|
maniax_info.genre_ids = genres;
|
||||||
|
|
||||||
Ok(maniax_info)
|
Ok(maniax_info)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -72,7 +83,35 @@ impl DLSiteCrawler {
|
|||||||
let img_res = self.crawler.client.get(url).send().await?;
|
let img_res = self.crawler.client.get(url).send().await?;
|
||||||
let img_bytes = img_res.bytes().await?;
|
let img_bytes = img_res.bytes().await?;
|
||||||
let img = image::load_from_memory(&img_bytes)?;
|
let img = image::load_from_memory(&img_bytes)?;
|
||||||
img.save(DLSITE_IMG_FOLDER.clone().join(format!("{}.jpg", rj_num)).as_path())?;
|
img.save(DLSITE_IMG_FOLDER.clone().join(format!("{rj_num}.jpg")).as_path())?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_genres(&self, html: &Html) -> Result<Vec<u16>> {
|
||||||
|
let selector = Result::unwrap(
|
||||||
|
Selector::parse(
|
||||||
|
"#work_outline > tbody:nth-child(1)"
|
||||||
|
)
|
||||||
|
);
|
||||||
|
let result = html.select(&selector).next().unwrap();
|
||||||
|
let genre_row = result.child_elements()
|
||||||
|
.filter(|e|
|
||||||
|
e.child_elements().any(|e| e.inner_html() == "ジャンル")
|
||||||
|
).next().unwrap();
|
||||||
|
let data = genre_row
|
||||||
|
.child_elements().skip(1).next().unwrap()
|
||||||
|
.child_elements().next().unwrap();
|
||||||
|
let genre_urls = data.child_elements()
|
||||||
|
.map(|e| e.attr("href").unwrap())
|
||||||
|
.map(|s| Url::parse(s).unwrap())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let genre_ids = genre_urls.iter()
|
||||||
|
.map(|x| {
|
||||||
|
x.path_segments().unwrap()
|
||||||
|
.skip(4).next().unwrap()
|
||||||
|
.parse::<u16>().unwrap()
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
Ok(genre_ids)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user