Refactor structure

This commit is contained in:
2025-12-14 21:34:06 +08:00
parent 952f00261b
commit 27cb9fa32f
37 changed files with 712 additions and 486 deletions

111
crawler/src/lib.rs Executable file
View File

@@ -0,0 +1,111 @@
mod dlsite;
use color_eyre::eyre::eyre;
use color_eyre::Result;
use image::DynamicImage;
use reqwest::{Client, StatusCode, Url};
use robotstxt::DefaultMatcher;
use scraper::Html;
use serde::de::DeserializeOwned;
pub use dlsite::*;
/// HTTP crawler for a single site that consults the site's robots.txt before fetching.
#[derive(Clone, Debug)]
struct Crawler {
/// Identifier of this crawler; used as the directory name under the
/// application cache path where the site's robots.txt copy is stored.
id: String,
/// Root URL of the target site; request paths and queries are applied to a clone of it.
pub(crate) base_url: Url,
/// HTTP client used for the HTML, JSON, image and raw-byte requests.
client: Client,
/// In-memory robots.txt body, returned directly by `get_robots_txt` when present.
/// `new` initialises it to `None`.
robots_txt: Option<String>
}
impl Crawler {
/// Creates a crawler for the site rooted at `base_url`, identified by `id`.
///
/// The crawler starts with a freshly built HTTP client and no in-memory
/// robots.txt body.
pub fn new(id: &str, base_url: Url) -> Self {
    Self {
        id: String::from(id),
        base_url,
        client: Client::new(),
        robots_txt: None,
    }
}
/// Verifies that the site's robots.txt allows the `reqwest` user agent to fetch `url`.
///
/// # Errors
///
/// Returns an error when the robots.txt body cannot be obtained, or when its
/// rules disallow `url`.
async fn check_access(&self, url: &Url) -> Result<()> {
    // Resolve the rules first so the matcher is not kept alive across the await.
    let robots_txt = self.get_robots_txt().await?;
    let mut matcher = DefaultMatcher::default();
    if matcher.one_agent_allowed_by_robots(&robots_txt, "reqwest", url.as_str()) {
        return Ok(());
    }
    // Report the URL that was actually rejected: robots.txt rules are
    // path-specific, so naming only the site root hides which request failed.
    Err(eyre!(
        "Crawler cannot access {}: disallowed by robots.txt of site {}",
        url.as_str(),
        self.base_url.as_str()
    ))
}
async fn get_robots_txt(&self) -> Result<String> {
if let Some(txt) = &self.robots_txt {
return Ok(txt.clone());
}
let local_robots_path = models::APP_CACHE_PATH.clone().join(&self.id).join("robots.txt");
if !local_robots_path.exists() {
let mut robots_url = self.base_url.clone();
robots_url.set_path("/robots.txt");
let response = reqwest::get(robots_url).await.expect(
format!(
"Failed to get robots.txt in `{}/robots.txt`",
self.base_url.as_str()
)
.as_str(),
);
let content = response.text().await?;
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
tokio::fs::write(&local_robots_path, &content).await?;
Ok(content)
} else {
Ok(tokio::fs::read_to_string(&local_robots_path).await?)
}
}
/// Fetches `path` (with optional `query`) relative to `base_url` and parses
/// the response body as an HTML document.
///
/// Returns the parsed document together with the HTTP status of the response;
/// the status is not inspected here, so non-success responses are parsed too.
///
/// # Errors
///
/// Returns an error if robots.txt disallows the resulting URL, or if sending
/// the request or reading the response body fails.
pub async fn get_html(&self, path: &str, query: Option<&str>) -> Result<(Html, StatusCode)> {
    let mut url = self.base_url.clone();
    url.set_path(path);
    url.set_query(query);
    // robots.txt rules are path-specific, so the check must run against the
    // final URL, not the bare site root.
    self.check_access(&url).await?;

    let response = self.client.get(url).send().await?;
    let status = response.status();
    let body = response.text().await?;
    Ok((Html::parse_document(&body), status))
}
/// Fetches `path` (with optional `query`) relative to `base_url` and
/// deserializes the JSON response body into `T`.
///
/// Returns the deserialized value together with the HTTP status of the response.
///
/// # Errors
///
/// Returns an error if robots.txt disallows the resulting URL, if sending the
/// request fails, or if the body cannot be deserialized into `T`.
pub async fn get_json<T>(&self, path: &str, query: Option<&str>) -> Result<(T, StatusCode)>
where
    T: DeserializeOwned,
{
    let target = {
        let mut url = self.base_url.clone();
        url.set_path(path);
        url.set_query(query);
        url
    };
    self.check_access(&target).await?;

    let response = self.client.get(target).send().await?;
    let status = response.status();
    let payload = response.json::<T>().await?;
    Ok((payload, status))
}
/// Downloads the resource at `url` and decodes it as an image.
///
/// Unlike the path-based getters, `url` is used as given rather than being
/// derived from `base_url`. Returns the decoded image together with the HTTP
/// status of the response.
///
/// # Errors
///
/// Returns an error if robots.txt disallows `url`, if the request or body
/// download fails, or if the bytes cannot be decoded as an image.
pub async fn get_img(&self, url: &Url) -> Result<(DynamicImage, StatusCode)> {
    self.check_access(url).await?;

    let response = self.client.get(url.clone()).send().await?;
    let status = response.status();
    let raw = response.bytes().await?;
    let decoded = image::load_from_memory(raw.as_ref())?;
    Ok((decoded, status))
}
/// Fetches `path` relative to `base_url` and returns the raw response body.
///
/// Returns the body bytes together with the HTTP status of the response.
///
/// # Errors
///
/// Returns an error if robots.txt disallows the resulting URL, or if sending
/// the request or reading the response body fails.
pub async fn get_bytes(&self, path: &str) -> Result<(Vec<u8>, StatusCode)> {
    let target = {
        let mut url = self.base_url.clone();
        url.set_path(path);
        url
    };
    self.check_access(&target).await?;

    let response = self.client.get(target).send().await?;
    let status = response.status();
    let body = response.bytes().await?.to_vec();
    Ok((body, status))
}
}