mod dlsite;

use color_eyre::eyre::{eyre, WrapErr};
use color_eyre::Result;
use image::DynamicImage;
use reqwest::{Client, StatusCode, Url};
use robotstxt::DefaultMatcher;
use scraper::Html;
use serde::de::DeserializeOwned;

pub use dlsite::*;

/// Generic HTTP crawler shared by the site-specific modules.
///
/// Every request is checked against the target site's `robots.txt`, which is
/// cached on disk under the application cache directory per crawler id.
#[derive(Clone, Debug)]
struct Crawler {
    id: String,
    pub(crate) base_url: Url,
    client: Client,
    /// In-memory copy of `robots.txt`; the on-disk cache in
    /// `get_robots_txt` is the primary cache.
    robots_txt: Option<String>,
}

impl Crawler {
    pub fn new(id: &str, base_url: Url) -> Self {
        Self {
            id: id.to_string(),
            client: Client::new(),
            robots_txt: None,
            base_url,
        }
    }

    /// Returns an error if `robots.txt` disallows access to `url`.
    async fn check_access(&self, url: &Url) -> Result<()> {
        let mut matcher = DefaultMatcher::default();
        let is_access_allowed = matcher.one_agent_allowed_by_robots(
            &self.get_robots_txt().await?,
            "reqwest",
            url.as_str(),
        );
        if !is_access_allowed {
            return Err(eyre!("Crawler cannot access {}", url.as_str()));
        }
        Ok(())
    }

    /// Fetches `robots.txt`, preferring the in-memory copy, then the on-disk
    /// cache, and only then the network.
    async fn get_robots_txt(&self) -> Result<String> {
        if let Some(txt) = &self.robots_txt {
            return Ok(txt.clone());
        }
        let local_robots_path = models::APP_CACHE_PATH
            .clone()
            .join(&self.id)
            .join("robots.txt");
        if !local_robots_path.exists() {
            let mut robots_url = self.base_url.clone();
            robots_url.set_path("/robots.txt");
            let response = reqwest::get(robots_url).await.wrap_err_with(|| {
                format!(
                    "Failed to get robots.txt from `{}/robots.txt`",
                    self.base_url.as_str()
                )
            })?;
            let content = response.text().await?;
            tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
            tokio::fs::write(&local_robots_path, &content).await?;
            Ok(content)
        } else {
            Ok(tokio::fs::read_to_string(&local_robots_path).await?)
        }
    }

    pub async fn get_html(&self, path: &str, query: Option<&str>) -> Result<(Html, StatusCode)> {
        let mut url = self.base_url.clone();
        url.set_path(path);
        url.set_query(query);
        // Check robots.txt against the full request URL, not just the base URL.
        self.check_access(&url).await?;
        let res = self.client.get(url).send().await?;
        let status = res.status();
        let html_text = res.text().await?;
        Ok((Html::parse_document(&html_text), status))
    }

    pub async fn get_json<T>(&self, path: &str, query: Option<&str>) -> Result<(T, StatusCode)>
    where
        T: DeserializeOwned,
    {
        let mut url = self.base_url.clone();
        url.set_path(path);
        url.set_query(query);
        self.check_access(&url).await?;
        let res = self.client.get(url).send().await?;
        let status = res.status();
        let json = res.json().await?;
        Ok((json, status))
    }

    pub async fn get_img(&self, url: &Url) -> Result<(DynamicImage, StatusCode)> {
        self.check_access(url).await?;
        let res = self.client.get(url.clone()).send().await?;
        let status = res.status();
        let bytes = res.bytes().await?;
        let img = image::load_from_memory(&bytes)?;
        Ok((img, status))
    }

    pub async fn get_bytes(&self, path: &str) -> Result<(Vec<u8>, StatusCode)> {
        let mut url = self.base_url.clone();
        url.set_path(path);
        self.check_access(&url).await?;
        let res = self.client.get(url).send().await?;
        let status = res.status();
        let bytes = res.bytes().await?;
        Ok((bytes.to_vec(), status))
    }
}
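// Minimal usage sketch (not part of the original module): drives the generic
// `Crawler` helpers end to end. Assumptions: tokio's `macros` and `rt` features
// are enabled, and `https://www.dlsite.com` is a reachable base URL for the
// `dlsite` crawler id; adjust both to the actual crate configuration.
#[cfg(test)]
mod usage_sketch {
    use super::*;

    #[tokio::test]
    #[ignore = "performs real network requests; illustrative only"]
    async fn fetch_front_page() -> Result<()> {
        let crawler = Crawler::new("dlsite", Url::parse("https://www.dlsite.com")?);
        // Fetch and parse the front page; robots.txt is honored via check_access.
        let (_html, status) = crawler.get_html("/", None).await?;
        assert!(status.is_success());
        Ok(())
    }
}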