112 lines
3.5 KiB
Rust
Executable File
112 lines
3.5 KiB
Rust
Executable File
mod dlsite;
|
|
|
|
use color_eyre::eyre::eyre;
|
|
use color_eyre::Result;
|
|
use image::DynamicImage;
|
|
use reqwest::{Client, StatusCode, Url};
|
|
use robotstxt::DefaultMatcher;
|
|
use scraper::Html;
|
|
use serde::de::DeserializeOwned;
|
|
|
|
pub use dlsite::*;
|
|
|
|
#[derive(Clone, Debug)]
|
|
struct Crawler {
|
|
id: String,
|
|
pub(crate) base_url: Url,
|
|
client: Client,
|
|
robots_txt: Option<String>
|
|
}
|
|
|
|
impl Crawler {
|
|
pub fn new(id: &str, base_url: Url) -> Self {
|
|
let crawler = Self {
|
|
id: id.to_string(),
|
|
client: Client::new(),
|
|
robots_txt: None,
|
|
base_url,
|
|
};
|
|
crawler
|
|
}
|
|
|
|
async fn check_access(&self, url: &Url) -> Result<()> {
|
|
let mut matcher = DefaultMatcher::default();
|
|
let is_access_allowed = matcher.one_agent_allowed_by_robots(
|
|
&self.get_robots_txt().await?,
|
|
"reqwest",
|
|
url.as_str(),
|
|
);
|
|
if !is_access_allowed {
|
|
return Err(eyre!("Crawler cannot access site {}", self.base_url.as_str()));
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
async fn get_robots_txt(&self) -> Result<String> {
|
|
if let Some(txt) = &self.robots_txt {
|
|
return Ok(txt.clone());
|
|
}
|
|
|
|
let local_robots_path = models::APP_CACHE_PATH.clone().join(&self.id).join("robots.txt");
|
|
if !local_robots_path.exists() {
|
|
let mut robots_url = self.base_url.clone();
|
|
robots_url.set_path("/robots.txt");
|
|
let response = reqwest::get(robots_url).await.expect(
|
|
format!(
|
|
"Failed to get robots.txt in `{}/robots.txt`",
|
|
self.base_url.as_str()
|
|
)
|
|
.as_str(),
|
|
);
|
|
let content = response.text().await?;
|
|
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
|
|
tokio::fs::write(&local_robots_path, &content).await?;
|
|
Ok(content)
|
|
} else {
|
|
Ok(tokio::fs::read_to_string(&local_robots_path).await?)
|
|
}
|
|
}
|
|
|
|
pub async fn get_html(&self, path: &str, query: Option<&str>) -> Result<(Html, StatusCode)> {
|
|
let mut url = self.base_url.clone();
|
|
self.check_access(&url).await?;
|
|
url.set_path(path);
|
|
url.set_query(query);
|
|
let res = self.client.get(url).send().await?;
|
|
let status = res.status();
|
|
let html_text = &res.text().await?;
|
|
Ok((Html::parse_document(html_text), status))
|
|
}
|
|
|
|
pub async fn get_json<T>(&self, path: &str, query: Option<&str>) -> Result<(T, StatusCode)>
|
|
where T : DeserializeOwned {
|
|
let mut url = self.base_url.clone();
|
|
url.set_path(path);
|
|
url.set_query(query);
|
|
self.check_access(&url).await?;
|
|
let res = self.client.get(url).send().await?;
|
|
let status = res.status();
|
|
let json = res.json().await?;
|
|
Ok((json, status))
|
|
}
|
|
|
|
pub async fn get_img(&self, url: &Url) -> Result<(DynamicImage, StatusCode)> {
|
|
self.check_access(url).await?;
|
|
let res = self.client.get(url.clone()).send().await?;
|
|
let status = res.status();
|
|
let bytes = res.bytes().await?;
|
|
let img = image::load_from_memory(&bytes)?;
|
|
Ok((img, status))
|
|
}
|
|
|
|
pub async fn get_bytes(&self, path: &str) -> Result<(Vec<u8>, StatusCode)> {
|
|
let mut url = self.base_url.clone();
|
|
url.set_path(path);
|
|
self.check_access(&url).await?;
|
|
let res = self.client.get(url).send().await?;
|
|
let status = res.status();
|
|
let bytes = res.bytes().await?;
|
|
Ok((bytes.to_vec(), status))
|
|
}
|
|
}
|