Refactor structure
This commit is contained in:
111
crawler/src/lib.rs
Executable file
111
crawler/src/lib.rs
Executable file
@@ -0,0 +1,111 @@
|
||||
mod dlsite;
|
||||
|
||||
use color_eyre::eyre::eyre;
|
||||
use color_eyre::Result;
|
||||
use image::DynamicImage;
|
||||
use reqwest::{Client, StatusCode, Url};
|
||||
use robotstxt::DefaultMatcher;
|
||||
use scraper::Html;
|
||||
use serde::de::DeserializeOwned;
|
||||
|
||||
pub use dlsite::*;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct Crawler {
|
||||
id: String,
|
||||
pub(crate) base_url: Url,
|
||||
client: Client,
|
||||
robots_txt: Option<String>
|
||||
}
|
||||
|
||||
impl Crawler {
|
||||
pub fn new(id: &str, base_url: Url) -> Self {
|
||||
let crawler = Self {
|
||||
id: id.to_string(),
|
||||
client: Client::new(),
|
||||
robots_txt: None,
|
||||
base_url,
|
||||
};
|
||||
crawler
|
||||
}
|
||||
|
||||
async fn check_access(&self, url: &Url) -> Result<()> {
|
||||
let mut matcher = DefaultMatcher::default();
|
||||
let is_access_allowed = matcher.one_agent_allowed_by_robots(
|
||||
&self.get_robots_txt().await?,
|
||||
"reqwest",
|
||||
url.as_str(),
|
||||
);
|
||||
if !is_access_allowed {
|
||||
return Err(eyre!("Crawler cannot access site {}", self.base_url.as_str()));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn get_robots_txt(&self) -> Result<String> {
|
||||
if let Some(txt) = &self.robots_txt {
|
||||
return Ok(txt.clone());
|
||||
}
|
||||
|
||||
let local_robots_path = models::APP_CACHE_PATH.clone().join(&self.id).join("robots.txt");
|
||||
if !local_robots_path.exists() {
|
||||
let mut robots_url = self.base_url.clone();
|
||||
robots_url.set_path("/robots.txt");
|
||||
let response = reqwest::get(robots_url).await.expect(
|
||||
format!(
|
||||
"Failed to get robots.txt in `{}/robots.txt`",
|
||||
self.base_url.as_str()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
let content = response.text().await?;
|
||||
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
|
||||
tokio::fs::write(&local_robots_path, &content).await?;
|
||||
Ok(content)
|
||||
} else {
|
||||
Ok(tokio::fs::read_to_string(&local_robots_path).await?)
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_html(&self, path: &str, query: Option<&str>) -> Result<(Html, StatusCode)> {
|
||||
let mut url = self.base_url.clone();
|
||||
self.check_access(&url).await?;
|
||||
url.set_path(path);
|
||||
url.set_query(query);
|
||||
let res = self.client.get(url).send().await?;
|
||||
let status = res.status();
|
||||
let html_text = &res.text().await?;
|
||||
Ok((Html::parse_document(html_text), status))
|
||||
}
|
||||
|
||||
pub async fn get_json<T>(&self, path: &str, query: Option<&str>) -> Result<(T, StatusCode)>
|
||||
where T : DeserializeOwned {
|
||||
let mut url = self.base_url.clone();
|
||||
url.set_path(path);
|
||||
url.set_query(query);
|
||||
self.check_access(&url).await?;
|
||||
let res = self.client.get(url).send().await?;
|
||||
let status = res.status();
|
||||
let json = res.json().await?;
|
||||
Ok((json, status))
|
||||
}
|
||||
|
||||
pub async fn get_img(&self, url: &Url) -> Result<(DynamicImage, StatusCode)> {
|
||||
self.check_access(url).await?;
|
||||
let res = self.client.get(url.clone()).send().await?;
|
||||
let status = res.status();
|
||||
let bytes = res.bytes().await?;
|
||||
let img = image::load_from_memory(&bytes)?;
|
||||
Ok((img, status))
|
||||
}
|
||||
|
||||
pub async fn get_bytes(&self, path: &str) -> Result<(Vec<u8>, StatusCode)> {
|
||||
let mut url = self.base_url.clone();
|
||||
url.set_path(path);
|
||||
self.check_access(&url).await?;
|
||||
let res = self.client.get(url).send().await?;
|
||||
let status = res.status();
|
||||
let bytes = res.bytes().await?;
|
||||
Ok((bytes.to_vec(), status))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user