Reformat code

This commit is contained in:
2025-10-15 21:57:33 +08:00
parent a776e55187
commit d033d8b93f
19 changed files with 206 additions and 194 deletions

View File

@@ -5,6 +5,4 @@ pub(crate) struct DLSiteCrawler {
crawler: Crawler,
}
impl DLSiteCrawler {
}
impl DLSiteCrawler {}

View File

@@ -1,17 +1,17 @@
mod dlsite;
use reqwest::{Client, Url};
use robotstxt::{DefaultMatcher};
use crate::constants::APP_CACHE_PATH;
use color_eyre::Result;
use reqwest::{Client, Url};
use robotstxt::DefaultMatcher;
use scraper::Html;
use crate::constants::{APP_CACHE_PATH};
#[derive(Clone)]
pub(crate) struct Crawler {
id: String,
base_url: Url,
client: Client,
robots_txt: String
robots_txt: String,
}
impl Crawler {
@@ -23,7 +23,11 @@ impl Crawler {
base_url,
};
let mut matcher = DefaultMatcher::default();
let is_access_allowed = matcher.one_agent_allowed_by_robots(&crawler.robots_txt, "reqwest", crawler.base_url.as_str());
let is_access_allowed = matcher.one_agent_allowed_by_robots(
&crawler.robots_txt,
"reqwest",
crawler.base_url.as_str(),
);
if !is_access_allowed {
panic!("Crawler cannot access site {}", crawler.base_url.as_str());
}
@@ -31,20 +35,22 @@ impl Crawler {
}
async fn get_robots_txt(id: &str, base_url: &Url) -> Result<String> {
let local_robots_path = APP_CACHE_PATH.clone()
.join(id).join("robots.txt");
let local_robots_path = APP_CACHE_PATH.clone().join(id).join("robots.txt");
if !local_robots_path.exists() {
let mut robots_url = base_url.clone();
robots_url.set_path("/robots.txt");
let response = reqwest::get(robots_url).await
.expect(format!("Failed to get robots.txt in `{}/robots.txt`", base_url.as_str()).as_str());
let response = reqwest::get(robots_url).await.expect(
format!(
"Failed to get robots.txt in `{}/robots.txt`",
base_url.as_str()
)
.as_str(),
);
let content = response.text().await?;
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
tokio::fs::write(&local_robots_path, &content).await?;
Ok(content)
}
else {
} else {
Ok(tokio::fs::read_to_string(&local_robots_path).await?)
}
}
@@ -55,4 +61,4 @@ impl Crawler {
let html_text = &self.client.get(url).send().await?.text().await?;
Ok(Html::parse_document(html_text))
}
}
}