add robots.txt detection

2025-10-09 15:26:36 +08:00
parent 10b89aee17
commit d660f25fb1
8 changed files with 1268 additions and 16 deletions

src/crawler/mod.rs Normal file

@@ -0,0 +1,45 @@
use reqwest::{Client, Url};
use robotstxt::DefaultMatcher;

use crate::constants::APP_DATA_DIR;

pub(crate) struct Crawler {
    id: String,
    base_url: Url,
    client: Client,
    robots_txt: String,
}

impl Crawler {
    pub async fn new(id: &str, base_url: Url) -> Self {
        let crawler = Self {
            id: id.to_string(),
            client: Client::new(),
            robots_txt: Self::get_robots_txt(id, &base_url).await,
            base_url,
        };
        // Refuse to construct a crawler for a site whose robots.txt
        // disallows our user agent on the base URL.
        let mut matcher = DefaultMatcher::default();
        let access_allowed = matcher.one_agent_allowed_by_robots(
            &crawler.robots_txt,
            "reqwest",
            crawler.base_url.as_str(),
        );
        assert!(
            access_allowed,
            "robots.txt disallows crawling {}",
            crawler.base_url
        );
        crawler
    }

    /// Returns the robots.txt for `base_url`, reading a copy cached under
    /// `APP_DATA_DIR/<id>/robots.txt` when one exists, and fetching and
    /// caching it otherwise.
    async fn get_robots_txt(id: &str, base_url: &Url) -> String {
        let local_robots_path = APP_DATA_DIR.join(id).join("robots.txt");
        if !local_robots_path.exists() {
            let mut robots_url = base_url.clone();
            robots_url.set_path("/robots.txt");
            let response = reqwest::get(robots_url.clone())
                .await
                .unwrap_or_else(|e| panic!("Failed to get robots.txt from `{robots_url}`: {e}"));
            let content = response.text().await.unwrap();
            tokio::fs::create_dir_all(local_robots_path.parent().unwrap())
                .await
                .unwrap();
            tokio::fs::write(&local_robots_path, &content).await.unwrap();
            content
        } else {
            tokio::fs::read_to_string(&local_robots_path).await.unwrap()
        }
    }
}
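
For context, a minimal sketch of the pieces this file references but does not define: the `constants::APP_DATA_DIR` path and a caller. Everything below is illustrative and not part of this commit; the `once_cell` dependency, the data directory location, and the example URL are assumptions.

// Hypothetical `src/constants.rs`; the crawler only needs APP_DATA_DIR
// to deref to a Path, e.g. a lazily initialized PathBuf.
use once_cell::sync::Lazy;
use std::path::PathBuf;

pub static APP_DATA_DIR: Lazy<PathBuf> =
    Lazy::new(|| std::env::temp_dir().join("crawler-data"));

// Hypothetical caller from an async context in the same crate.
// Crawler::new panics if robots.txt disallows the "reqwest" agent.
use reqwest::Url;

async fn run() {
    let base_url = Url::parse("https://example.com/").unwrap();
    let crawler = crate::crawler::Crawler::new("example", base_url).await;
    // ... use `crawler` to fetch pages permitted by robots.txt ...
}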