add robots.txt detection

2025-10-09 15:26:36 +08:00
parent 10b89aee17
commit d660f25fb1
8 changed files with 1268 additions and 16 deletions

src/crawler/mod.rs Normal file

@@ -0,0 +1,45 @@
use reqwest::{Client, Url};
use robotstxt::DefaultMatcher;

use crate::constants::APP_DATA_DIR;

pub(crate) struct Crawler {
    id: String,
    base_url: Url,
    client: Client,
    robots_txt: String,
}

impl Crawler {
    pub async fn new(id: &str, base_url: Url) -> Self {
        let crawler = Self {
            id: id.to_string(),
            client: Client::new(),
            robots_txt: Self::get_robots_txt(id, &base_url).await,
            base_url,
        };
        // Refuse to construct a crawler for a site whose robots.txt
        // disallows our user agent on the base URL.
        let mut matcher = DefaultMatcher::default();
        let access_allowed = matcher.one_agent_allowed_by_robots(
            &crawler.robots_txt,
            "reqwest",
            crawler.base_url.as_str(),
        );
        assert!(
            access_allowed,
            "robots.txt disallows crawling {}",
            crawler.base_url
        );
        crawler
    }

    /// Returns the robots.txt for `base_url`, reading a copy cached under
    /// `APP_DATA_DIR/<id>/robots.txt` when one exists, and fetching and
    /// caching it otherwise.
    async fn get_robots_txt(id: &str, base_url: &Url) -> String {
        let local_robots_path = APP_DATA_DIR.join(id).join("robots.txt");
        if !local_robots_path.exists() {
            let mut robots_url = base_url.clone();
            robots_url.set_path("/robots.txt");
            let response = reqwest::get(robots_url.clone())
                .await
                .unwrap_or_else(|e| panic!("Failed to get robots.txt from `{robots_url}`: {e}"));
            let content = response.text().await.unwrap();
            tokio::fs::create_dir_all(local_robots_path.parent().unwrap())
                .await
                .unwrap();
            tokio::fs::write(&local_robots_path, &content).await.unwrap();
            content
        } else {
            tokio::fs::read_to_string(&local_robots_path).await.unwrap()
        }
    }
}
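
For context, a minimal sketch of the pieces this file references but does not define: the `constants::APP_DATA_DIR` path and a caller. Everything below is illustrative and not part of this commit; the `once_cell` dependency, the data directory location, and the example URL are assumptions.

// Hypothetical `src/constants.rs`; the crawler only needs APP_DATA_DIR
// to deref to a Path, e.g. a lazily initialized PathBuf.
use once_cell::sync::Lazy;
use std::path::PathBuf;

pub static APP_DATA_DIR: Lazy<PathBuf> =
    Lazy::new(|| std::env::temp_dir().join("crawler-data"));

// Hypothetical caller from an async context in the same crate.
// Crawler::new panics if robots.txt disallows the "reqwest" agent.
use reqwest::Url;

async fn run() {
    let base_url = Url::parse("https://example.com/").unwrap();
    let crawler = crate::crawler::Crawler::new("example", base_url).await;
    // ... use `crawler` to fetch pages permitted by robots.txt ...
}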