add robots txt detection
45  src/crawler/mod.rs  Normal file
@@ -0,0 +1,45 @@
use reqwest::{Client, Url};
use robotstxt::DefaultMatcher;

use crate::constants::APP_DATA_DIR;

pub(crate) struct Crawler {
    id: String,
    base_url: Url,
    // Shared HTTP client, kept for subsequent page fetches.
    client: Client,
    // Raw contents of the site's robots.txt.
    robots_txt: String,
}

impl Crawler {
    pub async fn new(id: &str, base_url: Url) -> Self {
        let crawler = Self {
            id: id.to_string(),
            client: Client::new(),
            robots_txt: Self::get_robots_txt(id, &base_url).await,
            base_url,
        };
        // Verify that the "reqwest" user agent is allowed to crawl the base URL.
        let mut matcher = DefaultMatcher::default();
        let access_allowed = matcher.one_agent_allowed_by_robots(
            &crawler.robots_txt,
            "reqwest",
            crawler.base_url.as_str(),
        );
        assert!(access_allowed, "robots.txt disallows crawling {}", crawler.base_url);
        crawler
    }

    /// Returns the robots.txt for this crawler ID, fetching it from the site
    /// and caching it under APP_DATA_DIR on first use.
    async fn get_robots_txt(id: &str, base_url: &Url) -> String {
        let local_robots_path = APP_DATA_DIR.clone().join(id).join("robots.txt");
        if !local_robots_path.exists() {
            // No cached copy yet: fetch robots.txt from the site root and store it.
            let mut robots_url = base_url.clone();
            robots_url.set_path("/robots.txt");
            let response = reqwest::get(robots_url)
                .await
                .unwrap_or_else(|e| panic!("Failed to get robots.txt for `{}`: {e}", base_url));
            let content = response.text().await.unwrap();
            tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await.unwrap();
            tokio::fs::write(&local_robots_path, &content).await.unwrap();
            content
        } else {
            tokio::fs::read_to_string(&local_robots_path).await.unwrap()
        }
    }
}
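A minimal usage sketch of the new constructor, not part of this commit: it assumes a tokio runtime, that `crate::constants::APP_DATA_DIR` resolves to a writable directory, and uses a placeholder crawler ID and URL.

// Hypothetical call site, e.g. in the crate's main.rs (Crawler is pub(crate)).
use reqwest::Url;

use crate::crawler::Crawler;

#[tokio::main]
async fn main() {
    // "example-site" and the URL below are placeholder values.
    let base_url = Url::parse("https://example.com/").unwrap();
    let crawler = Crawler::new("example-site", base_url).await;
    // At this point robots.txt has been fetched (or read from the local cache)
    // and Crawler::new has asserted that the "reqwest" user agent may crawl
    // the base URL; otherwise it panics.
    let _ = crawler;
}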