Reformat code
This commit is contained in:
@@ -5,6 +5,4 @@ pub(crate) struct DLSiteCrawler {
|
||||
crawler: Crawler,
|
||||
}
|
||||
|
||||
impl DLSiteCrawler {
|
||||
|
||||
}
|
||||
impl DLSiteCrawler {}
|
||||
|
||||
@@ -1,17 +1,17 @@
|
||||
mod dlsite;
|
||||
|
||||
use reqwest::{Client, Url};
|
||||
use robotstxt::{DefaultMatcher};
|
||||
use crate::constants::APP_CACHE_PATH;
|
||||
use color_eyre::Result;
|
||||
use reqwest::{Client, Url};
|
||||
use robotstxt::DefaultMatcher;
|
||||
use scraper::Html;
|
||||
use crate::constants::{APP_CACHE_PATH};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct Crawler {
|
||||
id: String,
|
||||
base_url: Url,
|
||||
client: Client,
|
||||
robots_txt: String
|
||||
robots_txt: String,
|
||||
}
|
||||
|
||||
impl Crawler {
|
||||
@@ -23,7 +23,11 @@ impl Crawler {
|
||||
base_url,
|
||||
};
|
||||
let mut matcher = DefaultMatcher::default();
|
||||
let is_access_allowed = matcher.one_agent_allowed_by_robots(&crawler.robots_txt, "reqwest", crawler.base_url.as_str());
|
||||
let is_access_allowed = matcher.one_agent_allowed_by_robots(
|
||||
&crawler.robots_txt,
|
||||
"reqwest",
|
||||
crawler.base_url.as_str(),
|
||||
);
|
||||
if !is_access_allowed {
|
||||
panic!("Crawler cannot access site {}", crawler.base_url.as_str());
|
||||
}
|
||||
@@ -31,20 +35,22 @@ impl Crawler {
|
||||
}
|
||||
|
||||
async fn get_robots_txt(id: &str, base_url: &Url) -> Result<String> {
|
||||
let local_robots_path = APP_CACHE_PATH.clone()
|
||||
.join(id).join("robots.txt");
|
||||
let local_robots_path = APP_CACHE_PATH.clone().join(id).join("robots.txt");
|
||||
if !local_robots_path.exists() {
|
||||
let mut robots_url = base_url.clone();
|
||||
robots_url.set_path("/robots.txt");
|
||||
let response = reqwest::get(robots_url).await
|
||||
.expect(format!("Failed to get robots.txt in `{}/robots.txt`", base_url.as_str()).as_str());
|
||||
let response = reqwest::get(robots_url).await.expect(
|
||||
format!(
|
||||
"Failed to get robots.txt in `{}/robots.txt`",
|
||||
base_url.as_str()
|
||||
)
|
||||
.as_str(),
|
||||
);
|
||||
let content = response.text().await?;
|
||||
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await?;
|
||||
tokio::fs::write(&local_robots_path, &content).await?;
|
||||
Ok(content)
|
||||
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
Ok(tokio::fs::read_to_string(&local_robots_path).await?)
|
||||
}
|
||||
}
|
||||
@@ -55,4 +61,4 @@ impl Crawler {
|
||||
let html_text = &self.client.get(url).send().await?.text().await?;
|
||||
Ok(Html::parse_document(html_text))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user