Add robots.txt detection
This commit is contained in:
12
src/app.rs
12
src/app.rs
@@ -1,4 +1,3 @@
|
||||
use std::env;
|
||||
use std::path::PathBuf;
|
||||
use crate::event::{Event, EventHandler};
|
||||
use ratatui::widgets::{Block, Borders, Paragraph};
|
||||
@@ -7,9 +6,7 @@ use std::time::Duration;
|
||||
use color_eyre::Result;
|
||||
use crossterm::event;
|
||||
use crossterm::event::KeyCode::Char;
|
||||
use diesel::{AggregateExpressionMethods, Connection, SqliteConnection};
|
||||
use directories::BaseDirs;
|
||||
use dotenvy::dotenv;
|
||||
use diesel::{Connection, SqliteConnection};
|
||||
use lazy_static::lazy_static;
|
||||
use ratatui::buffer::Buffer;
|
||||
use ratatui::layout::{Constraint, Direction, Layout, Rect};
|
||||
@@ -17,14 +14,9 @@ use ratatui::prelude::{Widget};
|
||||
use ratatui::style::{Color, Style};
|
||||
use ratatui::text::{Line, Span, Text};
|
||||
use crate::config::types::ApplicationConfig;
|
||||
use crate::constants::{APP_CONFIG_DIR, APP_DATA_DIR};
|
||||
|
||||
const APP_DIR_NAME: &str = "sus_manager";
|
||||
lazy_static! {
|
||||
static ref BASE_DIRS: BaseDirs = BaseDirs::new().unwrap();
|
||||
static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().to_path_buf()
|
||||
.join(APP_DIR_NAME);
|
||||
static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().to_path_buf()
|
||||
.join(APP_DIR_NAME);
|
||||
static ref APP_CONIFG_FILE_PATH: PathBuf = APP_CONFIG_DIR.clone()
|
||||
.join("config.ini");
|
||||
}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::path::{PathBuf};
|
||||
use ini::Ini;
|
||||
use crate::config::types::{ApplicationConfig, BasicConfig};
|
||||
use crate::constants::APP_CONFIG_DIR;
|
||||
|
||||
pub mod types;
|
||||
|
||||
@@ -20,7 +21,7 @@ impl ApplicationConfig {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
basic_config: BasicConfig {
|
||||
db_path: "games.db".to_string(),
|
||||
db_path: APP_CONFIG_DIR.clone().to_str().unwrap().to_string(),
|
||||
tick_rate: 250
|
||||
}
|
||||
}
|
||||
|
||||
12
src/constants.rs
Normal file
12
src/constants.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
use std::path::PathBuf;
|
||||
use directories::BaseDirs;
|
||||
use lazy_static::lazy_static;
|
||||
|
||||
const APP_DIR_NAME: &str = "sus_manager";
|
||||
lazy_static!(
    /// Platform-specific base directories, resolved once on first access.
    ///
    /// # Panics
    /// Panics on first access if `BaseDirs::new()` returns `None`.
    static ref BASE_DIRS: BaseDirs = BaseDirs::new()
        .expect("could not resolve the platform base directories (BaseDirs::new() returned None)");

    /// Directory holding this application's configuration files:
    /// `<platform config dir>/sus_manager`.
    // `Path::join` already returns an owned `PathBuf`, so no intermediate copy is needed.
    pub static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().join(APP_DIR_NAME);

    /// Directory holding this application's data files:
    /// `<platform data dir>/sus_manager`.
    pub static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().join(APP_DIR_NAME);
);
|
||||
45
src/crawler/mod.rs
Normal file
45
src/crawler/mod.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
use std::fs;
|
||||
use reqwest::{Client, Url};
|
||||
use robotstxt::DefaultMatcher;
|
||||
use robotstxt::matcher::{LongestMatchRobotsMatchStrategy, RobotsMatcher};
|
||||
use crate::constants::APP_DATA_DIR;
|
||||
use crate::crawler;
|
||||
|
||||
pub(crate) struct Crawler {
|
||||
id: String,
|
||||
base_url: Url,
|
||||
client: Client,
|
||||
robots_txt: String
|
||||
}
|
||||
|
||||
impl Crawler {
|
||||
pub async fn new(id: &str, base_url: Url) -> Self {
|
||||
let crawler = Self {
|
||||
id: id.to_string(),
|
||||
client: Client::new(),
|
||||
robots_txt: Self::get_robots_txt(id, &base_url).await,
|
||||
base_url,
|
||||
};
|
||||
let mut matcher = DefaultMatcher::default();
|
||||
let access_allowed = matcher.one_agent_allowed_by_robots(&crawler.robots_txt, "reqwest", crawler.base_url.as_str());
|
||||
assert_eq!(true, access_allowed);
|
||||
crawler
|
||||
}
|
||||
|
||||
async fn get_robots_txt(id: &str, base_url: &Url) -> String {
|
||||
let local_robots_path = APP_DATA_DIR.clone().join(id).join("robots.txt");
|
||||
if !local_robots_path.exists() {
|
||||
let mut robots_url = base_url.clone();
|
||||
robots_url.set_path("/robots.txt");
|
||||
let response = reqwest::get(robots_url).await
|
||||
.expect(format!("Failed to get robots.txt in `{}/robots.txt`", base_url.as_str()).as_str());
|
||||
let content = response.text().await.unwrap();
|
||||
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await.unwrap();
|
||||
tokio::fs::write(&local_robots_path, &content).await.unwrap();
|
||||
content
|
||||
}
|
||||
else {
|
||||
tokio::fs::read_to_string(&local_robots_path).await.unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -4,13 +4,21 @@ mod schema;
|
||||
mod types;
|
||||
mod config;
|
||||
mod helpers;
|
||||
mod crawler;
|
||||
mod constants;
|
||||
|
||||
use color_eyre::Result;
|
||||
use reqwest::Url;
|
||||
use tokio;
|
||||
use crate::crawler::Crawler;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
color_eyre::install()?;
|
||||
|
||||
let crawler = Crawler::new("dlsite", Url::parse("https://www.dlsite.com/")?).await;
|
||||
|
||||
|
||||
let terminal = ratatui::init();
|
||||
let app = app::App::new();
|
||||
let result = app.run(terminal).await;
|
||||
|
||||
Reference in New Issue
Block a user