add robots txt detection

commit d660f25fb1
parent 10b89aee17
date   2025-10-09 15:26:36 +08:00

8 changed files with 1268 additions and 16 deletions

.gitignore (vendored, 3 changes)

@@ -1,6 +1,5 @@
 target
 .idea
-games.db
-.env
+*.env
 diesel.toml
 migrations/.*

Cargo.lock (generated, 1198 changes)

File diff suppressed because it is too large

Cargo.toml

@@ -16,6 +16,9 @@ directories = "6.0.0"
 lazy_static = "1.5.0"
 dotenvy = "0.15"
 rust-ini = "0.21.3"
+robotstxt = "0.3.0"
+scraper = "0.24.0"
+reqwest = { version = "0.12.23", features = ["blocking"] }
 [dependencies.crossterm]
 version = "0.29.0"

src/app.rs

@@ -1,4 +1,3 @@
-use std::env;
 use std::path::PathBuf;
 use crate::event::{Event, EventHandler};
 use ratatui::widgets::{Block, Borders, Paragraph};
@@ -7,9 +6,7 @@ use std::time::Duration;
 use color_eyre::Result;
 use crossterm::event;
 use crossterm::event::KeyCode::Char;
-use diesel::{AggregateExpressionMethods, Connection, SqliteConnection};
-use directories::BaseDirs;
-use dotenvy::dotenv;
+use diesel::{Connection, SqliteConnection};
 use lazy_static::lazy_static;
 use ratatui::buffer::Buffer;
 use ratatui::layout::{Constraint, Direction, Layout, Rect};
@@ -17,14 +14,9 @@ use ratatui::prelude::{Widget};
 use ratatui::style::{Color, Style};
 use ratatui::text::{Line, Span, Text};
 use crate::config::types::ApplicationConfig;
+use crate::constants::{APP_CONFIG_DIR, APP_DATA_DIR};
-const APP_DIR_NAME: &str = "sus_manager";
-lazy_static! {
-    static ref BASE_DIRS: BaseDirs = BaseDirs::new().unwrap();
-    static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().to_path_buf()
-        .join(APP_DIR_NAME);
-    static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().to_path_buf()
-        .join(APP_DIR_NAME);
-    static ref APP_CONIFG_FILE_PATH: PathBuf = APP_CONFIG_DIR.clone()
-        .join("config.ini");
-}

src/config/mod.rs

@@ -1,6 +1,7 @@
 use std::path::{PathBuf};
 use ini::Ini;
 use crate::config::types::{ApplicationConfig, BasicConfig};
+use crate::constants::APP_CONFIG_DIR;
 pub mod types;
@@ -20,7 +21,7 @@ impl ApplicationConfig {
     pub fn new() -> Self {
         Self {
             basic_config: BasicConfig {
-                db_path: "games.db".to_string(),
+                db_path: APP_CONFIG_DIR.clone().to_str().unwrap().to_string(),
                 tick_rate: 250
             }
         }
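Note: as committed, db_path holds the configuration directory itself rather than a database file inside it, and to_str().unwrap() panics on non-UTF-8 paths. A minimal sketch of pointing at a file under that directory instead (hypothetical, not part of this commit; the games.db name is carried over from the old default):

    // Hypothetical variant: name a file inside the config dir and use a
    // lossy conversion instead of panicking on non-UTF-8 paths.
    db_path: APP_CONFIG_DIR.join("games.db").to_string_lossy().into_owned(),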

src/constants.rs (new file, 12 lines)

@@ -0,0 +1,12 @@
+use std::path::PathBuf;
+use directories::BaseDirs;
+use lazy_static::lazy_static;
+
+const APP_DIR_NAME: &str = "sus_manager";
+lazy_static!(
+    static ref BASE_DIRS: BaseDirs = BaseDirs::new().unwrap();
+    pub static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().to_path_buf()
+        .join(APP_DIR_NAME);
+    pub static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().to_path_buf()
+        .join(APP_DIR_NAME);
+);
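BaseDirs::new() returns an Option, so the unwrap() above panics on systems where no home directory can be resolved. A fallible sketch for comparison (the app_config_dir helper is an assumption, not part of this commit):

    use std::path::PathBuf;
    use directories::BaseDirs;

    // Hypothetical helper: resolve the per-user config dir without panicking;
    // returns None when the platform reports no home directory.
    fn app_config_dir() -> Option<PathBuf> {
        BaseDirs::new().map(|dirs| dirs.config_dir().join("sus_manager"))
    }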

src/crawler/mod.rs (new file, 45 lines)

@@ -0,0 +1,45 @@
+use std::fs;
+use reqwest::{Client, Url};
+use robotstxt::DefaultMatcher;
+use robotstxt::matcher::{LongestMatchRobotsMatchStrategy, RobotsMatcher};
+use crate::constants::APP_DATA_DIR;
+use crate::crawler;
+
+pub(crate) struct Crawler {
+    id: String,
+    base_url: Url,
+    client: Client,
+    robots_txt: String
+}
+
+impl Crawler {
+    pub async fn new(id: &str, base_url: Url) -> Self {
+        let crawler = Self {
+            id: id.to_string(),
+            client: Client::new(),
+            robots_txt: Self::get_robots_txt(id, &base_url).await,
+            base_url,
+        };
+        let mut matcher = DefaultMatcher::default();
+        let access_allowed = matcher.one_agent_allowed_by_robots(&crawler.robots_txt, "reqwest", crawler.base_url.as_str());
+        assert_eq!(true, access_allowed);
+        crawler
+    }
+
+    async fn get_robots_txt(id: &str, base_url: &Url) -> String {
+        let local_robots_path = APP_DATA_DIR.clone().join(id).join("robots.txt");
+        if !local_robots_path.exists() {
+            let mut robots_url = base_url.clone();
+            robots_url.set_path("/robots.txt");
+            let response = reqwest::get(robots_url).await
+                .expect(format!("Failed to get robots.txt in `{}/robots.txt`", base_url.as_str()).as_str());
+            let content = response.text().await.unwrap();
+            tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await.unwrap();
+            tokio::fs::write(&local_robots_path, &content).await.unwrap();
+            content
+        }
+        else {
+            tokio::fs::read_to_string(&local_robots_path).await.unwrap()
+        }
+    }
+}
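The constructor checks only the site root against robots.txt and panics (assert_eq!) when it is disallowed; individual pages are never re-checked. A minimal per-URL sketch using the same robotstxt matcher the constructor already calls (the is_allowed name is an assumption, not part of this commit; it would sit in the same module):

    impl Crawler {
        // Hypothetical helper: consult the cached robots.txt before each fetch.
        // "reqwest" matches the user-agent string used in the constructor.
        fn is_allowed(&self, url: &Url) -> bool {
            let mut matcher = DefaultMatcher::default();
            matcher.one_agent_allowed_by_robots(&self.robots_txt, "reqwest", url.as_str())
        }
    }

A caller could then skip disallowed paths instead of aborting the whole crawler.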

src/main.rs

@@ -4,13 +4,21 @@ mod schema;
 mod types;
 mod config;
 mod helpers;
+mod crawler;
+mod constants;
 use color_eyre::Result;
+use reqwest::Url;
+use tokio;
+use crate::crawler::Crawler;
 #[tokio::main]
 async fn main() -> Result<()> {
     color_eyre::install()?;
+    let crawler = Crawler::new("dlsite", Url::parse("https://www.dlsite.com/")?).await;
     let terminal = ratatui::init();
     let app = app::App::new();
     let result = app.run(terminal).await;
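Because Crawler::new asserts on the robots.txt result, a disallowed base URL aborts the process before the TUI starts. A sketch of a Result-returning constructor that main could propagate with ? (the try_new name is an assumption, not part of this commit):

    // Hypothetical fallible variant for src/crawler/mod.rs:
    pub async fn try_new(id: &str, base_url: Url) -> color_eyre::Result<Self> {
        let robots_txt = Self::get_robots_txt(id, &base_url).await;
        let mut matcher = DefaultMatcher::default();
        if !matcher.one_agent_allowed_by_robots(&robots_txt, "reqwest", base_url.as_str()) {
            // Report the refusal instead of panicking in the constructor.
            return Err(color_eyre::eyre::eyre!("robots.txt disallows crawling {}", base_url));
        }
        Ok(Self { id: id.to_string(), client: Client::new(), robots_txt, base_url })
    }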