Add robots.txt detection
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -1,6 +1,5 @@
|
|||||||
target
|
target
|
||||||
.idea
|
.idea
|
||||||
games.db
|
*.env
|
||||||
.env
|
|
||||||
diesel.toml
|
diesel.toml
|
||||||
migrations/.*
|
migrations/.*
|
||||||
1198
Cargo.lock
generated
1198
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@@ -16,6 +16,9 @@ directories = "6.0.0"
|
|||||||
lazy_static = "1.5.0"
|
lazy_static = "1.5.0"
|
||||||
dotenvy = "0.15"
|
dotenvy = "0.15"
|
||||||
rust-ini = "0.21.3"
|
rust-ini = "0.21.3"
|
||||||
|
robotstxt = "0.3.0"
|
||||||
|
scraper = "0.24.0"
|
||||||
|
reqwest = { version = "0.12.23", features = ["blocking"] }
|
||||||
|
|
||||||
[dependencies.crossterm]
|
[dependencies.crossterm]
|
||||||
version = "0.29.0"
|
version = "0.29.0"
|
||||||
|
|||||||
12
src/app.rs
12
src/app.rs
@@ -1,4 +1,3 @@
|
|||||||
use std::env;
|
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use crate::event::{Event, EventHandler};
|
use crate::event::{Event, EventHandler};
|
||||||
use ratatui::widgets::{Block, Borders, Paragraph};
|
use ratatui::widgets::{Block, Borders, Paragraph};
|
||||||
@@ -7,9 +6,7 @@ use std::time::Duration;
|
|||||||
use color_eyre::Result;
|
use color_eyre::Result;
|
||||||
use crossterm::event;
|
use crossterm::event;
|
||||||
use crossterm::event::KeyCode::Char;
|
use crossterm::event::KeyCode::Char;
|
||||||
use diesel::{AggregateExpressionMethods, Connection, SqliteConnection};
|
use diesel::{Connection, SqliteConnection};
|
||||||
use directories::BaseDirs;
|
|
||||||
use dotenvy::dotenv;
|
|
||||||
use lazy_static::lazy_static;
|
use lazy_static::lazy_static;
|
||||||
use ratatui::buffer::Buffer;
|
use ratatui::buffer::Buffer;
|
||||||
use ratatui::layout::{Constraint, Direction, Layout, Rect};
|
use ratatui::layout::{Constraint, Direction, Layout, Rect};
|
||||||
@@ -17,14 +14,9 @@ use ratatui::prelude::{Widget};
|
|||||||
use ratatui::style::{Color, Style};
|
use ratatui::style::{Color, Style};
|
||||||
use ratatui::text::{Line, Span, Text};
|
use ratatui::text::{Line, Span, Text};
|
||||||
use crate::config::types::ApplicationConfig;
|
use crate::config::types::ApplicationConfig;
|
||||||
|
use crate::constants::{APP_CONFIG_DIR, APP_DATA_DIR};
|
||||||
|
|
||||||
const APP_DIR_NAME: &str = "sus_manager";
|
|
||||||
lazy_static! {
|
lazy_static! {
|
||||||
static ref BASE_DIRS: BaseDirs = BaseDirs::new().unwrap();
|
|
||||||
static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().to_path_buf()
|
|
||||||
.join(APP_DIR_NAME);
|
|
||||||
static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().to_path_buf()
|
|
||||||
.join(APP_DIR_NAME);
|
|
||||||
static ref APP_CONIFG_FILE_PATH: PathBuf = APP_CONFIG_DIR.clone()
|
static ref APP_CONIFG_FILE_PATH: PathBuf = APP_CONFIG_DIR.clone()
|
||||||
.join("config.ini");
|
.join("config.ini");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::path::{PathBuf};
|
use std::path::{PathBuf};
|
||||||
use ini::Ini;
|
use ini::Ini;
|
||||||
use crate::config::types::{ApplicationConfig, BasicConfig};
|
use crate::config::types::{ApplicationConfig, BasicConfig};
|
||||||
|
use crate::constants::APP_CONFIG_DIR;
|
||||||
|
|
||||||
pub mod types;
|
pub mod types;
|
||||||
|
|
||||||
@@ -20,7 +21,7 @@ impl ApplicationConfig {
|
|||||||
pub fn new() -> Self {
|
pub fn new() -> Self {
|
||||||
Self {
|
Self {
|
||||||
basic_config: BasicConfig {
|
basic_config: BasicConfig {
|
||||||
db_path: "games.db".to_string(),
|
db_path: APP_CONFIG_DIR.clone().to_str().unwrap().to_string(),
|
||||||
tick_rate: 250
|
tick_rate: 250
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
12
src/constants.rs
Normal file
12
src/constants.rs
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
use directories::BaseDirs;
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
|
||||||
|
const APP_DIR_NAME: &str = "sus_manager";
|
||||||
|
lazy_static!(
|
||||||
|
static ref BASE_DIRS: BaseDirs = BaseDirs::new().unwrap();
|
||||||
|
pub static ref APP_CONFIG_DIR: PathBuf = BASE_DIRS.config_dir().to_path_buf()
|
||||||
|
.join(APP_DIR_NAME);
|
||||||
|
pub static ref APP_DATA_DIR: PathBuf = BASE_DIRS.data_dir().to_path_buf()
|
||||||
|
.join(APP_DIR_NAME);
|
||||||
|
);
|
||||||
45
src/crawler/mod.rs
Normal file
45
src/crawler/mod.rs
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
use std::fs;
|
||||||
|
use reqwest::{Client, Url};
|
||||||
|
use robotstxt::DefaultMatcher;
|
||||||
|
use robotstxt::matcher::{LongestMatchRobotsMatchStrategy, RobotsMatcher};
|
||||||
|
use crate::constants::APP_DATA_DIR;
|
||||||
|
use crate::crawler;
|
||||||
|
|
||||||
|
pub(crate) struct Crawler {
|
||||||
|
id: String,
|
||||||
|
base_url: Url,
|
||||||
|
client: Client,
|
||||||
|
robots_txt: String
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Crawler {
|
||||||
|
pub async fn new(id: &str, base_url: Url) -> Self {
|
||||||
|
let crawler = Self {
|
||||||
|
id: id.to_string(),
|
||||||
|
client: Client::new(),
|
||||||
|
robots_txt: Self::get_robots_txt(id, &base_url).await,
|
||||||
|
base_url,
|
||||||
|
};
|
||||||
|
let mut matcher = DefaultMatcher::default();
|
||||||
|
let access_allowed = matcher.one_agent_allowed_by_robots(&crawler.robots_txt, "reqwest", crawler.base_url.as_str());
|
||||||
|
assert_eq!(true, access_allowed);
|
||||||
|
crawler
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn get_robots_txt(id: &str, base_url: &Url) -> String {
|
||||||
|
let local_robots_path = APP_DATA_DIR.clone().join(id).join("robots.txt");
|
||||||
|
if !local_robots_path.exists() {
|
||||||
|
let mut robots_url = base_url.clone();
|
||||||
|
robots_url.set_path("/robots.txt");
|
||||||
|
let response = reqwest::get(robots_url).await
|
||||||
|
.expect(format!("Failed to get robots.txt in `{}/robots.txt`", base_url.as_str()).as_str());
|
||||||
|
let content = response.text().await.unwrap();
|
||||||
|
tokio::fs::create_dir_all(local_robots_path.parent().unwrap()).await.unwrap();
|
||||||
|
tokio::fs::write(&local_robots_path, &content).await.unwrap();
|
||||||
|
content
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
tokio::fs::read_to_string(&local_robots_path).await.unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -4,13 +4,21 @@ mod schema;
|
|||||||
mod types;
|
mod types;
|
||||||
mod config;
|
mod config;
|
||||||
mod helpers;
|
mod helpers;
|
||||||
|
mod crawler;
|
||||||
|
mod constants;
|
||||||
|
|
||||||
use color_eyre::Result;
|
use color_eyre::Result;
|
||||||
|
use reqwest::Url;
|
||||||
use tokio;
|
use tokio;
|
||||||
|
use crate::crawler::Crawler;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> Result<()> {
|
async fn main() -> Result<()> {
|
||||||
color_eyre::install()?;
|
color_eyre::install()?;
|
||||||
|
|
||||||
|
let crawler = Crawler::new("dlsite", Url::parse("https://www.dlsite.com/")?).await;
|
||||||
|
|
||||||
|
|
||||||
let terminal = ratatui::init();
|
let terminal = ratatui::init();
|
||||||
let app = app::App::new();
|
let app = app::App::new();
|
||||||
let result = app.run(terminal).await;
|
let result = app.run(terminal).await;
|
||||||
|
|||||||
Reference in New Issue
Block a user