1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

readability stub

This commit is contained in:
Jan Lukas Gernert 2022-10-08 23:10:26 +02:00
parent 273ddd832c
commit d906f6b7fe
5 changed files with 42 additions and 21 deletions

View file

@ -1,7 +1,7 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use std::fs::File;
use std::io::{Error, ErrorKind, Write}; use std::io::{Error, ErrorKind, Write};
use std::path::PathBuf; use std::path::PathBuf;
use std::fs::File;
use url::Url; use url::Url;
pub struct Article { pub struct Article {
@ -28,6 +28,9 @@ impl Article {
} }
} }
Err(Error::new(ErrorKind::NotFound, "Article does not contain HTML")) Err(Error::new(
ErrorKind::NotFound,
"Article does not contain HTML",
))
} }
} }

View file

@ -1,13 +1,14 @@
pub mod error;
pub mod config; pub mod config;
pub mod error;
mod fingerprints; mod fingerprints;
#[cfg(test)] #[cfg(test)]
mod tests; mod tests;
use self::config::{ConfigCollection, ConfigEntry};
use self::error::{FullTextParserError, FullTextParserErrorKind}; use self::error::{FullTextParserError, FullTextParserErrorKind};
use crate::article::Article; use crate::article::Article;
use self::config::{ConfigCollection, ConfigEntry}; use crate::util::Util;
use chrono::DateTime; use chrono::DateTime;
use encoding_rs::Encoding; use encoding_rs::Encoding;
use failure::ResultExt; use failure::ResultExt;
@ -20,7 +21,6 @@ use reqwest::header::HeaderMap;
use reqwest::Client; use reqwest::Client;
use std::path::Path; use std::path::Path;
use std::str::FromStr; use std::str::FromStr;
use crate::util::Util;
pub struct FullTextParser { pub struct FullTextParser {
config_files: ConfigCollection, config_files: ConfigCollection,
@ -29,9 +29,7 @@ pub struct FullTextParser {
impl FullTextParser { impl FullTextParser {
pub async fn new(config_path: Option<&Path>) -> Self { pub async fn new(config_path: Option<&Path>) -> Self {
let config_files = ConfigCollection::parse(config_path).await; let config_files = ConfigCollection::parse(config_path).await;
Self { Self { config_files }
config_files,
}
} }
pub async fn parse( pub async fn parse(
@ -83,7 +81,8 @@ impl FullTextParser {
}; };
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?; let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?; let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
document.set_root_element(&root); document.set_root_element(&root);
Self::generate_head(&mut root, &document)?; Self::generate_head(&mut root, &document)?;
@ -263,7 +262,10 @@ impl FullTextParser {
if response.status().is_success() { if response.status().is_success() {
let headers = response.headers().clone(); let headers = response.headers().clone();
let text = response.text().await.context(FullTextParserErrorKind::Http)?; let text = response
.text()
.await
.context(FullTextParserErrorKind::Http)?;
{ {
if let Some(decoded_html) = if let Some(decoded_html) =
Self::decode_html(&text, Self::get_encoding_from_html(&text)) Self::decode_html(&text, Self::get_encoding_from_html(&text))

View file

@ -1,18 +1,21 @@
pub mod images;
mod article; mod article;
mod full_text_parser;
mod util;
mod error; mod error;
mod full_text_parser;
pub mod images;
mod readability;
mod util;
use std::path::Path;
use article::Article; use article::Article;
use full_text_parser::FullTextParser;
use error::{ScraperError, ScraperErrorKind}; use error::{ScraperError, ScraperErrorKind};
use full_text_parser::FullTextParser;
use images::ImageDownloader; use images::ImageDownloader;
use readability::Readability;
use reqwest::Client; use reqwest::Client;
use std::path::Path;
pub struct ArticleScraper { pub struct ArticleScraper {
full_text_parser: FullTextParser, full_text_parser: FullTextParser,
readability: Readability,
image_downloader: ImageDownloader, image_downloader: ImageDownloader,
} }
@ -20,6 +23,7 @@ impl ArticleScraper {
pub async fn new(user_configs: Option<&Path>) -> Self { pub async fn new(user_configs: Option<&Path>) -> Self {
Self { Self {
full_text_parser: FullTextParser::new(user_configs).await, full_text_parser: FullTextParser::new(user_configs).await,
readability: Readability::new(),
image_downloader: ImageDownloader::new((2048, 2048)), image_downloader: ImageDownloader::new((2048, 2048)),
} }
} }
@ -30,7 +34,6 @@ impl ArticleScraper {
download_images: bool, download_images: bool,
client: &Client, client: &Client,
) -> Result<Article, ScraperError> { ) -> Result<Article, ScraperError> {
let res = self.full_text_parser.parse(url, client).await; let res = self.full_text_parser.parse(url, client).await;
if download_images { if download_images {
@ -45,4 +48,4 @@ impl ArticleScraper {
unimplemented!() unimplemented!()
} }
} }

7
src/readability/mod.rs Normal file
View file

@ -0,0 +1,7 @@
/// Placeholder for the readability extraction step.
///
/// The type is a unit struct and carries no state yet; it exists so that
/// callers can already hold and construct a `Readability` value.
#[derive(Debug, Default)]
pub struct Readability;

impl Readability {
    /// Creates a new `Readability` instance.
    ///
    /// Construction is infallible and never panics. The previous body was
    /// `unimplemented!()`, which aborted every caller that merely tried to
    /// build the value.
    pub fn new() -> Self {
        Self
    }
}

View file

@ -65,8 +65,8 @@ impl Util {
} }
for header in &global_rule.header { for header in &global_rule.header {
let name = let name = HeaderName::from_bytes(header.name.as_bytes())
HeaderName::from_bytes(header.name.as_bytes()).context(FullTextParserErrorKind::Config)?; .context(FullTextParserErrorKind::Config)?;
let value = header let value = header
.value .value
.parse::<HeaderValue>() .parse::<HeaderValue>()
@ -158,7 +158,10 @@ impl Util {
Err(FullTextParserErrorKind::Xml.into()) Err(FullTextParserErrorKind::Xml.into())
} }
pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, FullTextParserError> { pub fn extract_value_merge(
context: &Context,
xpath: &str,
) -> Result<String, FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, true)?; let node_vec = Util::evaluate_xpath(context, xpath, true)?;
let mut val = String::new(); let mut val = String::new();
for node in node_vec { for node in node_vec {
@ -188,7 +191,10 @@ impl Util {
Ok(()) Ok(())
} }
pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), FullTextParserError> { pub fn strip_id_or_class(
context: &Context,
id_or_class: &str,
) -> Result<(), FullTextParserError> {
let xpath = &format!( let xpath = &format!(
"//*[contains(@class, '{}') or contains(@id, '{}')]", "//*[contains(@class, '{}') or contains(@id, '{}')]",
id_or_class, id_or_class id_or_class, id_or_class