mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
readability stub
This commit is contained in:
parent
273ddd832c
commit
d906f6b7fe
5 changed files with 42 additions and 21 deletions
|
@ -1,7 +1,7 @@
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
|
use std::fs::File;
|
||||||
use std::io::{Error, ErrorKind, Write};
|
use std::io::{Error, ErrorKind, Write};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::fs::File;
|
|
||||||
use url::Url;
|
use url::Url;
|
||||||
|
|
||||||
pub struct Article {
|
pub struct Article {
|
||||||
|
@ -28,6 +28,9 @@ impl Article {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(Error::new(ErrorKind::NotFound, "Article does not contain HTML"))
|
Err(Error::new(
|
||||||
|
ErrorKind::NotFound,
|
||||||
|
"Article does not contain HTML",
|
||||||
|
))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,13 +1,14 @@
|
||||||
pub mod error;
|
|
||||||
pub mod config;
|
pub mod config;
|
||||||
|
pub mod error;
|
||||||
mod fingerprints;
|
mod fingerprints;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests;
|
mod tests;
|
||||||
|
|
||||||
|
use self::config::{ConfigCollection, ConfigEntry};
|
||||||
use self::error::{FullTextParserError, FullTextParserErrorKind};
|
use self::error::{FullTextParserError, FullTextParserErrorKind};
|
||||||
use crate::article::Article;
|
use crate::article::Article;
|
||||||
use self::config::{ConfigCollection, ConfigEntry};
|
use crate::util::Util;
|
||||||
use chrono::DateTime;
|
use chrono::DateTime;
|
||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
use failure::ResultExt;
|
use failure::ResultExt;
|
||||||
|
@ -20,7 +21,6 @@ use reqwest::header::HeaderMap;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use crate::util::Util;
|
|
||||||
|
|
||||||
pub struct FullTextParser {
|
pub struct FullTextParser {
|
||||||
config_files: ConfigCollection,
|
config_files: ConfigCollection,
|
||||||
|
@ -29,9 +29,7 @@ pub struct FullTextParser {
|
||||||
impl FullTextParser {
|
impl FullTextParser {
|
||||||
pub async fn new(config_path: Option<&Path>) -> Self {
|
pub async fn new(config_path: Option<&Path>) -> Self {
|
||||||
let config_files = ConfigCollection::parse(config_path).await;
|
let config_files = ConfigCollection::parse(config_path).await;
|
||||||
Self {
|
Self { config_files }
|
||||||
config_files,
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn parse(
|
pub async fn parse(
|
||||||
|
@ -83,7 +81,8 @@ impl FullTextParser {
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
|
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
|
||||||
let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
|
let mut root =
|
||||||
|
Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
|
||||||
document.set_root_element(&root);
|
document.set_root_element(&root);
|
||||||
|
|
||||||
Self::generate_head(&mut root, &document)?;
|
Self::generate_head(&mut root, &document)?;
|
||||||
|
@ -263,7 +262,10 @@ impl FullTextParser {
|
||||||
|
|
||||||
if response.status().is_success() {
|
if response.status().is_success() {
|
||||||
let headers = response.headers().clone();
|
let headers = response.headers().clone();
|
||||||
let text = response.text().await.context(FullTextParserErrorKind::Http)?;
|
let text = response
|
||||||
|
.text()
|
||||||
|
.await
|
||||||
|
.context(FullTextParserErrorKind::Http)?;
|
||||||
{
|
{
|
||||||
if let Some(decoded_html) =
|
if let Some(decoded_html) =
|
||||||
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
||||||
|
|
17
src/lib.rs
17
src/lib.rs
|
@ -1,18 +1,21 @@
|
||||||
pub mod images;
|
|
||||||
mod article;
|
mod article;
|
||||||
mod full_text_parser;
|
|
||||||
mod util;
|
|
||||||
mod error;
|
mod error;
|
||||||
|
mod full_text_parser;
|
||||||
|
pub mod images;
|
||||||
|
mod readability;
|
||||||
|
mod util;
|
||||||
|
|
||||||
use std::path::Path;
|
|
||||||
use article::Article;
|
use article::Article;
|
||||||
use full_text_parser::FullTextParser;
|
|
||||||
use error::{ScraperError, ScraperErrorKind};
|
use error::{ScraperError, ScraperErrorKind};
|
||||||
|
use full_text_parser::FullTextParser;
|
||||||
use images::ImageDownloader;
|
use images::ImageDownloader;
|
||||||
|
use readability::Readability;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
full_text_parser: FullTextParser,
|
full_text_parser: FullTextParser,
|
||||||
|
readability: Readability,
|
||||||
image_downloader: ImageDownloader,
|
image_downloader: ImageDownloader,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +23,7 @@ impl ArticleScraper {
|
||||||
pub async fn new(user_configs: Option<&Path>) -> Self {
|
pub async fn new(user_configs: Option<&Path>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
full_text_parser: FullTextParser::new(user_configs).await,
|
full_text_parser: FullTextParser::new(user_configs).await,
|
||||||
|
readability: Readability::new(),
|
||||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -30,7 +34,6 @@ impl ArticleScraper {
|
||||||
download_images: bool,
|
download_images: bool,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<Article, ScraperError> {
|
) -> Result<Article, ScraperError> {
|
||||||
|
|
||||||
let res = self.full_text_parser.parse(url, client).await;
|
let res = self.full_text_parser.parse(url, client).await;
|
||||||
|
|
||||||
if download_images {
|
if download_images {
|
||||||
|
@ -45,4 +48,4 @@ impl ArticleScraper {
|
||||||
|
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
7
src/readability/mod.rs
Normal file
7
src/readability/mod.rs
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
/// Placeholder for the readability-based extraction step.
///
/// The type carries no state yet; it exists so callers can already hold and
/// construct a value while the implementation is still missing.
#[derive(Debug, Default)]
pub struct Readability;

impl Readability {
    /// Creates a new, stateless `Readability` value.
    ///
    /// This must not panic: the constructor is invoked unconditionally when
    /// the surrounding scraper is built, so a panicking stub here would make
    /// the whole scraper impossible to construct.
    pub fn new() -> Self {
        Self
    }
}
|
14
src/util.rs
14
src/util.rs
|
@ -65,8 +65,8 @@ impl Util {
|
||||||
}
|
}
|
||||||
|
|
||||||
for header in &global_rule.header {
|
for header in &global_rule.header {
|
||||||
let name =
|
let name = HeaderName::from_bytes(header.name.as_bytes())
|
||||||
HeaderName::from_bytes(header.name.as_bytes()).context(FullTextParserErrorKind::Config)?;
|
.context(FullTextParserErrorKind::Config)?;
|
||||||
let value = header
|
let value = header
|
||||||
.value
|
.value
|
||||||
.parse::<HeaderValue>()
|
.parse::<HeaderValue>()
|
||||||
|
@ -158,7 +158,10 @@ impl Util {
|
||||||
Err(FullTextParserErrorKind::Xml.into())
|
Err(FullTextParserErrorKind::Xml.into())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
|
pub fn extract_value_merge(
|
||||||
|
context: &Context,
|
||||||
|
xpath: &str,
|
||||||
|
) -> Result<String, FullTextParserError> {
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, true)?;
|
let node_vec = Util::evaluate_xpath(context, xpath, true)?;
|
||||||
let mut val = String::new();
|
let mut val = String::new();
|
||||||
for node in node_vec {
|
for node in node_vec {
|
||||||
|
@ -188,7 +191,10 @@ impl Util {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), FullTextParserError> {
|
pub fn strip_id_or_class(
|
||||||
|
context: &Context,
|
||||||
|
id_or_class: &str,
|
||||||
|
) -> Result<(), FullTextParserError> {
|
||||||
let xpath = &format!(
|
let xpath = &format!(
|
||||||
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
||||||
id_or_class, id_or_class
|
id_or_class, id_or_class
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue