mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
readability stub
This commit is contained in:
parent
273ddd832c
commit
d906f6b7fe
5 changed files with 42 additions and 21 deletions
|
@ -1,7 +1,7 @@
|
|||
use chrono::{DateTime, Utc};
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind, Write};
|
||||
use std::path::PathBuf;
|
||||
use std::fs::File;
|
||||
use url::Url;
|
||||
|
||||
pub struct Article {
|
||||
|
@ -28,6 +28,9 @@ impl Article {
|
|||
}
|
||||
}
|
||||
|
||||
Err(Error::new(ErrorKind::NotFound, "Article does not contain HTML"))
|
||||
Err(Error::new(
|
||||
ErrorKind::NotFound,
|
||||
"Article does not contain HTML",
|
||||
))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,13 +1,14 @@
|
|||
pub mod error;
|
||||
pub mod config;
|
||||
pub mod error;
|
||||
mod fingerprints;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use self::config::{ConfigCollection, ConfigEntry};
|
||||
use self::error::{FullTextParserError, FullTextParserErrorKind};
|
||||
use crate::article::Article;
|
||||
use self::config::{ConfigCollection, ConfigEntry};
|
||||
use crate::util::Util;
|
||||
use chrono::DateTime;
|
||||
use encoding_rs::Encoding;
|
||||
use failure::ResultExt;
|
||||
|
@ -20,7 +21,6 @@ use reqwest::header::HeaderMap;
|
|||
use reqwest::Client;
|
||||
use std::path::Path;
|
||||
use std::str::FromStr;
|
||||
use crate::util::Util;
|
||||
|
||||
pub struct FullTextParser {
|
||||
config_files: ConfigCollection,
|
||||
|
@ -29,9 +29,7 @@ pub struct FullTextParser {
|
|||
impl FullTextParser {
|
||||
pub async fn new(config_path: Option<&Path>) -> Self {
|
||||
let config_files = ConfigCollection::parse(config_path).await;
|
||||
Self {
|
||||
config_files,
|
||||
}
|
||||
Self { config_files }
|
||||
}
|
||||
|
||||
pub async fn parse(
|
||||
|
@ -83,7 +81,8 @@ impl FullTextParser {
|
|||
};
|
||||
|
||||
let mut document = Document::new().map_err(|()| FullTextParserErrorKind::Xml)?;
|
||||
let mut root = Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
|
||||
let mut root =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserErrorKind::Xml)?;
|
||||
document.set_root_element(&root);
|
||||
|
||||
Self::generate_head(&mut root, &document)?;
|
||||
|
@ -263,7 +262,10 @@ impl FullTextParser {
|
|||
|
||||
if response.status().is_success() {
|
||||
let headers = response.headers().clone();
|
||||
let text = response.text().await.context(FullTextParserErrorKind::Http)?;
|
||||
let text = response
|
||||
.text()
|
||||
.await
|
||||
.context(FullTextParserErrorKind::Http)?;
|
||||
{
|
||||
if let Some(decoded_html) =
|
||||
Self::decode_html(&text, Self::get_encoding_from_html(&text))
|
||||
|
|
15
src/lib.rs
15
src/lib.rs
|
@ -1,18 +1,21 @@
|
|||
pub mod images;
|
||||
mod article;
|
||||
mod full_text_parser;
|
||||
mod util;
|
||||
mod error;
|
||||
mod full_text_parser;
|
||||
pub mod images;
|
||||
mod readability;
|
||||
mod util;
|
||||
|
||||
use std::path::Path;
|
||||
use article::Article;
|
||||
use full_text_parser::FullTextParser;
|
||||
use error::{ScraperError, ScraperErrorKind};
|
||||
use full_text_parser::FullTextParser;
|
||||
use images::ImageDownloader;
|
||||
use readability::Readability;
|
||||
use reqwest::Client;
|
||||
use std::path::Path;
|
||||
|
||||
pub struct ArticleScraper {
|
||||
full_text_parser: FullTextParser,
|
||||
readability: Readability,
|
||||
image_downloader: ImageDownloader,
|
||||
}
|
||||
|
||||
|
@ -20,6 +23,7 @@ impl ArticleScraper {
|
|||
pub async fn new(user_configs: Option<&Path>) -> Self {
|
||||
Self {
|
||||
full_text_parser: FullTextParser::new(user_configs).await,
|
||||
readability: Readability::new(),
|
||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||
}
|
||||
}
|
||||
|
@ -30,7 +34,6 @@ impl ArticleScraper {
|
|||
download_images: bool,
|
||||
client: &Client,
|
||||
) -> Result<Article, ScraperError> {
|
||||
|
||||
let res = self.full_text_parser.parse(url, client).await;
|
||||
|
||||
if download_images {
|
||||
|
|
7
src/readability/mod.rs
Normal file
7
src/readability/mod.rs
Normal file
|
@ -0,0 +1,7 @@
|
|||
/// Placeholder for the readability-based content extractor.
///
/// The type carries no state yet; it exists so callers can already hold and
/// construct a `Readability` value while the extraction logic is still to be
/// written.
#[derive(Debug, Default)]
pub struct Readability;

impl Readability {
    /// Creates a new, stateless `Readability` instance.
    ///
    /// This constructor must not panic: `ArticleScraper::new` builds a
    /// `Readability` unconditionally, so an `unimplemented!()` stub here
    /// would abort every scraper construction.
    pub fn new() -> Self {
        Self
    }
}
|
14
src/util.rs
14
src/util.rs
|
@ -65,8 +65,8 @@ impl Util {
|
|||
}
|
||||
|
||||
for header in &global_rule.header {
|
||||
let name =
|
||||
HeaderName::from_bytes(header.name.as_bytes()).context(FullTextParserErrorKind::Config)?;
|
||||
let name = HeaderName::from_bytes(header.name.as_bytes())
|
||||
.context(FullTextParserErrorKind::Config)?;
|
||||
let value = header
|
||||
.value
|
||||
.parse::<HeaderValue>()
|
||||
|
@ -158,7 +158,10 @@ impl Util {
|
|||
Err(FullTextParserErrorKind::Xml.into())
|
||||
}
|
||||
|
||||
pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
|
||||
pub fn extract_value_merge(
|
||||
context: &Context,
|
||||
xpath: &str,
|
||||
) -> Result<String, FullTextParserError> {
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, true)?;
|
||||
let mut val = String::new();
|
||||
for node in node_vec {
|
||||
|
@ -188,7 +191,10 @@ impl Util {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), FullTextParserError> {
|
||||
pub fn strip_id_or_class(
|
||||
context: &Context,
|
||||
id_or_class: &str,
|
||||
) -> Result<(), FullTextParserError> {
|
||||
let xpath = &format!(
|
||||
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
||||
id_or_class, id_or_class
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue