1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

start implementing readability

This commit is contained in:
Jan Lukas Gernert 2023-01-01 14:51:34 +01:00
parent c08f5afa5d
commit 2750ad648d
10 changed files with 375 additions and 124 deletions

View file

@ -1,20 +1,10 @@
image: rust:latest
stages:
- lint
- build
run-build:
stage: build
image: rust:latest
script:
- rustc --version && cargo --version
- cargo build --release --jobs 1
run-lint:
stage: lint
image: rust:latest
before_script:
- rustup component add rustfmt
- rustup component add clippy
@ -22,3 +12,4 @@ run-lint:
- rustc --version && cargo --version
- cargo fmt -- --check
- cargo clippy --all-targets --all-features -- -D warnings
- cargo build --release --jobs 1

View file

@ -16,8 +16,9 @@ url = "2.3"
regex = "1.7"
encoding_rs = "0.8"
chrono = "0.4"
base64 = "0.13"
base64 = "0.20"
image = "0.24"
log = "0.4"
rust-embed="6.4"
once_cell = "1.16"
escaper = "0.1"

View file

@ -7,28 +7,26 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
let mut m = HashMap::with_capacity(4);
m.insert(
"fingerprint.blogspot.com",
regex::Regex::new(
Regex::new(
r#"/\\<meta\s*content=([\\'"])blogger([\\'"])\s*name=([\\'"])generator([\\'"])/i"#,
)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.blogspot.com",
regex::Regex::new(
Regex::new(
r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])Blogger([\\'"])/i"#,
)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.wordpress.com",
regex::Regex::new(
r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
)
Regex::new(r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#)
.expect("failed to build static regex"),
);
m.insert(
"fingerprint.ippen.media",
regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
.expect("failed to build static regex"),
);
m

View file

@ -0,0 +1,132 @@
use chrono::{DateTime, Utc};
use libxml::xpath::Context;
use log::{debug, warn};
use std::str::FromStr;
use crate::{article::Article, util::Util};
use super::config::ConfigEntry;
/// Populate missing metadata (title, author, date) on `article` from the document.
///
/// Only fields that are still `None` are filled in; values already present on
/// `article` are never overwritten. Extracted title/author strings are HTML-entity
/// decoded via `escaper::decode_html`; if decoding fails the raw value is kept.
pub fn extract(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
    article: &mut Article,
) {
    if article.title.is_none() {
        // `map` instead of `and_then(|t| Some(..))`: the closure is infallible
        // (clippy: bind_instead_of_map).
        article.title = extract_title(context, config, global_config).map(|title| {
            match escaper::decode_html(&title) {
                Ok(decoded_title) => decoded_title,
                Err(_error) => title,
            }
        });
    }

    if article.author.is_none() {
        article.author = extract_author(context, config, global_config).map(|author| {
            match escaper::decode_html(&author) {
                Ok(decoded_author) => decoded_author,
                Err(_error) => author,
            }
        });
    }

    if article.date.is_none() {
        article.date = extract_date(context, config, global_config);
    }
}
/// Try to find the article title.
///
/// Order of precedence: site-specific config xpaths, then global config
/// xpaths, then a chain of generic `<meta>` tags (readability-style).
fn extract_title(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_title)
        .chain(&global_config.xpath_title);

    for xpath_title in xpaths {
        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
            debug!("Article title: '{}'", title);
            return Some(title);
        }
    }

    // Generic meta tags as a last resort (readability-style).
    [
        "dc:title",
        "dcterm:title",
        "og:title",
        "weibo:article:title",
        "weibo:webpage:title",
        "title",
        "twitter:title",
    ]
    .iter()
    .find_map(|name| get_meta(context, name))
}
/// Try to find the article author.
///
/// Order of precedence: site-specific config xpaths, then global config
/// xpaths, then generic `<meta>` tags (readability-style).
fn extract_author(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_author)
        .chain(&global_config.xpath_author);

    for xpath_author in xpaths {
        if let Ok(author) = Util::extract_value(context, xpath_author) {
            debug!("Article author: '{}'", author);
            return Some(author);
        }
    }

    // Generic meta tags as a last resort (readability-style).
    ["dc:creator", "dcterm:creator", "author"]
        .iter()
        .find_map(|name| get_meta(context, name))
}
/// Try to find and parse the article date.
///
/// Site-specific config xpaths are checked before the global ones; the first
/// string that parses as a `DateTime<Utc>` wins. Unparsable candidates are
/// logged and skipped. There is no generic `<meta>` fallback for dates.
fn extract_date(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<DateTime<Utc>> {
    // Site-specific xpaths first, then the global fallbacks.
    let xpaths = config
        .into_iter()
        .flat_map(|config| &config.xpath_date)
        .chain(&global_config.xpath_date);

    for xpath_date in xpaths {
        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
            debug!("Article date: '{}'", date_string);
            match DateTime::from_str(&date_string) {
                Ok(date) => return Some(date),
                Err(_) => warn!("Parsing the date string '{}' failed", date_string),
            }
        }
    }

    None
}
/// Read the `content` attribute of the first `<meta>` whose `name` contains `name`.
fn get_meta(context: &Context, name: &str) -> Option<String> {
    let xpath = format!("//meta[contains(@name, '{}')]", name);
    Util::get_attribute(context, &xpath, "content").ok()
}

View file

@ -1,15 +1,18 @@
pub mod config;
pub mod error;
mod fingerprints;
mod readability;
mod metadata;
#[cfg(test)]
mod tests;
use self::config::{ConfigCollection, ConfigEntry};
use self::error::FullTextParserError;
use self::readability::Readability;
use crate::article::Article;
use crate::util::Util;
use chrono::DateTime;
use encoding_rs::Encoding;
use fingerprints::Fingerprints;
use libxml::parser::Parser;
@ -19,7 +22,7 @@ use log::{debug, error, info, warn};
use reqwest::header::HeaderMap;
use reqwest::Client;
use std::path::Path;
use std::str::{from_utf8, FromStr};
use std::str::from_utf8;
pub struct FullTextParser {
config_files: ConfigCollection,
@ -154,7 +157,7 @@ impl FullTextParser {
// parse again with single page url
debug!("Single page link found '{}'", single_page_url);
return self
if let Err(error) = self
.parse_single_page(
article,
&single_page_url,
@ -163,16 +166,27 @@ impl FullTextParser {
global_config,
client,
)
.await;
.await
{
log::warn!("Single Page parsing: {}", error);
log::debug!("Continuing with regular parser.");
}
}
}
Self::extract_metadata(&xpath_ctx, config, global_config, article);
metadata::extract(&xpath_ctx, config, global_config, article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?;
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if found_body {
if let Err(error) = Readability::extract_body_readability(&document, root) {
log::error!("Both ftr and readability failed to find content: {}", error);
return Err(error);
}
}
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
let headers = Util::generate_headers(config, global_config)?;
@ -232,7 +246,7 @@ impl FullTextParser {
let html = Self::download(url, client, headers).await?;
let document = Self::parse_html(&html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::extract_metadata(&xpath_ctx, config, global_config, article);
metadata::extract(&xpath_ctx, config, global_config, article);
Self::check_for_thumbnail(&xpath_ctx, article);
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::extract_body(&xpath_ctx, root, config, global_config)?;
@ -363,7 +377,7 @@ impl FullTextParser {
}
fn check_for_thumbnail(context: &Context, article: &mut Article) {
if let Ok(thumb) = Self::get_attribute(
if let Ok(thumb) = Util::get_attribute(
context,
"//meta[contains(@name, 'twitter:image')]",
"content",
@ -373,14 +387,14 @@ impl FullTextParser {
}
if let Ok(thumb) =
Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
{
article.thumbnail_url = Some(thumb);
return;
}
if let Ok(thumb) =
Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
{
article.thumbnail_url = Some(thumb);
}
@ -472,17 +486,6 @@ impl FullTextParser {
Ok(())
}
fn get_attribute(
context: &Context,
xpath: &str,
attribute: &str,
) -> Result<String, FullTextParserError> {
Util::evaluate_xpath(context, xpath, false)?
.iter()
.find_map(|node| node.get_attribute(attribute))
.ok_or(FullTextParserError::Xml)
}
fn repair_urls(
context: &Context,
xpath: &str,
@ -612,90 +615,12 @@ impl FullTextParser {
let _ = Util::strip_node(context, "//*[@type='text/css']");
}
fn extract_metadata(
context: &Context,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
article: &mut Article,
) {
// try to get title
if let Some(config) = config {
for xpath_title in &config.xpath_title {
if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title);
article.title = Some(title);
break;
}
}
}
if article.title.is_none() {
for xpath_title in &global_config.xpath_title {
if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
debug!("Article title: '{}'", title);
article.title = Some(title);
break;
}
}
}
// try to get the author
if let Some(config) = config {
for xpath_author in &config.xpath_author {
if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author);
article.author = Some(author);
break;
}
}
}
if article.author.is_none() {
for xpath_author in &global_config.xpath_author {
if let Ok(author) = Util::extract_value(context, xpath_author) {
debug!("Article author: '{}'", author);
article.author = Some(author);
break;
}
}
}
// try to get the date
if let Some(config) = config {
for xpath_date in &config.xpath_date {
if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date);
break;
} else {
warn!("Parsing the date string '{}' failed", date_string);
}
}
}
}
if article.date.is_none() {
for xpath_date in &global_config.xpath_date {
if let Ok(date_string) = Util::extract_value(context, xpath_date) {
debug!("Article date: '{}'", date_string);
if let Ok(date) = DateTime::from_str(&date_string) {
article.date = Some(date);
break;
} else {
warn!("Parsing the date string '{}' failed", date_string);
}
}
}
}
}
fn extract_body(
context: &Context,
root: &mut Node,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
) -> Result<(), FullTextParserError> {
) -> Result<bool, FullTextParserError> {
let mut found_something = false;
if let Some(config) = config {
@ -712,10 +637,9 @@ impl FullTextParser {
if !found_something {
log::error!("no body found");
return Err(FullTextParserError::Scrape);
}
Ok(())
Ok(found_something)
}
fn extract_body_single(
@ -752,7 +676,7 @@ impl FullTextParser {
) -> Option<url::Url> {
if let Some(config) = config {
if let Some(next_page_xpath) = config.next_page_link.as_deref() {
if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href")
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
{
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
@ -760,7 +684,7 @@ impl FullTextParser {
}
}
} else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") {
if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
return Some(next_page_url);
}

View file

@ -0,0 +1,165 @@
mod regex;
mod state;
use libxml::tree::{Document, Node};
use self::state::State;
use super::error::FullTextParserError;
pub struct Readability;
impl Readability {
/// Entry point of the readability-based body extraction (work in progress).
///
/// Walks the whole document depth-first, pruning nodes that are hidden,
/// look like bylines, or are headings repeating the article title. The
/// actual content scoring is not implemented yet, hence the trailing
/// `unimplemented!()`.
///
/// NOTE(review): `root` is not used yet — presumably the extracted content
/// will be attached to it once scoring is implemented; confirm when wiring
/// this up.
pub fn extract_body_readability(
    document: &Document,
    root: &mut Node,
) -> Result<bool, FullTextParserError> {
    let mut state = State::default();
    let mut node: Option<Node> = document.clone().get_root_element();

    while let Some(node_ref) = node.as_mut() {
        // Space-joined class list, used by the regex-based heuristics below.
        let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));

        // Drop nodes that are hidden from the reader.
        if !Self::is_probably_visible(node_ref) {
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // Bylines are removed from the content.
        if Self::check_byline(node_ref, &match_string) {
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // Remove at most one header that duplicates the article title.
        if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
            state.should_remove_title_header = false;
            node = Self::remove_and_next(node_ref);
            continue;
        }

        // TODO: strip unlikely candidates (class/id heuristics) — empty stub.
        if state.strip_unlikely {
        }

        node = Self::next_node(node_ref, false);
    }

    unimplemented!()
}
/// Port of readability.js `isProbablyVisible`.
///
/// A node counts as visible unless it is `display: none`, carries the
/// `hidden` attribute, or is `aria-hidden="true"`. As in readability.js, the
/// `fallback-image` class only overrides `aria-hidden` (for lazily-loaded
/// images), not an explicit `display`/`hidden` removal.
fn is_probably_visible(node: &Node) -> bool {
    let display_none = node
        .get_attribute("display")
        .map(|display| display == "none")
        .unwrap_or(false);
    let is_hidden = node.has_attribute("hidden");
    let aria_hidden = node
        .get_attribute("aria-hidden")
        .map(|attr| attr == "true")
        .unwrap_or(false);
    let has_fallback_image = node.get_class_names().contains("fallback-image");

    // Parenthesized explicitly: in the original, `&&` bound tighter than `||`,
    // so `fallback-image` resurrected even nodes hidden via `display`/`hidden`,
    // which deviates from readability.js.
    !display_none && !is_hidden && (!aria_hidden || has_fallback_image)
}
/// Unlink `node` from the tree and return the traversal's next node,
/// skipping the removed node's children.
fn remove_and_next(node: &mut Node) -> Option<Node> {
    // Compute the successor before unlinking so sibling/parent links are intact.
    let successor = Self::next_node(node, true);
    node.unlink();
    successor
}
/// Depth-first successor of `node`: first child (unless
/// `ignore_self_and_kids`), then next sibling, then the next sibling of the
/// nearest ancestor that has one. Returns `None` at the end of the document.
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    // First check for kids if those aren't being ignored.
    if !ignore_self_and_kids {
        if let Some(first_child) = node.get_first_child() {
            return Some(first_child);
        }
    }

    // Then for siblings...
    if let Some(next_sibling) = node.get_next_sibling() {
        return Some(next_sibling);
    }

    // And finally, move up the parent chain *and* find a sibling
    // (because this is depth-first traversal, we will have already
    // seen the parent nodes themselves).
    //
    // BUGFIX: the original looped on `node.get_parent()` without ever
    // ascending, so it spun forever whenever the direct parent had no next
    // sibling but some ancestor did. Walk an explicit cursor upwards instead.
    let mut ancestor = node.get_parent();
    while let Some(current) = ancestor {
        if let Some(sibling) = current.get_next_sibling() {
            return Some(sibling);
        }
        ancestor = current.get_parent();
    }

    None
}
/// Port of readability.js `_checkByline`: decide whether `node` is a byline.
///
/// The node must *look* like a byline (`rel="author"`, an `itemprop`
/// containing "author", or a class/id string matching the byline regex)
/// *and* its text content must be a plausible byline length.
fn check_byline(node: &Node, matchstring: &str) -> bool {
    let rel = node
        .get_attribute("rel")
        .map(|rel| rel == "author")
        .unwrap_or(false);
    let itemprop = node
        .get_attribute("itemprop")
        .map(|prop| prop.contains("author"))
        .unwrap_or(false);

    let looks_like_byline = rel || itemprop || regex::BYLINE.is_match(matchstring);

    // BUGFIX: the original `a || b || c && d` applied the length check only to
    // the regex branch (`&&` binds tighter than `||`); readability.js validates
    // the text content for every branch.
    // FIXME: store the byline text on parser state like readability.js does.
    looks_like_byline && Self::is_valid_byline(&node.get_content())
}
/// Check whether the input string could be a byline: non-empty after
/// trimming and shorter than 100 bytes.
fn is_valid_byline(line: &str) -> bool {
    (1..100).contains(&line.trim().len())
}
// Check if this node is an H1 or H2 element whose content is mostly
// the same as the article title.
fn header_duplicates_title(node: &Node) -> bool {
    let name = node.get_name().to_lowercase();
    // BUGFIX: the original used `name != "h1" || name != "h2"`, which is true
    // for every element name, so this function could never detect a duplicate.
    if name != "h1" && name != "h2" {
        return false;
    }
    let heading = Self::get_inner_text(node, false);
    // FIXME: compare against the real article title once it is tracked.
    Self::text_similarity(&heading, "FIXME") > 0.75
}
/// Trimmed text content of `node`; with `normalize_spaces`, runs of
/// whitespace are collapsed to single spaces.
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
    let content = node.get_content().trim().to_owned();
    if normalize_spaces {
        // BUGFIX: `replace` only rewrites the first match; the readability.js
        // pattern carried the `g` flag, i.e. replace *all* whitespace runs.
        regex::NORMALIZE.replace_all(&content, " ").into()
    } else {
        content
    }
}
/// Compare two texts, case-insensitively, word by word.
/// 1.0 = identical vocabulary, 0.0 = completely different.
///
/// The score is one minus the length-weighted share of `b`'s tokens that do
/// not occur in `a` (mirrors readability.js `_textSimilarity`).
fn text_similarity(a: &str, b: &str) -> f64 {
    let a = a.to_lowercase();
    let b = b.to_lowercase();
    let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
    let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
    // `is_empty` instead of `iter().count() == 0` (clippy: iter_count / len_zero).
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len() as f64).sum();
    // Guard: if `b` tokenizes to empty strings only (e.g. pure punctuation),
    // the original divided 0.0 / 0.0 and returned NaN.
    if tokens_b_total == 0.0 {
        return 0.0;
    }

    let uniq_tokens_b: Vec<_> = tokens_b
        .into_iter()
        .filter(|token| !tokens_a.contains(token))
        .collect();
    let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len() as f64).sum();

    1.0 - uniq_tokens_b_total / tokens_b_total
}
}

View file

@ -0,0 +1,12 @@
use once_cell::sync::Lazy;
use regex::Regex;
/// Matches class/id strings that suggest a byline element.
///
/// BUGFIX: the original pattern embedded JavaScript literal syntax (`/…/i`),
/// so the leading `/` and trailing `/i` were matched literally and the
/// case-insensitive flag was lost; Rust's regex crate uses the inline `(?i)`
/// flag instead.
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("BYLINE regex")
});
/// Matches runs of two or more whitespace characters (for collapsing to " ").
///
/// BUGFIX: the original kept the JavaScript delimiters/flags (`/…/g`) inside
/// the Rust pattern, making the literal slashes and `g` part of the regex so
/// it never matched; the `g` semantics belong to `replace_all` at call sites.
pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\s{2,}").expect("NORMALIZE regex")
});
/// Word-splitting delimiter: one or more non-word characters.
///
/// BUGFIX: the original kept the JavaScript delimiters/flags (`/…/g`) inside
/// the Rust pattern, so the literal slashes and `g` were matched instead of
/// acting as delimiters/flags.
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"\W+").expect("TOKENIZE regex")
});

View file

@ -0,0 +1,17 @@
/// Tunable switches for the readability pass.
pub struct State {
    // NOTE(review): analogues of readability.js FLAG_* options — confirm
    // exact semantics as the algorithm gets implemented.
    pub strip_unlikely: bool,
    pub weigh_classes: bool,
    pub clean_conditionally: bool,
    pub should_remove_title_header: bool,
}

impl Default for State {
    /// Every heuristic starts enabled (hence a manual impl — the derived
    /// `Default` would yield `false` for each flag).
    fn default() -> Self {
        State {
            strip_unlikely: true,
            weigh_classes: true,
            clean_conditionally: true,
            should_remove_title_header: true,
        }
    }
}

View file

@ -36,7 +36,7 @@ impl ImageDownloader {
doc: &Document,
client: &Client,
) -> Result<String, ImageDownloadError> {
let xpath_ctx = Context::new(&doc).map_err(|()| {
let xpath_ctx = Context::new(doc).map_err(|()| {
error!("Failed to create xpath context for document");
ImageDownloadError::HtmlParse
})?;

View file

@ -145,6 +145,17 @@ impl Util {
None
}
/// Evaluate `xpath` and return the first matching node's `attribute` value.
///
/// Returns `FullTextParserError::Xml` when no matched node carries the
/// attribute (or nothing matched at all).
pub fn get_attribute(
    context: &Context,
    xpath: &str,
    attribute: &str,
) -> Result<String, FullTextParserError> {
    let nodes = Util::evaluate_xpath(context, xpath, false)?;
    nodes
        .iter()
        .find_map(|node| node.get_attribute(attribute))
        .ok_or(FullTextParserError::Xml)
}
pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
if let Some(val) = node_vec.get(0) {