Mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)

Commit 2750ad648d: start implementing readability
Parent: c08f5afa5d

10 changed files with 375 additions and 124 deletions
.gitlab-ci.yml
@@ -1,20 +1,10 @@
-image: rust:latest
-
 stages:
-  - lint
   - build

 run-build:
   stage: build
   image: rust:latest
-  script:
-    - rustc --version && cargo --version
-    - cargo build --release --jobs 1
-
-run-lint:
-  stage: lint
-  image: rust:latest
   before_script:
     - rustup component add rustfmt
     - rustup component add clippy
@@ -22,3 +12,4 @@ run-lint:
     - rustc --version && cargo --version
     - cargo fmt -- --check
     - cargo clippy --all-targets --all-features -- -D warnings
+    - cargo build --release --jobs 1
Cargo.toml
@@ -16,8 +16,9 @@ url = "2.3"
 regex = "1.7"
 encoding_rs = "0.8"
 chrono = "0.4"
-base64 = "0.13"
+base64 = "0.20"
 image = "0.24"
 log = "0.4"
 rust-embed="6.4"
 once_cell = "1.16"
+escaper = "0.1"
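The new escaper dependency backs the metadata module added below: extracted titles and authors are HTML-entity-decoded, falling back to the raw string when decoding fails. A minimal standalone sketch of that pattern:

    // Sketch of the decode-with-fallback pattern metadata.rs uses below.
    fn decode_or_raw(raw: &str) -> String {
        match escaper::decode_html(raw) {
            Ok(decoded) => decoded,
            Err(_) => raw.to_owned(),
        }
    }

    fn main() {
        assert_eq!(decode_or_raw("Tom &amp; Jerry"), "Tom & Jerry");
    }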
src/full_text_parser/fingerprints.rs
@@ -7,28 +7,26 @@ static FINGERPRINT_REGEXES: Lazy<HashMap<&'static str, Regex>> = Lazy::new(|| {
     let mut m = HashMap::with_capacity(4);
     m.insert(
         "fingerprint.blogspot.com",
-        regex::Regex::new(
+        Regex::new(
             r#"/\\<meta\s*content=([\\'"])blogger([\\'"])\s*name=([\\'"])generator([\\'"])/i"#,
         )
         .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.blogspot.com",
-        regex::Regex::new(
+        Regex::new(
             r#"/\\<meta\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])Blogger([\\'"])/i"#,
         )
         .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.wordpress.com",
-        regex::Regex::new(
-            r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#,
-        )
-        .expect("failed to build static regex"),
+        Regex::new(r#"/\\<meta\\s*name=([\\'"])generator([\\'"])\s*content=([\\'"])WordPress/i"#)
+            .expect("failed to build static regex"),
     );
     m.insert(
         "fingerprint.ippen.media",
-        regex::Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
+        Regex::new(r#"/\\<div\\s*class=([\\'"])id-SiteBEEPWrap([\\'"])\\>/i"#)
             .expect("failed to build static regex"),
     );
     m
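Note that these patterns keep their JavaScript-style /.../i delimiters from the upstream ftr sources; the Rust regex crate treats the slashes and the trailing i as literal characters rather than as a case-insensitivity flag. If case-insensitive matching is the intent, the equivalent idiom in the regex crate is an inline flag. A sketch of that idiom (an assumption about intent, not the committed code):

    use regex::Regex;

    fn main() {
        // (?i) applies case-insensitivity inline; the regex crate has no /.../i literals.
        let blogger = Regex::new(r#"(?i)<meta\s*content=['"]blogger['"]\s*name=['"]generator['"]"#)
            .expect("valid pattern");
        assert!(blogger.is_match(r#"<meta content="Blogger" name="generator">"#));
    }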
src/full_text_parser/metadata.rs (new file, 132 lines)
@@ -0,0 +1,132 @@
use chrono::{DateTime, Utc};
use libxml::xpath::Context;
use log::{debug, warn};
use std::str::FromStr;

use crate::{article::Article, util::Util};

use super::config::ConfigEntry;

pub fn extract(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
    article: &mut Article,
) {
    if article.title.is_none() {
        article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) {
            Ok(escaped_title) => escaped_title,
            Err(_error) => title,
        }));
    }

    if article.author.is_none() {
        article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) {
            Ok(escaped_author) => escaped_author,
            Err(_error) => author,
        }));
    }

    if article.date.is_none() {
        article.date = extract_date(context, config, global_config);
    }
}

fn extract_title(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // check site specific config
    if let Some(config) = config {
        for xpath_title in &config.xpath_title {
            if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
                debug!("Article title: '{}'", title);
                return Some(title);
            }
        }
    }

    // check global config
    for xpath_title in &global_config.xpath_title {
        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
            debug!("Article title: '{}'", title);
            return Some(title);
        }
    }

    // generic meta (readability)
    get_meta(context, "dc:title")
        .or_else(|| get_meta(context, "dcterm:title"))
        .or_else(|| get_meta(context, "og:title"))
        .or_else(|| get_meta(context, "weibo:article:title"))
        .or_else(|| get_meta(context, "weibo:webpage:title"))
        .or_else(|| get_meta(context, "title"))
        .or_else(|| get_meta(context, "twitter:title"))
}

fn extract_author(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<String> {
    // check site specific config
    if let Some(config) = config {
        for xpath_author in &config.xpath_author {
            if let Ok(author) = Util::extract_value(context, xpath_author) {
                debug!("Article author: '{}'", author);
                return Some(author);
            }
        }
    }

    // check global config
    for xpath_author in &global_config.xpath_author {
        if let Ok(author) = Util::extract_value(context, xpath_author) {
            debug!("Article author: '{}'", author);
            return Some(author);
        }
    }

    // generic meta (readability)
    get_meta(context, "dc:creator")
        .or_else(|| get_meta(context, "dcterm:creator"))
        .or_else(|| get_meta(context, "author"))
}

fn extract_date(
    context: &Context,
    config: Option<&ConfigEntry>,
    global_config: &ConfigEntry,
) -> Option<DateTime<Utc>> {
    // check site specific config
    if let Some(config) = config {
        for xpath_date in &config.xpath_date {
            if let Ok(date_string) = Util::extract_value(context, xpath_date) {
                debug!("Article date: '{}'", date_string);
                if let Ok(date) = DateTime::from_str(&date_string) {
                    return Some(date);
                } else {
                    warn!("Parsing the date string '{}' failed", date_string);
                }
            }
        }
    }

    // check global config
    for xpath_date in &global_config.xpath_date {
        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
            debug!("Article date: '{}'", date_string);
            if let Ok(date) = DateTime::from_str(&date_string) {
                return Some(date);
            } else {
                warn!("Parsing the date string '{}' failed", date_string);
            }
        }
    }

    None
}

fn get_meta(context: &Context, name: &str) -> Option<String> {
    Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok()
}
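get_meta mirrors readability.js's metadata harvesting: it returns the content attribute of the first meta tag whose name contains the given key. A self-contained sketch of that lookup against the libxml crate, using the same XPath shape the module relies on:

    use libxml::parser::Parser;
    use libxml::xpath::Context;

    fn main() {
        let html = r#"<html><head><meta name="og:title" content="Hello"/></head><body></body></html>"#;
        let doc = Parser::default_html().parse_string(html).expect("parse");
        let ctx = Context::new(&doc).expect("xpath context");

        // Same XPath shape as get_meta: match on a substring of @name, read @content.
        let nodes = ctx
            .evaluate("//meta[contains(@name, 'og:title')]")
            .expect("evaluate")
            .get_nodes_as_vec();
        let title = nodes.iter().find_map(|n| n.get_attribute("content"));
        assert_eq!(title.as_deref(), Some("Hello"));
    }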
src/full_text_parser/mod.rs
@@ -1,15 +1,18 @@
 pub mod config;
 pub mod error;
 mod fingerprints;
+mod readability;
+mod metadata;

 #[cfg(test)]
 mod tests;

 use self::config::{ConfigCollection, ConfigEntry};
 use self::error::FullTextParserError;
+use self::readability::Readability;
 use crate::article::Article;
 use crate::util::Util;
-use chrono::DateTime;
 use encoding_rs::Encoding;
 use fingerprints::Fingerprints;
 use libxml::parser::Parser;
@@ -19,7 +22,7 @@ use log::{debug, error, info, warn};
 use reqwest::header::HeaderMap;
 use reqwest::Client;
 use std::path::Path;
-use std::str::{from_utf8, FromStr};
+use std::str::from_utf8;

 pub struct FullTextParser {
     config_files: ConfigCollection,
@@ -154,7 +157,7 @@ impl FullTextParser {
             // parse again with single page url
             debug!("Single page link found '{}'", single_page_url);

-            return self
+            if let Err(error) = self
                 .parse_single_page(
                     article,
                     &single_page_url,
@@ -163,16 +166,27 @@
                     global_config,
                     client,
                 )
-                .await;
+                .await
+            {
+                log::warn!("Single Page parsing: {}", error);
+                log::debug!("Continuing with regular parser.");
+            }
         }

-        Self::extract_metadata(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, global_config, article);
         if article.thumbnail_url.is_none() {
             Self::check_for_thumbnail(&xpath_ctx, article);
         }
         Self::strip_junk(&xpath_ctx, config, global_config, url);
-        Self::extract_body(&xpath_ctx, root, config, global_config)?;
+        let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
+
+        if found_body {
+            if let Err(error) = Readability::extract_body_readability(&document, root) {
+                log::error!("Both ftr and readability failed to find content: {}", error);
+                return Err(error);
+            }
+        }

         while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
             let headers = Util::generate_headers(config, global_config)?;
@@ -232,7 +246,7 @@ impl FullTextParser {
             let html = Self::download(url, client, headers).await?;
             let document = Self::parse_html(&html, config, global_config)?;
             let xpath_ctx = Self::get_xpath_ctx(&document)?;
-            Self::extract_metadata(&xpath_ctx, config, global_config, article);
+            metadata::extract(&xpath_ctx, config, global_config, article);
             Self::check_for_thumbnail(&xpath_ctx, article);
             Self::strip_junk(&xpath_ctx, config, global_config, url);
             Self::extract_body(&xpath_ctx, root, config, global_config)?;
@@ -363,7 +377,7 @@ impl FullTextParser {
     }

     fn check_for_thumbnail(context: &Context, article: &mut Article) {
-        if let Ok(thumb) = Self::get_attribute(
+        if let Ok(thumb) = Util::get_attribute(
             context,
             "//meta[contains(@name, 'twitter:image')]",
             "content",
@@ -373,14 +387,14 @@ impl FullTextParser {
         }

         if let Ok(thumb) =
-            Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
+            Util::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
         {
             article.thumbnail_url = Some(thumb);
             return;
         }

         if let Ok(thumb) =
-            Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
+            Util::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
         {
             article.thumbnail_url = Some(thumb);
         }
@@ -472,17 +486,6 @@ impl FullTextParser {
         Ok(())
     }

-    fn get_attribute(
-        context: &Context,
-        xpath: &str,
-        attribute: &str,
-    ) -> Result<String, FullTextParserError> {
-        Util::evaluate_xpath(context, xpath, false)?
-            .iter()
-            .find_map(|node| node.get_attribute(attribute))
-            .ok_or(FullTextParserError::Xml)
-    }
-
     fn repair_urls(
         context: &Context,
         xpath: &str,
@@ -612,90 +615,12 @@ impl FullTextParser {
         let _ = Util::strip_node(context, "//*[@type='text/css']");
     }

-    fn extract_metadata(
-        context: &Context,
-        config: Option<&ConfigEntry>,
-        global_config: &ConfigEntry,
-        article: &mut Article,
-    ) {
-        // try to get title
-        if let Some(config) = config {
-            for xpath_title in &config.xpath_title {
-                if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-                    debug!("Article title: '{}'", title);
-                    article.title = Some(title);
-                    break;
-                }
-            }
-        }
-
-        if article.title.is_none() {
-            for xpath_title in &global_config.xpath_title {
-                if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-                    debug!("Article title: '{}'", title);
-                    article.title = Some(title);
-                    break;
-                }
-            }
-        }
-
-        // try to get the author
-        if let Some(config) = config {
-            for xpath_author in &config.xpath_author {
-                if let Ok(author) = Util::extract_value(context, xpath_author) {
-                    debug!("Article author: '{}'", author);
-                    article.author = Some(author);
-                    break;
-                }
-            }
-        }
-
-        if article.author.is_none() {
-            for xpath_author in &global_config.xpath_author {
-                if let Ok(author) = Util::extract_value(context, xpath_author) {
-                    debug!("Article author: '{}'", author);
-                    article.author = Some(author);
-                    break;
-                }
-            }
-        }
-
-        // try to get the date
-        if let Some(config) = config {
-            for xpath_date in &config.xpath_date {
-                if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-                    debug!("Article date: '{}'", date_string);
-                    if let Ok(date) = DateTime::from_str(&date_string) {
-                        article.date = Some(date);
-                        break;
-                    } else {
-                        warn!("Parsing the date string '{}' failed", date_string);
-                    }
-                }
-            }
-        }
-
-        if article.date.is_none() {
-            for xpath_date in &global_config.xpath_date {
-                if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-                    debug!("Article date: '{}'", date_string);
-                    if let Ok(date) = DateTime::from_str(&date_string) {
-                        article.date = Some(date);
-                        break;
-                    } else {
-                        warn!("Parsing the date string '{}' failed", date_string);
-                    }
-                }
-            }
-        }
-    }
-
     fn extract_body(
         context: &Context,
         root: &mut Node,
         config: Option<&ConfigEntry>,
         global_config: &ConfigEntry,
-    ) -> Result<(), FullTextParserError> {
+    ) -> Result<bool, FullTextParserError> {
         let mut found_something = false;

         if let Some(config) = config {
@@ -712,10 +637,9 @@ impl FullTextParser {

         if !found_something {
             log::error!("no body found");
-            return Err(FullTextParserError::Scrape);
         }

-        Ok(())
+        Ok(found_something)
     }

     fn extract_body_single(
@@ -752,7 +676,7 @@ impl FullTextParser {
     ) -> Option<url::Url> {
         if let Some(config) = config {
             if let Some(next_page_xpath) = config.next_page_link.as_deref() {
-                if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href")
+                if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href")
                 {
                     if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                         return Some(next_page_url);
@@ -760,7 +684,7 @@ impl FullTextParser {
                 }
             }
         } else if let Some(next_page_xpath) = global_config.next_page_link.as_deref() {
-            if let Ok(next_page_string) = Self::get_attribute(context, next_page_xpath, "href") {
+            if let Ok(next_page_string) = Util::get_attribute(context, next_page_xpath, "href") {
                 if let Ok(next_page_url) = url::Url::parse(&next_page_string) {
                     return Some(next_page_url);
                 }
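The behavioral core of the commit: extract_body now reports whether any ftr rule matched instead of hard-failing, so the caller can chain the new readability pass behind it. A minimal sketch of that two-stage shape with hypothetical stand-in functions, not the crate's actual signatures:

    // Sketch of the new two-stage flow with hypothetical stand-ins.
    // Note: the committed guard reads `if found_body`; the error message suggests
    // the readability pass is meant for the not-found case, as sketched here.
    fn ftr_pass(_html: &str) -> bool {
        false // config-driven xpath extraction found nothing
    }

    fn readability_pass(_html: &str) -> Result<(), String> {
        Err("no content".into()) // generic heuristics also failed
    }

    fn extract(html: &str) -> Result<(), String> {
        let found_body = ftr_pass(html);
        if !found_body {
            readability_pass(html)
                .map_err(|e| format!("Both ftr and readability failed to find content: {e}"))?;
        }
        Ok(())
    }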
src/full_text_parser/readability/mod.rs (new file, 165 lines)
@@ -0,0 +1,165 @@
mod regex;
mod state;

use libxml::tree::{Document, Node};

use self::state::State;
use super::error::FullTextParserError;

pub struct Readability;

impl Readability {
    pub fn extract_body_readability(
        document: &Document,
        root: &mut Node,
    ) -> Result<bool, FullTextParserError> {
        let mut state = State::default();
        let mut node: Option<Node> = document.clone().get_root_element();

        while let Some(node_ref) = node.as_mut() {
            let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));

            if !Self::is_probably_visible(node_ref) {
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if Self::check_byline(node_ref, &match_string) {
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
                state.should_remove_title_header = false;
                node = Self::remove_and_next(node_ref);
                continue;
            }

            if state.strip_unlikely {

            }

            node = Self::next_node(node_ref, false);
        }

        unimplemented!()
    }

    fn is_probably_visible(node: &Node) -> bool {
        let display_none = node
            .get_attribute("display")
            .map(|display| display == "none")
            .unwrap_or(false);
        let is_hidden = node.has_attribute("hidden");
        let aria_hidden = node
            .get_attribute("aria-hidden")
            .map(|attr| attr == "true")
            .unwrap_or(false);
        let has_fallback_image = node.get_class_names().contains("fallback-image");

        !display_none && !is_hidden && !aria_hidden || has_fallback_image
    }

    fn remove_and_next(node: &mut Node) -> Option<Node> {
        let next_node = Self::next_node(node, true);
        node.unlink();
        return next_node;
    }

    fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
        // First check for kids if those aren't being ignored
        let first_child = node.get_first_child();
        if !ignore_self_and_kids && first_child.is_some() {
            return first_child;
        }

        // Then for siblings...
        let next_sibling = node.get_next_sibling();
        if next_sibling.is_some() {
            return next_sibling;
        }

        // And finally, move up the parent chain *and* find a sibling
        // (because this is depth-first traversal, we will have already
        // seen the parent nodes themselves).
        loop {
            let parent = node.get_parent();
            if parent.is_none() {
                break;
            }

            if let Some(parent) = parent {
                let next_sibling = parent.get_next_sibling();
                if next_sibling.is_some() {
                    return next_sibling;
                }
            }
        }

        None
    }

    fn check_byline(node: &Node, matchstring: &str) -> bool {
        let rel = node
            .get_attribute("rel")
            .map(|rel| rel == "author")
            .unwrap_or(false);
        let itemprop = node
            .get_attribute("itemprop")
            .map(|prop| prop.contains("author"))
            .unwrap_or(false);

        let content = node.get_content();
        if rel || itemprop || regex::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content) {
            // FIXME
            true
        } else {
            false
        }
    }

    // Check whether the input string could be a byline.
    // This verifies that the input length is less than 100 chars.
    fn is_valid_byline(line: &str) -> bool {
        let len = line.trim().len();
        len > 0 && len < 100
    }

    // Check if this node is an H1 or H2 element whose content is mostly
    // the same as the article title.
    fn header_duplicates_title(node: &Node) -> bool {
        let name = node.get_name().to_lowercase();
        if name != "h1" || name != "h2" {
            return false;
        }
        let heading = Self::get_inner_text(node, false);
        Self::text_similarity(&heading, "FIXME") > 0.75
    }

    fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
        let content = node.get_content().trim().to_owned();
        if normalize_spaces {
            regex::NORMALIZE.replace(&content, " ").into()
        } else {
            content
        }
    }

    fn text_similarity(a: &str, b: &str) -> f64 {
        let a = a.to_lowercase();
        let b = b.to_lowercase();
        let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
        let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
        if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 {
            return 0.0;
        }

        let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
        let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>();
        let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);

        let distance_b = uniq_tokens_b_total / tokens_b_total;
        1.0 - distance_b
    }
}
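text_similarity is the token-overlap measure from readability.js: both strings are tokenized on non-word characters, and the result is 1 minus the length-weighted share of b's tokens that never appear in a. A worked, hypothetical standalone version of the same computation:

    // Hypothetical standalone version of the same token-overlap measure.
    fn similarity(a: &str, b: &str) -> f64 {
        let (a, b) = (a.to_lowercase(), b.to_lowercase());
        let ta: Vec<&str> = a.split(|c: char| !c.is_alphanumeric()).filter(|t| !t.is_empty()).collect();
        let tb: Vec<&str> = b.split(|c: char| !c.is_alphanumeric()).filter(|t| !t.is_empty()).collect();
        if ta.is_empty() || tb.is_empty() {
            return 0.0;
        }
        // Weight tokens by length; measure how much of b is NOT covered by a.
        let total: f64 = tb.iter().map(|t| t.len() as f64).sum();
        let uniq: f64 = tb.iter().filter(|t| !ta.contains(t)).map(|t| t.len() as f64).sum();
        1.0 - uniq / total
    }

    fn main() {
        // "jumps" (5 of 18 weighted chars) is unique to b: 1.0 - 5/18 ≈ 0.72,
        // just below the 0.75 threshold header_duplicates_title uses.
        let s = similarity("The Quick Brown Fox", "quick brown fox jumps");
        assert!((s - 13.0 / 18.0).abs() < 1e-9);
    }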
src/full_text_parser/readability/regex.rs (new file, 12 lines)
@@ -0,0 +1,12 @@
use once_cell::sync::Lazy;
use regex::Regex;

pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
});
pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")
});
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex")
});
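As with the fingerprint patterns, these are JavaScript regex literals copied verbatim. The Rust regex crate has no /.../ delimiters, the g flag does not exist (replace_all and split already operate on all matches), and i is written as an inline (?i) flag. A sketch of the delimiter-free equivalents, an assumption about intent rather than the committed code:

    use once_cell::sync::Lazy;
    use regex::Regex;

    pub static BYLINE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("BYLINE regex"));
    pub static NORMALIZE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\s{2,}").expect("NORMALIZE regex"));
    pub static TOKENIZE: Lazy<Regex> =
        Lazy::new(|| Regex::new(r"\W+").expect("TOKENIZE regex"));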
src/full_text_parser/readability/state.rs (new file, 17 lines)
@@ -0,0 +1,17 @@
pub struct State {
    pub strip_unlikely: bool,
    pub weigh_classes: bool,
    pub clean_conditionally: bool,
    pub should_remove_title_header: bool,
}

impl Default for State {
    fn default() -> Self {
        Self {
            strip_unlikely: true,
            weigh_classes: true,
            clean_conditionally: true,
            should_remove_title_header: true,
        }
    }
}
src/images/mod.rs
@@ -36,7 +36,7 @@ impl ImageDownloader {
         doc: &Document,
         client: &Client,
     ) -> Result<String, ImageDownloadError> {
-        let xpath_ctx = Context::new(&doc).map_err(|()| {
+        let xpath_ctx = Context::new(doc).map_err(|()| {
             error!("Failed to create xpath context for document");
             ImageDownloadError::HtmlParse
         })?;
src/util.rs
@@ -145,6 +145,17 @@ impl Util {
         None
     }

+    pub fn get_attribute(
+        context: &Context,
+        xpath: &str,
+        attribute: &str,
+    ) -> Result<String, FullTextParserError> {
+        Util::evaluate_xpath(context, xpath, false)?
+            .iter()
+            .find_map(|node| node.get_attribute(attribute))
+            .ok_or(FullTextParserError::Xml)
+    }
+
     pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         if let Some(val) = node_vec.get(0) {
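get_attribute moves from FullTextParser into Util so the new metadata module can share it: it evaluates an XPath and returns the first matching node's attribute. A usage sketch mirroring the thumbnail lookup above (module paths assumed from the file layout in this commit):

    use libxml::xpath::Context;

    use crate::full_text_parser::error::FullTextParserError;
    use crate::util::Util;

    // Sketch: the same call check_for_thumbnail now makes through the shared helper.
    fn og_image(ctx: &Context) -> Result<String, FullTextParserError> {
        Util::get_attribute(ctx, "//meta[contains(@name, 'og:image')]", "content")
    }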