mirror of https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00

improve title extraction

parent cce912c354, commit 98c06e11f4
7 changed files with 107 additions and 54 deletions
```diff
@@ -11,7 +11,7 @@ pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
 });
 pub static NORMALIZE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex"));
+pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
 pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
     Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
 });
```
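The TOKENIZE change drops the JavaScript-style `/.../g` delimiters, which Rust's regex crate treats as literal characters rather than syntax (the same artifact still sits in the unchanged NORMALIZE pattern, and in the new TITLE_CUT_FRONT below). A minimal sketch of the difference, assuming the regex crate:

```rust
use regex::Regex;

fn main() {
    // old pattern: the leading "/" and trailing "/g" are literal characters here
    let js_style = Regex::new(r"/\W+/g").unwrap();
    // fixed pattern
    let tokenize = Regex::new(r"\W+").unwrap();

    // the old pattern can only match text that actually contains slashes
    assert!(!js_style.is_match("hello world"));
    // the fixed pattern splits on runs of non-word characters, as intended
    let tokens: Vec<&str> = tokenize.split("hello world").collect();
    assert_eq!(tokens, ["hello", "world"]);
}
```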
```diff
@@ -31,6 +31,14 @@ pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
 pub static NEGATIVE: Lazy<Regex> =
     Lazy::new(|| Regex::new(r#"/-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
 
+pub static TITLE_SEPARATOR: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"[-|\\/>»]"#).expect("TITLE_SEPARATOR regex"));
+pub static TITLE_CUT_END: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"(.*)[-|\\/>»] .*"#).expect("TITLE_CUT_END regex"));
+pub static WORD_COUNT: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s+"#).expect("WORD_COUNT regex"));
+pub static TITLE_CUT_FRONT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"/[^-|\\/>»]*[-|\\/>»](.*)/gi"#).expect("TITLE_CUT_FRONT regex"));
+
 pub const SCORE_ATTR: &str = "content_score";
 pub const MINIMUM_TOPCANDIDATES: usize = 3;
 pub const UNLIKELY_ROLES: &[&str] = &[
```
```diff
@@ -1,5 +1,5 @@
 use super::config::ConfigEntry;
-use crate::{article::Article, util::Util};
+use crate::{article::Article, constants, util::Util};
 use chrono::{DateTime, Utc};
 use libxml::xpath::Context;
 use log::{debug, warn};
```
```diff
@@ -8,16 +8,29 @@ use std::str::FromStr;
 pub fn extract(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
     article: &mut Article,
 ) {
     if article.title.is_none() {
-        article.title = extract_title(context, config, global_config).map(|title| {
-            match escaper::decode_html(&title) {
+        article.title = extract_title(context, config, global_config)
+            .map(|title| match escaper::decode_html(&title) {
                 Ok(escaped_title) => escaped_title,
                 Err(_error) => title,
-            }
-        });
+            })
+            .map(|title| {
+                // clean titles that contain separators
+                if constants::TITLE_SEPARATOR.is_match(&title) {
+                    let new_title = constants::TITLE_CUT_END.replace(&title, "$1");
+                    let word_count = constants::WORD_COUNT.split(&title).count();
+                    if word_count < 3 {
+                        constants::TITLE_CUT_FRONT.replace(&title, "$1").to_string()
+                    } else {
+                        new_title.to_string()
+                    }
+                } else {
+                    title
+                }
+            });
     }
 
     if article.author.is_none() {
```
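A standalone sketch, using the patterns copied from the new constants above, of what the added cleaning closure does to a typical "headline - site name" title:

```rust
use regex::Regex;

fn main() {
    let title_separator = Regex::new(r#"[-|\\/>»]"#).unwrap();
    let title_cut_end = Regex::new(r#"(.*)[-|\\/>»] .*"#).unwrap();
    let word_count = Regex::new(r#"\s+"#).unwrap();

    let title = "Improve title extraction - NewsFlash Blog";
    assert!(title_separator.is_match(title));

    // six whitespace-separated tokens, so the >= 3 branch keeps the part
    // before the last separator and cuts the trailing site name
    assert_eq!(word_count.split(title).count(), 6);
    let cleaned = title_cut_end.replace(title, "$1");
    assert_eq!(cleaned.trim(), "Improve title extraction");
}
```

Note that the committed closure counts words in the original title, not the shortened one, and only falls back to TITLE_CUT_FRONT when that count is under three; the fallback pattern still carries its JavaScript-style `/.../gi` delimiters.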
```diff
@@ -38,7 +51,7 @@ pub fn extract(
 fn extract_title(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
```
```diff
@@ -51,27 +64,30 @@ fn extract_title(
     }
 
     // check global config
-    for xpath_title in &global_config.xpath_title {
-        if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
-            debug!("Article title: '{}'", title);
-            return Some(title);
+    if let Some(global_config) = global_config {
+        for xpath_title in &global_config.xpath_title {
+            if let Ok(title) = Util::extract_value_merge(context, xpath_title) {
+                debug!("Article title: '{}'", title);
+                return Some(title);
+            }
         }
     }
 
     // generic meta (readablity)
-    get_meta(context, "dc:title")
+    Util::extract_value(context, "//title")
+        .ok()
+        .or_else(|| get_meta(context, "dc:title"))
         .or_else(|| get_meta(context, "dcterm:title"))
         .or_else(|| get_meta(context, "og:title"))
         .or_else(|| get_meta(context, "weibo:article:title"))
         .or_else(|| get_meta(context, "weibo:webpage:title"))
-        .or_else(|| get_meta(context, "title"))
         .or_else(|| get_meta(context, "twitter:title"))
 }
 
 fn extract_author(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
```
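extract_title now consults the document's own `<title>` element before the meta-tag chain, and the overly generic `get_meta(context, "title")` fallback is gone. A self-contained sketch of that first lookup, done here with the libxml crate directly instead of the crate-internal Util::extract_value:

```rust
use libxml::{parser::Parser, xpath::Context};

fn main() {
    let html = r#"<html><head>
        <title>From the title tag</title>
        <meta property="og:title" content="From og:title"/>
    </head><body></body></html>"#;

    let document = Parser::default_html().parse_string(html).unwrap();
    let context = Context::new(&document).unwrap();

    // the same "//title" XPath the diff passes to Util::extract_value
    let title = context
        .evaluate("//title")
        .ok()
        .and_then(|result| result.get_nodes_as_vec().first().map(|n| n.get_content()));

    // the <title> element wins; the meta tags are only consulted afterwards
    assert_eq!(title.as_deref(), Some("From the title tag"));
}
```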
```diff
@@ -84,23 +100,26 @@ fn extract_author(
     }
 
     // check global config
-    for xpath_author in &global_config.xpath_author {
-        if let Ok(author) = Util::extract_value(context, xpath_author) {
-            debug!("Article author: '{}'", author);
-            return Some(author);
+    if let Some(global_config) = global_config {
+        for xpath_author in &global_config.xpath_author {
+            if let Ok(author) = Util::extract_value(context, xpath_author) {
+                debug!("Article author: '{}'", author);
+                return Some(author);
+            }
         }
     }
 
     // generic meta (readablity)
-    get_meta(context, "dc:creator")
+    Util::extract_value(context, "//author")
+        .ok()
+        .or_else(|| get_meta(context, "dc:creator"))
         .or_else(|| get_meta(context, "dcterm:creator"))
-        .or_else(|| get_meta(context, "author"))
 }
 
 fn extract_date(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry,
+    global_config: Option<&ConfigEntry>,
 ) -> Option<DateTime<Utc>> {
     // check site specific config
     if let Some(config) = config {
```
```diff
@@ -117,13 +136,15 @@ fn extract_date(
     }
 
     // check global config
-    for xpath_date in &global_config.xpath_date {
-        if let Ok(date_string) = Util::extract_value(context, xpath_date) {
-            debug!("Article date: '{}'", date_string);
-            if let Ok(date) = DateTime::from_str(&date_string) {
-                return Some(date);
-            } else {
-                warn!("Parsing the date string '{}' failed", date_string);
+    if let Some(global_config) = global_config {
+        for xpath_date in &global_config.xpath_date {
+            if let Ok(date_string) = Util::extract_value(context, xpath_date) {
+                debug!("Article date: '{}'", date_string);
+                if let Ok(date) = DateTime::from_str(&date_string) {
+                    return Some(date);
+                } else {
+                    warn!("Parsing the date string '{}' failed", date_string);
+                }
             }
         }
     }
```
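The date branch only gains the Option wrapper; parsing is still `DateTime::from_str`, with failures downgraded to a warning. A quick sketch of what chrono's FromStr impl for `DateTime<Utc>` accepts:

```rust
use chrono::{DateTime, Utc};
use std::str::FromStr;

fn main() {
    // RFC 3339 / ISO 8601 timestamps parse cleanly...
    assert!(DateTime::<Utc>::from_str("2023-01-07T12:30:00Z").is_ok());

    // ...anything else takes the warn!() branch in extract_date
    assert!(DateTime::<Utc>::from_str("7 Jan 2023").is_err());
}
```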
```diff
@@ -174,7 +174,7 @@ impl FullTextParser {
             }
         }
 
-        metadata::extract(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, Some(global_config), article);
         if article.thumbnail_url.is_none() {
             Self::check_for_thumbnail(&xpath_ctx, article);
         }
```
```diff
@@ -182,7 +182,8 @@ impl FullTextParser {
         let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;

         if !found_body {
-            if let Err(error) = Readability::extract_body(document, root) {
+            if let Err(error) = Readability::extract_body(document, root, article.title.as_deref())
+            {
                 log::error!("Both ftr and readability failed to find content: {}", error);
                 return Err(error);
             }
```
```diff
@@ -246,7 +247,7 @@ impl FullTextParser {
         let html = Self::download(url, client, headers).await?;
         let document = Self::parse_html(&html, config, global_config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
-        metadata::extract(&xpath_ctx, config, global_config, article);
+        metadata::extract(&xpath_ctx, config, Some(global_config), article);
         Self::check_for_thumbnail(&xpath_ctx, article);
         Self::strip_junk(&xpath_ctx, config, global_config, url);
         Self::extract_body(&xpath_ctx, root, config, global_config)?;
```
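Both parser call sites wrap their always-present global config in `Some(...)`; the point of loosening metadata::extract to `Option<&ConfigEntry>` is that the readability test further down can now call it with no config at all. A minimal sketch of the pattern, with hypothetical stand-in types:

```rust
// hypothetical stand-in for the real ConfigEntry
struct ConfigEntry {
    xpath_title: Vec<String>,
}

fn extract_title(global_config: Option<&ConfigEntry>) -> Option<String> {
    // with None the global-config loop is skipped entirely, leaving only
    // the generic fallbacks (elided here)
    if let Some(global_config) = global_config {
        for xpath_title in &global_config.xpath_title {
            if xpath_title.as_str() == "//h1" {
                return Some("found via global config".into());
            }
        }
    }
    None
}

fn main() {
    let global = ConfigEntry {
        xpath_title: vec!["//h1".into()],
    };
    assert!(extract_title(Some(&global)).is_some()); // parser call sites
    assert!(extract_title(None).is_none()); // test call sites
}
```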
```diff
@@ -1,4 +1,3 @@
-mod constants;
 mod state;
 
 #[cfg(test)]
```
```diff
@@ -10,11 +9,16 @@ use libxml::tree::{node, Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
+use crate::constants;
 
 pub struct Readability;
 
 impl Readability {
-    pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
+    pub fn extract_body(
+        document: Document,
+        root: &mut Node,
+        title: Option<&str>,
+    ) -> Result<bool, FullTextParserError> {
         node::set_node_rc_guard(6);
 
         let mut state = State::default();
```
```diff
@@ -49,7 +53,9 @@ impl Readability {
                 continue;
             }
 
-            if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
+            if state.should_remove_title_header
+                && Self::header_duplicates_title(node_ref, title)
+            {
                 state.should_remove_title_header = false;
                 node = Self::remove_and_next(node_ref);
                 continue;
```
```diff
@@ -278,7 +284,8 @@ impl Readability {
                         constants::MINIMUM_TOPCANDIDATES,
                     );
                     for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
-                        lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
+                        lists_containing_this_ancestor +=
+                            if ancestor == parent { 1 } else { 0 };
                     }
 
                     if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
```
```diff
@@ -668,13 +675,18 @@ impl Readability {
 
     // Check if this node is an H1 or H2 element whose content is mostly
     // the same as the article title.
-    fn header_duplicates_title(node: &Node) -> bool {
+    fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
         let name = node.get_name().to_lowercase();
         if name != "h1" && name != "h2" {
             return false;
         }
         let heading = Self::get_inner_text(node, false);
-        Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
+
+        if let Some(title) = title {
+            Self::text_similarity(&heading, title) > 0.75
+        } else {
+            false
+        }
     }
 
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
```
```diff
@@ -695,18 +707,12 @@ impl Readability {
             return 0.0;
         }
 
-        let tokens_b_total: f64 = tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let tokens_b_total = tokens_b.join(" ").len() as f64;
         let uniq_tokens_b = tokens_b
             .into_iter()
             .filter(|token| !tokens_a.iter().any(|t| t == token))
             .collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
 
         let distance_b = uniq_tokens_b_total / tokens_b_total;
         1.0 - distance_b
```
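Two things changed in this file's logic: header_duplicates_title now compares headings against the actual extracted title (returning false when none is known) instead of the hardcoded "Get your Frontend JavaScript Code Covered" string left over from a test fixture, and text_similarity now measures token mass via `join(" ").len()`, so separator spaces count toward both totals. A self-contained sketch of the refactored similarity measure, with TOKENIZE approximated by a split on non-alphanumeric characters:

```rust
fn text_similarity(a: &str, b: &str) -> f64 {
    // rough stand-in for the TOKENIZE regex (\W+)
    let tokenize = |s: &str| -> Vec<String> {
        s.to_lowercase()
            .split(|c: char| !c.is_alphanumeric())
            .filter(|t| !t.is_empty())
            .map(|t| t.to_string())
            .collect()
    };

    let tokens_a = tokenize(a);
    let tokens_b = tokenize(b);
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    // total character mass of b, separator spaces included
    let tokens_b_total = tokens_b.join(" ").len() as f64;
    let uniq_tokens_b: Vec<_> = tokens_b
        .into_iter()
        .filter(|token| !tokens_a.iter().any(|t| t == token))
        .collect();
    // character mass of b's tokens that never appear in a
    let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;

    1.0 - uniq_tokens_b_total / tokens_b_total
}

fn main() {
    let sim = text_similarity(
        "Improve title extraction",
        "Improve title extraction in article scraper",
    );
    // clearly similar, but below the 0.75 header-removal threshold
    assert!(sim > 0.5 && sim < 0.75);
}
```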
```diff
@@ -1,17 +1,30 @@
-use libxml::tree::{Document, Node};
+use libxml::{
+    tree::{Document, Node},
+    xpath::Context,
+};
 use reqwest::Url;
 
-use crate::full_text_parser::config::ConfigEntry;
+use crate::{
+    article::Article,
+    full_text_parser::{config::ConfigEntry, metadata},
+};
 
-async fn prepare(html: &str, url: &Url) -> Document {
+async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
     let empty_config = ConfigEntry::default();
     let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
     let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
     crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
-    document
+
+    let article = Article {
+        title: None,
+        author: None,
+        url: url.clone(),
+        date: None,
+        thumbnail_url: None,
+        document: None,
+    };
+
+    (document, xpath_ctx, article)
 }
 
 
 #[tokio::test]
 async fn test_1() {
     let _ = env_logger::builder().is_test(true).try_init();
```
```diff
@@ -19,9 +32,11 @@ async fn test_1() {
     let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
         .expect("Failed to read HTML");
     let url = Url::parse("http://google.com").unwrap();
-    let document = prepare(&html, &url).await;
+    let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
 
     let mut root = Node::new("article", None, &document).unwrap();
 
-    super::Readability::extract_body(document, &mut root).unwrap();
+    metadata::extract(&xpath_ctx, None, None, &mut article);
+
+    super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
 }
```
```diff
@@ -145,7 +145,8 @@ impl ImageDownloader {
         }
 
         let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
-        let big_image_base64 = big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
+        let big_image_base64 =
+            big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
         let small_image_string =
             format!("data:{};base64,{}", content_type_small, small_image_base64);
         let big_image_string = match big_image_base64 {
```
```diff
@@ -1,4 +1,5 @@
 mod article;
+mod constants;
 mod error;
 mod full_text_parser;
 pub mod images;
```