mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-09 17:05:30 +02:00)

improve title extraction

parent cce912c354 · commit 98c06e11f4
7 changed files with 107 additions and 54 deletions
@@ -1,71 +0,0 @@
-use std::collections::HashSet;
-
-use once_cell::sync::Lazy;
-use regex::Regex;
-
-pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
-pub static SIBLING_CONTENT: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
-pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
-});
-pub static NORMALIZE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex"));
-pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
-});
-pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/and|article|body|column|content|main|shadow/i"#)
-        .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
-});
-pub static HAS_CONTENT: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
-pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
-pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(
-        r#"/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i"#,
-    )
-    .expect("POSITIVE regex")
-});
-pub static NEGATIVE: Lazy<Regex> =
-    Lazy::new(|| Regex::new(r#"/-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
-
-pub const SCORE_ATTR: &str = "content_score";
-pub const MINIMUM_TOPCANDIDATES: usize = 3;
-pub const UNLIKELY_ROLES: &[&str] = &[
-    "menu",
-    "menubar",
-    "complementary",
-    "navigation",
-    "alert",
-    "alertdialog",
-    "dialog",
-];
-
-pub const DEFAULT_TAGS_TO_SCORE: &[&str] =
-    &["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"];
-pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
-    HashSet::from([
-        "BLOCKQUOTE",
-        "DL",
-        "DIV",
-        "IMG",
-        "OL",
-        "P",
-        "PRE",
-        "TABLE",
-        "UL",
-    ])
-});
-
-pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> =
-    Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
-
-pub const PHRASING_ELEMS: &[&str] = &[
-    // "CANVAS", "IFRAME", "SVG", "VIDEO",
-    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
-    "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT",
-    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
-    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
-];
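One thing to note when reading the removed constants: the patterns keep JavaScript regex-literal syntax (`/…/` delimiters and trailing `i`/`g` flags) inside `Regex::new`, which Rust's `regex` crate treats as literal characters rather than as delimiters or flags. A minimal sketch of what a case-insensitive variant looks like in idiomatic Rust, using the crate's inline `(?i)` flag (the `BYLINE_FIXED` name is illustrative, not from the commit):

```rust
use once_cell::sync::Lazy;
use regex::Regex;

// Hypothetical rewrite of BYLINE: `(?i)` replaces the JavaScript `/i`
// flag, and the slash delimiters are dropped. No `/g` equivalent is
// needed; `is_match` and `find_iter` already scan the whole input.
static BYLINE_FIXED: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i)byline|author|dateline|writtenby|p-author").expect("BYLINE regex")
});

fn main() {
    // The idiomatic form matches regardless of case.
    assert!(BYLINE_FIXED.is_match("class=\"Byline\""));
    // The committed form treats `/` and `i` as literal characters, so its
    // first alternative is really "/byline" and its last is "p-author/i".
    let committed = Regex::new(r"/byline|author|dateline|writtenby|p-author/i").unwrap();
    assert!(!committed.is_match("class=\"Byline\""));
}
```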
@@ -1,4 +1,3 @@
-mod constants;
 mod state;
 
 #[cfg(test)]
@@ -10,11 +9,16 @@ use libxml::tree::{node, Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
+use crate::constants;
 
 pub struct Readability;
 
 impl Readability {
-    pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
+    pub fn extract_body(
+        document: Document,
+        root: &mut Node,
+        title: Option<&str>,
+    ) -> Result<bool, FullTextParserError> {
         node::set_node_rc_guard(6);
 
         let mut state = State::default();
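The visible API change is the third `extract_body` parameter. A caller with no title simply passes `None`, which (per the `header_duplicates_title` hunk further down) skips duplicate-header removal instead of comparing against a hard-coded string. An illustrative call, assuming a `document` and `root` prepared as in the tests below:

```rust
// Title threaded through from metadata extraction; pass None when the
// page yielded no title. Sketch only: `document` and `root` are assumed
// to exist, and `extract_body` consumes the document.
let found = Readability::extract_body(document, &mut root, Some("My Great Article"))?;
```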
@@ -49,7 +53,9 @@ impl Readability {
                 continue;
             }
 
-            if state.should_remove_title_header && Self::header_duplicates_title(node_ref) {
+            if state.should_remove_title_header
+                && Self::header_duplicates_title(node_ref, title)
+            {
                 state.should_remove_title_header = false;
                 node = Self::remove_and_next(node_ref);
                 continue;
@@ -278,7 +284,8 @@ impl Readability {
                         constants::MINIMUM_TOPCANDIDATES,
                     );
                     for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
-                        lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
+                        lists_containing_this_ancestor +=
+                            if ancestor == parent { 1 } else { 0 };
                     }
 
                     if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
@@ -668,13 +675,18 @@ impl Readability {
 
     // Check if this node is an H1 or H2 element whose content is mostly
    // the same as the article title.
-    fn header_duplicates_title(node: &Node) -> bool {
+    fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
         let name = node.get_name().to_lowercase();
         if name != "h1" && name != "h2" {
             return false;
         }
         let heading = Self::get_inner_text(node, false);
-        Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
+
+        if let Some(title) = title {
+            Self::text_similarity(&heading, title) > 0.75
+        } else {
+            false
+        }
     }
 
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
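The string the old code compared against ("Get your Frontend JavaScript Code Covered") looks like a leftover fixture title; the heading check only makes sense against the title actually extracted for the page at hand. A self-contained sketch of the whole comparison, assuming the tokenizer splits on non-word characters (as the removed TOKENIZE constant suggests) and measuring similarity as the character share of title tokens that also appear in the heading:

```rust
// Minimal sketch, not the crate's actual implementation: tokenization,
// similarity, and the new Option-aware duplicate check in one place.
fn tokenize(text: &str) -> Vec<String> {
    text.to_lowercase()
        .split(|c: char| !c.is_alphanumeric())
        .filter(|t| !t.is_empty())
        .map(str::to_owned)
        .collect()
}

fn text_similarity(text_a: &str, text_b: &str) -> f64 {
    let tokens_a = tokenize(text_a);
    let tokens_b = tokenize(text_b);
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }
    // Same shape as the patched code: character length of the joined
    // token list, before and after removing tokens shared with `text_a`.
    let tokens_b_total = tokens_b.join(" ").len() as f64;
    let uniq_tokens_b: Vec<String> = tokens_b
        .into_iter()
        .filter(|token| !tokens_a.contains(token))
        .collect();
    let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
    1.0 - uniq_tokens_b_total / tokens_b_total
}

fn header_duplicates_title(heading: &str, title: Option<&str>) -> bool {
    // With no extracted title there is nothing to compare against,
    // so the heading is never treated as a duplicate.
    title.map_or(false, |title| text_similarity(heading, title) > 0.75)
}

fn main() {
    assert!(header_duplicates_title("My Great Article", Some("My Great Article (2023)")));
    assert!(!header_duplicates_title("Related Posts", Some("My Great Article")));
    assert!(!header_duplicates_title("My Great Article", None));
}
```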
@@ -695,18 +707,12 @@ impl Readability {
             return 0.0;
         }
 
-        let tokens_b_total: f64 = tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let tokens_b_total = tokens_b.join(" ").len() as f64;
         let uniq_tokens_b = tokens_b
             .into_iter()
             .filter(|token| !tokens_a.iter().any(|t| t == token))
             .collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b
-            .iter()
-            .map(|t| t.len())
-            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
 
         let distance_b = uniq_tokens_b_total / tokens_b_total;
         1.0 - distance_b
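The `fold` over token lengths and the `join(" ").len()` rewrite are not byte-for-byte identical: the joined string also counts the n−1 separator spaces. Since numerator and denominator are computed the same way, the ratio only shifts slightly; the rewrite mostly buys simpler code. A quick check of the two totals:

```rust
fn main() {
    let tokens_b = ["get", "your", "code"];
    // Old total: sum of token lengths only.
    let fold_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
    // New total: length of the joined string, separators included.
    let join_total = tokens_b.join(" ").len() as f64;
    assert_eq!(fold_total, 11.0); // 3 + 4 + 4
    assert_eq!(join_total, 13.0); // 11 characters + 2 spaces
}
```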
@@ -1,17 +1,30 @@
-use libxml::tree::{Document, Node};
+use libxml::{
+    tree::{Document, Node},
+    xpath::Context,
+};
 use reqwest::Url;
 
-use crate::full_text_parser::config::ConfigEntry;
+use crate::{
+    article::Article,
+    full_text_parser::{config::ConfigEntry, metadata},
+};
 
-async fn prepare(html: &str, url: &Url) -> Document {
+async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
     let empty_config = ConfigEntry::default();
     let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
     let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
     crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
-    document
+    let article = Article {
+        title: None,
+        author: None,
+        url: url.clone(),
+        date: None,
+        thumbnail_url: None,
+        document: None,
+    };
+    (document, xpath_ctx, article)
 }
 
 
 #[tokio::test]
 async fn test_1() {
     let _ = env_logger::builder().is_test(true).try_init();
@@ -19,9 +32,11 @@ async fn test_1() {
     let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
         .expect("Failed to read HTML");
     let url = Url::parse("http://google.com").unwrap();
-    let document = prepare(&html, &url).await;
+    let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
 
     let mut root = Node::new("article", None, &document).unwrap();
 
-    super::Readability::extract_body(document, &mut root).unwrap();
+    metadata::extract(&xpath_ctx, None, None, &mut article);
+
+    super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
 }
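The `as_deref()` on the last changed line is the glue between the two halves of the commit: metadata extraction presumably stores the title as `Option<String>` on `Article`, while the new `extract_body` parameter borrows it as `Option<&str>`. A minimal illustration of that conversion:

```rust
fn main() {
    let title: Option<String> = Some("My Great Article".to_owned());
    // Option<String> -> Option<&str>, borrowing instead of moving.
    let borrowed: Option<&str> = title.as_deref();
    assert_eq!(borrowed, Some("My Great Article"));
}
```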