mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-07 16:15:32 +02:00)

Commit 979358fd35 ("more"), parent 2750ad648d
6 changed files with 318 additions and 51 deletions
src/full_text_parser/metadata.rs
@@ -1,9 +1,9 @@
+use super::config::ConfigEntry;
+use crate::{article::Article, util::Util};
 use chrono::{DateTime, Utc};
 use libxml::xpath::Context;
 use log::{debug, warn};
 use std::str::FromStr;
-use crate::{article::Article, util::Util};
-use super::config::ConfigEntry;
 
 pub fn extract(
     context: &Context,
@@ -11,19 +11,23 @@ pub fn extract(
     global_config: &ConfigEntry,
     article: &mut Article,
 ) {
     if article.title.is_none() {
-        article.title = extract_title(context, config, global_config).and_then(|title| Some(match escaper::decode_html(&title) {
+        article.title = extract_title(context, config, global_config).map(|title| {
+            match escaper::decode_html(&title) {
                 Ok(escaped_title) => escaped_title,
                 Err(_error) => title,
-        }));
+            }
+        });
     }
 
     if article.author.is_none() {
-        article.author = extract_author(context, config, global_config).and_then(|author| Some(match escaper::decode_html(&author) {
+        article.author =
+            extract_author(context, config, global_config).map(
+                |author| match escaper::decode_html(&author) {
                     Ok(escaped_author) => escaped_author,
                     Err(_error) => author,
-        }));
+                },
+            );
     }
 
     if article.date.is_none() {
@@ -34,7 +38,7 @@
 fn extract_title(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -67,7 +71,7 @@ fn extract_title(
 fn extract_author(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<String> {
     // check site specific config
     if let Some(config) = config {
@@ -96,7 +100,7 @@ fn extract_author(
 fn extract_date(
     context: &Context,
     config: Option<&ConfigEntry>,
-    global_config: &ConfigEntry
+    global_config: &ConfigEntry,
 ) -> Option<DateTime<Utc>> {
     // check site specific config
     if let Some(config) = config {
@@ -128,5 +132,10 @@ fn extract_date(
 }
 
 fn get_meta(context: &Context, name: &str) -> Option<String> {
-    Util::get_attribute(context, &format!("//meta[contains(@name, '{}')]", name), "content").ok()
+    Util::get_attribute(
+        context,
+        &format!("//meta[contains(@name, '{}')]", name),
+        "content",
+    )
+    .ok()
 }
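The title/author changes above are behavior-preserving: Option::and_then with a closure that always returns Some is exactly Option::map (clippy even lints the old shape as bind_instead_of_map). A minimal standalone sketch of the equivalence, with illustrative values not taken from the crate:

fn main() {
    let title: Option<String> = Some("Foo &amp; Bar".to_string());

    // Old shape: and_then must return an Option, so the closure wraps in Some(...).
    let via_and_then = title.clone().and_then(|t| Some(t.to_uppercase()));

    // New shape: map expresses "transform the value if present" directly.
    let via_map = title.map(|t| t.to_uppercase());

    assert_eq!(via_and_then, via_map);
}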

src/full_text_parser/mod.rs
@@ -1,8 +1,8 @@
 pub mod config;
 pub mod error;
 mod fingerprints;
-mod readability;
 mod metadata;
+mod readability;
 
 #[cfg(test)]
 mod tests;

src/full_text_parser/readability/constants.rs (new file)
@@ -0,0 +1,40 @@
+use once_cell::sync::Lazy;
+use regex::Regex;
+
+pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
+});
+pub static NORMALIZE: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
+pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex"));
+pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i"#).expect("UNLIELY_CANDIDATES regex")
+});
+pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
+    Regex::new(r#"/and|article|body|column|content|main|shadow/i"#)
+        .expect("OKAY_MAYBE_ITS_A_CANDIDATE regex")
+});
+pub static HAS_CONTENT: Lazy<Regex> =
+    Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
+pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
+
+pub const UNLIKELY_ROLES: &[&str] = &[
+    "menu",
+    "menubar",
+    "complementary",
+    "navigation",
+    "alert",
+    "alertdialog",
+    "dialog",
+];
+
+pub const DEFAULT_TAGS_TO_SCORE: &[&str] =
+    &["SECTION", "H2", "H3", "H4", "H5", "H6", "P", "TD", "PRE"];
+
+pub const PHRASING_ELEMS: &[&str] = &[
+    // "CANVAS", "IFRAME", "SVG", "VIDEO",
+    "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA", "DATALIST", "DFN", "EM",
+    "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL", "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT",
+    "OUTPUT", "PROGRESS", "Q", "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG",
+    "SUB", "SUP", "TEXTAREA", "TIME", "VAR", "WBR",
+];
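
A caution for anyone reusing these constants: the pattern strings carry JavaScript-style /…/i delimiters over verbatim, but Rust's regex crate treats the slashes as literal characters and does not parse trailing flags, so as committed BYLINE will not match a plain "Author" class. In the regex crate, case-insensitivity is spelled inline, and /g has no meaning either: "global" behavior is decided by calling replace_all instead of replace (get_inner_text below uses replace, which rewrites only the first match). A hypothetical corrected BYLINE, my illustration rather than part of this commit:

use once_cell::sync::Lazy;
use regex::Regex;

// (?i:...) scopes case-insensitive matching to the whole alternation;
// no surrounding slashes, no trailing flags.
static BYLINE: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?i:byline|author|dateline|writtenby|p-author)").expect("BYLINE regex")
});

fn main() {
    assert!(BYLINE.is_match("ArticleAuthor")); // matches regardless of case
}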

src/full_text_parser/readability/mod.rs
@@ -1,7 +1,7 @@
-mod regex;
+mod constants;
 mod state;
 
-use libxml::tree::{Document, Node};
+use libxml::tree::{Document, Node, NodeType};
 
 use self::state::State;
 use super::error::FullTextParserError;
@@ -11,21 +11,29 @@ pub struct Readability;
 impl Readability {
     pub fn extract_body_readability(
         document: &Document,
-        root: &mut Node,
+        _root: &mut Node,
     ) -> Result<bool, FullTextParserError> {
         let mut state = State::default();
+        let mut elements_to_score = Vec::new();
         let mut node: Option<Node> = document.clone().get_root_element();
 
         while let Some(node_ref) = node.as_mut() {
-            let match_string = node_ref.get_class_names().iter().fold(String::new(), |a, b| format!("{a} {b}"));
+            let tag_name = node_ref.get_name().to_uppercase();
+            let match_string = node_ref
+                .get_class_names()
+                .iter()
+                .fold(String::new(), |a, b| format!("{a} {b}"));
+            let match_string = match node_ref.get_property("id") {
+                Some(id) => format!("{match_string} {id}"),
+                None => match_string,
+            };
 
             if !Self::is_probably_visible(node_ref) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
 
-            if Self::check_byline(node_ref, &match_string) {
+            if Self::check_byline(node_ref, &match_string, &mut state) {
                 node = Self::remove_and_next(node_ref);
                 continue;
             }
@@ -36,8 +44,78 @@
                 continue;
             }
 
+            // Remove unlikely candidates
+            if state.strip_unlikely {
+                if constants::UNLIELY_CANDIDATES.is_match(&match_string)
+                    && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
+                    && !Self::has_ancestor_tag(node_ref, "table", None)
+                    && !Self::has_ancestor_tag(node_ref, "code", None)
+                    && tag_name != "BODY"
+                    && tag_name != "A"
+                {
+                    node = Self::remove_and_next(node_ref);
+                    continue;
+                }
+
+                if let Some(role) = node_ref.get_attribute("role") {
+                    if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
+                        node = Self::remove_and_next(node_ref);
+                        continue;
+                    }
+                }
+            }
+
+            // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
+            if tag_name == "DIV"
+                || tag_name == "SECTION"
+                || tag_name == "HEADER"
+                || tag_name == "H1"
+                || tag_name == "H2"
+                || tag_name == "H3"
+                || tag_name == "H4"
+                || tag_name == "H5"
+                || tag_name == "H6" && Self::is_element_without_content(node_ref)
+            {
+                node = Self::remove_and_next(node_ref);
+                continue;
+            }
+
+            if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
+                elements_to_score.push(node_ref.clone());
+            }
+
+            // Turn all divs that don't have children block level elements into p's
+            if tag_name == "DIV" {
+                // Put phrasing content into paragraphs.
+                let mut p: Option<Node> = None;
+                for mut child_node in node_ref.get_child_nodes().into_iter() {
+                    if Self::is_phrasing_content(&child_node) {
+                        if let Some(p) = p.as_mut() {
+                            let _ = p.add_child(&mut child_node);
+                        } else if !Self::is_whitespace(&child_node) {
+                            let mut new_node = Node::new("p", None, document).unwrap();
+                            node_ref
+                                .replace_child_node(new_node.clone(), child_node.clone())
+                                .unwrap();
+                            new_node.add_child(&mut child_node).unwrap();
+                            p.replace(new_node);
+                        }
+                    } else if let Some(p) = p.as_mut() {
+                        for mut r_node in p.get_child_nodes().into_iter().rev() {
+                            if Self::is_whitespace(&r_node) {
+                                r_node.unlink();
+                            }
+                        }
+                    }
+                }
+
+                // Sites like http://mobile.slate.com encloses each paragraph with a DIV
+                // element. DIVs with only a P element inside and no text content can be
+                // safely converted into plain P elements to avoid confusing the scoring
+                // algorithm with DIVs with are, in practice, paragraphs.
+                if Self::has_single_tag_inside_element(node_ref, "P")
+                    && Self::get_link_density(node_ref) < 0.25
+                {}
+            }
 
             node = Self::next_node(node_ref, false);
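
Two notes on the pass above. The phrasing pass gives bare inline content inside a DIV an explicit paragraph wrapper, so markup in the spirit of <div>intro text<p>body</p></div> ends up scoring like <div><p>intro text</p><p>body</p></div>. And because && binds tighter than || in Rust, the header-cleanup condition parses as DIV || SECTION || ... || (H6 && is_element_without_content(...)): only the H6 arm is guarded by the emptiness check. A two-line demonstration of that precedence, with illustrative values only:

fn main() {
    let (div, h6, empty) = (true, false, false);
    // Parsed as: div || (h6 && empty) — not (div || h6) && empty.
    assert_eq!(div || h6 && empty, div || (h6 && empty));
    assert!(div || h6 && empty); // true even though `empty` is false
}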
@@ -61,10 +139,24 @@
         !display_none && !is_hidden && !aria_hidden || has_fallback_image
     }
 
+    fn is_whitespace(node: &Node) -> bool {
+        let is_text_node = node
+            .get_type()
+            .map(|t| t == NodeType::TextNode)
+            .unwrap_or(false);
+        let is_element_node = node
+            .get_type()
+            .map(|t| t == NodeType::ElementNode)
+            .unwrap_or(false);
+
+        (is_text_node && node.get_content().trim().is_empty())
+            || (is_element_node && node.get_name().to_uppercase() == "BR")
+    }
+
     fn remove_and_next(node: &mut Node) -> Option<Node> {
         let next_node = Self::next_node(node, true);
         node.unlink();
-        return next_node;
+        next_node
     }
 
     fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
@@ -100,7 +192,11 @@
         None
     }
 
-    fn check_byline(node: &Node, matchstring: &str) -> bool {
+    fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
+        if state.byline.is_some() {
+            return false;
+        }
+
         let rel = node
             .get_attribute("rel")
             .map(|rel| rel == "author")
@@ -111,8 +207,11 @@
             .unwrap_or(false);
 
         let content = node.get_content();
-        if rel || itemprop || regex::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content) {
-            // FIXME
+        if rel
+            || itemprop
+            || constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content)
+        {
+            state.byline = Some(content.trim().into());
             true
         } else {
             false
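
Threading State through check_byline makes byline extraction first-match-wins: once state.byline is set, the early return turns every later call into a no-op, and the old // FIXME placeholder is replaced by actually recording the trimmed text content. Note also that, by operator precedence, is_valid_byline only guards the regex branch; a rel="author" or itemprop hit stores the content unconditionally.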
@@ -140,7 +239,7 @@
     fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
         let content = node.get_content().trim().to_owned();
         if normalize_spaces {
-            regex::NORMALIZE.replace(&content, " ").into()
+            constants::NORMALIZE.replace(&content, " ").into()
         } else {
             content
         }
@@ -149,17 +248,146 @@
     fn text_similarity(a: &str, b: &str) -> f64 {
         let a = a.to_lowercase();
         let b = b.to_lowercase();
-        let tokens_a = regex::TOKENIZE.split(&a).collect::<Vec<_>>();
-        let tokens_b = regex::TOKENIZE.split(&b).collect::<Vec<_>>();
-        if tokens_a.iter().count() == 0 || tokens_b.iter().count() == 0 {
+        let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
+        let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
+        if tokens_a.is_empty() || tokens_b.is_empty() {
             return 0.0;
         }
 
-        let tokens_b_total: f64 = tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
-        let uniq_tokens_b = tokens_b.into_iter().filter(|token| !tokens_a.iter().any(|t| t == token)).collect::<Vec<_>>();
-        let uniq_tokens_b_total: f64 = uniq_tokens_b.iter().map(|t| t.len()).fold(0.0, |a, b| a + b as f64);
+        let tokens_b_total: f64 = tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
+        let uniq_tokens_b = tokens_b
+            .into_iter()
+            .filter(|token| !tokens_a.iter().any(|t| t == token))
+            .collect::<Vec<_>>();
+        let uniq_tokens_b_total: f64 = uniq_tokens_b
+            .iter()
+            .map(|t| t.len())
+            .fold(0.0, |a, b| a + b as f64);
 
         let distance_b = uniq_tokens_b_total / tokens_b_total;
         1.0 - distance_b
     }
 
+    fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
+        let max_depth = max_depth.unwrap_or(3);
+        let tag_name = tag_name.to_uppercase();
+        let mut depth = 0;
+        let mut node = node.get_parent();
+
+        loop {
+            if depth > max_depth {
+                return false;
+            }
+
+            let tmp_node = match node {
+                Some(node) => node,
+                None => return false,
+            };
+
+            if tmp_node.get_name() == tag_name {
+                return true;
+            }
+
+            node = tmp_node.get_parent();
+            depth += 1;
+        }
+    }
+
+    fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
+        // There should be exactly 1 element child with given tag
+        if node.get_child_nodes().len() == 1
+            || node
+                .get_child_nodes()
+                .first()
+                .map(|n| n.get_name().to_uppercase() == tag)
+                .unwrap_or(false)
+        {
+            return false;
+        }
+
+        // And there should be no text nodes with real content
+        node.get_child_nodes().iter().any(|n| {
+            n.get_type()
+                .map(|t| t == NodeType::TextNode)
+                .unwrap_or(false)
+                && constants::HAS_CONTENT.is_match(&n.get_content())
+        })
+    }
+
+    fn is_element_without_content(node: &Node) -> bool {
+        if let Some(node_type) = node.get_type() {
+            let len = node.get_child_nodes().len();
+
+            return node_type == NodeType::ElementNode
+                && node.get_content().trim().is_empty()
+                && (len == 0
+                    || len
+                        == Self::get_elements_by_tag_name(node, "br").len()
+                            + Self::get_elements_by_tag_name(node, "hr").len());
+        }
+
+        false
+    }
+
+    fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
+        let tag = tag.to_uppercase();
+        let all_tags = tag == "*";
+        let mut vec = Vec::new();
+
+        fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
+            for child in node.get_child_elements() {
+                if all_tags || child.get_name() == tag {
+                    vec.push(child);
+                }
+                get_elems(node, tag, vec, all_tags);
+            }
+        }
+
+        get_elems(node, &tag, &mut vec, all_tags);
+        vec
+    }
+
+    fn is_phrasing_content(node: &Node) -> bool {
+        let tag_name = node.get_name().to_uppercase();
+        let is_text_node = node
+            .get_type()
+            .map(|t| t == NodeType::TextNode)
+            .unwrap_or(false);
+
+        is_text_node
+            || constants::PHRASING_ELEMS.contains(&tag_name.as_str())
+            || (tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
+                && node
+                    .get_child_nodes()
+                    .iter()
+                    .map(Self::is_phrasing_content)
+                    .all(|val| val)
+    }
+
+    fn get_link_density(node: &Node) -> f64 {
+        let text_length = Self::get_inner_text(node, false).len();
+        if text_length == 0 {
+            return 0.0;
+        }
+
+        let mut link_length = 0.0;
+
+        // XXX implement _reduceNodeList?
+        let link_nodes = Self::get_elements_by_tag_name(node, "A");
+        for link_node in link_nodes {
+            if let Some(href) = link_node.get_attribute("href") {
+                let coefficient = if constants::HASH_URL.is_match(&href) {
+                    0.3
+                } else {
+                    1.0
+                };
+                link_length += Self::get_inner_text(&link_node, false).len() as f64 * coefficient;
+            }
+        }
+
+        link_length / text_length as f64
+    }
 }
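
To make the text_similarity change concrete: the function measures how much of b's token mass (counted in characters) also occurs in a. Below is a standalone sketch of the same computation with a worked example; it simplifies tokenization to whitespace splitting, whereas the crate splits on the TOKENIZE pattern shown earlier.

// Returns 1.0 when every token of b also appears in a, 0.0 when none do.
fn text_similarity(a: &str, b: &str) -> f64 {
    let a = a.to_lowercase();
    let b = b.to_lowercase();
    let tokens_a: Vec<&str> = a.split_whitespace().collect();
    let tokens_b: Vec<&str> = b.split_whitespace().collect();
    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    let total: f64 = tokens_b.iter().map(|t| t.len() as f64).sum();
    let uniq: f64 = tokens_b
        .iter()
        .filter(|t| !tokens_a.contains(t))
        .map(|t| t.len() as f64)
        .sum();
    1.0 - uniq / total
}

fn main() {
    // b = "the big news": tokens weigh 3 + 3 + 4 = 10 chars; only "news" (4 chars)
    // is missing from a, so similarity = 1.0 - 4/10 = 0.6.
    let s = text_similarity("the big story", "the big news");
    assert!((s - 0.6).abs() < 1e-9);
}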
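One caveat worth flagging in the committed get_elements_by_tag_name: its inner helper recurses on node rather than on child, so it never descends the tree and instead recurses on the same element forever once a child element exists. A corrected sketch of the helper (my fix, not part of this commit):

use libxml::tree::Node;

// Corrected traversal: recurse into each child element, not `node` itself.
// The push needs a clone because the recursion still borrows `child`.
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
    for child in node.get_child_elements() {
        if all_tags || child.get_name() == tag {
            vec.push(child.clone());
        }
        get_elems(&child, tag, vec, all_tags);
    }
}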

src/full_text_parser/readability/regex.rs (deleted)
@@ -1,12 +0,0 @@
-use once_cell::sync::Lazy;
-use regex::Regex;
-
-pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
-});
-pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex")
-});
-pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| {
-    Regex::new(r#"/\W+/g"#).expect("TOKENIZE regex")
-});

src/full_text_parser/readability/state.rs
@@ -3,6 +3,7 @@ pub struct State {
     pub weigh_classes: bool,
     pub clean_conditionally: bool,
     pub should_remove_title_header: bool,
+    pub byline: Option<String>,
 }
 
 impl Default for State {
@@ -12,6 +13,7 @@ impl Default for State {
             weigh_classes: true,
             clean_conditionally: true,
             should_remove_title_header: true,
+            byline: None,
         }
     }
 }