mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
somewhat complete readability algorithm
This commit is contained in:
parent
979358fd35
commit
71a8816747
5 changed files with 620 additions and 92 deletions
|
@ -42,7 +42,7 @@ impl Article {
|
|||
};
|
||||
file_name.push_str(".html");
|
||||
let path = path.join(file_name);
|
||||
let mut html_file = File::create(&path)?;
|
||||
let mut html_file = File::create(path)?;
|
||||
html_file.write_all(html.as_bytes())?;
|
||||
return Ok(());
|
||||
}
|
||||
|
|
|
@ -18,6 +18,8 @@ pub enum FullTextParserError {
|
|||
ContentType,
|
||||
#[error("Invalid UTF8 Text")]
|
||||
Utf8(#[from] std::str::Utf8Error),
|
||||
#[error("Readability Error")]
|
||||
Readability,
|
||||
#[error("Unknown Error")]
|
||||
Unknown,
|
||||
}
|
||||
|
|
|
@ -182,7 +182,7 @@ impl FullTextParser {
|
|||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
if found_body {
|
||||
if let Err(error) = Readability::extract_body_readability(&document, root) {
|
||||
if let Err(error) = Readability::extract_body_readability(document, root) {
|
||||
log::error!("Both ftr and readability failed to find content: {}", error);
|
||||
return Err(error);
|
||||
}
|
||||
|
|
|
@ -1,6 +1,11 @@
|
|||
use std::collections::HashSet;
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
|
||||
/// Minimum text length the extracted article content must reach. Shorter results are
/// treated as a failed attempt and the extraction is retried with relaxed flags.
pub const DEFAULT_CHAR_THRESHOLD: usize = 500;
|
||||
pub static SIBLING_CONTENT: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\.( |$)/"#).expect("SIBLING_CONTENT regex"));
|
||||
pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(r#"/byline|author|dateline|writtenby|p-author/i"#).expect("BYLINE regex")
|
||||
});
|
||||
|
@ -17,7 +22,17 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
|
|||
pub static HAS_CONTENT: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
|
||||
/// Matches a string that starts with `#` followed by at least one more character.
///
/// Written in Rust `regex` syntax, without JavaScript-style `/.../` delimiters. With a leading
/// literal `/` the `^` anchor could never match.
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
|
||||
pub static POSITIVE: Lazy<Regex> = Lazy::new(|| {
|
||||
Regex::new(
|
||||
r#"/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i"#,
|
||||
)
|
||||
.expect("POSITIVE regex")
|
||||
});
|
||||
pub static NEGATIVE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/-ad-|hidden|^hid$| hid$| hid |^hid"#).expect("NEGATIVE regex"));
|
||||
|
||||
/// Name of the node attribute under which a node's content score is stored.
pub const SCORE_ATTR: &str = "content_score";
|
||||
/// Number of alternative top candidates that must share an ancestor before that ancestor
/// is chosen as the top candidate instead.
pub const MINIMUM_TOPCANDIDATES: usize = 3;
|
||||
pub const UNLIKELY_ROLES: &[&str] = &[
|
||||
"menu",
|
||||
"menubar",
|
||||
|
@ -30,6 +45,22 @@ pub const UNLIKELY_ROLES: &[&str] = &[
|
|||
|
||||
/// Upper-case tag names of the elements that are scored by default.
// NOTE(review): the consumer of this list is not visible here — confirm how it is matched
// against node names (case sensitivity).
pub const DEFAULT_TAGS_TO_SCORE: &[&str] = &[
    "SECTION",
    "H2",
    "H3",
    "H4",
    "H5",
    "H6",
    "P",
    "TD",
    "PRE",
];
|
||||
/// Upper-case tag names treated as block-level elements when checking whether an element
/// has block-level descendants.
pub static DIV_TO_P_ELEMS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    const BLOCK_TAGS: [&str; 9] = [
        "BLOCKQUOTE",
        "DL",
        "DIV",
        "IMG",
        "OL",
        "P",
        "PRE",
        "TABLE",
        "UL",
    ];

    BLOCK_TAGS.iter().copied().collect()
});
|
||||
|
||||
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&'static str>> =
|
||||
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
||||
|
||||
pub const PHRASING_ELEMS: &[&str] = &[
|
||||
// "CANVAS", "IFRAME", "SVG", "VIDEO",
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
mod constants;
|
||||
mod state;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use libxml::tree::{Document, Node, NodeType};
|
||||
|
||||
use self::state::State;
|
||||
|
@ -10,10 +12,17 @@ pub struct Readability;
|
|||
|
||||
impl Readability {
|
||||
pub fn extract_body_readability(
|
||||
document: &Document,
|
||||
_root: &mut Node,
|
||||
document: Document,
|
||||
root: &mut Node,
|
||||
) -> Result<bool, FullTextParserError> {
|
||||
let mut state = State::default();
|
||||
let mut document = document;
|
||||
let mut attempts: Vec<(Node, usize)> = Vec::new();
|
||||
let document_cache = document
|
||||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
||||
loop {
|
||||
let mut elements_to_score = Vec::new();
|
||||
let mut node: Option<Node> = document.clone().get_root_element();
|
||||
|
||||
|
@ -93,11 +102,18 @@ impl Readability {
|
|||
if let Some(p) = p.as_mut() {
|
||||
let _ = p.add_child(&mut child_node);
|
||||
} else if !Self::is_whitespace(&child_node) {
|
||||
let mut new_node = Node::new("p", None, document).unwrap();
|
||||
let mut new_node = Node::new("p", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
node_ref
|
||||
.replace_child_node(new_node.clone(), child_node.clone())
|
||||
.unwrap();
|
||||
new_node.add_child(&mut child_node).unwrap();
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
new_node.add_child(&mut child_node).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
p.replace(new_node);
|
||||
}
|
||||
} else if let Some(p) = p.as_mut() {
|
||||
|
@ -115,13 +131,415 @@ impl Readability {
|
|||
// algorithm with DIVs which are, in practice, paragraphs.
|
||||
if Self::has_single_tag_inside_element(node_ref, "P")
|
||||
&& Self::get_link_density(node_ref) < 0.25
|
||||
{}
|
||||
{
|
||||
if let Some(new_node) = node_ref.get_child_nodes().first() {
|
||||
if let Some(mut parent) = node_ref.get_parent() {
|
||||
parent
|
||||
.replace_child_node(new_node.clone(), node_ref.clone())
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
node = Some(new_node.clone());
|
||||
elements_to_score.push(new_node.clone());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else if !Self::has_child_block_element(node_ref) && node_ref.set_name("P").is_ok() {
|
||||
elements_to_score.push(node_ref.clone());
|
||||
}
|
||||
}
|
||||
|
||||
node = Self::next_node(node_ref, false);
|
||||
}
|
||||
|
||||
unimplemented!()
|
||||
let mut candidates = Vec::new();
|
||||
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
|
||||
// Then add their score to their parent node.
|
||||
// A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
|
||||
for element_to_score in elements_to_score {
|
||||
if element_to_score.get_parent().is_none() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let inner_text = Self::get_inner_text(&element_to_score, true);
|
||||
|
||||
// If this paragraph is less than 25 characters, don't even count it.
|
||||
if inner_text.len() < 25 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Exclude nodes with no ancestor.
|
||||
let ancestors = Self::get_node_ancestors(&element_to_score, 5);
|
||||
if ancestors.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut content_score = 0.0;
|
||||
|
||||
// Add a point for the paragraph itself as a base.
|
||||
content_score += 1.0;
|
||||
|
||||
// Add points for any commas within this paragraph.
|
||||
content_score += inner_text.split(',').count() as f64;
|
||||
|
||||
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
||||
content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);
|
||||
|
||||
// Initialize and score ancestors.
|
||||
for (level, mut ancestor) in ancestors.into_iter().enumerate() {
|
||||
if ancestor.get_parent().is_none() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Self::get_content_score(&ancestor).is_none() {
|
||||
Self::initialize_node(&mut ancestor, &state);
|
||||
candidates.push(ancestor.clone());
|
||||
}
|
||||
|
||||
// Node score divider:
|
||||
// - parent: 1 (no division)
|
||||
// - grandparent: 2
|
||||
// - great grandparent+: ancestor level * 3
|
||||
let score_divider = if level == 0 {
|
||||
1.0
|
||||
} else if level == 1 {
|
||||
2.0
|
||||
} else {
|
||||
level as f64 * 3.0
|
||||
};
|
||||
|
||||
if let Some(mut score) = Self::get_content_score(&ancestor) {
|
||||
score += content_score / score_divider;
|
||||
Self::set_content_score(&mut ancestor, score);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// After we've calculated scores, loop through all of the possible
|
||||
// candidate nodes we found and find the one with the highest score.
|
||||
for candidate in candidates.iter_mut() {
|
||||
// Scale the final candidates score based on link density. Good content
|
||||
// should have a relatively small link density (5% or less) and be mostly
|
||||
// unaffected by this operation.
|
||||
if let Some(content_score) = Self::get_content_score(candidate) {
|
||||
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
|
||||
Self::set_content_score(candidate, candidate_score);
|
||||
}
|
||||
}
|
||||
|
||||
candidates.sort_by(|a, b| {
|
||||
if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
|
||||
{
|
||||
a.partial_cmp(&b).unwrap_or(Ordering::Equal)
|
||||
} else {
|
||||
Ordering::Equal
|
||||
}
|
||||
});
|
||||
|
||||
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
|
||||
let mut needed_to_create_top_candidate = false;
|
||||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
// We also have to copy the body node so it is something we can modify.
|
||||
Self::initialize_node(root, &state);
|
||||
needed_to_create_top_candidate = true;
|
||||
root.clone()
|
||||
});
|
||||
#[allow(unused_assignments)]
|
||||
let mut parent_of_top_candidate = None;
|
||||
|
||||
let mut alternative_candidate_ancestors = Vec::new();
|
||||
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
||||
// and whose scores are quite close to the current `topCandidate` node.
|
||||
for top_candidate in &top_candidates {
|
||||
if let Some(score) = Self::get_content_score(top_candidate) {
|
||||
if score >= 0.75 {
|
||||
alternative_candidate_ancestors
|
||||
.push(Self::get_node_ancestors(top_candidate, 0));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
|
||||
loop {
|
||||
if let Some(parent) = &parent_of_top_candidate {
|
||||
let mut lists_containing_this_ancestor = 0;
|
||||
let tmp = usize::min(
|
||||
alternative_candidate_ancestors.len(),
|
||||
constants::MINIMUM_TOPCANDIDATES,
|
||||
);
|
||||
for item in alternative_candidate_ancestors.iter().take(tmp) {
|
||||
let tmp = item.iter().any(|n| n == parent);
|
||||
lists_containing_this_ancestor += if tmp { 1 } else { 0 };
|
||||
}
|
||||
|
||||
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
||||
top_candidate = parent.clone();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
|
||||
}
|
||||
}
|
||||
|
||||
if Self::get_content_score(&top_candidate).is_none() {
|
||||
Self::initialize_node(&mut top_candidate, &state);
|
||||
}
|
||||
|
||||
// Because of our bonus system, parents of candidates might have scores
|
||||
// themselves. They get half of the node. There won't be nodes with higher
|
||||
// scores than our topCandidate, but if we see the score going *up* in the first
|
||||
// few steps up the tree, that's a decent sign that there might be more content
|
||||
// lurking in other places that we want to unify in. The sibling stuff
|
||||
// below does some of that - but only if we've looked high enough up the DOM
|
||||
// tree.
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
|
||||
|
||||
// The scores shouldn't get too low.
|
||||
let score_threshold = last_score / 3.0;
|
||||
|
||||
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
|
||||
if parent_of_top_candidate
|
||||
.as_ref()
|
||||
.map(|n| Self::get_content_score(n).is_none())
|
||||
.unwrap_or(false)
|
||||
{
|
||||
parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
|
||||
continue;
|
||||
}
|
||||
|
||||
let parent_score = parent_of_top_candidate
|
||||
.as_ref()
|
||||
.and_then(Self::get_content_score)
|
||||
.unwrap_or(0.0);
|
||||
if parent_score < score_threshold {
|
||||
break;
|
||||
}
|
||||
|
||||
if parent_score > last_score {
|
||||
// Alright! We found a better parent to use.
|
||||
if let Some(parent) = parent_of_top_candidate {
|
||||
top_candidate = parent;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
last_score = parent_of_top_candidate
|
||||
.as_ref()
|
||||
.and_then(Self::get_content_score)
|
||||
.unwrap_or(0.0);
|
||||
parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
|
||||
}
|
||||
|
||||
// If the top candidate is the only child, use parent instead. This will help sibling
|
||||
// joining logic when adjacent content is actually located in parent's sibling node.
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
|
||||
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
|
||||
&& parent_of_top_candidate
|
||||
.as_ref()
|
||||
.map(|n| n.get_child_elements().len() == 1)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
top_candidate = parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
}
|
||||
|
||||
if Self::get_content_score(&top_candidate).is_none() {
|
||||
Self::initialize_node(&mut top_candidate, &state);
|
||||
}
|
||||
|
||||
// Now that we have the top candidate, look through its siblings for content
|
||||
// that might also be related. Things like preambles, content split by ads
|
||||
// that we removed, etc.
|
||||
let mut article_content =
|
||||
Node::new("DIV", None, &document).map_err(|()| FullTextParserError::Readability)?;
|
||||
|
||||
let sibling_score_threshold = f64::max(
|
||||
10.0,
|
||||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
|
||||
);
|
||||
// Keep potential top candidate's parent node to try to get text direction of it later.
|
||||
parent_of_top_candidate = top_candidate.get_parent();
|
||||
let siblings = parent_of_top_candidate
|
||||
.as_ref()
|
||||
.map(|n| n.get_child_nodes());
|
||||
|
||||
if let Some(siblings) = siblings {
|
||||
for mut sibling in siblings {
|
||||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score:?}");
|
||||
|
||||
if top_candidate == sibling {
|
||||
append = true;
|
||||
} else {
|
||||
let mut content_bonus = 0.0;
|
||||
|
||||
// Give a bonus if sibling nodes and top candidates have the exact same classname
|
||||
let sibling_classes = sibling.get_class_names();
|
||||
let tc_classes = top_candidate.get_class_names();
|
||||
|
||||
if sibling_classes
|
||||
.iter()
|
||||
.all(|class| tc_classes.contains(class))
|
||||
&& !tc_classes.is_empty()
|
||||
{
|
||||
content_bonus +=
|
||||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
||||
}
|
||||
|
||||
if Self::get_content_score(&sibling).unwrap_or(0.0) + content_bonus
|
||||
>= sibling_score_threshold
|
||||
{
|
||||
append = true;
|
||||
} else if sibling.get_name().to_uppercase() == "P" {
|
||||
let link_density = Self::get_link_density(&sibling);
|
||||
let node_content = Self::get_inner_text(&sibling, false);
|
||||
let node_length = node_content.len();
|
||||
|
||||
if node_length > 80
|
||||
&& (link_density < 0.25
|
||||
|| (node_length > 0
|
||||
&& link_density == 0.0
|
||||
&& constants::SIBLING_CONTENT.is_match(&node_content)))
|
||||
{
|
||||
append = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if append {
|
||||
log::debug!("Appending node: {sibling:?}");
|
||||
|
||||
if !constants::ALTER_TO_DIV_EXCEPTIONS.contains(sibling.get_name().as_str())
|
||||
{
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
log::debug!("Altering sibling: {sibling:?} to div.");
|
||||
|
||||
sibling.set_name("DIV").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
|
||||
article_content.add_child(&mut sibling).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if needed_to_create_top_candidate {
|
||||
// We already created a fake div thing, and there wouldn't have been any siblings left
|
||||
// for the previous loop, so there's no point trying to create a new div, and then
|
||||
// move all the children over. Just assign IDs and class names here. No need to append
|
||||
// because that already happened anyway.
|
||||
top_candidate
|
||||
.set_property("id", "readability-page-1")
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
top_candidate
|
||||
.set_property("class", "page")
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
} else {
|
||||
let mut div = Node::new("DIV", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
div.set_property("id", "readability-page-1")
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
div.set_property("class", "page").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
div.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
article_content.add_child(&mut div).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
}
|
||||
|
||||
let mut parse_successful = true;
|
||||
|
||||
// Now that we've gone through the full algorithm, check to see if
|
||||
// we got any meaningful content. If we didn't, we may need to re-run
|
||||
// grabArticle with different flags set. This gives us a higher likelihood of
|
||||
// finding the content, and the sieve approach gives us a higher likelihood of
|
||||
// finding the -right- content.
|
||||
let text_length = Self::get_inner_text(&article_content, true).len();
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
parse_successful = false;
|
||||
document = document_cache
|
||||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
||||
if state.strip_unlikely {
|
||||
state.strip_unlikely = false;
|
||||
attempts.push((article_content, text_length));
|
||||
} else if state.weigh_classes {
|
||||
state.weigh_classes = false;
|
||||
attempts.push((article_content, text_length));
|
||||
} else if state.clean_conditionally {
|
||||
state.clean_conditionally = false;
|
||||
attempts.push((article_content, text_length));
|
||||
} else {
|
||||
attempts.push((article_content, text_length));
|
||||
// No luck after removing flags, just return the longest text we found during the different loops
|
||||
|
||||
attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
|
||||
|
||||
// But first check if we actually have something
|
||||
if let Some((best_attempt, _len)) = attempts.first() {
|
||||
article_content = best_attempt.clone();
|
||||
root.add_child(&mut article_content).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
parse_successful = true;
|
||||
}
|
||||
|
||||
return Ok(parse_successful);
|
||||
}
|
||||
} else {
|
||||
root.add_child(&mut article_content).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
return Ok(parse_successful);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_content_score(node: &Node) -> Option<f64> {
|
||||
node.get_attribute(constants::SCORE_ATTR)
|
||||
.and_then(|a| a.parse::<f64>().ok())
|
||||
}
|
||||
|
||||
fn set_content_score(node: &mut Node, score: f64) {
|
||||
node.set_attribute(constants::SCORE_ATTR, &score.to_string())
|
||||
.expect("Failed to set content score");
|
||||
}
|
||||
|
||||
fn is_probably_visible(node: &Node) -> bool {
|
||||
|
@ -390,4 +808,81 @@ impl Readability {
|
|||
|
||||
link_length / text_length as f64
|
||||
}
|
||||
|
||||
// Determine whether element has any children block level elements.
|
||||
fn has_child_block_element(node: &Node) -> bool {
|
||||
node.get_child_elements().iter().any(|node| {
|
||||
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|
||||
|| Self::has_child_block_element(node)
|
||||
})
|
||||
}
|
||||
|
||||
/// Collects the ancestors of `node`, nearest first (parent, grandparent, ...).
///
/// `max_depth` limits how many ancestors are returned. A value of `0` means "no limit":
/// the walk continues until a node without a parent is reached. (Previously `0` yielded an
/// empty list, although the caller passes `0` to obtain the full ancestor chain.)
fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
    let mut ancestors = Vec::new();
    let mut current = node.get_parent();
    let mut depth: u64 = 0;

    while let Some(parent) = current {
        ancestors.push(parent.clone());
        depth += 1;

        // Stop once the requested depth is reached; 0 disables the limit.
        if max_depth != 0 && depth >= max_depth {
            break;
        }

        current = parent.get_parent();
    }

    ancestors
}
|
||||
|
||||
// Initialize a node with the readability object. Also checks the
|
||||
// className/id for special names to add to its score.
|
||||
fn initialize_node(node: &mut Node, state: &State) {
|
||||
let score = match node.get_name().to_uppercase().as_str() {
|
||||
"DIV" => 5,
|
||||
"PRE" | "TD" | "BLOCKQUITE" => 3,
|
||||
"ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
|
||||
"H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
|
||||
_ => 0,
|
||||
};
|
||||
let score = score + Self::get_class_weight(node, state);
|
||||
Self::set_content_score(node, score as f64);
|
||||
}
|
||||
|
||||
fn get_class_weight(node: &Node, state: &State) -> i64 {
|
||||
if !state.weigh_classes {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let mut weight = 0;
|
||||
|
||||
// Look for a special classname
|
||||
if let Some(class_names) = node.get_property("class") {
|
||||
if constants::NEGATIVE.is_match(&class_names) {
|
||||
weight -= 25;
|
||||
}
|
||||
|
||||
if constants::POSITIVE.is_match(&class_names) {
|
||||
weight += 25;
|
||||
}
|
||||
}
|
||||
|
||||
// Look for a special ID
|
||||
if let Some(class_names) = node.get_property("id") {
|
||||
if constants::NEGATIVE.is_match(&class_names) {
|
||||
weight -= 25;
|
||||
}
|
||||
|
||||
if constants::POSITIVE.is_match(&class_names) {
|
||||
weight += 25;
|
||||
}
|
||||
}
|
||||
|
||||
weight
|
||||
}
|
||||
|
||||
fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
|
||||
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
|
||||
.unwrap_or(false)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue