mod state;
#[cfg(test)]
mod tests;

use std::cmp::Ordering;

use libxml::tree::{Document, Node};

use self::state::State;
use super::error::FullTextParserError;
use crate::{constants, util::Util};

pub struct Readability;

impl Readability {
    pub fn extract_body(
        document: Document,
        root: &mut Node,
        title: Option<&str>,
    ) -> Result<bool, FullTextParserError> {
        let mut state = State::default();
        let mut document = document;
        let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
        let document_cache = document
            .dup()
            .map_err(|()| FullTextParserError::Readability)?;

        loop {
            let mut elements_to_score = Vec::new();
            let mut node: Option<Node> = document.clone().get_root_element();

            while let Some(node_ref) = node.as_mut() {
                let tag_name = node_ref.get_name().to_uppercase();

                if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
                    node = Util::next_node(node_ref, true);
                    continue;
                }

                let match_string = node_ref
                    .get_class_names()
                    .iter()
                    .fold(String::new(), |a, b| format!("{a} {b}"));
                let match_string = match node_ref.get_property("id") {
                    Some(id) => format!("{match_string} {id}"),
                    None => match_string,
                };

                if !Util::is_probably_visible(node_ref) {
                    node = Util::remove_and_next(node_ref);
                    continue;
                }

                if Self::check_byline(node_ref, &match_string, &mut state) {
                    node = Util::remove_and_next(node_ref);
                    continue;
                }

                if state.should_remove_title_header
                    && Self::header_duplicates_title(node_ref, title)
                {
                    state.should_remove_title_header = false;
                    node = Util::remove_and_next(node_ref);
                    continue;
                }

                // Remove unlikely candidates
                if state.strip_unlikely {
                    if constants::UNLIKELY_CANDIDATES.is_match(&match_string)
                        && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
                        && !Util::has_ancestor_tag(
                            node_ref,
                            "table",
                            None,
                            None::<fn(&Node) -> bool>,
                        )
                        && !Util::has_ancestor_tag(
                            node_ref,
                            "code",
                            None,
                            None::<fn(&Node) -> bool>,
                        )
                        && tag_name != "BODY"
                        && tag_name != "A"
                    {
                        node = Util::remove_and_next(node_ref);
                        continue;
                    }

                    if let Some(role) = node_ref.get_attribute("role") {
                        if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
                            node = Util::remove_and_next(node_ref);
                            continue;
                        }
                    }
                }

                // Remove DIV, SECTION, and HEADER nodes without any content (e.g. text, image, video, or iframe).
                if (tag_name == "DIV"
                    || tag_name == "SECTION"
                    || tag_name == "HEADER"
                    || tag_name == "H1"
                    || tag_name == "H2"
                    || tag_name == "H3"
                    || tag_name == "H4"
                    || tag_name == "H5"
                    || tag_name == "H6")
                    && Util::is_element_without_content(node_ref)
                {
                    node = Util::remove_and_next(node_ref);
                    continue;
                }

                if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
                    elements_to_score.push(node_ref.clone());
                }

                // Turn all divs that don't have children block level elements into p's
                if tag_name == "DIV" {
                    // Put phrasing content into paragraphs.
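                    // Consecutive phrasing children are collected under a shared <p> wrapper;
                    // the wrapper is dropped again as soon as a block-level child shows up.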
                    let mut p: Option<Node> = None;

                    for mut child in node_ref.get_child_nodes().into_iter() {
                        if Util::is_phrasing_content(&child) {
                            if let Some(p) = p.as_mut() {
                                child.unlink();
                                p.add_child(&mut child).map_err(|error| {
                                    log::error!("{error}");
                                    FullTextParserError::Readability
                                })?;
                            } else if !Util::is_whitespace(&child) {
                                let mut new_node = Node::new("p", None, &document)
                                    .map_err(|()| FullTextParserError::Readability)?;
                                let mut old_node = node_ref
                                    .replace_child_node(new_node.clone(), child)
                                    .map_err(|error| {
                                        log::error!("{error}");
                                        FullTextParserError::Readability
                                    })?;
                                new_node.add_child(&mut old_node).map_err(|error| {
                                    log::error!("{error}");
                                    FullTextParserError::Readability
                                })?;
                                p.replace(new_node);
                            }
                        } else if p.is_some() {
                            if let Some(p) = p.as_mut() {
                                for mut r_node in p.get_child_nodes().into_iter().rev() {
                                    if Util::is_whitespace(&r_node) {
                                        r_node.unlink();
                                        continue;
                                    }
                                    break;
                                }
                            }
                            _ = p.take();
                        }
                    }

                    // Sites like http://mobile.slate.com enclose each paragraph with a DIV
                    // element. DIVs with only a P element inside and no text content can be
                    // safely converted into plain P elements to avoid confusing the scoring
                    // algorithm with DIVs which are, in practice, paragraphs.
                    if Util::has_single_tag_inside_element(node_ref, "P")
                        && Util::get_link_density(node_ref) < 0.25
                    {
                        if let Some(new_node) = node_ref.get_first_element_child() {
                            if let Some(mut parent) = node_ref.get_parent() {
                                parent
                                    .replace_child_node(new_node.clone(), node_ref.clone())
                                    .map_err(|error| {
                                        log::error!("{error}");
                                        FullTextParserError::Readability
                                    })?;
                                node = Util::next_node(&new_node, false);
                                elements_to_score.push(new_node.clone());
                                continue;
                            }
                        }
                    } else if !Util::has_child_block_element(node_ref)
                        && node_ref.set_name("P").is_ok()
                    {
                        elements_to_score.push(node_ref.clone());
                    }
                }

                node = Util::next_node(node_ref, false);
            }

            let mut candidates = Vec::new();

            // Loop through all paragraphs, and assign a score to them based on how content-y they look.
            // Then add their score to their parent node.
            // A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
            for element_to_score in elements_to_score.drain(..) {
                if element_to_score.get_parent().is_none() {
                    continue;
                }

                let inner_text = Util::get_inner_text(&element_to_score, true);
                let inner_text_len = inner_text.len();

                // If this paragraph is less than 25 characters, don't even count it.
                if inner_text_len < 25 {
                    continue;
                }

                // Exclude nodes with no ancestor.
                let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
                if ancestors.is_empty() {
                    continue;
                }

                let mut content_score = 0.0;

                // Add a point for the paragraph itself as a base.
                content_score += 1.0;

                // Add points for any commas within this paragraph.
                content_score += inner_text.split(',').count() as f64;

                // For every 100 characters in this paragraph, add another point. Up to 3 points.
                content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);

                // Initialize and score ancestors.
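                // Each ancestor (up to 5 levels up) receives a share of this paragraph's score,
                // divided more heavily the further away it is from the paragraph itself.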
                for (level, mut ancestor) in ancestors.into_iter().enumerate() {
                    let tag_name = ancestor.get_name().to_uppercase();

                    if ancestor.get_parent().is_none() || tag_name == "HTML" {
                        continue;
                    }

                    if Self::get_content_score(&ancestor).is_none() {
                        Self::initialize_node(&mut ancestor, &state)?;
                        candidates.push(ancestor.clone());
                    }

                    // Node score divider:
                    // - parent:             1 (no division)
                    // - grandparent:        2
                    // - great grandparent+: ancestor level * 3
                    let score_divider = if level == 0 {
                        1.0
                    } else if level == 1 {
                        2.0
                    } else {
                        level as f64 * 3.0
                    };

                    if let Some(score) = Self::get_content_score(&ancestor) {
                        let add_score = content_score / score_divider;
                        let new_score = score + add_score;
                        log::debug!(
                            "{}: {score} + {add_score} = {new_score}",
                            ancestor.get_name()
                        );
                        Self::set_content_score(&mut ancestor, new_score)?;
                    }
                }
            }

            // After we've calculated scores, loop through all of the possible
            // candidate nodes we found and find the one with the highest score.
            for candidate in candidates.iter_mut() {
                // Scale the final candidates score based on link density. Good content
                // should have a relatively small link density (5% or less) and be mostly
                // unaffected by this operation.
                if let Some(content_score) = Self::get_content_score(candidate) {
                    let candidate_score =
                        content_score * (1.0 - Util::get_link_density(candidate));
                    Self::set_content_score(candidate, candidate_score)?;
                }
            }

            candidates.sort_by(|a, b| {
                if let (Some(a), Some(b)) =
                    (Self::get_content_score(a), Self::get_content_score(b))
                {
                    b.partial_cmp(&a).unwrap_or(Ordering::Equal)
                } else {
                    Ordering::Equal
                }
            });

            let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();

            for candidate in top_candidates.iter() {
                log::debug!(
                    "candidate: {} {:?}",
                    candidate.get_name(),
                    candidate.get_attributes()
                );
            }

            let mut needed_to_create_top_candidate = false;
            let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
                // If we still have no top candidate, just use the body as a last resort.
                // We also have to copy the body node so it is something we can modify.
                let mut root = document.get_root_element().expect("doc should have root");
                if let Some(body) = root
                    .get_child_elements()
                    .into_iter()
                    .find(|n| n.get_name().to_uppercase() == "BODY")
                {
                    root = body;
                }

                let mut new_top_candidate =
                    Node::new("DIV", None, &document).expect("can't create new node");

                for mut child in root.get_child_elements().drain(..) {
                    child.unlink();
                    new_top_candidate.add_child(&mut child).unwrap();
                }

                root.add_child(&mut new_top_candidate).unwrap();

                Self::initialize_node(&mut new_top_candidate, &state)
                    .expect("init should not fail");
                needed_to_create_top_candidate = true;
                new_top_candidate
            });

            // Util::serialize_node(&top_candidate, "top_candidate.html");

            let mut alternative_candidate_ancestors = Vec::new();

            // Find a better top candidate node if it contains (at least three) nodes which belong
            // to the `topCandidates` array and whose scores are quite close to the current
            // `topCandidate` node.
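            // An ancestor that appears in the ancestor chains of enough runner-up candidates
            // is likely the element that wraps the whole article body.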
            if let Some(top_score) = Self::get_content_score(&top_candidate) {
                for candidate in top_candidates.iter().skip(1) {
                    let score = Self::get_content_score(candidate).unwrap_or(0.0);
                    if score / top_score >= 0.75 {
                        alternative_candidate_ancestors
                            .push(Util::get_node_ancestors(candidate, None));
                    }
                }
            }

            if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
                let mut parent_of_top_candidate = top_candidate.get_parent();

                while let Some(parent) = &parent_of_top_candidate {
                    if parent.get_name().to_uppercase() == "BODY" {
                        break;
                    }

                    let mut lists_containing_this_ancestor = 0;
                    let tmp = usize::min(
                        alternative_candidate_ancestors.len(),
                        constants::MINIMUM_TOPCANDIDATES,
                    );

                    for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
                        lists_containing_this_ancestor +=
                            ancestors.iter().filter(|n| n == &parent).count();
                    }

                    if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
                        top_candidate = parent.clone();
                        break;
                    }

                    parent_of_top_candidate =
                        parent_of_top_candidate.and_then(|n| n.get_parent());
                }
            }

            if Self::get_content_score(&top_candidate).is_none() {
                Self::initialize_node(&mut top_candidate, &state)?;
            }

            // Util::serialize_node(&top_candidate, "new_top_candidate.html");

            // Because of our bonus system, parents of candidates might have scores
            // themselves. They get half of the node. There won't be nodes with higher
            // scores than our topCandidate, but if we see the score going *up* in the first
            // few steps up the tree, that's a decent sign that there might be more content
            // lurking in other places that we want to unify in. The sibling stuff
            // below does some of that - but only if we've looked high enough up the DOM
            // tree.
            let mut parent_of_top_candidate = top_candidate.get_parent();
            let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);

            // The scores shouldn't get too low.
            let score_threshold = last_score / 3.0;

            while parent_of_top_candidate.is_some()
                && !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
            {
                if parent_of_top_candidate
                    .as_ref()
                    .map(|n| Self::get_content_score(n).is_none())
                    .unwrap_or(false)
                {
                    parent_of_top_candidate =
                        parent_of_top_candidate.and_then(|n| n.get_parent());
                    continue;
                }

                let parent_score = parent_of_top_candidate
                    .as_ref()
                    .and_then(Self::get_content_score)
                    .unwrap_or(0.0);
                if parent_score < score_threshold {
                    break;
                }

                if parent_score > last_score {
                    // Alright! We found a better parent to use.
                    if let Some(parent) = parent_of_top_candidate {
                        top_candidate = parent;
                    }
                    break;
                }

                last_score = parent_of_top_candidate
                    .as_ref()
                    .and_then(Self::get_content_score)
                    .unwrap_or(0.0);
                parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
            }

            // If the top candidate is the only child, use parent instead. This will help sibling
            // joining logic when adjacent content is actually located in parent's sibling node.
            parent_of_top_candidate = top_candidate.get_parent();

            while !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
                && parent_of_top_candidate
                    .as_ref()
                    .map(|n| n.get_child_elements().len() == 1)
                    .unwrap_or(false)
            {
                top_candidate =
                    parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
                parent_of_top_candidate = top_candidate.get_parent();
            }

            if Self::get_content_score(&top_candidate).is_none() {
                Self::initialize_node(&mut top_candidate, &state)?;
            }

            // Now that we have the top candidate, look through its siblings for content
            // that might also be related. Things like preambles, content split by ads
            // that we removed, etc.
            let mut article_content = Node::new("DIV", None, &document)
                .map_err(|()| FullTextParserError::Readability)?;

            let sibling_score_threshold = f64::max(
                10.0,
                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
            );

            // Keep potential top candidate's parent node to try to get text direction of it later.
            parent_of_top_candidate = top_candidate.get_parent();
            let siblings = parent_of_top_candidate
                .as_ref()
                .map(|n| n.get_child_elements());

            if let Some(mut siblings) = siblings {
                for mut sibling in siblings.drain(..) {
                    let mut append = false;
                    let score = Self::get_content_score(&sibling).unwrap_or(0.0);

                    log::debug!(
                        "Looking at sibling node: {} ({:?}) with score {score}",
                        sibling.get_name(),
                        sibling.get_attribute("class")
                    );

                    if top_candidate == sibling {
                        append = true;
                    } else {
                        let mut content_bonus = 0.0;

                        // Give a bonus if sibling nodes and top candidates have the exact same classname
                        let sibling_classes = sibling.get_class_names();
                        let tc_classes = top_candidate.get_class_names();

                        if !tc_classes.is_empty()
                            && !sibling_classes.is_empty()
                            && sibling_classes
                                .iter()
                                .all(|class| tc_classes.contains(class))
                        {
                            content_bonus +=
                                Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
                        }

                        if score + content_bonus >= sibling_score_threshold {
                            append = true;
                        } else if sibling.get_name().to_uppercase() == "P" {
                            let link_density = Util::get_link_density(&sibling);
                            let node_content = Util::get_inner_text(&sibling, false);
                            let node_length = node_content.len();

                            if node_length > 80
                                && (link_density < 0.25
                                    || (node_length > 0
                                        && link_density == 0.0
                                        && constants::SIBLING_CONTENT.is_match(&node_content)))
                            {
                                append = true;
                            }
                        }
                    }

                    if append {
                        log::debug!(
                            "Appending node: {} ({:?})",
                            sibling.get_name(),
                            sibling.get_attribute("class")
                        );

                        if !constants::ALTER_TO_DIV_EXCEPTIONS
                            .contains(sibling.get_name().to_uppercase().as_str())
                        {
                            // We have a node that isn't a common block level element, like a form or td tag.
                            // Turn it into a div so it doesn't get filtered out later by accident.
                            log::debug!(
                                "Altering sibling: {} ({:?})",
                                sibling.get_name(),
                                sibling.get_attribute("class")
                            );

                            sibling.set_name("DIV").map_err(|error| {
                                log::error!("{error}");
                                FullTextParserError::Readability
                            })?;
                        }

                        sibling.unlink();
                        article_content.add_child(&mut sibling).map_err(|error| {
                            log::error!("{error}");
                            FullTextParserError::Readability
                        })?;
                    }
                }
            }

            if state.clean_conditionally {
                crate::FullTextParser::post_process_page(&mut article_content)?;
            }

            if needed_to_create_top_candidate {
                // We already created a fake div thing, and there wouldn't have been any siblings left
                // for the previous loop, so there's no point trying to create a new div, and then
                // move all the children over. Just assign IDs and class names here. No need to append
                // because that already happened anyway.
                top_candidate
                    .set_property("id", "readability-page-1")
                    .map_err(|error| {
                        log::error!("{error}");
                        FullTextParserError::Readability
                    })?;
            } else {
                let mut div = Node::new("DIV", None, &document)
                    .map_err(|()| FullTextParserError::Readability)?;
                div.set_property("id", "readability-page-1")
                    .map_err(|error| {
                        log::error!("{error}");
                        FullTextParserError::Readability
                    })?;

                for mut child in article_content.get_child_nodes() {
                    child.unlink();
                    div.add_child(&mut child).map_err(|error| {
                        log::error!("{error}");
                        FullTextParserError::Readability
                    })?;
                }

                article_content.add_child(&mut div).map_err(|error| {
                    log::error!("{error}");
                    FullTextParserError::Readability
                })?;
            }

            let mut parse_successful = true;

            // Now that we've gone through the full algorithm, check to see if
            // we got any meaningful content. If we didn't, we may need to re-run
            // grabArticle with different flags set. This gives us a higher likelihood of
            // finding the content, and the sieve approach gives us a higher likelihood of
            // finding the -right- content.
            let text = Util::get_inner_text(&article_content, true);
            let text_length = text.len();

            // Util::serialize_node(&article_content, "debug.html");

            if text_length < constants::DEFAULT_CHAR_THRESHOLD {
                parse_successful = false;

                if state.strip_unlikely {
                    state.strip_unlikely = false;
                    attempts.push((article_content, text_length, document));
                } else if state.weigh_classes {
                    state.weigh_classes = false;
                    attempts.push((article_content, text_length, document));
                } else if state.clean_conditionally {
                    state.clean_conditionally = false;
                    attempts.push((article_content, text_length, document));
                } else {
                    attempts.push((article_content, text_length, document));

                    // No luck after removing flags, just return the longest text we found during the different loops
                    attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));

                    // But first check if we actually have something
                    if let Some((best_attempt, _len, _document)) = attempts.pop() {
                        for mut child in best_attempt.get_child_nodes() {
                            child.unlink();
                            root.add_child(&mut child).map_err(|error| {
                                log::error!("{error}");
                                FullTextParserError::Readability
                            })?;
                        }
                        parse_successful = true;
                    }

                    return Ok(parse_successful);
                }

                document = document_cache
                    .dup()
                    .map_err(|()| FullTextParserError::Readability)?;
            } else {
                for mut child in article_content.get_child_nodes() {
                    child.unlink();
                    root.add_child(&mut child).map_err(|error| {
                        log::error!("{error}");
                        FullTextParserError::Readability
                    })?;
                }

                return Ok(parse_successful);
            }
        }
    }

    fn get_content_score(node: &Node) -> Option<f64> {
        node.get_attribute(constants::SCORE_ATTR)
            .and_then(|a| a.parse::<f64>().ok())
    }

    fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
        node.set_attribute(constants::SCORE_ATTR, &score.to_string())
            .map_err(|err| {
                log::error!("failed to set content score: {err}");
                FullTextParserError::Readability
            })
    }

    fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
        if state.byline.is_some() {
            return false;
        }

        let rel = node
            .get_attribute("rel")
            .map(|rel| rel == "author")
            .unwrap_or(false);
        let itemprop = node
            .get_attribute("itemprop")
            .map(|prop| prop.contains("author"))
            .unwrap_or(false);

        let content = node.get_content();
        if rel
            || itemprop
            || constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content)
        {
            state.byline = Some(content.trim().into());
            true
        } else {
            false
        }
    }

    // Check whether the input string could be a byline.
    // This verifies that the input length is less than 100 chars.
    fn is_valid_byline(line: &str) -> bool {
        let len = line.trim().len();
        len > 0 && len < 100
    }

    // Check if this node is an H1 or H2 element whose content is mostly
    // the same as the article title.
    fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
        let name = node.get_name().to_lowercase();
        if name != "h1" && name != "h2" {
            return false;
        }

        let heading = Util::get_inner_text(node, false);

        if let Some(title) = title {
            Util::text_similarity(&heading, title) > 0.75
        } else {
            false
        }
    }

    // Initialize a node with the readability object. Also checks the
    // className/id for special names to add to its score.
    fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
        let score = match node.get_name().to_uppercase().as_str() {
            "DIV" => 5,
            "PRE" | "TD" | "BLOCKQUOTE" => 3,
            "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
            "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
            _ => 0,
        };

        let class_weight = if state.weigh_classes {
            Util::get_class_weight(node)
        } else {
            0
        };

        let score = score + class_weight;
        Self::set_content_score(node, score as f64)?;
        Ok(())
    }
}