// Mirror of https://gitlab.com/news-flash/article_scraper.git
// File: article_scraper/src/full_text_parser/readability/mod.rs
mod state;
#[cfg(test)]
mod tests;
use std::cmp::Ordering;
use libxml::tree::{Document, Node};
use self::state::State;
use super::error::FullTextParserError;
use crate::{constants, util::Util};
/// Stateless namespace for the article-extraction algorithm below —
/// apparently a Rust port of Mozilla's Readability.js (the candidate
/// scoring, "topCandidates" and sibling-joining logic mirror it).
pub struct Readability;
impl Readability {
/// Extract the main article content from `document` and append its
/// children to `root`.
///
/// Port of Readability's `grabArticle`: walks the tree stripping
/// unlikely nodes, scores paragraph-like elements, propagates scores to
/// ancestors, picks the best candidate subtree, joins related siblings
/// and moves the result under `root`. If the extracted text is shorter
/// than `constants::DEFAULT_CHAR_THRESHOLD`, the whole pass is retried
/// from a pristine copy of the document with progressively relaxed
/// heuristics (`strip_unlikely` → `weigh_classes` → `clean_conditionally`).
///
/// * `document` — parsed HTML; consumed, re-duplicated on each retry.
/// * `root` — destination node receiving the extracted children.
/// * `title` — page title, used to drop a duplicate leading header.
///
/// Returns `Ok(true)` when enough content was found (possibly via the
/// longest best-effort attempt), `Ok(false)` only when even the
/// best-effort fallback produced nothing to pop from `attempts`.
pub fn extract_body(
    document: Document,
    root: &mut Node,
    title: Option<&str>,
) -> Result<bool, FullTextParserError> {
    let mut state = State::default();
    let mut document = document;
    // Every failed pass is kept as (content, text length, owning document)
    // so the longest attempt can be used as a last resort.
    let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
    // Pristine copy used to restart from scratch on each retry.
    let document_cache = document
        .dup()
        .map_err(|()| FullTextParserError::Readability)?;
    loop {
        let mut elements_to_score = Vec::new();
        let mut node: Option<Node> = document.clone().get_root_element();
        // First pass: walk the whole tree, removing unlikely nodes and
        // collecting the elements that should be scored.
        while let Some(node_ref) = node.as_mut() {
            let tag_name = node_ref.get_name().to_uppercase();
            // Skip whitespace-only text nodes.
            if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
                node = Util::next_node(node_ref, true);
                continue;
            }
            // Class names plus id form the string the candidate regexes
            // are matched against.
            let match_string = node_ref
                .get_class_names()
                .iter()
                .fold(String::new(), |a, b| format!("{a} {b}"));
            let match_string = match node_ref.get_property("id") {
                Some(id) => format!("{match_string} {id}"),
                None => match_string,
            };
            if !Util::is_probably_visible(node_ref) {
                node = Util::remove_and_next(node_ref);
                continue;
            }
            // Byline nodes are recorded in `state` and removed from the tree.
            if Self::check_byline(node_ref, &match_string, &mut state) {
                node = Util::remove_and_next(node_ref);
                continue;
            }
            // Drop (at most) the first header that duplicates the article title.
            if state.should_remove_title_header
                && Self::header_duplicates_title(node_ref, title)
            {
                state.should_remove_title_header = false;
                node = Util::remove_and_next(node_ref);
                continue;
            }
            // Remove unlikely candidates
            if state.strip_unlikely {
                // NOTE(review): the "UNLIELY" typo is in the constant's
                // declaration elsewhere — not fixable from this file.
                if constants::UNLIELY_CANDIDATES.is_match(&match_string)
                    && !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
                    && !Util::has_ancestor_tag(
                        node_ref,
                        "table",
                        None,
                        None::<fn(&Node) -> bool>,
                    )
                    && !Util::has_ancestor_tag(
                        node_ref,
                        "code",
                        None,
                        None::<fn(&Node) -> bool>,
                    )
                    && tag_name != "BODY"
                    && tag_name != "A"
                {
                    node = Util::remove_and_next(node_ref);
                    continue;
                }
                // ARIA roles that mark non-content regions.
                if let Some(role) = node_ref.get_attribute("role") {
                    if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
                        node = Util::remove_and_next(node_ref);
                        continue;
                    }
                }
            }
            // Remove DIV, SECTION, and HEADER nodes without any content
            // (e.g. text, image, video, or iframe).
            if (tag_name == "DIV"
                || tag_name == "SECTION"
                || tag_name == "HEADER"
                || tag_name == "H1"
                || tag_name == "H2"
                || tag_name == "H3"
                || tag_name == "H4"
                || tag_name == "H5"
                || tag_name == "H6")
                && Util::is_element_without_content(node_ref)
            {
                node = Util::remove_and_next(node_ref);
                continue;
            }
            if constants::DEFAULT_TAGS_TO_SCORE.contains(&tag_name.as_str()) {
                elements_to_score.push(node_ref.clone());
            }
            // Turn all divs that don't have children block level elements into p's
            if tag_name == "DIV" {
                // Put phrasing content into paragraphs. `p` is the
                // paragraph currently being filled, if any.
                let mut p: Option<Node> = None;
                for mut child in node_ref.get_child_nodes().into_iter() {
                    if Util::is_phrasing_content(&child) {
                        if let Some(p) = p.as_mut() {
                            // Already building a paragraph: move the child in.
                            child.unlink();
                            p.add_child(&mut child).map_err(|error| {
                                log::error!("{error}");
                                FullTextParserError::Readability
                            })?;
                        } else if !Util::is_whitespace(&child) {
                            // Start a new paragraph in the child's place
                            // and move the child inside it.
                            let mut new_node = Node::new("p", None, &document)
                                .map_err(|()| FullTextParserError::Readability)?;
                            let mut old_node = node_ref
                                .replace_child_node(new_node.clone(), child)
                                .map_err(|error| {
                                    log::error!("{error}");
                                    FullTextParserError::Readability
                                })?;
                            new_node.add_child(&mut old_node).map_err(|error| {
                                log::error!("{error}");
                                FullTextParserError::Readability
                            })?;
                            p.replace(new_node);
                        }
                    } else if p.is_some() {
                        // Non-phrasing child ends the paragraph: trim
                        // trailing whitespace nodes and close it.
                        if let Some(p) = p.as_mut() {
                            for mut r_node in p.get_child_nodes().into_iter().rev() {
                                if Util::is_whitespace(&r_node) {
                                    r_node.unlink();
                                    continue;
                                }
                                break;
                            }
                        }
                        _ = p.take();
                    }
                }
                // Sites like http://mobile.slate.com encloses each paragraph with a DIV
                // element. DIVs with only a P element inside and no text content can be
                // safely converted into plain P elements to avoid confusing the scoring
                // algorithm with DIVs with are, in practice, paragraphs.
                if Util::has_single_tag_inside_element(node_ref, "P")
                    && Util::get_link_density(node_ref) < 0.25
                {
                    if let Some(new_node) = node_ref.get_first_element_child() {
                        if let Some(mut parent) = node_ref.get_parent() {
                            parent
                                .replace_child_node(new_node.clone(), node_ref.clone())
                                .map_err(|error| {
                                    log::error!("{error}");
                                    FullTextParserError::Readability
                                })?;
                            node = Util::next_node(&new_node, false);
                            elements_to_score.push(new_node.clone());
                            continue;
                        }
                    }
                } else if !Util::has_child_block_element(node_ref)
                    && node_ref.set_name("P").is_ok()
                {
                    elements_to_score.push(node_ref.clone());
                }
            }
            node = Util::next_node(node_ref, false);
        }
        let mut candidates = Vec::new();
        // Loop through all paragraphs, and assign a score to them based on how content-y they look.
        // Then add their score to their parent node.
        // A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
        for element_to_score in elements_to_score.drain(..) {
            if element_to_score.get_parent().is_none() {
                continue;
            }
            let inner_text = Util::get_inner_text(&element_to_score, true);
            // NOTE(review): `len()` counts bytes, not characters — thresholds
            // differ from readability.js for multi-byte text.
            let inner_text_len = inner_text.len();
            // If this paragraph is less than 25 characters, don't even count it.
            if inner_text_len < 25 {
                continue;
            }
            // Exclude nodes with no ancestor.
            let ancestors = Util::get_node_ancestors(&element_to_score, Some(5));
            if ancestors.is_empty() {
                continue;
            }
            let mut content_score = 0.0;
            // Add a point for the paragraph itself as a base.
            content_score += 1.0;
            // Add points for any commas within this paragraph.
            content_score += inner_text.split(',').count() as f64;
            // For every 100 characters in this paragraph, add another point. Up to 3 points.
            content_score += f64::min(f64::floor(inner_text.len() as f64 / 100.0), 3.0);
            // Initialize and score ancestors.
            for (level, mut ancestor) in ancestors.into_iter().enumerate() {
                let tag_name = ancestor.get_name().to_uppercase();
                if ancestor.get_parent().is_none() || tag_name == "HTML" {
                    continue;
                }
                // First time we see this ancestor: give it a base score
                // and register it as a candidate.
                if Self::get_content_score(&ancestor).is_none() {
                    Self::initialize_node(&mut ancestor, &state)?;
                    candidates.push(ancestor.clone());
                }
                // Node score divider:
                // - parent: 1 (no division)
                // - grandparent: 2
                // - great grandparent+: ancestor level * 3
                let score_divider = if level == 0 {
                    1.0
                } else if level == 1 {
                    2.0
                } else {
                    level as f64 * 3.0
                };
                if let Some(score) = Self::get_content_score(&ancestor) {
                    let add_score = content_score / score_divider;
                    let new_score = score + add_score;
                    log::debug!(
                        "{}: {score} + {add_score} = {new_score}",
                        ancestor.get_name()
                    );
                    Self::set_content_score(&mut ancestor, new_score)?;
                }
            }
        }
        // After we've calculated scores, loop through all of the possible
        // candidate nodes we found and find the one with the highest score.
        for candidate in candidates.iter_mut() {
            // Scale the final candidates score based on link density. Good content
            // should have a relatively small link density (5% or less) and be mostly
            // unaffected by this operation.
            if let Some(content_score) = Self::get_content_score(candidate) {
                let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
                Self::set_content_score(candidate, candidate_score)?;
            }
        }
        // Sort descending by score (b cmp a).
        candidates.sort_by(|a, b| {
            if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
            {
                b.partial_cmp(&a).unwrap_or(Ordering::Equal)
            } else {
                Ordering::Equal
            }
        });
        let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
        for candidate in top_candidates.iter() {
            log::debug!(
                "candidate: {} {:?}",
                candidate.get_name(),
                candidate.get_attributes()
            );
        }
        let mut needed_to_create_top_candidate = false;
        let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
            // If we still have no top candidate, just use the body as a last resort.
            // We also have to copy the body node so it is something we can modify.
            let mut root = document.get_root_element().expect("doc should have root");
            if let Some(body) = root
                .get_child_elements()
                .into_iter()
                .find(|n| n.get_name().to_uppercase() == "BODY")
            {
                root = body;
            }
            let mut new_top_candidate =
                Node::new("DIV", None, &document).expect("can't create new node");
            for mut child in root.get_child_elements().drain(..) {
                child.unlink();
                new_top_candidate.add_child(&mut child).unwrap();
            }
            root.add_child(&mut new_top_candidate).unwrap();
            Self::initialize_node(&mut new_top_candidate, &state)
                .expect("init should not fail");
            needed_to_create_top_candidate = true;
            new_top_candidate
        });
        // Util::serialize_node(&top_candidate, "top_candidate.html");
        let mut alternative_candidate_ancestors = Vec::new();
        // Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
        // and whose scores are quite closed with current `topCandidate` node.
        if let Some(top_score) = Self::get_content_score(&top_candidate) {
            for candidate in top_candidates.iter().skip(1) {
                let score = Self::get_content_score(candidate).unwrap_or(0.0);
                if score / top_score >= 0.75 {
                    alternative_candidate_ancestors
                        .push(Util::get_node_ancestors(candidate, None));
                }
            }
        }
        if alternative_candidate_ancestors.len() >= constants::MINIMUM_TOPCANDIDATES {
            // Climb towards BODY until we find an ancestor shared by
            // enough of the alternative candidates.
            let mut parent_of_top_candidate = top_candidate.get_parent();
            while let Some(parent) = &parent_of_top_candidate {
                if parent.get_name().to_uppercase() == "BODY" {
                    break;
                }
                let mut lists_containing_this_ancestor = 0;
                let tmp = usize::min(
                    alternative_candidate_ancestors.len(),
                    constants::MINIMUM_TOPCANDIDATES,
                );
                for ancestors in alternative_candidate_ancestors.iter().take(tmp) {
                    lists_containing_this_ancestor +=
                        ancestors.iter().filter(|n| n == &parent).count();
                }
                if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
                    top_candidate = parent.clone();
                    break;
                }
                parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
            }
        }
        if Self::get_content_score(&top_candidate).is_none() {
            Self::initialize_node(&mut top_candidate, &state)?;
        }
        //Util::serialize_node(&top_candidate, "new_top_candidate.html");
        // Because of our bonus system, parents of candidates might have scores
        // themselves. They get half of the node. There won't be nodes with higher
        // scores than our topCandidate, but if we see the score going *up* in the first
        // few steps up the tree, that's a decent sign that there might be more content
        // lurking in other places that we want to unify in. The sibling stuff
        // below does some of that - but only if we've looked high enough up the DOM
        // tree.
        let mut parent_of_top_candidate = top_candidate.get_parent();
        let mut last_score = Self::get_content_score(&top_candidate).unwrap_or(0.0);
        // The scores shouldn't get too low.
        let score_threshold = last_score / 3.0;
        while parent_of_top_candidate.is_some()
            && !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
        {
            // Unscored parents are skipped without affecting last_score.
            if parent_of_top_candidate
                .as_ref()
                .map(|n| Self::get_content_score(n).is_none())
                .unwrap_or(false)
            {
                parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
                continue;
            }
            let parent_score = parent_of_top_candidate
                .as_ref()
                .and_then(Self::get_content_score)
                .unwrap_or(0.0);
            if parent_score < score_threshold {
                break;
            }
            if parent_score > last_score {
                // Alright! We found a better parent to use.
                if let Some(parent) = parent_of_top_candidate {
                    top_candidate = parent;
                }
                break;
            }
            last_score = parent_of_top_candidate
                .as_ref()
                .and_then(Self::get_content_score)
                .unwrap_or(0.0);
            parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
        }
        // If the top candidate is the only child, use parent instead. This will help sibling
        // joining logic when adjacent content is actually located in parent's sibling node.
        parent_of_top_candidate = top_candidate.get_parent();
        while !Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
            && parent_of_top_candidate
                .as_ref()
                .map(|n| n.get_child_elements().len() == 1)
                .unwrap_or(false)
        {
            top_candidate = parent_of_top_candidate.ok_or(FullTextParserError::Readability)?;
            parent_of_top_candidate = top_candidate.get_parent();
        }
        if Self::get_content_score(&top_candidate).is_none() {
            Self::initialize_node(&mut top_candidate, &state)?;
        }
        // Now that we have the top candidate, look through its siblings for content
        // that might also be related. Things like preambles, content split by ads
        // that we removed, etc.
        let mut article_content =
            Node::new("DIV", None, &document).map_err(|()| FullTextParserError::Readability)?;
        let sibling_score_threshold = f64::max(
            10.0,
            Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2,
        );
        // Keep potential top candidate's parent node to try to get text direction of it later.
        parent_of_top_candidate = top_candidate.get_parent();
        let siblings = parent_of_top_candidate
            .as_ref()
            .map(|n| n.get_child_elements());
        if let Some(mut siblings) = siblings {
            for mut sibling in siblings.drain(..) {
                let mut append = false;
                let score = Self::get_content_score(&sibling).unwrap_or(0.0);
                log::debug!(
                    "Looking at sibling node: {} ({:?}) with score {score}",
                    sibling.get_name(),
                    sibling.get_attribute("class")
                );
                if top_candidate == sibling {
                    append = true;
                } else {
                    let mut content_bonus = 0.0;
                    // Give a bonus if sibling nodes and top candidates have the example same classname
                    let sibling_classes = sibling.get_class_names();
                    let tc_classes = top_candidate.get_class_names();
                    if !tc_classes.is_empty()
                        && !sibling_classes.is_empty()
                        && sibling_classes
                            .iter()
                            .all(|class| tc_classes.contains(class))
                    {
                        content_bonus +=
                            Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
                    }
                    if score + content_bonus >= sibling_score_threshold {
                        append = true;
                    } else if sibling.get_name().to_uppercase() == "P" {
                        // Low-score <p> siblings still qualify when they
                        // are long and link-sparse.
                        let link_density = Util::get_link_density(&sibling);
                        let node_content = Util::get_inner_text(&sibling, false);
                        let node_length = node_content.len();
                        if node_length > 80
                            && (link_density < 0.25
                                || (node_length > 0
                                    && link_density == 0.0
                                    && constants::SIBLING_CONTENT.is_match(&node_content)))
                        {
                            append = true;
                        }
                    }
                }
                if append {
                    log::debug!(
                        "Appending node: {} ({:?})",
                        sibling.get_name(),
                        sibling.get_attribute("class")
                    );
                    if !constants::ALTER_TO_DIV_EXCEPTIONS
                        .contains(sibling.get_name().to_uppercase().as_str())
                    {
                        // We have a node that isn't a common block level element, like a form or td tag.
                        // Turn it into a div so it doesn't get filtered out later by accident.
                        log::debug!(
                            "Altering sibling: {} ({:?})",
                            sibling.get_name(),
                            sibling.get_attribute("class")
                        );
                        sibling.set_name("DIV").map_err(|error| {
                            log::error!("{error}");
                            FullTextParserError::Readability
                        })?;
                    }
                    sibling.unlink();
                    article_content.add_child(&mut sibling).map_err(|error| {
                        log::error!("{error}");
                        FullTextParserError::Readability
                    })?;
                }
            }
        }
        if state.clean_conditionally {
            crate::FullTextParser::post_process_page(&mut article_content)?;
        }
        if needed_to_create_top_candidate {
            // We already created a fake div thing, and there wouldn't have been any siblings left
            // for the previous loop, so there's no point trying to create a new div, and then
            // move all the children over. Just assign IDs and class names here. No need to append
            // because that already happened anyway.
            top_candidate
                .set_property("id", "readability-page-1")
                .map_err(|error| {
                    log::error!("{error}");
                    FullTextParserError::Readability
                })?;
        } else {
            // Wrap the extracted children in a fresh readability div.
            let mut div = Node::new("DIV", None, &document)
                .map_err(|()| FullTextParserError::Readability)?;
            div.set_property("id", "readability-page-1")
                .map_err(|error| {
                    log::error!("{error}");
                    FullTextParserError::Readability
                })?;
            for mut child in article_content.get_child_nodes() {
                child.unlink();
                div.add_child(&mut child).map_err(|error| {
                    log::error!("{error}");
                    FullTextParserError::Readability
                })?;
            }
            article_content.add_child(&mut div).map_err(|error| {
                log::error!("{error}");
                FullTextParserError::Readability
            })?;
        }
        let mut parse_successful = true;
        // Now that we've gone through the full algorithm, check to see if
        // we got any meaningful content. If we didn't, we may need to re-run
        // grabArticle with different flags set. This gives us a higher likelihood of
        // finding the content, and the sieve approach gives us a higher likelihood of
        // finding the -right- content.
        let text = Util::get_inner_text(&article_content, true);
        let text_length = text.len();
        //Util::serialize_node(&article_content, "debug.html");
        if text_length < constants::DEFAULT_CHAR_THRESHOLD {
            parse_successful = false;
            // Relax one heuristic per retry, remembering this attempt.
            if state.strip_unlikely {
                state.strip_unlikely = false;
                attempts.push((article_content, text_length, document));
            } else if state.weigh_classes {
                state.weigh_classes = false;
                attempts.push((article_content, text_length, document));
            } else if state.clean_conditionally {
                state.clean_conditionally = false;
                attempts.push((article_content, text_length, document));
            } else {
                attempts.push((article_content, text_length, document));
                // No luck after removing flags, just return the longest text we found during the different loops
                attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
                // But first check if we actually have something
                if let Some((best_attempt, _len, _document)) = attempts.pop() {
                    for mut child in best_attempt.get_child_nodes() {
                        child.unlink();
                        root.add_child(&mut child).map_err(|error| {
                            log::error!("{error}");
                            FullTextParserError::Readability
                        })?;
                    }
                    parse_successful = true;
                }
                return Ok(parse_successful);
            }
            // Restart from the pristine document copy.
            document = document_cache
                .dup()
                .map_err(|()| FullTextParserError::Readability)?;
        } else {
            // Enough content: move it under root and finish.
            for mut child in article_content.get_child_nodes() {
                child.unlink();
                root.add_child(&mut child).map_err(|error| {
                    log::error!("{error}");
                    FullTextParserError::Readability
                })?;
            }
            return Ok(parse_successful);
        }
    }
}
/// Read the readability score previously stored on `node` via
/// [`Self::set_content_score`]. Returns `None` when the attribute is
/// absent or does not parse as an `f64`.
fn get_content_score(node: &Node) -> Option<f64> {
    let raw = node.get_attribute(constants::SCORE_ATTR)?;
    raw.parse::<f64>().ok()
}
/// Store `score` on `node` as a string attribute so it survives tree
/// manipulation; logs and maps libxml failures to the readability error.
fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
    let value = score.to_string();
    if let Err(err) = node.set_attribute(constants::SCORE_ATTR, &value) {
        log::error!("failed to set content score: {err}");
        return Err(FullTextParserError::Readability);
    }
    Ok(())
}
/// Decide whether `node` is the article byline; if so, record its
/// trimmed text in `state.byline` and return `true` (caller removes the
/// node). Only the first byline found is kept — later matches are
/// ignored once `state.byline` is set.
fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
    // A byline was already captured on an earlier node.
    if state.byline.is_some() {
        return false;
    }

    // rel="author" or an itemprop mentioning "author" are strong signals.
    let author_rel = matches!(node.get_attribute("rel"), Some(rel) if rel == "author");
    let author_itemprop =
        matches!(node.get_attribute("itemprop"), Some(prop) if prop.contains("author"));
    let content = node.get_content();

    // Otherwise fall back to the class/id regex plus a sanity check on
    // the text length.
    let is_byline = author_rel
        || author_itemprop
        || (constants::BYLINE.is_match(matchstring) && Self::is_valid_byline(&content));

    if is_byline {
        state.byline = Some(content.trim().into());
        return true;
    }
    false
}
/// Check whether `line` could plausibly be a byline: after trimming it
/// must be non-empty and shorter than 100 bytes.
fn is_valid_byline(line: &str) -> bool {
    (1..100).contains(&line.trim().len())
}
/// Check whether `node` is an H1/H2 element whose text is mostly the
/// same as the article `title` (similarity above 0.75). Returns `false`
/// for other tags or when no title is known.
fn header_duplicates_title(node: &Node, title: Option<&str>) -> bool {
    // Only h1/h2 headers are candidates for title duplication.
    match node.get_name().to_lowercase().as_str() {
        "h1" | "h2" => {}
        _ => return false,
    }

    let heading = Util::get_inner_text(node, false);
    title
        .map(|title| Util::text_similarity(&heading, title) > 0.75)
        .unwrap_or(false)
}
/// Initialize a node with its base readability score and persist it via
/// [`Self::set_content_score`]. Content-bearing containers gain points,
/// list/form/heading tags lose points; when `state.weigh_classes` is
/// set, the class/id weight from `Util::get_class_weight` is added too.
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
    let score = match node.get_name().to_uppercase().as_str() {
        "DIV" => 5,
        // Bug fix: was misspelled "BLOCKQUITE", so <blockquote> never
        // received its +3 bonus (Readability.js uses BLOCKQUOTE here).
        "PRE" | "TD" | "BLOCKQUOTE" => 3,
        "ADDRESS" | "OL" | "UL" | "DL" | "DD" | "DT" | "LI" | "FORM" => -3,
        "H1" | "H2" | "H3" | "H4" | "H5" | "H6" | "TH" => -5,
        _ => 0,
    };

    // Class/id weighting is disabled when retrying with relaxed flags.
    let class_weight = if state.weigh_classes {
        Util::get_class_weight(node)
    } else {
        0
    };

    Self::set_content_score(node, (score + class_weight) as f64)
}
}