1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

unwrap noscript images

This commit is contained in:
Jan Lukas Gernert 2023-02-23 01:53:42 +01:00
parent 98c06e11f4
commit 7ae98904d4
6 changed files with 537 additions and 284 deletions

View file

@ -18,7 +18,7 @@ impl Article {
pub fn get_content(&self) -> Option<String> {
// serialize content
let options = SaveOptions {
format: false,
format: true,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,

View file

@ -19,6 +19,7 @@ use libxml::parser::Parser;
use libxml::tree::{Document, Node};
use libxml::xpath::Context;
use log::{debug, error, info, warn};
use regex::Regex;
use reqwest::header::HeaderMap;
use reqwest::Client;
use std::path::Path;
@ -124,6 +125,10 @@ impl FullTextParser {
return Err(error);
}
if let Some(mut root) = document.get_root_element() {
Self::post_process_content(&mut root)?;
}
article.document = Some(document);
Ok(article)
@ -179,6 +184,7 @@ impl FullTextParser {
Self::check_for_thumbnail(&xpath_ctx, article);
}
Self::strip_junk(&xpath_ctx, config, global_config, url);
Self::unwrap_noscript_images(&xpath_ctx)?;
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if !found_body {
@ -195,6 +201,7 @@ impl FullTextParser {
document = Self::parse_html(&html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::strip_junk(&xpath_ctx, config, global_config, &url);
Self::unwrap_noscript_images(&xpath_ctx)?;
Self::extract_body(&xpath_ctx, root, config, global_config)?;
}
@ -609,6 +616,12 @@ impl FullTextParser {
// strip all comments
let _ = Util::strip_node(context, "//comment()");
// strip all scripts
let _ = Util::strip_node(context, "//script");
// strip all styles
let _ = Util::strip_node(context, "//style");
// strip all empty url-tags <a/>
let _ = Util::strip_node(context, "//a[not(node())]");
@ -616,6 +629,91 @@ impl FullTextParser {
let _ = Util::strip_node(context, "//*[@type='text/css']");
}
/**
* Find all <noscript> that are located after <img> nodes, and which contain only one
* <img> element. Replace the first image with the image from inside the <noscript> tag,
* and remove the <noscript> tag. This improves the quality of the images we use on
* some sites (e.g. Medium).
**/
fn unwrap_noscript_images(ctx: &Context) -> Result<(), FullTextParserError> {
// Find img without source or attributes that might contains image, and remove it.
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
let img_regex = Regex::new(r#"/\.(jpg|jpeg|png|webp)/i"#).unwrap();
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
for mut img_node in img_nodes {
let attrs = img_node.get_attributes();
let keep = attrs.iter().any(|(name, value)| {
name == "src"
|| name == "srcset"
|| name == "data-src"
|| name == "data-srcset"
|| img_regex.is_match(&value)
});
if !keep {
img_node.unlink();
}
}
// Next find noscript and try to extract its image
let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?;
for mut noscript_node in noscript_nodes {
// Parse content of noscript and make sure it only contains image
if !Util::is_single_image(&noscript_node) {
continue;
}
// If noscript has previous sibling and it only contains image,
// replace it with noscript content. However we also keep old
// attributes that might contains image.
if let Some(prev) = noscript_node.get_prev_element_sibling() {
if Util::is_single_image(&prev) {
{
let mut prev_img = prev.clone();
if prev_img.get_name().to_uppercase() != "IMG" {
if let Some(img_node) = Util::get_elements_by_tag_name(&prev_img, "img").into_iter().next() {
prev_img = img_node;
}
}
let new_img = Util::get_elements_by_tag_name(&noscript_node, "img").into_iter().next();
if let Some(mut new_img) = new_img {
for (key, value) in prev_img.get_attributes() {
if value.is_empty() {
continue;
}
if key == "src" || key == "srcset" || img_regex.is_match(&value) {
if new_img.get_attribute(&key).as_deref() == Some(&value) {
continue;
}
let mut attr_name = key;
if new_img.has_attribute(&attr_name) {
attr_name = format!("data-old-{attr_name}");
}
new_img.set_attribute(&attr_name, &value).unwrap();
}
}
}
}
if let Some(mut parent) = noscript_node.get_parent() {
if let Some(first_child) = noscript_node.get_first_child() {
parent.replace_child_node(first_child, prev).unwrap();
noscript_node.unlink();
}
}
}
}
}
Ok(())
}
fn extract_body(
context: &Context,
root: &mut Node,
@ -726,4 +824,76 @@ impl FullTextParser {
Ok(())
}
/// Final cleanup pass over the extracted article tree:
/// normalize class attributes, then collapse pointless wrapper elements.
pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
    Self::clean_classes(root)?;
    Self::simplify_nested_elements(root)
}
/// Walk the whole subtree in document order, keeping only the "page" class
/// marker on each element and stripping the internal readability scoring
/// attribute.
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
    let mut cursor = Some(root.clone());

    while let Some(mut current) = cursor {
        let keeps_page_marker = current.get_class_names().contains("page");

        // Either reduce the class list to just "page" or drop it entirely.
        if keeps_page_marker {
            current.set_attribute("class", "page").map_err(|e| {
                log::error!("{e}");
                FullTextParserError::Xml
            })?;
        } else {
            current.remove_attribute("class").map_err(|e| {
                log::error!("{e}");
                FullTextParserError::Xml
            })?;
        }

        // "content_score" is a readability-internal bookkeeping attribute;
        // it must not leak into the final article HTML.
        current.remove_attribute("content_score").map_err(|e| {
            log::error!("{e}");
            FullTextParserError::Xml
        })?;

        cursor = Util::next_node(&current, false);
    }
    Ok(())
}
/// Collapse redundant DIV/SECTION wrappers in the article tree:
/// empty wrappers are removed outright, and a wrapper whose only payload is a
/// single DIV/SECTION child is replaced by that child (the wrapper's
/// attributes are copied onto the child first).
fn simplify_nested_elements(root: &mut Node) -> Result<(), FullTextParserError> {
    // Depth-first cursor over the whole subtree.
    let mut node_iter = Some(root.clone());

    while let Some(mut node) = node_iter {
        let tag_name = node.get_name().to_uppercase();

        // Only plain DIV/SECTION wrappers that are not the root (have a
        // parent) are candidates; ARTICLE is always kept.
        if tag_name != "ARTICLE"
            && node.get_parent().is_some()
            && (tag_name == "DIV" || tag_name == "SECTION")
        {
            if Util::is_element_without_content(&node) {
                // Empty wrapper: unlink it and jump to the next node outside
                // the removed subtree.
                node_iter = Util::remove_and_next(&mut node);
                continue;
            } else if Util::has_single_tag_inside_element(&node, "DIV")
                || Util::has_single_tag_inside_element(&node, "SECTION")
            {
                if let Some(mut parent) = node.get_parent() {
                    // NOTE(review): this takes the first child *node*, which
                    // may be a whitespace text node rather than the single
                    // DIV/SECTION element — confirm intended.
                    if let Some(mut child) = node.get_child_nodes().into_iter().next() {
                        // Preserve the wrapper's attributes on the promoted child.
                        for (k, v) in node.get_attributes().into_iter() {
                            child.set_attribute(&k, &v).map_err(|e| {
                                log::error!("{e}");
                                FullTextParserError::Xml
                            })?;
                        }

                        // Replace the wrapper with its child in the parent.
                        parent
                            .replace_child_node(child, node.clone())
                            .map_err(|e| {
                                log::error!("{e}");
                                FullTextParserError::Xml
                            })?;
                    }
                }
            }
        }

        node_iter = Util::next_node(&node, false);
    }
    Ok(())
}
}

View file

@ -9,7 +9,7 @@ use libxml::tree::{node, Document, Node, NodeType};
use self::state::State;
use super::error::FullTextParserError;
use crate::constants;
use crate::{constants, util::Util};
pub struct Readability;
@ -43,13 +43,13 @@ impl Readability {
None => match_string,
};
if !Self::is_probably_visible(node_ref) {
node = Self::remove_and_next(node_ref);
if !Util::is_probably_visible(node_ref) {
node = Util::remove_and_next(node_ref);
continue;
}
if Self::check_byline(node_ref, &match_string, &mut state) {
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -57,7 +57,7 @@ impl Readability {
&& Self::header_duplicates_title(node_ref, title)
{
state.should_remove_title_header = false;
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -65,18 +65,18 @@ impl Readability {
if state.strip_unlikely {
if constants::UNLIELY_CANDIDATES.is_match(&match_string)
&& !constants::OKAY_MAYBE_ITS_A_CANDIDATE.is_match(&match_string)
&& !Self::has_ancestor_tag(node_ref, "table", None)
&& !Self::has_ancestor_tag(node_ref, "code", None)
&& !Util::has_ancestor_tag(node_ref, "table", None)
&& !Util::has_ancestor_tag(node_ref, "code", None)
&& tag_name != "BODY"
&& tag_name != "A"
{
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
if let Some(role) = node_ref.get_attribute("role") {
if constants::UNLIKELY_ROLES.contains(&role.as_str()) {
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
}
@ -92,9 +92,9 @@ impl Readability {
|| tag_name == "H4"
|| tag_name == "H5"
|| tag_name == "H6")
&& Self::is_element_without_content(node_ref)
&& Util::is_element_without_content(node_ref)
{
node = Self::remove_and_next(node_ref);
node = Util::remove_and_next(node_ref);
continue;
}
@ -110,7 +110,7 @@ impl Readability {
if Self::is_phrasing_content(&child_node) {
if let Some(p) = p.as_mut() {
let _ = p.add_child(&mut child_node);
} else if !Self::is_whitespace(&child_node) {
} else if !Util::is_whitespace(&child_node) {
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
node_ref
@ -127,7 +127,7 @@ impl Readability {
}
} else if let Some(p) = p.as_mut() {
for mut r_node in p.get_child_nodes().into_iter().rev() {
if Self::is_whitespace(&r_node) {
if Util::is_whitespace(&r_node) {
r_node.unlink();
}
}
@ -138,8 +138,8 @@ impl Readability {
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
if Self::has_single_tag_inside_element(node_ref, "P")
&& Self::get_link_density(node_ref) < 0.25
if Util::has_single_tag_inside_element(node_ref, "P")
&& Util::get_link_density(node_ref) < 0.25
{
if let Some(new_node) = node_ref.get_child_nodes().first() {
if let Some(mut parent) = node_ref.get_parent() {
@ -154,14 +154,14 @@ impl Readability {
continue;
}
}
} else if !Self::has_child_block_element(node_ref)
} else if !Util::has_child_block_element(node_ref)
&& node_ref.set_name("P").is_ok()
{
elements_to_score.push(node_ref.clone());
}
}
node = Self::next_node(node_ref, false);
node = Util::next_node(node_ref, false);
}
let mut candidates = Vec::new();
@ -173,7 +173,7 @@ impl Readability {
continue;
}
let inner_text = Self::get_inner_text(&element_to_score, true);
let inner_text = Util::get_inner_text(&element_to_score, true);
// If this paragraph is less than 25 characters, don't even count it.
if inner_text.len() < 25 {
@ -181,7 +181,7 @@ impl Readability {
}
// Exclude nodes with no ancestor.
let ancestors = Self::get_node_ancestors(&element_to_score, 5);
let ancestors = Util::get_node_ancestors(&element_to_score, 5);
if ancestors.is_empty() {
continue;
}
@ -234,7 +234,7 @@ impl Readability {
// should have a relatively small link density (5% or less) and be mostly
// unaffected by this operation.
if let Some(content_score) = Self::get_content_score(candidate) {
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
let candidate_score = content_score * (1.0 - Util::get_link_density(candidate));
Self::set_content_score(candidate, candidate_score)?;
}
}
@ -242,7 +242,7 @@ impl Readability {
candidates.sort_by(|a, b| {
if let (Some(a), Some(b)) = (Self::get_content_score(a), Self::get_content_score(b))
{
a.partial_cmp(&b).unwrap_or(Ordering::Equal)
b.partial_cmp(&a).unwrap_or(Ordering::Equal)
} else {
Ordering::Equal
}
@ -317,7 +317,7 @@ impl Readability {
// The scores shouldn't get too low.
let score_threshold = last_score / 3.0;
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY") {
if parent_of_top_candidate
.as_ref()
.map(|n| Self::get_content_score(n).is_none())
@ -354,7 +354,7 @@ impl Readability {
// joining logic when adjacent content is actually located in parent's sibling node.
parent_of_top_candidate = top_candidate.get_parent();
while Self::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
while Util::has_tag_name(parent_of_top_candidate.as_ref(), "BODY")
&& parent_of_top_candidate
.as_ref()
.map(|n| n.get_child_elements().len() == 1)
@ -414,8 +414,8 @@ impl Readability {
{
append = true;
} else if sibling.get_name().to_uppercase() == "P" {
let link_density = Self::get_link_density(&sibling);
let node_content = Self::get_inner_text(&sibling, false);
let link_density = Util::get_link_density(&sibling);
let node_content = Util::get_inner_text(&sibling, false);
let node_length = node_content.len();
if node_length > 80
@ -432,7 +432,8 @@ impl Readability {
if append {
log::debug!("Appending node: {sibling:?}");
if !constants::ALTER_TO_DIV_EXCEPTIONS.contains(sibling.get_name().as_str())
if !constants::ALTER_TO_DIV_EXCEPTIONS
.contains(sibling.get_name().to_uppercase().as_str())
{
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
@ -503,7 +504,7 @@ impl Readability {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
let text = Self::get_inner_text(&article_content, true);
let text = Util::get_inner_text(&article_content, true);
let text_length = text.len();
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
@ -525,12 +526,14 @@ impl Readability {
attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
// But first check if we actually have something
if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
best_attempt.unlink();
root.add_child(&mut best_attempt).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
if let Some((best_attempt, _len, _document)) = attempts.pop() {
for mut child in best_attempt.get_child_nodes() {
child.unlink();
root.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
}
parse_successful = true;
}
@ -541,10 +544,13 @@ impl Readability {
.dup()
.map_err(|()| FullTextParserError::Readability)?;
} else {
root.add_child(&mut article_content).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
for mut child in article_content.get_child_nodes() {
child.unlink();
root.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
}
return Ok(parse_successful);
}
}
@ -563,83 +569,6 @@ impl Readability {
})
}
fn is_probably_visible(node: &Node) -> bool {
let display_none = node
.get_attribute("display")
.map(|display| display == "none")
.unwrap_or(false);
let is_hidden = node.has_attribute("hidden");
let aria_hidden = node
.get_attribute("aria-hidden")
.map(|attr| attr == "true")
.unwrap_or(false);
let has_fallback_image = node.get_class_names().contains("fallback-image");
!display_none && !is_hidden && !aria_hidden || has_fallback_image
}
fn is_whitespace(node: &Node) -> bool {
let is_text_node = node
.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false);
let is_element_node = node
.get_type()
.map(|t| t == NodeType::ElementNode)
.unwrap_or(false);
(is_text_node && node.get_content().trim().is_empty())
|| (is_element_node && node.get_name().to_uppercase() == "BR")
}
fn remove_and_next(node: &mut Node) -> Option<Node> {
let next_node = Self::next_node(node, true);
node.unlink();
next_node
}
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
let mut node = node.clone();
// First check for kids if those aren't being ignored
let first_child = node.get_first_child();
if !ignore_self_and_kids && first_child.is_some() {
return first_child;
}
// Then for siblings...
let next_sibling = node.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
}
// And finally, move up the parent chain *and* find a sibling
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
loop {
let parent = node.get_parent();
if parent.is_none() {
break;
}
if let Some(parent) = parent {
let parent_name = parent.get_name().to_uppercase();
if parent_name == "HTML" {
break;
}
let next_sibling = parent.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
} else {
node = parent;
}
}
}
None
}
fn check_byline(node: &Node, matchstring: &str, state: &mut State) -> bool {
if state.byline.is_some() {
return false;
@ -680,123 +609,15 @@ impl Readability {
if name != "h1" && name != "h2" {
return false;
}
let heading = Self::get_inner_text(node, false);
let heading = Util::get_inner_text(node, false);
if let Some(title) = title {
Self::text_similarity(&heading, title) > 0.75
Util::text_similarity(&heading, title) > 0.75
} else {
false
}
}
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
let content = node.get_content().trim().to_owned();
if normalize_spaces {
constants::NORMALIZE.replace(&content, " ").into()
} else {
content
}
}
fn text_similarity(a: &str, b: &str) -> f64 {
let a = a.to_lowercase();
let b = b.to_lowercase();
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
if tokens_a.is_empty() || tokens_b.is_empty() {
return 0.0;
}
let tokens_b_total = tokens_b.join(" ").len() as f64;
let uniq_tokens_b = tokens_b
.into_iter()
.filter(|token| !tokens_a.iter().any(|t| t == token))
.collect::<Vec<_>>();
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
let distance_b = uniq_tokens_b_total / tokens_b_total;
1.0 - distance_b
}
fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
let max_depth = max_depth.unwrap_or(3);
let tag_name = tag_name.to_uppercase();
let mut depth = 0;
let mut node = node.get_parent();
loop {
if depth > max_depth {
return false;
}
let tmp_node = match node {
Some(node) => node,
None => return false,
};
if tmp_node.get_name() == tag_name {
return true;
}
node = tmp_node.get_parent();
depth += 1;
}
}
fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
// There should be exactly 1 element child with given tag
if node.get_child_nodes().len() == 1
|| node
.get_child_nodes()
.first()
.map(|n| n.get_name().to_uppercase() == tag)
.unwrap_or(false)
{
return false;
}
// And there should be no text nodes with real content
node.get_child_nodes().iter().any(|n| {
n.get_type()
.map(|t| t == NodeType::TextNode)
.unwrap_or(false)
&& constants::HAS_CONTENT.is_match(&n.get_content())
})
}
fn is_element_without_content(node: &Node) -> bool {
if let Some(node_type) = node.get_type() {
let len = node.get_child_nodes().len();
return node_type == NodeType::ElementNode
&& node.get_content().trim().is_empty()
&& (len == 0
|| len
== Self::get_elements_by_tag_name(node, "br").len()
+ Self::get_elements_by_tag_name(node, "hr").len());
}
false
}
fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
let tag = tag.to_uppercase();
let all_tags = tag == "*";
let mut vec = Vec::new();
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
for child in node.get_child_elements() {
if all_tags || child.get_name().to_uppercase() == tag {
vec.push(child.clone());
}
get_elems(&child, tag, vec, all_tags);
}
}
get_elems(node, &tag, &mut vec, all_tags);
vec
}
fn is_phrasing_content(node: &Node) -> bool {
let tag_name = node.get_name().to_uppercase();
let is_text_node = node
@ -814,56 +635,6 @@ impl Readability {
.all(|val| val)
}
fn get_link_density(node: &Node) -> f64 {
let text_length = Self::get_inner_text(node, false).len();
if text_length == 0 {
return 0.0;
}
let mut link_length = 0.0;
// XXX implement _reduceNodeList?
let link_nodes = Self::get_elements_by_tag_name(node, "A");
for link_node in link_nodes {
if let Some(href) = link_node.get_attribute("href") {
let coefficient = if constants::HASH_URL.is_match(&href) {
0.3
} else {
1.0
};
link_length += Self::get_inner_text(&link_node, false).len() as f64 * coefficient;
}
}
link_length / text_length as f64
}
// Determine whether element has any children block level elements.
fn has_child_block_element(node: &Node) -> bool {
node.get_child_elements().iter().any(|node| {
constants::DIV_TO_P_ELEMS.contains(node.get_name().as_str())
|| Self::has_child_block_element(node)
})
}
fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
let mut ancestors = Vec::new();
let mut node = node.clone();
for _ in 0..=max_depth {
let parent = node.get_parent();
match parent {
Some(parent) => {
ancestors.push(parent.clone());
node = parent;
}
None => return ancestors,
}
}
ancestors
}
// Initialize a node with the readability object. Also checks the
// className/id for special names to add to its score.
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
@ -910,9 +681,4 @@ impl Readability {
weight
}
fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
.unwrap_or(false)
}
}

View file

@ -14,6 +14,7 @@ async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
let article = Article {
title: None,
author: None,
@ -34,9 +35,18 @@ async fn test_1() {
let url = Url::parse("http://google.com").unwrap();
let (document, xpath_ctx, mut article) = prepare(&html, &url).await;
let mut article_document = Document::new().unwrap();
let mut root = Node::new("article", None, &document).unwrap();
article_document.set_root_element(&root);
metadata::extract(&xpath_ctx, None, None, &mut article);
super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
if let Some(mut root) = article_document.get_root_element() {
crate::FullTextParser::post_process_content(&mut root).unwrap();
}
article.document = Some(article_document);
let html = article.get_content().unwrap();
std::fs::write("test.html", html).unwrap();
}

View file

@ -1,4 +1,5 @@
use super::FullTextParser;
use super::{FullTextParser, config::ConfigEntry};
use libxml::tree::SaveOptions;
use reqwest::Client;
use std::path::PathBuf;
@ -72,3 +73,49 @@ async fn encoding_windows_1252() {
.unwrap();
assert!(html.contains("Bund-Länder-Konferenz"));
}
#[tokio::test]
// End-to-end check for FullTextParser::unwrap_noscript_images: a lazy-load
// placeholder <img> followed by a <noscript> fallback image must be replaced
// by the fallback, with the placeholder's src preserved as `data-old-src`.
// NOTE(review): the expected string is whitespace-sensitive — it must match
// libxml's `format: true` serialization exactly.
async fn unwrap_noscript_images() {
let _ = env_logger::builder().is_test(true).try_init();
// Input: placeholder image plus <noscript> fallback (as e.g. Medium emits).
let html = r#"
<p>Lorem ipsum dolor sit amet,
<span class="lazyload">
<img src="foto-m0101.jpg" alt="image description">
<noscript><img src="foto-m0102.jpg" alt="image description"></noscript>
</span>
consectetur adipiscing elit.
</p>
"#;
// Expected output: the <noscript> is gone, the fallback image replaced the
// placeholder, and the old src survives as data-old-src.
let expected = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><body>
<p>Lorem ipsum dolor sit amet,
<span class="lazyload">
<img src="foto-m0102.jpg" alt="image description" data-old-src="foto-m0101.jpg">
</span>
consectetur adipiscing elit.
</p>
</body></html>
"#;
let empty_config = ConfigEntry::default();
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::unwrap_noscript_images(&xpath_ctx).unwrap();
// Serialize with the same options Article::get_content uses.
let options = SaveOptions {
format: true,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let res = document.to_string_with_options(options);
assert_eq!(res, expected);
}

View file

@ -1,11 +1,17 @@
use libxml::{tree::Node, xpath::Context};
use libxml::{
tree::{Node, NodeType},
xpath::Context,
};
use reqwest::{
header::{HeaderMap, HeaderName, HeaderValue},
Response,
};
use tokio::fs::DirEntry;
use crate::full_text_parser::{config::ConfigEntry, error::FullTextParserError};
use crate::{
constants,
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
};
pub struct Util;
@ -219,4 +225,258 @@ impl Util {
}
Ok(())
}
/// Heuristic visibility check, mirroring Readability.js `_isProbablyVisible`.
///
/// A node counts as hidden when it carries display:none, the `hidden`
/// attribute, or `aria-hidden="true"` — except that the "fallback-image"
/// class (used by lazy-load markup) overrides `aria-hidden` only.
pub fn is_probably_visible(node: &Node) -> bool {
    // NOTE(review): this reads an attribute literally named "display";
    // Readability.js inspects the inline `style` for `display: none` —
    // confirm whether the `style` attribute should be parsed here instead.
    let display_none = node
        .get_attribute("display")
        .map(|display| display == "none")
        .unwrap_or(false);
    let is_hidden = node.has_attribute("hidden");
    let aria_hidden = node
        .get_attribute("aria-hidden")
        .map(|attr| attr == "true")
        .unwrap_or(false);
    let has_fallback_image = node.get_class_names().contains("fallback-image");

    // FIX: previously `!display_none && !is_hidden && !aria_hidden || has_fallback_image`,
    // whose precedence let a "fallback-image" class resurrect nodes that are
    // display:none or `hidden`. Upstream scopes the override to aria-hidden.
    !display_none && !is_hidden && (!aria_hidden || has_fallback_image)
}
/// True for nodes that contribute no visible content: text nodes holding only
/// whitespace, and `<br>` elements.
pub fn is_whitespace(node: &Node) -> bool {
    match node.get_type() {
        Some(NodeType::TextNode) => node.get_content().trim().is_empty(),
        Some(NodeType::ElementNode) => node.get_name().to_uppercase() == "BR",
        _ => false,
    }
}
/// Unlink `node` from the tree and return the node a depth-first traversal
/// would visit next, skipping the removed subtree entirely.
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
    // Compute the successor *before* unlinking, while the node still has
    // siblings/parents to walk to.
    let successor = Self::next_node(node, true);
    node.unlink();
    successor
}
/// Return the next node in depth-first document order, or `None` when the
/// traversal is exhausted (reaching the `<html>` element's level ends it).
/// With `ignore_self_and_kids` set, the node's own children are skipped —
/// used when the current subtree has just been removed.
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    let mut current = node.clone();

    // Descend first, unless the caller wants the subtree skipped.
    if !ignore_self_and_kids {
        if let Some(child) = current.get_first_child() {
            return Some(child);
        }
    }

    // Otherwise try the next sibling at this level.
    if let Some(sibling) = current.get_next_sibling() {
        return Some(sibling);
    }

    // Finally climb the parent chain looking for an unvisited sibling.
    // (Depth-first order: the parents themselves were already visited.)
    while let Some(parent) = current.get_parent() {
        // Stop at the document level — don't walk past <html>.
        if parent.get_name().to_uppercase() == "HTML" {
            return None;
        }
        if let Some(sibling) = parent.get_next_sibling() {
            return Some(sibling);
        }
        current = parent;
    }
    None
}
/// Return the node's trimmed text content; with `normalize_spaces`, runs of
/// internal whitespace are additionally collapsed to single spaces.
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
    let text = node.get_content().trim().to_owned();
    if !normalize_spaces {
        return text;
    }
    constants::NORMALIZE.replace(&text, " ").into()
}
/// Rough similarity of two strings in [0, 1]: the fraction of `b`'s token
/// mass (by joined length) that also appears among `a`'s tokens. Returns 0.0
/// when either side tokenizes to nothing.
pub fn text_similarity(a: &str, b: &str) -> f64 {
    let lower_a = a.to_lowercase();
    let lower_b = b.to_lowercase();
    let tokens_a: Vec<_> = constants::TOKENIZE.split(&lower_a).collect();
    let tokens_b: Vec<_> = constants::TOKENIZE.split(&lower_b).collect();

    if tokens_a.is_empty() || tokens_b.is_empty() {
        return 0.0;
    }

    // Total length of b's tokens, joined with single spaces.
    let total_len = tokens_b.join(" ").len() as f64;

    // Tokens of b that never occur in a.
    let unique_to_b: Vec<_> = tokens_b
        .iter()
        .filter(|token| !tokens_a.contains(token))
        .copied()
        .collect();
    let unique_len = unique_to_b.join(" ").len() as f64;

    // Share of b that *is* covered by a.
    1.0 - unique_len / total_len
}
/// Check whether any ancestor of `node`, up to `max_depth` levels (default 3),
/// has the given tag name. The comparison is case-insensitive.
pub fn has_ancestor_tag(node: &Node, tag_name: &str, max_depth: Option<u64>) -> bool {
    let max_depth = max_depth.unwrap_or(3);
    let tag_name = tag_name.to_uppercase();
    let mut depth = 0;
    let mut node = node.get_parent();

    loop {
        if depth > max_depth {
            return false;
        }

        let tmp_node = match node {
            Some(node) => node,
            None => return false,
        };

        // FIX: compare uppercased names on both sides. libxml reports parsed
        // HTML element names in lowercase, while `tag_name` was uppercased
        // above — the old raw comparison could never match (callers pass
        // "table"/"code").
        if tmp_node.get_name().to_uppercase() == tag_name {
            return true;
        }

        node = tmp_node.get_parent();
        depth += 1;
    }
}
/// Check whether `node` contains exactly one child element with tag name
/// `tag` and no text nodes with real content. Mirrors Readability.js
/// `_hasSingleTagInsideElement`. `tag` must be passed uppercased (e.g. "P").
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
    // There should be exactly 1 element child with the given tag.
    // FIX: both conditions were inverted (`== 1 ||` / `== tag`), making the
    // function bail out for precisely the single-child case it is supposed
    // to inspect. Element children are checked (as in Readability.js), so
    // surrounding whitespace text nodes don't disqualify the wrapper.
    let child_elements = node.get_child_elements();
    let single_match = child_elements.len() == 1
        && child_elements
            .first()
            .map(|n| n.get_name().to_uppercase() == tag)
            .unwrap_or(false);
    if !single_match {
        return false;
    }

    // And there should be no text nodes with real content.
    // FIX: the result was previously missing this negation.
    !node.get_child_nodes().iter().any(|n| {
        n.get_type()
            .map(|t| t == NodeType::TextNode)
            .unwrap_or(false)
            && constants::HAS_CONTENT.is_match(&n.get_content())
    })
}
/// True for element nodes that carry no visible content: empty text overall,
/// and either no children at all or nothing but `<br>`/`<hr>` descendants.
pub fn is_element_without_content(node: &Node) -> bool {
    match node.get_type() {
        Some(NodeType::ElementNode) => {
            let children = node.get_child_nodes();
            node.get_content().trim().is_empty()
                && (children.is_empty()
                    || children.len()
                        == Self::get_elements_by_tag_name(node, "br").len()
                            + Self::get_elements_by_tag_name(node, "hr").len())
        }
        // Non-element nodes (and nodes without a type) never qualify.
        _ => false,
    }
}
/// Collect all descendant elements with the given tag name (case-insensitive)
/// in document (preorder) order; `"*"` matches every element. The node
/// itself is not included.
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
    let wanted = tag.to_uppercase();
    let match_all = wanted == "*";
    let mut found = Vec::new();

    // Explicit stack instead of recursion; children are pushed in reverse so
    // popping yields preorder (first child first).
    let mut stack = node.get_child_elements();
    stack.reverse();

    while let Some(current) = stack.pop() {
        if match_all || current.get_name().to_uppercase() == wanted {
            found.push(current.clone());
        }
        let mut kids = current.get_child_elements();
        kids.reverse();
        stack.extend(kids);
    }

    found
}
/// Fraction of the node's text length that sits inside `<a>` descendants.
/// Pure fragment links (`#...`) are discounted to 30% of their length.
/// Returns 0.0 for nodes with no text at all.
pub fn get_link_density(node: &Node) -> f64 {
    let text_length = Util::get_inner_text(node, false).len();
    if text_length == 0 {
        return 0.0;
    }

    // XXX implement _reduceNodeList?
    let link_length: f64 = Util::get_elements_by_tag_name(node, "A")
        .into_iter()
        .filter_map(|link_node| {
            // Links without an href contribute nothing.
            link_node.get_attribute("href").map(|href| {
                let coefficient = if constants::HASH_URL.is_match(&href) {
                    0.3
                } else {
                    1.0
                };
                Util::get_inner_text(&link_node, false).len() as f64 * coefficient
            })
        })
        .sum();

    link_length / text_length as f64
}
/// Determine whether the element has any block-level elements anywhere in
/// its descendant tree (per the DIV_TO_P_ELEMS tag set).
pub fn has_child_block_element(node: &Node) -> bool {
    let children = node.get_child_elements();
    children.iter().any(|child| {
        constants::DIV_TO_P_ELEMS.contains(child.get_name().as_str())
            || Self::has_child_block_element(child)
    })
}
/// Collect up to `max_depth + 1` ancestors of `node`, nearest first,
/// stopping early at the tree root.
pub fn get_node_ancestors(node: &Node, max_depth: u64) -> Vec<Node> {
    let mut chain = Vec::new();
    let mut current = node.clone();

    // `0..=max_depth` intentionally yields max_depth + 1 steps, matching the
    // original traversal bound.
    for _ in 0..=max_depth {
        match current.get_parent() {
            Some(parent) => {
                chain.push(parent.clone());
                current = parent;
            }
            None => break,
        }
    }

    chain
}
/// Case-insensitive tag-name test; `None` nodes never match.
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
    match node {
        Some(n) => n.get_name().to_uppercase() == tag_name.to_uppercase(),
        None => false,
    }
}
// Check if node is an image, or if node contains exactly one image —
// whether as a direct child or among its descendants (wrapper chains with
// no interleaved text are followed downward).
pub fn is_single_image(node: &Node) -> bool {
    // An <img> itself trivially qualifies.
    if node.get_name().to_uppercase() == "IMG" {
        return true;
    }

    // Otherwise: exactly one child and no text content, then recurse.
    let children = node.get_child_nodes();
    if children.len() != 1 || !node.get_content().trim().is_empty() {
        return false;
    }

    children.first().map(Self::is_single_image).unwrap_or(false)
}
}