don't move content nodes to <article> root node

could fix potential crash?
2025-07-07 16:15:32 +02:00 · 2023-06-29 19:47:49 +02:00 · 2023-06-29 19:47:49 +02:00 · fcec0d83ee
commit fcec0d83ee
parent fdb8d9a97e
4 changed files with 32 additions and 11 deletions
--- a/article_scraper/Cargo.toml
+++ b/article_scraper/Cargo.toml
@ -14,7 +14,7 @@ exclude = ["resources/tests"]
 thiserror = "1.0"
 libxml = "0.3"
 reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
-tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
+tokio = { version = "1.28", features = ["macros", "fs", "io-util"] }
 url = "2.3"
 regex = "1.8"
 encoding_rs = "0.8"
--- a/article_scraper/src/clean.rs
+++ b/article_scraper/src/clean.rs
@ -1,4 +1,3 @@
 use libxml::tree::Node;
 use reqwest::Url;
 use crate::full_text_parser::error::FullTextParserError;
@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextPar
    }
    FullTextParser::post_process_document(&document)?;
-    let mut article_node =
+    let content_node = if let Some(root) = document.get_root_element() {
-        Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        if root.get_name() == "body" {
-    let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
+            Some(root)
-
+        } else if let Some(body) = Util::get_first_element_by_tag_name(&root, "body") {
-    for mut node in content_nodes {
+            Some(body)
-        node.unlink();
+        } else {
-        article_node.add_child(&mut node).unwrap();
+            Some(root)
        }
    } else {
        None
    }
    .ok_or(FullTextParserError::Xml)?;
    Ok(CleanedHtml {
-        html: document.node_to_string(&article_node),
+        html: document.node_to_string(&content_node),
        thumbnail,
    })
 }
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@ -633,7 +633,7 @@ impl FullTextParser {
                    })
                    .is_err();
                if !success {
-                    log::warn!("Failed to add iframe as child of video wrapper <div>");
+                    log::debug!("Failed to add iframe as child of video wrapper <div>");
                }
            } else {
                log::warn!("Failed to get parent of iframe");
--- a/article_scraper/src/util.rs
+++ b/article_scraper/src/util.rs
@ -547,6 +547,24 @@ impl Util {
        vec
    }
    /// Returns the first descendant element of `node` whose tag name matches
    /// `tag`, or `None` when no descendant matches.
    ///
    /// Names are compared after upper-casing both sides, so the match is
    /// case-insensitive. `node` itself is never tested, only its descendants.
    /// The search is depth-first in pre-order: a child element is checked
    /// before its own subtree, and that subtree is exhausted before the next
    /// sibling is looked at.
    pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option<Node> {
        // Normalise the wanted tag once instead of on every comparison.
        let tag = tag.to_uppercase();

        // Recursive helper; `tag` is expected to be upper-cased already.
        fn get_elems(node: &Node, tag: &str) -> Option<Node> {
            for child in node.get_child_elements() {
                if child.get_name().to_uppercase() == tag {
                    return Some(child);
                }

                // Stop only when this child's subtree really contains a match.
                // Returning the recursive result unconditionally would abandon
                // every remaining sibling as soon as the first child's subtree
                // came back empty.
                if let Some(found) = get_elems(&child, tag) {
                    return Some(found);
                }
            }

            None
        }

        get_elems(node, &tag)
    }
    pub fn get_link_density(node: &Node) -> f64 {
        let text_length = Util::get_inner_text(node, true).len();
        if text_length == 0 {