1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

don't move content nodes to <article> root node

could fix potential crash?
This commit is contained in:
Jan Lukas Gernert 2023-06-29 19:47:49 +02:00
parent fdb8d9a97e
commit fcec0d83ee
4 changed files with 32 additions and 11 deletions

View file

@ -14,7 +14,7 @@ exclude = ["resources/tests"]
thiserror = "1.0"
libxml = "0.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
tokio = { version = "1.28", features = ["macros", "fs", "io-util"] }
url = "2.3"
regex = "1.8"
encoding_rs = "0.8"

View file

@ -1,4 +1,3 @@
use libxml::tree::Node;
use reqwest::Url;
use crate::full_text_parser::error::FullTextParserError;
@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextPar
}
FullTextParser::post_process_document(&document)?;
let mut article_node =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
for mut node in content_nodes {
node.unlink();
article_node.add_child(&mut node).unwrap();
let content_node = if let Some(root) = document.get_root_element() {
if root.get_name() == "body" {
Some(root)
} else if let Some(body) = Util::get_first_element_by_tag_name(&root, "body") {
Some(body)
} else {
Some(root)
}
} else {
None
}
.ok_or(FullTextParserError::Xml)?;
Ok(CleanedHtml {
html: document.node_to_string(&article_node),
html: document.node_to_string(&content_node),
thumbnail,
})
}

View file

@ -633,7 +633,7 @@ impl FullTextParser {
})
.is_err();
if !success {
log::warn!("Failed to add iframe as child of video wrapper <div>");
log::debug!("Failed to add iframe as child of video wrapper <div>");
}
} else {
log::warn!("Failed to get parent of iframe");

View file

@ -547,6 +547,24 @@ impl Util {
vec
}
/// Returns the first descendant element of `node` whose tag name equals `tag`.
///
/// The search is depth-first in document order: each child element is tested
/// first, then its subtree, before moving on to the next sibling. Names are
/// compared after upper-casing both sides, so the match is case-insensitive.
/// `node` itself is never tested, only its descendants.
///
/// Returns `None` when no descendant element matches.
pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option<Node> {
    let tag = tag.to_uppercase();

    // Recursive helper; expects `tag` to already be upper-cased.
    fn get_elems(node: &Node, tag: &str) -> Option<Node> {
        for child in node.get_child_elements() {
            if child.get_name().to_uppercase() == tag {
                return Some(child);
            }

            // Only stop when the subtree actually contained a match.
            // Returning the recursive result unconditionally would abandon
            // the loop after the first child and never visit its siblings.
            if let Some(found) = get_elems(&child, tag) {
                return Some(found);
            }
        }

        None
    }

    get_elems(node, &tag)
}
pub fn get_link_density(node: &Node) -> f64 {
let text_length = Util::get_inner_text(node, true).len();
if text_length == 0 {