mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
don't move content nodes to <article> root node
could fix potential crash?
This commit is contained in:
parent
fdb8d9a97e
commit
fcec0d83ee
4 changed files with 32 additions and 11 deletions
|
@ -14,7 +14,7 @@ exclude = ["resources/tests"]
|
|||
thiserror = "1.0"
|
||||
libxml = "0.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
|
||||
tokio = { version = "1.28", features = ["macros", "fs", "io-util"] }
|
||||
url = "2.3"
|
||||
regex = "1.8"
|
||||
encoding_rs = "0.8"
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
use libxml::tree::Node;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::full_text_parser::error::FullTextParserError;
|
||||
|
@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextPar
|
|||
}
|
||||
FullTextParser::post_process_document(&document)?;
|
||||
|
||||
let mut article_node =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
|
||||
|
||||
for mut node in content_nodes {
|
||||
node.unlink();
|
||||
article_node.add_child(&mut node).unwrap();
|
||||
let content_node = if let Some(root) = document.get_root_element() {
|
||||
if root.get_name() == "body" {
|
||||
Some(root)
|
||||
} else if let Some(body) = Util::get_first_element_by_tag_name(&root, "body") {
|
||||
Some(body)
|
||||
} else {
|
||||
Some(root)
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
.ok_or(FullTextParserError::Xml)?;
|
||||
|
||||
Ok(CleanedHtml {
|
||||
html: document.node_to_string(&article_node),
|
||||
html: document.node_to_string(&content_node),
|
||||
thumbnail,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -633,7 +633,7 @@ impl FullTextParser {
|
|||
})
|
||||
.is_err();
|
||||
if !success {
|
||||
log::warn!("Failed to add iframe as child of video wrapper <div>");
|
||||
log::debug!("Failed to add iframe as child of video wrapper <div>");
|
||||
}
|
||||
} else {
|
||||
log::warn!("Failed to get parent of iframe");
|
||||
|
|
|
@ -547,6 +547,24 @@ impl Util {
|
|||
vec
|
||||
}
|
||||
|
||||
/// Returns the first descendant element of `node` whose tag name matches `tag`.
///
/// The comparison is case-insensitive: both `tag` and each element name are
/// upper-cased before being compared. Descendants are visited depth-first in
/// document order (an element is checked before its own children, and a whole
/// subtree is searched before moving on to the next sibling). `node` itself is
/// never considered a match, only the elements below it.
///
/// Returns `None` when no descendant element carries the requested tag name.
pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option<Node> {
    // Upper-case the needle once instead of on every comparison.
    let tag = tag.to_uppercase();

    /// Depth-first search below `node`; `tag` must already be upper-cased.
    fn get_elems(node: &Node, tag: &str) -> Option<Node> {
        for child in node.get_child_elements() {
            if child.get_name().to_uppercase() == tag {
                return Some(child);
            }

            // Only stop when the subtree actually contained a match. Returning
            // the recursive result unconditionally would abandon the search
            // after the first child and never look at its siblings (e.g. it
            // would miss <body> because <head> comes first).
            if let Some(found) = get_elems(&child, tag) {
                return Some(found);
            }
        }

        None
    }

    get_elems(node, &tag)
}
|
||||
|
||||
pub fn get_link_density(node: &Node) -> f64 {
|
||||
let text_length = Util::get_inner_text(node, true).len();
|
||||
if text_length == 0 {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue