mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
don't move content nodes to <article> root node
could fix potential crash?
This commit is contained in:
parent
fdb8d9a97e
commit
fcec0d83ee
4 changed files with 32 additions and 11 deletions
|
@ -14,7 +14,7 @@ exclude = ["resources/tests"]
|
||||||
thiserror = "1.0"
|
thiserror = "1.0"
|
||||||
libxml = "0.3"
|
libxml = "0.3"
|
||||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||||
tokio = { version = "1.27", features = ["macros", "fs", "io-util"] }
|
tokio = { version = "1.28", features = ["macros", "fs", "io-util"] }
|
||||||
url = "2.3"
|
url = "2.3"
|
||||||
regex = "1.8"
|
regex = "1.8"
|
||||||
encoding_rs = "0.8"
|
encoding_rs = "0.8"
|
||||||
|
|
|
@ -1,4 +1,3 @@
|
||||||
use libxml::tree::Node;
|
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
|
||||||
use crate::full_text_parser::error::FullTextParserError;
|
use crate::full_text_parser::error::FullTextParserError;
|
||||||
|
@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<CleanedHtml, FullTextPar
|
||||||
}
|
}
|
||||||
FullTextParser::post_process_document(&document)?;
|
FullTextParser::post_process_document(&document)?;
|
||||||
|
|
||||||
let mut article_node =
|
let content_node = if let Some(root) = document.get_root_element() {
|
||||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
if root.get_name() == "body" {
|
||||||
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
|
Some(root)
|
||||||
|
} else if let Some(body) = Util::get_first_element_by_tag_name(&root, "body") {
|
||||||
for mut node in content_nodes {
|
Some(body)
|
||||||
node.unlink();
|
} else {
|
||||||
article_node.add_child(&mut node).unwrap();
|
Some(root)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
}
|
}
|
||||||
|
.ok_or(FullTextParserError::Xml)?;
|
||||||
|
|
||||||
Ok(CleanedHtml {
|
Ok(CleanedHtml {
|
||||||
html: document.node_to_string(&article_node),
|
html: document.node_to_string(&content_node),
|
||||||
thumbnail,
|
thumbnail,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
|
@ -633,7 +633,7 @@ impl FullTextParser {
|
||||||
})
|
})
|
||||||
.is_err();
|
.is_err();
|
||||||
if !success {
|
if !success {
|
||||||
log::warn!("Failed to add iframe as child of video wrapper <div>");
|
log::debug!("Failed to add iframe as child of video wrapper <div>");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
log::warn!("Failed to get parent of iframe");
|
log::warn!("Failed to get parent of iframe");
|
||||||
|
|
|
@ -547,6 +547,24 @@ impl Util {
|
||||||
vec
|
vec
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the first descendant element of `node` whose tag name equals `tag`.
///
/// Names are compared case-insensitively: both sides are upper-cased before
/// the comparison. The search is depth-first, visiting children in the order
/// `get_child_elements` yields them. `node` itself is never a candidate.
///
/// Returns `None` when no descendant element matches.
pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option<Node> {
    let tag = tag.to_uppercase();

    // Recursive helper; `tag` is expected to be upper-cased already.
    fn find_first(node: &Node, tag: &str) -> Option<Node> {
        for child in node.get_child_elements() {
            if child.get_name().to_uppercase() == tag {
                return Some(child);
            }

            // Stop only when the subtree really contains a match. Returning the
            // recursive result unconditionally would end the loop after the
            // first child and never look at its siblings (e.g. a <body> that
            // follows <head>).
            if let Some(found) = find_first(&child, tag) {
                return Some(found);
            }
        }

        None
    }

    find_first(node, &tag)
}
|
||||||
|
|
||||||
pub fn get_link_density(node: &Node) -> f64 {
|
pub fn get_link_density(node: &Node) -> f64 {
|
||||||
let text_length = Util::get_inner_text(node, true).len();
|
let text_length = Util::get_inner_text(node, true).len();
|
||||||
if text_length == 0 {
|
if text_length == 0 {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue