1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

do some null checks before unlinking nodes

This commit is contained in:
Jan Lukas Gernert 2024-02-13 19:06:05 +01:00
parent ed8a83708b
commit b13673ce3b
5 changed files with 92 additions and 0 deletions

View file

@ -678,6 +678,10 @@ impl FullTextParser {
let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.is_null() {
continue;
}
let video_wrapper = node let video_wrapper = node
.get_parent() .get_parent()
.and_then(|mut parent| parent.new_child(None, "div").ok()); .and_then(|mut parent| parent.new_child(None, "div").ok());
@ -732,6 +736,10 @@ impl FullTextParser {
) -> Result<(), FullTextParserError> { ) -> Result<(), FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.is_null() {
continue;
}
if let Some(url) = node.get_attribute(attribute) { if let Some(url) = node.get_attribute(attribute) {
let trimmed_url = url.trim(); let trimmed_url = url.trim();
@ -845,6 +853,10 @@ impl FullTextParser {
if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) { if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) {
for mut h2_node in h2_nodes { for mut h2_node in h2_nodes {
if h2_node.is_null() {
continue;
}
if Util::header_duplicates_title(&h2_node, title) { if Util::header_duplicates_title(&h2_node, title) {
h2_node.unlink(); h2_node.unlink();
} }
@ -969,6 +981,10 @@ impl FullTextParser {
// This is done to prevent a placeholder img from being replaced by the img from noscript in the next step. // This is done to prevent a placeholder img from being replaced by the img from noscript in the next step.
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?; let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
for mut img_node in img_nodes { for mut img_node in img_nodes {
if img_node.is_null() {
continue;
}
let attrs = img_node.get_attributes(); let attrs = img_node.get_attributes();
let keep = attrs.iter().any(|(name, value)| { let keep = attrs.iter().any(|(name, value)| {
@ -986,6 +1002,10 @@ impl FullTextParser {
// Next find noscript and try to extract its image // Next find noscript and try to extract its image
let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?; let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?;
for mut noscript_node in noscript_nodes { for mut noscript_node in noscript_nodes {
if noscript_node.is_null() {
continue;
}
// Parse the content of noscript and make sure it only contains an image // Parse the content of noscript and make sure it only contains an image
if !Util::is_single_image(&noscript_node) { if !Util::is_single_image(&noscript_node) {
continue; continue;
@ -1091,6 +1111,10 @@ impl FullTextParser {
{ {
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.is_null() {
continue;
}
if node.get_property("style").is_some() && node.remove_property("style").is_err() { if node.get_property("style").is_some() && node.remove_property("style").is_err() {
return Err(FullTextParserError::Xml); return Err(FullTextParserError::Xml);
} }

View file

@ -179,6 +179,10 @@ impl Readability {
// Put phrasing content into paragraphs. // Put phrasing content into paragraphs.
let mut p: Option<Node> = None; let mut p: Option<Node> = None;
for mut child in node_ref.get_child_nodes().into_iter() { for mut child in node_ref.get_child_nodes().into_iter() {
if child.is_null() {
continue;
}
if Util::is_phrasing_content(&child) { if Util::is_phrasing_content(&child) {
if let Some(p) = p.as_mut() { if let Some(p) = p.as_mut() {
child.unlink(); child.unlink();
@ -205,6 +209,10 @@ impl Readability {
} else if p.is_some() { } else if p.is_some() {
if let Some(p) = p.as_mut() { if let Some(p) = p.as_mut() {
for mut r_node in p.get_child_nodes().into_iter().rev() { for mut r_node in p.get_child_nodes().into_iter().rev() {
if r_node.is_null() {
continue;
}
if Util::is_whitespace(&r_node) { if Util::is_whitespace(&r_node) {
r_node.unlink(); r_node.unlink();
continue; continue;
@ -366,6 +374,10 @@ impl Readability {
Node::new("DIV", None, &document).expect("can't create new node"); Node::new("DIV", None, &document).expect("can't create new node");
for mut child in root.get_child_elements().drain(..) { for mut child in root.get_child_elements().drain(..) {
if child.is_null() {
continue;
}
child.unlink(); child.unlink();
new_top_candidate.add_child(&mut child).unwrap(); new_top_candidate.add_child(&mut child).unwrap();
} }
@ -510,6 +522,10 @@ impl Readability {
if let Some(mut siblings) = siblings { if let Some(mut siblings) = siblings {
for mut sibling in siblings.drain(..) { for mut sibling in siblings.drain(..) {
if sibling.is_null() {
continue;
}
let mut append = false; let mut append = false;
let score = Self::get_content_score(&sibling).unwrap_or(0.0); let score = Self::get_content_score(&sibling).unwrap_or(0.0);
@ -614,6 +630,10 @@ impl Readability {
})?; })?;
for mut child in article_content.get_child_nodes() { for mut child in article_content.get_child_nodes() {
if child.is_null() {
continue;
}
child.unlink(); child.unlink();
div.add_child(&mut child).map_err(|error| { div.add_child(&mut child).map_err(|error| {
log::error!("{error}"); log::error!("{error}");
@ -657,6 +677,10 @@ impl Readability {
// But first check if we actually have something // But first check if we actually have something
if let Some((best_attempt, _len, _document)) = attempts.pop() { if let Some((best_attempt, _len, _document)) = attempts.pop() {
for mut child in best_attempt.get_child_nodes() { for mut child in best_attempt.get_child_nodes() {
if child.is_null() {
continue;
}
child.unlink(); child.unlink();
root.add_child(&mut child).map_err(|error| { root.add_child(&mut child).map_err(|error| {
log::error!("{error}"); log::error!("{error}");
@ -674,6 +698,10 @@ impl Readability {
.map_err(|()| FullTextParserError::Readability)?; .map_err(|()| FullTextParserError::Readability)?;
} else { } else {
for mut child in article_content.get_child_nodes() { for mut child in article_content.get_child_nodes() {
if child.is_null() {
continue;
}
child.unlink(); child.unlink();
root.add_child(&mut child).map_err(|error| { root.add_child(&mut child).map_err(|error| {
log::error!("{error}"); log::error!("{error}");

View file

@ -69,6 +69,10 @@ impl ImageObject {
} }
pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> {
if node.is_null() {
return Err(FullTextParserError::Xml);
}
let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?;
if parent.get_name().to_uppercase() == "A" { if parent.get_name().to_uppercase() == "A" {

View file

@ -221,6 +221,10 @@ impl Util {
let node_vec_clone = node_vec.clone(); let node_vec_clone = node_vec.clone();
for mut node in node_vec { for mut node in node_vec {
if node.is_null() {
continue;
}
let tag_name = node.get_name(); let tag_name = node.get_name();
if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str())
&& node && node
@ -271,6 +275,10 @@ impl Util {
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
let node_vec = Util::evaluate_xpath(context, query, false)?; let node_vec = Util::evaluate_xpath(context, query, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.is_null() {
continue;
}
node.unlink(); node.unlink();
} }
Ok(()) Ok(())
@ -318,6 +326,10 @@ impl Util {
} }
pub fn remove_and_next(node: &mut Node) -> Option<Node> { pub fn remove_and_next(node: &mut Node) -> Option<Node> {
if node.is_null() {
return None;
}
let next_node = Self::next_node(node, true); let next_node = Self::next_node(node, true);
node.unlink(); node.unlink();
next_node next_node
@ -641,6 +653,10 @@ impl Util {
nodes.append(&mut Util::get_elements_by_tag_name(root, "h2")); nodes.append(&mut Util::get_elements_by_tag_name(root, "h2"));
for mut node in nodes.into_iter().rev() { for mut node in nodes.into_iter().rev() {
if node.is_null() {
continue;
}
if Util::get_class_weight(&node) < 0 { if Util::get_class_weight(&node) < 0 {
log::debug!( log::debug!(
"Removing header with low class weight: {} {}", "Removing header with low class weight: {} {}",
@ -675,6 +691,10 @@ impl Util {
let nodes = Util::get_elements_by_tag_name(root, tag); let nodes = Util::get_elements_by_tag_name(root, tag);
for mut node in nodes.into_iter().rev() { for mut node in nodes.into_iter().rev() {
if node.is_null() {
continue;
}
if Self::should_remove(&node, tag) { if Self::should_remove(&node, tag) {
node.unlink(); node.unlink();
} }
@ -972,6 +992,10 @@ impl Util {
// or non-whitespace. This leaves behind the first <br> in the chain // or non-whitespace. This leaves behind the first <br> in the chain
// (which will be replaced with a <p> later). // (which will be replaced with a <p> later).
while let Some(mut n) = next { while let Some(mut n) = next {
if n.is_null() {
break;
}
let is_text_whitespace = n let is_text_whitespace = n
.get_type() .get_type()
.map(|t| t == NodeType::TextNode) .map(|t| t == NodeType::TextNode)
@ -1012,6 +1036,10 @@ impl Util {
next = p.get_next_sibling(); next = p.get_next_sibling();
while let Some(mut next_node) = next { while let Some(mut next_node) = next {
if next_node.is_null() {
break;
}
// If we've hit another <br><br>, we're done adding children to this <p>. // If we've hit another <br><br>, we're done adding children to this <p>.
if next_node.get_name().to_uppercase() == "BR" { if next_node.get_name().to_uppercase() == "BR" {
if let Some(next_elem) = next_node.get_next_element_sibling() { if let Some(next_elem) = next_node.get_next_element_sibling() {
@ -1039,6 +1067,10 @@ impl Util {
} }
while let Some(mut last_child) = p.get_last_child() { while let Some(mut last_child) = p.get_last_child() {
if last_child.is_null() {
continue;
}
let is_text_node = last_child let is_text_node = last_child
.get_type() .get_type()
.map(|t| t == NodeType::TextNode) .map(|t| t == NodeType::TextNode)

View file

@ -87,6 +87,10 @@ impl VideoObject {
} }
pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> {
if node.is_null() {
return Err(FullTextParserError::Xml);
}
let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?;
node.unlink(); node.unlink();