diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 325cd10..c7d8d9e 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -678,6 +678,10 @@ impl FullTextParser { let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + let video_wrapper = node .get_parent() .and_then(|mut parent| parent.new_child(None, "div").ok()); @@ -732,6 +736,10 @@ impl FullTextParser { ) -> Result<(), FullTextParserError> { let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + if let Some(url) = node.get_attribute(attribute) { let trimmed_url = url.trim(); @@ -845,6 +853,10 @@ impl FullTextParser { if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) { for mut h2_node in h2_nodes { + if h2_node.is_null() { + continue; + } + if Util::header_duplicates_title(&h2_node, title) { h2_node.unlink(); } @@ -969,6 +981,10 @@ impl FullTextParser { // This is done to prevent a placeholder img is replaced by img from noscript in next step. 
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?; for mut img_node in img_nodes { + if img_node.is_null() { + continue; + } + let attrs = img_node.get_attributes(); let keep = attrs.iter().any(|(name, value)| { @@ -986,6 +1002,10 @@ impl FullTextParser { // Next find noscript and try to extract its image let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?; for mut noscript_node in noscript_nodes { + if noscript_node.is_null() { + continue; + } + // Parse content of noscript and make sure it only contains image if !Util::is_single_image(&noscript_node) { continue; @@ -1091,6 +1111,10 @@ impl FullTextParser { { let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + if node.get_property("style").is_some() && node.remove_property("style").is_err() { return Err(FullTextParserError::Xml); } diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs index a0b8c03..ae5154d 100644 --- a/article_scraper/src/full_text_parser/readability/mod.rs +++ b/article_scraper/src/full_text_parser/readability/mod.rs @@ -179,6 +179,10 @@ impl Readability { // Put phrasing content into paragraphs. let mut p: Option = None; for mut child in node_ref.get_child_nodes().into_iter() { + if child.is_null() { + continue; + } + if Util::is_phrasing_content(&child) { if let Some(p) = p.as_mut() { child.unlink(); @@ -205,6 +209,10 @@ impl Readability { } else if p.is_some() { if let Some(p) = p.as_mut() { for mut r_node in p.get_child_nodes().into_iter().rev() { + if r_node.is_null() { + continue; + } + if Util::is_whitespace(&r_node) { r_node.unlink(); continue; @@ -366,6 +374,10 @@ impl Readability { Node::new("DIV", None, &document).expect("can't create new node"); for mut child in root.get_child_elements().drain(..) 
{ + if child.is_null() { + continue; + } + child.unlink(); new_top_candidate.add_child(&mut child).unwrap(); } @@ -510,6 +522,10 @@ impl Readability { if let Some(mut siblings) = siblings { for mut sibling in siblings.drain(..) { + if sibling.is_null() { + continue; + } + let mut append = false; let score = Self::get_content_score(&sibling).unwrap_or(0.0); @@ -614,6 +630,10 @@ impl Readability { })?; for mut child in article_content.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); div.add_child(&mut child).map_err(|error| { log::error!("{error}"); @@ -657,6 +677,10 @@ impl Readability { // But first check if we actually have something if let Some((best_attempt, _len, _document)) = attempts.pop() { for mut child in best_attempt.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); root.add_child(&mut child).map_err(|error| { log::error!("{error}"); @@ -674,6 +698,10 @@ impl Readability { .map_err(|()| FullTextParserError::Readability)?; } else { for mut child in article_content.get_child_nodes() { + if child.is_null() { + continue; + } + child.unlink(); root.add_child(&mut child).map_err(|error| { log::error!("{error}"); diff --git a/article_scraper/src/image_object.rs b/article_scraper/src/image_object.rs index d6e7af0..e665414 100644 --- a/article_scraper/src/image_object.rs +++ b/article_scraper/src/image_object.rs @@ -69,6 +69,10 @@ impl ImageObject { } pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { + if node.is_null() { + return Err(FullTextParserError::Xml); + } + let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; if parent.get_name().to_uppercase() == "A" { diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index dcbbad9..874e696 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -221,6 +221,10 @@ impl Util { let node_vec_clone = node_vec.clone(); for mut node in node_vec { + if node.is_null() { + continue; + } + let 
tag_name = node.get_name(); if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str()) && node @@ -271,6 +275,10 @@ impl Util { let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); let node_vec = Util::evaluate_xpath(context, query, false)?; for mut node in node_vec { + if node.is_null() { + continue; + } + node.unlink(); } Ok(()) @@ -318,6 +326,10 @@ impl Util { } pub fn remove_and_next(node: &mut Node) -> Option { + if node.is_null() { + return None; + } + let next_node = Self::next_node(node, true); node.unlink(); next_node @@ -641,6 +653,10 @@ impl Util { nodes.append(&mut Util::get_elements_by_tag_name(root, "h2")); for mut node in nodes.into_iter().rev() { + if node.is_null() { + continue; + } + if Util::get_class_weight(&node) < 0 { log::debug!( "Removing header with low class weight: {} {}", @@ -675,6 +691,10 @@ impl Util { let nodes = Util::get_elements_by_tag_name(root, tag); for mut node in nodes.into_iter().rev() { + if node.is_null() { + continue; + } + if Self::should_remove(&node, tag) { node.unlink(); } @@ -972,6 +992,10 @@ impl Util { // or non-whitespace. This leaves behind the first
in the chain // (which will be replaced with a

later). while let Some(mut n) = next { + if n.is_null() { + break; + } + let is_text_whitespace = n .get_type() .map(|t| t == NodeType::TextNode) @@ -1012,6 +1036,10 @@ impl Util { next = p.get_next_sibling(); while let Some(mut next_node) = next { + if next_node.is_null() { + break; + } + // If we've hit another

, we're done adding children to this

. if next_node.get_name().to_uppercase() == "BR" { if let Some(next_elem) = next_node.get_next_element_sibling() { @@ -1039,6 +1067,11 @@ impl Util { } while let Some(mut last_child) = p.get_last_child() { + if last_child.is_null() { + // `continue` would re-fetch the same null child and never terminate; stop trimming. + break; + } + let is_text_node = last_child .get_type() .map(|t| t == NodeType::TextNode) diff --git a/article_scraper/src/video_object.rs b/article_scraper/src/video_object.rs index fee9c07..55023a2 100644 --- a/article_scraper/src/video_object.rs +++ b/article_scraper/src/video_object.rs @@ -87,6 +87,10 @@ impl VideoObject { } pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> { + if node.is_null() { + return Err(FullTextParserError::Xml); + } + let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?; node.unlink();