mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
do some null checks before unlinking nodes
This commit is contained in:
parent
ed8a83708b
commit
b13673ce3b
5 changed files with 92 additions and 0 deletions
|
@ -678,6 +678,10 @@ impl FullTextParser {
|
|||
let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let video_wrapper = node
|
||||
.get_parent()
|
||||
.and_then(|mut parent| parent.new_child(None, "div").ok());
|
||||
|
@ -732,6 +736,10 @@ impl FullTextParser {
|
|||
) -> Result<(), FullTextParserError> {
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if let Some(url) = node.get_attribute(attribute) {
|
||||
let trimmed_url = url.trim();
|
||||
|
||||
|
@ -845,6 +853,10 @@ impl FullTextParser {
|
|||
|
||||
if let Ok(h2_nodes) = Util::evaluate_xpath(context, "//h2", false) {
|
||||
for mut h2_node in h2_nodes {
|
||||
if h2_node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::header_duplicates_title(&h2_node, title) {
|
||||
h2_node.unlink();
|
||||
}
|
||||
|
@ -969,6 +981,10 @@ impl FullTextParser {
|
|||
// This is done to prevent a placeholder img is replaced by img from noscript in next step.
|
||||
let img_nodes = Util::evaluate_xpath(ctx, "//img", false)?;
|
||||
for mut img_node in img_nodes {
|
||||
if img_node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let attrs = img_node.get_attributes();
|
||||
|
||||
let keep = attrs.iter().any(|(name, value)| {
|
||||
|
@ -986,6 +1002,10 @@ impl FullTextParser {
|
|||
// Next find noscript and try to extract its image
|
||||
let noscript_nodes = Util::evaluate_xpath(ctx, "//noscript", false)?;
|
||||
for mut noscript_node in noscript_nodes {
|
||||
if noscript_node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Parse content of noscript and make sure it only contains image
|
||||
if !Util::is_single_image(&noscript_node) {
|
||||
continue;
|
||||
|
@ -1091,6 +1111,10 @@ impl FullTextParser {
|
|||
{
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if node.get_property("style").is_some() && node.remove_property("style").is_err() {
|
||||
return Err(FullTextParserError::Xml);
|
||||
}
|
||||
|
|
|
@ -179,6 +179,10 @@ impl Readability {
|
|||
// Put phrasing content into paragraphs.
|
||||
let mut p: Option<Node> = None;
|
||||
for mut child in node_ref.get_child_nodes().into_iter() {
|
||||
if child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::is_phrasing_content(&child) {
|
||||
if let Some(p) = p.as_mut() {
|
||||
child.unlink();
|
||||
|
@ -205,6 +209,10 @@ impl Readability {
|
|||
} else if p.is_some() {
|
||||
if let Some(p) = p.as_mut() {
|
||||
for mut r_node in p.get_child_nodes().into_iter().rev() {
|
||||
if r_node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::is_whitespace(&r_node) {
|
||||
r_node.unlink();
|
||||
continue;
|
||||
|
@ -366,6 +374,10 @@ impl Readability {
|
|||
Node::new("DIV", None, &document).expect("can't create new node");
|
||||
|
||||
for mut child in root.get_child_elements().drain(..) {
|
||||
if child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
child.unlink();
|
||||
new_top_candidate.add_child(&mut child).unwrap();
|
||||
}
|
||||
|
@ -510,6 +522,10 @@ impl Readability {
|
|||
|
||||
if let Some(mut siblings) = siblings {
|
||||
for mut sibling in siblings.drain(..) {
|
||||
if sibling.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
|
||||
|
@ -614,6 +630,10 @@ impl Readability {
|
|||
})?;
|
||||
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
if child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
child.unlink();
|
||||
div.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
@ -657,6 +677,10 @@ impl Readability {
|
|||
// But first check if we actually have something
|
||||
if let Some((best_attempt, _len, _document)) = attempts.pop() {
|
||||
for mut child in best_attempt.get_child_nodes() {
|
||||
if child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
child.unlink();
|
||||
root.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
@ -674,6 +698,10 @@ impl Readability {
|
|||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
} else {
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
if child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
child.unlink();
|
||||
root.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
|
|
@ -69,6 +69,10 @@ impl ImageObject {
|
|||
}
|
||||
|
||||
pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> {
|
||||
if node.is_null() {
|
||||
return Err(FullTextParserError::Xml);
|
||||
}
|
||||
|
||||
let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?;
|
||||
|
||||
if parent.get_name().to_uppercase() == "A" {
|
||||
|
|
|
@ -221,6 +221,10 @@ impl Util {
|
|||
let node_vec_clone = node_vec.clone();
|
||||
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let tag_name = node.get_name();
|
||||
if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str())
|
||||
&& node
|
||||
|
@ -271,6 +275,10 @@ impl Util {
|
|||
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
||||
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
node.unlink();
|
||||
}
|
||||
Ok(())
|
||||
|
@ -318,6 +326,10 @@ impl Util {
|
|||
}
|
||||
|
||||
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
||||
if node.is_null() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let next_node = Self::next_node(node, true);
|
||||
node.unlink();
|
||||
next_node
|
||||
|
@ -641,6 +653,10 @@ impl Util {
|
|||
nodes.append(&mut Util::get_elements_by_tag_name(root, "h2"));
|
||||
|
||||
for mut node in nodes.into_iter().rev() {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::get_class_weight(&node) < 0 {
|
||||
log::debug!(
|
||||
"Removing header with low class weight: {} {}",
|
||||
|
@ -675,6 +691,10 @@ impl Util {
|
|||
let nodes = Util::get_elements_by_tag_name(root, tag);
|
||||
|
||||
for mut node in nodes.into_iter().rev() {
|
||||
if node.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
if Self::should_remove(&node, tag) {
|
||||
node.unlink();
|
||||
}
|
||||
|
@ -972,6 +992,10 @@ impl Util {
|
|||
// or non-whitespace. This leaves behind the first <br> in the chain
|
||||
// (which will be replaced with a <p> later).
|
||||
while let Some(mut n) = next {
|
||||
if n.is_null() {
|
||||
break;
|
||||
}
|
||||
|
||||
let is_text_whitespace = n
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
|
@ -1012,6 +1036,10 @@ impl Util {
|
|||
next = p.get_next_sibling();
|
||||
|
||||
while let Some(mut next_node) = next {
|
||||
if next_node.is_null() {
|
||||
break;
|
||||
}
|
||||
|
||||
// If we've hit another <br><br>, we're done adding children to this <p>.
|
||||
if next_node.get_name().to_uppercase() == "BR" {
|
||||
if let Some(next_elem) = next_node.get_next_element_sibling() {
|
||||
|
@ -1039,6 +1067,10 @@ impl Util {
|
|||
}
|
||||
|
||||
while let Some(mut last_child) = p.get_last_child() {
|
||||
if last_child.is_null() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let is_text_node = last_child
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
|
|
|
@ -87,6 +87,10 @@ impl VideoObject {
|
|||
}
|
||||
|
||||
pub fn replace(&self, node: &mut Node) -> Result<(), FullTextParserError> {
|
||||
if node.is_null() {
|
||||
return Err(FullTextParserError::Xml);
|
||||
}
|
||||
|
||||
let mut parent = node.get_parent().ok_or(FullTextParserError::Xml)?;
|
||||
node.unlink();
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue