Patch summary (derived from the hunks below):
  * constants.rs: add the VALID_EMPTY_TAGS set (AREA, BASE, BR, COL, EMBED, HR, IMG,
    LINK, META, SOURCE, TRACK).
  * full_text_parser/mod.rs: flatten remove_extra_p_and_div into guard clauses, add
    remove_empty_nodes and call it after remove_extra_p_and_div. remove_empty_nodes
    skips tags listed in VALID_EMPTY_TAGS and removes every node for which
    Util::is_element_without_children returns true.
  * util.rs: add is_element_without_children and get_elements_by_tag_names; evaluate
    the "figure" ancestor check once (has_figure_ancestor) instead of twice.
  * tests: add a `hardwareluxx` test and a commented-out line that would rewrite
    expected.html.

Review notes:
  * NOTE(review): line breaks, indentation of context lines, blank context lines and
    the generic arguments (Lazy<HashSet<&str>>, Vec<Node>, None::<fn(&Node) -> bool>)
    were reconstructed from a whitespace-collapsed copy of this patch; confirm them
    against the repository before applying.
  * NOTE(review): the `hardwareluxx` test parses a remote URL and presumably needs
    network access; confirm this is acceptable for the test suite.
  * NOTE(review): the added `//std::fs::write(...)` line is commented-out code;
    consider dropping it before merging.

diff --git a/article_scraper/src/constants.rs b/article_scraper/src/constants.rs
index 549e78a..b4c1c94 100644
--- a/article_scraper/src/constants.rs
+++ b/article_scraper/src/constants.rs
@@ -137,6 +137,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&str>> = Lazy::new(|| {
     ])
 });
 
+pub static VALID_EMPTY_TAGS: Lazy<HashSet<&str>> = Lazy::new(|| {
+    HashSet::from([
+        "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK",
+    ])
+});
+
 pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
     Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
 
diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index b6679fa..5521cb4 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -1113,6 +1113,7 @@ impl FullTextParser {
         Self::clean_attributes(node)?;
         Self::remove_single_cell_tables(node);
         Self::remove_extra_p_and_div(node);
+        Self::remove_empty_nodes(node);
 
         Ok(())
     }
@@ -1243,34 +1244,40 @@ impl FullTextParser {
 
         while let Some(mut node) = node_iter {
             let tag_name = node.get_name().to_uppercase();
-            if tag_name != "ARTICLE"
-                && node.get_parent().is_some()
-                && (tag_name == "DIV" || tag_name == "SECTION")
-            {
-                if Util::is_element_without_content(&node) {
-                    node_iter = Util::remove_and_next(&mut node);
-                    continue;
-                } else if Util::has_single_tag_inside_element(&node, "DIV")
-                    || Util::has_single_tag_inside_element(&node, "SECTION")
-                {
-                    if let Some(mut parent) = node.get_parent() {
-                        if let Some(mut child) = node.get_child_elements().into_iter().next() {
-                            for (k, v) in node.get_attributes().into_iter() {
-                                child.set_attribute(&k, &v).map_err(|e| {
-                                    log::error!("{e}");
-                                    FullTextParserError::Xml
-                                })?;
-                            }
-                            parent
-                                .replace_child_node(child, node.clone())
-                                .map_err(|e| {
-                                    log::error!("{e}");
-                                    FullTextParserError::Xml
-                                })?;
 
-                            node_iter = Util::next_node(&parent, false);
-                            continue;
+            if tag_name == "ARTICLE" || node.get_parent().is_none() {
+                node_iter = Util::next_node(&node, false);
+                continue;
+            }
+
+            if tag_name != "DIV" && tag_name != "SECTION" {
+                node_iter = Util::next_node(&node, false);
+                continue;
+            }
+
+            if Util::is_element_without_content(&node) {
+                node_iter = Util::remove_and_next(&mut node);
+                continue;
+            } else if Util::has_single_tag_inside_element(&node, "DIV")
+                || Util::has_single_tag_inside_element(&node, "SECTION")
+            {
+                if let Some(mut parent) = node.get_parent() {
+                    if let Some(mut child) = node.get_child_elements().into_iter().next() {
+                        for (k, v) in node.get_attributes().into_iter() {
+                            child.set_attribute(&k, &v).map_err(|e| {
+                                log::error!("{e}");
+                                FullTextParserError::Xml
+                            })?;
                         }
+                        parent
+                            .replace_child_node(child, node.clone())
+                            .map_err(|e| {
+                                log::error!("{e}");
+                                FullTextParserError::Xml
+                            })?;
+
+                        node_iter = Util::next_node(&parent, false);
+                        continue;
                     }
                 }
             }
@@ -1279,4 +1286,24 @@ impl FullTextParser {
         }
         Ok(())
     }
+
+    fn remove_empty_nodes(root: &mut Node) {
+        let mut node_iter = Some(root.clone());
+
+        while let Some(mut node) = node_iter {
+            let tag_name = node.get_name().to_uppercase();
+
+            if constants::VALID_EMPTY_TAGS.contains(tag_name.as_str()) {
+                node_iter = Util::next_node(&node, false);
+                continue;
+            }
+
+            if Util::is_element_without_children(&node) {
+                node_iter = Util::remove_and_next(&mut node);
+                continue;
+            }
+
+            node_iter = Util::next_node(&node, false);
+        }
+    }
 }
diff --git a/article_scraper/src/full_text_parser/readability/tests.rs b/article_scraper/src/full_text_parser/readability/tests.rs
index f2a58eb..7d299c3 100644
--- a/article_scraper/src/full_text_parser/readability/tests.rs
+++ b/article_scraper/src/full_text_parser/readability/tests.rs
@@ -45,6 +45,8 @@ async fn run_test(name: &str) {
     article.root_node = Some(root);
     let html = article.get_content().unwrap();
 
+    //std::fs::write(format!("./resources/tests/readability/{name}/expected.html"), &html).unwrap();
+
     let expected = std::fs::read_to_string(format!(
         "./resources/tests/readability/{name}/expected.html"
     ))
diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs
index 7fd5843..47faadf 100644
--- a/article_scraper/src/full_text_parser/tests.rs
+++ b/article_scraper/src/full_text_parser/tests.rs
@@ -68,6 +68,17 @@ async fn youtube() {
         .unwrap_or(false));
 }
 
+#[tokio::test]
+async fn hardwareluxx() {
+    let _ = env_logger::builder().is_test(true).try_init();
+    let out_path = PathBuf::from(r"./test_output");
+    let url = url::Url::parse("https://www.hardwareluxx.de/index.php/news/software/spiele/60882-half-life-mit-ray-tracing-mod-gibt-dem-25-jahr-alten-shooter-neuen-glanz.html").unwrap();
+
+    let grabber = FullTextParser::new(None).await;
+    let article = grabber.parse(&url, &Client::new()).await.unwrap();
+    article.save_html(&out_path).unwrap();
+}
+
 #[tokio::test]
 async fn encoding_windows_1252() {
     let _ = env_logger::builder().is_test(true).try_init();
diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs
index 74797b2..abf06f4 100644
--- a/article_scraper/src/util.rs
+++ b/article_scraper/src/util.rs
@@ -439,6 +439,33 @@ impl Util {
         }
     }
 
+    pub fn is_element_without_children(node: &Node) -> bool {
+        if let Some(node_type) = node.get_type() {
+            let len = node.get_child_nodes().len();
+            node_type == NodeType::ElementNode
+                && (len == 0 || node.get_content().trim().is_empty())
+                && Self::get_elements_by_tag_names(node, &constants::VALID_EMPTY_TAGS).is_empty()
+        } else {
+            false
+        }
+    }
+
+    pub fn get_elements_by_tag_names(node: &Node, tags: &HashSet<&str>) -> Vec<Node> {
+        let mut vec = Vec::new();
+
+        fn get_elems(node: &Node, tags: &HashSet<&str>, vec: &mut Vec<Node>) {
+            for child in node.get_child_elements() {
+                if tags.contains(child.get_name().to_uppercase().as_str()) {
+                    vec.push(child.clone());
+                }
+                get_elems(&child, tags, vec);
+            }
+        }
+
+        get_elems(node, tags, &mut vec);
+        vec
+    }
+
     pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
         let tag = tag.to_uppercase();
         let all_tags = tag == "*";
@@ -629,17 +656,17 @@ impl Util {
         let link_density = Self::get_link_density(node);
         let content = Self::get_inner_text(node, true);
         let content_length = content.len();
+        let has_figure_ancestor =
+            Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>);
 
-        let have_to_remove = (img > 1
-            && (p as f64 / img as f64) < 0.5
-            && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
+        let have_to_remove = (img > 1 && (p as f64 / img as f64) < 0.5 && !has_figure_ancestor)
             || (!is_list && li > p as i64)
             || (input as f64 > f64::floor(p as f64 / 3.0))
             || (!is_list
                 && heading_density < 0.9
                 && content_length < 25
                 && (img == 0 || img > 2)
-                && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
+                && !has_figure_ancestor)
             || (!is_list && weight < 25 && link_density > 0.2)
             || (weight >= 25 && link_density > 0.5)
             || ((embed_count == 1 && content_length < 75) || embed_count > 1);