1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

remove empty nodes

This commit is contained in:
Jan Lukas Gernert 2023-04-26 19:54:34 +02:00
parent 5621a0ea54
commit 62c0968619
5 changed files with 103 additions and 30 deletions

View file

@ -137,6 +137,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&str>> = Lazy::new(|| {
])
});
/// Upper-case tag names of elements that are allowed to have no content.
///
/// `FullTextParser::remove_empty_nodes` never removes a node whose tag is in
/// this set, and `Util::is_element_without_children` does not treat an element
/// as empty when any of its descendants carries one of these tags.
/// Lookups are done against the upper-cased node name.
pub static VALID_EMPTY_TAGS: Lazy<HashSet<&str>> = Lazy::new(|| {
    [
        "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK",
    ]
    .iter()
    .copied()
    .collect()
});
/// Lazily built set of the upper-case tag names `DIV`, `ARTICLE`, `SECTION` and `P`.
///
/// NOTE(review): presumably the tags that are exempt from being rewritten to
/// `DIV` during content extraction — confirm against the call sites, which are
/// not part of this view.
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> = Lazy::new(|| {
    ["DIV", "ARTICLE", "SECTION", "P"]
        .iter()
        .copied()
        .collect()
});

View file

@ -1113,6 +1113,7 @@ impl FullTextParser {
Self::clean_attributes(node)?;
Self::remove_single_cell_tables(node);
Self::remove_extra_p_and_div(node);
Self::remove_empty_nodes(node);
Ok(())
}
@ -1243,10 +1244,17 @@ impl FullTextParser {
while let Some(mut node) = node_iter {
let tag_name = node.get_name().to_uppercase();
if tag_name != "ARTICLE"
&& node.get_parent().is_some()
&& (tag_name == "DIV" || tag_name == "SECTION")
{
if tag_name == "ARTICLE" || node.get_parent().is_none() {
node_iter = Util::next_node(&node, false);
continue;
}
if tag_name != "DIV" && tag_name != "SECTION" {
node_iter = Util::next_node(&node, false);
continue;
}
if Util::is_element_without_content(&node) {
node_iter = Util::remove_and_next(&mut node);
continue;
@ -1273,10 +1281,29 @@ impl FullTextParser {
}
}
}
}
node_iter = Util::next_node(&node, false);
}
Ok(())
}
/// Walks the tree starting at `root` and removes elements that have no content.
///
/// Nodes whose upper-cased tag name is listed in `constants::VALID_EMPTY_TAGS`
/// are always kept. Every other node for which
/// `Util::is_element_without_children` returns `true` is removed through
/// `Util::remove_and_next`, which also yields the node to continue with.
/// All remaining nodes are stepped over with `Util::next_node`.
fn remove_empty_nodes(root: &mut Node) {
    let mut cursor = Some(root.clone());

    while let Some(mut current) = cursor {
        let upper_name = current.get_name().to_uppercase();
        let may_be_empty = constants::VALID_EMPTY_TAGS.contains(upper_name.as_str());

        // The emptiness check is only evaluated for tags that are not allowed to be empty.
        cursor = if !may_be_empty && Util::is_element_without_children(&current) {
            Util::remove_and_next(&mut current)
        } else {
            Util::next_node(&current, false)
        };
    }
}
}

View file

@ -45,6 +45,8 @@ async fn run_test(name: &str) {
article.root_node = Some(root);
let html = article.get_content().unwrap();
//std::fs::write(format!("./resources/tests/readability/{name}/expected.html"), &html).unwrap();
let expected = std::fs::read_to_string(format!(
"./resources/tests/readability/{name}/expected.html"
))

View file

@ -68,6 +68,17 @@ async fn youtube() {
.unwrap_or(false));
}
/// Parses a hardwareluxx.de article and writes the resulting HTML into
/// `./test_output`.
///
/// The test makes no assertions on the content; it only fails if parsing or
/// saving panics on `unwrap`.
/// NOTE(review): `parse` is handed a URL and an HTTP client, so this
/// presumably needs live network access — confirm before running offline.
#[tokio::test]
async fn hardwareluxx() {
    let _ = env_logger::builder().is_test(true).try_init();

    let output_dir = PathBuf::from(r"./test_output");
    let article_url = url::Url::parse("https://www.hardwareluxx.de/index.php/news/software/spiele/60882-half-life-mit-ray-tracing-mod-gibt-dem-25-jahr-alten-shooter-neuen-glanz.html").unwrap();

    let parser = FullTextParser::new(None).await;
    let client = Client::new();
    let parsed = parser.parse(&article_url, &client).await.unwrap();

    parsed.save_html(&output_dir).unwrap();
}
#[tokio::test]
async fn encoding_windows_1252() {
let _ = env_logger::builder().is_test(true).try_init();

View file

@ -439,6 +439,33 @@ impl Util {
}
}
/// Returns `true` when `node` is an element that carries no content.
///
/// All of the following must hold:
/// * the node type is `NodeType::ElementNode` (nodes without a type yield `false`),
/// * the node has no child nodes, or its content is empty after trimming whitespace,
/// * none of its descendant elements has a tag listed in `constants::VALID_EMPTY_TAGS`.
pub fn is_element_without_children(node: &Node) -> bool {
    let is_element = matches!(node.get_type(), Some(kind) if kind == NodeType::ElementNode);
    if !is_element {
        return false;
    }

    let lacks_content =
        node.get_child_nodes().is_empty() || node.get_content().trim().is_empty();
    if !lacks_content {
        return false;
    }

    // An element holding e.g. an <img> or <br> has no text but is not empty.
    Self::get_elements_by_tag_names(node, &constants::VALID_EMPTY_TAGS).is_empty()
}
/// Collects every descendant element of `node` whose upper-cased tag name is
/// contained in `tags`.
///
/// `node` itself is never part of the result. Matches are returned in
/// depth-first pre-order: a matching element comes before any matching
/// descendants of its own, and siblings keep their original order.
pub fn get_elements_by_tag_names(node: &Node, tags: &HashSet<&str>) -> Vec<Node> {
    let mut found = Vec::new();

    // Explicit stack instead of recursion; children are pushed in reverse so
    // that popping visits them in their original order.
    let mut pending: Vec<Node> = node.get_child_elements().into_iter().rev().collect();

    while let Some(element) = pending.pop() {
        pending.extend(element.get_child_elements().into_iter().rev());

        if tags.contains(element.get_name().to_uppercase().as_str()) {
            found.push(element);
        }
    }

    found
}
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
let tag = tag.to_uppercase();
let all_tags = tag == "*";
@ -629,17 +656,17 @@ impl Util {
let link_density = Self::get_link_density(node);
let content = Self::get_inner_text(node, true);
let content_length = content.len();
let has_figure_ancestor =
Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>);
let have_to_remove = (img > 1
&& (p as f64 / img as f64) < 0.5
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
let have_to_remove = (img > 1 && (p as f64 / img as f64) < 0.5 && !has_figure_ancestor)
|| (!is_list && li > p as i64)
|| (input as f64 > f64::floor(p as f64 / 3.0))
|| (!is_list
&& heading_density < 0.9
&& content_length < 25
&& (img == 0 || img > 2)
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
&& !has_figure_ancestor)
|| (!is_list && weight < 25 && link_density > 0.2)
|| (weight >= 25 && link_density > 0.5)
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);