mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
remove empty nodes
This commit is contained in:
parent
5621a0ea54
commit
62c0968619
5 changed files with 103 additions and 30 deletions
|
@ -137,6 +137,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&str>> = Lazy::new(|| {
|
||||||
])
|
])
|
||||||
});
|
});
|
||||||
|
|
||||||
|
/// Upper-case tag names of elements that are allowed to stay in the document
/// even though they carry no content of their own.
///
/// Lookups must use the upper-cased tag name, since the set only stores
/// upper-case spellings.
pub static VALID_EMPTY_TAGS: Lazy<HashSet<&str>> = Lazy::new(|| {
    const TAGS: [&str; 11] = [
        "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK",
    ];

    TAGS.iter().copied().collect()
});
|
||||||
|
|
||||||
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
|
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
|
||||||
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
||||||
|
|
||||||
|
|
|
@ -1113,6 +1113,7 @@ impl FullTextParser {
|
||||||
Self::clean_attributes(node)?;
|
Self::clean_attributes(node)?;
|
||||||
Self::remove_single_cell_tables(node);
|
Self::remove_single_cell_tables(node);
|
||||||
Self::remove_extra_p_and_div(node);
|
Self::remove_extra_p_and_div(node);
|
||||||
|
Self::remove_empty_nodes(node);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -1243,34 +1244,40 @@ impl FullTextParser {
|
||||||
|
|
||||||
while let Some(mut node) = node_iter {
|
while let Some(mut node) = node_iter {
|
||||||
let tag_name = node.get_name().to_uppercase();
|
let tag_name = node.get_name().to_uppercase();
|
||||||
if tag_name != "ARTICLE"
|
|
||||||
&& node.get_parent().is_some()
|
|
||||||
&& (tag_name == "DIV" || tag_name == "SECTION")
|
|
||||||
{
|
|
||||||
if Util::is_element_without_content(&node) {
|
|
||||||
node_iter = Util::remove_and_next(&mut node);
|
|
||||||
continue;
|
|
||||||
} else if Util::has_single_tag_inside_element(&node, "DIV")
|
|
||||||
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
|
||||||
{
|
|
||||||
if let Some(mut parent) = node.get_parent() {
|
|
||||||
if let Some(mut child) = node.get_child_elements().into_iter().next() {
|
|
||||||
for (k, v) in node.get_attributes().into_iter() {
|
|
||||||
child.set_attribute(&k, &v).map_err(|e| {
|
|
||||||
log::error!("{e}");
|
|
||||||
FullTextParserError::Xml
|
|
||||||
})?;
|
|
||||||
}
|
|
||||||
parent
|
|
||||||
.replace_child_node(child, node.clone())
|
|
||||||
.map_err(|e| {
|
|
||||||
log::error!("{e}");
|
|
||||||
FullTextParserError::Xml
|
|
||||||
})?;
|
|
||||||
|
|
||||||
node_iter = Util::next_node(&parent, false);
|
if tag_name == "ARTICLE" || node.get_parent().is_none() {
|
||||||
continue;
|
node_iter = Util::next_node(&node, false);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if tag_name != "DIV" && tag_name != "SECTION" {
|
||||||
|
node_iter = Util::next_node(&node, false);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if Util::is_element_without_content(&node) {
|
||||||
|
node_iter = Util::remove_and_next(&mut node);
|
||||||
|
continue;
|
||||||
|
} else if Util::has_single_tag_inside_element(&node, "DIV")
|
||||||
|
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
||||||
|
{
|
||||||
|
if let Some(mut parent) = node.get_parent() {
|
||||||
|
if let Some(mut child) = node.get_child_elements().into_iter().next() {
|
||||||
|
for (k, v) in node.get_attributes().into_iter() {
|
||||||
|
child.set_attribute(&k, &v).map_err(|e| {
|
||||||
|
log::error!("{e}");
|
||||||
|
FullTextParserError::Xml
|
||||||
|
})?;
|
||||||
}
|
}
|
||||||
|
parent
|
||||||
|
.replace_child_node(child, node.clone())
|
||||||
|
.map_err(|e| {
|
||||||
|
log::error!("{e}");
|
||||||
|
FullTextParserError::Xml
|
||||||
|
})?;
|
||||||
|
|
||||||
|
node_iter = Util::next_node(&parent, false);
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1279,4 +1286,24 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn remove_empty_nodes(root: &mut Node) {
|
||||||
|
let mut node_iter = Some(root.clone());
|
||||||
|
|
||||||
|
while let Some(mut node) = node_iter {
|
||||||
|
let tag_name = node.get_name().to_uppercase();
|
||||||
|
|
||||||
|
if constants::VALID_EMPTY_TAGS.contains(tag_name.as_str()) {
|
||||||
|
node_iter = Util::next_node(&node, false);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if Util::is_element_without_children(&node) {
|
||||||
|
node_iter = Util::remove_and_next(&mut node);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
node_iter = Util::next_node(&node, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,6 +45,8 @@ async fn run_test(name: &str) {
|
||||||
article.root_node = Some(root);
|
article.root_node = Some(root);
|
||||||
let html = article.get_content().unwrap();
|
let html = article.get_content().unwrap();
|
||||||
|
|
||||||
|
//std::fs::write(format!("./resources/tests/readability/{name}/expected.html"), &html).unwrap();
|
||||||
|
|
||||||
let expected = std::fs::read_to_string(format!(
|
let expected = std::fs::read_to_string(format!(
|
||||||
"./resources/tests/readability/{name}/expected.html"
|
"./resources/tests/readability/{name}/expected.html"
|
||||||
))
|
))
|
||||||
|
|
|
@ -68,6 +68,17 @@ async fn youtube() {
|
||||||
.unwrap_or(false));
|
.unwrap_or(false));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn hardwareluxx() {
|
||||||
|
let _ = env_logger::builder().is_test(true).try_init();
|
||||||
|
let out_path = PathBuf::from(r"./test_output");
|
||||||
|
let url = url::Url::parse("https://www.hardwareluxx.de/index.php/news/software/spiele/60882-half-life-mit-ray-tracing-mod-gibt-dem-25-jahr-alten-shooter-neuen-glanz.html").unwrap();
|
||||||
|
|
||||||
|
let grabber = FullTextParser::new(None).await;
|
||||||
|
let article = grabber.parse(&url, &Client::new()).await.unwrap();
|
||||||
|
article.save_html(&out_path).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn encoding_windows_1252() {
|
async fn encoding_windows_1252() {
|
||||||
let _ = env_logger::builder().is_test(true).try_init();
|
let _ = env_logger::builder().is_test(true).try_init();
|
||||||
|
|
|
@ -439,6 +439,33 @@ impl Util {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn is_element_without_children(node: &Node) -> bool {
|
||||||
|
if let Some(node_type) = node.get_type() {
|
||||||
|
let len = node.get_child_nodes().len();
|
||||||
|
node_type == NodeType::ElementNode
|
||||||
|
&& (len == 0 || node.get_content().trim().is_empty())
|
||||||
|
&& Self::get_elements_by_tag_names(node, &constants::VALID_EMPTY_TAGS).is_empty()
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Collects every descendant element of `node` whose upper-cased tag name is
/// contained in `tags`.
///
/// `node` itself is never part of the result. Matches are returned in
/// depth-first pre-order: an element is pushed before any of its own
/// descendants. `tags` is expected to hold upper-case names, because each
/// element name is upper-cased before the lookup.
pub fn get_elements_by_tag_names(node: &Node, tags: &HashSet<&str>) -> Vec<Node> {
    // Recursive worker that appends matches below `parent` to `found`.
    fn collect_matches(parent: &Node, wanted: &HashSet<&str>, found: &mut Vec<Node>) {
        for element in parent.get_child_elements() {
            let upper_name = element.get_name().to_uppercase();
            if wanted.contains(upper_name.as_str()) {
                found.push(element.clone());
            }

            // Descend regardless of whether this element matched.
            collect_matches(&element, wanted, found);
        }
    }

    let mut matches = Vec::new();
    collect_matches(node, tags, &mut matches);
    matches
}
|
||||||
|
|
||||||
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
|
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
|
||||||
let tag = tag.to_uppercase();
|
let tag = tag.to_uppercase();
|
||||||
let all_tags = tag == "*";
|
let all_tags = tag == "*";
|
||||||
|
@ -629,17 +656,17 @@ impl Util {
|
||||||
let link_density = Self::get_link_density(node);
|
let link_density = Self::get_link_density(node);
|
||||||
let content = Self::get_inner_text(node, true);
|
let content = Self::get_inner_text(node, true);
|
||||||
let content_length = content.len();
|
let content_length = content.len();
|
||||||
|
let has_figure_ancestor =
|
||||||
|
Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>);
|
||||||
|
|
||||||
let have_to_remove = (img > 1
|
let have_to_remove = (img > 1 && (p as f64 / img as f64) < 0.5 && !has_figure_ancestor)
|
||||||
&& (p as f64 / img as f64) < 0.5
|
|
||||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
|
||||||
|| (!is_list && li > p as i64)
|
|| (!is_list && li > p as i64)
|
||||||
|| (input as f64 > f64::floor(p as f64 / 3.0))
|
|| (input as f64 > f64::floor(p as f64 / 3.0))
|
||||||
|| (!is_list
|
|| (!is_list
|
||||||
&& heading_density < 0.9
|
&& heading_density < 0.9
|
||||||
&& content_length < 25
|
&& content_length < 25
|
||||||
&& (img == 0 || img > 2)
|
&& (img == 0 || img > 2)
|
||||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
&& !has_figure_ancestor)
|
||||||
|| (!is_list && weight < 25 && link_density > 0.2)
|
|| (!is_list && weight < 25 && link_density > 0.2)
|
||||||
|| (weight >= 25 && link_density > 0.5)
|
|| (weight >= 25 && link_density > 0.5)
|
||||||
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
|
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue