mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
remove empty nodes
This commit is contained in:
parent
5621a0ea54
commit
62c0968619
5 changed files with 103 additions and 30 deletions
|
@ -137,6 +137,12 @@ pub static DIV_TO_P_ELEMS: Lazy<HashSet<&str>> = Lazy::new(|| {
|
|||
])
|
||||
});
|
||||
|
||||
/// Upper-cased tag names of elements that are allowed to have no content.
///
/// `FullTextParser::remove_empty_nodes` skips nodes whose tag is in this set, and
/// `Util::is_element_without_children` refuses to treat an element as empty while
/// any of its descendants carries one of these tags.
///
/// NOTE(review): this looks like a subset of the HTML void elements (`INPUT` and
/// `WBR` are not listed) — confirm whether that omission is intentional.
pub static VALID_EMPTY_TAGS: Lazy<HashSet<&str>> = Lazy::new(|| {
    let tags = [
        "AREA", "BASE", "BR", "COL", "EMBED", "HR", "IMG", "LINK", "META", "SOURCE", "TRACK",
    ];
    tags.iter().copied().collect()
});
|
||||
|
||||
pub static ALTER_TO_DIV_EXCEPTIONS: Lazy<HashSet<&str>> =
|
||||
Lazy::new(|| HashSet::from(["DIV", "ARTICLE", "SECTION", "P"]));
|
||||
|
||||
|
|
|
@ -1113,6 +1113,7 @@ impl FullTextParser {
|
|||
Self::clean_attributes(node)?;
|
||||
Self::remove_single_cell_tables(node);
|
||||
Self::remove_extra_p_and_div(node);
|
||||
Self::remove_empty_nodes(node);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -1243,34 +1244,40 @@ impl FullTextParser {
|
|||
|
||||
while let Some(mut node) = node_iter {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
if tag_name != "ARTICLE"
|
||||
&& node.get_parent().is_some()
|
||||
&& (tag_name == "DIV" || tag_name == "SECTION")
|
||||
{
|
||||
if Util::is_element_without_content(&node) {
|
||||
node_iter = Util::remove_and_next(&mut node);
|
||||
continue;
|
||||
} else if Util::has_single_tag_inside_element(&node, "DIV")
|
||||
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
||||
{
|
||||
if let Some(mut parent) = node.get_parent() {
|
||||
if let Some(mut child) = node.get_child_elements().into_iter().next() {
|
||||
for (k, v) in node.get_attributes().into_iter() {
|
||||
child.set_attribute(&k, &v).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
parent
|
||||
.replace_child_node(child, node.clone())
|
||||
.map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node_iter = Util::next_node(&parent, false);
|
||||
continue;
|
||||
if tag_name == "ARTICLE" || node.get_parent().is_none() {
|
||||
node_iter = Util::next_node(&node, false);
|
||||
continue;
|
||||
}
|
||||
|
||||
if tag_name != "DIV" && tag_name != "SECTION" {
|
||||
node_iter = Util::next_node(&node, false);
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::is_element_without_content(&node) {
|
||||
node_iter = Util::remove_and_next(&mut node);
|
||||
continue;
|
||||
} else if Util::has_single_tag_inside_element(&node, "DIV")
|
||||
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
||||
{
|
||||
if let Some(mut parent) = node.get_parent() {
|
||||
if let Some(mut child) = node.get_child_elements().into_iter().next() {
|
||||
for (k, v) in node.get_attributes().into_iter() {
|
||||
child.set_attribute(&k, &v).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
parent
|
||||
.replace_child_node(child, node.clone())
|
||||
.map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node_iter = Util::next_node(&parent, false);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1279,4 +1286,24 @@ impl FullTextParser {
|
|||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_empty_nodes(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
|
||||
if constants::VALID_EMPTY_TAGS.contains(tag_name.as_str()) {
|
||||
node_iter = Util::next_node(&node, false);
|
||||
continue;
|
||||
}
|
||||
|
||||
if Util::is_element_without_children(&node) {
|
||||
node_iter = Util::remove_and_next(&mut node);
|
||||
continue;
|
||||
}
|
||||
|
||||
node_iter = Util::next_node(&node, false);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -45,6 +45,8 @@ async fn run_test(name: &str) {
|
|||
article.root_node = Some(root);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
//std::fs::write(format!("./resources/tests/readability/{name}/expected.html"), &html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
"./resources/tests/readability/{name}/expected.html"
|
||||
))
|
||||
|
|
|
@ -68,6 +68,17 @@ async fn youtube() {
|
|||
.unwrap_or(false));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn hardwareluxx() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.hardwareluxx.de/index.php/news/software/spiele/60882-half-life-mit-ray-tracing-mod-gibt-dem-25-jahr-alten-shooter-neuen-glanz.html").unwrap();
|
||||
|
||||
let grabber = FullTextParser::new(None).await;
|
||||
let article = grabber.parse(&url, &Client::new()).await.unwrap();
|
||||
article.save_html(&out_path).unwrap();
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn encoding_windows_1252() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
|
|
@ -439,6 +439,33 @@ impl Util {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn is_element_without_children(node: &Node) -> bool {
|
||||
if let Some(node_type) = node.get_type() {
|
||||
let len = node.get_child_nodes().len();
|
||||
node_type == NodeType::ElementNode
|
||||
&& (len == 0 || node.get_content().trim().is_empty())
|
||||
&& Self::get_elements_by_tag_names(node, &constants::VALID_EMPTY_TAGS).is_empty()
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/// Collects every descendant element of `node` whose upper-cased tag name is
/// contained in `tags`.
///
/// `node` itself is never part of the result. Matches are returned in
/// depth-first pre-order: an element comes before its own descendants, and
/// siblings keep the order reported by `get_child_elements`.
/// `tags` is expected to hold upper-case names, because each element name is
/// upper-cased before the lookup.
pub fn get_elements_by_tag_names(node: &Node, tags: &HashSet<&str>) -> Vec<Node> {
    let mut found = Vec::new();

    // Explicit stack; children are pushed in reverse so that popping yields
    // them in their original order.
    let mut pending: Vec<Node> = node.get_child_elements();
    pending.reverse();

    while let Some(element) = pending.pop() {
        let is_match = tags.contains(element.get_name().to_uppercase().as_str());
        pending.extend(element.get_child_elements().into_iter().rev());

        if is_match {
            found.push(element);
        }
    }

    found
}
|
||||
|
||||
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
|
||||
let tag = tag.to_uppercase();
|
||||
let all_tags = tag == "*";
|
||||
|
@ -629,17 +656,17 @@ impl Util {
|
|||
let link_density = Self::get_link_density(node);
|
||||
let content = Self::get_inner_text(node, true);
|
||||
let content_length = content.len();
|
||||
let has_figure_ancestor =
|
||||
Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>);
|
||||
|
||||
let have_to_remove = (img > 1
|
||||
&& (p as f64 / img as f64) < 0.5
|
||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||
let have_to_remove = (img > 1 && (p as f64 / img as f64) < 0.5 && !has_figure_ancestor)
|
||||
|| (!is_list && li > p as i64)
|
||||
|| (input as f64 > f64::floor(p as f64 / 3.0))
|
||||
|| (!is_list
|
||||
&& heading_density < 0.9
|
||||
&& content_length < 25
|
||||
&& (img == 0 || img > 2)
|
||||
&& !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
|
||||
&& !has_figure_ancestor)
|
||||
|| (!is_list && weight < 25 && link_density > 0.2)
|
||||
|| (weight >= 25 && link_density > 0.5)
|
||||
|| ((embed_count == 1 && content_length < 75) || embed_count > 1);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue