mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix cleaning of empty p/div-tags
This commit is contained in:
parent
d9c92ea42c
commit
779afd6245
2 changed files with 5 additions and 4 deletions
|
@@ -1,7 +1,6 @@
|
||||||
<article><div id="readability-page-1">
|
<article><div id="readability-page-1">
|
||||||
<h2>The U.S. has long been heralded as a land of opportunity -- a place where anyone can succeed regardless of the economic class they were born into.</h2>
|
<h2>The U.S. has long been heralded as a land of opportunity -- a place where anyone can succeed regardless of the economic class they were born into.</h2>
|
||||||
<p> But a new report released on Monday by <a href="http://web.stanford.edu/group/scspi-dev/cgi-bin/" target="_blank">Stanford University's Center on Poverty and Inequality</a> calls that into question. </p>
|
<p> But a new report released on Monday by <a href="http://web.stanford.edu/group/scspi-dev/cgi-bin/" target="_blank">Stanford University's Center on Poverty and Inequality</a> calls that into question. </p>
|
||||||
<P id="ie_column"></P>
|
|
||||||
<p> The report assessed poverty levels, income and wealth inequality, economic mobility and unemployment levels among 10 wealthy countries with social welfare programs. </p>
|
<p> The report assessed poverty levels, income and wealth inequality, economic mobility and unemployment levels among 10 wealthy countries with social welfare programs. </p>
|
||||||
<div id="smartassetcontainer"><div><div><div id="smartasset-article"><div><p>
|
<div id="smartassetcontainer"><div><div><div id="smartasset-article"><div><p>
|
||||||
Powered by SmartAsset.com
|
Powered by SmartAsset.com
|
||||||
|
|
|
@@ -848,8 +848,6 @@ impl FullTextParser {
|
||||||
Util::mark_data_tables(&context)?;
|
Util::mark_data_tables(&context)?;
|
||||||
|
|
||||||
if let Some(mut root) = document.get_root_element() {
|
if let Some(mut root) = document.get_root_element() {
|
||||||
Self::remove_extra_p_and_div(&mut root);
|
|
||||||
|
|
||||||
Util::clean_conditionally(&mut root, "fieldset");
|
Util::clean_conditionally(&mut root, "fieldset");
|
||||||
Util::clean_conditionally(&mut root, "table");
|
Util::clean_conditionally(&mut root, "table");
|
||||||
Util::clean_conditionally(&mut root, "ul");
|
Util::clean_conditionally(&mut root, "ul");
|
||||||
|
@@ -857,6 +855,8 @@ impl FullTextParser {
|
||||||
|
|
||||||
Self::clean_attributes(&mut root)?;
|
Self::clean_attributes(&mut root)?;
|
||||||
Self::simplify_nested_elements(&mut root)?;
|
Self::simplify_nested_elements(&mut root)?;
|
||||||
|
|
||||||
|
Self::remove_extra_p_and_div(&mut root);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@@ -866,7 +866,7 @@ impl FullTextParser {
|
||||||
let mut node_iter = Some(root.clone());
|
let mut node_iter = Some(root.clone());
|
||||||
|
|
||||||
while let Some(mut node) = node_iter {
|
while let Some(mut node) = node_iter {
|
||||||
let tag_name = node.get_name();
|
let tag_name = node.get_name().to_uppercase();
|
||||||
if tag_name == "P" || tag_name == "DIV" {
|
if tag_name == "P" || tag_name == "DIV" {
|
||||||
let img_count = Util::get_elements_by_tag_name(&node, "img").len();
|
let img_count = Util::get_elements_by_tag_name(&node, "img").len();
|
||||||
let embed_count = Util::get_elements_by_tag_name(&node, "embed").len();
|
let embed_count = Util::get_elements_by_tag_name(&node, "embed").len();
|
||||||
|
@@ -875,7 +875,9 @@ impl FullTextParser {
|
||||||
let total_count = img_count + embed_count + object_count + iframe_count;
|
let total_count = img_count + embed_count + object_count + iframe_count;
|
||||||
|
|
||||||
if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() {
|
if total_count == 0 && Util::get_inner_text(&node, false).trim().is_empty() {
|
||||||
|
node_iter = Util::next_node(&node, false);
|
||||||
node.unlink();
|
node.unlink();
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue