1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix turning div elements into p elements

This commit is contained in:
Jan Lukas Gernert 2023-03-04 17:41:14 +01:00
parent d93f5c9677
commit daa5543c4e
2 changed files with 32 additions and 21 deletions

View file

@ -120,31 +120,39 @@ impl Readability {
if tag_name == "DIV" { if tag_name == "DIV" {
// Put phrasing content into paragraphs. // Put phrasing content into paragraphs.
let mut p: Option<Node> = None; let mut p: Option<Node> = None;
for mut child_node in node_ref.get_child_nodes().into_iter() { for mut child in node_ref.get_child_nodes().into_iter() {
if Self::is_phrasing_content(&child_node) { if Self::is_phrasing_content(&child) {
if let Some(p) = p.as_mut() { if let Some(p) = p.as_mut() {
child_node.unlink(); child.unlink();
let _ = p.add_child(&mut child_node); p.add_child(&mut child).map_err(|error| {
} else if !Util::is_whitespace(&child_node) {
child_node.unlink();
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
new_node.add_child(&mut child_node).map_err(|error| {
log::error!("{error}"); log::error!("{error}");
FullTextParserError::Readability FullTextParserError::Readability
})?; })?;
node_ref.add_child(&mut new_node).map_err(|error| { } else if !Util::is_whitespace(&child) {
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
let mut old_node = node_ref.replace_child_node(new_node.clone(), child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
new_node.add_child(&mut old_node).map_err(|error| {
log::error!("{error}"); log::error!("{error}");
FullTextParserError::Readability FullTextParserError::Readability
})?; })?;
p.replace(new_node); p.replace(new_node);
} }
} else if let Some(p) = p.as_mut() { } else if p.is_some() {
for mut r_node in p.get_child_nodes().into_iter().rev() { if let Some(p) = p.as_mut() {
if Util::is_whitespace(&r_node) { for mut r_node in p.get_child_nodes().into_iter().rev() {
r_node.unlink(); if Util::is_whitespace(&r_node) {
r_node.unlink();
continue;
}
break;
} }
} }
_ = p.take();
} }
} }
@ -337,7 +345,7 @@ impl Readability {
// non_significant_whitespace: false, // non_significant_whitespace: false,
// }); // });
// std::fs::write("doc.html", &html).unwrap(); // std::fs::write("doc.html", &html).unwrap();
// The scores shouldn't get too low. // The scores shouldn't get too low.
let score_threshold = last_score / 3.0; let score_threshold = last_score / 3.0;
@ -424,10 +432,11 @@ impl Readability {
let sibling_classes = sibling.get_class_names(); let sibling_classes = sibling.get_class_names();
let tc_classes = top_candidate.get_class_names(); let tc_classes = top_candidate.get_class_names();
if sibling_classes if !tc_classes.is_empty()
.iter() && !sibling_classes.is_empty()
.all(|class| tc_classes.contains(class)) && sibling_classes
&& !tc_classes.is_empty() .iter()
.all(|class| tc_classes.contains(class))
{ {
content_bonus += content_bonus +=
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2; Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;

View file

@ -249,6 +249,9 @@ impl Util {
} }
pub fn is_whitespace(node: &Node) -> bool { pub fn is_whitespace(node: &Node) -> bool {
let content = node.get_content();
let tag_name = node.get_name().to_uppercase();
let is_text_node = node let is_text_node = node
.get_type() .get_type()
.map(|t| t == NodeType::TextNode) .map(|t| t == NodeType::TextNode)
@ -258,8 +261,7 @@ impl Util {
.map(|t| t == NodeType::ElementNode) .map(|t| t == NodeType::ElementNode)
.unwrap_or(false); .unwrap_or(false);
(is_text_node && node.get_content().trim().is_empty()) (is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR")
|| (is_element_node && node.get_name().to_uppercase() == "BR")
} }
pub fn remove_and_next(node: &mut Node) -> Option<Node> { pub fn remove_and_next(node: &mut Node) -> Option<Node> {