1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix turning `<div>`s into `<p>`s

This commit is contained in:
Jan Lukas Gernert 2023-03-04 17:41:14 +01:00
parent d93f5c9677
commit daa5543c4e
2 changed files with 32 additions and 21 deletions

View file

@ -120,31 +120,39 @@ impl Readability {
if tag_name == "DIV" {
// Put phrasing content into paragraphs.
let mut p: Option<Node> = None;
for mut child_node in node_ref.get_child_nodes().into_iter() {
if Self::is_phrasing_content(&child_node) {
for mut child in node_ref.get_child_nodes().into_iter() {
if Self::is_phrasing_content(&child) {
if let Some(p) = p.as_mut() {
child_node.unlink();
let _ = p.add_child(&mut child_node);
} else if !Util::is_whitespace(&child_node) {
child_node.unlink();
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
new_node.add_child(&mut child_node).map_err(|error| {
child.unlink();
p.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
node_ref.add_child(&mut new_node).map_err(|error| {
} else if !Util::is_whitespace(&child) {
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
let mut old_node = node_ref.replace_child_node(new_node.clone(), child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
new_node.add_child(&mut old_node).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
p.replace(new_node);
}
} else if let Some(p) = p.as_mut() {
} else if p.is_some() {
if let Some(p) = p.as_mut() {
for mut r_node in p.get_child_nodes().into_iter().rev() {
if Util::is_whitespace(&r_node) {
r_node.unlink();
continue;
}
break;
}
}
_ = p.take();
}
}
@ -424,10 +432,11 @@ impl Readability {
let sibling_classes = sibling.get_class_names();
let tc_classes = top_candidate.get_class_names();
if sibling_classes
if !tc_classes.is_empty()
&& !sibling_classes.is_empty()
&& sibling_classes
.iter()
.all(|class| tc_classes.contains(class))
&& !tc_classes.is_empty()
{
content_bonus +=
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;

View file

@ -249,6 +249,9 @@ impl Util {
}
pub fn is_whitespace(node: &Node) -> bool {
let content = node.get_content();
let tag_name = node.get_name().to_uppercase();
let is_text_node = node
.get_type()
.map(|t| t == NodeType::TextNode)
@ -258,8 +261,7 @@ impl Util {
.map(|t| t == NodeType::ElementNode)
.unwrap_or(false);
(is_text_node && node.get_content().trim().is_empty())
|| (is_element_node && node.get_name().to_uppercase() == "BR")
(is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR")
}
pub fn remove_and_next(node: &mut Node) -> Option<Node> {