mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix turning `div`s into `p`s
This commit is contained in:
parent
d93f5c9677
commit
daa5543c4e
2 changed files with 32 additions and 21 deletions
|
@ -120,31 +120,39 @@ impl Readability {
|
|||
if tag_name == "DIV" {
|
||||
// Put phrasing content into paragraphs.
|
||||
let mut p: Option<Node> = None;
|
||||
for mut child_node in node_ref.get_child_nodes().into_iter() {
|
||||
if Self::is_phrasing_content(&child_node) {
|
||||
for mut child in node_ref.get_child_nodes().into_iter() {
|
||||
if Self::is_phrasing_content(&child) {
|
||||
if let Some(p) = p.as_mut() {
|
||||
child_node.unlink();
|
||||
let _ = p.add_child(&mut child_node);
|
||||
} else if !Util::is_whitespace(&child_node) {
|
||||
child_node.unlink();
|
||||
let mut new_node = Node::new("p", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
new_node.add_child(&mut child_node).map_err(|error| {
|
||||
child.unlink();
|
||||
p.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
node_ref.add_child(&mut new_node).map_err(|error| {
|
||||
} else if !Util::is_whitespace(&child) {
|
||||
let mut new_node = Node::new("p", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
let mut old_node = node_ref.replace_child_node(new_node.clone(), child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
|
||||
new_node.add_child(&mut old_node).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
p.replace(new_node);
|
||||
}
|
||||
} else if let Some(p) = p.as_mut() {
|
||||
} else if p.is_some() {
|
||||
if let Some(p) = p.as_mut() {
|
||||
for mut r_node in p.get_child_nodes().into_iter().rev() {
|
||||
if Util::is_whitespace(&r_node) {
|
||||
r_node.unlink();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
_ = p.take();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -424,10 +432,11 @@ impl Readability {
|
|||
let sibling_classes = sibling.get_class_names();
|
||||
let tc_classes = top_candidate.get_class_names();
|
||||
|
||||
if sibling_classes
|
||||
if !tc_classes.is_empty()
|
||||
&& !sibling_classes.is_empty()
|
||||
&& sibling_classes
|
||||
.iter()
|
||||
.all(|class| tc_classes.contains(class))
|
||||
&& !tc_classes.is_empty()
|
||||
{
|
||||
content_bonus +=
|
||||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
||||
|
|
|
@ -249,6 +249,9 @@ impl Util {
|
|||
}
|
||||
|
||||
pub fn is_whitespace(node: &Node) -> bool {
|
||||
let content = node.get_content();
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
|
||||
let is_text_node = node
|
||||
.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
|
@ -258,8 +261,7 @@ impl Util {
|
|||
.map(|t| t == NodeType::ElementNode)
|
||||
.unwrap_or(false);
|
||||
|
||||
(is_text_node && node.get_content().trim().is_empty())
|
||||
|| (is_element_node && node.get_name().to_uppercase() == "BR")
|
||||
(is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR")
|
||||
}
|
||||
|
||||
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue