mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix turning `div`s into `p`s
This commit is contained in:
parent
d93f5c9677
commit
daa5543c4e
2 changed files with 32 additions and 21 deletions
|
@@ -120,31 +120,39 @@ impl Readability {
|
||||||
if tag_name == "DIV" {
|
if tag_name == "DIV" {
|
||||||
// Put phrasing content into paragraphs.
|
// Put phrasing content into paragraphs.
|
||||||
let mut p: Option<Node> = None;
|
let mut p: Option<Node> = None;
|
||||||
for mut child_node in node_ref.get_child_nodes().into_iter() {
|
for mut child in node_ref.get_child_nodes().into_iter() {
|
||||||
if Self::is_phrasing_content(&child_node) {
|
if Self::is_phrasing_content(&child) {
|
||||||
if let Some(p) = p.as_mut() {
|
if let Some(p) = p.as_mut() {
|
||||||
child_node.unlink();
|
child.unlink();
|
||||||
let _ = p.add_child(&mut child_node);
|
p.add_child(&mut child).map_err(|error| {
|
||||||
} else if !Util::is_whitespace(&child_node) {
|
|
||||||
child_node.unlink();
|
|
||||||
let mut new_node = Node::new("p", None, &document)
|
|
||||||
.map_err(|()| FullTextParserError::Readability)?;
|
|
||||||
new_node.add_child(&mut child_node).map_err(|error| {
|
|
||||||
log::error!("{error}");
|
log::error!("{error}");
|
||||||
FullTextParserError::Readability
|
FullTextParserError::Readability
|
||||||
})?;
|
})?;
|
||||||
node_ref.add_child(&mut new_node).map_err(|error| {
|
} else if !Util::is_whitespace(&child) {
|
||||||
|
let mut new_node = Node::new("p", None, &document)
|
||||||
|
.map_err(|()| FullTextParserError::Readability)?;
|
||||||
|
let mut old_node = node_ref.replace_child_node(new_node.clone(), child).map_err(|error| {
|
||||||
|
log::error!("{error}");
|
||||||
|
FullTextParserError::Readability
|
||||||
|
})?;
|
||||||
|
|
||||||
|
new_node.add_child(&mut old_node).map_err(|error| {
|
||||||
log::error!("{error}");
|
log::error!("{error}");
|
||||||
FullTextParserError::Readability
|
FullTextParserError::Readability
|
||||||
})?;
|
})?;
|
||||||
p.replace(new_node);
|
p.replace(new_node);
|
||||||
}
|
}
|
||||||
} else if let Some(p) = p.as_mut() {
|
} else if p.is_some() {
|
||||||
for mut r_node in p.get_child_nodes().into_iter().rev() {
|
if let Some(p) = p.as_mut() {
|
||||||
if Util::is_whitespace(&r_node) {
|
for mut r_node in p.get_child_nodes().into_iter().rev() {
|
||||||
r_node.unlink();
|
if Util::is_whitespace(&r_node) {
|
||||||
|
r_node.unlink();
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
_ = p.take();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -337,7 +345,7 @@ impl Readability {
|
||||||
// non_significant_whitespace: false,
|
// non_significant_whitespace: false,
|
||||||
// });
|
// });
|
||||||
// std::fs::write("doc.html", &html).unwrap();
|
// std::fs::write("doc.html", &html).unwrap();
|
||||||
|
|
||||||
// The scores shouldn't get too low.
|
// The scores shouldn't get too low.
|
||||||
let score_threshold = last_score / 3.0;
|
let score_threshold = last_score / 3.0;
|
||||||
|
|
||||||
|
@@ -424,10 +432,11 @@ impl Readability {
|
||||||
let sibling_classes = sibling.get_class_names();
|
let sibling_classes = sibling.get_class_names();
|
||||||
let tc_classes = top_candidate.get_class_names();
|
let tc_classes = top_candidate.get_class_names();
|
||||||
|
|
||||||
if sibling_classes
|
if !tc_classes.is_empty()
|
||||||
.iter()
|
&& !sibling_classes.is_empty()
|
||||||
.all(|class| tc_classes.contains(class))
|
&& sibling_classes
|
||||||
&& !tc_classes.is_empty()
|
.iter()
|
||||||
|
.all(|class| tc_classes.contains(class))
|
||||||
{
|
{
|
||||||
content_bonus +=
|
content_bonus +=
|
||||||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
||||||
|
|
|
@@ -249,6 +249,9 @@ impl Util {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn is_whitespace(node: &Node) -> bool {
|
pub fn is_whitespace(node: &Node) -> bool {
|
||||||
|
let content = node.get_content();
|
||||||
|
let tag_name = node.get_name().to_uppercase();
|
||||||
|
|
||||||
let is_text_node = node
|
let is_text_node = node
|
||||||
.get_type()
|
.get_type()
|
||||||
.map(|t| t == NodeType::TextNode)
|
.map(|t| t == NodeType::TextNode)
|
||||||
|
@@ -258,8 +261,7 @@ impl Util {
|
||||||
.map(|t| t == NodeType::ElementNode)
|
.map(|t| t == NodeType::ElementNode)
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
(is_text_node && node.get_content().trim().is_empty())
|
(is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR")
|
||||||
|| (is_element_node && node.get_name().to_uppercase() == "BR")
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue