mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-10 01:15:31 +02:00
fix has_single_tag_inside_element & update tests
This commit is contained in:
parent
31a8033844
commit
aea57d0cf3
10 changed files with 86 additions and 39 deletions
|
@ -900,18 +900,10 @@ impl FullTextParser {
|
|||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
let classes = node.get_class_names();
|
||||
if classes.contains("page") {
|
||||
node.set_attribute("class", "page").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
} else {
|
||||
node.remove_attribute("class").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
}
|
||||
node.remove_attribute("class").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node.remove_attribute(constants::SCORE_ATTR).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
|
|
|
@ -399,8 +399,8 @@ impl Readability {
|
|||
for mut sibling in siblings {
|
||||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score:?}");
|
||||
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score}");
|
||||
|
||||
if top_candidate == sibling {
|
||||
append = true;
|
||||
|
@ -420,9 +420,7 @@ impl Readability {
|
|||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
||||
}
|
||||
|
||||
if Self::get_content_score(&sibling).unwrap_or(0.0) + content_bonus
|
||||
>= sibling_score_threshold
|
||||
{
|
||||
if score + content_bonus >= sibling_score_threshold {
|
||||
append = true;
|
||||
} else if sibling.get_name().to_uppercase() == "P" {
|
||||
let link_density = Util::get_link_density(&sibling);
|
||||
|
@ -476,12 +474,6 @@ impl Readability {
|
|||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
top_candidate
|
||||
.set_property("class", "page")
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
} else {
|
||||
let mut div = Node::new("DIV", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
@ -490,10 +482,6 @@ impl Readability {
|
|||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
div.set_property("class", "page").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
child.unlink();
|
||||
|
|
|
@ -40,7 +40,7 @@ async fn run_test(name: &str) {
|
|||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
std::fs::write("expected.html", &html).unwrap();
|
||||
//std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
"./resources/tests/readability/{name}/expected.html"
|
||||
|
|
|
@ -380,7 +380,7 @@ impl Util {
|
|||
}
|
||||
|
||||
// And there should be no text nodes with real content
|
||||
node.get_child_nodes().iter().any(|n| {
|
||||
!node.get_child_nodes().iter().any(|n| {
|
||||
n.get_type()
|
||||
.map(|t| t == NodeType::TextNode)
|
||||
.unwrap_or(false)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue