mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
fix has_single_tag_inside_element & update tests
This commit is contained in:
parent
31a8033844
commit
aea57d0cf3
10 changed files with 86 additions and 39 deletions
|
@ -399,8 +399,8 @@ impl Readability {
|
|||
for mut sibling in siblings {
|
||||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score:?}");
|
||||
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score}");
|
||||
|
||||
if top_candidate == sibling {
|
||||
append = true;
|
||||
|
@ -420,9 +420,7 @@ impl Readability {
|
|||
Self::get_content_score(&top_candidate).unwrap_or(0.0) * 0.2;
|
||||
}
|
||||
|
||||
if Self::get_content_score(&sibling).unwrap_or(0.0) + content_bonus
|
||||
>= sibling_score_threshold
|
||||
{
|
||||
if score + content_bonus >= sibling_score_threshold {
|
||||
append = true;
|
||||
} else if sibling.get_name().to_uppercase() == "P" {
|
||||
let link_density = Util::get_link_density(&sibling);
|
||||
|
@ -476,12 +474,6 @@ impl Readability {
|
|||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
top_candidate
|
||||
.set_property("class", "page")
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
} else {
|
||||
let mut div = Node::new("DIV", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
@ -490,10 +482,6 @@ impl Readability {
|
|||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
div.set_property("class", "page").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
child.unlink();
|
||||
|
|
|
@ -40,7 +40,7 @@ async fn run_test(name: &str) {
|
|||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
std::fs::write("expected.html", &html).unwrap();
|
||||
//std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
"./resources/tests/readability/{name}/expected.html"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue