1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-02-25 00:43:42 +01:00
parent e3246af28b
commit 63035ca028
4 changed files with 11 additions and 9 deletions

View file

@ -23,7 +23,10 @@ pub fn extract(
let new_title = constants::TITLE_CUT_END.replace(&title, "$1"); let new_title = constants::TITLE_CUT_END.replace(&title, "$1");
let word_count = constants::WORD_COUNT.split(&title).count(); let word_count = constants::WORD_COUNT.split(&title).count();
if word_count < 3 { if word_count < 3 {
constants::TITLE_CUT_FRONT.replace(&title, "$1").trim().to_string() constants::TITLE_CUT_FRONT
.replace(&title, "$1")
.trim()
.to_string()
} else { } else {
new_title.trim().to_string() new_title.trim().to_string()
} }

View file

@ -837,9 +837,7 @@ impl FullTextParser {
Ok(()) Ok(())
} }
pub(crate) fn post_process_content( pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
root: &mut Node
) -> Result<(), FullTextParserError> {
Self::clean_classes(root)?; Self::clean_classes(root)?;
Self::simplify_nested_elements(root)?; Self::simplify_nested_elements(root)?;
Ok(()) Ok(())

View file

@ -280,8 +280,7 @@ impl Readability {
constants::MINIMUM_TOPCANDIDATES, constants::MINIMUM_TOPCANDIDATES,
); );
for ancestor in alternative_candidate_ancestors.iter().take(tmp) { for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor += lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
if ancestor == parent { 1 } else { 0 };
} }
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES { if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {

View file

@ -43,11 +43,13 @@ async fn run_test(name: &str) {
article.document = Some(article_document); article.document = Some(article_document);
let html = article.get_content().unwrap(); let html = article.get_content().unwrap();
let expected = std::fs::read_to_string(format!("./resources/tests/readability/{name}/expected.html")) let expected = std::fs::read_to_string(format!(
.expect("Failed to read expected HTML"); "./resources/tests/readability/{name}/expected.html"
))
.expect("Failed to read expected HTML");
//std::fs::write("expected.html", &html).unwrap(); //std::fs::write("expected.html", &html).unwrap();
assert_eq!(expected, html); assert_eq!(expected, html);
} }