1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-02-25 00:43:42 +01:00
parent e3246af28b
commit 63035ca028
4 changed files with 11 additions and 9 deletions

View file

@@ -23,7 +23,10 @@ pub fn extract(
let new_title = constants::TITLE_CUT_END.replace(&title, "$1");
let word_count = constants::WORD_COUNT.split(&title).count();
if word_count < 3 {
constants::TITLE_CUT_FRONT.replace(&title, "$1").trim().to_string()
constants::TITLE_CUT_FRONT
.replace(&title, "$1")
.trim()
.to_string()
} else {
new_title.trim().to_string()
}

View file

@@ -837,9 +837,7 @@ impl FullTextParser {
Ok(())
}
pub(crate) fn post_process_content(
root: &mut Node
) -> Result<(), FullTextParserError> {
pub(crate) fn post_process_content(root: &mut Node) -> Result<(), FullTextParserError> {
Self::clean_classes(root)?;
Self::simplify_nested_elements(root)?;
Ok(())

View file

@@ -280,8 +280,7 @@ impl Readability {
constants::MINIMUM_TOPCANDIDATES,
);
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor +=
if ancestor == parent { 1 } else { 0 };
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
}
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {

View file

@@ -43,11 +43,13 @@ async fn run_test(name: &str) {
article.document = Some(article_document);
let html = article.get_content().unwrap();
let expected = std::fs::read_to_string(format!("./resources/tests/readability/{name}/expected.html"))
.expect("Failed to read expected HTML");
let expected = std::fs::read_to_string(format!(
"./resources/tests/readability/{name}/expected.html"
))
.expect("Failed to read expected HTML");
//std::fs::write("expected.html", &html).unwrap();
assert_eq!(expected, html);
}