1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

fix replacing font tags

This commit is contained in:
Jan Lukas Gernert 2023-04-01 12:31:56 +02:00
parent 253afc48f0
commit be6e08bd6d
4 changed files with 30 additions and 8 deletions

View file

@@ -0,0 +1,18 @@
<article><DIV id="readability-page-1"><article>
<h2>Lorem</h2>
<p><span face="Arial" size="2"><span face="Times" size="10">Lorem</span> ipsum dolor</span> sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. <span face="Arial" size="2">Duis</span> aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<h2>Foo</h2>
<p>
Tempor incididunt ut labore et <span face="Arial" size="2">dolore</span> magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. <span face="Arial" size="2">Excepteur sint occaecat</span> cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article></DIV></article>

View file

@@ -646,6 +646,13 @@ impl FullTextParser {
     }
 }
 
+// rename all font nodes to span
+if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) {
+    for mut font_node in font_nodes {
+        _ = font_node.set_name("span");
+    }
+}
+
 _ = Util::mark_data_tables(context);
 
 // strip specified xpath
@@ -1071,10 +1078,6 @@ impl FullTextParser {
         FullTextParserError::Xml
     })?;
 
-    if node.get_name().to_uppercase() == "FONT" {
-        node.set_name("span").unwrap();
-    }
-
     node_iter = Util::next_node(&node, false);
 }
 Ok(())

View file

@@ -705,6 +705,7 @@ impl Readability {
         0
     };
     let score = score + class_weight;
+    log::debug!("initialize node {} {}: {score}", node.get_name(), node.get_attribute("class").unwrap_or_default());
     Self::set_content_score(node, score as f64)?;
     Ok(())
 }

View file

@@ -433,10 +433,10 @@ async fn remove_script_tags() {
     run_test("remove-script-tags").await
 }
 
-// #[tokio::test]
-// async fn replace_font_tags() {
-//     run_test("replace-font-tags").await
-// }
+#[tokio::test]
+async fn replace_font_tags() {
+    run_test("replace-font-tags").await
+}
 
 #[tokio::test]
 async fn webmd_1() {