mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fix replacing font tags
This commit is contained in:
parent
253afc48f0
commit
be6e08bd6d
4 changed files with 30 additions and 8 deletions
18
resources/tests/readability/replace-font-tags/expected.html
Normal file
18
resources/tests/readability/replace-font-tags/expected.html
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
<article><DIV id="readability-page-1"><article>
|
||||||
|
<h2>Lorem</h2>
|
||||||
|
<p><span face="Arial" size="2"><span face="Times" size="10">Lorem</span> ipsum dolor</span> sit amet, consectetur adipisicing elit, sed do eiusmod
|
||||||
|
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. <span face="Arial" size="2">Duis</span> aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||||
|
</p>
|
||||||
|
<h2>Foo</h2>
|
||||||
|
<p>
|
||||||
|
Tempor incididunt ut labore et <span face="Arial" size="2">dolore</span> magna aliqua. Ut enim ad minim veniam,
|
||||||
|
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
|
||||||
|
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
|
||||||
|
cillum dolore eu fugiat nulla pariatur. <span face="Arial" size="2">Excepteur sint occaecat</span> cupidatat non
|
||||||
|
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
|
||||||
|
</p>
|
||||||
|
</article></DIV></article>
|
|
@ -646,6 +646,13 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rename all font nodes to span
|
||||||
|
if let Ok(font_nodes) = Util::evaluate_xpath(context, "//font", false) {
|
||||||
|
for mut font_node in font_nodes {
|
||||||
|
_ = font_node.set_name("span");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
_ = Util::mark_data_tables(context);
|
_ = Util::mark_data_tables(context);
|
||||||
|
|
||||||
// strip specified xpath
|
// strip specified xpath
|
||||||
|
@ -1071,10 +1078,6 @@ impl FullTextParser {
|
||||||
FullTextParserError::Xml
|
FullTextParserError::Xml
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
if node.get_name().to_uppercase() == "FONT" {
|
|
||||||
node.set_name("span").unwrap();
|
|
||||||
}
|
|
||||||
|
|
||||||
node_iter = Util::next_node(&node, false);
|
node_iter = Util::next_node(&node, false);
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
|
|
@ -705,6 +705,7 @@ impl Readability {
|
||||||
0
|
0
|
||||||
};
|
};
|
||||||
let score = score + class_weight;
|
let score = score + class_weight;
|
||||||
|
log::debug!("initialize node {} {}: {score}", node.get_name(), node.get_attribute("class").unwrap_or_default());
|
||||||
Self::set_content_score(node, score as f64)?;
|
Self::set_content_score(node, score as f64)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
|
@ -433,10 +433,10 @@ async fn remove_script_tags() {
|
||||||
run_test("remove-script-tags").await
|
run_test("remove-script-tags").await
|
||||||
}
|
}
|
||||||
|
|
||||||
// #[tokio::test]
|
#[tokio::test]
|
||||||
// async fn replace_font_tags() {
|
async fn replace_font_tags() {
|
||||||
// run_test("replace-font-tags").await
|
run_test("replace-font-tags").await
|
||||||
// }
|
}
|
||||||
|
|
||||||
#[tokio::test]
|
#[tokio::test]
|
||||||
async fn webmd_1() {
|
async fn webmd_1() {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue