1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-09 17:05:30 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-02-26 02:22:53 +01:00
parent d8e3a75b01
commit 0834c4d72a
8 changed files with 3234 additions and 489 deletions

View file

@ -32,6 +32,12 @@ impl Readability {
while let Some(node_ref) = node.as_mut() {
let tag_name = node_ref.get_name().to_uppercase();
if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
node = Util::remove_and_next(node_ref);
continue;
}
let match_string = node_ref
.get_class_names()
.iter()
@ -107,16 +113,12 @@ impl Readability {
for mut child_node in node_ref.get_child_nodes().into_iter() {
if Self::is_phrasing_content(&child_node) {
if let Some(p) = p.as_mut() {
child_node.unlink();
let _ = p.add_child(&mut child_node);
} else if !Util::is_whitespace(&child_node) {
child_node.unlink();
let mut new_node = Node::new("p", None, &document)
.map_err(|()| FullTextParserError::Readability)?;
node_ref
.replace_child_node(new_node.clone(), child_node.clone())
.map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
new_node.add_child(&mut child_node).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
@ -247,6 +249,9 @@ impl Readability {
});
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
// for candidate in top_candidates.iter() {
// println!("candidate: {} {:?}", candidate.get_name(), candidate.get_attributes());
// }
let mut needed_to_create_top_candidate = false;
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
// If we still have no top candidate, just use the body as a last resort.
@ -619,12 +624,8 @@ impl Readability {
is_text_node
|| constants::PHRASING_ELEMS.contains(&tag_name.as_str())
|| (tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
&& node
.get_child_nodes()
.iter()
.map(Self::is_phrasing_content)
.all(|val| val)
|| ((tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
&& node.get_child_nodes().iter().all(Self::is_phrasing_content))
}
// Initialize a node with the readability object. Also checks the

View file

@ -7,7 +7,7 @@ use crate::{
};
async fn run_test(name: &str) {
libxml::tree::node::set_node_rc_guard(3);
libxml::tree::node::set_node_rc_guard(4);
let _ = env_logger::builder().is_test(true).try_init();
let empty_config = ConfigEntry::default();
@ -43,22 +43,27 @@ async fn run_test(name: &str) {
article.document = Some(article_document);
let html = article.get_content().unwrap();
//std::fs::write("expected.html", &html).unwrap();
let expected = std::fs::read_to_string(format!(
"./resources/tests/readability/{name}/expected.html"
))
.expect("Failed to read expected HTML");
//std::fs::write("expected.html", &html).unwrap();
assert_eq!(expected, html);
}
/// Readability regression test for the fixture named `001`.
///
/// Delegates to `run_test`, which compares the generated article HTML with
/// `./resources/tests/readability/001/expected.html`.
// NOTE(review): two test attributes are stacked below; this looks like the
// removed and the added line of a diff shown together — confirm that only one
// of them is present in the actual source file.
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn test_001() {
run_test("001").await
}
/// Readability regression test for the fixture named `002`.
///
/// Delegates to `run_test`, which compares the generated article HTML with
/// `./resources/tests/readability/002/expected.html`.
// NOTE(review): two test attributes are stacked below; this looks like the
// removed and the added line of a diff shown together — confirm that only one
// of them is present in the actual source file.
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn test_002() {
run_test("002").await
}
/// Readability regression test for the fixture named `webmd-1`.
///
/// Delegates to `run_test`, which compares the generated article HTML with
/// `./resources/tests/readability/webmd-1/expected.html`.
#[tokio::test]
async fn webmd_1() {
    // Name of the fixture directory handed to the shared test driver.
    let fixture = "webmd-1";
    run_test(fixture).await;
}