mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-09 17:05:30 +02:00
fixes
This commit is contained in:
parent
d8e3a75b01
commit
0834c4d72a
8 changed files with 3234 additions and 489 deletions
|
@ -32,6 +32,12 @@ impl Readability {
|
|||
|
||||
while let Some(node_ref) = node.as_mut() {
|
||||
let tag_name = node_ref.get_name().to_uppercase();
|
||||
|
||||
if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
|
||||
node = Util::remove_and_next(node_ref);
|
||||
continue;
|
||||
}
|
||||
|
||||
let match_string = node_ref
|
||||
.get_class_names()
|
||||
.iter()
|
||||
|
@ -107,16 +113,12 @@ impl Readability {
|
|||
for mut child_node in node_ref.get_child_nodes().into_iter() {
|
||||
if Self::is_phrasing_content(&child_node) {
|
||||
if let Some(p) = p.as_mut() {
|
||||
child_node.unlink();
|
||||
let _ = p.add_child(&mut child_node);
|
||||
} else if !Util::is_whitespace(&child_node) {
|
||||
child_node.unlink();
|
||||
let mut new_node = Node::new("p", None, &document)
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
node_ref
|
||||
.replace_child_node(new_node.clone(), child_node.clone())
|
||||
.map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
new_node.add_child(&mut child_node).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
|
@ -247,6 +249,9 @@ impl Readability {
|
|||
});
|
||||
|
||||
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
|
||||
// for candidate in top_candidates.iter() {
|
||||
// println!("candidate: {} {:?}", candidate.get_name(), candidate.get_attributes());
|
||||
// }
|
||||
let mut needed_to_create_top_candidate = false;
|
||||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
|
@ -619,12 +624,8 @@ impl Readability {
|
|||
|
||||
is_text_node
|
||||
|| constants::PHRASING_ELEMS.contains(&tag_name.as_str())
|
||||
|| (tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
|
||||
&& node
|
||||
.get_child_nodes()
|
||||
.iter()
|
||||
.map(Self::is_phrasing_content)
|
||||
.all(|val| val)
|
||||
|| ((tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
|
||||
&& node.get_child_nodes().iter().all(Self::is_phrasing_content))
|
||||
}
|
||||
|
||||
// Initialize a node with the readability object. Also checks the
|
||||
|
|
|
@ -7,7 +7,7 @@ use crate::{
|
|||
};
|
||||
|
||||
async fn run_test(name: &str) {
|
||||
libxml::tree::node::set_node_rc_guard(3);
|
||||
libxml::tree::node::set_node_rc_guard(4);
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let empty_config = ConfigEntry::default();
|
||||
|
@ -43,22 +43,27 @@ async fn run_test(name: &str) {
|
|||
article.document = Some(article_document);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
//std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
"./resources/tests/readability/{name}/expected.html"
|
||||
))
|
||||
.expect("Failed to read expected HTML");
|
||||
|
||||
//std::fs::write("expected.html", &html).unwrap();
|
||||
|
||||
assert_eq!(expected, html);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn test_001() {
|
||||
run_test("001").await
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn test_002() {
|
||||
run_test("002").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue