1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

whitespace fixes

This commit is contained in:
Jan Lukas Gernert 2023-03-24 08:02:08 +01:00
parent 2217c3c71a
commit b541cd73f8
46 changed files with 3808 additions and 2111 deletions

View file

@ -34,7 +34,7 @@ impl Readability {
let tag_name = node_ref.get_name().to_uppercase();
if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
node = Util::remove_and_next(node_ref);
node = Util::next_node(node_ref, true);
continue;
}
@ -188,18 +188,6 @@ impl Readability {
node = Util::next_node(node_ref, false);
}
// let html = document.to_string_with_options(libxml::tree::SaveOptions {
// format: true,
// no_declaration: false,
// no_empty_tags: true,
// no_xhtml: false,
// xhtml: false,
// as_xml: false,
// as_html: true,
// non_significant_whitespace: false,
// });
// std::fs::write("debug.html", &html).unwrap();
let mut candidates = Vec::new();
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
// Then add their score to their parent node.
@ -262,6 +250,10 @@ impl Readability {
if let Some(score) = Self::get_content_score(&ancestor) {
let add_score = content_score / score_divider;
let new_score = score + add_score;
log::debug!(
"{}: {score} + {add_score} = {new_score}",
ancestor.get_name()
);
Self::set_content_score(&mut ancestor, new_score)?;
}
}
@ -289,9 +281,14 @@ impl Readability {
});
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
// for candidate in top_candidates.iter() {
// println!("candidate: {} {:?}", candidate.get_name(), candidate.get_attributes());
// }
for candidate in top_candidates.iter() {
log::debug!(
"candidate: {} {:?}",
candidate.get_name(),
candidate.get_attributes()
);
}
let mut needed_to_create_top_candidate = false;
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
// If we still have no top candidate, just use the body as a last resort.
@ -302,6 +299,8 @@ impl Readability {
rt
});
//Util::serialize_node(&top_candidate, "top_candidate.html");
let mut alternative_candidate_ancestors = Vec::new();
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
// and whose scores are quite close to the current `topCandidate` node.
@ -346,6 +345,8 @@ impl Readability {
Self::initialize_node(&mut top_candidate, &state)?;
}
//Util::serialize_node(&top_candidate, "new_top_candidate.html");
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node's score. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
@ -433,7 +434,11 @@ impl Readability {
let mut append = false;
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
log::debug!("Looking at sibling node: {sibling:?} with score {score}");
log::debug!(
"Looking at sibling node: {} ({:?}) with score {score}",
sibling.get_name(),
sibling.get_attribute("class")
);
if top_candidate == sibling {
append = true;
@ -473,14 +478,22 @@ impl Readability {
}
if append {
log::debug!("Appending node: {sibling:?}");
log::debug!(
"Appending node: {} ({:?})",
sibling.get_name(),
sibling.get_attribute("class")
);
if !constants::ALTER_TO_DIV_EXCEPTIONS
.contains(sibling.get_name().to_uppercase().as_str())
{
// We have a node that isn't a common block level element, like a form or td tag.
// Turn it into a div so it doesn't get filtered out later by accident.
log::debug!("Altering sibling: {sibling:?} to div.");
log::debug!(
"Altering sibling: {} ({:?})",
sibling.get_name(),
sibling.get_attribute("class")
);
sibling.set_name("DIV").map_err(|error| {
log::error!("{error}");
@ -544,6 +557,8 @@ impl Readability {
let text = Util::get_inner_text(&article_content, true);
let text_length = text.len();
//Util::serialize_node(&article_content, "debug.html");
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
parse_successful = false;

View file

@ -8,7 +8,10 @@ use crate::{
async fn run_test(name: &str) {
libxml::tree::node::set_node_rc_guard(10);
let _ = env_logger::builder().is_test(true).try_init();
let _ = env_logger::builder()
.filter_level(log::LevelFilter::Debug)
.is_test(true)
.try_init();
let empty_config = ConfigEntry::default();
@ -170,7 +173,7 @@ async fn dropbox_blog() {
}
#[tokio::test]
async fn ebbb_org() {
async fn ebb_org() {
run_test("ebb-org").await
}