mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 08:30:00 +02:00
whitespace fixes
This commit is contained in:
parent
2217c3c71a
commit
b541cd73f8
46 changed files with 3808 additions and 2111 deletions
|
@ -34,7 +34,7 @@ impl Readability {
|
|||
let tag_name = node_ref.get_name().to_uppercase();
|
||||
|
||||
if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
|
||||
node = Util::remove_and_next(node_ref);
|
||||
node = Util::next_node(node_ref, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -188,18 +188,6 @@ impl Readability {
|
|||
node = Util::next_node(node_ref, false);
|
||||
}
|
||||
|
||||
// let html = document.to_string_with_options(libxml::tree::SaveOptions {
|
||||
// format: true,
|
||||
// no_declaration: false,
|
||||
// no_empty_tags: true,
|
||||
// no_xhtml: false,
|
||||
// xhtml: false,
|
||||
// as_xml: false,
|
||||
// as_html: true,
|
||||
// non_significant_whitespace: false,
|
||||
// });
|
||||
// std::fs::write("debug.html", &html).unwrap();
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
|
||||
// Then add their score to their parent node.
|
||||
|
@ -262,6 +250,10 @@ impl Readability {
|
|||
if let Some(score) = Self::get_content_score(&ancestor) {
|
||||
let add_score = content_score / score_divider;
|
||||
let new_score = score + add_score;
|
||||
log::debug!(
|
||||
"{}: {score} + {add_score} = {new_score}",
|
||||
ancestor.get_name()
|
||||
);
|
||||
Self::set_content_score(&mut ancestor, new_score)?;
|
||||
}
|
||||
}
|
||||
|
@ -289,9 +281,14 @@ impl Readability {
|
|||
});
|
||||
|
||||
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
|
||||
// for candidate in top_candidates.iter() {
|
||||
// println!("candidate: {} {:?}", candidate.get_name(), candidate.get_attributes());
|
||||
// }
|
||||
|
||||
for candidate in top_candidates.iter() {
|
||||
log::debug!(
|
||||
"candidate: {} {:?}",
|
||||
candidate.get_name(),
|
||||
candidate.get_attributes()
|
||||
);
|
||||
}
|
||||
let mut needed_to_create_top_candidate = false;
|
||||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
|
@ -302,6 +299,8 @@ impl Readability {
|
|||
rt
|
||||
});
|
||||
|
||||
//Util::serialize_node(&top_candidate, "top_candidate.html");
|
||||
|
||||
let mut alternative_candidate_ancestors = Vec::new();
|
||||
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
||||
// and whose scores are quite close to the current `topCandidate` node.
|
||||
|
@ -346,6 +345,8 @@ impl Readability {
|
|||
Self::initialize_node(&mut top_candidate, &state)?;
|
||||
}
|
||||
|
||||
//Util::serialize_node(&top_candidate, "new_top_candidate.html");
|
||||
|
||||
// Because of our bonus system, parents of candidates might have scores
|
||||
// themselves. They get half of the node. There won't be nodes with higher
|
||||
// scores than our topCandidate, but if we see the score going *up* in the first
|
||||
|
@ -433,7 +434,11 @@ impl Readability {
|
|||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score}");
|
||||
log::debug!(
|
||||
"Looking at sibling node: {} ({:?}) with score {score}",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
if top_candidate == sibling {
|
||||
append = true;
|
||||
|
@ -473,14 +478,22 @@ impl Readability {
|
|||
}
|
||||
|
||||
if append {
|
||||
log::debug!("Appending node: {sibling:?}");
|
||||
log::debug!(
|
||||
"Appending node: {} ({:?})",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
if !constants::ALTER_TO_DIV_EXCEPTIONS
|
||||
.contains(sibling.get_name().to_uppercase().as_str())
|
||||
{
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
log::debug!("Altering sibling: {sibling:?} to div.");
|
||||
log::debug!(
|
||||
"Altering sibling: {} ({:?})",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
sibling.set_name("DIV").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
@ -544,6 +557,8 @@ impl Readability {
|
|||
let text = Util::get_inner_text(&article_content, true);
|
||||
let text_length = text.len();
|
||||
|
||||
//Util::serialize_node(&article_content, "debug.html");
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
parse_successful = false;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue