mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-10 01:15:31 +02:00
whitespace fixes
This commit is contained in:
parent
2217c3c71a
commit
b541cd73f8
46 changed files with 3808 additions and 2111 deletions
|
@ -18,8 +18,7 @@ pub static BYLINE: Lazy<Regex> = Lazy::new(|| {
|
|||
.build()
|
||||
.expect("BYLINE regex")
|
||||
});
|
||||
pub static NORMALIZE: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\s{2,}/g"#).expect("NORMALIZE regex"));
|
||||
pub static NORMALIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\s{2,}"#).expect("NORMALIZE regex"));
|
||||
pub static TOKENIZE: Lazy<Regex> = Lazy::new(|| Regex::new(r#"\W+"#).expect("TOKENIZE regex"));
|
||||
pub static UNLIELY_CANDIDATES: Lazy<Regex> = Lazy::new(|| {
|
||||
RegexBuilder::new(r#"-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote"#).case_insensitive(true).build().expect("UNLIELY_CANDIDATES regex")
|
||||
|
|
|
@ -1001,7 +1001,7 @@ impl FullTextParser {
|
|||
|| Util::has_single_tag_inside_element(&node, "SECTION")
|
||||
{
|
||||
if let Some(mut parent) = node.get_parent() {
|
||||
if let Some(mut child) = node.get_child_nodes().into_iter().next() {
|
||||
if let Some(mut child) = node.get_child_elements().into_iter().next() {
|
||||
for (k, v) in node.get_attributes().into_iter() {
|
||||
child.set_attribute(&k, &v).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
|
|
|
@ -34,7 +34,7 @@ impl Readability {
|
|||
let tag_name = node_ref.get_name().to_uppercase();
|
||||
|
||||
if tag_name == "TEXT" && node_ref.get_content().trim().is_empty() {
|
||||
node = Util::remove_and_next(node_ref);
|
||||
node = Util::next_node(node_ref, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -188,18 +188,6 @@ impl Readability {
|
|||
node = Util::next_node(node_ref, false);
|
||||
}
|
||||
|
||||
// let html = document.to_string_with_options(libxml::tree::SaveOptions {
|
||||
// format: true,
|
||||
// no_declaration: false,
|
||||
// no_empty_tags: true,
|
||||
// no_xhtml: false,
|
||||
// xhtml: false,
|
||||
// as_xml: false,
|
||||
// as_html: true,
|
||||
// non_significant_whitespace: false,
|
||||
// });
|
||||
// std::fs::write("debug.html", &html).unwrap();
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
|
||||
// Then add their score to their parent node.
|
||||
|
@ -262,6 +250,10 @@ impl Readability {
|
|||
if let Some(score) = Self::get_content_score(&ancestor) {
|
||||
let add_score = content_score / score_divider;
|
||||
let new_score = score + add_score;
|
||||
log::debug!(
|
||||
"{}: {score} + {add_score} = {new_score}",
|
||||
ancestor.get_name()
|
||||
);
|
||||
Self::set_content_score(&mut ancestor, new_score)?;
|
||||
}
|
||||
}
|
||||
|
@ -289,9 +281,14 @@ impl Readability {
|
|||
});
|
||||
|
||||
let top_candidates = candidates.into_iter().take(5).collect::<Vec<_>>();
|
||||
// for candidate in top_candidates.iter() {
|
||||
// println!("candidate: {} {:?}", candidate.get_name(), candidate.get_attributes());
|
||||
// }
|
||||
|
||||
for candidate in top_candidates.iter() {
|
||||
log::debug!(
|
||||
"candidate: {} {:?}",
|
||||
candidate.get_name(),
|
||||
candidate.get_attributes()
|
||||
);
|
||||
}
|
||||
let mut needed_to_create_top_candidate = false;
|
||||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
|
@ -302,6 +299,8 @@ impl Readability {
|
|||
rt
|
||||
});
|
||||
|
||||
//Util::serialize_node(&top_candidate, "top_candidate.html");
|
||||
|
||||
let mut alternative_candidate_ancestors = Vec::new();
|
||||
// Find a better top candidate node if it contains (at least three) nodes which belong to `topCandidates` array
|
||||
// and whose scores are quite closed with current `topCandidate` node.
|
||||
|
@ -346,6 +345,8 @@ impl Readability {
|
|||
Self::initialize_node(&mut top_candidate, &state)?;
|
||||
}
|
||||
|
||||
//Util::serialize_node(&top_candidate, "new_top_candidate.html");
|
||||
|
||||
// Because of our bonus system, parents of candidates might have scores
|
||||
// themselves. They get half of the node. There won't be nodes with higher
|
||||
// scores than our topCandidate, but if we see the score going *up* in the first
|
||||
|
@ -433,7 +434,11 @@ impl Readability {
|
|||
let mut append = false;
|
||||
|
||||
let score = Self::get_content_score(&sibling).unwrap_or(0.0);
|
||||
log::debug!("Looking at sibling node: {sibling:?} with score {score}");
|
||||
log::debug!(
|
||||
"Looking at sibling node: {} ({:?}) with score {score}",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
if top_candidate == sibling {
|
||||
append = true;
|
||||
|
@ -473,14 +478,22 @@ impl Readability {
|
|||
}
|
||||
|
||||
if append {
|
||||
log::debug!("Appending node: {sibling:?}");
|
||||
log::debug!(
|
||||
"Appending node: {} ({:?})",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
if !constants::ALTER_TO_DIV_EXCEPTIONS
|
||||
.contains(sibling.get_name().to_uppercase().as_str())
|
||||
{
|
||||
// We have a node that isn't a common block level element, like a form or td tag.
|
||||
// Turn it into a div so it doesn't get filtered out later by accident.
|
||||
log::debug!("Altering sibling: {sibling:?} to div.");
|
||||
log::debug!(
|
||||
"Altering sibling: {} ({:?})",
|
||||
sibling.get_name(),
|
||||
sibling.get_attribute("class")
|
||||
);
|
||||
|
||||
sibling.set_name("DIV").map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
@ -544,6 +557,8 @@ impl Readability {
|
|||
let text = Util::get_inner_text(&article_content, true);
|
||||
let text_length = text.len();
|
||||
|
||||
//Util::serialize_node(&article_content, "debug.html");
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
parse_successful = false;
|
||||
|
||||
|
|
|
@ -8,7 +8,10 @@ use crate::{
|
|||
|
||||
async fn run_test(name: &str) {
|
||||
libxml::tree::node::set_node_rc_guard(10);
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
let _ = env_logger::builder()
|
||||
.filter_level(log::LevelFilter::Debug)
|
||||
.is_test(true)
|
||||
.try_init();
|
||||
|
||||
let empty_config = ConfigEntry::default();
|
||||
|
||||
|
@ -170,7 +173,7 @@ async fn dropbox_blog() {
|
|||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn ebbb_org() {
|
||||
async fn ebb_org() {
|
||||
run_test("ebb-org").await
|
||||
}
|
||||
|
||||
|
|
25
src/util.rs
25
src/util.rs
|
@ -308,7 +308,7 @@ impl Util {
|
|||
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
|
||||
let content = node.get_content().trim().to_owned();
|
||||
if normalize_spaces {
|
||||
constants::NORMALIZE.replace(&content, " ").into()
|
||||
constants::NORMALIZE.replace_all(&content, " ").into()
|
||||
} else {
|
||||
content
|
||||
}
|
||||
|
@ -427,7 +427,7 @@ impl Util {
|
|||
}
|
||||
|
||||
pub fn get_link_density(node: &Node) -> f64 {
|
||||
let text_length = Util::get_inner_text(node, false).len();
|
||||
let text_length = Util::get_inner_text(node, true).len();
|
||||
if text_length == 0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
@ -443,7 +443,7 @@ impl Util {
|
|||
} else {
|
||||
1.0
|
||||
};
|
||||
link_length += Util::get_inner_text(&link_node, false).len() as f64 * coefficient;
|
||||
link_length += Util::get_inner_text(&link_node, true).len() as f64 * coefficient;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -580,7 +580,7 @@ impl Util {
|
|||
}
|
||||
|
||||
let link_density = Self::get_link_density(node);
|
||||
let content = Self::get_inner_text(node, false);
|
||||
let content = Self::get_inner_text(node, true);
|
||||
let content_length = content.len();
|
||||
|
||||
let have_to_remove = (img > 1
|
||||
|
@ -780,4 +780,21 @@ impl Util {
|
|||
|| ((tag_name == "A" || tag_name == "DEL" || tag_name == "INS")
|
||||
&& node.get_child_nodes().iter().all(Self::is_phrasing_content))
|
||||
}
|
||||
|
||||
/// Debugging helper: serializes `node` as HTML and writes the result to `filename`.
///
/// A fresh `libxml::tree::Document` is created, `node` is installed as its root
/// element, and the document is rendered with `as_html: true` and `format: true`
/// before being written to disk with `std::fs::write`.
///
/// # Panics
///
/// Panics if the document cannot be created or if writing `filename` fails;
/// both results are `unwrap()`ed.
// `dead_code` is allowed because the call sites visible in this change are commented out.
#[allow(dead_code)]
|
||||
pub fn serialize_node(node: &Node, filename: &str) {
|
||||
let mut doc = libxml::tree::Document::new().unwrap();
|
||||
// NOTE(review): presumably this re-parents `node` into the temporary document,
// which may affect its position in the original tree -- confirm against the libxml crate docs.
doc.set_root_element(node);
|
||||
let html = doc.to_string_with_options(libxml::tree::SaveOptions {
|
||||
format: true,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
xhtml: false,
|
||||
as_xml: false,
|
||||
as_html: true,
|
||||
non_significant_whitespace: false,
|
||||
});
|
||||
// Writes to whatever path the caller passes in; failures abort via `unwrap()`.
std::fs::write(filename, &html).unwrap();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue