mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
start adding nytimes tests
This commit is contained in:
parent
70e2ed8c82
commit
c42ffa57a2
10 changed files with 18063 additions and 4 deletions
|
@ -415,8 +415,16 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> {
|
||||
let node_vec = Util::evaluate_xpath(context, "//img|picture|figure", false)?;
|
||||
for mut node in node_vec {
|
||||
let mut img_nodes = Util::evaluate_xpath(context, "//img", false)?;
|
||||
let pic_nodes = Util::evaluate_xpath(context, "//picture", false)?;
|
||||
let fig_nodes = Util::evaluate_xpath(context, "//figure", false)?;
|
||||
|
||||
img_nodes.extend(pic_nodes);
|
||||
img_nodes.extend(fig_nodes);
|
||||
|
||||
for mut node in img_nodes {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
|
||||
// On some sites (e.g. Kotaku), a 1px square image is embedded as a base64 data URI in the src attribute.
|
||||
// So here we check whether the data URI is too short to be a real image and, if so, simply remove it.
|
||||
if let Some(src) = node.get_attribute("src") {
|
||||
|
@ -481,8 +489,6 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
if let Some(copy_to) = copy_to {
|
||||
let tag_name = node.get_name().to_uppercase();
|
||||
|
||||
//if this is an img or picture, set the attribute directly
|
||||
if tag_name == "IMG" || tag_name == "PICTURE" {
|
||||
_ = node.set_attribute(copy_to, &val);
|
||||
|
@ -956,6 +962,7 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
pub(crate) fn post_process_page(node: &mut Node) -> Result<(), FullTextParserError> {
|
||||
Util::clean_headers(node);
|
||||
Util::clean_conditionally(node, "fieldset");
|
||||
Util::clean_conditionally(node, "table");
|
||||
Util::clean_conditionally(node, "ul");
|
||||
|
|
|
@ -372,6 +372,21 @@ async fn normalize_spaces() {
|
|||
run_test("normalize-spaces").await
|
||||
}
|
||||
|
||||
// Async test case: passes the case name "nytimes-1" to `run_test` and awaits the result.
#[tokio::test]
|
||||
async fn nytimes_1() {
|
||||
run_test("nytimes-1").await
|
||||
}
|
||||
|
||||
// Async test case: passes the case name "nytimes-2" to `run_test` and awaits the result.
#[tokio::test]
|
||||
async fn nytimes_2() {
|
||||
run_test("nytimes-2").await
|
||||
}
|
||||
|
||||
// #[tokio::test]
|
||||
// async fn nytimes_3() {
|
||||
// run_test("nytimes-3").await
|
||||
// }
|
||||
|
||||
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
|
|
12
src/util.rs
12
src/util.rs
|
@ -515,6 +515,18 @@ impl Util {
|
|||
}
|
||||
}
|
||||
|
||||
/// Removes `h1` and `h2` elements below `root` whose class weight is negative.
///
/// The headers are gathered with `Util::get_elements_by_tag_name` (all `h1`
/// matches followed by all `h2` matches) and then visited in reverse order.
/// Every header for which `Util::get_class_weight` returns a value below zero
/// is logged at debug level (tag name plus `class` attribute, empty if absent)
/// and unlinked from the tree.
///
/// NOTE(review): the reverse traversal presumably keeps removal safe when
/// headers are nested inside each other — confirm against the libxml bindings.
pub fn clean_headers(root: &mut Node) {
|
||||
let mut nodes = Util::get_elements_by_tag_name(root, "h1");
|
||||
nodes.append(&mut Util::get_elements_by_tag_name(root, "h2"));
|
||||
|
||||
for mut node in nodes.into_iter().rev() {
|
||||
// A negative class weight marks the header for removal.
if Util::get_class_weight(&node) < 0 {
|
||||
log::debug!("Removing header with low class weight: {} {}", node.get_name(), node.get_attribute("class").unwrap_or_default());
|
||||
node.unlink();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Clean an element of all tags of type "tag" if they look fishy.
|
||||
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
||||
pub fn clean_conditionally(root: &mut Node, tag: &str) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue