1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

fix nytimes-3

This commit is contained in:
Jan Lukas Gernert 2023-03-31 10:38:04 +02:00
parent c42ffa57a2
commit c46d93058f
6 changed files with 933 additions and 13 deletions

View file

@@ -21,6 +21,7 @@ use libxml::tree::{Document, Node, NodeType};
use libxml::xpath::Context;
use reqwest::header::HeaderMap;
use reqwest::{Client, Url};
use std::collections::HashSet;
use std::path::Path;
use std::str::from_utf8;
@@ -493,8 +494,7 @@ impl FullTextParser {
if tag_name == "IMG" || tag_name == "PICTURE" {
_ = node.set_attribute(copy_to, &val);
} else if tag_name == "FIGURE"
&& !Util::has_decendent_tag(&node, "img")
&& !Util::has_decendent_tag(&node, "picture")
&& !Util::has_any_descendent_tag(&node, &HashSet::from(["IMG", "PICTURE"]))
{
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
//see the nytimes-3 testcase for an example