1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-03-28 07:25:22 +02:00
parent e6c11ec684
commit a649b93c03

View file

@ -417,13 +417,14 @@ impl FullTextParser {
fn fix_lazy_images(context: &Context, doc: &Document) -> Result<(), FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, "//img|picture|figure", false)?;
for mut node in node_vec {
// Some sites (e.g. Kotaku) put a 1px square image as a base64 data URI in the src attribute.
// So here we check whether the data URI is too short and, if it is, we might as well remove it.
if let Some(src) = node.get_attribute("src") {
// Make sure it's not SVG, because SVG can have a meaningful image in under 133 bytes.
if let Some(mime) = constants::BASE64_DATA_URL.captures(&src).and_then(|c| c.get(1).map(|c| c.as_str())) {
if let Some(mime) = constants::BASE64_DATA_URL
.captures(&src)
.and_then(|c| c.get(1).map(|c| c.as_str()))
{
if mime == "image/svg+xml" {
continue;
}
@ -456,7 +457,10 @@ impl FullTextParser {
}
}
let class_contains_lazy = node.get_attribute("class").map(|c| c.to_lowercase().contains("lazy")).unwrap_or(false);
let class_contains_lazy = node
.get_attribute("class")
.map(|c| c.to_lowercase().contains("lazy"))
.unwrap_or(false);
let has_scr = node.has_attribute("src");
let has_srcset = node.has_attribute("srcset");
@ -464,7 +468,6 @@ impl FullTextParser {
continue;
}
for (name, val) in node.get_attributes() {
if name == "src" || name == "srcset" || name == "alt" {
continue;
@ -482,13 +485,16 @@ impl FullTextParser {
//if this is an img or picture, set the attribute directly
if tag_name == "IMG" || tag_name == "PICTURE" {
_= node.set_attribute(copy_to, &val);
} else if tag_name == "FIGURE" && !Util::has_decendent_tag(&node, "img") && !Util::has_decendent_tag(&node, "picture") {
_ = node.set_attribute(copy_to, &val);
} else if tag_name == "FIGURE"
&& !Util::has_decendent_tag(&node, "img")
&& !Util::has_decendent_tag(&node, "picture")
{
//if the item is a <figure> that does not contain an image or picture, create one and place it inside the figure
//see the nytimes-3 testcase for an example
let mut img = Node::new("img", None, doc).unwrap();
_ = img.set_attribute(copy_to, &val);
_ = node.add_child(&mut img);
_ = img.set_attribute(copy_to, &val);
_ = node.add_child(&mut img);
}
}
}