1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

fix relative srcset urls & more tests

This commit is contained in:
Jan Lukas Gernert 2023-04-02 09:03:37 +02:00
parent 15eec43ad9
commit 3fa8c9674d
15 changed files with 26566 additions and 3 deletions

View file

@@ -580,6 +580,36 @@ impl FullTextParser {
_ = node.set_attribute("target", "_blank");
}
if let Some(srcset) = node.get_attribute("srcset") {
    // Rebuild the `srcset` value candidate by candidate, resolving every
    // relative URL against `article_url`. Candidates are whatever
    // `constants::SRC_SET_URL` matches; the rewritten candidates are joined
    // with a single space.
    let res = constants::SRC_SET_URL
        .captures_iter(&srcset)
        .map(|cap| {
            // Group 0 is the whole match. Group 1 is treated as the URL;
            // groups 2 and 3 are re-appended verbatim after it (presumably
            // the descriptor and the trailing separator -- confirm against
            // the regex definition). Missing groups become "".
            let cap0 = cap.get(0).map_or("", |m| m.as_str());
            let cap1 = cap.get(1).map_or("", |m| m.as_str());
            let cap2 = cap.get(2).map_or("", |m| m.as_str());
            let cap3 = cap.get(3).map_or("", |m| m.as_str());

            // Only a parse failure of exactly `RelativeUrlWithoutBase` marks
            // the URL as relative; absolute URLs and any other parse error
            // leave the candidate untouched.
            let is_relative_url = matches!(
                url::Url::parse(cap1),
                Err(url::ParseError::RelativeUrlWithoutBase)
            );
            if !is_relative_url {
                return cap0.to_string();
            }

            match article_url.join(cap1) {
                Ok(completed_url) => format!("{}{cap2}{cap3}", completed_url.as_str()),
                // If the URL cannot be resolved, keep the original candidate
                // rather than emitting an entry with an empty URL.
                Err(_) => cap0.to_string(),
            }
        })
        .collect::<Vec<String>>()
        .join(" ");
    // Best effort: a failure to write the attribute is deliberately ignored.
    _ = node.set_attribute("srcset", res.as_str());
}
if is_hash_url {
_ = node.set_attribute(attribute, trimmed_url);
} else if is_relative_url {