mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
fix relative srcset urls & more tests
This commit is contained in:
parent
15eec43ad9
commit
3fa8c9674d
15 changed files with 26566 additions and 3 deletions
|
@ -66,6 +66,8 @@ pub static SHARE_ELEMENTS: Lazy<Regex> = Lazy::new(|| {
|
|||
.build()
|
||||
.expect("SHARE_ELEMENTS regex")
|
||||
});
|
||||
/// Lazily built regex matching one entry of a `srcset`-style list.
///
/// Capture groups, as written in the pattern:
/// 1. a run of non-whitespace characters (the URL part),
/// 2. optionally, whitespace followed by digits/dots and a trailing `x` or `w`
///    (the descriptor part),
/// 3. optional whitespace followed by either a comma or the end of the input.
///
/// The initializer calls `expect`, so an invalid pattern panics with the
/// message "SRC_SET_URL regex" when the value is first initialized.
pub static SRC_SET_URL: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"(\S+)(\s+[\d.]+[xw])?(\s*(?:,|$))"#).expect("SRC_SET_URL regex"));
|
||||
/// Lazily built regex matching a single separator character with one space on
/// each side.
///
/// The character class accepts `-`, `|`, `—`, `\`, `/`, `>` and `»`.
/// The initializer calls `expect`, so an invalid pattern panics with the
/// message "TITLE_SEPARATOR regex" when the value is first initialized.
pub static TITLE_SEPARATOR: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#" [-|—\\/>»] "#).expect("TITLE_SEPARATOR regex"));
|
||||
pub static TITLE_CUT_END: Lazy<Regex> = Lazy::new(|| {
|
||||
|
|
|
@ -580,6 +580,36 @@ impl FullTextParser {
|
|||
_ = node.set_attribute("target", "_blank");
|
||||
}
|
||||
|
||||
if let Some(srcset) = node.get_attribute("srcset") {
|
||||
let res = constants::SRC_SET_URL
|
||||
.captures_iter(&srcset)
|
||||
.map(|cap| {
|
||||
let cap0 = cap.get(0).map_or("", |m| m.as_str());
|
||||
let cap1 = cap.get(1).map_or("", |m| m.as_str());
|
||||
let cap2 = cap.get(2).map_or("", |m| m.as_str());
|
||||
let cap3 = cap.get(3).map_or("", |m| m.as_str());
|
||||
|
||||
let is_relative_url = url::Url::parse(&cap1)
|
||||
.err()
|
||||
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
|
||||
.unwrap_or(false);
|
||||
|
||||
if is_relative_url {
|
||||
let completed_url = article_url
|
||||
.join(&cap1)
|
||||
.map(|u| u.as_str().to_owned())
|
||||
.unwrap_or_default();
|
||||
format!("{completed_url}{cap2}{cap3}")
|
||||
} else {
|
||||
cap0.to_string()
|
||||
}
|
||||
})
|
||||
.collect::<Vec<String>>()
|
||||
.join(" ");
|
||||
|
||||
_ = node.set_attribute("srcset", res.as_str());
|
||||
}
|
||||
|
||||
if is_hash_url {
|
||||
_ = node.set_attribute(attribute, trimmed_url);
|
||||
} else if is_relative_url {
|
||||
|
|
|
@ -518,7 +518,32 @@ async fn videos_2() {
|
|||
run_test("videos-2").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "wapo-1".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn wapo_1() {
|
||||
run_test("wapo-1").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "wapo-2".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn wapo_2() {
|
||||
run_test("wapo-2").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "webmd-1".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "webmd-2".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn webmd_2() {
|
||||
run_test("webmd-2").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "wikia".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn wikia() {
|
||||
run_test("wikia").await
|
||||
}
|
||||
|
||||
/// Async test that awaits `run_test` with the argument "wikipedia".
/// NOTE(review): the argument presumably names a test fixture — confirm against `run_test`.
#[tokio::test]
|
||||
async fn wikipedia() {
|
||||
run_test("wikipedia").await
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue