1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

use url.join() instead of custom code

This commit is contained in:
Jan Lukas Gernert 2023-03-01 00:42:03 +01:00
parent 13d147d270
commit 3a92585f4d

View file

@ -510,7 +510,7 @@ impl FullTextParser {
.unwrap_or(false); .unwrap_or(false);
if is_relative_url { if is_relative_url {
let completed_url = Self::complete_url(article_url, &url)?; let completed_url = article_url.join(&url)?;
node.set_attribute(attribute, completed_url.as_str()) node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?; .map_err(|_| FullTextParserError::Scrape)?;
} }
@ -519,31 +519,6 @@ impl FullTextParser {
Ok(()) Ok(())
} }
/// Resolves `incomplete_url` against `article_url` and returns the resulting
/// absolute URL.
///
/// Resolution is delegated to [`url::Url::join`] rather than concatenating
/// scheme, host and path by hand. The manual concatenation dropped the port of
/// `article_url`, refused every base whose host was not a domain name (IPv4 /
/// IPv6 literals), and always resolved path-relative references against the
/// site root instead of against the article's own path.
///
/// # Errors
///
/// Returns an error if `incomplete_url` cannot be joined onto `article_url`;
/// the underlying parse error is converted into `FullTextParserError` by `?`.
fn complete_url(
    article_url: &url::Url,
    incomplete_url: &str,
) -> Result<url::Url, FullTextParserError> {
    // `join` treats `article_url` as the base, so scheme-relative (`//host/x`),
    // root-relative (`/x`) and path-relative (`x`, `../x`) references are all
    // handled, and the base's port is preserved.
    let completed_url = article_url.join(incomplete_url)?;
    Ok(completed_url)
}
fn fix_urls(context: &Context, url: &Url) { fn fix_urls(context: &Context, url: &Url) {
let _ = Self::repair_urls(context, "//img", "src", url); let _ = Self::repair_urls(context, "//img", "src", url);
let _ = Self::repair_urls(context, "//a", "src", url); let _ = Self::repair_urls(context, "//a", "src", url);