mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
use url.join() instead of custom code
This commit is contained in:
parent
13d147d270
commit
3a92585f4d
1 changed file with 1 addition and 26 deletions
|
@ -510,7 +510,7 @@ impl FullTextParser {
|
||||||
.unwrap_or(false);
|
.unwrap_or(false);
|
||||||
|
|
||||||
if is_relative_url {
|
if is_relative_url {
|
||||||
let completed_url = Self::complete_url(article_url, &url)?;
|
let completed_url = article_url.join(&url)?;
|
||||||
node.set_attribute(attribute, completed_url.as_str())
|
node.set_attribute(attribute, completed_url.as_str())
|
||||||
.map_err(|_| FullTextParserError::Scrape)?;
|
.map_err(|_| FullTextParserError::Scrape)?;
|
||||||
}
|
}
|
||||||
|
@ -519,31 +519,6 @@ impl FullTextParser {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn complete_url(
|
|
||||||
article_url: &url::Url,
|
|
||||||
incomplete_url: &str,
|
|
||||||
) -> Result<url::Url, FullTextParserError> {
|
|
||||||
let mut completed_url = article_url.scheme().to_owned();
|
|
||||||
completed_url.push(':');
|
|
||||||
|
|
||||||
if !incomplete_url.starts_with("//") {
|
|
||||||
match article_url.host() {
|
|
||||||
Some(url::Host::Domain(host)) => {
|
|
||||||
completed_url.push_str("//");
|
|
||||||
completed_url.push_str(host);
|
|
||||||
}
|
|
||||||
_ => return Err(FullTextParserError::Scrape),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
if !completed_url.ends_with('/') && !incomplete_url.starts_with('/') {
|
|
||||||
completed_url.push('/');
|
|
||||||
}
|
|
||||||
completed_url.push_str(incomplete_url);
|
|
||||||
let url = url::Url::parse(&completed_url)?;
|
|
||||||
Ok(url)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn fix_urls(context: &Context, url: &Url) {
|
fn fix_urls(context: &Context, url: &Url) {
|
||||||
let _ = Self::repair_urls(context, "//img", "src", url);
|
let _ = Self::repair_urls(context, "//img", "src", url);
|
||||||
let _ = Self::repair_urls(context, "//a", "src", url);
|
let _ = Self::repair_urls(context, "//a", "src", url);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue