1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

add clean links test

This commit is contained in:
Jan Lukas Gernert 2023-03-09 21:24:29 +01:00
parent c5c6b788c8
commit 3ece2522bb
5 changed files with 3258 additions and 178 deletions

View file

@@ -502,16 +502,20 @@ impl FullTextParser {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if let Some(url) = node.get_attribute(attribute) {
let trimmed_url = url.trim();
let is_relative_url = url::Url::parse(&url)
.err()
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
.unwrap_or(false);
if is_relative_url {
let completed_url = article_url.join(&url)?;
node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
}
let completed_url = if is_relative_url {
article_url.join(trimmed_url)?
} else {
Url::parse(trimmed_url)?
};
node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
}
}
Ok(())
@@ -867,7 +871,7 @@ impl FullTextParser {
Util::clean_conditionally(&mut root, "ul");
Util::clean_conditionally(&mut root, "div");
Self::clean_classes(&mut root)?;
Self::clean_attributes(&mut root)?;
Self::simplify_nested_elements(&mut root)?;
}
@@ -895,7 +899,7 @@ impl FullTextParser {
}
}
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {
@@ -904,6 +908,11 @@ impl FullTextParser {
FullTextParserError::Xml
})?;
node.remove_attribute("align").map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
})?;
node.remove_attribute(constants::SCORE_ATTR).map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
@@ -915,6 +924,10 @@ impl FullTextParser {
FullTextParserError::Xml
})?;
if node.get_name().to_uppercase() == "FONT" {
node.set_name("span").unwrap();
}
node_iter = Util::next_node(&node, false);
}
Ok(())