1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

add clean links test

This commit is contained in:
Jan Lukas Gernert 2023-03-09 21:24:29 +01:00
parent c5c6b788c8
commit 3ece2522bb
5 changed files with 3258 additions and 178 deletions

View file

@ -502,16 +502,20 @@ impl FullTextParser {
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if let Some(url) = node.get_attribute(attribute) {
let trimmed_url = url.trim();
let is_relative_url = url::Url::parse(&url)
.err()
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
.unwrap_or(false);
if is_relative_url {
let completed_url = article_url.join(&url)?;
node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
}
let completed_url = if is_relative_url {
article_url.join(trimmed_url)?
} else {
Url::parse(trimmed_url)?
};
node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
}
}
Ok(())
@ -867,7 +871,7 @@ impl FullTextParser {
Util::clean_conditionally(&mut root, "ul");
Util::clean_conditionally(&mut root, "div");
Self::clean_classes(&mut root)?;
Self::clean_attributes(&mut root)?;
Self::simplify_nested_elements(&mut root)?;
}
@ -895,7 +899,7 @@ impl FullTextParser {
}
}
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {
@ -904,6 +908,11 @@ impl FullTextParser {
FullTextParserError::Xml
})?;
node.remove_attribute("align").map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
})?;
node.remove_attribute(constants::SCORE_ATTR).map_err(|e| {
log::error!("{e}");
FullTextParserError::Xml
@ -915,6 +924,10 @@ impl FullTextParser {
FullTextParserError::Xml
})?;
if node.get_name().to_uppercase() == "FONT" {
node.set_name("span").unwrap();
}
node_iter = Util::next_node(&node, false);
}
Ok(())

View file

@ -19,7 +19,7 @@ async fn run_test(name: &str) {
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
let mut article = Article {
title: None,
@ -126,6 +126,11 @@ async fn citylab_1() {
run_test("citylab-1").await
}
/// Runs the shared `run_test` driver against the `clean-links` fixture.
#[tokio::test]
async fn clean_links() {
    // Fixture name handed to `run_test`.
    let fixture = "clean-links";
    run_test(fixture).await
}
#[tokio::test]
async fn webmd_1() {
run_test("webmd-1").await