mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
add clean links test
This commit is contained in:
parent
c5c6b788c8
commit
3ece2522bb
5 changed files with 3258 additions and 178 deletions
|
@ -502,16 +502,20 @@ impl FullTextParser {
|
|||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if let Some(url) = node.get_attribute(attribute) {
|
||||
let trimmed_url = url.trim();
|
||||
let is_relative_url = url::Url::parse(&url)
|
||||
.err()
|
||||
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
|
||||
.unwrap_or(false);
|
||||
|
||||
if is_relative_url {
|
||||
let completed_url = article_url.join(&url)?;
|
||||
node.set_attribute(attribute, completed_url.as_str())
|
||||
.map_err(|_| FullTextParserError::Scrape)?;
|
||||
}
|
||||
let completed_url = if is_relative_url {
|
||||
article_url.join(trimmed_url)?
|
||||
} else {
|
||||
Url::parse(trimmed_url)?
|
||||
};
|
||||
|
||||
node.set_attribute(attribute, completed_url.as_str())
|
||||
.map_err(|_| FullTextParserError::Scrape)?;
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
|
@ -867,7 +871,7 @@ impl FullTextParser {
|
|||
Util::clean_conditionally(&mut root, "ul");
|
||||
Util::clean_conditionally(&mut root, "div");
|
||||
|
||||
Self::clean_classes(&mut root)?;
|
||||
Self::clean_attributes(&mut root)?;
|
||||
Self::simplify_nested_elements(&mut root)?;
|
||||
}
|
||||
|
||||
|
@ -895,7 +899,7 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
fn clean_classes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
|
@ -904,6 +908,11 @@ impl FullTextParser {
|
|||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node.remove_attribute("align").map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
node.remove_attribute(constants::SCORE_ATTR).map_err(|e| {
|
||||
log::error!("{e}");
|
||||
FullTextParserError::Xml
|
||||
|
@ -915,6 +924,10 @@ impl FullTextParser {
|
|||
FullTextParserError::Xml
|
||||
})?;
|
||||
|
||||
if node.get_name().to_uppercase() == "FONT" {
|
||||
node.set_name("span").unwrap();
|
||||
}
|
||||
|
||||
node_iter = Util::next_node(&node, false);
|
||||
}
|
||||
Ok(())
|
||||
|
|
|
@ -19,7 +19,7 @@ async fn run_test(name: &str) {
|
|||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config);
|
||||
|
||||
|
||||
crate::FullTextParser::fix_urls(&xpath_ctx, &url);
|
||||
let mut article = Article {
|
||||
title: None,
|
||||
|
@ -126,6 +126,11 @@ async fn citylab_1() {
|
|||
run_test("citylab-1").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn clean_links() {
|
||||
run_test("clean-links").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn webmd_1() {
|
||||
run_test("webmd-1").await
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue