1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-09 00:45:31 +02:00

fix url completion for hash urls

This commit is contained in:
Jan Lukas Gernert 2023-03-30 21:27:35 +02:00
parent b52212bf34
commit 027fab7602
10 changed files with 895 additions and 170 deletions

View file

@ -49,7 +49,7 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
});
/// Matches text whose final character is not whitespace.
///
/// The pattern is deliberately written without the `/…/` delimiters of a
/// JavaScript regex literal: the `regex` crate treats `/` as an ordinary
/// character, so a delimited pattern would require a `/` *after* the
/// end-of-text anchor `$` and could therefore never match anything.
pub static HAS_CONTENT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"\S$"#).expect("HAS_CONTENT regex"));
/// Matches a fragment-only URL: a leading `#` followed by at least one more
/// character (anything except a line break).
///
/// Only one definition of this static may exist. The pattern carries no
/// JavaScript-style `/…/` delimiters, because the `regex` crate would treat
/// them as literal slashes and a value such as `#section` would not match.
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
pub static POSITIVE: Lazy<Regex> =
Lazy::new(|| {
RegexBuilder::new(

View file

@ -551,24 +551,6 @@ impl FullTextParser {
Ok(())
}
/// Sets `attribute` to `value` on every node selected by the XPath `//{tag}`.
///
/// When `tag` is `None` the wildcard `*` is used, i.e. the query is `//*`.
///
/// # Errors
///
/// Returns whatever error `Util::evaluate_xpath` reports for the query.
/// A failure to set the attribute on an individual node is only logged as a
/// warning; the remaining nodes are still processed.
fn add_attribute(
    context: &Context,
    tag: Option<&str>,
    attribute: &str,
    value: &str,
) -> Result<(), FullTextParserError> {
    let query = match tag {
        Some(name) => format!("//{}", name),
        None => format!("//{}", "*"),
    };

    for mut element in Util::evaluate_xpath(context, &query, false)? {
        match element.set_attribute(attribute, value) {
            Ok(_) => {}
            // Best effort: one bad node must not abort the whole pass.
            Err(err) => log::warn!("Failed to set attribute '{}' on node: {}", attribute, err),
        }
    }

    Ok(())
}
fn repair_urls(
context: &Context,
xpath: &str,
@ -580,13 +562,21 @@ impl FullTextParser {
for mut node in node_vec {
if let Some(url) = node.get_attribute(attribute) {
let trimmed_url = url.trim();
let is_hash_url = url.starts_with('#');
let is_relative_url = url::Url::parse(&url)
.err()
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
.unwrap_or(false);
let is_javascript = trimmed_url.contains("javascript:");
if is_relative_url {
if !is_hash_url && node.get_name().to_uppercase() == "A" {
_ = node.set_attribute("target", "_blank");
}
if is_hash_url {
_ = node.set_attribute(attribute, trimmed_url);
} else if is_relative_url {
let completed_url = match article_url.join(trimmed_url) {
Ok(joined_url) => joined_url,
Err(_) => continue,
@ -697,7 +687,6 @@ impl FullTextParser {
_ = Self::fix_lazy_images(context, document);
_ = Self::fix_iframe_size(context, "youtube.com");
_ = Self::remove_attribute(context, Some("a"), "onclick");
_ = Self::add_attribute(context, Some("a"), "target", "_blank");
// strip elements using Readability.com and Instapaper.com ignore class names
// .entry-unrelated and .instapaper_ignore

View file

@ -578,8 +578,6 @@ impl Readability {
let text = Util::get_inner_text(&article_content, true);
let text_length = text.len();
//Util::serialize_node(&article_content, "dbg.html");
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
parse_successful = false;

View file

@ -337,10 +337,10 @@ async fn medium_3() {
run_test("medium-3").await
}
// #[tokio::test]
// async fn mercurial() {
// run_test("mercurial").await
// }
/// Async test that awaits `run_test` for the case named "mercurial".
///
/// NOTE(review): the name presumably selects a test fixture — confirm against
/// `run_test`, which is not visible here.
#[tokio::test]
async fn mercurial() {
run_test("mercurial").await
}
#[tokio::test]
async fn metadata_content_missing() {