Fix URL completion for hash URLs

2025-07-09 00:45:31 +02:00 · 2023-03-30 21:27:35 +02:00 · 2023-03-30 21:27:35 +02:00 · 027fab7602
commit 027fab7602
parent b52212bf34
10 changed files with 895 additions and 170 deletions
--- a/src/constants.rs
+++ b/src/constants.rs
@ -49,7 +49,7 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
 });
 pub static HAS_CONTENT: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
-pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
+pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
 pub static POSITIVE: Lazy<Regex> =
    Lazy::new(|| {
        RegexBuilder::new(
--- a/src/full_text_parser/mod.rs
+++ b/src/full_text_parser/mod.rs
@ -551,24 +551,6 @@ impl FullTextParser {
        Ok(())
    }

-    fn add_attribute(
-        context: &Context,
-        tag: Option<&str>,
-        attribute: &str,
-        value: &str,
-    ) -> Result<(), FullTextParserError> {
-        let xpath_tag = tag.unwrap_or("*");
-
-        let xpath = &format!("//{}", xpath_tag);
-        let node_vec = Util::evaluate_xpath(context, xpath, false)?;
-        for mut node in node_vec {
-            if let Err(err) = node.set_attribute(attribute, value) {
-                log::warn!("Failed to set attribute '{}' on node: {}", attribute, err);
-            }
-        }
-        Ok(())
-    }
-
    fn repair_urls(
        context: &Context,
        xpath: &str,
@ -580,13 +562,21 @@ impl FullTextParser {
        for mut node in node_vec {
            if let Some(url) = node.get_attribute(attribute) {
                let trimmed_url = url.trim();
+
+                let is_hash_url = url.starts_with('#');
                let is_relative_url = url::Url::parse(&url)
                    .err()
                    .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
                    .unwrap_or(false);
                let is_javascript = trimmed_url.contains("javascript:");

-                if is_relative_url {
+                if !is_hash_url && node.get_name().to_uppercase() == "A" {
+                    _ = node.set_attribute("target", "_blank");
+                }
+
+                if is_hash_url {
+                    _ = node.set_attribute(attribute, trimmed_url);
+                } else if is_relative_url {
                    let completed_url = match article_url.join(trimmed_url) {
                        Ok(joined_url) => joined_url,
                        Err(_) => continue,
@ -697,7 +687,6 @@ impl FullTextParser {
        _ = Self::fix_lazy_images(context, document);
        _ = Self::fix_iframe_size(context, "youtube.com");
        _ = Self::remove_attribute(context, Some("a"), "onclick");
-        _ = Self::add_attribute(context, Some("a"), "target", "_blank");

        // strip elements using Readability.com and Instapaper.com ignore class names
        // .entry-unrelated and .instapaper_ignore
--- a/src/full_text_parser/readability/mod.rs
+++ b/src/full_text_parser/readability/mod.rs
@ -578,8 +578,6 @@ impl Readability {
            let text = Util::get_inner_text(&article_content, true);
            let text_length = text.len();

-            //Util::serialize_node(&article_content, "dbg.html");
-
            if text_length < constants::DEFAULT_CHAR_THRESHOLD {
                parse_successful = false;

--- a/src/full_text_parser/readability/tests.rs
+++ b/src/full_text_parser/readability/tests.rs
@ -337,10 +337,10 @@ async fn medium_3() {
    run_test("medium-3").await
 }

-// #[tokio::test]
-// async fn mercurial() {
-//     run_test("mercurial").await
-// }
+#[tokio::test]
+async fn mercurial() {
+    run_test("mercurial").await
+}

 #[tokio::test]
 async fn metadata_content_missing() {