mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-09 00:45:31 +02:00
fix url completion for hash urls
This commit is contained in:
parent
b52212bf34
commit
027fab7602
10 changed files with 895 additions and 170 deletions
|
@ -49,7 +49,7 @@ pub static OKAY_MAYBE_ITS_A_CANDIDATE: Lazy<Regex> = Lazy::new(|| {
|
|||
});
|
||||
pub static HAS_CONTENT: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"/\S$/"#).expect("HAS_CONTENT regex"));
|
||||
pub static HASH_URL: Lazy<Regex> = Lazy::new(|| Regex::new(r#"/^#.+/"#).expect("HASH_URL regex"));
|
||||
/// Matches a hash URL: text that begins with `#` and is followed by at least
/// one further character (any character except a line break).
pub static HASH_URL: Lazy<Regex> =
    Lazy::new(|| Regex::new(r#"^#.+"#).expect("HASH_URL regex"));
|
||||
pub static POSITIVE: Lazy<Regex> =
|
||||
Lazy::new(|| {
|
||||
RegexBuilder::new(
|
||||
|
|
|
@ -551,24 +551,6 @@ impl FullTextParser {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn add_attribute(
|
||||
context: &Context,
|
||||
tag: Option<&str>,
|
||||
attribute: &str,
|
||||
value: &str,
|
||||
) -> Result<(), FullTextParserError> {
|
||||
let xpath_tag = tag.unwrap_or("*");
|
||||
|
||||
let xpath = &format!("//{}", xpath_tag);
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if let Err(err) = node.set_attribute(attribute, value) {
|
||||
log::warn!("Failed to set attribute '{}' on node: {}", attribute, err);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn repair_urls(
|
||||
context: &Context,
|
||||
xpath: &str,
|
||||
|
@ -580,13 +562,21 @@ impl FullTextParser {
|
|||
for mut node in node_vec {
|
||||
if let Some(url) = node.get_attribute(attribute) {
|
||||
let trimmed_url = url.trim();
|
||||
|
||||
let is_hash_url = url.starts_with('#');
|
||||
let is_relative_url = url::Url::parse(&url)
|
||||
.err()
|
||||
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
|
||||
.unwrap_or(false);
|
||||
let is_javascript = trimmed_url.contains("javascript:");
|
||||
|
||||
if is_relative_url {
|
||||
if !is_hash_url && node.get_name().to_uppercase() == "A" {
|
||||
_ = node.set_attribute("target", "_blank");
|
||||
}
|
||||
|
||||
if is_hash_url {
|
||||
_ = node.set_attribute(attribute, trimmed_url);
|
||||
} else if is_relative_url {
|
||||
let completed_url = match article_url.join(trimmed_url) {
|
||||
Ok(joined_url) => joined_url,
|
||||
Err(_) => continue,
|
||||
|
@ -697,7 +687,6 @@ impl FullTextParser {
|
|||
_ = Self::fix_lazy_images(context, document);
|
||||
_ = Self::fix_iframe_size(context, "youtube.com");
|
||||
_ = Self::remove_attribute(context, Some("a"), "onclick");
|
||||
_ = Self::add_attribute(context, Some("a"), "target", "_blank");
|
||||
|
||||
// strip elements using Readability.com and Instapaper.com ignore class names
|
||||
// .entry-unrelated and .instapaper_ignore
|
||||
|
|
|
@ -578,8 +578,6 @@ impl Readability {
|
|||
let text = Util::get_inner_text(&article_content, true);
|
||||
let text_length = text.len();
|
||||
|
||||
//Util::serialize_node(&article_content, "dbg.html");
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
parse_successful = false;
|
||||
|
||||
|
|
|
@ -337,10 +337,10 @@ async fn medium_3() {
|
|||
run_test("medium-3").await
|
||||
}
|
||||
|
||||
// #[tokio::test]
|
||||
// async fn mercurial() {
|
||||
// run_test("mercurial").await
|
||||
// }
|
||||
#[tokio::test]
|
||||
async fn mercurial() {
|
||||
run_test("mercurial").await
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn metadata_content_missing() {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue