1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 08:30:00 +02:00

Compare commits

..

2 commits

Author SHA1 Message Date
Jan Lukas Gernert
4c9709e292 bump version 2025-07-07 18:56:16 +02:00
Jan Lukas Gernert
a23a691c31 clean html fragment: don't remove same page links & footnotes 2025-07-07 18:03:45 +02:00
9 changed files with 69 additions and 43 deletions

View file

@ -1,17 +1,16 @@
stages:
- build
run-build:
stage: build
image: rust:1.86
before_script:
- rustup component add rustfmt
- rustup component add clippy
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
- rustup component add rustfmt
- rustup component add clippy
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
script:
- rustc --version && cargo --version
- echo $LIBXML2
- cargo fmt -- --check
- cargo clippy --all-targets --all-features -- -D warnings
- cargo build --release
- rustc --version && cargo --version
- echo $LIBXML2
- cargo fmt -- --check
- cargo clippy --all-targets --all-features -- -D warnings
- cargo build --release

View file

@ -3,8 +3,8 @@ members = ["article_scraper", "article_scraper_cli"]
resolver = "2"
[workspace.package]
version = "2.1.2"
version = "2.1.3"
authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
edition = "2021"
license = "GPL-3.0-or-later"
repository = "https://gitlab.com/news-flash/article_scraper"
repository = "https://gitlab.com/news-flash/article_scraper"

View file

@ -23,8 +23,8 @@ chrono = "0.4"
base64 = "0.22"
image = "0.25"
log = "0.4"
rust-embed="8.6"
once_cell = "1.20"
rust-embed = "8.7"
once_cell = "1.21"
escaper = "0.1"
futures = "0.3"
unic-emoji-char = "0.9"

File diff suppressed because one or more lines are too long

View file

@ -158,7 +158,7 @@ fn extract_date(
fn get_meta(context: &Context, name: &str) -> Option<String> {
Util::get_attribute(
context,
&format!("//meta[contains(@name, '{}')]", name),
&format!("//meta[contains(@name, '{name}')]"),
"content",
)
.ok()

View file

@ -722,7 +722,7 @@ impl FullTextParser {
}
fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if node.is_null() {
@ -761,7 +761,7 @@ impl FullTextParser {
) -> Result<(), FullTextParserError> {
let xpath_tag = tag.unwrap_or("*");
let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
let xpath = &format!("//{xpath_tag}[@{attribute}]");
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if let Err(err) = node.remove_property(attribute) {
@ -791,17 +791,16 @@ impl FullTextParser {
if let Some(url) = node.get_attribute(attribute) {
let trimmed_url = url.trim();
let is_hash_url = url.starts_with('#');
if url.starts_with('#') || url.starts_with("\\#") {
continue;
}
let is_relative_url = url::Url::parse(&url)
.err()
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
.unwrap_or(false);
let is_javascript = trimmed_url.contains("javascript:");
if !is_hash_url && node.get_name().to_uppercase() == "A" {
_ = node.set_attribute("target", "_blank");
}
if let Some(srcset) = node.get_attribute("srcset") {
let res = constants::SRC_SET_URL
.captures_iter(&srcset)
@ -832,9 +831,7 @@ impl FullTextParser {
_ = node.set_attribute("srcset", res.as_str());
}
if is_hash_url {
_ = node.set_attribute(attribute, trimmed_url);
} else if is_relative_url {
if is_relative_url {
let completed_url = match article_url.join(trimmed_url) {
Ok(joined_url) => joined_url,
Err(_) => continue,
@ -947,7 +944,7 @@ impl FullTextParser {
for xpath_strip_img_src in &config.strip_image_src {
_ = Util::strip_node(
context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
&format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
);
}
}
@ -955,7 +952,7 @@ impl FullTextParser {
for xpath_strip_img_src in &global_config.strip_image_src {
_ = Util::strip_node(
context,
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
&format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
);
}
@ -1270,7 +1267,7 @@ impl FullTextParser {
Ok(())
}
fn remove_single_cell_tables(root: &mut Node) {
pub(crate) fn remove_single_cell_tables(root: &mut Node) {
let mut node_iter = Some(root.clone());
while let Some(node) = node_iter {
@ -1308,7 +1305,7 @@ impl FullTextParser {
}
}
fn remove_extra_p_and_div(root: &mut Node) {
pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {
@ -1330,7 +1327,7 @@ impl FullTextParser {
}
}
fn remove_share_elements(root: &mut Node) {
pub(crate) fn remove_share_elements(root: &mut Node) {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {
@ -1350,7 +1347,7 @@ impl FullTextParser {
}
}
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {
@ -1439,7 +1436,7 @@ impl FullTextParser {
Ok(())
}
fn remove_empty_nodes(root: &mut Node) {
pub(crate) fn remove_empty_nodes(root: &mut Node) {
let mut node_iter = Some(root.clone());
while let Some(mut node) = node_iter {

View file

@ -276,7 +276,7 @@ impl ImageDownloader {
}
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
let image_string = format!("data:{};base64,{}", content_type, image_base64);
let image_string = format!("data:{content_type};base64,{image_base64}");
let image_data_base64 = ImageDataBase64 {
url: image.url,
data: image_string,

View file

@ -264,17 +264,15 @@ impl Util {
context: &Context,
id_or_class: &str,
) -> Result<(), FullTextParserError> {
let xpath = &format!(
"//*[contains(@class, '{}') or contains(@id, '{}')]",
id_or_class, id_or_class
);
let xpath =
&format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");
let mut ancestor = xpath.clone();
if ancestor.starts_with("//") {
ancestor = ancestor.chars().skip(2).collect();
}
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
let query = &format!("{xpath}[not(ancestor::{ancestor})]");
let node_vec = Util::evaluate_xpath(context, query, false)?;
for mut node in node_vec {
if node.is_null() {

View file

@ -9,11 +9,22 @@ repository.workspace = true
[dependencies]
article_scraper = { path = "../article_scraper/" }
clap = { version = "4.5", features = [ "derive" ] }
article_scraper = { path = "../article_scraper/" }
clap = { version = "4.5", features = ["derive"] }
simplelog = "0.12"
log = "0.4"
url = "2.5"
reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
indicatif = "0.17"
reqwest = { version = "0.12", features = [
"json",
"native-tls",
"gzip",
"brotli",
"stream",
] }
tokio = { version = "1", features = [
"macros",
"fs",
"io-util",
"rt-multi-thread",
] }
indicatif = "0.18"