mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 08:30:00 +02:00
Compare commits
2 commits
article_sc
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
4c9709e292 | ||
|
a23a691c31 |
9 changed files with 69 additions and 43 deletions
|
@ -1,7 +1,6 @@
|
|||
stages:
|
||||
- build
|
||||
|
||||
|
||||
run-build:
|
||||
stage: build
|
||||
image: rust:1.86
|
||||
|
|
|
@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"]
|
|||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
version = "2.1.2"
|
||||
version = "2.1.3"
|
||||
authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
|
||||
edition = "2021"
|
||||
license = "GPL-3.0-or-later"
|
||||
|
|
|
@ -23,8 +23,8 @@ chrono = "0.4"
|
|||
base64 = "0.22"
|
||||
image = "0.25"
|
||||
log = "0.4"
|
||||
rust-embed="8.6"
|
||||
once_cell = "1.20"
|
||||
rust-embed = "8.7"
|
||||
once_cell = "1.21"
|
||||
escaper = "0.1"
|
||||
futures = "0.3"
|
||||
unic-emoji-char = "0.9"
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -158,7 +158,7 @@ fn extract_date(
|
|||
fn get_meta(context: &Context, name: &str) -> Option<String> {
|
||||
Util::get_attribute(
|
||||
context,
|
||||
&format!("//meta[contains(@name, '{}')]", name),
|
||||
&format!("//meta[contains(@name, '{name}')]"),
|
||||
"content",
|
||||
)
|
||||
.ok()
|
||||
|
|
|
@ -722,7 +722,7 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
|
||||
let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
|
||||
let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
|
@ -761,7 +761,7 @@ impl FullTextParser {
|
|||
) -> Result<(), FullTextParserError> {
|
||||
let xpath_tag = tag.unwrap_or("*");
|
||||
|
||||
let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
|
||||
let xpath = &format!("//{xpath_tag}[@{attribute}]");
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
for mut node in node_vec {
|
||||
if let Err(err) = node.remove_property(attribute) {
|
||||
|
@ -791,17 +791,16 @@ impl FullTextParser {
|
|||
if let Some(url) = node.get_attribute(attribute) {
|
||||
let trimmed_url = url.trim();
|
||||
|
||||
let is_hash_url = url.starts_with('#');
|
||||
if url.starts_with('#') || url.starts_with("\\#") {
|
||||
continue;
|
||||
}
|
||||
|
||||
let is_relative_url = url::Url::parse(&url)
|
||||
.err()
|
||||
.map(|err| err == url::ParseError::RelativeUrlWithoutBase)
|
||||
.unwrap_or(false);
|
||||
let is_javascript = trimmed_url.contains("javascript:");
|
||||
|
||||
if !is_hash_url && node.get_name().to_uppercase() == "A" {
|
||||
_ = node.set_attribute("target", "_blank");
|
||||
}
|
||||
|
||||
if let Some(srcset) = node.get_attribute("srcset") {
|
||||
let res = constants::SRC_SET_URL
|
||||
.captures_iter(&srcset)
|
||||
|
@ -832,9 +831,7 @@ impl FullTextParser {
|
|||
_ = node.set_attribute("srcset", res.as_str());
|
||||
}
|
||||
|
||||
if is_hash_url {
|
||||
_ = node.set_attribute(attribute, trimmed_url);
|
||||
} else if is_relative_url {
|
||||
if is_relative_url {
|
||||
let completed_url = match article_url.join(trimmed_url) {
|
||||
Ok(joined_url) => joined_url,
|
||||
Err(_) => continue,
|
||||
|
@ -947,7 +944,7 @@ impl FullTextParser {
|
|||
for xpath_strip_img_src in &config.strip_image_src {
|
||||
_ = Util::strip_node(
|
||||
context,
|
||||
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
||||
&format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -955,7 +952,7 @@ impl FullTextParser {
|
|||
for xpath_strip_img_src in &global_config.strip_image_src {
|
||||
_ = Util::strip_node(
|
||||
context,
|
||||
&format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
|
||||
&format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -1270,7 +1267,7 @@ impl FullTextParser {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_single_cell_tables(root: &mut Node) {
|
||||
pub(crate) fn remove_single_cell_tables(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(node) = node_iter {
|
||||
|
@ -1308,7 +1305,7 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
fn remove_extra_p_and_div(root: &mut Node) {
|
||||
pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
|
@ -1330,7 +1327,7 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
fn remove_share_elements(root: &mut Node) {
|
||||
pub(crate) fn remove_share_elements(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
|
@ -1350,7 +1347,7 @@ impl FullTextParser {
|
|||
}
|
||||
}
|
||||
|
||||
fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
|
@ -1439,7 +1436,7 @@ impl FullTextParser {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
fn remove_empty_nodes(root: &mut Node) {
|
||||
pub(crate) fn remove_empty_nodes(root: &mut Node) {
|
||||
let mut node_iter = Some(root.clone());
|
||||
|
||||
while let Some(mut node) = node_iter {
|
||||
|
|
|
@ -276,7 +276,7 @@ impl ImageDownloader {
|
|||
}
|
||||
|
||||
let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
|
||||
let image_string = format!("data:{};base64,{}", content_type, image_base64);
|
||||
let image_string = format!("data:{content_type};base64,{image_base64}");
|
||||
let image_data_base64 = ImageDataBase64 {
|
||||
url: image.url,
|
||||
data: image_string,
|
||||
|
|
|
@ -264,17 +264,15 @@ impl Util {
|
|||
context: &Context,
|
||||
id_or_class: &str,
|
||||
) -> Result<(), FullTextParserError> {
|
||||
let xpath = &format!(
|
||||
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
||||
id_or_class, id_or_class
|
||||
);
|
||||
let xpath =
|
||||
&format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");
|
||||
|
||||
let mut ancestor = xpath.clone();
|
||||
if ancestor.starts_with("//") {
|
||||
ancestor = ancestor.chars().skip(2).collect();
|
||||
}
|
||||
|
||||
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
||||
let query = &format!("{xpath}[not(ancestor::{ancestor})]");
|
||||
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
||||
for mut node in node_vec {
|
||||
if node.is_null() {
|
||||
|
|
|
@ -10,10 +10,21 @@ repository.workspace = true
|
|||
|
||||
[dependencies]
|
||||
article_scraper = { path = "../article_scraper/" }
|
||||
clap = { version = "4.5", features = [ "derive" ] }
|
||||
clap = { version = "4.5", features = ["derive"] }
|
||||
simplelog = "0.12"
|
||||
log = "0.4"
|
||||
url = "2.5"
|
||||
reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
|
||||
indicatif = "0.17"
|
||||
reqwest = { version = "0.12", features = [
|
||||
"json",
|
||||
"native-tls",
|
||||
"gzip",
|
||||
"brotli",
|
||||
"stream",
|
||||
] }
|
||||
tokio = { version = "1", features = [
|
||||
"macros",
|
||||
"fs",
|
||||
"io-util",
|
||||
"rt-multi-thread",
|
||||
] }
|
||||
indicatif = "0.18"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue