mirror of https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
Compare commits
No commits in common. "master" and "article_scraper-v2.1.1" have entirely different histories.
11 changed files with 97 additions and 154 deletions
@@ -1,16 +1,15 @@
 stages:
   - build

 run-build:
   stage: build
-  image: rust:1.86
+  image: rust:1.79
   before_script:
     - rustup component add rustfmt
     - rustup component add clippy
-    - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
   script:
     - rustc --version && cargo --version
-    - echo $LIBXML2
     - cargo fmt -- --check
     - cargo clippy --all-targets --all-features -- -D warnings
     - cargo build --release

@@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"]
 resolver = "2"

 [workspace.package]
-version = "2.1.3"
+version = "2.1.1"
 authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
 edition = "2021"
 license = "GPL-3.0-or-later"

@@ -14,17 +14,17 @@ exclude = ["resources/tests"]
 [dependencies]
 thiserror = "2.0"
 libxml = "0.3"
-reqwest = { version = "0.12", features = ["stream"] }
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
 tokio = { version = "1", features = ["macros", "fs", "io-util"] }
 url = "2.5"
-regex = "1.11"
+regex = "1.10"
 encoding_rs = "0.8"
 chrono = "0.4"
 base64 = "0.22"
 image = "0.25"
 log = "0.4"
-rust-embed = "8.7"
-once_cell = "1.21"
+rust-embed="8.4"
+once_cell = "1.19"
 escaper = "0.1"
 futures = "0.3"
 unic-emoji-char = "0.9"

@@ -1 +1 @@
-Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532
+Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024

File diff suppressed because one or more lines are too long

@@ -6,10 +6,10 @@ use thiserror::Error;

 #[derive(Error, Debug)]
 pub enum ScraperError {
-    #[error("Configerror {0}")]
+    #[error("")]
     Config(#[from] ConfigError),
-    #[error("ImageDownloadError {0}")]
+    #[error("")]
     Image(#[from] ImageDownloadError),
-    #[error("FullTextParserError {0}")]
+    #[error("")]
     Scrap(#[from] FullTextParserError),
 }

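On the master side each variant carries a human-readable message: thiserror derives the Display implementation from the #[error(...)] attribute, {0} forwards the wrapped error's own message, and #[from] generates the conversion used by the ? operator. A minimal, self-contained sketch of that pattern (the ConfigError below is a stand-in, not the crate's real type):

    use thiserror::Error;

    // Stand-in for the crate's ConfigError.
    #[derive(Error, Debug)]
    #[error("missing field '{0}'")]
    pub struct ConfigError(pub String);

    #[derive(Error, Debug)]
    pub enum ScraperError {
        // `{0}` forwards the Display output of the wrapped error;
        // `#[from]` lets `?` convert ConfigError into ScraperError.
        #[error("Configerror {0}")]
        Config(#[from] ConfigError),
    }

    fn parse_config() -> Result<(), ConfigError> {
        Err(ConfigError("base_url".into()))
    }

    fn load() -> Result<(), ScraperError> {
        parse_config()?; // converted into ScraperError::Config here
        Ok(())
    }

    fn main() {
        // Prints: Configerror missing field 'base_url'
        println!("{}", load().unwrap_err());
    }
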
@@ -158,7 +158,7 @@ fn extract_date(
 fn get_meta(context: &Context, name: &str) -> Option<String> {
     Util::get_attribute(
         context,
-        &format!("//meta[contains(@name, '{name}')]"),
+        &format!("//meta[contains(@name, '{}')]", name),
         "content",
     )
     .ok()

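This hunk, like many below, is purely cosmetic: master uses Rust's inline format-argument capture while the 2.1.1 tag passes the same variables positionally. Both spellings produce identical strings; a quick standalone illustration (not code from the crate):

    fn main() {
        let name = "author";

        // Inline capture (master) vs. positional arguments (v2.1.1).
        let inline = format!("//meta[contains(@name, '{name}')]");
        let positional = format!("//meta[contains(@name, '{}')]", name);
        assert_eq!(inline, positional);
    }
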
@@ -69,11 +69,6 @@ impl FullTextParser {

         let html = Self::get_body(response).await?;

-        if html.is_empty() {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
         // check for fingerprints
         let config = if config.is_none() {
             if let Some(url) = Fingerprints::detect(&html) {
@@ -269,17 +264,10 @@ impl FullTextParser {
         }

         // parse html
-        let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
+        Self::parse_html_string_patched(html.as_str()).map_err(|err| {
             log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
             FullTextParserError::Xml
-        })?;
-
-        if document.get_root_element().is_none() {
-            log::error!("document without root");
-            Err(FullTextParserError::Xml)
-        } else {
-            Ok(document)
-        }
+        })
     }

     /// FIXME: Here are some patched functions of libxml crate.
@@ -366,32 +354,19 @@ impl FullTextParser {
             .send()
             .await
             .map_err(|err| {
-                log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
+                log::error!(
+                    "Downloading HTML failed: GET '{}' - '{}'",
+                    url.as_str(),
+                    err
+                );
                 FullTextParserError::Http
             })?;
         Ok(response)
     }

     async fn get_body(response: Response) -> Result<String, FullTextParserError> {
-        let status = response.status();
-        if !status.is_success() {
-            log::error!("status code: {status}");
-            return Err(FullTextParserError::Http);
-        }
-
+        if response.status().is_success() {
         let headers = response.headers().clone();

-        if headers
-            .get(reqwest::header::CONTENT_LENGTH)
-            .and_then(|hv| hv.to_str().ok())
-            .and_then(|str| str.parse::<i64>().ok())
-            .map(|content_length| content_length == 0)
-            .unwrap_or(false)
-        {
-            log::error!("Empty response body");
-            return Err(FullTextParserError::Http);
-        }
-
         let bytes = response
             .bytes()
             .await
@@ -400,14 +375,14 @@ impl FullTextParser {
         match from_utf8(&bytes) {
             Ok(utf8_str) => {
                 log::debug!("Valid utf-8 string");
-                Ok(utf8_str.into())
+                return Ok(utf8_str.into());
             }
             Err(error) => {
                 log::debug!("Invalid utf-8 string");
                 let lossy_string = std::string::String::from_utf8_lossy(&bytes);

                 if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
-                    log::debug!("Encoding extracted from HTML: '{encoding}'");
+                    log::debug!("Encoding extracted from HTML: '{}'", encoding);
                     if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
                         let decoded_html = decoded_html.replacen(
                             &format!("charset=\"{encoding}\""),
@@ -419,7 +394,7 @@ impl FullTextParser {
                 }

                 if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
-                    log::debug!("Encoding extracted from headers: '{encoding}'");
+                    log::debug!("Encoding extracted from headers: '{}'", encoding);
                     if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
                         let decoded_html = decoded_html.replacen(
                             &format!("charset=\"{encoding}\""),
@@ -430,11 +405,14 @@ impl FullTextParser {
                     }
                 }

-                Err(FullTextParserError::Utf8(error))
+                return Err(FullTextParserError::Utf8(error));
             }
         }
     }

+        Err(FullTextParserError::Http)
+    }

     pub async fn download(
         url: &url::Url,
         client: &Client,
@@ -444,13 +422,8 @@ impl FullTextParser {
         let headers = Util::generate_headers(config, global_config)?;
         let response = Self::get_response(url, client, headers).await?;
         let body = Self::get_body(response).await?;
-        if body.is_empty() {
-            log::error!("Empty response body");
-            Err(FullTextParserError::Http)
-        } else {
-            Ok(body)
-        }
+        Ok(body)
     }

     fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
         headers

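Read together with the earlier hunks, the master side moves response validation into get_body itself: a non-success status code or a declared Content-Length of 0 fails early, so callers like download no longer need their own empty-body checks. A rough, standalone sketch of that shape (the FetchError type and the simplified decoding step are assumptions for illustration, not the crate's API):

    use reqwest::Response;

    #[derive(Debug)]
    enum FetchError {
        Http,
        Utf8,
    }

    // Validate the response up front, then read and decode the body.
    async fn get_body(response: Response) -> Result<String, FetchError> {
        let status = response.status();
        if !status.is_success() {
            log::error!("status code: {status}");
            return Err(FetchError::Http);
        }

        // Reject responses that explicitly declare an empty body.
        let declared_empty = response
            .headers()
            .get(reqwest::header::CONTENT_LENGTH)
            .and_then(|hv| hv.to_str().ok())
            .and_then(|s| s.parse::<i64>().ok())
            .map(|len| len == 0)
            .unwrap_or(false);
        if declared_empty {
            log::error!("Empty response body");
            return Err(FetchError::Http);
        }

        let bytes = response.bytes().await.map_err(|_| FetchError::Http)?;
        String::from_utf8(bytes.to_vec()).map_err(|_| FetchError::Utf8)
    }
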
@@ -722,7 +695,7 @@ impl FullTextParser {
     }

     fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> {
-        let xpath = &format!("//iframe[contains(@src, '{site_name}')]");
+        let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if node.is_null() {
@@ -761,7 +734,7 @@ impl FullTextParser {
     ) -> Result<(), FullTextParserError> {
         let xpath_tag = tag.unwrap_or("*");

-        let xpath = &format!("//{xpath_tag}[@{attribute}]");
+        let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
         let node_vec = Util::evaluate_xpath(context, xpath, false)?;
         for mut node in node_vec {
             if let Err(err) = node.remove_property(attribute) {
@@ -791,16 +764,17 @@ impl FullTextParser {
             if let Some(url) = node.get_attribute(attribute) {
                 let trimmed_url = url.trim();

-                if url.starts_with('#') || url.starts_with("\\#") {
-                    continue;
-                }
+                let is_hash_url = url.starts_with('#');

                 let is_relative_url = url::Url::parse(&url)
                     .err()
                     .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
                     .unwrap_or(false);
                 let is_javascript = trimmed_url.contains("javascript:");

+                if !is_hash_url && node.get_name().to_uppercase() == "A" {
+                    _ = node.set_attribute("target", "_blank");
+                }

                 if let Some(srcset) = node.get_attribute("srcset") {
                     let res = constants::SRC_SET_URL
                         .captures_iter(&srcset)
@@ -831,7 +805,9 @@ impl FullTextParser {
                     _ = node.set_attribute("srcset", res.as_str());
                 }

-                if is_relative_url {
+                if is_hash_url {
+                    _ = node.set_attribute(attribute, trimmed_url);
+                } else if is_relative_url {
                     let completed_url = match article_url.join(trimmed_url) {
                         Ok(joined_url) => joined_url,
                         Err(_) => continue,

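These two hunks change how fragment-only links are treated relative to links that must be resolved against the article URL. The resolution itself relies on url::Url: parsing a relative href alone fails with RelativeUrlWithoutBase, while joining it onto the article URL yields the absolute form. A small standalone illustration (example URLs only, not code from the crate):

    use url::{ParseError, Url};

    fn main() {
        let article_url = Url::parse("https://example.com/news/article.html").unwrap();

        // A relative href cannot be parsed on its own...
        let href = "images/photo.jpg";
        assert_eq!(Url::parse(href), Err(ParseError::RelativeUrlWithoutBase));

        // ...but joins cleanly against the article URL.
        let absolute = article_url.join(href).unwrap();
        assert_eq!(absolute.as_str(), "https://example.com/news/images/photo.jpg");

        // Fragment-only links like "#comments" keep pointing at the
        // current document instead of being rewritten.
        assert!("#comments".starts_with('#'));
    }
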
@@ -944,7 +920,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }
     }
@@ -952,7 +928,7 @@ impl FullTextParser {
         for xpath_strip_img_src in &global_config.strip_image_src {
             _ = Util::strip_node(
                 context,
-                &format!("//img[contains(@src,'{xpath_strip_img_src}')]"),
+                &format!("//img[contains(@src,'{}')]", xpath_strip_img_src),
             );
         }

@@ -1267,7 +1243,7 @@ impl FullTextParser {
         Ok(())
     }

-    pub(crate) fn remove_single_cell_tables(root: &mut Node) {
+    fn remove_single_cell_tables(root: &mut Node) {
         let mut node_iter = Some(root.clone());

         while let Some(node) = node_iter {
@@ -1305,7 +1281,7 @@ impl FullTextParser {
         }
     }

-    pub(crate) fn remove_extra_p_and_div(root: &mut Node) {
+    fn remove_extra_p_and_div(root: &mut Node) {
         let mut node_iter = Some(root.clone());

         while let Some(mut node) = node_iter {
@@ -1327,7 +1303,7 @@ impl FullTextParser {
         }
     }

-    pub(crate) fn remove_share_elements(root: &mut Node) {
+    fn remove_share_elements(root: &mut Node) {
         let mut node_iter = Some(root.clone());

         while let Some(mut node) = node_iter {
@@ -1347,7 +1323,7 @@ impl FullTextParser {
         }
     }

-    pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
+    fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> {
         let mut node_iter = Some(root.clone());

         while let Some(mut node) = node_iter {
@@ -1436,7 +1412,7 @@ impl FullTextParser {
         Ok(())
     }

-    pub(crate) fn remove_empty_nodes(root: &mut Node) {
+    fn remove_empty_nodes(root: &mut Node) {
         let mut node_iter = Some(root.clone());

         while let Some(mut node) = node_iter {

@@ -219,9 +219,7 @@ impl ImageDownloader {
         let mut image_urls = Vec::new();

         for node in node_vec {
-            if let Ok(url) = Self::harvest_image_urls_from_node(node) {
-                image_urls.push(url);
-            }
+            image_urls.push(Self::harvest_image_urls_from_node(node)?);
         }

         Ok(image_urls)
@@ -276,7 +274,7 @@ impl ImageDownloader {
         }

         let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data);
-        let image_string = format!("data:{content_type};base64,{image_base64}");
+        let image_string = format!("data:{};base64,{}", content_type, image_base64);
         let image_data_base64 = ImageDataBase64 {
             url: image.url,
             data: image_string,

@@ -264,15 +264,17 @@ impl Util {
         context: &Context,
         id_or_class: &str,
     ) -> Result<(), FullTextParserError> {
-        let xpath =
-            &format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]");
+        let xpath = &format!(
+            "//*[contains(@class, '{}') or contains(@id, '{}')]",
+            id_or_class, id_or_class
+        );

         let mut ancestor = xpath.clone();
         if ancestor.starts_with("//") {
             ancestor = ancestor.chars().skip(2).collect();
         }

-        let query = &format!("{xpath}[not(ancestor::{ancestor})]");
+        let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
         let node_vec = Util::evaluate_xpath(context, query, false)?;
         for mut node in node_vec {
             if node.is_null() {

@@ -10,21 +10,10 @@ repository.workspace = true

 [dependencies]
 article_scraper = { path = "../article_scraper/" }
-clap = { version = "4.5", features = ["derive"] }
+clap = { version = "4.5", features = [ "derive" ] }
 simplelog = "0.12"
 log = "0.4"
 url = "2.5"
-reqwest = { version = "0.12", features = [
-    "json",
-    "native-tls",
-    "gzip",
-    "brotli",
-    "stream",
-] }
-tokio = { version = "1", features = [
-    "macros",
-    "fs",
-    "io-util",
-    "rt-multi-thread",
-] }
-indicatif = "0.18"
+reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
+tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] }
+indicatif = "0.17"