mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
Compare commits
11 commits
article_sc
...
master
Author | SHA1 | Date | |
---|---|---|---|
|
9f349f8c6f | ||
|
498008f630 | ||
|
ee53f58aeb | ||
|
06990acbc0 | ||
|
f361392c04 | ||
|
9b374a28c7 | ||
|
b92500fca2 | ||
|
0978335d3b | ||
|
9f56ed03b8 | ||
|
8cfcd6d9f3 | ||
|
ca1cc47af1 |
7 changed files with 95 additions and 64 deletions
|
@ -4,12 +4,14 @@ stages:
|
|||
|
||||
run-build:
|
||||
stage: build
|
||||
image: rust:1.79
|
||||
image: rust:1.86
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
- rustup component add clippy
|
||||
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
|
||||
script:
|
||||
- rustc --version && cargo --version
|
||||
- echo $LIBXML2
|
||||
- cargo fmt -- --check
|
||||
- cargo clippy --all-targets --all-features -- -D warnings
|
||||
- cargo build --release
|
||||
|
|
|
@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"]
|
|||
resolver = "2"
|
||||
|
||||
[workspace.package]
|
||||
version = "2.1.1"
|
||||
version = "2.1.2"
|
||||
authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
|
||||
edition = "2021"
|
||||
license = "GPL-3.0-or-later"
|
||||
|
|
|
@ -14,17 +14,17 @@ exclude = ["resources/tests"]
|
|||
[dependencies]
|
||||
thiserror = "2.0"
|
||||
libxml = "0.3"
|
||||
reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] }
|
||||
reqwest = { version = "0.12", features = ["stream"] }
|
||||
tokio = { version = "1", features = ["macros", "fs", "io-util"] }
|
||||
url = "2.5"
|
||||
regex = "1.10"
|
||||
regex = "1.11"
|
||||
encoding_rs = "0.8"
|
||||
chrono = "0.4"
|
||||
base64 = "0.22"
|
||||
image = "0.25"
|
||||
log = "0.4"
|
||||
rust-embed="8.4"
|
||||
once_cell = "1.19"
|
||||
rust-embed="8.6"
|
||||
once_cell = "1.20"
|
||||
escaper = "0.1"
|
||||
futures = "0.3"
|
||||
unic-emoji-char = "0.9"
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024
|
||||
Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532
|
|
@ -6,10 +6,10 @@ use thiserror::Error;
|
|||
|
||||
#[derive(Error, Debug)]
|
||||
pub enum ScraperError {
|
||||
#[error("")]
|
||||
#[error("Configerror {0}")]
|
||||
Config(#[from] ConfigError),
|
||||
#[error("")]
|
||||
#[error("ImageDownloadError {0}")]
|
||||
Image(#[from] ImageDownloadError),
|
||||
#[error("")]
|
||||
#[error("FullTextParserError {0}")]
|
||||
Scrap(#[from] FullTextParserError),
|
||||
}
|
||||
|
|
|
@ -69,6 +69,11 @@ impl FullTextParser {
|
|||
|
||||
let html = Self::get_body(response).await?;
|
||||
|
||||
if html.is_empty() {
|
||||
log::error!("Empty response body");
|
||||
return Err(FullTextParserError::Http);
|
||||
}
|
||||
|
||||
// check for fingerprints
|
||||
let config = if config.is_none() {
|
||||
if let Some(url) = Fingerprints::detect(&html) {
|
||||
|
@ -264,10 +269,17 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
// parse html
|
||||
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||
let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||
FullTextParserError::Xml
|
||||
})
|
||||
})?;
|
||||
|
||||
if document.get_root_element().is_none() {
|
||||
log::error!("document without root");
|
||||
Err(FullTextParserError::Xml)
|
||||
} else {
|
||||
Ok(document)
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: Here are some patched functions of libxml crate.
|
||||
|
@ -275,7 +287,7 @@ impl FullTextParser {
|
|||
/// See:
|
||||
/// - <https://github.com/KWARC/rust-libxml/issues/111>
|
||||
/// - <https://github.com/Orange-OpenSource/hurl/issues/1535>
|
||||
/// These two functions should be removed when the issue is fixed in libxml crate.
|
||||
/// These two functions should be removed when the issue is fixed in libxml crate.
|
||||
fn try_usize_to_i32(value: usize) -> Result<i32, libxml::parser::XmlParseError> {
|
||||
if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) {
|
||||
// Cannot safely use our value comparison, but the conversion is always safe.
|
||||
|
@ -354,63 +366,73 @@ impl FullTextParser {
|
|||
.send()
|
||||
.await
|
||||
.map_err(|err| {
|
||||
log::error!(
|
||||
"Downloading HTML failed: GET '{}' - '{}'",
|
||||
url.as_str(),
|
||||
err
|
||||
);
|
||||
log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
|
||||
FullTextParserError::Http
|
||||
})?;
|
||||
Ok(response)
|
||||
}
|
||||
|
||||
async fn get_body(response: Response) -> Result<String, FullTextParserError> {
|
||||
if response.status().is_success() {
|
||||
let headers = response.headers().clone();
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| FullTextParserError::Http)?;
|
||||
|
||||
match from_utf8(&bytes) {
|
||||
Ok(utf8_str) => {
|
||||
log::debug!("Valid utf-8 string");
|
||||
return Ok(utf8_str.into());
|
||||
}
|
||||
Err(error) => {
|
||||
log::debug!("Invalid utf-8 string");
|
||||
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
|
||||
|
||||
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
|
||||
log::debug!("Encoding extracted from HTML: '{}'", encoding);
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
let decoded_html = decoded_html.replacen(
|
||||
&format!("charset=\"{encoding}\""),
|
||||
"charset=\"utf-8\"",
|
||||
1,
|
||||
);
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
|
||||
log::debug!("Encoding extracted from headers: '{}'", encoding);
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
let decoded_html = decoded_html.replacen(
|
||||
&format!("charset=\"{encoding}\""),
|
||||
"charset=\"utf-8\"",
|
||||
1,
|
||||
);
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
return Err(FullTextParserError::Utf8(error));
|
||||
}
|
||||
}
|
||||
let status = response.status();
|
||||
if !status.is_success() {
|
||||
log::error!("status code: {status}");
|
||||
return Err(FullTextParserError::Http);
|
||||
}
|
||||
|
||||
Err(FullTextParserError::Http)
|
||||
let headers = response.headers().clone();
|
||||
|
||||
if headers
|
||||
.get(reqwest::header::CONTENT_LENGTH)
|
||||
.and_then(|hv| hv.to_str().ok())
|
||||
.and_then(|str| str.parse::<i64>().ok())
|
||||
.map(|content_length| content_length == 0)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
log::error!("Empty response body");
|
||||
return Err(FullTextParserError::Http);
|
||||
}
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
.map_err(|_| FullTextParserError::Http)?;
|
||||
|
||||
match from_utf8(&bytes) {
|
||||
Ok(utf8_str) => {
|
||||
log::debug!("Valid utf-8 string");
|
||||
Ok(utf8_str.into())
|
||||
}
|
||||
Err(error) => {
|
||||
log::debug!("Invalid utf-8 string");
|
||||
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
|
||||
|
||||
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
|
||||
log::debug!("Encoding extracted from HTML: '{encoding}'");
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
let decoded_html = decoded_html.replacen(
|
||||
&format!("charset=\"{encoding}\""),
|
||||
"charset=\"utf-8\"",
|
||||
1,
|
||||
);
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
|
||||
log::debug!("Encoding extracted from headers: '{encoding}'");
|
||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||
let decoded_html = decoded_html.replacen(
|
||||
&format!("charset=\"{encoding}\""),
|
||||
"charset=\"utf-8\"",
|
||||
1,
|
||||
);
|
||||
return Ok(decoded_html);
|
||||
}
|
||||
}
|
||||
|
||||
Err(FullTextParserError::Utf8(error))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn download(
|
||||
|
@ -422,7 +444,12 @@ impl FullTextParser {
|
|||
let headers = Util::generate_headers(config, global_config)?;
|
||||
let response = Self::get_response(url, client, headers).await?;
|
||||
let body = Self::get_body(response).await?;
|
||||
Ok(body)
|
||||
if body.is_empty() {
|
||||
log::error!("Empty response body");
|
||||
Err(FullTextParserError::Http)
|
||||
} else {
|
||||
Ok(body)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||
|
|
|
@ -219,7 +219,9 @@ impl ImageDownloader {
|
|||
let mut image_urls = Vec::new();
|
||||
|
||||
for node in node_vec {
|
||||
image_urls.push(Self::harvest_image_urls_from_node(node)?);
|
||||
if let Ok(url) = Self::harvest_image_urls_from_node(node) {
|
||||
image_urls.push(url);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(image_urls)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue