1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

Merge branch 'empty-body' into 'master'

Check for empty HTTP responses and parsed documents without a root element

See merge request news-flash/article_scraper!11
This commit is contained in:
Jan Lukas Gernert 2025-05-04 15:38:52 +00:00
commit 7535c76e43
2 changed files with 35 additions and 4 deletions

View file

@@ -4,12 +4,14 @@ stages:
run-build:
stage: build
image: rust:1.83
image: rust:1.86
before_script:
- rustup component add rustfmt
- rustup component add clippy
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
script:
- rustc --version && cargo --version
- echo $LIBXML2
- cargo fmt -- --check
- cargo clippy --all-targets --all-features -- -D warnings
- cargo build --release

View file

@@ -69,6 +69,11 @@ impl FullTextParser {
let html = Self::get_body(response).await?;
if html.is_empty() {
log::error!("Empty response body");
return Err(FullTextParserError::Http);
}
// check for fingerprints
let config = if config.is_none() {
if let Some(url) = Fingerprints::detect(&html) {
@@ -264,10 +269,17 @@ impl FullTextParser {
}
// parse html
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserError::Xml
})
})?;
if document.get_root_element().is_none() {
log::error!("document without root");
Err(FullTextParserError::Xml)
} else {
Ok(document)
}
}
/// FIXME: Here are some patched functions of libxml crate.
@@ -368,6 +380,18 @@ impl FullTextParser {
}
let headers = response.headers().clone();
if headers
.get(reqwest::header::CONTENT_LENGTH)
.and_then(|hv| hv.to_str().ok())
.and_then(|str| str.parse::<i64>().ok())
.map(|content_length| content_length == 0)
.unwrap_or(false)
{
log::error!("Empty response body");
return Err(FullTextParserError::Http);
}
let bytes = response
.bytes()
.await
@@ -420,7 +444,12 @@ impl FullTextParser {
let headers = Util::generate_headers(config, global_config)?;
let response = Self::get_response(url, client, headers).await?;
let body = Self::get_body(response).await?;
Ok(body)
if body.is_empty() {
log::error!("Empty response body");
Err(FullTextParserError::Http)
} else {
Ok(body)
}
}
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {