mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
Merge branch 'empty-body' into 'master'
Check for empty HTTP responses and parsed documents without a root element. See merge request news-flash/article_scraper!11.
This commit is contained in:
commit
ee53f58aeb
2 changed files with 35 additions and 4 deletions
|
@ -4,12 +4,14 @@ stages:
|
||||||
|
|
||||||
run-build:
|
run-build:
|
||||||
stage: build
|
stage: build
|
||||||
image: rust:1.83
|
image: rust:1.86
|
||||||
before_script:
|
before_script:
|
||||||
- rustup component add rustfmt
|
- rustup component add rustfmt
|
||||||
- rustup component add clippy
|
- rustup component add clippy
|
||||||
|
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
|
||||||
script:
|
script:
|
||||||
- rustc --version && cargo --version
|
- rustc --version && cargo --version
|
||||||
|
- echo $LIBXML2
|
||||||
- cargo fmt -- --check
|
- cargo fmt -- --check
|
||||||
- cargo clippy --all-targets --all-features -- -D warnings
|
- cargo clippy --all-targets --all-features -- -D warnings
|
||||||
- cargo build --release
|
- cargo build --release
|
||||||
|
|
|
@ -69,6 +69,11 @@ impl FullTextParser {
|
||||||
|
|
||||||
let html = Self::get_body(response).await?;
|
let html = Self::get_body(response).await?;
|
||||||
|
|
||||||
|
if html.is_empty() {
|
||||||
|
log::error!("Empty response body");
|
||||||
|
return Err(FullTextParserError::Http);
|
||||||
|
}
|
||||||
|
|
||||||
// check for fingerprints
|
// check for fingerprints
|
||||||
let config = if config.is_none() {
|
let config = if config.is_none() {
|
||||||
if let Some(url) = Fingerprints::detect(&html) {
|
if let Some(url) = Fingerprints::detect(&html) {
|
||||||
|
@ -264,10 +269,17 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
// parse html
|
// parse html
|
||||||
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||||
FullTextParserError::Xml
|
FullTextParserError::Xml
|
||||||
})
|
})?;
|
||||||
|
|
||||||
|
if document.get_root_element().is_none() {
|
||||||
|
log::error!("document without root");
|
||||||
|
Err(FullTextParserError::Xml)
|
||||||
|
} else {
|
||||||
|
Ok(document)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// FIXME: Here are some patched functions of libxml crate.
|
/// FIXME: Here are some patched functions of libxml crate.
|
||||||
|
@ -368,6 +380,18 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
let headers = response.headers().clone();
|
let headers = response.headers().clone();
|
||||||
|
|
||||||
|
if headers
|
||||||
|
.get(reqwest::header::CONTENT_LENGTH)
|
||||||
|
.and_then(|hv| hv.to_str().ok())
|
||||||
|
.and_then(|str| str.parse::<i64>().ok())
|
||||||
|
.map(|content_length| content_length == 0)
|
||||||
|
.unwrap_or(false)
|
||||||
|
{
|
||||||
|
log::error!("Empty response body");
|
||||||
|
return Err(FullTextParserError::Http);
|
||||||
|
}
|
||||||
|
|
||||||
let bytes = response
|
let bytes = response
|
||||||
.bytes()
|
.bytes()
|
||||||
.await
|
.await
|
||||||
|
@ -420,7 +444,12 @@ impl FullTextParser {
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
let response = Self::get_response(url, client, headers).await?;
|
let response = Self::get_response(url, client, headers).await?;
|
||||||
let body = Self::get_body(response).await?;
|
let body = Self::get_body(response).await?;
|
||||||
Ok(body)
|
if body.is_empty() {
|
||||||
|
log::error!("Empty response body");
|
||||||
|
Err(FullTextParserError::Http)
|
||||||
|
} else {
|
||||||
|
Ok(body)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue