mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 08:05:31 +02:00
Merge branch 'empty-body' into 'master'
Check for empty HTTP responses and parsed documents without a root element. See merge request news-flash/article_scraper!11
This commit is contained in:
commit
7535c76e43
2 changed files with 35 additions and 4 deletions
|
@ -4,12 +4,14 @@ stages:
|
|||
|
||||
run-build:
|
||||
stage: build
|
||||
image: rust:1.83
|
||||
image: rust:1.86
|
||||
before_script:
|
||||
- rustup component add rustfmt
|
||||
- rustup component add clippy
|
||||
- export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so
|
||||
script:
|
||||
- rustc --version && cargo --version
|
||||
- echo $LIBXML2
|
||||
- cargo fmt -- --check
|
||||
- cargo clippy --all-targets --all-features -- -D warnings
|
||||
- cargo build --release
|
||||
|
|
|
@ -69,6 +69,11 @@ impl FullTextParser {
|
|||
|
||||
let html = Self::get_body(response).await?;
|
||||
|
||||
if html.is_empty() {
|
||||
log::error!("Empty response body");
|
||||
return Err(FullTextParserError::Http);
|
||||
}
|
||||
|
||||
// check for fingerprints
|
||||
let config = if config.is_none() {
|
||||
if let Some(url) = Fingerprints::detect(&html) {
|
||||
|
@ -264,10 +269,17 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
// parse html
|
||||
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||
let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
|
||||
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
|
||||
FullTextParserError::Xml
|
||||
})
|
||||
})?;
|
||||
|
||||
if document.get_root_element().is_none() {
|
||||
log::error!("document without root");
|
||||
Err(FullTextParserError::Xml)
|
||||
} else {
|
||||
Ok(document)
|
||||
}
|
||||
}
|
||||
|
||||
/// FIXME: Here are some patched functions of libxml crate.
|
||||
|
@ -368,6 +380,18 @@ impl FullTextParser {
|
|||
}
|
||||
|
||||
let headers = response.headers().clone();
|
||||
|
||||
if headers
|
||||
.get(reqwest::header::CONTENT_LENGTH)
|
||||
.and_then(|hv| hv.to_str().ok())
|
||||
.and_then(|str| str.parse::<i64>().ok())
|
||||
.map(|content_length| content_length == 0)
|
||||
.unwrap_or(false)
|
||||
{
|
||||
log::error!("Empty response body");
|
||||
return Err(FullTextParserError::Http);
|
||||
}
|
||||
|
||||
let bytes = response
|
||||
.bytes()
|
||||
.await
|
||||
|
@ -420,7 +444,12 @@ impl FullTextParser {
|
|||
let headers = Util::generate_headers(config, global_config)?;
|
||||
let response = Self::get_response(url, client, headers).await?;
|
||||
let body = Self::get_body(response).await?;
|
||||
Ok(body)
|
||||
if body.is_empty() {
|
||||
log::error!("Empty response body");
|
||||
Err(FullTextParserError::Http)
|
||||
} else {
|
||||
Ok(body)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue