1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

Check for empty HTTP responses and parsed documents without a root element

This commit is contained in:
Jan Lukas Gernert 2025-05-04 17:34:33 +02:00
parent 9b374a28c7
commit f361392c04

View file

@ -69,6 +69,11 @@ impl FullTextParser {
let html = Self::get_body(response).await?;
if html.is_empty() {
log::error!("Empty response body");
return Err(FullTextParserError::Http);
}
// check for fingerprints
let config = if config.is_none() {
if let Some(url) = Fingerprints::detect(&html) {
@ -264,10 +269,17 @@ impl FullTextParser {
}
// parse html
Self::parse_html_string_patched(html.as_str()).map_err(|err| {
let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| {
log::error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserError::Xml
})
})?;
if document.get_root_element().is_none() {
log::error!("document without root");
Err(FullTextParserError::Xml)
} else {
Ok(document)
}
}
/// FIXME: Here are some patched functions of the libxml crate.
@ -368,6 +380,18 @@ impl FullTextParser {
}
let headers = response.headers().clone();
if headers
.get(reqwest::header::CONTENT_LENGTH)
.and_then(|hv| hv.to_str().ok())
.and_then(|str| str.parse::<i64>().ok())
.map(|content_length| content_length == 0)
.unwrap_or(false)
{
log::error!("Empty response body");
return Err(FullTextParserError::Http);
}
let bytes = response
.bytes()
.await
@ -420,8 +444,13 @@ impl FullTextParser {
let headers = Util::generate_headers(config, global_config)?;
let response = Self::get_response(url, client, headers).await?;
let body = Self::get_body(response).await?;
if body.is_empty() {
log::error!("Empty response body");
Err(FullTextParserError::Http)
} else {
Ok(body)
}
}
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
headers