1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 08:05:31 +02:00

better error messages

This commit is contained in:
Jan Lukas Gernert 2025-04-05 15:45:41 +02:00
parent 0978335d3b
commit b92500fca2
2 changed files with 50 additions and 52 deletions

View file

@@ -6,10 +6,10 @@ use thiserror::Error;
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum ScraperError { pub enum ScraperError {
#[error("")] #[error("Configerror {0}")]
Config(#[from] ConfigError), Config(#[from] ConfigError),
#[error("")] #[error("ImageDownloadError {0}")]
Image(#[from] ImageDownloadError), Image(#[from] ImageDownloadError),
#[error("")] #[error("FullTextParserError {0}")]
Scrap(#[from] FullTextParserError), Scrap(#[from] FullTextParserError),
} }

View file

@@ -354,63 +354,61 @@ impl FullTextParser {
.send() .send()
.await .await
.map_err(|err| { .map_err(|err| {
log::error!( log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
"Downloading HTML failed: GET '{}' - '{}'",
url.as_str(),
err
);
FullTextParserError::Http FullTextParserError::Http
})?; })?;
Ok(response) Ok(response)
} }
async fn get_body(response: Response) -> Result<String, FullTextParserError> { async fn get_body(response: Response) -> Result<String, FullTextParserError> {
if response.status().is_success() { let status = response.status();
let headers = response.headers().clone(); if !status.is_success() {
let bytes = response log::error!("status code: {status}");
.bytes() return Err(FullTextParserError::Http);
.await
.map_err(|_| FullTextParserError::Http)?;
match from_utf8(&bytes) {
Ok(utf8_str) => {
log::debug!("Valid utf-8 string");
return Ok(utf8_str.into());
}
Err(error) => {
log::debug!("Invalid utf-8 string");
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
log::debug!("Encoding extracted from HTML: '{}'", encoding);
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
let decoded_html = decoded_html.replacen(
&format!("charset=\"{encoding}\""),
"charset=\"utf-8\"",
1,
);
return Ok(decoded_html);
}
}
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
log::debug!("Encoding extracted from headers: '{}'", encoding);
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
let decoded_html = decoded_html.replacen(
&format!("charset=\"{encoding}\""),
"charset=\"utf-8\"",
1,
);
return Ok(decoded_html);
}
}
return Err(FullTextParserError::Utf8(error));
}
}
} }
Err(FullTextParserError::Http) let headers = response.headers().clone();
let bytes = response
.bytes()
.await
.map_err(|_| FullTextParserError::Http)?;
match from_utf8(&bytes) {
Ok(utf8_str) => {
log::debug!("Valid utf-8 string");
Ok(utf8_str.into())
}
Err(error) => {
log::debug!("Invalid utf-8 string");
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
log::debug!("Encoding extracted from HTML: '{encoding}'");
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
let decoded_html = decoded_html.replacen(
&format!("charset=\"{encoding}\""),
"charset=\"utf-8\"",
1,
);
return Ok(decoded_html);
}
}
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
log::debug!("Encoding extracted from headers: '{encoding}'");
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
let decoded_html = decoded_html.replacen(
&format!("charset=\"{encoding}\""),
"charset=\"utf-8\"",
1,
);
return Ok(decoded_html);
}
}
Err(FullTextParserError::Utf8(error))
}
}
} }
pub async fn download( pub async fn download(