diff --git a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("Configerror {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 18fc682..4bb8a30 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -354,63 +354,61 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result<String, FullTextParserError> { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - log::debug!("Encoding extracted from headers: '{}'", encoding); - if let 
Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + } } pub async fn download(