mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
better error messages
This commit is contained in:
parent
0978335d3b
commit
b92500fca2
2 changed files with 50 additions and 52 deletions
|
@ -6,10 +6,10 @@ use thiserror::Error;
|
||||||
|
|
||||||
#[derive(Error, Debug)]
|
#[derive(Error, Debug)]
|
||||||
pub enum ScraperError {
|
pub enum ScraperError {
|
||||||
#[error("")]
|
#[error("Configerror {0}")]
|
||||||
Config(#[from] ConfigError),
|
Config(#[from] ConfigError),
|
||||||
#[error("")]
|
#[error("ImageDownloadError {0}")]
|
||||||
Image(#[from] ImageDownloadError),
|
Image(#[from] ImageDownloadError),
|
||||||
#[error("")]
|
#[error("FullTextParserError {0}")]
|
||||||
Scrap(#[from] FullTextParserError),
|
Scrap(#[from] FullTextParserError),
|
||||||
}
|
}
|
||||||
|
|
|
@ -354,63 +354,61 @@ impl FullTextParser {
|
||||||
.send()
|
.send()
|
||||||
.await
|
.await
|
||||||
.map_err(|err| {
|
.map_err(|err| {
|
||||||
log::error!(
|
log::error!("Downloading HTML failed: GET '{url}' - '{err}'");
|
||||||
"Downloading HTML failed: GET '{}' - '{}'",
|
|
||||||
url.as_str(),
|
|
||||||
err
|
|
||||||
);
|
|
||||||
FullTextParserError::Http
|
FullTextParserError::Http
|
||||||
})?;
|
})?;
|
||||||
Ok(response)
|
Ok(response)
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn get_body(response: Response) -> Result<String, FullTextParserError> {
|
async fn get_body(response: Response) -> Result<String, FullTextParserError> {
|
||||||
if response.status().is_success() {
|
let status = response.status();
|
||||||
let headers = response.headers().clone();
|
if !status.is_success() {
|
||||||
let bytes = response
|
log::error!("status code: {status}");
|
||||||
.bytes()
|
return Err(FullTextParserError::Http);
|
||||||
.await
|
|
||||||
.map_err(|_| FullTextParserError::Http)?;
|
|
||||||
|
|
||||||
match from_utf8(&bytes) {
|
|
||||||
Ok(utf8_str) => {
|
|
||||||
log::debug!("Valid utf-8 string");
|
|
||||||
return Ok(utf8_str.into());
|
|
||||||
}
|
|
||||||
Err(error) => {
|
|
||||||
log::debug!("Invalid utf-8 string");
|
|
||||||
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
|
|
||||||
|
|
||||||
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
|
|
||||||
log::debug!("Encoding extracted from HTML: '{}'", encoding);
|
|
||||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
|
||||||
let decoded_html = decoded_html.replacen(
|
|
||||||
&format!("charset=\"{encoding}\""),
|
|
||||||
"charset=\"utf-8\"",
|
|
||||||
1,
|
|
||||||
);
|
|
||||||
return Ok(decoded_html);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
|
|
||||||
log::debug!("Encoding extracted from headers: '{}'", encoding);
|
|
||||||
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
|
||||||
let decoded_html = decoded_html.replacen(
|
|
||||||
&format!("charset=\"{encoding}\""),
|
|
||||||
"charset=\"utf-8\"",
|
|
||||||
1,
|
|
||||||
);
|
|
||||||
return Ok(decoded_html);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return Err(FullTextParserError::Utf8(error));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Err(FullTextParserError::Http)
|
let headers = response.headers().clone();
|
||||||
|
let bytes = response
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.map_err(|_| FullTextParserError::Http)?;
|
||||||
|
|
||||||
|
match from_utf8(&bytes) {
|
||||||
|
Ok(utf8_str) => {
|
||||||
|
log::debug!("Valid utf-8 string");
|
||||||
|
Ok(utf8_str.into())
|
||||||
|
}
|
||||||
|
Err(error) => {
|
||||||
|
log::debug!("Invalid utf-8 string");
|
||||||
|
let lossy_string = std::string::String::from_utf8_lossy(&bytes);
|
||||||
|
|
||||||
|
if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) {
|
||||||
|
log::debug!("Encoding extracted from HTML: '{encoding}'");
|
||||||
|
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||||
|
let decoded_html = decoded_html.replacen(
|
||||||
|
&format!("charset=\"{encoding}\""),
|
||||||
|
"charset=\"utf-8\"",
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
return Ok(decoded_html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(encoding) = Self::get_encoding_from_http_header(&headers) {
|
||||||
|
log::debug!("Encoding extracted from headers: '{encoding}'");
|
||||||
|
if let Some(decoded_html) = Self::decode_html(&bytes, encoding) {
|
||||||
|
let decoded_html = decoded_html.replacen(
|
||||||
|
&format!("charset=\"{encoding}\""),
|
||||||
|
"charset=\"utf-8\"",
|
||||||
|
1,
|
||||||
|
);
|
||||||
|
return Ok(decoded_html);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(FullTextParserError::Utf8(error))
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn download(
|
pub async fn download(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue