1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

update to reqwest 0.9

This commit is contained in:
Jan Lukas Gernert 2018-12-07 02:14:50 +01:00
parent b76bb7eea7
commit fcea6cf5d1
3 changed files with 47 additions and 23 deletions

View file

@ -6,13 +6,13 @@ authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
[dependencies] [dependencies]
failure = "0.1" failure = "0.1"
libxml = "0.2" libxml = "0.2"
reqwest = "0.8" reqwest = "0.9"
url = "1.7" url = "1.7"
regex = "1.0" regex = "1.1"
encoding_rs = "0.8" encoding_rs = "0.8"
chrono = "0.4" chrono = "0.4"
htmlescape = "0.3" htmlescape = "0.3"
base64 = "0.9" base64 = "0.10"
image = "0.19" image = "0.20"
log = "0.4" log = "0.4"
mime_guess = "1.8" mime_guess = "1.8"

View file

@ -143,11 +143,11 @@ impl ImageDownloader {
Err(ImageDownloadErrorKind::InvalidUrl)? Err(ImageDownloadErrorKind::InvalidUrl)?
} }
fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::ContentType, ImageDownloadError> { fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(content_type) = response.headers().get::<reqwest::header::ContentType>() { if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
if content_type.type_() == reqwest::mime::IMAGE { if content_type.to_str().context(ImageDownloadErrorKind::ContentType)?.contains("image") {
return Ok(content_type.clone()) return Ok(content_type.clone())
} }
} }
@ -162,8 +162,12 @@ impl ImageDownloader {
fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> { fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(&reqwest::header::ContentLength(content_length)) = response.headers().get::<reqwest::header::ContentLength>() { if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
return Ok(content_length) if let Ok(content_length) = content_length.to_str() {
if let Ok(content_length) = content_length.parse::<u64>() {
return Ok(content_length)
}
}
} }
} }
@ -188,7 +192,7 @@ impl ImageDownloader {
None None
} }
fn extract_image_name(url: &url::Url, content_type: reqwest::header::ContentType) -> Result<String, ImageDownloadError> { fn extract_image_name(url: &url::Url, content_type: reqwest::header::HeaderValue) -> Result<String, ImageDownloadError> {
if let Some(file_name) = url.path_segments().and_then(|segments| segments.last()) { if let Some(file_name) = url.path_segments().and_then(|segments| segments.last()) {
let mut image_name = file_name.to_owned(); let mut image_name = file_name.to_owned();
@ -197,12 +201,25 @@ impl ImageDownloader {
image_name.push_str(query); image_name.push_str(query);
} }
let primary_type = content_type.type_().as_str(); let header = content_type.to_str().context(ImageDownloadErrorKind::ContentType)?;
let mut sub_type = content_type.subtype().as_str().to_owned(); let primary_type = match header.find("/") {
if let Some(suffix) = content_type.suffix() { Some(end) => header[..end-1].to_string(),
None => "unknown".to_string(),
};
let mut sub_type = match header.find("/") {
None => "unknown".to_string(),
Some(start) => {
match header.find("+") {
None => "unknown".to_string(),
Some(end) => header[start..end-1].to_string(),
}
},
};
if let Some(start) = header.find("+") {
sub_type.push_str("+"); sub_type.push_str("+");
sub_type.push_str(suffix.as_str()); sub_type.push_str(&header[start..].to_string());
} };
if let Some(extensions) = mime_guess::get_extensions(primary_type, &sub_type) { if let Some(extensions) = mime_guess::get_extensions(primary_type, &sub_type) {
let mut extension_present = false; let mut extension_present = false;
for extension in extensions { for extension in extensions {

View file

@ -223,11 +223,16 @@ impl ArticleScraper {
Err(ScraperErrorKind::Http)? Err(ScraperErrorKind::Http)?
} }
fn get_encoding_from_http_header(headers: &reqwest::header::Headers) -> Option<&str> { fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
if let Some(content_type) = headers.get::<reqwest::header::ContentType>() { if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) {
if let Some(encoding) = content_type.get_param(reqwest::mime::CHARSET) { if let Ok(content_type) = content_type.to_str() {
return Some(encoding.as_str()) let regex = regex::Regex::new(r#"charset=([^"']+)"#).unwrap();
if let Some(captures) = regex.captures(content_type) {
if let Some(regex_match) = captures.get(1) {
return Some(regex_match.as_str())
}
}
} }
} }
None None
@ -288,9 +293,11 @@ impl ArticleScraper {
fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> { fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
if response.status().is_success() { if response.status().is_success() {
if let Some(content_type) = response.headers().get::<reqwest::header::ContentType>() { if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
if content_type.type_() == reqwest::mime::TEXT && content_type.subtype() == reqwest::mime::HTML { if let Ok(content_type) = content_type.to_str() {
return Ok(true) if content_type.contains("text/html") {
return Ok(true)
}
} }
} }
@ -304,7 +311,7 @@ impl ArticleScraper {
fn check_redirect(response: &reqwest::Response) -> Option<url::Url> { fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
if response.status() == reqwest::StatusCode::PermanentRedirect { if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
debug!("Article url redirects to {}", response.url().as_str()); debug!("Article url redirects to {}", response.url().as_str());
return Some(response.url().clone()) return Some(response.url().clone())
} }