mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
update to reqwest 0.9
This commit is contained in:
parent
b76bb7eea7
commit
fcea6cf5d1
3 changed files with 47 additions and 23 deletions
|
@ -6,13 +6,13 @@ authors = ["Jan Lukas Gernert <jangernert@gmail.com>"]
|
|||
[dependencies]
|
||||
failure = "0.1"
|
||||
libxml = "0.2"
|
||||
reqwest = "0.8"
|
||||
reqwest = "0.9"
|
||||
url = "1.7"
|
||||
regex = "1.0"
|
||||
regex = "1.1"
|
||||
encoding_rs = "0.8"
|
||||
chrono = "0.4"
|
||||
htmlescape = "0.3"
|
||||
base64 = "0.9"
|
||||
image = "0.19"
|
||||
base64 = "0.10"
|
||||
image = "0.20"
|
||||
log = "0.4"
|
||||
mime_guess = "1.8"
|
||||
|
|
|
@ -143,11 +143,11 @@ impl ImageDownloader {
|
|||
Err(ImageDownloadErrorKind::InvalidUrl)?
|
||||
}
|
||||
|
||||
fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::ContentType, ImageDownloadError> {
|
||||
fn check_image_content_type(response: &reqwest::Response) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
|
||||
|
||||
if response.status().is_success() {
|
||||
if let Some(content_type) = response.headers().get::<reqwest::header::ContentType>() {
|
||||
if content_type.type_() == reqwest::mime::IMAGE {
|
||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||
if content_type.to_str().context(ImageDownloadErrorKind::ContentType)?.contains("image") {
|
||||
return Ok(content_type.clone())
|
||||
}
|
||||
}
|
||||
|
@ -162,10 +162,14 @@ impl ImageDownloader {
|
|||
/// Returns the value of the response's `Content-Length` header as a `u64`.
///
/// The header is only consulted when the response status is a success.
/// Its value must be readable as a string (`to_str`) and must parse as `u64`.
///
/// # Errors
/// Ends in `ImageDownloadErrorKind::ContentLenght` when the status is not a
/// success, the header is absent, `to_str` fails, or the text does not parse
/// as `u64`. None of these cases is reported separately.
fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
|
||||
|
||||
if response.status().is_success() {
|
||||
// NOTE(review): typed-header lookup; this looks like the pre-change (reqwest 0.8)
// side of the diff, superseded by the CONTENT_LENGTH lookup that follows — confirm
// against the commit.
if let Some(&reqwest::header::ContentLength(content_length)) = response.headers().get::<reqwest::header::ContentLength>() {
|
||||
// Look the header up by its name constant.
if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
|
||||
// A value that fails `to_str` is skipped silently and ends in the error below.
if let Ok(content_length) = content_length.to_str() {
|
||||
// Likewise for text that does not parse as `u64`.
if let Ok(content_length) = content_length.parse::<u64>() {
|
||||
return Ok(content_length)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Reached on every path that did not return above. The `?` turns the error kind
// into the declared `ImageDownloadError` (conversion defined outside this view).
Err(ImageDownloadErrorKind::ContentLenght)?
|
||||
}
|
||||
|
@ -188,7 +192,7 @@ impl ImageDownloader {
|
|||
None
|
||||
}
|
||||
|
||||
fn extract_image_name(url: &url::Url, content_type: reqwest::header::ContentType) -> Result<String, ImageDownloadError> {
|
||||
fn extract_image_name(url: &url::Url, content_type: reqwest::header::HeaderValue) -> Result<String, ImageDownloadError> {
|
||||
|
||||
if let Some(file_name) = url.path_segments().and_then(|segments| segments.last()) {
|
||||
let mut image_name = file_name.to_owned();
|
||||
|
@ -197,12 +201,25 @@ impl ImageDownloader {
|
|||
image_name.push_str(query);
|
||||
}
|
||||
|
||||
let primary_type = content_type.type_().as_str();
|
||||
let mut sub_type = content_type.subtype().as_str().to_owned();
|
||||
if let Some(suffix) = content_type.suffix() {
|
||||
sub_type.push_str("+");
|
||||
sub_type.push_str(suffix.as_str());
|
||||
let header = content_type.to_str().context(ImageDownloadErrorKind::ContentType)?;
|
||||
let primary_type = match header.find("/") {
|
||||
Some(end) => header[..end-1].to_string(),
|
||||
None => "unknown".to_string(),
|
||||
};
|
||||
let mut sub_type = match header.find("/") {
|
||||
None => "unknown".to_string(),
|
||||
Some(start) => {
|
||||
match header.find("+") {
|
||||
None => "unknown".to_string(),
|
||||
Some(end) => header[start..end-1].to_string(),
|
||||
}
|
||||
},
|
||||
};
|
||||
if let Some(start) = header.find("+") {
|
||||
sub_type.push_str("+");
|
||||
sub_type.push_str(&header[start..].to_string());
|
||||
};
|
||||
|
||||
if let Some(extensions) = mime_guess::get_extensions(primary_type, &sub_type) {
|
||||
let mut extension_present = false;
|
||||
for extension in extensions {
|
||||
|
|
21
src/lib.rs
21
src/lib.rs
|
@ -223,11 +223,16 @@ impl ArticleScraper {
|
|||
Err(ScraperErrorKind::Http)?
|
||||
}
|
||||
|
||||
fn get_encoding_from_http_header(headers: &reqwest::header::Headers) -> Option<&str> {
|
||||
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
|
||||
|
||||
if let Some(content_type) = headers.get::<reqwest::header::ContentType>() {
|
||||
if let Some(encoding) = content_type.get_param(reqwest::mime::CHARSET) {
|
||||
return Some(encoding.as_str())
|
||||
if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) {
|
||||
if let Ok(content_type) = content_type.to_str() {
|
||||
let regex = regex::Regex::new(r#"charset=([^"']+)"#).unwrap();
|
||||
if let Some(captures) = regex.captures(content_type) {
|
||||
if let Some(regex_match) = captures.get(1) {
|
||||
return Some(regex_match.as_str())
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None
|
||||
|
@ -288,11 +293,13 @@ impl ArticleScraper {
|
|||
fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
|
||||
|
||||
if response.status().is_success() {
|
||||
if let Some(content_type) = response.headers().get::<reqwest::header::ContentType>() {
|
||||
if content_type.type_() == reqwest::mime::TEXT && content_type.subtype() == reqwest::mime::HTML {
|
||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||
if let Ok(content_type) = content_type.to_str() {
|
||||
if content_type.contains("text/html") {
|
||||
return Ok(true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
error!("Content type is not text/HTML");
|
||||
return Ok(false)
|
||||
|
@ -304,7 +311,7 @@ impl ArticleScraper {
|
|||
|
||||
fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
|
||||
|
||||
if response.status() == reqwest::StatusCode::PermanentRedirect {
|
||||
if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
|
||||
debug!("Article url redirects to {}", response.url().as_str());
|
||||
return Some(response.url().clone())
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue