1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

refactor: a bit less nested code

This commit is contained in:
Jan Lukas Gernert 2022-12-01 10:14:47 +01:00
parent 27be5a3204
commit 0c8aba4f4a

View file

@@ -263,18 +263,17 @@ impl FullTextParser {
.text() .text()
.await .await
.map_err(|_| FullTextParserError::Http)?; .map_err(|_| FullTextParserError::Http)?;
{
if let Some(decoded_html) =
Self::decode_html(&text, Self::get_encoding_from_html(&text))
{
return Ok(decoded_html);
}
if let Some(decoded_html) = if let Some(decoded_html) =
Self::decode_html(&text, Self::get_encoding_from_http_header(&headers)) Self::decode_html(&text, Self::get_encoding_from_html(&text))
{ {
return Ok(decoded_html); return Ok(decoded_html);
} }
if let Some(decoded_html) =
Self::decode_html(&text, Self::get_encoding_from_http_header(&headers))
{
return Ok(decoded_html);
} }
warn!("No encoding of HTML detected - assuming utf-8"); warn!("No encoding of HTML detected - assuming utf-8");
@@ -285,18 +284,16 @@ impl FullTextParser {
} }
fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> {
if let Some(content_type) = headers.get(reqwest::header::CONTENT_TYPE) { headers
if let Ok(content_type) = content_type.to_str() { .get(reqwest::header::CONTENT_TYPE)
let regex = .and_then(|header| header.to_str().ok())
regex::Regex::new(r#"charset=([^"']+)"#).expect("Failed to parse regex"); .and_then(|content_type| {
if let Some(captures) = regex.captures(content_type) { regex::Regex::new(r#"charset=([^"']+)"#)
if let Some(regex_match) = captures.get(1) { .expect("Failed to parse regex")
return Some(regex_match.as_str()); .captures(content_type)
} })
} .and_then(|captures| captures.get(1))
} .map(|regex_match| regex_match.as_str())
}
None
} }
fn get_encoding_from_html(html: &str) -> Option<&str> { fn get_encoding_from_html(html: &str) -> Option<&str> {
@@ -361,10 +358,12 @@ impl FullTextParser {
let xpath = &format!("//img[contains(@class, '{}')]", class); let xpath = &format!("//img[contains(@class, '{}')]", class);
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if let Some(correct_url) = node.get_property(property_url) { if node
if node.set_property("src", &correct_url).is_err() { .get_property(property_url)
return Err(FullTextParserError::Xml); .and_then(|correct_url| node.set_property("src", &correct_url).ok())
} .is_none()
{
warn!("Failed to fix lazy loading image");
} }
} }
Ok(()) Ok(())
@@ -374,27 +373,26 @@ impl FullTextParser {
let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); let xpath = &format!("//iframe[contains(@src, '{}')]", site_name);
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if let Some(mut parent) = node.get_parent() { let video_wrapper = node
if let Ok(mut video_wrapper) = parent.new_child(None, "div") { .get_parent()
if let Ok(()) = video_wrapper.set_property("class", "videoWrapper") { .and_then(|mut parent| parent.new_child(None, "div").ok());
if let Ok(()) = node.set_property("width", "100%") { if let Some(mut video_wrapper) = video_wrapper {
if let Ok(()) = node.set_property("height", "100%") { let success = video_wrapper
node.unlink(); .set_property("class", "videoWrapper")
video_wrapper.add_child(&mut node).map_err(|_| { .ok()
error!("Failed to add iframe as child of video wrapper <div>"); .and_then(|()| node.set_property("width", "100%").ok())
FullTextParserError::Xml .and_then(|()| node.set_property("height", "100%").ok())
})?; .ok_or_else(|| {
} node.unlink();
} video_wrapper.add_child(&mut node)
} })
.is_err();
if !success {
warn!("Failed to add iframe as child of video wrapper <div>");
} }
} else {
error!("Failed to add video wrapper <div> as parent of iframe"); warn!("Failed to get parent of iframe");
return Err(FullTextParserError::Xml);
} }
error!("Failed to get parent of iframe");
// return Err(ScraperErrorKind::Xml.into());
} }
Ok(()) Ok(())
} }
@@ -409,8 +407,12 @@ impl FullTextParser {
let xpath = &format!("//{}[@{}]", xpath_tag, attribute); let xpath = &format!("//{}[@{}]", xpath_tag, attribute);
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.remove_property(attribute).is_err() { if let Err(err) = node.remove_property(attribute) {
return Err(FullTextParserError::Xml); log::warn!(
"Failed to remove attribute '{}' from node: {}",
attribute,
err
);
} }
} }
Ok(()) Ok(())
@@ -427,8 +429,8 @@ impl FullTextParser {
let xpath = &format!("//{}", xpath_tag); let xpath = &format!("//{}", xpath_tag);
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.set_attribute(attribute, value).is_err() { if let Err(err) = node.set_attribute(attribute, value) {
return Err(FullTextParserError::Xml); log::warn!("Failed to set attribute '{}' on node: {}", attribute, err);
} }
} }
Ok(()) Ok(())
@@ -439,14 +441,10 @@ impl FullTextParser {
xpath: &str, xpath: &str,
attribute: &str, attribute: &str,
) -> Result<String, FullTextParserError> { ) -> Result<String, FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, false)?; Util::evaluate_xpath(context, xpath, false)?
for node in node_vec { .iter()
if let Some(value) = node.get_attribute(attribute) { .find_map(|node| node.get_attribute(attribute))
return Ok(value); .ok_or(FullTextParserError::Xml)
}
}
Err(FullTextParserError::Xml)
} }
fn repair_urls( fn repair_urls(
@@ -457,13 +455,17 @@ impl FullTextParser {
) -> Result<(), FullTextParserError> { ) -> Result<(), FullTextParserError> {
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if let Some(val) = node.get_attribute(attribute) { if let Some(url) = node.get_attribute(attribute) {
if let Err(url::ParseError::RelativeUrlWithoutBase) = url::Url::parse(&val) { let is_relative_url = url::Url::parse(&url)
if let Ok(fixed_url) = Self::complete_url(article_url, &val) { .err()
if node.set_attribute(attribute, fixed_url.as_str()).is_err() { .map(|err| err == url::ParseError::RelativeUrlWithoutBase)
return Err(FullTextParserError::Scrape); .unwrap_or(false);
}
} if is_relative_url {
let completed_url = Self::complete_url(article_url, &url)?;
let _ = node
.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
} }
} }
} }