1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2022-12-11 16:23:02 +01:00
parent dc1bf2ef0c
commit 88bb88a38f
3 changed files with 17 additions and 24 deletions

View file

@ -16,7 +16,7 @@ pub struct Article {
impl Article {
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
if let Some(ref html) = self.html {
if let Ok(()) = std::fs::create_dir_all(&path) {
if let Ok(()) = std::fs::create_dir_all(path) {
let mut file_name = match self.title.clone() {
Some(file_name) => file_name.replace('/', "_"),
None => "Unknown Title".to_owned(),

View file

@ -28,10 +28,7 @@ impl ConfigCollection {
if let Some(directory) = directory {
// create data dir if it doesn't already exist
if let Err(error) = std::fs::DirBuilder::new()
.recursive(true)
.create(&directory)
{
if let Err(error) = std::fs::DirBuilder::new().recursive(true).create(directory) {
log::warn!(
"Failed to create user config directory {:?}: {}",
directory,

View file

@ -209,17 +209,17 @@ impl FullTextParser {
// parse html
let parser = Parser::default_html();
Ok(parser.parse_string(html.as_str()).map_err(|err| {
parser.parse_string(html.as_str()).map_err(|err| {
error!("Parsing HTML failed for downloaded HTML {:?}", err);
FullTextParserError::Xml
})?)
})
}
fn get_xpath_ctx(doc: &Document) -> Result<Context, FullTextParserError> {
Ok(Context::new(doc).map_err(|()| {
Context::new(doc).map_err(|()| {
error!("Creating xpath context failed for downloaded HTML");
FullTextParserError::Xml
})?)
})
}
async fn parse_single_page(
@ -356,29 +356,26 @@ impl FullTextParser {
}
fn check_for_thumbnail(context: &Context, article: &mut Article) {
if let Some(thumb) = Self::get_attribute(
if let Ok(thumb) = Self::get_attribute(
context,
"//meta[contains(@name, 'twitter:image')]",
"content",
)
.ok()
) {
article.thumbnail_url = Some(thumb);
return;
}
if let Ok(thumb) =
Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content")
{
article.thumbnail_url = Some(thumb);
return;
}
if let Some(thumb) =
Self::get_attribute(context, "//meta[contains(@name, 'og:image')]", "content").ok()
if let Ok(thumb) =
Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href")
{
article.thumbnail_url = Some(thumb);
return;
}
if let Some(thumb) =
Self::get_attribute(context, "//link[contains(@rel, 'image_src')]", "href").ok()
{
article.thumbnail_url = Some(thumb);
return;
}
}
@ -495,8 +492,7 @@ impl FullTextParser {
if is_relative_url {
let completed_url = Self::complete_url(article_url, &url)?;
let _ = node
.set_attribute(attribute, completed_url.as_str())
node.set_attribute(attribute, completed_url.as_str())
.map_err(|_| FullTextParserError::Scrape)?;
}
}