1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

move stuff around

This commit is contained in:
Jan Lukas Gernert 2022-12-13 08:54:57 +01:00
parent 90383545e0
commit c08f5afa5d
6 changed files with 84 additions and 81 deletions

View file

@ -1,4 +1,5 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use libxml::tree::{Document, SaveOptions};
use std::fs::File; use std::fs::File;
use std::io::{Error, ErrorKind, Write}; use std::io::{Error, ErrorKind, Write};
use std::path::PathBuf; use std::path::PathBuf;
@ -9,13 +10,31 @@ pub struct Article {
pub author: Option<String>, pub author: Option<String>,
pub url: Url, pub url: Url,
pub date: Option<DateTime<Utc>>, pub date: Option<DateTime<Utc>>,
pub html: Option<String>,
pub thumbnail_url: Option<String>, pub thumbnail_url: Option<String>,
pub document: Option<Document>,
} }
impl Article { impl Article {
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { pub fn get_content(&self) -> Option<String> {
if let Some(ref html) = self.html { // serialize content
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
self.document
.as_ref()
.map(|doc| doc.to_string_with_options(options))
}
#[allow(dead_code)]
pub(crate) fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
if let Some(ref html) = self.get_content() {
if let Ok(()) = std::fs::create_dir_all(path) { if let Ok(()) = std::fs::create_dir_all(path) {
let mut file_name = match self.title.clone() { let mut file_name = match self.title.clone() {
Some(file_name) => file_name.replace('/', "_"), Some(file_name) => file_name.replace('/', "_"),

View file

@ -13,7 +13,7 @@ use chrono::DateTime;
use encoding_rs::Encoding; use encoding_rs::Encoding;
use fingerprints::Fingerprints; use fingerprints::Fingerprints;
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions}; use libxml::tree::{Document, Node};
use libxml::xpath::Context; use libxml::xpath::Context;
use log::{debug, error, info, warn}; use log::{debug, error, info, warn};
use reqwest::header::HeaderMap; use reqwest::header::HeaderMap;
@ -75,8 +75,8 @@ impl FullTextParser {
author: None, author: None,
url: url.clone(), url: url.clone(),
date: None, date: None,
html: None,
thumbnail_url: None, thumbnail_url: None,
document: None,
}; };
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@ -86,8 +86,30 @@ impl FullTextParser {
Self::generate_head(&mut root, &document)?; Self::generate_head(&mut root, &document)?;
self.parse_pages(&mut article, &url, &mut root, config, global_config, client) let headers = Util::generate_headers(config, global_config)?;
.await?; let html = Self::download(&url, client, headers).await?;
// check for fingerprints
let config = if config.is_none() {
if let Some(url) = Fingerprints::detect(&html) {
self.get_grabber_config(&url)
} else {
config
}
} else {
config
};
self.parse_pages(
&mut article,
&url,
&html,
&mut root,
config,
global_config,
client,
)
.await?;
let context = Context::new(&document).map_err(|()| { let context = Context::new(&document).map_err(|()| {
error!("Failed to create xpath context for extracted article"); error!("Failed to create xpath context for extracted article");
@ -99,19 +121,7 @@ impl FullTextParser {
return Err(error); return Err(error);
} }
// serialize content article.document = Some(document);
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let html = document.to_string_with_options(options);
article.html = Some(html);
Ok(article) Ok(article)
} }
@ -120,25 +130,12 @@ impl FullTextParser {
&self, &self,
article: &mut Article, article: &mut Article,
url: &url::Url, url: &url::Url,
html: &str,
root: &mut Node, root: &mut Node,
config: Option<&ConfigEntry>, config: Option<&ConfigEntry>,
global_config: &ConfigEntry, global_config: &ConfigEntry,
client: &Client, client: &Client,
) -> Result<(), FullTextParserError> { ) -> Result<(), FullTextParserError> {
let headers = Util::generate_headers(config, global_config)?;
let html = Self::download(url, client, headers).await?;
// see if
let config = if config.is_none() {
if let Some(url) = Fingerprints::detect(&html) {
self.get_grabber_config(&url)
} else {
config
}
} else {
config
};
let mut document = Self::parse_html(html, config, global_config)?; let mut document = Self::parse_html(html, config, global_config)?;
let mut xpath_ctx = Self::get_xpath_ctx(&document)?; let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
@ -180,7 +177,7 @@ impl FullTextParser {
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) { while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = Self::download(&url, client, headers).await?; let html = Self::download(&url, client, headers).await?;
document = Self::parse_html(html, config, global_config)?; document = Self::parse_html(&html, config, global_config)?;
xpath_ctx = Self::get_xpath_ctx(&document)?; xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::strip_junk(&xpath_ctx, config, global_config, &url); Self::strip_junk(&xpath_ctx, config, global_config, &url);
Self::extract_body(&xpath_ctx, root, config, global_config)?; Self::extract_body(&xpath_ctx, root, config, global_config)?;
@ -190,13 +187,13 @@ impl FullTextParser {
} }
fn parse_html( fn parse_html(
html: String, html: &str,
config: Option<&ConfigEntry>, config: Option<&ConfigEntry>,
global_config: &ConfigEntry, global_config: &ConfigEntry,
) -> Result<Document, FullTextParserError> { ) -> Result<Document, FullTextParserError> {
// replace matches in raw html // replace matches in raw html
let mut html = html; let mut html = html.to_owned();
if let Some(config) = config { if let Some(config) = config {
for replace in &config.replace { for replace in &config.replace {
html = html.replace(&replace.to_replace, &replace.replace_with); html = html.replace(&replace.to_replace, &replace.replace_with);
@ -233,7 +230,7 @@ impl FullTextParser {
) -> Result<(), FullTextParserError> { ) -> Result<(), FullTextParserError> {
let headers = Util::generate_headers(config, global_config)?; let headers = Util::generate_headers(config, global_config)?;
let html = Self::download(url, client, headers).await?; let html = Self::download(url, client, headers).await?;
let document = Self::parse_html(html, config, global_config)?; let document = Self::parse_html(&html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?; let xpath_ctx = Self::get_xpath_ctx(&document)?;
Self::extract_metadata(&xpath_ctx, config, global_config, article); Self::extract_metadata(&xpath_ctx, config, global_config, article);
Self::check_for_thumbnail(&xpath_ctx, article); Self::check_for_thumbnail(&xpath_ctx, article);

View file

@ -59,7 +59,7 @@ async fn youtube() {
Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn") Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
); );
assert!(article assert!(article
.html .get_content()
.map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed")) .map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
.unwrap_or(false)); .unwrap_or(false));
} }

View file

@ -1,7 +1,7 @@
pub use self::error::ImageDownloadError; pub use self::error::ImageDownloadError;
use crate::util::Util; use crate::util::Util;
use libxml::parser::Parser; use libxml::parser::Parser;
use libxml::tree::{Node, SaveOptions}; use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context; use libxml::xpath::Context;
use log::{debug, error}; use log::{debug, error};
use reqwest::{Client, Response}; use reqwest::{Client, Response};
@ -28,34 +28,21 @@ impl ImageDownloader {
.parse_string(html) .parse_string(html)
.map_err(|_| ImageDownloadError::HtmlParse)?; .map_err(|_| ImageDownloadError::HtmlParse)?;
self.download_images_from_document(&doc, client).await
}
pub async fn download_images_from_document(
&self,
doc: &Document,
client: &Client,
) -> Result<String, ImageDownloadError> {
let xpath_ctx = Context::new(&doc).map_err(|()| { let xpath_ctx = Context::new(&doc).map_err(|()| {
error!("Failed to create xpath context for document"); error!("Failed to create xpath context for document");
ImageDownloadError::HtmlParse ImageDownloadError::HtmlParse
})?; })?;
self.download_images_from_context(&xpath_ctx, client)
.await?;
let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
Ok(doc.to_string_with_options(options))
}
pub async fn download_images_from_context(
&self,
context: &Context,
client: &Client,
) -> Result<(), ImageDownloadError> {
let xpath = "//img"; let xpath = "//img";
let node_vec = Util::evaluate_xpath(context, xpath, false) let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
.map_err(|_| ImageDownloadError::HtmlParse)?; .map_err(|_| ImageDownloadError::HtmlParse)?;
for mut node in node_vec { for mut node in node_vec {
if let Some(url) = node.get_property("src") { if let Some(url) = node.get_property("src") {
@ -83,7 +70,17 @@ impl ImageDownloader {
} }
} }
Ok(()) let options = SaveOptions {
format: false,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
Ok(doc.to_string_with_options(options))
} }
async fn save_image( async fn save_image(

View file

@ -2,20 +2,17 @@ mod article;
mod error; mod error;
mod full_text_parser; mod full_text_parser;
pub mod images; pub mod images;
mod readability;
mod util; mod util;
use article::Article; use article::Article;
use error::ScraperError; use error::ScraperError;
use full_text_parser::FullTextParser; use full_text_parser::FullTextParser;
use images::ImageDownloader; use images::ImageDownloader;
use readability::Readability;
use reqwest::Client; use reqwest::Client;
use std::path::Path; use std::path::Path;
pub struct ArticleScraper { pub struct ArticleScraper {
full_text_parser: FullTextParser, full_text_parser: FullTextParser,
readability: Readability,
image_downloader: ImageDownloader, image_downloader: ImageDownloader,
} }
@ -23,7 +20,6 @@ impl ArticleScraper {
pub async fn new(user_configs: Option<&Path>) -> Self { pub async fn new(user_configs: Option<&Path>) -> Self {
Self { Self {
full_text_parser: FullTextParser::new(user_configs).await, full_text_parser: FullTextParser::new(user_configs).await,
readability: Readability::new(),
image_downloader: ImageDownloader::new((2048, 2048)), image_downloader: ImageDownloader::new((2048, 2048)),
} }
} }
@ -37,13 +33,14 @@ impl ArticleScraper {
let res = self.full_text_parser.parse(url, client).await; let res = self.full_text_parser.parse(url, client).await;
if download_images { if download_images {
// if let Err(error) = self if let Ok(res) = res {
// .image_downloader if let Some(document) = res.document.as_ref() {
// .download_images_from_context(&context, client) let _image_res = self
// .await .image_downloader
// { .download_images_from_document(document, client)
// log::error!("Downloading images failed: '{}'", error); .await;
// } }
}
} }
unimplemented!() unimplemented!()

View file

@ -1,7 +0,0 @@
pub struct Readability;
impl Readability {
pub fn new() -> Self {
unimplemented!()
}
}