mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
move stuff around
This commit is contained in:
parent
90383545e0
commit
c08f5afa5d
6 changed files with 84 additions and 81 deletions
|
@ -1,4 +1,5 @@
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
|
use libxml::tree::{Document, SaveOptions};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Error, ErrorKind, Write};
|
use std::io::{Error, ErrorKind, Write};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
@ -9,13 +10,31 @@ pub struct Article {
|
||||||
pub author: Option<String>,
|
pub author: Option<String>,
|
||||||
pub url: Url,
|
pub url: Url,
|
||||||
pub date: Option<DateTime<Utc>>,
|
pub date: Option<DateTime<Utc>>,
|
||||||
pub html: Option<String>,
|
|
||||||
pub thumbnail_url: Option<String>,
|
pub thumbnail_url: Option<String>,
|
||||||
|
pub document: Option<Document>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Article {
|
impl Article {
|
||||||
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
pub fn get_content(&self) -> Option<String> {
|
||||||
if let Some(ref html) = self.html {
|
// serialize content
|
||||||
|
let options = SaveOptions {
|
||||||
|
format: false,
|
||||||
|
no_declaration: false,
|
||||||
|
no_empty_tags: true,
|
||||||
|
no_xhtml: false,
|
||||||
|
xhtml: false,
|
||||||
|
as_xml: false,
|
||||||
|
as_html: true,
|
||||||
|
non_significant_whitespace: false,
|
||||||
|
};
|
||||||
|
self.document
|
||||||
|
.as_ref()
|
||||||
|
.map(|doc| doc.to_string_with_options(options))
|
||||||
|
}
|
||||||
|
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pub(crate) fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
||||||
|
if let Some(ref html) = self.get_content() {
|
||||||
if let Ok(()) = std::fs::create_dir_all(path) {
|
if let Ok(()) = std::fs::create_dir_all(path) {
|
||||||
let mut file_name = match self.title.clone() {
|
let mut file_name = match self.title.clone() {
|
||||||
Some(file_name) => file_name.replace('/', "_"),
|
Some(file_name) => file_name.replace('/', "_"),
|
||||||
|
|
|
@ -13,7 +13,7 @@ use chrono::DateTime;
|
||||||
use encoding_rs::Encoding;
|
use encoding_rs::Encoding;
|
||||||
use fingerprints::Fingerprints;
|
use fingerprints::Fingerprints;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Document, Node, SaveOptions};
|
use libxml::tree::{Document, Node};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use log::{debug, error, info, warn};
|
use log::{debug, error, info, warn};
|
||||||
use reqwest::header::HeaderMap;
|
use reqwest::header::HeaderMap;
|
||||||
|
@ -75,8 +75,8 @@ impl FullTextParser {
|
||||||
author: None,
|
author: None,
|
||||||
url: url.clone(),
|
url: url.clone(),
|
||||||
date: None,
|
date: None,
|
||||||
html: None,
|
|
||||||
thumbnail_url: None,
|
thumbnail_url: None,
|
||||||
|
document: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
|
@ -86,8 +86,30 @@ impl FullTextParser {
|
||||||
|
|
||||||
Self::generate_head(&mut root, &document)?;
|
Self::generate_head(&mut root, &document)?;
|
||||||
|
|
||||||
self.parse_pages(&mut article, &url, &mut root, config, global_config, client)
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
.await?;
|
let html = Self::download(&url, client, headers).await?;
|
||||||
|
|
||||||
|
// check for fingerprints
|
||||||
|
let config = if config.is_none() {
|
||||||
|
if let Some(url) = Fingerprints::detect(&html) {
|
||||||
|
self.get_grabber_config(&url)
|
||||||
|
} else {
|
||||||
|
config
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
config
|
||||||
|
};
|
||||||
|
|
||||||
|
self.parse_pages(
|
||||||
|
&mut article,
|
||||||
|
&url,
|
||||||
|
&html,
|
||||||
|
&mut root,
|
||||||
|
config,
|
||||||
|
global_config,
|
||||||
|
client,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
|
||||||
let context = Context::new(&document).map_err(|()| {
|
let context = Context::new(&document).map_err(|()| {
|
||||||
error!("Failed to create xpath context for extracted article");
|
error!("Failed to create xpath context for extracted article");
|
||||||
|
@ -99,19 +121,7 @@ impl FullTextParser {
|
||||||
return Err(error);
|
return Err(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
// serialize content
|
article.document = Some(document);
|
||||||
let options = SaveOptions {
|
|
||||||
format: false,
|
|
||||||
no_declaration: false,
|
|
||||||
no_empty_tags: true,
|
|
||||||
no_xhtml: false,
|
|
||||||
xhtml: false,
|
|
||||||
as_xml: false,
|
|
||||||
as_html: true,
|
|
||||||
non_significant_whitespace: false,
|
|
||||||
};
|
|
||||||
let html = document.to_string_with_options(options);
|
|
||||||
article.html = Some(html);
|
|
||||||
|
|
||||||
Ok(article)
|
Ok(article)
|
||||||
}
|
}
|
||||||
|
@ -120,25 +130,12 @@ impl FullTextParser {
|
||||||
&self,
|
&self,
|
||||||
article: &mut Article,
|
article: &mut Article,
|
||||||
url: &url::Url,
|
url: &url::Url,
|
||||||
|
html: &str,
|
||||||
root: &mut Node,
|
root: &mut Node,
|
||||||
config: Option<&ConfigEntry>,
|
config: Option<&ConfigEntry>,
|
||||||
global_config: &ConfigEntry,
|
global_config: &ConfigEntry,
|
||||||
client: &Client,
|
client: &Client,
|
||||||
) -> Result<(), FullTextParserError> {
|
) -> Result<(), FullTextParserError> {
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
|
||||||
let html = Self::download(url, client, headers).await?;
|
|
||||||
|
|
||||||
// see if
|
|
||||||
let config = if config.is_none() {
|
|
||||||
if let Some(url) = Fingerprints::detect(&html) {
|
|
||||||
self.get_grabber_config(&url)
|
|
||||||
} else {
|
|
||||||
config
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
config
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut document = Self::parse_html(html, config, global_config)?;
|
let mut document = Self::parse_html(html, config, global_config)?;
|
||||||
let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
|
let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
|
|
||||||
|
@ -180,7 +177,7 @@ impl FullTextParser {
|
||||||
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
let html = Self::download(&url, client, headers).await?;
|
let html = Self::download(&url, client, headers).await?;
|
||||||
document = Self::parse_html(html, config, global_config)?;
|
document = Self::parse_html(&html, config, global_config)?;
|
||||||
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
Self::strip_junk(&xpath_ctx, config, global_config, &url);
|
Self::strip_junk(&xpath_ctx, config, global_config, &url);
|
||||||
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||||
|
@ -190,13 +187,13 @@ impl FullTextParser {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn parse_html(
|
fn parse_html(
|
||||||
html: String,
|
html: &str,
|
||||||
config: Option<&ConfigEntry>,
|
config: Option<&ConfigEntry>,
|
||||||
global_config: &ConfigEntry,
|
global_config: &ConfigEntry,
|
||||||
) -> Result<Document, FullTextParserError> {
|
) -> Result<Document, FullTextParserError> {
|
||||||
// replace matches in raw html
|
// replace matches in raw html
|
||||||
|
|
||||||
let mut html = html;
|
let mut html = html.to_owned();
|
||||||
if let Some(config) = config {
|
if let Some(config) = config {
|
||||||
for replace in &config.replace {
|
for replace in &config.replace {
|
||||||
html = html.replace(&replace.to_replace, &replace.replace_with);
|
html = html.replace(&replace.to_replace, &replace.replace_with);
|
||||||
|
@ -233,7 +230,7 @@ impl FullTextParser {
|
||||||
) -> Result<(), FullTextParserError> {
|
) -> Result<(), FullTextParserError> {
|
||||||
let headers = Util::generate_headers(config, global_config)?;
|
let headers = Util::generate_headers(config, global_config)?;
|
||||||
let html = Self::download(url, client, headers).await?;
|
let html = Self::download(url, client, headers).await?;
|
||||||
let document = Self::parse_html(html, config, global_config)?;
|
let document = Self::parse_html(&html, config, global_config)?;
|
||||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||||
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
Self::extract_metadata(&xpath_ctx, config, global_config, article);
|
||||||
Self::check_for_thumbnail(&xpath_ctx, article);
|
Self::check_for_thumbnail(&xpath_ctx, article);
|
||||||
|
|
|
@ -59,7 +59,7 @@ async fn youtube() {
|
||||||
Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
|
Some("RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn")
|
||||||
);
|
);
|
||||||
assert!(article
|
assert!(article
|
||||||
.html
|
.get_content()
|
||||||
.map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
|
.map(|html| html.contains("https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed"))
|
||||||
.unwrap_or(false));
|
.unwrap_or(false));
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
pub use self::error::ImageDownloadError;
|
pub use self::error::ImageDownloadError;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use libxml::parser::Parser;
|
use libxml::parser::Parser;
|
||||||
use libxml::tree::{Node, SaveOptions};
|
use libxml::tree::{Document, Node, SaveOptions};
|
||||||
use libxml::xpath::Context;
|
use libxml::xpath::Context;
|
||||||
use log::{debug, error};
|
use log::{debug, error};
|
||||||
use reqwest::{Client, Response};
|
use reqwest::{Client, Response};
|
||||||
|
@ -28,34 +28,21 @@ impl ImageDownloader {
|
||||||
.parse_string(html)
|
.parse_string(html)
|
||||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
|
|
||||||
|
self.download_images_from_document(&doc, client).await
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn download_images_from_document(
|
||||||
|
&self,
|
||||||
|
doc: &Document,
|
||||||
|
client: &Client,
|
||||||
|
) -> Result<String, ImageDownloadError> {
|
||||||
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
let xpath_ctx = Context::new(&doc).map_err(|()| {
|
||||||
error!("Failed to create xpath context for document");
|
error!("Failed to create xpath context for document");
|
||||||
ImageDownloadError::HtmlParse
|
ImageDownloadError::HtmlParse
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
self.download_images_from_context(&xpath_ctx, client)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let options = SaveOptions {
|
|
||||||
format: false,
|
|
||||||
no_declaration: false,
|
|
||||||
no_empty_tags: true,
|
|
||||||
no_xhtml: false,
|
|
||||||
xhtml: false,
|
|
||||||
as_xml: false,
|
|
||||||
as_html: true,
|
|
||||||
non_significant_whitespace: false,
|
|
||||||
};
|
|
||||||
Ok(doc.to_string_with_options(options))
|
|
||||||
}
|
|
||||||
|
|
||||||
pub async fn download_images_from_context(
|
|
||||||
&self,
|
|
||||||
context: &Context,
|
|
||||||
client: &Client,
|
|
||||||
) -> Result<(), ImageDownloadError> {
|
|
||||||
let xpath = "//img";
|
let xpath = "//img";
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)
|
let node_vec = Util::evaluate_xpath(&xpath_ctx, xpath, false)
|
||||||
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
.map_err(|_| ImageDownloadError::HtmlParse)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if let Some(url) = node.get_property("src") {
|
if let Some(url) = node.get_property("src") {
|
||||||
|
@ -83,7 +70,17 @@ impl ImageDownloader {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
let options = SaveOptions {
|
||||||
|
format: false,
|
||||||
|
no_declaration: false,
|
||||||
|
no_empty_tags: true,
|
||||||
|
no_xhtml: false,
|
||||||
|
xhtml: false,
|
||||||
|
as_xml: false,
|
||||||
|
as_html: true,
|
||||||
|
non_significant_whitespace: false,
|
||||||
|
};
|
||||||
|
Ok(doc.to_string_with_options(options))
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn save_image(
|
async fn save_image(
|
||||||
|
|
19
src/lib.rs
19
src/lib.rs
|
@ -2,20 +2,17 @@ mod article;
|
||||||
mod error;
|
mod error;
|
||||||
mod full_text_parser;
|
mod full_text_parser;
|
||||||
pub mod images;
|
pub mod images;
|
||||||
mod readability;
|
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
use article::Article;
|
use article::Article;
|
||||||
use error::ScraperError;
|
use error::ScraperError;
|
||||||
use full_text_parser::FullTextParser;
|
use full_text_parser::FullTextParser;
|
||||||
use images::ImageDownloader;
|
use images::ImageDownloader;
|
||||||
use readability::Readability;
|
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
full_text_parser: FullTextParser,
|
full_text_parser: FullTextParser,
|
||||||
readability: Readability,
|
|
||||||
image_downloader: ImageDownloader,
|
image_downloader: ImageDownloader,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -23,7 +20,6 @@ impl ArticleScraper {
|
||||||
pub async fn new(user_configs: Option<&Path>) -> Self {
|
pub async fn new(user_configs: Option<&Path>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
full_text_parser: FullTextParser::new(user_configs).await,
|
full_text_parser: FullTextParser::new(user_configs).await,
|
||||||
readability: Readability::new(),
|
|
||||||
image_downloader: ImageDownloader::new((2048, 2048)),
|
image_downloader: ImageDownloader::new((2048, 2048)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -37,13 +33,14 @@ impl ArticleScraper {
|
||||||
let res = self.full_text_parser.parse(url, client).await;
|
let res = self.full_text_parser.parse(url, client).await;
|
||||||
|
|
||||||
if download_images {
|
if download_images {
|
||||||
// if let Err(error) = self
|
if let Ok(res) = res {
|
||||||
// .image_downloader
|
if let Some(document) = res.document.as_ref() {
|
||||||
// .download_images_from_context(&context, client)
|
let _image_res = self
|
||||||
// .await
|
.image_downloader
|
||||||
// {
|
.download_images_from_document(document, client)
|
||||||
// log::error!("Downloading images failed: '{}'", error);
|
.await;
|
||||||
// }
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unimplemented!()
|
unimplemented!()
|
||||||
|
|
|
@ -1,7 +0,0 @@
|
||||||
pub struct Readability;
|
|
||||||
|
|
||||||
impl Readability {
|
|
||||||
pub fn new() -> Self {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
}
|
|
Loading…
Add table
Add a link
Reference in a new issue