Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-08 16:40:00 +02:00
Merge branch 'master' of gitlab.com:news-flash/article_scraper
Commit 1fbce6413d
3 changed files with 64 additions and 66 deletions
@@ -19,7 +19,7 @@ impl Article {
         if let Some(ref html) = self.html {
             if let Ok(()) = std::fs::create_dir_all(&path) {
                 let mut file_name = match self.title.clone() {
-                    Some(file_name) => file_name,
+                    Some(file_name) => file_name.replace("/", "_"),
                     None => "Unknown Title".to_owned(),
                 };
                 file_name.push_str(".html");
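For illustration (not part of the commit), a minimal sketch of what the new replace("/", "_") call changes, using a hypothetical title value: a slash in the title would otherwise act as a path separator in the generated file name.

    // Hypothetical title; before this change the '/' survived into the file path.
    let title: Option<String> = Some("GNOME 3.36 / What's new".to_owned());
    let mut file_name = match title {
        Some(file_name) => file_name.replace("/", "_"),
        None => "Unknown Title".to_owned(),
    };
    file_name.push_str(".html");
    assert_eq!(file_name, "GNOME 3.36 _ What's new.html");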
@@ -7,7 +7,7 @@ use libxml::parser::Parser;
 use libxml::tree::{Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error};
-use reqwest;
+use reqwest::{Client, Response};
 use std;
 use std::error::Error;
 use url;
@@ -15,21 +15,18 @@ use url;
 mod error;
 
 pub struct ImageDownloader {
-    client: reqwest::Client,
     max_size: (u32, u32),
 }
 
 impl ImageDownloader {
-    pub fn new(max_size: (u32, u32)) -> ImageDownloader {
-        ImageDownloader {
-            client: reqwest::Client::new(),
-            max_size: max_size,
-        }
+    pub fn new(max_size: (u32, u32)) -> Self {
+        ImageDownloader { max_size }
     }
 
     pub async fn download_images_from_string(
         &self,
         html: &str,
+        client: &Client,
     ) -> Result<String, ImageDownloadError> {
         let parser = Parser::default_html();
         let doc = parser.parse_string(html).map_err(|_| {
@@ -42,7 +39,8 @@ impl ImageDownloader {
             ImageDownloadErrorKind::HtmlParse
         })?;
 
-        self.download_images_from_context(&xpath_ctx).await?;
+        self.download_images_from_context(&xpath_ctx, client)
+            .await?;
 
         let options = SaveOptions {
             format: false,
@@ -60,6 +58,7 @@ impl ImageDownloader {
     pub async fn download_images_from_context(
         &self,
         context: &Context,
+        client: &Client,
     ) -> Result<(), ImageDownloadError> {
         let xpath = "//img";
         let node_vec = ArticleScraper::evaluate_xpath(context, xpath, false)
@@ -68,13 +67,13 @@ impl ImageDownloader {
             if let Some(url) = node.get_property("src") {
                 if !url.starts_with("data:") {
                     if let Ok(url) = url::Url::parse(&url) {
-                        let parent_url = match self.check_image_parent(&node, &url).await {
+                        let parent_url = match self.check_image_parent(&node, &url, client).await {
                             Ok(url) => Some(url),
                             Err(_) => None,
                         };
 
                         if let Ok((small_image, big_image)) =
-                            self.save_image(&url, &parent_url).await
+                            self.save_image(&url, &parent_url, client).await
                         {
                             if let Err(_) = node.set_property("src", &small_image) {
                                 return Err(ImageDownloadErrorKind::HtmlParse)?;
@@ -97,9 +96,9 @@ impl ImageDownloader {
         &self,
         image_url: &url::Url,
         parent_url: &Option<url::Url>,
+        client: &Client,
     ) -> Result<(String, Option<String>), ImageDownloadError> {
-        let response = self
-            .client
+        let response = client
             .get(image_url.clone())
             .send()
             .await
@@ -125,8 +124,7 @@ impl ImageDownloader {
         let mut big_image: Option<Vec<u8>> = None;
 
         if let Some(parent_url) = parent_url {
-            let response_big = self
-                .client
+            let response_big = client
                 .get(parent_url.clone())
                 .send()
                 .await
@@ -185,7 +183,7 @@ impl ImageDownloader {
     }
 
     fn check_image_content_type(
-        response: &reqwest::Response,
+        response: &Response,
     ) -> Result<reqwest::header::HeaderValue, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
@@ -263,22 +261,21 @@ impl ImageDownloader {
         &self,
         node: &Node,
         child_url: &url::Url,
+        client: &Client,
     ) -> Result<url::Url, ImageDownloadError> {
         if let Some(parent) = node.get_parent() {
             if parent.get_name() == "a" {
                 if let Some(url) = parent.get_property("href") {
                     let parent_url =
                         url::Url::parse(&url).context(ImageDownloadErrorKind::ParentDownload)?;
-                    let parent_response = self
-                        .client
+                    let parent_response = client
                         .head(parent_url.clone())
                         .send()
                         .await
                         .context(ImageDownloadErrorKind::ParentDownload)?;
                     let _ = ImageDownloader::check_image_content_type(&parent_response)
                         .context(ImageDownloadErrorKind::ParentDownload)?;
-                    let child_response = self
-                        .client
+                    let child_response = client
                         .get(child_url.clone())
                         .send()
                         .await
@@ -301,7 +298,7 @@ impl ImageDownloader {
         Err(ImageDownloadErrorKind::ParentDownload)?
     }
 
-    fn get_content_lenght(response: &reqwest::Response) -> Result<u64, ImageDownloadError> {
+    fn get_content_lenght(response: &Response) -> Result<u64, ImageDownloadError> {
         if response.status().is_success() {
             if let Some(content_length) = response.headers().get(reqwest::header::CONTENT_LENGTH) {
                 if let Ok(content_length) = content_length.to_str() {
@@ -318,6 +315,7 @@ impl ImageDownloader {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
     use std::fs;
     use std::io::Write;
 
@@ -327,7 +325,7 @@ mod tests {
         let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
             .expect("Failed to read HTML");
         let result = image_dowloader
-            .download_images_from_string(&hdyleaflet)
+            .download_images_from_string(&hdyleaflet, &Client::new())
             .await
             .expect("Failed to downalod images");
         let mut file =
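Usage sketch (not part of the diff): after this change ImageDownloader no longer owns a reqwest::Client; the caller creates one and lends it to each call, as the updated test above does. The function and variable names below are illustrative.

    use reqwest::Client;

    // Illustrative caller: the application owns the Client and borrows it out
    // to the downloader for every request it makes.
    async fn embed_images(html: &str) -> Result<String, ImageDownloadError> {
        let client = Client::new();
        let downloader = ImageDownloader::new((2048, 2048));
        downloader.download_images_from_string(html, &client).await
    }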
src/lib.rs (88 changed lines)
@@ -15,7 +15,7 @@ use libxml::tree::{Document, Node, SaveOptions};
 use libxml::xpath::Context;
 use log::{debug, error, info, warn};
 use regex;
-use reqwest;
+use reqwest::{Client, Response};
 use std::collections;
 use std::error::Error;
 use std::path::PathBuf;
@@ -27,11 +27,10 @@ use url;
 pub struct ArticleScraper {
     pub image_downloader: ImageDownloader,
     config_files: Arc<RwLock<Option<ConfigCollection>>>,
-    client: reqwest::Client,
 }
 
 impl ArticleScraper {
-    pub fn new(config_path: PathBuf) -> Result<ArticleScraper, ScraperError> {
+    pub fn new(config_path: PathBuf) -> Self {
         let config_files = Arc::new(RwLock::new(None));
 
         let locked_config_files = config_files.clone();
@@ -49,21 +48,20 @@ impl ArticleScraper {
             }
         });
 
-        Ok(ArticleScraper {
+        ArticleScraper {
             image_downloader: ImageDownloader::new((2048, 2048)),
             config_files,
-            client: reqwest::Client::new(),
-        })
+        }
     }
 
     pub async fn parse(
         &self,
         url: url::Url,
         download_images: bool,
+        client: &Client,
     ) -> Result<Article, ScraperError> {
         info!("Scraping article: '{}'", url.as_str());
-        let response = self
-            .client
+        let response = client
             .head(url.clone())
             .send()
             .await
@@ -108,7 +106,7 @@ impl ArticleScraper {
 
         ArticleScraper::generate_head(&mut root, &document)?;
 
-        self.parse_pages(&mut article, &url, &mut root, &config)
+        self.parse_pages(&mut article, &url, &mut root, &config, client)
             .await?;
 
         let context = Context::new(&document).map_err(|()| {
@@ -121,15 +119,15 @@ impl ArticleScraper {
             return Err(error);
         }
 
-        if let Err(error) = ArticleScraper::eliminate_noscrip_tag(&context) {
-            error!("Eliminating <noscript> tag failed - '{}'", error);
-            return Err(error);
-        }
+        // if let Err(error) = ArticleScraper::eliminate_noscript_tag(&context) {
+        //     error!("Eliminating <noscript> tag failed - {}", error);
+        //     return Err(error)
+        // }
 
         if download_images {
             if let Err(error) = self
                 .image_downloader
-                .download_images_from_context(&context)
+                .download_images_from_context(&context, client)
                 .await
             {
                 error!("Downloading images failed: '{}'", error);
@@ -159,8 +157,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let mut document = Self::parse_html(html, config)?;
         let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
@@ -174,9 +173,10 @@ impl ArticleScraper {
             if !result.trim().is_empty() {
                 // parse again with single page url
                 debug!("Single page link found '{}'", result);
-                let single_page_url = url::Url::parse(&result).context(ScraperErrorKind::Url)?;
+                let single_page_url =
+                    url::Url::parse(&result).context(ScraperErrorKind::Url)?;
                 return self
-                    .parse_single_page(article, &single_page_url, root, config)
+                    .parse_single_page(article, &single_page_url, root, config, client)
                     .await;
             }
         }
@@ -188,7 +188,7 @@ impl ArticleScraper {
 
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
-                let html = ArticleScraper::download(&url, &self.client).await?;
+                let html = ArticleScraper::download(&url, client).await?;
                 document = Self::parse_html(html, config)?;
                 xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
@@ -252,8 +252,9 @@ impl ArticleScraper {
         url: &url::Url,
         root: &mut Node,
         config: &GrabberConfig,
+        client: &Client,
     ) -> Result<(), ScraperError> {
-        let html = ArticleScraper::download(&url, &self.client).await?;
+        let html = ArticleScraper::download(&url, client).await?;
         let document = Self::parse_html(html, config)?;
         let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
@@ -263,7 +264,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    async fn download(url: &url::Url, client: &reqwest::Client) -> Result<String, ScraperError> {
+    async fn download(url: &url::Url, client: &Client) -> Result<String, ScraperError> {
         let response = client
             .get(url.as_str())
             .send()
@@ -373,7 +374,7 @@ impl ArticleScraper {
         }
     }
 
-    fn check_content_type(response: &reqwest::Response) -> Result<bool, ScraperError> {
+    fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
         if response.status().is_success() {
             if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
                 if let Ok(content_type) = content_type.to_str() {
@@ -391,7 +392,7 @@ impl ArticleScraper {
         Err(ScraperErrorKind::Http)?
     }
 
-    fn check_redirect(response: &reqwest::Response) -> Option<url::Url> {
+    fn check_redirect(response: &Response) -> Option<url::Url> {
         if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
             debug!("Article url redirects to '{}'", response.url().as_str());
             return Some(response.url().clone());
@@ -646,7 +647,7 @@ impl ArticleScraper {
         );
 
         // strip all scripts
-        let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
+        //let _ = ArticleScraper::strip_node(&context, &String::from("//script"));
 
         // strip all comments
         let _ = ArticleScraper::strip_node(&context, &String::from("//comment()"));
@@ -782,28 +783,27 @@ impl ArticleScraper {
         Ok(())
     }
 
-    fn eliminate_noscrip_tag(context: &Context) -> Result<(), ScraperError> {
-        let xpath = "//noscript";
-        let node_vec = Self::evaluate_xpath(context, xpath, false)?;
-
-        for mut node in node_vec {
-            if let Some(mut parent) = node.get_parent() {
-                node.unlink();
-                let children = node.get_child_nodes();
-                for mut child in children {
-                    child.unlink();
-                    let _ = parent.add_child(&mut child);
-                }
-            }
-        }
-
-        Ok(())
-    }
+    // fn eliminate_noscript_tag(context: &Context) -> Result<(), ScraperError> {
+    //     let xpath = "//noscript";
+    //     let node_vec = Self::evaluate_xpath(context, xpath, false)?;
+    //     for mut node in node_vec {
+    //         if let Some(mut parent) = node.get_parent() {
+    //             node.unlink();
+    //             let children = node.get_child_nodes();
+    //             for mut child in children {
+    //                 child.unlink();
+    //                 let _ = parent.add_child(&mut child);
+    //             }
+    //         }
+    //     }
+    //     Ok(())
+    // }
 }
 
 #[cfg(test)]
 mod tests {
     use crate::*;
+    use reqwest::Client;
 
     #[tokio::test(basic_scheduler)]
     async fn golem() {
@@ -811,8 +811,8 @@ mod tests {
         let out_path = PathBuf::from(r"./test_output");
         let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(
@@ -833,8 +833,8 @@ mod tests {
         )
         .unwrap();
 
-        let grabber = ArticleScraper::new(config_path).unwrap();
-        let article = grabber.parse(url, true).await.unwrap();
+        let grabber = ArticleScraper::new(config_path);
+        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
         article.save_html(&out_path).unwrap();
 
         assert_eq!(
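Taken together, a sketch of the caller flow after this commit, modeled on the updated tests above: new() is no longer fallible and parse() borrows the shared Client. The config directory below is a placeholder; the URL and output path are the ones used in the test.

    use reqwest::Client;
    use std::path::PathBuf;

    // Sketch of the post-commit API surface (names mirror the tests).
    async fn scrape_example() {
        let config_path = PathBuf::from(r"./config"); // placeholder directory
        let out_path = PathBuf::from(r"./test_output");
        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();

        let grabber = ArticleScraper::new(config_path);
        let article = grabber.parse(url, true, &Client::new()).await.unwrap();
        article.save_html(&out_path).unwrap();
    }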