mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
write some docs
This commit is contained in:
parent
bfb31dc188
commit
57df2e6832
5 changed files with 174 additions and 4 deletions
53
Readme.md
Normal file
53
Readme.md
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
# article scraper
|
||||||
|
|
||||||
|
The `article_scraper` crate provides a simple way to extract meaningful content from the web.
|
||||||
|
It contains two ways of locating the desired content
|
||||||
|
|
||||||
|
## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
|
||||||
|
|
||||||
|
This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
|
||||||
|
The disadvantages, however, are that the config needs to be updated as the website changes, and that a new extraction rule is needed for every website.
|
||||||
|
|
||||||
|
A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||||
|
Please consider contributing new rules or updates to it.
|
||||||
|
|
||||||
|
`article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
|
||||||
|
|
||||||
|
## 2. Mozilla Readability
|
||||||
|
|
||||||
|
In case the ftr-config based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
|
||||||
|
This re-implementation tries to mimic the original as closely as possible.
|
||||||
|
|
||||||
|
# Example
|
||||||
|
|
||||||
|
```
|
||||||
|
use article_scraper::ArticleScraper;
|
||||||
|
use url::Url;
|
||||||
|
use reqwest::Client;
|
||||||
|
|
||||||
|
let scraper = ArticleScraper::new(None).await;
|
||||||
|
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
||||||
|
let client = Client::new();
|
||||||
|
let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
|
let html = article.get_doc_content();
|
||||||
|
```
|
||||||
|
|
||||||
|
# CLI
|
||||||
|
|
||||||
|
Various features of this crate can be used via [`article_scraper_cli`](./article_scraper_cli/).
|
||||||
|
|
||||||
|
```
|
||||||
|
Usage: article_scraper_cli [OPTIONS] <COMMAND>
|
||||||
|
|
||||||
|
Commands:
|
||||||
|
all Use the complete pipeline
|
||||||
|
readability Only use the Readability parser
|
||||||
|
ftr Only use (a subset of) the Ftr parser
|
||||||
|
help Print this message or the help of the given subcommand(s)
|
||||||
|
|
||||||
|
Options:
|
||||||
|
-d, --debug Turn debug logging on
|
||||||
|
-o, --output <FILE> Destination of resulting HTML file
|
||||||
|
-h, --help Print help
|
||||||
|
-V, --version Print version
|
||||||
|
```
|
File diff suppressed because one or more lines are too long
|
@ -11,10 +11,32 @@ use self::state::State;
|
||||||
use super::error::FullTextParserError;
|
use super::error::FullTextParserError;
|
||||||
use crate::{constants, util::Util};
|
use crate::{constants, util::Util};
|
||||||
|
|
||||||
|
/// Rust port of mozilla readability algorithm
|
||||||
|
///
|
||||||
|
/// Used as fallback for `ArticleScraper` if no fitting config can be found
|
||||||
pub struct Readability;
|
pub struct Readability;
|
||||||
|
|
||||||
impl Readability {
|
impl Readability {
|
||||||
pub async fn extract_from_str(
|
/// Parse HTML and extract meaningful content
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `html` - HTML of a website containing an article or similar content
|
||||||
|
/// * `base_url` - URL used to complete relative URLs
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// let html = reqwest::get("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html")
|
||||||
|
/// .await
|
||||||
|
/// .unwrap()
|
||||||
|
/// .text()
|
||||||
|
/// .await
|
||||||
|
/// .unwrap();
|
||||||
|
/// let base_url = Url::parse("https://www.nytimes.com").unwrap();
|
||||||
|
/// let extracted_content = Readability::extract(&html, Some(base_url)).await.unwrap();
|
||||||
|
/// ```
|
||||||
|
pub async fn extract(
|
||||||
html: &str,
|
html: &str,
|
||||||
base_url: Option<url::Url>,
|
base_url: Option<url::Url>,
|
||||||
) -> Result<String, FullTextParserError> {
|
) -> Result<String, FullTextParserError> {
|
||||||
|
@ -55,7 +77,7 @@ impl Readability {
|
||||||
Ok(html)
|
Ok(html)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn extract_body(
|
pub(crate) fn extract_body(
|
||||||
document: Document,
|
document: Document,
|
||||||
root: &mut Node,
|
root: &mut Node,
|
||||||
title: Option<&str>,
|
title: Option<&str>,
|
||||||
|
|
|
@ -1,15 +1,53 @@
|
||||||
|
//! # article scraper
|
||||||
|
//!
|
||||||
|
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
|
||||||
|
//! It contains two ways of locating the desired content
|
||||||
|
//!
|
||||||
|
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
|
||||||
|
//!
|
||||||
|
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
|
||||||
|
//! The disadvantages, however, are that the config needs to be updated as the website changes, and that a new extraction rule is needed for every website.
|
||||||
|
//!
|
||||||
|
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||||
|
//! Please consider contributing new rules or updates to it.
|
||||||
|
//!
|
||||||
|
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
|
||||||
|
//!
|
||||||
|
//! ## 2. Mozilla Readability
|
||||||
|
//!
|
||||||
|
//! In case the ftr-config based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
|
||||||
|
//! This re-implementation tries to mimic the original as closely as possible.
|
||||||
|
//!
|
||||||
|
//! # Example
|
||||||
|
//!
|
||||||
|
//! ```
|
||||||
|
//! use article_scraper::ArticleScraper;
|
||||||
|
//! use url::Url;
|
||||||
|
//! use reqwest::Client;
|
||||||
|
//!
|
||||||
|
//! let scraper = ArticleScraper::new(None).await;
|
||||||
|
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
||||||
|
//! let client = Client::new();
|
||||||
|
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
|
//! let html = article.get_doc_content();
|
||||||
|
//! ```
|
||||||
|
|
||||||
|
|
||||||
mod article;
|
mod article;
|
||||||
pub mod clean;
|
pub mod clean;
|
||||||
mod constants;
|
mod constants;
|
||||||
mod error;
|
mod error;
|
||||||
mod full_text_parser;
|
mod full_text_parser;
|
||||||
|
#[doc(hidden)]
|
||||||
pub mod images;
|
pub mod images;
|
||||||
mod util;
|
mod util;
|
||||||
|
|
||||||
use crate::images::Progress;
|
use crate::images::Progress;
|
||||||
use article::Article;
|
use article::Article;
|
||||||
use error::ScraperError;
|
use error::ScraperError;
|
||||||
|
#[doc(hidden)]
|
||||||
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
|
||||||
|
#[doc(hidden)]
|
||||||
pub use full_text_parser::FullTextParser;
|
pub use full_text_parser::FullTextParser;
|
||||||
pub use full_text_parser::Readability;
|
pub use full_text_parser::Readability;
|
||||||
use images::ImageDownloader;
|
use images::ImageDownloader;
|
||||||
|
@ -17,12 +55,26 @@ use reqwest::Client;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
|
/// Download & extract meaningful content from websites
|
||||||
|
///
|
||||||
|
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
|
||||||
|
/// of mozilla Readability.
|
||||||
|
///
|
||||||
|
/// For detailed information about extraction rules and how to contribute new rules please see
|
||||||
|
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
full_text_parser: FullTextParser,
|
full_text_parser: FullTextParser,
|
||||||
image_downloader: ImageDownloader,
|
image_downloader: ImageDownloader,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArticleScraper {
|
impl ArticleScraper {
|
||||||
|
|
||||||
|
/// Create a new ArticleScraper
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `user_configs` - optional path to a folder containing additional ftr config files
|
||||||
|
///
|
||||||
pub async fn new(user_configs: Option<&Path>) -> Self {
|
pub async fn new(user_configs: Option<&Path>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
full_text_parser: FullTextParser::new(user_configs).await,
|
full_text_parser: FullTextParser::new(user_configs).await,
|
||||||
|
@ -30,6 +82,25 @@ impl ArticleScraper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/// Download & extract content of a website
|
||||||
|
///
|
||||||
|
/// # Arguments
|
||||||
|
///
|
||||||
|
/// * `url` - Url to an article
|
||||||
|
/// * `download_images` - if images should be downloaded & embedded into the HTML
|
||||||
|
/// * `client` - reqwest HTTP client to use
|
||||||
|
/// * `progress` - optional progress notifications (only for image downloads)
|
||||||
|
///
|
||||||
|
/// # Examples
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// let scraper = ArticleScraper::new(None).await;
|
||||||
|
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
||||||
|
/// let client = Client::new();
|
||||||
|
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
|
/// let html = article.get_doc_content();
|
||||||
|
/// ```
|
||||||
pub async fn parse(
|
pub async fn parse(
|
||||||
&self,
|
&self,
|
||||||
url: &url::Url,
|
url: &url::Url,
|
||||||
|
|
|
@ -154,7 +154,7 @@ async fn extract_readability(
|
||||||
) {
|
) {
|
||||||
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
|
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
|
||||||
let html = get_html(html_file, source_url).await;
|
let html = get_html(html_file, source_url).await;
|
||||||
let result = match Readability::extract_from_str(&html, base_url).await {
|
let result = match Readability::extract(&html, base_url).await {
|
||||||
Ok(res) => res,
|
Ok(res) => res,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
log::error!("Failed to extract content with readability: {err}");
|
log::error!("Failed to extract content with readability: {err}");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue