
write some docs

This commit is contained in:
Jan Lukas Gernert 2023-04-23 16:35:00 +02:00
parent bfb31dc188
commit 57df2e6832
5 changed files with 174 additions and 4 deletions

Readme.md — new file (53 lines)

@@ -0,0 +1,53 @@
# article scraper
The `article_scraper` crate provides a simple way to extract meaningful content from the web.
It contains two ways of locating the desired content:
## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
This approach uses website-specific extraction rules, which has the advantage of fast and accurate results.
The disadvantages, however, are that the config needs to be updated as a website changes and that a new extraction rule is needed for every website.
A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
Please consider contributing new rules or updates to it.
`article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
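For example, loading custom or updated rules from a local folder could look like this (a minimal sketch inside an async context; the `./user_configs` folder name is only an illustration):
```rust
use article_scraper::ArticleScraper;
use std::path::Path;

// The embedded ftr-site-config rules stay available; rules found in the
// given folder are loaded in addition to them.
let scraper = ArticleScraper::new(Some(Path::new("./user_configs"))).await;
```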
## 2. Mozilla Readability
In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
This re-implementation tries to mimic the original as closely as possible.
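The Readability step can also be run on its own via the re-exported `Readability` type. A minimal sketch, assuming an async context and an illustrative URL:
```rust
use article_scraper::Readability;
use url::Url;

// Fetch the HTML yourself, then run only the Readability extraction.
let html = reqwest::get("https://example.com/article")
    .await
    .unwrap()
    .text()
    .await
    .unwrap();
let base_url = Url::parse("https://example.com").unwrap();
let extracted = Readability::extract(&html, Some(base_url)).await.unwrap();
```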
# Example
```rust
use article_scraper::ArticleScraper;
use url::Url;
use reqwest::Client;

let scraper = ArticleScraper::new(None).await;
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap();
let html = article.get_doc_content();
```
# CLI
Various features of this crate can be used via [`article_scraper_cli`](./article_scraper_cli/).
```
Usage: article_scraper_cli [OPTIONS] <COMMAND>

Commands:
  all          Use the complete pipeline
  readability  Only use the Readability parser
  ftr          Only use (a subset of) the Ftr parser
  help         Print this message or the help of the given subcommand(s)

Options:
  -d, --debug          Turn debug logging on
  -o, --output <FILE>  Destination of resulting HTML file
  -h, --help           Print help
  -V, --version        Print version
```

File diff suppressed because one or more lines are too long

@@ -11,10 +11,32 @@ use self::state::State;
use super::error::FullTextParserError;
use crate::{constants, util::Util};
/// Rust port of the Mozilla Readability algorithm
///
/// Used as a fallback for `ArticleScraper` if no fitting config can be found
pub struct Readability;
impl Readability {
pub async fn extract_from_str(
/// Parse HTML and extract meaningful content
///
/// # Arguments
///
/// * `html` - HTML of a website containing an article or similar content
/// * `base_url` - URL used to complete relative URLs
///
/// # Examples
///
/// ```
/// use article_scraper::Readability;
/// use url::Url;
///
/// let html = reqwest::get("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html")
///     .await
///     .unwrap()
///     .text()
///     .await
///     .unwrap();
/// let base_url = Url::parse("https://www.nytimes.com").unwrap();
/// let extracted_content = Readability::extract(&html, Some(base_url)).await.unwrap();
/// ```
pub async fn extract(
html: &str,
base_url: Option<url::Url>,
) -> Result<String, FullTextParserError> {
@@ -55,7 +77,7 @@ impl Readability {
Ok(html)
}
pub fn extract_body(
pub(crate) fn extract_body(
document: Document,
root: &mut Node,
title: Option<&str>,

@@ -1,15 +1,53 @@
//! # article scraper
//!
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
//! It contains two ways of locating the desired content:
//!
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
//!
//! This approach uses website-specific extraction rules, which has the advantage of fast and accurate results.
//! The disadvantages, however, are that the config needs to be updated as a website changes and that a new extraction rule is needed for every website.
//!
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
//! Please consider contributing new rules or updates to it.
//!
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
//!
//! ## 2. Mozilla Readability
//!
//! In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
//! This re-implementation tries to mimic the original as closely as possible.
//!
//! # Example
//!
//! ```
//! use article_scraper::ArticleScraper;
//! use url::Url;
//! use reqwest::Client;
//!
//! let scraper = ArticleScraper::new(None).await;
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//! let client = Client::new();
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
//! let html = article.get_doc_content();
//! ```
mod article;
pub mod clean;
mod constants;
mod error;
mod full_text_parser;
#[doc(hidden)]
pub mod images;
mod util;
use crate::images::Progress;
use article::Article;
use error::ScraperError;
#[doc(hidden)]
pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
#[doc(hidden)]
pub use full_text_parser::FullTextParser;
pub use full_text_parser::Readability;
use images::ImageDownloader;
@@ -17,12 +55,26 @@ use reqwest::Client;
use std::path::Path;
use tokio::sync::mpsc::Sender;
/// Download & extract meaningful content from websites
///
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
/// to Mozilla Readability.
///
/// For detailed information about extraction rules and how to contribute new rules please see
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
pub struct ArticleScraper {
full_text_parser: FullTextParser,
image_downloader: ImageDownloader,
}
impl ArticleScraper {
/// Create a new `ArticleScraper`
///
/// # Arguments
///
/// * `user_configs` - optional path to a folder containing additional ftr config files
///
pub async fn new(user_configs: Option<&Path>) -> Self {
Self {
full_text_parser: FullTextParser::new(user_configs).await,
@@ -30,6 +82,25 @@ impl ArticleScraper {
}
}
/// Download & extract content of a website
///
/// # Arguments
///
/// * `url` - Url to an article
/// * `download_images` - whether images should be downloaded & embedded into the HTML
/// * `client` - reqwest HTTP client to use
/// * `progress` - optional progress notifications (only for image downloads)
///
/// # Examples
///
/// ```
/// use article_scraper::ArticleScraper;
/// use url::Url;
/// use reqwest::Client;
///
/// let scraper = ArticleScraper::new(None).await;
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
/// let client = Client::new();
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
/// let html = article.get_doc_content();
/// ```
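///
/// A sketch of wiring up the optional progress channel, assuming `progress`
/// is a `tokio::sync::mpsc::Sender<Progress>` (the channel capacity here is arbitrary):
///
/// ```
/// use tokio::sync::mpsc;
///
/// let (tx, mut rx) = mpsc::channel(8);
/// tokio::spawn(async move {
///     while let Some(_progress) = rx.recv().await {
///         // react to image download progress here
///     }
/// });
/// let article = scraper.parse(&url, true, &client, Some(tx)).await.unwrap();
/// ```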
pub async fn parse(
&self,
url: &url::Url,

@@ -154,7 +154,7 @@ async fn extract_readability(
) {
let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
let html = get_html(html_file, source_url).await;
let result = match Readability::extract_from_str(&html, base_url).await {
let result = match Readability::extract(&html, base_url).await {
Ok(res) => res,
Err(err) => {
log::error!("Failed to extract content with readability: {err}");