Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-07 08:05:31 +02:00
Commit 57df2e6832: write some docs
Parent: bfb31dc188
5 changed files with 174 additions and 4 deletions

Readme.md (new file, +53)

@@ -0,0 +1,53 @@
# article scraper

The `article_scraper` crate provides a simple way to extract meaningful content from the web.
It offers two ways of locating the desired content:

## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)

This makes use of website-specific extraction rules, which has the advantage of fast and accurate results.
The disadvantages, however, are that the config needs to be updated as websites change and that a new extraction rule is needed for every website.

A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
Please consider contributing new rules or updates to it.
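
To give a feel for the format, here is a small illustrative rule in the site-config style; the site, the XPath expressions, and the file name are made up, and only a handful of the available directives are shown:

```
# example.com.txt: hypothetical site-config rule (illustrative only)
title: //h1[@class="headline"]
body: //div[@id="article-content"]
strip: //div[contains(@class, "advert")]
test_url: https://example.com/some-article
```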

`article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
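
A rough sketch of how loading custom rules might look; the folder name below is a placeholder:

```rust
use article_scraper::ArticleScraper;
use std::path::Path;

// Rules found in this folder are loaded in addition to the
// rules embedded from ftr-site-config (placeholder path).
let scraper = ArticleScraper::new(Some(Path::new("./user_configs"))).await;
```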

## 2. Mozilla Readability

In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
This re-implementation tries to mimic the original as closely as possible.
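
The Readability implementation can also be driven directly through `Readability::extract`. A minimal sketch, assuming `html` already holds the fetched page source and skipping proper error handling:

```rust
use article_scraper::Readability;
use url::Url;

// `base_url` is used to complete relative URLs in the extracted content.
let base_url = Url::parse("https://www.nytimes.com").unwrap();
let extracted_html = Readability::extract(&html, Some(base_url)).await.unwrap();
```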

# Example

```rust
use article_scraper::ArticleScraper;
use url::Url;
use reqwest::Client;

let scraper = ArticleScraper::new(None).await;
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap();
let html = article.get_doc_content();
```
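
Note that both `ArticleScraper::new` and `parse` are async, so the snippet above has to run inside an async runtime such as tokio.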

# CLI

Various features of this crate can be used via [`article_scraper_cli`](./article_scraper_cli/).

```
Usage: article_scraper_cli [OPTIONS] <COMMAND>

Commands:
  all          Use the complete pipeline
  readability  Only use the Readability parser
  ftr          Only use (a subset of) the Ftr parser
  help         Print this message or the help of the given subcommand(s)

Options:
  -d, --debug          Turn debug logging on
  -o, --output <FILE>  Destination of resulting HTML file
  -h, --help           Print help
  -V, --version        Print version
```
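
For instance, extracting an article with the complete pipeline might look like the following; the help text above does not show the per-subcommand arguments, so the trailing URL is an assumption:

```
$ article_scraper_cli --output article.html all https://example.com/some-article
```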

File diff suppressed because one or more lines are too long

@@ -11,10 +11,32 @@ use self::state::State;
 use super::error::FullTextParserError;
 use crate::{constants, util::Util};

+/// Rust port of the Mozilla Readability algorithm
+///
+/// Used as fallback for `ArticleScraper` if no fitting config can be found
 pub struct Readability;

 impl Readability {
-    pub async fn extract_from_str(
+    /// Parse HTML and extract meaningful content
+    ///
+    /// # Arguments
+    ///
+    /// * `html` - HTML of a website containing an article or similar content
+    /// * `base_url` - URL used to complete relative URLs
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let html = reqwest::get("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html")
+    ///     .await
+    ///     .unwrap()
+    ///     .text()
+    ///     .await
+    ///     .unwrap();
+    /// let base_url = Url::parse("https://www.nytimes.com").unwrap();
+    /// let extracted_content = Readability::extract(&html, Some(base_url)).await.unwrap();
+    /// ```
+    pub async fn extract(
         html: &str,
         base_url: Option<url::Url>,
     ) -> Result<String, FullTextParserError> {
@@ -55,7 +77,7 @@ impl Readability {
         Ok(html)
     }

-    pub fn extract_body(
+    pub(crate) fn extract_body(
         document: Document,
         root: &mut Node,
         title: Option<&str>,

@@ -1,15 +1,53 @@
+//! # article scraper
+//!
+//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
+//! It offers two ways of locating the desired content:
+//!
+//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
+//!
+//! This makes use of website-specific extraction rules, which has the advantage of fast and accurate results.
+//! The disadvantages, however, are that the config needs to be updated as websites change and that a new extraction rule is needed for every website.
+//!
+//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
+//! Please consider contributing new rules or updates to it.
+//!
+//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
+//!
+//! ## 2. Mozilla Readability
+//!
+//! In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
+//! This re-implementation tries to mimic the original as closely as possible.
+//!
+//! # Example
+//!
+//! ```
+//! use article_scraper::ArticleScraper;
+//! use url::Url;
+//! use reqwest::Client;
+//!
+//! let scraper = ArticleScraper::new(None).await;
+//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
+//! let client = Client::new();
+//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
+//! let html = article.get_doc_content();
+//! ```

 mod article;
 pub mod clean;
 mod constants;
 mod error;
 mod full_text_parser;
+#[doc(hidden)]
 pub mod images;
 mod util;

 use crate::images::Progress;
 use article::Article;
 use error::ScraperError;
+#[doc(hidden)]
 pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
+#[doc(hidden)]
 pub use full_text_parser::FullTextParser;
+pub use full_text_parser::Readability;
 use images::ImageDownloader;
@@ -17,12 +55,26 @@ use reqwest::Client;
 use std::path::Path;
 use tokio::sync::mpsc::Sender;

+/// Download & extract meaningful content from websites
+///
+/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
+/// of Mozilla Readability.
+///
+/// For detailed information about extraction rules and how to contribute new rules please see
+/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 pub struct ArticleScraper {
     full_text_parser: FullTextParser,
     image_downloader: ImageDownloader,
 }

 impl ArticleScraper {
+    /// Create a new ArticleScraper
+    ///
+    /// # Arguments
+    ///
+    /// * `user_configs` - optional path to a folder containing additional ftr config files
+    ///
     pub async fn new(user_configs: Option<&Path>) -> Self {
         Self {
             full_text_parser: FullTextParser::new(user_configs).await,
@@ -30,6 +82,25 @@ impl ArticleScraper {
         }
     }

+    /// Download & extract content of a website
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - Url to an article
+    /// * `download_images` - if images should be downloaded & embedded into the HTML
+    /// * `client` - reqwest HTTP client to use
+    /// * `progress` - optional progress notifications (only for image downloads)
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let scraper = ArticleScraper::new(None).await;
+    /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
+    /// let client = Client::new();
+    /// let article = scraper.parse(&url, false, &client, None).await.unwrap();
+    /// let html = article.get_doc_content();
+    /// ```
     pub async fn parse(
         &self,
         url: &url::Url,

@@ -154,7 +154,7 @@ async fn extract_readability(
 ) {
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
     let html = get_html(html_file, source_url).await;
-    let result = match Readability::extract_from_str(&html, base_url).await {
+    let result = match Readability::extract(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
             log::error!("Failed to extract content with readability: {err}");