Mirror of https://gitlab.com/news-flash/article_scraper.git, synced 2025-07-07 08:05:31 +02:00
Commit 57df2e6832: write some docs
Parent: bfb31dc188
5 changed files with 174 additions and 4 deletions

Readme.md (new file, +53)

@@ -0,0 +1,53 @@
# article scraper

The `article_scraper` crate provides a simple way to extract meaningful content from the web.
It offers two ways of locating the desired content:

## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)

This makes use of website-specific extraction rules, which has the advantage of fast and accurate results.
The disadvantages, however, are that the config needs to be updated as websites change and that a new extraction rule is needed for every website.

A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
Please consider contributing new rules or updates to it.
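
To give a feel for the format, here is a small illustrative rule in the site-config style; the site, the XPath expressions, and the file name are made up, and only a handful of the available directives are shown:

```
# example.com.txt: hypothetical site-config rule (illustrative only)
title: //h1[@class="headline"]
body: //div[@id="article-content"]
strip: //div[contains(@class, "advert")]
test_url: https://example.com/some-article
```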

`article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
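
A rough sketch of how loading custom rules might look; the folder name below is a placeholder:

```rust
use article_scraper::ArticleScraper;
use std::path::Path;

// Rules found in this folder are loaded in addition to the
// rules embedded from ftr-site-config (placeholder path).
let scraper = ArticleScraper::new(Some(Path::new("./user_configs"))).await;
```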

## 2. Mozilla Readability

In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
This re-implementation tries to mimic the original as closely as possible.
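
The Readability implementation can also be driven directly through `Readability::extract`. A minimal sketch, assuming `html` already holds the fetched page source and skipping proper error handling:

```rust
use article_scraper::Readability;
use url::Url;

// `base_url` is used to complete relative URLs in the extracted content.
let base_url = Url::parse("https://www.nytimes.com").unwrap();
let extracted_html = Readability::extract(&html, Some(base_url)).await.unwrap();
```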

# Example

```rust
use article_scraper::ArticleScraper;
use url::Url;
use reqwest::Client;

let scraper = ArticleScraper::new(None).await;
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap();
let html = article.get_doc_content();
```
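
Note that both `ArticleScraper::new` and `parse` are async, so the snippet above has to run inside an async runtime such as tokio.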

# CLI

Various features of this crate can be used via [`article_scraper_cli`](./article_scraper_cli/).

```
Usage: article_scraper_cli [OPTIONS] <COMMAND>

Commands:
  all          Use the complete pipeline
  readability  Only use the Readability parser
  ftr          Only use (a subset of) the Ftr parser
  help         Print this message or the help of the given subcommand(s)

Options:
  -d, --debug          Turn debug logging on
  -o, --output <FILE>  Destination of resulting HTML file
  -h, --help           Print help
  -V, --version        Print version
```
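
For instance, extracting an article with the complete pipeline might look like the following; the help text above does not show the per-subcommand arguments, so the trailing URL is an assumption:

```
$ article_scraper_cli --output article.html all https://example.com/some-article
```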

File diff suppressed because one or more lines are too long

@@ -11,10 +11,32 @@ use self::state::State;
 use super::error::FullTextParserError;
 use crate::{constants, util::Util};

+/// Rust port of the Mozilla Readability algorithm
+///
+/// Used as fallback for `ArticleScraper` if no fitting config can be found
 pub struct Readability;

 impl Readability {
-    pub async fn extract_from_str(
+    /// Parse HTML and extract meaningful content
+    ///
+    /// # Arguments
+    ///
+    /// * `html` - HTML of a website containing an article or similar content
+    /// * `base_url` - URL used to complete relative URLs
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let html = reqwest::get("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html")
+    ///     .await
+    ///     .unwrap()
+    ///     .text()
+    ///     .await
+    ///     .unwrap();
+    /// let base_url = Url::parse("https://www.nytimes.com").unwrap();
+    /// let extracted_content = Readability::extract(&html, Some(base_url)).await.unwrap();
+    /// ```
+    pub async fn extract(
         html: &str,
         base_url: Option<url::Url>,
     ) -> Result<String, FullTextParserError> {
@@ -55,7 +77,7 @@ impl Readability {
         Ok(html)
     }

-    pub fn extract_body(
+    pub(crate) fn extract_body(
         document: Document,
         root: &mut Node,
         title: Option<&str>,

@@ -1,15 +1,53 @@
+//! # article scraper
+//!
+//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
+//! It offers two ways of locating the desired content:
+//!
+//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
+//!
+//! This makes use of website-specific extraction rules, which has the advantage of fast and accurate results.
+//! The disadvantages, however, are that the config needs to be updated as websites change and that a new extraction rule is needed for every website.
+//!
+//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
+//! Please consider contributing new rules or updates to it.
+//!
+//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
+//!
+//! ## 2. Mozilla Readability
+//!
+//! In case the ftr-config-based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm is used as a fallback.
+//! This re-implementation tries to mimic the original as closely as possible.
+//!
+//! # Example
+//!
+//! ```
+//! use article_scraper::ArticleScraper;
+//! use url::Url;
+//! use reqwest::Client;
+//!
+//! let scraper = ArticleScraper::new(None).await;
+//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
+//! let client = Client::new();
+//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
+//! let html = article.get_doc_content();
+//! ```

 mod article;
 pub mod clean;
 mod constants;
 mod error;
 mod full_text_parser;
+#[doc(hidden)]
 pub mod images;
 mod util;

 use crate::images::Progress;
 use article::Article;
 use error::ScraperError;
+#[doc(hidden)]
 pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
+#[doc(hidden)]
 pub use full_text_parser::FullTextParser;
+pub use full_text_parser::Readability;
 use images::ImageDownloader;
@@ -17,12 +55,26 @@ use reqwest::Client;
 use std::path::Path;
 use tokio::sync::mpsc::Sender;

+/// Download & extract meaningful content from websites
+///
+/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
+/// of Mozilla Readability.
+///
+/// For detailed information about extraction rules and how to contribute new rules please see
+/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 pub struct ArticleScraper {
     full_text_parser: FullTextParser,
     image_downloader: ImageDownloader,
 }

 impl ArticleScraper {
+    /// Create a new ArticleScraper
+    ///
+    /// # Arguments
+    ///
+    /// * `user_configs` - optional path to a folder containing additional ftr config files
+    ///
     pub async fn new(user_configs: Option<&Path>) -> Self {
         Self {
             full_text_parser: FullTextParser::new(user_configs).await,
@@ -30,6 +82,25 @@ impl ArticleScraper {
         }
     }

+    /// Download & extract content of a website
+    ///
+    /// # Arguments
+    ///
+    /// * `url` - Url to an article
+    /// * `download_images` - if images should be downloaded & embedded into the HTML
+    /// * `client` - reqwest HTTP client to use
+    /// * `progress` - optional progress notifications (only for image downloads)
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// let scraper = ArticleScraper::new(None).await;
+    /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
+    /// let client = Client::new();
+    /// let article = scraper.parse(&url, false, &client, None).await.unwrap();
+    /// let html = article.get_doc_content();
+    /// ```
     pub async fn parse(
         &self,
         url: &url::Url,

@@ -154,7 +154,7 @@ async fn extract_readability(
 ) {
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
     let html = get_html(html_file, source_url).await;
-    let result = match Readability::extract_from_str(&html, base_url).await {
+    let result = match Readability::extract(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
             log::error!("Failed to extract content with readability: {err}");