write some docs

2025-07-07 16:15:32 +02:00 · 2023-04-23 16:35:00 +02:00 · 2023-04-23 16:35:00 +02:00 · 57df2e6832
commit 57df2e6832
parent bfb31dc188
5 changed files with 174 additions and 4 deletions
--- a/Readme.md
+++ b/Readme.md
@ -0,0 +1,53 @@
 # article scraper
 The `article_scraper` crate provides a simple way to extract meaningful content from the web.
 It contains two ways of locating the desired content:
 ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
 This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
 The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
 A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 Please consider contributing new rules or updates to it.
 `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
 ## 2. Mozilla Readability
 In case the ftr-config based extraction fails, the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
 This re-implementation tries to mimic the original as closely as possible.
 # Example
 ```
 use article_scraper::ArticleScraper;
 use url::Url;
 use reqwest::Client;
 let scraper = ArticleScraper::new(None).await;
 let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
 let client = Client::new();
 let article = scraper.parse(&url, false, &client, None).await.unwrap();
 let html = article.get_doc_content();
 ```
 # CLI
 Various features of this crate can be used via [`article_scraper_cli`](./article_scraper_cli/).
 ```
 Usage: article_scraper_cli [OPTIONS] <COMMAND>
 Commands:
  all          Use the complete pipeline
  readability  Only use the Readability parser
  ftr          Only use (a subset of) the Ftr parser
  help         Print this message or the help of the given subcommand(s)
 Options:
  -d, --debug          Turn debug logging on
  -o, --output <FILE>  Destination of resulting HTML file
  -h, --help           Print help
  -V, --version        Print version
 ```
--- a/article_scraper/src/clean.rs
+++ b/article_scraper/src/clean.rs
--- a/article_scraper/src/full_text_parser/readability/mod.rs
+++ b/article_scraper/src/full_text_parser/readability/mod.rs
@ -11,10 +11,32 @@ use self::state::State;
 use super::error::FullTextParserError;
 use crate::{constants, util::Util};
 /// Rust port of mozilla readability algorithm
 /// 
 /// Used as fallback for `ArticleScraper` if no fitting config can be found
 pub struct Readability;
 impl Readability {
-    pub async fn extract_from_str(
+    /// Parse HTML and extract meaningful content
    ///
    /// # Arguments
    ///
    /// * `html` - HTML of a website containing an article or similar content
    /// * `base_url` - URL used to complete relative URLs
    ///
    /// # Examples
    ///
    /// ```
    /// let html = reqwest::get("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html")
    ///     .await
    ///     .unwrap()
    ///     .text()
    ///     .await
    ///     .unwrap();
    /// let base_url = Url::parse("https://www.nytimes.com").unwrap();
    /// let extracted_content = Readability::extract(&html, Some(base_url)).await.unwrap();
    /// ```
    pub async fn extract(
        html: &str,
        base_url: Option<url::Url>,
    ) -> Result<String, FullTextParserError> {
@ -55,7 +77,7 @@ impl Readability {
        Ok(html)
    }
-    pub fn extract_body(
+    pub(crate) fn extract_body(
        document: Document,
        root: &mut Node,
        title: Option<&str>,
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@ -1,15 +1,53 @@
 //! # article scraper
 //!
 //! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
 //! It contains two ways of locating the desired content:
 //! 
 //! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
 //! 
 //! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
 //! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
 //! 
 //! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 //! Please consider contributing new rules or updates to it.
 //! 
 //! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
 //! 
 //! ## 2. Mozilla Readability
 //! 
 //! In case the ftr-config based extraction fails, the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
 //! This re-implementation tries to mimic the original as closely as possible.
 //! 
 //! # Example
 //! 
 //! ```
 //! use article_scraper::ArticleScraper;
 //! use url::Url;
 //! use reqwest::Client;
 //! 
 //! let scraper = ArticleScraper::new(None).await;
 //! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
 //! let client = Client::new();
 //! let article = scraper.parse(&url, false, &client, None).await.unwrap();
 //! let html = article.get_doc_content();
 //! ```
 mod article;
 pub mod clean;
 mod constants;
 mod error;
 mod full_text_parser;
 #[doc(hidden)]
 pub mod images;
 mod util;
 use crate::images::Progress;
 use article::Article;
 use error::ScraperError;
 #[doc(hidden)]
 pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
 #[doc(hidden)]
 pub use full_text_parser::FullTextParser;
 pub use full_text_parser::Readability;
 use images::ImageDownloader;
@ -17,12 +55,26 @@ use reqwest::Client;
 use std::path::Path;
 use tokio::sync::mpsc::Sender;
 /// Download & extract meaningful content from websites
 /// 
 /// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
 /// of mozilla Readability.
 /// 
 /// For detailed information about extraction rules and how to contribute new rules please see
 /// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 pub struct ArticleScraper {
    // Parser built in `new` from the optional `user_configs` path.
    full_text_parser: FullTextParser,
    // NOTE(review): presumably performs the image downloads requested via
    // `download_images` in `parse` — confirm against the `parse` body.
    image_downloader: ImageDownloader,
 }
 impl ArticleScraper {
    /// Create a new ArticleScraper
    ///
    /// # Arguments
    ///
    /// * `user_configs` - optional path to a folder containing additional ftr config files
    ///
    pub async fn new(user_configs: Option<&Path>) -> Self {
        Self {
            full_text_parser: FullTextParser::new(user_configs).await,
@ -30,6 +82,25 @@ impl ArticleScraper {
        }
    }
    /// Download & extract content of a website
    ///
    /// # Arguments
    ///
    /// * `url` - Url to an article
    /// * `download_images` - if images should be downloaded & embedded into the HTML
    /// * `client` - reqwest HTTP client to use
    /// * `progress` - optional progress notifications (only for image downloads)
    ///
    /// # Examples
    ///
    /// ```
    /// let scraper = ArticleScraper::new(None).await;
    /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
    /// let client = Client::new();
    /// let article = scraper.parse(&url, false, &client, None).await.unwrap();
    /// let html = article.get_doc_content();
    /// ```
    pub async fn parse(
        &self,
        url: &url::Url,
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@ -154,7 +154,7 @@ async fn extract_readability(
 ) {
    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
    let html = get_html(html_file, source_url).await;
-    let result = match Readability::extract_from_str(&html, base_url).await {
+    let result = match Readability::extract(&html, base_url).await {
        Ok(res) => res,
        Err(err) => {
            log::error!("Failed to extract content with readability: {err}");