diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs
index 2bd97bf..4ac6090 100644
--- a/article_scraper/src/clean.rs
+++ b/article_scraper/src/clean.rs
@@ -5,7 +5,7 @@
 use crate::full_text_parser::error::FullTextParserError;
 use crate::util::Util;
 use crate::{FtrConfigEntry, FullTextParser};
-/// Re-use crate internals to clean HTML of articles before 
+/// Re-use crate internals to clean HTML of articles before
 /// further processing:
 /// - replace H1 with H2
 /// - rename all font nodes to span
diff --git a/article_scraper/src/full_text_parser/readability/mod.rs b/article_scraper/src/full_text_parser/readability/mod.rs
index 5b483a8..e9c50bb 100644
--- a/article_scraper/src/full_text_parser/readability/mod.rs
+++ b/article_scraper/src/full_text_parser/readability/mod.rs
@@ -12,7 +12,7 @@
 use super::error::FullTextParserError;
 use crate::{constants, util::Util};
 /// Rust port of mozilla readability algorithm
-/// 
+///
 /// Used as fallback for `ArticleScraper` if no fitting config can be found
 pub struct Readability;

diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs
index 5e83ecc..ccaffe1 100644
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@@ -2,29 +2,29 @@
 //!
 //! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
 //! It contains two ways of locating the desired content
-//! 
+//!
 //! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
-//! 
+//!
 //! This makes use of website specific extraction rules. Which has the advantage of fast & accurate results.
 //! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
-//! 
+//!
 //! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 //! Please consider contributing new rules or updates to it.
-//! 
+//!
 //! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
-//! 
+//!
 //! ## 2. Mozilla Readability
-//! 
+//!
 //! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
 //! This re-implementation tries to mimic the original as closely as possible.
-//! 
+//!
 //! # Example
-//! 
+//!
 //! ```
 //! use article_scraper::ArticleScraper;
 //! use url::Url;
 //! use reqwest::Client;
-//! 
+//!
 //! let scraper = ArticleScraper::new(None);
 //! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
 //! let client = Client::new();
@@ -32,7 +32,6 @@
 //! let html = article.get_doc_content();
 //! ```

-
 mod article;
 pub mod clean;
 mod constants;
@@ -56,10 +55,10 @@
 use std::path::Path;
 use tokio::sync::mpsc::Sender;
 /// Download & extract meaningful content from websites
-/// 
+///
 /// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
 /// of mozilla Readability.
-/// 
+///
 /// For detailed information about extraction rules and how to contribute new rules please see
 /// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
 pub struct ArticleScraper {
@@ -68,7 +67,6 @@
 }

 impl ArticleScraper {
-
     /// Crate a new ArticleScraper
     ///
     /// # Arguments
@@ -82,7 +80,6 @@
         }
     }

-
     /// Download & extract content of a website
     ///
     /// # Arguments