1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-04-23 16:37:06 +02:00
parent 57df2e6832
commit 1695e33f9e
3 changed files with 13 additions and 16 deletions

View file

@ -5,7 +5,7 @@ use crate::full_text_parser::error::FullTextParserError;
use crate::util::Util;
use crate::{FtrConfigEntry, FullTextParser};
/// Re-use crate internals to clean HTML of articles before
/// Re-use crate internals to clean HTML of articles before
/// further processing:
/// - replace H1 with H2
/// - rename all font nodes to span

View file

@ -12,7 +12,7 @@ use super::error::FullTextParserError;
use crate::{constants, util::Util};
/// Rust port of the Mozilla Readability algorithm
///
///
/// Used as fallback for `ArticleScraper` if no fitting config can be found
pub struct Readability;

View file

@ -2,29 +2,29 @@
//!
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
//! It contains two ways of locating the desired content:
//!
//!
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
//!
//!
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
//! The disadvantages, however, are that the config needs to be updated as the website changes and that a new extraction rule is needed for every website.
//!
//!
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
//! Please consider contributing new rules or updates to it.
//!
//!
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
//!
//!
//! ## 2. Mozilla Readability
//!
//!
//! In case the ftr-config based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fallback.
//! This re-implementation tries to mimic the original as closely as possible.
//!
//!
//! # Example
//!
//!
//! ```
//! use article_scraper::ArticleScraper;
//! use url::Url;
//! use reqwest::Client;
//!
//!
//! let scraper = ArticleScraper::new(None);
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
//! let client = Client::new();
@ -32,7 +32,6 @@
//! let html = article.get_doc_content();
//! ```
mod article;
pub mod clean;
mod constants;
@ -56,10 +55,10 @@ use std::path::Path;
use tokio::sync::mpsc::Sender;
/// Download & extract meaningful content from websites
///
///
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
/// of Mozilla Readability.
///
///
/// For detailed information about extraction rules and how to contribute new rules please see
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
pub struct ArticleScraper {
@ -68,7 +67,6 @@ pub struct ArticleScraper {
}
impl ArticleScraper {
/// Create a new ArticleScraper
///
/// # Arguments
@ -82,7 +80,6 @@ impl ArticleScraper {
}
}
/// Download & extract content of a website
///
/// # Arguments