mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fmt
This commit is contained in:
parent
57df2e6832
commit
1695e33f9e
3 changed files with 13 additions and 16 deletions
|
@ -5,7 +5,7 @@ use crate::full_text_parser::error::FullTextParserError;
|
|||
use crate::util::Util;
|
||||
use crate::{FtrConfigEntry, FullTextParser};
|
||||
|
||||
/// Re-use crate internals to clean HTML of articles before
|
||||
/// Re-use crate internals to clean HTML of articles before
|
||||
/// further processing:
|
||||
/// - replace H1 with H2
|
||||
/// - rename all font nodes to span
|
||||
|
|
|
@ -12,7 +12,7 @@ use super::error::FullTextParserError;
|
|||
use crate::{constants, util::Util};
|
||||
|
||||
/// Rust port of the Mozilla Readability algorithm
|
||||
///
|
||||
///
|
||||
/// Used as fallback for `ArticleScraper` if no fitting config can be found
|
||||
pub struct Readability;
|
||||
|
||||
|
|
|
@ -2,29 +2,29 @@
|
|||
//!
|
||||
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
|
||||
//! It contains two ways of locating the desired content
|
||||
//!
|
||||
//!
|
||||
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
|
||||
//!
|
||||
//!
|
||||
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
|
||||
//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
|
||||
//!
|
||||
//!
|
||||
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||
//! Please consider contributing new rules or updates to it.
|
||||
//!
|
||||
//!
|
||||
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
|
||||
//!
|
||||
//!
|
||||
//! ## 2. Mozilla Readability
|
||||
//!
|
||||
//!
|
||||
//! In case the ftr-config based extraction fails, the [Mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fallback.
|
||||
//! This re-implementation tries to mimic the original as closely as possible.
|
||||
//!
|
||||
//!
|
||||
//! # Example
|
||||
//!
|
||||
//!
|
||||
//! ```
|
||||
//! use article_scraper::ArticleScraper;
|
||||
//! use url::Url;
|
||||
//! use reqwest::Client;
|
||||
//!
|
||||
//!
|
||||
//! let scraper = ArticleScraper::new(None);
|
||||
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
|
||||
//! let client = Client::new();
|
||||
|
@ -32,7 +32,6 @@
|
|||
//! let html = article.get_doc_content();
|
||||
//! ```
|
||||
|
||||
|
||||
mod article;
|
||||
pub mod clean;
|
||||
mod constants;
|
||||
|
@ -56,10 +55,10 @@ use std::path::Path;
|
|||
use tokio::sync::mpsc::Sender;
|
||||
|
||||
/// Download & extract meaningful content from websites
|
||||
///
|
||||
///
|
||||
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
|
||||
/// of mozilla Readability.
|
||||
///
|
||||
///
|
||||
/// For detailed information about extraction rules and how to contribute new rules please see
|
||||
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||
pub struct ArticleScraper {
|
||||
|
@ -68,7 +67,6 @@ pub struct ArticleScraper {
|
|||
}
|
||||
|
||||
impl ArticleScraper {
|
||||
|
||||
/// Create a new ArticleScraper
|
||||
///
|
||||
/// # Arguments
|
||||
|
@ -82,7 +80,6 @@ impl ArticleScraper {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/// Download & extract content of a website
|
||||
///
|
||||
/// # Arguments
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue