mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fmt
This commit is contained in:
parent
57df2e6832
commit
1695e33f9e
3 changed files with 13 additions and 16 deletions
|
@ -5,7 +5,7 @@ use crate::full_text_parser::error::FullTextParserError;
|
||||||
use crate::util::Util;
|
use crate::util::Util;
|
||||||
use crate::{FtrConfigEntry, FullTextParser};
|
use crate::{FtrConfigEntry, FullTextParser};
|
||||||
|
|
||||||
/// Re-use crate internals to clean HTML of articles before
|
/// Re-use crate internals to clean HTML of articles before
|
||||||
/// further processing:
|
/// further processing:
|
||||||
/// - replace H1 with H2
|
/// - replace H1 with H2
|
||||||
/// - rename all font nodes to span
|
/// - rename all font nodes to span
|
||||||
|
|
|
@ -12,7 +12,7 @@ use super::error::FullTextParserError;
|
||||||
use crate::{constants, util::Util};
|
use crate::{constants, util::Util};
|
||||||
|
|
||||||
/// Rust port of mozilla readability algorithm
|
/// Rust port of mozilla readability algorithm
|
||||||
///
|
///
|
||||||
/// Used as fallback for `ArticleScraper` if no fitting config can be found
|
/// Used as fallback for `ArticleScraper` if no fitting config can be found
|
||||||
pub struct Readability;
|
pub struct Readability;
|
||||||
|
|
||||||
|
|
|
@ -2,29 +2,29 @@
|
||||||
//!
|
//!
|
||||||
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
|
//! The `article_scraper` crate provides a simple way to extract meaningful content from the web.
|
||||||
//! It contains two ways of locating the desired content
|
//! It contains two ways of locating the desired content
|
||||||
//!
|
//!
|
||||||
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
|
//! ## 1. Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/)
|
||||||
//!
|
//!
|
||||||
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
|
//! This makes use of website-specific extraction rules, which has the advantage of fast & accurate results.
|
||||||
//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
|
//! The disadvantages however are: the config needs to be updated as the website changes and a new extraction rule is needed for every website.
|
||||||
//!
|
//!
|
||||||
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
//! A central repository of extraction rules and information about writing your own rules can be found here: [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||||
//! Please consider contributing new rules or updates to it.
|
//! Please consider contributing new rules or updates to it.
|
||||||
//!
|
//!
|
||||||
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
|
//! `article_scraper` embeds all the rules in the ftr-site-config repository for convenience. Custom and updated rules can be loaded from a `user_configs` path.
|
||||||
//!
|
//!
|
||||||
//! ## 2. Mozilla Readability
|
//! ## 2. Mozilla Readability
|
||||||
//!
|
//!
|
||||||
//! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
|
//! In case the ftr-config based extraction fails the [mozilla Readability](https://github.com/mozilla/readability) algorithm will be used as a fall-back.
|
||||||
//! This re-implementation tries to mimic the original as closely as possible.
|
//! This re-implementation tries to mimic the original as closely as possible.
|
||||||
//!
|
//!
|
||||||
//! # Example
|
//! # Example
|
||||||
//!
|
//!
|
||||||
//! ```
|
//! ```
|
||||||
//! use article_scraper::ArticleScraper;
|
//! use article_scraper::ArticleScraper;
|
||||||
//! use url::Url;
|
//! use url::Url;
|
||||||
//! use reqwest::Client;
|
//! use reqwest::Client;
|
||||||
//!
|
//!
|
||||||
//! let scraper = ArticleScraper::new(None);
|
//! let scraper = ArticleScraper::new(None);
|
||||||
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
|
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
|
||||||
//! let client = Client::new();
|
//! let client = Client::new();
|
||||||
|
@ -32,7 +32,6 @@
|
||||||
//! let html = article.get_doc_content();
|
//! let html = article.get_doc_content();
|
||||||
//! ```
|
//! ```
|
||||||
|
|
||||||
|
|
||||||
mod article;
|
mod article;
|
||||||
pub mod clean;
|
pub mod clean;
|
||||||
mod constants;
|
mod constants;
|
||||||
|
@ -56,10 +55,10 @@ use std::path::Path;
|
||||||
use tokio::sync::mpsc::Sender;
|
use tokio::sync::mpsc::Sender;
|
||||||
|
|
||||||
/// Download & extract meaningful content from websites
|
/// Download & extract meaningful content from websites
|
||||||
///
|
///
|
||||||
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
|
/// Rust implementation of [Full-Text RSS](https://www.fivefilters.org/full-text-rss/) with an additional fallback
|
||||||
/// of mozilla Readability.
|
/// of mozilla Readability.
|
||||||
///
|
///
|
||||||
/// For detailed information about extraction rules and how to contribute new rules please see
|
/// For detailed information about extraction rules and how to contribute new rules please see
|
||||||
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
/// [ftr-site-config](https://github.com/fivefilters/ftr-site-config).
|
||||||
pub struct ArticleScraper {
|
pub struct ArticleScraper {
|
||||||
|
@ -68,7 +67,6 @@ pub struct ArticleScraper {
|
||||||
}
|
}
|
||||||
|
|
||||||
impl ArticleScraper {
|
impl ArticleScraper {
|
||||||
|
|
||||||
/// Create a new ArticleScraper
|
/// Create a new ArticleScraper
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
|
@ -82,7 +80,6 @@ impl ArticleScraper {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/// Download & extract content of a website
|
/// Download & extract content of a website
|
||||||
///
|
///
|
||||||
/// # Arguments
|
/// # Arguments
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue