1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00
This commit is contained in:
Jan Lukas Gernert 2023-06-21 23:48:09 +02:00
parent e0ccd7e0b3
commit 582834cdf1
6 changed files with 17 additions and 16 deletions

View file

@ -29,7 +29,6 @@ let scraper = ArticleScraper::new(None);
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html"); let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
let client = Client::new(); let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap(); let article = scraper.parse(&url, false, &client, None).await.unwrap();
let html = article.get_doc_content();
``` ```
# CLI # CLI

View file

@ -1 +1 @@
<article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"/></article> <article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"><empty></empty></iframe></article>

View file

@ -84,18 +84,28 @@ impl FullTextParser {
.download_all_pages(html, client, config, global_config, &url) .download_all_pages(html, client, config, global_config, &url)
.await?; .await?;
self.parse_offline(pages, config, global_config, Some(url)) self.parse_offline(pages, config, Some(url))
} }
pub fn parse_offline( pub fn parse_offline(
&self, &self,
pages: Vec<String>, pages: Vec<String>,
config: Option<&ConfigEntry>, config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
url: Option<Url>, url: Option<Url>,
) -> Result<Article, FullTextParserError> { ) -> Result<Article, FullTextParserError> {
let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap()); let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
let config = if config.is_none() {
self.get_grabber_config(&url)
} else {
config
};
let global_config = self
.config_files
.get("global.txt")
.ok_or(FullTextParserError::Config)?;
let mut article = Article { let mut article = Article {
title: None, title: None,
author: None, author: None,
@ -1033,7 +1043,8 @@ impl FullTextParser {
let xpath = "//*[not(node())]"; let xpath = "//*[not(node())]";
let node_vec = Util::evaluate_xpath(context, xpath, false)?; let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec { for mut node in node_vec {
if node.get_name() == "meta" { let name = node.get_name().to_lowercase();
if name == "meta" || name == "img" || name == "br" {
continue; continue;
} }

View file

@ -14,9 +14,7 @@ async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&st
.expect("Failed to read source HTML"); .expect("Failed to read source HTML");
let parser = FullTextParser::new(None).await; let parser = FullTextParser::new(None).await;
let article = parser let article = parser.parse_offline(vec![html], None, Some(url)).unwrap();
.parse_offline(vec![html], None, &ConfigEntry::default(), Some(url))
.unwrap();
let content = article.html.unwrap(); let content = article.html.unwrap();

View file

@ -30,7 +30,6 @@
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap(); //! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//! let client = Client::new(); //! let client = Client::new();
//! let article = scraper.parse(&url, false, &client, None).await.unwrap(); //! let article = scraper.parse(&url, false, &client, None).await.unwrap();
//! let html = article.get_doc_content();
//! } //! }
//! ``` //! ```
@ -105,7 +104,6 @@ impl ArticleScraper {
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap(); /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
/// let client = Client::new(); /// let client = Client::new();
/// let article = scraper.parse(&url, false, &client, None).await.unwrap(); /// let article = scraper.parse(&url, false, &client, None).await.unwrap();
/// let html = article.get_doc_content();
/// } /// }
/// ``` /// ```
pub async fn parse( pub async fn parse(

View file

@ -119,12 +119,7 @@ async fn extract_ftr(
}; };
let full_text_parser = FullTextParser::new(None).await; let full_text_parser = FullTextParser::new(None).await;
let article = match full_text_parser.parse_offline( let article = match full_text_parser.parse_offline(vec![html], config.as_ref(), base_url) {
vec![html],
config.as_ref(),
&FtrConfigEntry::default(),
base_url,
) {
Ok(res) => res, Ok(res) => res,
Err(err) => { Err(err) => {
log::error!("Failed to extract content with ftr: {err}"); log::error!("Failed to extract content with ftr: {err}");