mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
fixes
This commit is contained in:
parent
e0ccd7e0b3
commit
582834cdf1
6 changed files with 17 additions and 16 deletions
|
@ -29,7 +29,6 @@ let scraper = ArticleScraper::new(None);
|
||||||
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
|
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
|
||||||
let client = Client::new();
|
let client = Client::new();
|
||||||
let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
let html = article.get_doc_content();
|
|
||||||
```
|
```
|
||||||
|
|
||||||
# CLI
|
# CLI
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
<article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"/></article>
|
<article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"><empty></empty></iframe></article>
|
|
@ -84,18 +84,28 @@ impl FullTextParser {
|
||||||
.download_all_pages(html, client, config, global_config, &url)
|
.download_all_pages(html, client, config, global_config, &url)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
self.parse_offline(pages, config, global_config, Some(url))
|
self.parse_offline(pages, config, Some(url))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn parse_offline(
|
pub fn parse_offline(
|
||||||
&self,
|
&self,
|
||||||
pages: Vec<String>,
|
pages: Vec<String>,
|
||||||
config: Option<&ConfigEntry>,
|
config: Option<&ConfigEntry>,
|
||||||
global_config: &ConfigEntry,
|
|
||||||
url: Option<Url>,
|
url: Option<Url>,
|
||||||
) -> Result<Article, FullTextParserError> {
|
) -> Result<Article, FullTextParserError> {
|
||||||
let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
|
let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
|
||||||
|
|
||||||
|
let config = if config.is_none() {
|
||||||
|
self.get_grabber_config(&url)
|
||||||
|
} else {
|
||||||
|
config
|
||||||
|
};
|
||||||
|
|
||||||
|
let global_config = self
|
||||||
|
.config_files
|
||||||
|
.get("global.txt")
|
||||||
|
.ok_or(FullTextParserError::Config)?;
|
||||||
|
|
||||||
let mut article = Article {
|
let mut article = Article {
|
||||||
title: None,
|
title: None,
|
||||||
author: None,
|
author: None,
|
||||||
|
@ -1033,7 +1043,8 @@ impl FullTextParser {
|
||||||
let xpath = "//*[not(node())]";
|
let xpath = "//*[not(node())]";
|
||||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||||
for mut node in node_vec {
|
for mut node in node_vec {
|
||||||
if node.get_name() == "meta" {
|
let name = node.get_name().to_lowercase();
|
||||||
|
if name == "meta" || name == "img" || name == "br" {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -14,9 +14,7 @@ async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&st
|
||||||
.expect("Failed to read source HTML");
|
.expect("Failed to read source HTML");
|
||||||
|
|
||||||
let parser = FullTextParser::new(None).await;
|
let parser = FullTextParser::new(None).await;
|
||||||
let article = parser
|
let article = parser.parse_offline(vec![html], None, Some(url)).unwrap();
|
||||||
.parse_offline(vec![html], None, &ConfigEntry::default(), Some(url))
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let content = article.html.unwrap();
|
let content = article.html.unwrap();
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,6 @@
|
||||||
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
||||||
//! let client = Client::new();
|
//! let client = Client::new();
|
||||||
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
//! let html = article.get_doc_content();
|
|
||||||
//! }
|
//! }
|
||||||
//! ```
|
//! ```
|
||||||
|
|
||||||
|
@ -105,7 +104,6 @@ impl ArticleScraper {
|
||||||
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
|
||||||
/// let client = Client::new();
|
/// let client = Client::new();
|
||||||
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
|
||||||
/// let html = article.get_doc_content();
|
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
pub async fn parse(
|
pub async fn parse(
|
||||||
|
|
|
@ -119,12 +119,7 @@ async fn extract_ftr(
|
||||||
};
|
};
|
||||||
|
|
||||||
let full_text_parser = FullTextParser::new(None).await;
|
let full_text_parser = FullTextParser::new(None).await;
|
||||||
let article = match full_text_parser.parse_offline(
|
let article = match full_text_parser.parse_offline(vec![html], config.as_ref(), base_url) {
|
||||||
vec![html],
|
|
||||||
config.as_ref(),
|
|
||||||
&FtrConfigEntry::default(),
|
|
||||||
base_url,
|
|
||||||
) {
|
|
||||||
Ok(res) => res,
|
Ok(res) => res,
|
||||||
Err(err) => {
|
Err(err) => {
|
||||||
log::error!("Failed to extract content with ftr: {err}");
|
log::error!("Failed to extract content with ftr: {err}");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue