diff --git a/Readme.md b/Readme.md
index 91cf31e..ca90ee4 100644
--- a/Readme.md
+++ b/Readme.md
@@ -29,7 +29,6 @@ let scraper = ArticleScraper::new(None);
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap();
-let html = article.get_doc_content();
```
# CLI
diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html
index 652d569..1213034 100644
--- a/article_scraper/resources/tests/ftr/youtube/expected.html
+++ b/article_scraper/resources/tests/ftr/youtube/expected.html
@@ -1 +1 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs
index 1953835..53b8b92 100644
--- a/article_scraper/src/full_text_parser/mod.rs
+++ b/article_scraper/src/full_text_parser/mod.rs
@@ -84,18 +84,28 @@ impl FullTextParser {
.download_all_pages(html, client, config, global_config, &url)
.await?;
- self.parse_offline(pages, config, global_config, Some(url))
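+ // the global ftr config is looked up inside parse_offline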
+ self.parse_offline(pages, config, Some(url))
}
pub fn parse_offline(
&self,
pages: Vec<String>,
config: Option<&ConfigEntry>,
- global_config: &ConfigEntry,
url: Option<url::Url>,
) -> Result<Article, FullTextParserError> {
let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
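+ // prefer the caller-supplied config; otherwise fall back to the grabber config matching this URL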
+ let config = if config.is_none() {
+ self.get_grabber_config(&url)
+ } else {
+ config
+ };
+
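+ // load the shared "global.txt" config; bail out with a config error if it is missing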
+ let global_config = self
+ .config_files
+ .get("global.txt")
+ .ok_or(FullTextParserError::Config)?;
+
let mut article = Article {
title: None,
author: None,
@@ -1033,7 +1043,8 @@ impl FullTextParser {
let xpath = "//*[not(node())]";
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
- if node.get_name() == "meta" {
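+ // void elements like <meta>, <img> and <br> legitimately have no children, so keep them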
+ let name = node.get_name().to_lowercase();
+ if name == "meta" || name == "img" || name == "br" {
continue;
}
diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs
index cfa8097..4ec0444 100644
--- a/article_scraper/src/full_text_parser/tests.rs
+++ b/article_scraper/src/full_text_parser/tests.rs
@@ -14,9 +14,7 @@ async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&st
.expect("Failed to read source HTML");
let parser = FullTextParser::new(None).await;
- let article = parser
- .parse_offline(vec![html], None, &ConfigEntry::default(), Some(url))
- .unwrap();
+ let article = parser.parse_offline(vec![html], None, Some(url)).unwrap();
let content = article.html.unwrap();
diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs
index 5629f30..a1819a3 100644
--- a/article_scraper/src/lib.rs
+++ b/article_scraper/src/lib.rs
@@ -30,7 +30,6 @@
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//! let client = Client::new();
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
-//! let html = article.get_doc_content();
//! }
//! ```
@@ -105,7 +104,6 @@ impl ArticleScraper {
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
/// let client = Client::new();
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
- /// let html = article.get_doc_content();
/// }
/// ```
pub async fn parse(
diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs
index ff2b9b8..6e7de3e 100644
--- a/article_scraper_cli/src/main.rs
+++ b/article_scraper_cli/src/main.rs
@@ -119,12 +119,7 @@ async fn extract_ftr(
};
let full_text_parser = FullTextParser::new(None).await;
- let article = match full_text_parser.parse_offline(
- vec![html],
- config.as_ref(),
- &FtrConfigEntry::default(),
- base_url,
- ) {
+ let article = match full_text_parser.parse_offline(vec![html], config.as_ref(), base_url) {
Ok(res) => res,
Err(err) => {
log::error!("Failed to extract content with ftr: {err}");