From 582834cdf1f4ed944474f2bbc44dc2f1b6b664cd Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Wed, 21 Jun 2023 23:48:09 +0200 Subject: [PATCH] fixes --- Readme.md | 1 - .../resources/tests/ftr/youtube/expected.html | 2 +- article_scraper/src/full_text_parser/mod.rs | 17 ++++++++++++++--- article_scraper/src/full_text_parser/tests.rs | 4 +--- article_scraper/src/lib.rs | 2 -- article_scraper_cli/src/main.rs | 7 +------ 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/Readme.md b/Readme.md index 91cf31e..ca90ee4 100644 --- a/Readme.md +++ b/Readme.md @@ -29,7 +29,6 @@ let scraper = ArticleScraper::new(None); let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html"); let client = Client::new(); let article = scraper.parse(&url, false, &client, None).await.unwrap(); -let html = article.get_doc_content(); ``` # CLI diff --git a/article_scraper/resources/tests/ftr/youtube/expected.html b/article_scraper/resources/tests/ftr/youtube/expected.html index 652d569..1213034 100644 --- a/article_scraper/resources/tests/ftr/youtube/expected.html +++ b/article_scraper/resources/tests/ftr/youtube/expected.html @@ -1 +1 @@ -
\ No newline at end of file diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 1953835..53b8b92 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -84,18 +84,28 @@ impl FullTextParser { .download_all_pages(html, client, config, global_config, &url) .await?; - self.parse_offline(pages, config, global_config, Some(url)) + self.parse_offline(pages, config, Some(url)) } pub fn parse_offline( &self, pages: Vec, config: Option<&ConfigEntry>, - global_config: &ConfigEntry, url: Option, ) -> Result { let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap()); + let config = if config.is_none() { + self.get_grabber_config(&url) + } else { + config + }; + + let global_config = self + .config_files + .get("global.txt") + .ok_or(FullTextParserError::Config)?; + let mut article = Article { title: None, author: None, @@ -1033,7 +1043,8 @@ impl FullTextParser { let xpath = "//*[not(node())]"; let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { - if node.get_name() == "meta" { + let name = node.get_name().to_lowercase(); + if name == "meta" || name == "img" || name == "br" { continue; } diff --git a/article_scraper/src/full_text_parser/tests.rs b/article_scraper/src/full_text_parser/tests.rs index cfa8097..4ec0444 100644 --- a/article_scraper/src/full_text_parser/tests.rs +++ b/article_scraper/src/full_text_parser/tests.rs @@ -14,9 +14,7 @@ async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&st .expect("Failed to read source HTML"); let parser = FullTextParser::new(None).await; - let article = parser - .parse_offline(vec![html], None, &ConfigEntry::default(), Some(url)) - .unwrap(); + let article = parser.parse_offline(vec![html], None, Some(url)).unwrap(); let content = article.html.unwrap(); diff --git a/article_scraper/src/lib.rs b/article_scraper/src/lib.rs index 5629f30..a1819a3 100644 --- a/article_scraper/src/lib.rs +++ b/article_scraper/src/lib.rs @@ -30,7 +30,6 @@ //! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap(); //! let client = Client::new(); //! let article = scraper.parse(&url, false, &client, None).await.unwrap(); -//! let html = article.get_doc_content(); //! } //! ``` @@ -105,7 +104,6 @@ impl ArticleScraper { /// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap(); /// let client = Client::new(); /// let article = scraper.parse(&url, false, &client, None).await.unwrap(); - /// let html = article.get_doc_content(); /// } /// ``` pub async fn parse( diff --git a/article_scraper_cli/src/main.rs b/article_scraper_cli/src/main.rs index ff2b9b8..6e7de3e 100644 --- a/article_scraper_cli/src/main.rs +++ b/article_scraper_cli/src/main.rs @@ -119,12 +119,7 @@ async fn extract_ftr( }; let full_text_parser = FullTextParser::new(None).await; - let article = match full_text_parser.parse_offline( - vec![html], - config.as_ref(), - &FtrConfigEntry::default(), - base_url, - ) { + let article = match full_text_parser.parse_offline(vec![html], config.as_ref(), base_url) { Ok(res) => res, Err(err) => { log::error!("Failed to extract content with ftr: {err}");