From edfbca3cf3d8d5354661b40cdb2bf523eaded228 Mon Sep 17 00:00:00 2001
From: Jan Lukas Gernert
Date: Tue, 19 Nov 2019 14:41:08 +0100
Subject: [PATCH] fix document going out of scope

---
 src/lib.rs | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 1aeac3c..27c0cca 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -146,7 +146,8 @@ impl ArticleScraper {
     async fn parse_pages(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
 
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let mut xpath_ctx = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config)?;
+        let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
         // check for single page link
         if let Some(xpath_single_page_link) = config.single_page_link.clone() {
@@ -166,7 +167,8 @@ impl ArticleScraper {
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
                 let html = ArticleScraper::download(&url, &self.client).await?;
-                xpath_ctx = Self::parse_html(html, config)?;
+                document = Self::parse_html(html, config)?;
+                xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
                 ArticleScraper::extract_body(&xpath_ctx, root, config)?;
             } else {
@@ -177,7 +179,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    fn parse_html(html: String, config: &GrabberConfig) -> Result<Context, ScraperError> {
+    fn parse_html(html: String, config: &GrabberConfig) -> Result<Document, ScraperError> {
 
         // replace matches in raw html
         let mut html = html;
@@ -187,11 +189,13 @@ impl ArticleScraper {
 
         // parse html
         let parser = Parser::default_html();
-        let doc = parser.parse_string(html.as_str()).map_err(|err| {
+        Ok(parser.parse_string(html.as_str()).map_err(|err| {
             error!("Parsing HTML failed for downloaded HTML {:?}", err);
             ScraperErrorKind::Xml
-        })?;
-
+        })?)
+    }
+
+    fn get_xpath_ctx(doc: &Document) -> Result<Context, ScraperError> {
         Ok(Context::new(&doc).map_err(|()| {
             error!("Creating xpath context failed for downloaded HTML");
             ScraperErrorKind::Xml
@@ -221,7 +225,8 @@ impl ArticleScraper {
     async fn parse_single_page(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
 
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let xpath_ctx = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
         ArticleScraper::strip_junk(&xpath_ctx, config, &url);
         ArticleScraper::extract_body(&xpath_ctx, root, config)?;
@@ -725,19 +730,19 @@ impl ArticleScraper {
 mod tests {
     use crate::*;
 
-    // #[tokio::test]
-    // async fn golem() {
-    //     let config_path = PathBuf::from(r"./resources/tests/golem");
-    //     let out_path = PathBuf::from(r"./test_output");
-    //     let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+    #[tokio::test]
+    async fn golem() {
+        let config_path = PathBuf::from(r"./resources/tests/golem");
+        let out_path = PathBuf::from(r"./test_output");
+        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-    //     let grabber = ArticleScraper::new(config_path).unwrap();
-    //     let article = grabber.parse(url, true).await.unwrap();
-    //     article.save_html(&out_path).unwrap();
+        let grabber = ArticleScraper::new(config_path).unwrap();
+        let article = grabber.parse(url, true).await.unwrap();
+        article.save_html(&out_path).unwrap();
 
-    //     assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
-    //     assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-    // }
+        assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
+        assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+    }
 
     #[tokio::test]
     async fn phoronix() {
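
For illustration, a minimal self-contained sketch of the ownership bug this commit fixes, assuming the libxml crate's Parser, Document, and Context types that the diff already relies on. The function names, the collapsed () error type, and the evaluate/get_nodes_as_vec calls are illustrative assumptions for this sketch, not code from the repository:

use libxml::parser::Parser;
use libxml::tree::Document;
use libxml::xpath::Context;

// Before the patch (sketch): the Document was built locally and only the
// xpath Context escaped. This compiles because Context carries no lifetime
// parameter, but the underlying libxml2 tree is freed when `doc` drops at
// the end of the function, leaving the returned Context dangling.
#[allow(dead_code)]
fn parse_html_old(html: &str) -> Result<Context, ()> {
    let doc = Parser::default_html().parse_string(html).map_err(|_| ())?;
    Context::new(&doc) // `doc` goes out of scope here
}

// After the patch (sketch): parsing and context creation are split, so the
// caller owns the Document and keeps it alive while the Context is in use.
fn parse_html_new(html: &str) -> Result<Document, ()> {
    Parser::default_html().parse_string(html).map_err(|_| ())
}

fn get_xpath_ctx(doc: &Document) -> Result<Context, ()> {
    Context::new(doc)
}

fn main() -> Result<(), ()> {
    let document = parse_html_new("<html><body><p>hi</p></body></html>")?;
    let xpath_ctx = get_xpath_ctx(&document)?;
    // `document` stays alive alongside `xpath_ctx`, so evaluation is safe
    let object = xpath_ctx.evaluate("//p").map_err(|_| ())?;
    println!("matched {} node(s)", object.get_nodes_as_vec().len());
    Ok(())
}

This is why the patch threads a `document` binding through parse_pages and parse_single_page: the Document must outlive every use of the xpath Context created from it.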