mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-08 16:40:00 +02:00

fix document going out of scope

Jan Lukas Gernert 2019-11-19 14:41:08 +01:00
parent 2c6bfed550
commit edfbca3cf3
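
The change below addresses a use-after-free hazard: `parse_html` used to build the libxml xpath `Context` from a `Document` it created locally, so the `Document` went out of scope when the function returned while the returned `Context` still pointed at its tree. The commit splits this into `parse_html`, which returns the `Document`, and `get_xpath_ctx`, which builds the `Context` from a `Document` the caller keeps alive. A minimal standalone sketch of the resulting pattern, with simplified error handling; `parse_html` and `get_xpath_ctx` mirror the functions in the diff, everything else (the HTML snippet, the xpath query, the error type) is illustrative only:

use libxml::parser::Parser;
use libxml::tree::Document;
use libxml::xpath::Context;

// Parse the HTML and hand the Document back to the caller, so the
// underlying libxml tree stays alive for as long as the caller holds it.
fn parse_html(html: &str) -> Result<Document, ()> {
    Parser::default_html().parse_string(html).map_err(|_| ())
}

// Build the xpath Context from a Document owned by the caller; the
// Context refers to that Document's tree and must not outlive it.
fn get_xpath_ctx(doc: &Document) -> Result<Context, ()> {
    Context::new(doc)
}

fn main() -> Result<(), ()> {
    // `document` stays in this scope for as long as `xpath_ctx` is used.
    let document = parse_html("<html><body><p>hello</p></body></html>")?;
    let xpath_ctx = get_xpath_ctx(&document)?;
    let hits = xpath_ctx.evaluate("//p")?;
    println!("found {} <p> nodes", hits.get_number_of_nodes());
    Ok(())
}
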


@@ -146,7 +146,8 @@ impl ArticleScraper {
     async fn parse_pages(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let mut xpath_ctx = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config)?;
+        let mut xpath_ctx = Self::get_xpath_ctx(&document)?;

         // check for single page link
         if let Some(xpath_single_page_link) = config.single_page_link.clone() {
@@ -166,7 +167,8 @@ impl ArticleScraper {
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
                 let html = ArticleScraper::download(&url, &self.client).await?;
-                xpath_ctx = Self::parse_html(html, config)?;
+                document = Self::parse_html(html, config)?;
+                xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
                 ArticleScraper::extract_body(&xpath_ctx, root, config)?;
             } else {
@@ -177,7 +179,7 @@ impl ArticleScraper {
         Ok(())
     }

-    fn parse_html(html: String, config: &GrabberConfig) -> Result<Context, ScraperError> {
+    fn parse_html(html: String, config: &GrabberConfig) -> Result<Document, ScraperError> {
         // replace matches in raw html
         let mut html = html;
@@ -187,11 +189,13 @@ impl ArticleScraper {
         // parse html
         let parser = Parser::default_html();
-        let doc = parser.parse_string(html.as_str()).map_err(|err| {
+        Ok(parser.parse_string(html.as_str()).map_err(|err| {
             error!("Parsing HTML failed for downloaded HTML {:?}", err);
             ScraperErrorKind::Xml
-        })?;
+        })?)
+    }
+
+    fn get_xpath_ctx(doc: &Document) -> Result<Context, ScraperError> {
         Ok(Context::new(&doc).map_err(|()| {
             error!("Creating xpath context failed for downloaded HTML");
             ScraperErrorKind::Xml
@@ -221,7 +225,8 @@ impl ArticleScraper {
     async fn parse_single_page(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let xpath_ctx = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
         ArticleScraper::strip_junk(&xpath_ctx, config, &url);
         ArticleScraper::extract_body(&xpath_ctx, root, config)?;
@@ -725,19 +730,19 @@ impl ArticleScraper {
 mod tests {
     use crate::*;

-    // #[tokio::test]
-    // async fn golem() {
-    //     let config_path = PathBuf::from(r"./resources/tests/golem");
-    //     let out_path = PathBuf::from(r"./test_output");
-    //     let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
-    //     let grabber = ArticleScraper::new(config_path).unwrap();
-    //     let article = grabber.parse(url, true).await.unwrap();
-    //     article.save_html(&out_path).unwrap();
-    //     assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
-    //     assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-    // }
+    #[tokio::test]
+    async fn golem() {
+        let config_path = PathBuf::from(r"./resources/tests/golem");
+        let out_path = PathBuf::from(r"./test_output");
+        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+        let grabber = ArticleScraper::new(config_path).unwrap();
+        let article = grabber.parse(url, true).await.unwrap();
+        article.save_html(&out_path).unwrap();
+        assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
+        assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+    }

     #[tokio::test]
     async fn phoronix() {