mirror of https://gitlab.com/news-flash/article_scraper.git (synced 2025-07-08 08:30:00 +02:00)
fix document going out of scope
This commit is contained in:
parent 2c6bfed550
commit edfbca3cf3
1 changed file with 23 additions and 18 deletions
src/lib.rs
@@ -146,7 +146,8 @@ impl ArticleScraper {
     async fn parse_pages(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
 
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let mut xpath_ctx = Self::parse_html(html, config)?;
+        let mut document = Self::parse_html(html, config)?;
+        let mut xpath_ctx = Self::get_xpath_ctx(&document)?;
 
         // check for single page link
         if let Some(xpath_single_page_link) = config.single_page_link.clone() {
@@ -166,7 +167,8 @@ impl ArticleScraper {
         loop {
             if let Some(url) = self.check_for_next_page(&xpath_ctx, config) {
                 let html = ArticleScraper::download(&url, &self.client).await?;
-                xpath_ctx = Self::parse_html(html, config)?;
+                document = Self::parse_html(html, config)?;
+                xpath_ctx = Self::get_xpath_ctx(&document)?;
                 ArticleScraper::strip_junk(&xpath_ctx, config, &url);
                 ArticleScraper::extract_body(&xpath_ctx, root, config)?;
             } else {
@@ -177,7 +179,7 @@ impl ArticleScraper {
         Ok(())
     }
 
-    fn parse_html(html: String, config: &GrabberConfig) -> Result<Context, ScraperError> {
+    fn parse_html(html: String, config: &GrabberConfig) -> Result<Document, ScraperError> {
         // replace matches in raw html
 
         let mut html = html;
@@ -187,11 +189,13 @@ impl ArticleScraper {
 
         // parse html
         let parser = Parser::default_html();
-        let doc = parser.parse_string(html.as_str()).map_err(|err| {
+        Ok(parser.parse_string(html.as_str()).map_err(|err| {
             error!("Parsing HTML failed for downloaded HTML {:?}", err);
             ScraperErrorKind::Xml
-        })?;
+        })?)
+    }
 
+    fn get_xpath_ctx(doc: &Document) -> Result<Context, ScraperError> {
         Ok(Context::new(&doc).map_err(|()| {
             error!("Creating xpath context failed for downloaded HTML");
             ScraperErrorKind::Xml
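This hunk is the heart of the change: previously parse_html built the libxml Document and returned only the xpath Context derived from it, so the Document was dropped when the helper returned while the Context still pointed into it. Splitting the helper in two lets the caller own the Document for as long as the Context is in use. Below is a minimal, self-contained sketch of that pattern, assuming the libxml crate's Parser, Document and Context types already used in lib.rs; the simplified () error type and the sample HTML string are illustrative only, not part of the crate or this commit.

    use libxml::parser::Parser;
    use libxml::tree::Document;
    use libxml::xpath::Context;

    // Parse the raw HTML and hand the Document back to the caller,
    // instead of creating (and dropping) it inside the helper.
    fn parse_html(html: &str) -> Result<Document, ()> {
        let parser = Parser::default_html();
        parser.parse_string(html).map_err(|_| ())
    }

    // Build the xpath Context from a Document the caller keeps alive.
    fn get_xpath_ctx(doc: &Document) -> Result<Context, ()> {
        Context::new(doc)
    }

    fn main() -> Result<(), ()> {
        let document = parse_html("<html><body><p>hello</p></body></html>")?;
        let xpath_ctx = get_xpath_ctx(&document)?;
        // `document` is still in scope here, so queries on `xpath_ctx`
        // operate on a tree that has not been freed.
        let _paragraphs = xpath_ctx.evaluate("//p")?;
        Ok(())
    }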
@@ -221,7 +225,8 @@ impl ArticleScraper {
     async fn parse_single_page(&self, article: &mut Article, url: &url::Url, root: &mut Node, config: &GrabberConfig) -> Result<(), ScraperError> {
 
         let html = ArticleScraper::download(&url, &self.client).await?;
-        let xpath_ctx = Self::parse_html(html, config)?;
+        let document = Self::parse_html(html, config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
         ArticleScraper::extract_metadata(&xpath_ctx, config, article);
         ArticleScraper::strip_junk(&xpath_ctx, config, &url);
         ArticleScraper::extract_body(&xpath_ctx, root, config)?;
@@ -725,19 +730,19 @@ impl ArticleScraper {
 mod tests {
     use crate::*;
 
-    // #[tokio::test]
-    // async fn golem() {
-    //     let config_path = PathBuf::from(r"./resources/tests/golem");
-    //     let out_path = PathBuf::from(r"./test_output");
-    //     let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
+    #[tokio::test]
+    async fn golem() {
+        let config_path = PathBuf::from(r"./resources/tests/golem");
+        let out_path = PathBuf::from(r"./test_output");
+        let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
 
-    //     let grabber = ArticleScraper::new(config_path).unwrap();
-    //     let article = grabber.parse(url, true).await.unwrap();
-    //     article.save_html(&out_path).unwrap();
+        let grabber = ArticleScraper::new(config_path).unwrap();
+        let article = grabber.parse(url, true).await.unwrap();
+        article.save_html(&out_path).unwrap();
 
-    //     assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
-    //     assert_eq!(article.author, Some(String::from("Hauke Gierow")));
-    // }
+        assert_eq!(article.title, Some(String::from("HTTP Error 418: Fehlercode \"Ich bin eine Teekanne\" darf bleiben")));
+        assert_eq!(article.author, Some(String::from("Hauke Gierow")));
+    }
 
     #[tokio::test]
     async fn phoronix() {
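The hunk above re-enables the previously commented-out golem test. With cargo's standard test name filter it can be run on its own, for example:

    cargo test golem

Note that, as written in the test body, it fetches the golem.de article over the network and expects the ./resources/tests/golem config and a writable ./test_output directory to be present.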