1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

only serialize root node

This commit is contained in:
Jan Lukas Gernert 2023-04-21 08:46:10 +02:00
parent 3f58a39fcf
commit b4b5d802c9
5 changed files with 47 additions and 32 deletions

View file

@ -1,5 +1,5 @@
use chrono::{DateTime, Utc}; use chrono::{DateTime, Utc};
use libxml::tree::{Document, SaveOptions}; use libxml::tree::{Document, SaveOptions, Node};
use std::fs::File; use std::fs::File;
use std::io::{Error, ErrorKind, Write}; use std::io::{Error, ErrorKind, Write};
use std::path::PathBuf; use std::path::PathBuf;
@ -12,10 +12,11 @@ pub struct Article {
pub date: Option<DateTime<Utc>>, pub date: Option<DateTime<Utc>>,
pub thumbnail_url: Option<String>, pub thumbnail_url: Option<String>,
pub document: Option<Document>, pub document: Option<Document>,
pub root_node: Option<Node>,
} }
impl Article { impl Article {
pub fn get_content(&self) -> Option<String> { pub fn get_doc_content(&self) -> Option<String> {
// serialize content // serialize content
let options = SaveOptions { let options = SaveOptions {
format: true, format: true,
@ -32,6 +33,14 @@ impl Article {
.map(|doc| doc.to_string_with_options(options)) .map(|doc| doc.to_string_with_options(options))
} }
/// Serializes the article's root node to a string via the owning document.
///
/// Returns `None` when either `document` or `root_node` is unset; both are
/// needed because the node is rendered through `Document::node_to_string`.
pub fn get_content(&self) -> Option<String> {
    // Bail out early if either half of the (document, root node) pair is missing.
    let doc = self.document.as_ref()?;
    let root_node = self.root_node.as_ref()?;

    Some(doc.node_to_string(root_node))
}
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> { pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
if let Some(ref html) = self.get_content() { if let Some(ref html) = self.get_content() {
if let Ok(()) = std::fs::create_dir_all(path) { if let Ok(()) = std::fs::create_dir_all(path) {

View file

@ -1,7 +1,8 @@
use libxml::tree::SaveOptions; use libxml::tree::Node;
use reqwest::Url; use reqwest::Url;
use crate::full_text_parser::error::FullTextParserError; use crate::full_text_parser::error::FullTextParserError;
use crate::util::Util;
use crate::{FtrConfigEntry, FullTextParser}; use crate::{FtrConfigEntry, FullTextParser};
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> { pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
@ -12,34 +13,26 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?; let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
if let Some(mut root) = document.get_root_element() { if let Some(mut root) = document.get_root_element() {
FullTextParser::post_process_page(&mut root)?; FullTextParser::post_process_page(&mut root)?;
} }
FullTextParser::post_process_document(&document)?; FullTextParser::post_process_document(&document)?;
// serialize content let mut article_node =
let options = SaveOptions { Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
format: true, let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
if let Some(root) = document.get_root_element() { for mut node in content_nodes {
Ok(document.node_to_string(&root)) node.unlink();
} else { article_node.add_child(&mut node).unwrap();
Ok(document.to_string_with_options(options))
} }
Ok(document.node_to_string(&article_node))
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use reqwest::Url;
use super::clean_html; use super::clean_html;
use reqwest::Url;
#[test] #[test]
fn clean() { fn clean() {
@ -47,7 +40,6 @@ mod tests {
let url = Url::parse("https://finshots.in").unwrap(); let url = Url::parse("https://finshots.in").unwrap();
let res = clean_html(&html, &url).unwrap(); let res = clean_html(&html, &url).unwrap();
println!("{res}"); assert_eq!(res.len(), 11965);
assert_eq!(res.len(), 12118);
} }
} }

View file

@ -58,23 +58,30 @@ impl FullTextParser {
date: None, date: None,
thumbnail_url: None, thumbnail_url: None,
document: None, document: None,
root_node: None,
}; };
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut new_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
let mut root = let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?; Node::new("article", None, &new_document).map_err(|()| FullTextParserError::Xml)?;
document.set_root_element(&root); new_document.set_root_element(&root);
Self::generate_head(&mut root, &document)?; Self::generate_head(&mut root, &new_document)?;
let document = Self::parse_html(html, config, global_config)?; let old_document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?; let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article); metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
if article.thumbnail_url.is_none() { if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, &mut article); Self::check_for_thumbnail(&xpath_ctx, &mut article);
} }
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document); Self::prep_content(
&xpath_ctx,
config,
global_config,
&article.url,
&old_document,
);
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?; let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
if !found_body { if !found_body {
log::error!("Ftr failed to find content"); log::error!("Ftr failed to find content");
@ -90,9 +97,10 @@ impl FullTextParser {
return Err(error); return Err(error);
} }
Self::post_process_document(&document)?; Self::post_process_document(&new_document)?;
article.document = Some(document); article.document = Some(new_document);
article.root_node = Some(root);
let html = article.get_content().ok_or(FullTextParserError::Scrape)?; let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
Ok(html) Ok(html)
} }
@ -136,6 +144,7 @@ impl FullTextParser {
date: None, date: None,
thumbnail_url: None, thumbnail_url: None,
document: None, document: None,
root_node: None,
}; };
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@ -181,6 +190,7 @@ impl FullTextParser {
Self::post_process_document(&document)?; Self::post_process_document(&document)?;
article.document = Some(document); article.document = Some(document);
article.root_node = Some(root);
Ok(article) Ok(article)
} }

View file

@ -34,6 +34,7 @@ impl Readability {
date: None, date: None,
thumbnail_url: None, thumbnail_url: None,
document: None, document: None,
root_node: None,
}; };
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?; let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@ -46,6 +47,7 @@ impl Readability {
crate::FullTextParser::post_process_document(&article_document)?; crate::FullTextParser::post_process_document(&article_document)?;
article.document = Some(article_document); article.document = Some(article_document);
article.root_node = Some(root);
let html = article let html = article
.get_content() .get_content()
.ok_or(FullTextParserError::Readability)?; .ok_or(FullTextParserError::Readability)?;

View file

@ -30,6 +30,7 @@ async fn run_test(name: &str) {
date: None, date: None,
thumbnail_url: None, thumbnail_url: None,
document: None, document: None,
root_node: None,
}; };
let mut article_document = Document::new().unwrap(); let mut article_document = Document::new().unwrap();
@ -41,6 +42,7 @@ async fn run_test(name: &str) {
crate::FullTextParser::post_process_document(&article_document).unwrap(); crate::FullTextParser::post_process_document(&article_document).unwrap();
article.document = Some(article_document); article.document = Some(article_document);
article.root_node = Some(root);
let html = article.get_content().unwrap(); let html = article.get_content().unwrap();
let expected = std::fs::read_to_string(format!( let expected = std::fs::read_to_string(format!(