mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

only serialize root node

Jan Lukas Gernert 2023-04-21 08:46:10 +02:00
parent 3f58a39fcf
commit b4b5d802c9
5 changed files with 47 additions and 32 deletions
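
In effect, an Article now keeps a handle to its root <article> node alongside the document, and get_content() serializes only that subtree via Document::node_to_string instead of dumping the whole libxml Document. A minimal sketch of the difference, reusing only the libxml calls that appear in this diff; the toy document built here is illustrative, not code from the commit:

use libxml::tree::{Document, Node, SaveOptions};

fn main() {
    // Build a tiny document with an <article> root and a single child element.
    let mut document = Document::new().expect("failed to create document");
    let mut root = Node::new("article", None, &document).expect("failed to create root node");
    document.set_root_element(&root);
    let mut paragraph = Node::new("p", None, &document).expect("failed to create child node");
    root.add_child(&mut paragraph).expect("failed to attach child");

    // Old behaviour: serialize the whole document with the SaveOptions the
    // removed code constructed.
    let options = SaveOptions {
        format: true,
        no_declaration: false,
        no_empty_tags: true,
        no_xhtml: false,
        xhtml: false,
        as_xml: false,
        as_html: true,
        non_significant_whitespace: false,
    };
    println!("{}", document.to_string_with_options(options));

    // New behaviour: serialize only the <article> subtree, which is what
    // get_content() now returns.
    println!("{}", document.node_to_string(&root));
}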

View file

@@ -1,5 +1,5 @@
use chrono::{DateTime, Utc};
use libxml::tree::{Document, SaveOptions};
use libxml::tree::{Document, SaveOptions, Node};
use std::fs::File;
use std::io::{Error, ErrorKind, Write};
use std::path::PathBuf;
@@ -12,10 +12,11 @@ pub struct Article {
pub date: Option<DateTime<Utc>>,
pub thumbnail_url: Option<String>,
pub document: Option<Document>,
pub root_node: Option<Node>,
}
impl Article {
pub fn get_content(&self) -> Option<String> {
pub fn get_doc_content(&self) -> Option<String> {
// serialize content
let options = SaveOptions {
format: true,
@@ -32,6 +33,14 @@ impl Article {
.map(|doc| doc.to_string_with_options(options))
}
pub fn get_content(&self) -> Option<String> {
if let (Some(document), Some(root)) = (self.document.as_ref(), self.root_node.as_ref()) {
Some(document.node_to_string(root))
} else {
None
}
}
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
if let Some(ref html) = self.get_content() {
if let Ok(()) = std::fs::create_dir_all(path) {
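For callers, get_content() therefore returns Some only when the parser has filled in both document and root_node, and it yields just the root-node markup; the previous whole-document serialization stays available as get_doc_content(). A hypothetical call site, assuming Article and the new method are exposed at the crate root as the pub markers above suggest:

use article_scraper::Article;

// Dump both serializations of a parsed article.
fn dump(article: &Article) {
    if let Some(html) = article.get_content() {
        // New behaviour: only the root <article> subtree.
        println!("content:\n{html}");
    }
    if let Some(full) = article.get_doc_content() {
        // Old behaviour, still available: the whole document as formatted HTML.
        println!("full document:\n{full}");
    }
}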

View file

@@ -1,7 +1,8 @@
use libxml::tree::SaveOptions;
use libxml::tree::Node;
use reqwest::Url;
use crate::full_text_parser::error::FullTextParserError;
use crate::util::Util;
use crate::{FtrConfigEntry, FullTextParser};
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
@@ -12,34 +13,26 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
if let Some(mut root) = document.get_root_element() {
FullTextParser::post_process_page(&mut root)?;
FullTextParser::post_process_page(&mut root)?;
}
FullTextParser::post_process_document(&document)?;
// serialize content
let options = SaveOptions {
format: true,
no_declaration: false,
no_empty_tags: true,
no_xhtml: false,
xhtml: false,
as_xml: false,
as_html: true,
non_significant_whitespace: false,
};
let mut article_node =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
if let Some(root) = document.get_root_element() {
Ok(document.node_to_string(&root))
} else {
Ok(document.to_string_with_options(options))
for mut node in content_nodes {
node.unlink();
article_node.add_child(&mut node).unwrap();
}
Ok(document.node_to_string(&article_node))
}
#[cfg(test)]
mod tests {
use reqwest::Url;
use super::clean_html;
use reqwest::Url;
#[test]
fn clean() {
@@ -47,7 +40,6 @@ mod tests {
let url = Url::parse("https://finshots.in").unwrap();
let res = clean_html(&html, &url).unwrap();
println!("{res}");
assert_eq!(res.len(), 12118);
assert_eq!(res.len(), 11965);
}
}
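clean_html() no longer serializes the parsed document as a whole: it re-parents every child of <body> under a freshly created <article> node and serializes only that node, which is also why the expected output length in the test changes from 12118 to 11965. A standalone sketch of that unlink/add_child pattern, using libxml's Parser and xpath::Context directly where the crate itself goes through Util::evaluate_xpath:

use libxml::parser::Parser;
use libxml::tree::Node;
use libxml::xpath::Context;

fn main() {
    // Parse some HTML into a libxml Document.
    let html = "<html><body><h1>Title</h1><p>Body text</p></body></html>";
    let document = Parser::default_html()
        .parse_string(html)
        .expect("failed to parse HTML");

    // Find every child of <body>; the crate wraps this in Util::evaluate_xpath.
    let context = Context::new(&document).expect("failed to create xpath context");
    let content_nodes = context
        .evaluate("//body/*")
        .expect("xpath evaluation failed")
        .get_nodes_as_vec();

    // Re-parent the body children under a new <article> node ...
    let mut article_node =
        Node::new("article", None, &document).expect("failed to create article node");
    for mut node in content_nodes {
        node.unlink();
        article_node.add_child(&mut node).expect("failed to re-parent node");
    }

    // ... and serialize only that node, as the new clean_html() does.
    println!("{}", document.node_to_string(&article_node));
}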

View file

@@ -58,23 +58,30 @@ impl FullTextParser {
date: None,
thumbnail_url: None,
document: None,
root_node: None,
};
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
let mut new_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
let mut root =
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
document.set_root_element(&root);
Node::new("article", None, &new_document).map_err(|()| FullTextParserError::Xml)?;
new_document.set_root_element(&root);
Self::generate_head(&mut root, &document)?;
Self::generate_head(&mut root, &new_document)?;
let document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&document)?;
let old_document = Self::parse_html(html, config, global_config)?;
let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
if article.thumbnail_url.is_none() {
Self::check_for_thumbnail(&xpath_ctx, &mut article);
}
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
Self::prep_content(
&xpath_ctx,
config,
global_config,
&article.url,
&old_document,
);
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
if !found_body {
log::error!("Ftr failed to find content");
@@ -90,9 +97,10 @@ impl FullTextParser {
return Err(error);
}
Self::post_process_document(&document)?;
Self::post_process_document(&new_document)?;
article.document = Some(document);
article.document = Some(new_document);
article.root_node = Some(root);
let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
Ok(html)
}
@@ -136,6 +144,7 @@ impl FullTextParser {
date: None,
thumbnail_url: None,
document: None,
root_node: None,
};
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@@ -181,6 +190,7 @@ impl FullTextParser {
Self::post_process_document(&document)?;
article.document = Some(document);
article.root_node = Some(root);
Ok(article)
}
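The renaming to new_document and old_document makes the two-document flow of parse() explicit: the input HTML is parsed into one document, content is extracted into the <article> root of a second, freshly created document, and the Article ends up storing that output document together with its root node. A condensed, hypothetical sketch of the shape of this flow; the bare //body/* move stands in for the real extract_body(), site-config handling and post-processing:

use libxml::parser::Parser;
use libxml::tree::{Document, Node};
use libxml::xpath::Context;

// Hypothetical condensation of the flow; returns what get_content() would emit.
fn condensed_parse(html: &str) -> Option<String> {
    // Fresh output document with an <article> root (the future Article::root_node).
    let mut new_document = Document::new().ok()?;
    let mut root = Node::new("article", None, &new_document).ok()?;
    new_document.set_root_element(&root);

    // Separate document holding the parsed input (old_document in this diff).
    let old_document = Parser::default_html().parse_string(html).ok()?;
    let context = Context::new(&old_document).ok()?;

    // Stand-in for extraction: move the input body's children under `root`.
    for mut node in context.evaluate("//body/*").ok()?.get_nodes_as_vec() {
        node.unlink();
        root.add_child(&mut node).ok()?;
    }

    // The Article stores new_document and root; get_content() then reduces to this.
    Some(new_document.node_to_string(&root))
}

fn main() {
    let out = condensed_parse("<html><body><p>hello</p></body></html>").expect("parse failed");
    println!("{out}");
}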

View file

@@ -34,6 +34,7 @@ impl Readability {
date: None,
thumbnail_url: None,
document: None,
root_node: None,
};
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
@@ -46,6 +47,7 @@ impl Readability {
crate::FullTextParser::post_process_document(&article_document)?;
article.document = Some(article_document);
article.root_node = Some(root);
let html = article
.get_content()
.ok_or(FullTextParserError::Readability)?;

View file

@@ -30,6 +30,7 @@ async fn run_test(name: &str) {
date: None,
thumbnail_url: None,
document: None,
root_node: None,
};
let mut article_document = Document::new().unwrap();
@@ -41,6 +42,7 @@ async fn run_test(name: &str) {
crate::FullTextParser::post_process_document(&article_document).unwrap();
article.document = Some(article_document);
article.root_node = Some(root);
let html = article.get_content().unwrap();
let expected = std::fs::read_to_string(format!(