mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
only serialize root node
This commit is contained in:
parent
3f58a39fcf
commit
b4b5d802c9
5 changed files with 47 additions and 32 deletions
|
@ -1,5 +1,5 @@
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use libxml::tree::{Document, SaveOptions};
|
use libxml::tree::{Document, SaveOptions, Node};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{Error, ErrorKind, Write};
|
use std::io::{Error, ErrorKind, Write};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
@ -12,10 +12,11 @@ pub struct Article {
|
||||||
pub date: Option<DateTime<Utc>>,
|
pub date: Option<DateTime<Utc>>,
|
||||||
pub thumbnail_url: Option<String>,
|
pub thumbnail_url: Option<String>,
|
||||||
pub document: Option<Document>,
|
pub document: Option<Document>,
|
||||||
|
pub root_node: Option<Node>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Article {
|
impl Article {
|
||||||
pub fn get_content(&self) -> Option<String> {
|
pub fn get_doc_content(&self) -> Option<String> {
|
||||||
// serialize content
|
// serialize content
|
||||||
let options = SaveOptions {
|
let options = SaveOptions {
|
||||||
format: true,
|
format: true,
|
||||||
|
@ -32,6 +33,14 @@ impl Article {
|
||||||
.map(|doc| doc.to_string_with_options(options))
|
.map(|doc| doc.to_string_with_options(options))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn get_content(&self) -> Option<String> {
|
||||||
|
if let (Some(document), Some(root)) = (self.document.as_ref(), self.root_node.as_ref()) {
|
||||||
|
Some(document.node_to_string(root))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
||||||
if let Some(ref html) = self.get_content() {
|
if let Some(ref html) = self.get_content() {
|
||||||
if let Ok(()) = std::fs::create_dir_all(path) {
|
if let Ok(()) = std::fs::create_dir_all(path) {
|
||||||
|
|
|
@ -1,7 +1,8 @@
|
||||||
use libxml::tree::SaveOptions;
|
use libxml::tree::Node;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
|
|
||||||
use crate::full_text_parser::error::FullTextParserError;
|
use crate::full_text_parser::error::FullTextParserError;
|
||||||
|
use crate::util::Util;
|
||||||
use crate::{FtrConfigEntry, FullTextParser};
|
use crate::{FtrConfigEntry, FullTextParser};
|
||||||
|
|
||||||
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
|
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
|
||||||
|
@ -12,34 +13,26 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
|
||||||
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
let xpath_ctx = FullTextParser::get_xpath_ctx(&document)?;
|
||||||
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
|
FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document);
|
||||||
if let Some(mut root) = document.get_root_element() {
|
if let Some(mut root) = document.get_root_element() {
|
||||||
FullTextParser::post_process_page(&mut root)?;
|
FullTextParser::post_process_page(&mut root)?;
|
||||||
}
|
}
|
||||||
FullTextParser::post_process_document(&document)?;
|
FullTextParser::post_process_document(&document)?;
|
||||||
|
|
||||||
// serialize content
|
let mut article_node =
|
||||||
let options = SaveOptions {
|
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||||
format: true,
|
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
|
||||||
no_declaration: false,
|
|
||||||
no_empty_tags: true,
|
|
||||||
no_xhtml: false,
|
|
||||||
xhtml: false,
|
|
||||||
as_xml: false,
|
|
||||||
as_html: true,
|
|
||||||
non_significant_whitespace: false,
|
|
||||||
};
|
|
||||||
|
|
||||||
if let Some(root) = document.get_root_element() {
|
for mut node in content_nodes {
|
||||||
Ok(document.node_to_string(&root))
|
node.unlink();
|
||||||
} else {
|
article_node.add_child(&mut node).unwrap();
|
||||||
Ok(document.to_string_with_options(options))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(document.node_to_string(&article_node))
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use reqwest::Url;
|
|
||||||
use super::clean_html;
|
use super::clean_html;
|
||||||
|
use reqwest::Url;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn clean() {
|
fn clean() {
|
||||||
|
@ -47,7 +40,6 @@ mod tests {
|
||||||
let url = Url::parse("https://finshots.in").unwrap();
|
let url = Url::parse("https://finshots.in").unwrap();
|
||||||
let res = clean_html(&html, &url).unwrap();
|
let res = clean_html(&html, &url).unwrap();
|
||||||
|
|
||||||
println!("{res}");
|
assert_eq!(res.len(), 11965);
|
||||||
assert_eq!(res.len(), 12118);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -58,23 +58,30 @@ impl FullTextParser {
|
||||||
date: None,
|
date: None,
|
||||||
thumbnail_url: None,
|
thumbnail_url: None,
|
||||||
document: None,
|
document: None,
|
||||||
|
root_node: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
let mut new_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
let mut root =
|
let mut root =
|
||||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
Node::new("article", None, &new_document).map_err(|()| FullTextParserError::Xml)?;
|
||||||
document.set_root_element(&root);
|
new_document.set_root_element(&root);
|
||||||
|
|
||||||
Self::generate_head(&mut root, &document)?;
|
Self::generate_head(&mut root, &new_document)?;
|
||||||
|
|
||||||
let document = Self::parse_html(html, config, global_config)?;
|
let old_document = Self::parse_html(html, config, global_config)?;
|
||||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
|
||||||
|
|
||||||
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
||||||
if article.thumbnail_url.is_none() {
|
if article.thumbnail_url.is_none() {
|
||||||
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
||||||
}
|
}
|
||||||
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
Self::prep_content(
|
||||||
|
&xpath_ctx,
|
||||||
|
config,
|
||||||
|
global_config,
|
||||||
|
&article.url,
|
||||||
|
&old_document,
|
||||||
|
);
|
||||||
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
||||||
if !found_body {
|
if !found_body {
|
||||||
log::error!("Ftr failed to find content");
|
log::error!("Ftr failed to find content");
|
||||||
|
@ -90,9 +97,10 @@ impl FullTextParser {
|
||||||
return Err(error);
|
return Err(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
Self::post_process_document(&document)?;
|
Self::post_process_document(&new_document)?;
|
||||||
|
|
||||||
article.document = Some(document);
|
article.document = Some(new_document);
|
||||||
|
article.root_node = Some(root);
|
||||||
let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
|
let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
|
||||||
Ok(html)
|
Ok(html)
|
||||||
}
|
}
|
||||||
|
@ -136,6 +144,7 @@ impl FullTextParser {
|
||||||
date: None,
|
date: None,
|
||||||
thumbnail_url: None,
|
thumbnail_url: None,
|
||||||
document: None,
|
document: None,
|
||||||
|
root_node: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
|
@ -181,6 +190,7 @@ impl FullTextParser {
|
||||||
Self::post_process_document(&document)?;
|
Self::post_process_document(&document)?;
|
||||||
|
|
||||||
article.document = Some(document);
|
article.document = Some(document);
|
||||||
|
article.root_node = Some(root);
|
||||||
|
|
||||||
Ok(article)
|
Ok(article)
|
||||||
}
|
}
|
||||||
|
|
|
@ -34,6 +34,7 @@ impl Readability {
|
||||||
date: None,
|
date: None,
|
||||||
thumbnail_url: None,
|
thumbnail_url: None,
|
||||||
document: None,
|
document: None,
|
||||||
|
root_node: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||||
|
@ -46,6 +47,7 @@ impl Readability {
|
||||||
crate::FullTextParser::post_process_document(&article_document)?;
|
crate::FullTextParser::post_process_document(&article_document)?;
|
||||||
|
|
||||||
article.document = Some(article_document);
|
article.document = Some(article_document);
|
||||||
|
article.root_node = Some(root);
|
||||||
let html = article
|
let html = article
|
||||||
.get_content()
|
.get_content()
|
||||||
.ok_or(FullTextParserError::Readability)?;
|
.ok_or(FullTextParserError::Readability)?;
|
||||||
|
|
|
@ -30,6 +30,7 @@ async fn run_test(name: &str) {
|
||||||
date: None,
|
date: None,
|
||||||
thumbnail_url: None,
|
thumbnail_url: None,
|
||||||
document: None,
|
document: None,
|
||||||
|
root_node: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut article_document = Document::new().unwrap();
|
let mut article_document = Document::new().unwrap();
|
||||||
|
@ -41,6 +42,7 @@ async fn run_test(name: &str) {
|
||||||
crate::FullTextParser::post_process_document(&article_document).unwrap();
|
crate::FullTextParser::post_process_document(&article_document).unwrap();
|
||||||
|
|
||||||
article.document = Some(article_document);
|
article.document = Some(article_document);
|
||||||
|
article.root_node = Some(root);
|
||||||
let html = article.get_content().unwrap();
|
let html = article.get_content().unwrap();
|
||||||
|
|
||||||
let expected = std::fs::read_to_string(format!(
|
let expected = std::fs::read_to_string(format!(
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue