mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
only serialize root node
This commit is contained in:
parent
3f58a39fcf
commit
b4b5d802c9
5 changed files with 47 additions and 32 deletions
|
@ -1,5 +1,5 @@
|
|||
use chrono::{DateTime, Utc};
|
||||
use libxml::tree::{Document, SaveOptions};
|
||||
use libxml::tree::{Document, SaveOptions, Node};
|
||||
use std::fs::File;
|
||||
use std::io::{Error, ErrorKind, Write};
|
||||
use std::path::PathBuf;
|
||||
|
@ -12,10 +12,11 @@ pub struct Article {
|
|||
pub date: Option<DateTime<Utc>>,
|
||||
pub thumbnail_url: Option<String>,
|
||||
pub document: Option<Document>,
|
||||
pub root_node: Option<Node>,
|
||||
}
|
||||
|
||||
impl Article {
|
||||
pub fn get_content(&self) -> Option<String> {
|
||||
pub fn get_doc_content(&self) -> Option<String> {
|
||||
// serialize content
|
||||
let options = SaveOptions {
|
||||
format: true,
|
||||
|
@ -32,6 +33,14 @@ impl Article {
|
|||
.map(|doc| doc.to_string_with_options(options))
|
||||
}
|
||||
|
||||
pub fn get_content(&self) -> Option<String> {
|
||||
if let (Some(document), Some(root)) = (self.document.as_ref(), self.root_node.as_ref()) {
|
||||
Some(document.node_to_string(root))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn save_html(&self, path: &PathBuf) -> Result<(), Error> {
|
||||
if let Some(ref html) = self.get_content() {
|
||||
if let Ok(()) = std::fs::create_dir_all(path) {
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
use libxml::tree::SaveOptions;
|
||||
use libxml::tree::Node;
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::full_text_parser::error::FullTextParserError;
|
||||
use crate::util::Util;
|
||||
use crate::{FtrConfigEntry, FullTextParser};
|
||||
|
||||
pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserError> {
|
||||
|
@ -16,30 +17,22 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result<String, FullTextParserEr
|
|||
}
|
||||
FullTextParser::post_process_document(&document)?;
|
||||
|
||||
// serialize content
|
||||
let options = SaveOptions {
|
||||
format: true,
|
||||
no_declaration: false,
|
||||
no_empty_tags: true,
|
||||
no_xhtml: false,
|
||||
xhtml: false,
|
||||
as_xml: false,
|
||||
as_html: true,
|
||||
non_significant_whitespace: false,
|
||||
};
|
||||
let mut article_node =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||
let content_nodes = Util::evaluate_xpath(&xpath_ctx, "//body/*", true)?;
|
||||
|
||||
if let Some(root) = document.get_root_element() {
|
||||
Ok(document.node_to_string(&root))
|
||||
} else {
|
||||
Ok(document.to_string_with_options(options))
|
||||
for mut node in content_nodes {
|
||||
node.unlink();
|
||||
article_node.add_child(&mut node).unwrap();
|
||||
}
|
||||
|
||||
Ok(document.node_to_string(&article_node))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use reqwest::Url;
|
||||
use super::clean_html;
|
||||
|
||||
use reqwest::Url;
|
||||
|
||||
#[test]
|
||||
fn clean() {
|
||||
|
@ -47,7 +40,6 @@ mod tests {
|
|||
let url = Url::parse("https://finshots.in").unwrap();
|
||||
let res = clean_html(&html, &url).unwrap();
|
||||
|
||||
println!("{res}");
|
||||
assert_eq!(res.len(), 12118);
|
||||
assert_eq!(res.len(), 11965);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -58,23 +58,30 @@ impl FullTextParser {
|
|||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
root_node: None,
|
||||
};
|
||||
|
||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
let mut new_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
let mut root =
|
||||
Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
|
||||
document.set_root_element(&root);
|
||||
Node::new("article", None, &new_document).map_err(|()| FullTextParserError::Xml)?;
|
||||
new_document.set_root_element(&root);
|
||||
|
||||
Self::generate_head(&mut root, &document)?;
|
||||
Self::generate_head(&mut root, &new_document)?;
|
||||
|
||||
let document = Self::parse_html(html, config, global_config)?;
|
||||
let xpath_ctx = Self::get_xpath_ctx(&document)?;
|
||||
let old_document = Self::parse_html(html, config, global_config)?;
|
||||
let xpath_ctx = Self::get_xpath_ctx(&old_document)?;
|
||||
|
||||
metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
|
||||
if article.thumbnail_url.is_none() {
|
||||
Self::check_for_thumbnail(&xpath_ctx, &mut article);
|
||||
}
|
||||
Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
|
||||
Self::prep_content(
|
||||
&xpath_ctx,
|
||||
config,
|
||||
global_config,
|
||||
&article.url,
|
||||
&old_document,
|
||||
);
|
||||
let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
|
||||
if !found_body {
|
||||
log::error!("Ftr failed to find content");
|
||||
|
@ -90,9 +97,10 @@ impl FullTextParser {
|
|||
return Err(error);
|
||||
}
|
||||
|
||||
Self::post_process_document(&document)?;
|
||||
Self::post_process_document(&new_document)?;
|
||||
|
||||
article.document = Some(document);
|
||||
article.document = Some(new_document);
|
||||
article.root_node = Some(root);
|
||||
let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
|
||||
Ok(html)
|
||||
}
|
||||
|
@ -136,6 +144,7 @@ impl FullTextParser {
|
|||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
root_node: None,
|
||||
};
|
||||
|
||||
let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
|
@ -181,6 +190,7 @@ impl FullTextParser {
|
|||
Self::post_process_document(&document)?;
|
||||
|
||||
article.document = Some(document);
|
||||
article.root_node = Some(root);
|
||||
|
||||
Ok(article)
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ impl Readability {
|
|||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
root_node: None,
|
||||
};
|
||||
|
||||
let mut article_document = Document::new().map_err(|()| FullTextParserError::Xml)?;
|
||||
|
@ -46,6 +47,7 @@ impl Readability {
|
|||
crate::FullTextParser::post_process_document(&article_document)?;
|
||||
|
||||
article.document = Some(article_document);
|
||||
article.root_node = Some(root);
|
||||
let html = article
|
||||
.get_content()
|
||||
.ok_or(FullTextParserError::Readability)?;
|
||||
|
|
|
@ -30,6 +30,7 @@ async fn run_test(name: &str) {
|
|||
date: None,
|
||||
thumbnail_url: None,
|
||||
document: None,
|
||||
root_node: None,
|
||||
};
|
||||
|
||||
let mut article_document = Document::new().unwrap();
|
||||
|
@ -41,6 +42,7 @@ async fn run_test(name: &str) {
|
|||
crate::FullTextParser::post_process_document(&article_document).unwrap();
|
||||
|
||||
article.document = Some(article_document);
|
||||
article.root_node = Some(root);
|
||||
let html = article.get_content().unwrap();
|
||||
|
||||
let expected = std::fs::read_to_string(format!(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue