mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 16:40:00 +02:00
first content extraction kinda working
This commit is contained in:
parent
2c76a869e7
commit
cce912c354
8 changed files with 363 additions and 58 deletions
27
src/full_text_parser/readability/tests.rs
Normal file
27
src/full_text_parser/readability/tests.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
use libxml::tree::{Document, Node};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::full_text_parser::config::ConfigEntry;
|
||||
|
||||
async fn prepare(html: &str, url: &Url) -> Document {
|
||||
let empty_config = ConfigEntry::default();
|
||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
|
||||
document
|
||||
}
|
||||
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_1() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
|
||||
.expect("Failed to read HTML");
|
||||
let url = Url::parse("http://google.com").unwrap();
|
||||
let document = prepare(&html, &url).await;
|
||||
|
||||
let mut root = Node::new("article", None, &document).unwrap();
|
||||
|
||||
super::Readability::extract_body(document, &mut root).unwrap();
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue