improve title extraction

2025-07-08 16:40:00 +02:00 · 2023-02-20 02:32:58 +01:00 · 2023-02-20 02:32:58 +01:00 · 98c06e11f4
commit 98c06e11f4
parent cce912c354
7 changed files with 107 additions and 54 deletions
--- a/src/full_text_parser/readability/tests.rs
+++ b/src/full_text_parser/readability/tests.rs
@@ -1,17 +1,30 @@
-use libxml::tree::{Document, Node};
+use libxml::{
+    tree::{Document, Node},
+    xpath::Context,
+};
 use reqwest::Url;

-use crate::full_text_parser::config::ConfigEntry;
+use crate::{
+    article::Article,
+    full_text_parser::{config::ConfigEntry, metadata},
+};

-async fn prepare(html: &str, url: &Url) -> Document {
+async fn prepare(html: &str, url: &Url) -> (Document, Context, Article) {
    let empty_config = ConfigEntry::default();
    let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
    let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
    crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
-    document
+    let article = Article {
+        title: None,
+        author: None,
+        url: url.clone(),
+        date: None,
+        thumbnail_url: None,
+        document: None,
+    };
+    (document, xpath_ctx, article)
 }

-
 #[tokio::test]
 async fn test_1() {
    let _ = env_logger::builder().is_test(true).try_init();
@@ -19,9 +32,11 @@ async fn test_1() {
    let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
        .expect("Failed to read HTML");
    let url = Url::parse("http://google.com").unwrap();
-    let document = prepare(&html, &url).await;
+    let (document, xpath_ctx, mut article) = prepare(&html, &url).await;

    let mut root = Node::new("article", None, &document).unwrap();

-    super::Readability::extract_body(document, &mut root).unwrap();
+    metadata::extract(&xpath_ctx, None, None, &mut article);
+
+    super::Readability::extract_body(document, &mut root, article.title.as_deref()).unwrap();
 }