Jan Lukas Gernert 2023-06-21 23:48:09 +02:00
parent e0ccd7e0b3
commit 582834cdf1
6 changed files with 17 additions and 16 deletions

View file

@@ -29,7 +29,6 @@ let scraper = ArticleScraper::new(None);
let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html");
let client = Client::new();
let article = scraper.parse(&url, false, &client, None).await.unwrap();
let html = article.get_doc_content();
```
# CLI
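Both usage examples touched by this commit (the markdown example above and the doc-comment examples further down) simply drop the `let html = article.get_doc_content();` line. The updated test in this diff reads the rendered document from the `html` field of the returned article instead, so a consumer of the new API might look roughly like the sketch below; the helper name is mine, and the import paths and the `html: Option<String>` field are assumed from the test hunk, not from crate documentation.

```rust
use article_scraper::ArticleScraper;
use reqwest::Client;
use url::Url;

// Sketch only: mirrors the doc example above, but consumes the result via
// the `html` field (as the updated integration test in this diff does)
// instead of the removed `get_doc_content()` call.
async fn rendered_html(scraper: &ArticleScraper, url: &Url) -> Option<String> {
    let client = Client::new();
    let article = scraper.parse(url, false, &client, None).await.ok()?;
    article.html
}
```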

View file

@@ -1 +1 @@
<article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"/></article>
<article><iframe id="video" width="100%" height="100%" src="https://www.youtube.com/embed/8KjaIumu-jI?feature=oembed" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" title="RIGGED! Arena Shuffler is BROKEN | 13 Land Mono Red Burn"><empty></empty></iframe></article>

View file

@@ -84,18 +84,28 @@ impl FullTextParser
.download_all_pages(html, client, config, global_config, &url)
.await?;
self.parse_offline(pages, config, global_config, Some(url))
self.parse_offline(pages, config, Some(url))
}
pub fn parse_offline(
&self,
pages: Vec<String>,
config: Option<&ConfigEntry>,
global_config: &ConfigEntry,
url: Option<Url>,
) -> Result<Article, FullTextParserError> {
let url = url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
let config = if config.is_none() {
self.get_grabber_config(&url)
} else {
config
};
let global_config = self
.config_files
.get("global.txt")
.ok_or(FullTextParserError::Config)?;
let mut article = Article {
title: None,
author: None,
@@ -1033,7 +1043,8 @@ impl FullTextParser
let xpath = "//*[not(node())]";
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
for mut node in node_vec {
if node.get_name() == "meta" {
let name = node.get_name().to_lowercase();
if name == "meta" || name == "img" || name == "br" {
continue;
}
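Two things change in this file. First, `parse_offline` drops its `global_config: &ConfigEntry` parameter and instead resolves the per-site config via `get_grabber_config` and loads `global.txt` from its own `config_files` map, so every caller in this diff now passes only the pages, an optional site config, and an optional base URL. Second, the empty-element fixup now exempts `img` and `br` in addition to `meta` (all void elements), while other empty nodes, like the `<iframe>` in the fixture above, still receive the placeholder child visible in the updated expected output. A rough sketch of the new calling convention, modelled on the updated test and CLI hunks below (the import path is an assumption):

```rust
use article_scraper::FullTextParser;
use url::Url;

// Sketch of the post-change call: no global `ConfigEntry` is passed any
// more, since the parser now pulls `global.txt` from its own config files.
async fn parse_saved_page(html: String, base_url: Option<Url>) -> Option<String> {
    let parser = FullTextParser::new(None).await;
    let article = parser.parse_offline(vec![html], None, base_url).ok()?;
    article.html
}
```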

View file

@@ -14,9 +14,7 @@ async fn run_test(name: &str, url: &str, title: Option<&str>, author: Option<&st
.expect("Failed to read source HTML");
let parser = FullTextParser::new(None).await;
let article = parser
.parse_offline(vec![html], None, &ConfigEntry::default(), Some(url))
.unwrap();
let article = parser.parse_offline(vec![html], None, Some(url)).unwrap();
let content = article.html.unwrap();

View file

@@ -30,7 +30,6 @@
//! let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
//! let client = Client::new();
//! let article = scraper.parse(&url, false, &client, None).await.unwrap();
//! let html = article.get_doc_content();
//! }
//! ```
@@ -105,7 +104,6 @@ impl ArticleScraper {
/// let url = Url::parse("https://www.nytimes.com/interactive/2023/04/21/science/parrots-video-chat-facetime.html").unwrap();
/// let client = Client::new();
/// let article = scraper.parse(&url, false, &client, None).await.unwrap();
/// let html = article.get_doc_content();
/// }
/// ```
pub async fn parse(

View file

@@ -119,12 +119,7 @@ async fn extract_ftr(
};
let full_text_parser = FullTextParser::new(None).await;
let article = match full_text_parser.parse_offline(
vec![html],
config.as_ref(),
&FtrConfigEntry::default(),
base_url,
) {
let article = match full_text_parser.parse_offline(vec![html], config.as_ref(), base_url) {
Ok(res) => res,
Err(err) => {
log::error!("Failed to extract content with ftr: {err}");