mirror of https://gitlab.com/news-flash/article_scraper.git

cli to parse single page with ftr

Jan Lukas Gernert, 2023-04-10 13:47:45 +02:00
commit fa41633e11 (parent d978059709)
4 changed files with 173 additions and 34 deletions
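In short, this commit gives FullTextParser a `parse_offline` method that runs already-downloaded HTML through the full-text (ftr) extraction pipeline without touching the network, re-exports the ftr `ConfigEntry` type as `FtrConfigEntry`, and adds a matching `ftr` subcommand to the CLI. Going by the clap definitions below, an invocation would look something like `article_scraper_cli ftr --html page.html --base-url https://example.com/ --config example.com.txt` (the binary name is assumed; it is not part of this diff).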


@@ -35,7 +35,69 @@ impl FullTextParser {
         Self { config_files }
     }
 
-    pub async fn parse(
+    pub async fn parse_offline(
+        &self,
+        html: &str,
+        config: Option<&ConfigEntry>,
+        base_url: Option<Url>,
+    ) -> Result<String, FullTextParserError> {
+        libxml::tree::node::set_node_rc_guard(10);
+
+        let global_config = self
+            .config_files
+            .get("global.txt")
+            .ok_or(FullTextParserError::Config)?;
+
+        let url =
+            base_url.unwrap_or_else(|| url::Url::parse("http://fakehost/test/base/").unwrap());
+
+        let mut article = Article {
+            title: None,
+            author: None,
+            url,
+            date: None,
+            thumbnail_url: None,
+            document: None,
+        };
+
+        let mut document = Document::new().map_err(|()| FullTextParserError::Xml)?;
+        let mut root =
+            Node::new("article", None, &document).map_err(|()| FullTextParserError::Xml)?;
+        document.set_root_element(&root);
+
+        Self::generate_head(&mut root, &document)?;
+
+        let document = Self::parse_html(html, config, global_config)?;
+        let xpath_ctx = Self::get_xpath_ctx(&document)?;
+
+        metadata::extract(&xpath_ctx, config, Some(global_config), &mut article);
+        if article.thumbnail_url.is_none() {
+            Self::check_for_thumbnail(&xpath_ctx, &mut article);
+        }
+        Self::prep_content(&xpath_ctx, config, global_config, &article.url, &document);
+        let found_body = Self::extract_body(&xpath_ctx, &mut root, config, global_config)?;
+        if !found_body {
+            log::error!("Ftr failed to find content");
+            return Err(FullTextParserError::Scrape);
+        }
+
+        while let Some(url) = self.check_for_next_page(&xpath_ctx, config, global_config) {
+            log::info!("Next page url: {url}");
+        }
+
+        if let Err(error) = Self::prevent_self_closing_tags(&xpath_ctx) {
+            log::error!("Preventing self closing tags failed - '{error}'");
+            return Err(error);
+        }
+
+        Self::post_process_document(&document)?;
+
+        article.document = Some(document);
+        let html = article.get_content().ok_or(FullTextParserError::Scrape)?;
+        Ok(html)
+    }
+
+    pub(crate) async fn parse(
         &self,
         url: &url::Url,
         client: &Client,
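For illustration, a minimal sketch of driving the new method from library code, mirroring how the CLI below calls it; the tokio runtime, the input file name, and the base URL are assumptions, not part of the commit:

use article_scraper::FullTextParser;
use url::Url;

#[tokio::main]
async fn main() {
    // HTML obtained elsewhere; parse_offline itself never downloads anything.
    let html = std::fs::read_to_string("page.html").expect("failed to read input");

    // Optional base to resolve relative links; without it the method above
    // falls back to the http://fakehost/test/base/ placeholder.
    let base_url = Url::parse("https://example.com/").ok();

    // Constructed the same way the new CLI subcommand constructs it.
    let parser = FullTextParser::new(None).await;

    // No site-specific ConfigEntry is passed here, so presumably only the
    // global.txt rules apply.
    match parser.parse_offline(&html, None, base_url).await {
        Ok(article_html) => println!("{article_html}"),
        Err(err) => eprintln!("ftr extraction failed: {err}"),
    }
}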


@@ -7,6 +7,7 @@ mod util;
 use article::Article;
 use error::ScraperError;
 
+pub use full_text_parser::config::ConfigEntry as FtrConfigEntry;
 pub use full_text_parser::FullTextParser;
 pub use full_text_parser::Readability;
 use images::ImageDownloader;


@@ -32,4 +32,22 @@ pub enum Commands {
         #[arg(long, value_name = "URL")]
         source_url: Option<String>,
     },
+    Ftr {
+        /// Source HTML file
+        #[arg(long, value_name = "FILE")]
+        html: Option<PathBuf>,
+
+        /// Base to complete relative Url
+        #[arg(long, value_name = "URL")]
+        base_url: Option<String>,
+
+        /// Source Url to download HTML from
+        #[arg(long, value_name = "URL")]
+        source_url: Option<String>,
+
+        /// The Ftr config to use
+        /// Otherwise source_url and base_url will be used
+        #[arg(long, value_name = "domain")]
+        config: Option<String>,
+    },
 }
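A quick sanity check of the new flag wiring, as a sketch: it assumes `Args` is the clap entry point whose `command` field main.rs matches on, that `parse_from` ignores its first element (the program name), and that the file names are placeholders:

#[test]
fn ftr_subcommand_parses() {
    use clap::Parser;

    let args = Args::parse_from([
        "article_scraper_cli",
        "ftr",
        "--html", "page.html",
        "--base-url", "https://example.com/",
        "--config", "example.com.txt",
    ]);

    // clap derives the subcommand name "ftr" from the Ftr variant.
    match args.command {
        Commands::Ftr { html, base_url, config, .. } => {
            assert!(html.is_some());
            assert_eq!(base_url.as_deref(), Some("https://example.com/"));
            assert_eq!(config.as_deref(), Some("example.com.txt"));
        }
        _ => panic!("expected the ftr subcommand"),
    }
}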


@@ -1,6 +1,8 @@
+use std::path::Path;
 use std::{path::PathBuf, process::exit};
 
 use crate::args::{Args, Commands};
+use article_scraper::FtrConfigEntry;
 use article_scraper::FullTextParser;
 use article_scraper::Readability;
 use clap::Parser;
@@ -34,9 +36,62 @@ async fn main() {
             base_url,
             source_url,
         } => extract_readability(html, source_url, base_url, args.output).await,
+        Commands::Ftr {
+            html,
+            base_url,
+            source_url,
+            config,
+        } => extract_ftr(html, source_url, base_url, config, args.output).await,
     }
+}
 
-    log::info!("hello world");
+async fn extract_ftr(
+    html_file: Option<PathBuf>,
+    source_url: Option<String>,
+    base_url: Option<String>,
+    config: Option<String>,
+    output: Option<PathBuf>,
+) {
+    let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
+    let html = get_html(html_file, source_url).await;
+
+    let config = if let Some(config_path) = config {
+        match FtrConfigEntry::parse_path(Path::new(&config_path)).await {
+            Ok(entry) => Some(entry),
+            Err(error) => {
+                log::error!("Failed to parse config entry {config_path}: {error}");
+                exit(0);
+            }
+        }
+    } else {
+        None
+    };
+
+    let full_text_parser = FullTextParser::new(None).await;
+    let result = match full_text_parser
+        .parse_offline(&html, config.as_ref(), base_url)
+        .await
+    {
+        Ok(res) => res,
+        Err(err) => {
+            log::error!("Failed to extract content with ftr: {err}");
+            exit(0);
+        }
+    };
+
+    let output = if let Some(output) = output {
+        output
+    } else {
+        PathBuf::from("result.html")
+    };
+
+    match std::fs::write(&output, result) {
+        Ok(()) => log::info!("successfully written result to {output:?}"),
+        Err(err) => {
+            log::error!("Failed to write to file {output:?}: {err}");
+            exit(0);
+        }
+    }
 }
 
 async fn extract_readability(
@@ -45,39 +100,8 @@ async fn extract_readability(
     base_url: Option<String>,
     output: Option<PathBuf>,
 ) {
-    if html_file.is_none() && source_url.is_none() {
-        log::error!("either need a source html file or source url");
-        exit(0);
-    }
-
-    if html_file.is_some() && source_url.is_some() {
-        log::error!("load source from html file or url? only specify one of the two options");
-        exit(0);
-    }
-
-    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
     let base_url = base_url.map(|url| Url::parse(&url).expect("invalid base url"));
-
-    let html = if let Some(source_url) = source_url {
-        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to download html from url: {err}");
-                exit(0);
-            }
-        }
-    } else if let Some(source_file) = html_file {
-        match std::fs::read_to_string(&source_file) {
-            Ok(html) => html,
-            Err(err) => {
-                log::error!("Failed to read file {source_file:?}: {err}");
-                exit(0);
-            }
-        }
-    } else {
-        unreachable!()
-    };
+    let html = get_html(html_file, source_url).await;
 
     let result = match Readability::extract_from_str(&html, base_url).await {
         Ok(res) => res,
         Err(err) => {
@@ -100,3 +124,37 @@ async fn extract_readability(
         }
     }
 }
+
+async fn get_html(html_file: Option<PathBuf>, source_url: Option<String>) -> String {
+    if html_file.is_none() && source_url.is_none() {
+        log::error!("either need a source html file or source url");
+        exit(0);
+    }
+
+    if html_file.is_some() && source_url.is_some() {
+        log::error!("load source from html file or url? only specify one of the two options");
+        exit(0);
+    }
+
+    let source_url = source_url.map(|url| Url::parse(&url).expect("invalid source url"));
+
+    if let Some(source_url) = source_url {
+        match FullTextParser::download(&source_url, &Client::new(), HeaderMap::new()).await {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to download html from url: {err}");
+                exit(0);
+            }
+        }
+    } else if let Some(source_file) = html_file {
+        match std::fs::read_to_string(&source_file) {
+            Ok(html) => html,
+            Err(err) => {
+                log::error!("Failed to read file {source_file:?}: {err}");
+                exit(0);
+            }
+        }
+    } else {
+        unreachable!()
+    }
+}