diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 5ff0bac..948fe98 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] thiserror = "1.0" libxml = "0.3" reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1.27", features = ["macros", "fs", "io-util"] } +tokio = { version = "1.28", features = ["macros", "fs", "io-util"] } url = "2.3" regex = "1.8" encoding_rs = "0.8" diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index aec3967..c7913e3 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -1,4 +1,3 @@ -use libxml::tree::Node; use reqwest::Url; use crate::full_text_parser::error::FullTextParserError; @@ -47,17 +46,21 @@ pub fn clean_html(html: &str, base_url: &Url) -> Result"); + log::debug!("Failed to add iframe as child of video wrapper
"); } } else { log::warn!("Failed to get parent of iframe"); diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index 4eb4f24..be04acf 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -547,6 +547,24 @@ impl Util { vec } + pub fn get_first_element_by_tag_name(node: &Node, tag: &str) -> Option { + let tag = tag.to_uppercase(); + + fn get_elems(node: &Node, tag: &str) -> Option { + for child in node.get_child_elements() { + if child.get_name().to_uppercase() == tag { + return Some(child.clone()); + } else { + return get_elems(&child, tag); + } + } + + None + } + + get_elems(node, &tag) + } + pub fn get_link_density(node: &Node) -> f64 { let text_length = Util::get_inner_text(node, true).len(); if text_length == 0 {