mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-08 00:19:59 +02:00
777 lines
26 KiB
Rust
use libxml::{
|
|
tree::{Node, NodeType},
|
|
xpath::Context,
|
|
};
|
|
use reqwest::{
|
|
header::{HeaderMap, HeaderName, HeaderValue},
|
|
Response,
|
|
};
|
|
use tokio::fs::DirEntry;
|
|
|
|
use crate::{
|
|
constants,
|
|
full_text_parser::{config::ConfigEntry, error::FullTextParserError},
|
|
};
|
|
|
|
/// Stateless namespace for the scraper's DOM/XPath/HTTP helper routines.
pub struct Util;
|
|
|
|
impl Util {
|
|
pub fn check_extension(path: &DirEntry, extension: &str) -> bool {
|
|
if let Some(ext) = path.path().extension() {
|
|
ext.to_str() == Some(extension)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Extracts the value following `identifier` on a config `line`,
/// trimming whitespace and dropping a trailing `#` comment.
///
/// FIX: the old `&line[identifier.len()..]` panicked when the line was
/// shorter than the identifier (or the cut landed inside a multi-byte
/// character); a checked slice now yields an empty value instead.
pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
    let value = line.get(identifier.len()..).unwrap_or_default();
    let value = value.trim();
    // Keep only the part before an inline '#' comment (untrimmed tail,
    // matching the original behavior).
    match value.find('#') {
        Some(pos) => &value[..pos],
        None => value,
    }
}
|
|
|
|
/// Splits a `|`-separated config value into its trimmed parts.
pub fn split_values(values: &str) -> Vec<&str> {
    values.split('|').map(str::trim).collect()
}
|
|
|
|
/// Picks the site-specific rule when present, otherwise the global one.
/// `Option::or` expresses the original if/else chain directly.
pub fn select_rule<'a>(
    site_specific_rule: Option<&'a str>,
    global_rule: Option<&'a str>,
) -> Option<&'a str> {
    site_specific_rule.or(global_rule)
}
|
|
|
|
pub fn generate_headers(
|
|
site_specific_rule: Option<&ConfigEntry>,
|
|
global_rule: &ConfigEntry,
|
|
) -> Result<HeaderMap, FullTextParserError> {
|
|
let mut headers = HeaderMap::new();
|
|
|
|
if let Some(config) = site_specific_rule {
|
|
for header in &config.header {
|
|
let name = HeaderName::from_bytes(header.name.as_bytes())
|
|
.map_err(|_| FullTextParserError::Config)?;
|
|
let value = header
|
|
.value
|
|
.parse::<HeaderValue>()
|
|
.map_err(|_| FullTextParserError::Config)?;
|
|
headers.insert(name, value);
|
|
}
|
|
}
|
|
|
|
for header in &global_rule.header {
|
|
let name = HeaderName::from_bytes(header.name.as_bytes())
|
|
.map_err(|_| FullTextParserError::Config)?;
|
|
let value = header
|
|
.value
|
|
.parse::<HeaderValue>()
|
|
.map_err(|_| FullTextParserError::Config)?;
|
|
headers.insert(name, value);
|
|
}
|
|
|
|
Ok(headers)
|
|
}
|
|
|
|
pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
|
|
let res = Self::evaluate_xpath(xpath_ctx, xpath_page_link, false).ok()?;
|
|
let mut url = None;
|
|
|
|
for node in res {
|
|
let content = node.get_content();
|
|
let url_str = if content.trim().is_empty() && node.has_attribute("href") {
|
|
node.get_attribute("href")
|
|
.expect("already checked for href")
|
|
} else {
|
|
content
|
|
};
|
|
|
|
if let Ok(parsed_url) = url::Url::parse(&url_str) {
|
|
url = Some(parsed_url);
|
|
break;
|
|
}
|
|
}
|
|
|
|
url
|
|
}
|
|
|
|
pub fn evaluate_xpath(
|
|
xpath_ctx: &Context,
|
|
xpath: &str,
|
|
thorw_if_empty: bool,
|
|
) -> Result<Vec<Node>, FullTextParserError> {
|
|
let res = xpath_ctx.evaluate(xpath).map_err(|()| {
|
|
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
|
FullTextParserError::Xml
|
|
})?;
|
|
|
|
let node_vec = res.get_nodes_as_vec();
|
|
|
|
if node_vec.is_empty() {
|
|
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
|
if thorw_if_empty {
|
|
return Err(FullTextParserError::Xml);
|
|
}
|
|
}
|
|
|
|
Ok(node_vec)
|
|
}
|
|
|
|
pub fn check_content_type(response: &Response) -> Result<bool, FullTextParserError> {
|
|
if response.status().is_success() {
|
|
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
|
if let Ok(content_type) = content_type.to_str() {
|
|
if content_type.contains("text/html") {
|
|
return Ok(true);
|
|
}
|
|
}
|
|
}
|
|
|
|
log::error!("Content type is not text/HTML");
|
|
return Ok(false);
|
|
}
|
|
|
|
log::error!("Failed to determine content type");
|
|
Err(FullTextParserError::Http)
|
|
}
|
|
|
|
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
|
|
if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
|
|
log::debug!("Article url redirects to '{}'", response.url().as_str());
|
|
return Some(response.url().clone());
|
|
} else if response.url() != original_url {
|
|
return Some(response.url().clone());
|
|
}
|
|
|
|
None
|
|
}
|
|
|
|
pub fn get_attribute(
|
|
context: &Context,
|
|
xpath: &str,
|
|
attribute: &str,
|
|
) -> Result<String, FullTextParserError> {
|
|
Util::evaluate_xpath(context, xpath, false)?
|
|
.iter()
|
|
.find_map(|node| node.get_attribute(attribute))
|
|
.ok_or(FullTextParserError::Xml)
|
|
}
|
|
|
|
pub fn extract_value(context: &Context, xpath: &str) -> Result<String, FullTextParserError> {
|
|
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
|
if let Some(val) = node_vec.get(0) {
|
|
return Ok(val.get_content());
|
|
}
|
|
|
|
Err(FullTextParserError::Xml)
|
|
}
|
|
|
|
pub fn extract_value_merge(
|
|
context: &Context,
|
|
xpath: &str,
|
|
) -> Result<String, FullTextParserError> {
|
|
let node_vec = Util::evaluate_xpath(context, xpath, true)?;
|
|
let mut val = String::new();
|
|
for node in node_vec {
|
|
let part = node
|
|
.get_content()
|
|
.split_whitespace()
|
|
.map(|s| format!("{} ", s))
|
|
.collect::<String>();
|
|
val.push_str(&part);
|
|
val.push(' ');
|
|
}
|
|
|
|
Ok(val.trim().to_string())
|
|
}
|
|
|
|
pub fn strip_node(context: &Context, xpath: &str) -> Result<(), FullTextParserError> {
|
|
let mut ancestor = xpath.to_string();
|
|
if ancestor.starts_with("//") {
|
|
ancestor = ancestor.chars().skip(2).collect();
|
|
}
|
|
|
|
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
|
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
|
|
|
for mut node in node_vec {
|
|
let tag_name = node.get_name();
|
|
if constants::EMBED_TAG_NAMES.contains(tag_name.to_uppercase().as_str())
|
|
&& node
|
|
.get_attributes()
|
|
.iter()
|
|
.any(|(_name, value)| constants::VIDEOS.is_match(value))
|
|
{
|
|
continue;
|
|
}
|
|
node.unlink();
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub fn strip_id_or_class(
|
|
context: &Context,
|
|
id_or_class: &str,
|
|
) -> Result<(), FullTextParserError> {
|
|
let xpath = &format!(
|
|
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
|
id_or_class, id_or_class
|
|
);
|
|
|
|
let mut ancestor = xpath.clone();
|
|
if ancestor.starts_with("//") {
|
|
ancestor = ancestor.chars().skip(2).collect();
|
|
}
|
|
|
|
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
|
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
|
for mut node in node_vec {
|
|
node.unlink();
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
pub fn is_probably_visible(node: &Node) -> bool {
|
|
let is_hidden = node.has_attribute("hidden");
|
|
let aria_hidden = node
|
|
.get_attribute("aria-hidden")
|
|
.map(|attr| attr == "true")
|
|
.unwrap_or(false);
|
|
let has_fallback_image = node.get_class_names().contains("fallback-image");
|
|
|
|
!is_hidden && !aria_hidden || has_fallback_image
|
|
}
|
|
|
|
pub fn is_whitespace(node: &Node) -> bool {
|
|
let content = node.get_content();
|
|
let tag_name = node.get_name().to_uppercase();
|
|
|
|
let is_text_node = node
|
|
.get_type()
|
|
.map(|t| t == NodeType::TextNode)
|
|
.unwrap_or(false);
|
|
let is_element_node = node
|
|
.get_type()
|
|
.map(|t| t == NodeType::ElementNode)
|
|
.unwrap_or(false);
|
|
|
|
(is_text_node && content.trim().is_empty()) || (is_element_node && tag_name == "BR")
|
|
}
|
|
|
|
pub fn remove_and_next(node: &mut Node) -> Option<Node> {
|
|
let next_node = Self::next_node(node, true);
|
|
node.unlink();
|
|
next_node
|
|
}
|
|
|
|
/// Returns the next node of a depth-first traversal: first child (unless
/// `ignore_self_and_kids`), then next sibling, then the closest ancestor's
/// next sibling — stopping at the `<html>` element.
pub fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
    let mut current = node.clone();

    // Descend into children unless the caller asked to skip the subtree.
    if !ignore_self_and_kids {
        if let Some(child) = current.get_first_child() {
            return Some(child);
        }
    }

    // A direct sibling comes next in document order.
    if let Some(sibling) = current.get_next_sibling() {
        return Some(sibling);
    }

    // Climb the parent chain looking for an ancestor with a sibling.
    // Parents themselves were already visited (depth-first), so only
    // their siblings are candidates; stop once we reach <html>.
    while let Some(parent) = current.get_parent() {
        if parent.get_name().to_uppercase() == "HTML" {
            break;
        }
        if let Some(sibling) = parent.get_next_sibling() {
            return Some(sibling);
        }
        current = parent;
    }

    None
}
|
|
|
|
pub fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
|
|
let content = node.get_content().trim().to_owned();
|
|
if normalize_spaces {
|
|
constants::NORMALIZE.replace(&content, " ").into()
|
|
} else {
|
|
content
|
|
}
|
|
}
|
|
|
|
pub fn text_similarity(a: &str, b: &str) -> f64 {
|
|
let a = a.to_lowercase();
|
|
let b = b.to_lowercase();
|
|
let tokens_a = constants::TOKENIZE.split(&a).collect::<Vec<_>>();
|
|
let tokens_b = constants::TOKENIZE.split(&b).collect::<Vec<_>>();
|
|
if tokens_a.is_empty() || tokens_b.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
let tokens_b_total = tokens_b.join(" ").len() as f64;
|
|
let uniq_tokens_b = tokens_b
|
|
.into_iter()
|
|
.filter(|token| !tokens_a.iter().any(|t| t == token))
|
|
.collect::<Vec<_>>();
|
|
let uniq_tokens_b_total = uniq_tokens_b.join(" ").len() as f64;
|
|
|
|
let distance_b = uniq_tokens_b_total / tokens_b_total;
|
|
1.0 - distance_b
|
|
}
|
|
|
|
pub fn has_ancestor_tag<F>(
|
|
node: &Node,
|
|
tag_name: &str,
|
|
max_depth: Option<u64>,
|
|
filter: Option<F>,
|
|
) -> bool
|
|
where
|
|
F: Fn(&Node) -> bool,
|
|
{
|
|
let max_depth = max_depth.unwrap_or(3);
|
|
let tag_name = tag_name.to_uppercase();
|
|
let mut depth = 0;
|
|
let mut node = node.get_parent();
|
|
|
|
loop {
|
|
if depth > max_depth {
|
|
return false;
|
|
}
|
|
|
|
let tmp_node = match node {
|
|
Some(node) => node,
|
|
None => return false,
|
|
};
|
|
|
|
if tmp_node.get_name() == tag_name
|
|
&& filter
|
|
.as_ref()
|
|
.map(|filter| filter(&tmp_node))
|
|
.unwrap_or(true)
|
|
{
|
|
return true;
|
|
}
|
|
|
|
node = tmp_node.get_parent();
|
|
depth += 1;
|
|
}
|
|
}
|
|
|
|
pub fn has_single_tag_inside_element(node: &Node, tag: &str) -> bool {
|
|
// There should be exactly 1 element child with given tag
|
|
if node.get_child_nodes().len() != 1
|
|
|| node
|
|
.get_child_nodes()
|
|
.first()
|
|
.map(|n| n.get_name().to_uppercase() != tag)
|
|
.unwrap_or(false)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
// And there should be no text nodes with real content
|
|
!node.get_child_nodes().iter().any(|n| {
|
|
n.get_type()
|
|
.map(|t| t == NodeType::TextNode)
|
|
.unwrap_or(false)
|
|
&& constants::HAS_CONTENT.is_match(&n.get_content())
|
|
})
|
|
}
|
|
|
|
pub fn is_element_without_content(node: &Node) -> bool {
|
|
if let Some(node_type) = node.get_type() {
|
|
let len = node.get_child_nodes().len();
|
|
|
|
node_type == NodeType::ElementNode
|
|
&& (len == 0
|
|
|| len
|
|
== Self::get_elements_by_tag_name(node, "br").len()
|
|
+ Self::get_elements_by_tag_name(node, "hr").len())
|
|
&& node.get_content().trim().is_empty()
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
/// Collects all descendant elements named `tag` (case-insensitive) in
/// depth-first pre-order; `"*"` matches every element.
pub fn get_elements_by_tag_name(node: &Node, tag: &str) -> Vec<Node> {
    // Recursive pre-order walk: a matching child is pushed before its
    // own descendants are inspected.
    fn walk(parent: &Node, wanted: &str, match_all: bool, out: &mut Vec<Node>) {
        for child in parent.get_child_elements() {
            if match_all || child.get_name().to_uppercase() == wanted {
                out.push(child.clone());
            }
            walk(&child, wanted, match_all, out);
        }
    }

    let wanted = tag.to_uppercase();
    let match_all = wanted == "*";
    let mut found = Vec::new();
    walk(node, &wanted, match_all, &mut found);
    found
}
|
|
|
|
pub fn get_link_density(node: &Node) -> f64 {
|
|
let text_length = Util::get_inner_text(node, false).len();
|
|
if text_length == 0 {
|
|
return 0.0;
|
|
}
|
|
|
|
let mut link_length = 0.0;
|
|
|
|
// XXX implement _reduceNodeList?
|
|
let link_nodes = Util::get_elements_by_tag_name(node, "A");
|
|
for link_node in link_nodes {
|
|
if let Some(href) = link_node.get_attribute("href") {
|
|
let coefficient = if constants::HASH_URL.is_match(&href) {
|
|
0.3
|
|
} else {
|
|
1.0
|
|
};
|
|
link_length += Util::get_inner_text(&link_node, false).len() as f64 * coefficient;
|
|
}
|
|
}
|
|
|
|
link_length / text_length as f64
|
|
}
|
|
|
|
// Determine whether element has any children block level elements.
|
|
pub fn has_child_block_element(node: &Node) -> bool {
|
|
node.get_child_nodes().iter().any(|node| {
|
|
constants::DIV_TO_P_ELEMS.contains(node.get_name().to_uppercase().as_str())
|
|
|| Self::has_child_block_element(node)
|
|
})
|
|
}
|
|
|
|
/// Returns up to `max_depth` ancestors of `node`, nearest first
/// (unbounded when `max_depth` is `None`).
pub fn get_node_ancestors(node: &Node, max_depth: Option<u64>) -> Vec<Node> {
    let mut chain = Vec::new();
    let mut current = node.clone();

    for _ in 0..max_depth.unwrap_or(u64::MAX) {
        match current.get_parent() {
            Some(parent) => {
                chain.push(parent.clone());
                current = parent;
            }
            // Reached the document root — nothing further to collect.
            None => break,
        }
    }

    chain
}
|
|
|
|
pub fn has_tag_name(node: Option<&Node>, tag_name: &str) -> bool {
|
|
node.map(|n| n.get_name().to_uppercase() == tag_name.to_uppercase())
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
// Check if node is image, or if node contains exactly only one image
|
|
// whether as a direct child or as its descendants.
|
|
pub fn is_single_image(node: &Node) -> bool {
|
|
if node.get_name().to_uppercase() == "IMG" {
|
|
true
|
|
} else if node.get_child_elements().len() != 1 || node.get_content().trim() != "" {
|
|
false
|
|
} else if let Some(first_child) = node.get_child_elements().first() {
|
|
Self::is_single_image(first_child)
|
|
} else {
|
|
false
|
|
}
|
|
}
|
|
|
|
// Clean an element of all tags of type "tag" if they look fishy.
|
|
// "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
|
|
pub fn clean_conditionally(root: &mut Node, tag: &str) {
|
|
// Gather counts for other typical elements embedded within.
|
|
// Traverse backwards so we can remove nodes at the same time
|
|
// without effecting the traversal.
|
|
//
|
|
// TODO: Consider taking into account original contentScore here.
|
|
let nodes = Util::get_elements_by_tag_name(root, tag);
|
|
|
|
for mut node in nodes.into_iter().rev() {
|
|
if Self::should_remove(&node, tag) {
|
|
node.unlink();
|
|
}
|
|
}
|
|
}
|
|
|
|
    // Decides whether a candidate element looks "fishy" enough to remove.
    // The scoring (commas, class weight, link density, image/embed counts)
    // appears to mirror Mozilla Readability's _cleanConditionally — TODO
    // confirm against the upstream source when touching the thresholds.
    fn should_remove(node: &Node, tag: &str) -> bool {
        // First check if this node IS data table, in which case don't remove it.
        let mut is_list = tag == "ul" || tag == "ol";
        if !is_list {
            // Treat the node as list-like when >90% of its text lives in
            // ul/ol descendants.
            let mut list_length = 0.0;
            let ul_nodes = Self::get_elements_by_tag_name(node, "ul");
            let ol_nodes = Self::get_elements_by_tag_name(node, "ol");
            for list_node in ul_nodes {
                list_length += Util::get_inner_text(&list_node, false).len() as f64;
            }
            for list_node in ol_nodes {
                list_length += Util::get_inner_text(&list_node, false).len() as f64;
            }
            // NOTE(review): if the node's own text is empty this divides by
            // zero (inf/NaN); inf > 0.9 marks it a list, NaN does not.
            is_list = (list_length / Util::get_inner_text(node, false).len() as f64) > 0.9;
        }

        // A table that was marked as a data table is always kept.
        if tag == "table" && Self::is_data_table(node) {
            return false;
        }

        // Next check if we're inside a data table, in which case don't remove it as well.
        if Self::has_ancestor_tag(node, "table", Some(u64::MAX), Some(Self::is_data_table)) {
            return false;
        }

        // Never strip content embedded in code blocks.
        if Self::has_ancestor_tag(node, "code", None, None::<fn(&Node) -> bool>) {
            return false;
        }

        // Negative class/id names (share, comment, sidebar, ...) alone are
        // enough to drop the node.
        let weight = Self::get_class_weight(node);
        if weight < 0 {
            return true;
        }

        if Self::get_char_count(node, ',') < 10 {
            // If there are not very many commas, and the number of
            // non-paragraph elements is more than paragraphs or other
            // ominous signs, remove the element.
            let p = Self::get_elements_by_tag_name(node, "p").len();
            let img = Self::get_elements_by_tag_name(node, "img").len();
            // li count is discounted by 100 so only genuinely list-heavy
            // nodes exceed the paragraph count below.
            let li = Self::get_elements_by_tag_name(node, "li").len() as i64 - 100;
            let input = Self::get_elements_by_tag_name(node, "input").len();
            let heading_density =
                Self::get_text_density(node, &["h1", "h2", "h3", "h4", "h5", "h6"]);

            let mut embed_count = 0;
            let embed_tags = ["object", "embed", "iframe"];

            for embed_tag in embed_tags {
                for embed_node in Self::get_elements_by_tag_name(node, embed_tag) {
                    // If this embed has attribute that matches video regex, don't delete it.
                    for (_name, value) in embed_node.get_attributes() {
                        if constants::VIDEOS.is_match(&value) {
                            return false;
                        }
                    }

                    // For embed with <object> tag, check inner HTML as well.
                    // if embed_node.get_name().to_lowercase() == "object" && constants::VIDEOS.is_match(embed_node.innerHTML) {
                    //     return false;
                    // }

                    embed_count += 1;
                }
            }

            let link_density = Self::get_link_density(node);
            let content = Self::get_inner_text(node, false);
            let content_length = content.len();

            // Combined "fishiness" verdict; each disjunct is one heuristic:
            // image-heavy without figure, list-less li overflow, form-like
            // input ratio, short low-heading content, link-heavy blocks,
            // and embed-dominated blocks.
            let have_to_remove = (img > 1
                && (p as f64 / img as f64) < 0.5
                && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
                || (!is_list && li > p as i64)
                || (input as f64 > f64::floor(p as f64 / 3.0))
                || (!is_list
                    && heading_density < 0.9
                    && content_length < 25
                    && (img == 0 || img > 2)
                    && !Self::has_ancestor_tag(node, "figure", None, None::<fn(&Node) -> bool>))
                || (!is_list && weight < 25 && link_density > 0.2)
                || (weight >= 25 && link_density > 0.5)
                || ((embed_count == 1 && content_length < 75) || embed_count > 1);

            // Allow simple lists of images to remain in pages
            if is_list && have_to_remove {
                for child in node.get_child_elements() {
                    // Don't filter in lists with li's that contain more than one child
                    if child.get_child_elements().len() > 1 {
                        return have_to_remove;
                    }
                }

                let li_count = Util::get_elements_by_tag_name(node, "li").len();
                // Only allow the list to remain if every li contains an image
                if img == li_count {
                    return false;
                }
            }

            have_to_remove
        } else {
            // Plenty of commas reads like real prose — keep the node.
            false
        }
    }
|
|
|
|
pub fn get_class_weight(node: &Node) -> i64 {
|
|
let mut weight = 0;
|
|
|
|
// Look for a special classname
|
|
if let Some(class_names) = node.get_property("class") {
|
|
if constants::NEGATIVE.is_match(&class_names) {
|
|
weight -= 25;
|
|
}
|
|
|
|
if constants::POSITIVE.is_match(&class_names) {
|
|
weight += 25;
|
|
}
|
|
}
|
|
|
|
// Look for a special ID
|
|
if let Some(class_names) = node.get_property("id") {
|
|
if constants::NEGATIVE.is_match(&class_names) {
|
|
weight -= 25;
|
|
}
|
|
|
|
if constants::POSITIVE.is_match(&class_names) {
|
|
weight += 25;
|
|
}
|
|
}
|
|
|
|
weight
|
|
}
|
|
|
|
fn get_char_count(node: &Node, char: char) -> usize {
|
|
Util::get_inner_text(node, false).split(char).count() - 1
|
|
}
|
|
|
|
fn get_text_density(node: &Node, tags: &[&str]) -> f64 {
|
|
let text_length = Util::get_inner_text(node, false).len();
|
|
if text_length == 0 {
|
|
return 0.0;
|
|
}
|
|
|
|
let mut children_length = 0;
|
|
for tag in tags {
|
|
for child in Self::get_elements_by_tag_name(node, tag) {
|
|
children_length += Util::get_inner_text(&child, false).len()
|
|
}
|
|
}
|
|
children_length as f64 / text_length as f64
|
|
}
|
|
|
|
fn is_data_table(node: &Node) -> bool {
|
|
node.get_attribute(constants::DATA_TABLE_ATTR)
|
|
.and_then(|is_data_table| is_data_table.parse::<bool>().ok())
|
|
.unwrap_or(false)
|
|
}
|
|
|
|
pub fn mark_data_tables(context: &Context) -> Result<(), FullTextParserError> {
|
|
let nodes = Util::evaluate_xpath(context, "//table", false)?;
|
|
for mut node in nodes {
|
|
if node
|
|
.get_attribute("role")
|
|
.map(|role| role == "presentation")
|
|
.unwrap_or(false)
|
|
{
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
|
|
continue;
|
|
}
|
|
|
|
if node
|
|
.get_attribute("datatable")
|
|
.map(|role| role == "0")
|
|
.unwrap_or(false)
|
|
{
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
|
|
continue;
|
|
}
|
|
|
|
if node.get_attribute("summary").is_some() {
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
|
|
continue;
|
|
}
|
|
|
|
if let Some(first_caption) = Self::get_elements_by_tag_name(&node, "caption").first() {
|
|
if !first_caption.get_child_nodes().is_empty() {
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// If the table has a descendant with any of these tags, consider a data table:
|
|
let data_table_descendants = ["col", "colgroup", "tfoot", "thead", "th"];
|
|
for descendant in data_table_descendants {
|
|
if !Self::get_elements_by_tag_name(&node, descendant).is_empty() {
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Nested tables indicate a layout table:
|
|
if !Self::get_elements_by_tag_name(&node, "table").is_empty() {
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "false");
|
|
continue;
|
|
}
|
|
|
|
let (rows, columns) = Self::get_row_and_column_count(&node);
|
|
if rows >= 10 || columns > 4 {
|
|
let _ = node.set_attribute(constants::DATA_TABLE_ATTR, "true");
|
|
continue;
|
|
}
|
|
|
|
// Now just go by size entirely:
|
|
let _ = node.set_attribute(
|
|
constants::DATA_TABLE_ATTR,
|
|
if rows * columns > 10 { "true" } else { "false" },
|
|
);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_row_and_column_count(node: &Node) -> (usize, usize) {
|
|
if node.get_name().to_uppercase() != "TABLE" {
|
|
return (0, 0);
|
|
}
|
|
|
|
let mut rows = 0;
|
|
let mut columns = 0;
|
|
|
|
let trs = Self::get_elements_by_tag_name(node, "tr");
|
|
for tr in trs {
|
|
let row_span = tr
|
|
.get_attribute("rowspan")
|
|
.and_then(|span| span.parse::<usize>().ok())
|
|
.unwrap_or(1);
|
|
rows += row_span;
|
|
|
|
// Now look for column-related info
|
|
let mut columns_in_this_row = 0;
|
|
let cells = Self::get_elements_by_tag_name(&tr, "td");
|
|
for cell in cells {
|
|
let colspan = cell
|
|
.get_attribute("colspan")
|
|
.and_then(|span| span.parse::<usize>().ok())
|
|
.unwrap_or(1);
|
|
columns_in_this_row += colspan;
|
|
}
|
|
columns = usize::max(columns, columns_in_this_row);
|
|
}
|
|
|
|
(rows, columns)
|
|
}
|
|
}
|