mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-10 01:15:31 +02:00
special handling trying to find single page links: fixes youtube
This commit is contained in:
parent
7b1b027c6d
commit
8c2af14871
6 changed files with 226 additions and 174 deletions
164
src/util.rs
164
src/util.rs
|
@ -1,8 +1,15 @@
|
|||
use failure::ResultExt;
|
||||
use reqwest::header::{HeaderMap, HeaderValue, HeaderName};
|
||||
use libxml::{tree::Node, xpath::Context};
|
||||
use reqwest::{
|
||||
header::{HeaderMap, HeaderName, HeaderValue},
|
||||
Response,
|
||||
};
|
||||
use tokio::fs::DirEntry;
|
||||
|
||||
use crate::{config::ConfigEntry, error::{ScraperErrorKind, ScraperError}};
|
||||
use crate::{
|
||||
config::ConfigEntry,
|
||||
error::{ScraperError, ScraperErrorKind},
|
||||
};
|
||||
|
||||
/// Namespace-only struct: groups the stateless scraping helper functions below.
pub struct Util;
|
||||
|
||||
|
@ -15,7 +22,7 @@ impl Util {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
|
||||
pub fn str_extract_value<'a>(identifier: &str, line: &'a str) -> &'a str {
|
||||
let value = &line[identifier.len()..];
|
||||
let value = value.trim();
|
||||
match value.find('#') {
|
||||
|
@ -39,23 +46,164 @@ impl Util {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn generate_headers(site_specific_rule: Option<&ConfigEntry>, global_rule: &ConfigEntry) -> Result<HeaderMap, ScraperError> {
|
||||
pub fn generate_headers(
|
||||
site_specific_rule: Option<&ConfigEntry>,
|
||||
global_rule: &ConfigEntry,
|
||||
) -> Result<HeaderMap, ScraperError> {
|
||||
let mut headers = HeaderMap::new();
|
||||
|
||||
if let Some(config) = site_specific_rule {
|
||||
for header in &config.header {
|
||||
let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
|
||||
let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
|
||||
let name = HeaderName::from_bytes(header.name.as_bytes())
|
||||
.context(ScraperErrorKind::Config)?;
|
||||
let value = header
|
||||
.value
|
||||
.parse::<HeaderValue>()
|
||||
.context(ScraperErrorKind::Config)?;
|
||||
headers.insert(name, value);
|
||||
}
|
||||
}
|
||||
|
||||
for header in &global_rule.header {
|
||||
let name = HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
|
||||
let value = header.value.parse::<HeaderValue>().context(ScraperErrorKind::Config)?;
|
||||
let name =
|
||||
HeaderName::from_bytes(header.name.as_bytes()).context(ScraperErrorKind::Config)?;
|
||||
let value = header
|
||||
.value
|
||||
.parse::<HeaderValue>()
|
||||
.context(ScraperErrorKind::Config)?;
|
||||
headers.insert(name, value);
|
||||
}
|
||||
|
||||
Ok(headers)
|
||||
}
|
||||
|
||||
pub fn find_page_url(xpath_ctx: &Context, xpath_page_link: &str) -> Option<url::Url> {
|
||||
let res = Self::evaluate_xpath(&xpath_ctx, &xpath_page_link, false).ok()?;
|
||||
let mut url = None;
|
||||
|
||||
for node in res {
|
||||
let content = node.get_content();
|
||||
let url_str = if content.trim().is_empty() && node.has_attribute("href") {
|
||||
node.get_attribute("href").unwrap()
|
||||
} else {
|
||||
content
|
||||
};
|
||||
|
||||
if let Ok(parsed_url) = url::Url::parse(&url_str) {
|
||||
url = Some(parsed_url);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
url
|
||||
}
|
||||
|
||||
pub fn evaluate_xpath(
|
||||
xpath_ctx: &Context,
|
||||
xpath: &str,
|
||||
thorw_if_empty: bool,
|
||||
) -> Result<Vec<Node>, ScraperError> {
|
||||
let res = xpath_ctx.evaluate(xpath).map_err(|()| {
|
||||
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
||||
ScraperErrorKind::Xml
|
||||
})?;
|
||||
|
||||
let node_vec = res.get_nodes_as_vec();
|
||||
|
||||
if node_vec.is_empty() {
|
||||
log::debug!("Evaluation of xpath '{}' yielded no results", xpath);
|
||||
if thorw_if_empty {
|
||||
return Err(ScraperErrorKind::Xml.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(node_vec)
|
||||
}
|
||||
|
||||
pub fn check_content_type(response: &Response) -> Result<bool, ScraperError> {
|
||||
if response.status().is_success() {
|
||||
if let Some(content_type) = response.headers().get(reqwest::header::CONTENT_TYPE) {
|
||||
if let Ok(content_type) = content_type.to_str() {
|
||||
if content_type.contains("text/html") {
|
||||
return Ok(true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
log::error!("Content type is not text/HTML");
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
log::error!("Failed to determine content type");
|
||||
Err(ScraperErrorKind::Http.into())
|
||||
}
|
||||
|
||||
pub fn check_redirect(response: &Response, original_url: &url::Url) -> Option<url::Url> {
|
||||
if response.status() == reqwest::StatusCode::PERMANENT_REDIRECT {
|
||||
log::debug!("Article url redirects to '{}'", response.url().as_str());
|
||||
return Some(response.url().clone());
|
||||
} else if response.url() != original_url {
|
||||
return Some(response.url().clone());
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
pub fn extract_value(context: &Context, xpath: &str) -> Result<String, ScraperError> {
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, false)?;
|
||||
if let Some(val) = node_vec.get(0) {
|
||||
return Ok(val.get_content());
|
||||
}
|
||||
|
||||
Err(ScraperErrorKind::Xml.into())
|
||||
}
|
||||
|
||||
pub fn extract_value_merge(context: &Context, xpath: &str) -> Result<String, ScraperError> {
|
||||
let node_vec = Util::evaluate_xpath(context, xpath, true)?;
|
||||
let mut val = String::new();
|
||||
for node in node_vec {
|
||||
let part = node
|
||||
.get_content()
|
||||
.split_whitespace()
|
||||
.map(|s| format!("{} ", s))
|
||||
.collect::<String>();
|
||||
val.push_str(&part);
|
||||
val.push_str(" ");
|
||||
}
|
||||
|
||||
Ok(val.trim().to_string())
|
||||
}
|
||||
|
||||
pub fn strip_node(context: &Context, xpath: &str) -> Result<(), ScraperError> {
|
||||
let mut ancestor = xpath.to_string();
|
||||
if ancestor.starts_with("//") {
|
||||
ancestor = ancestor.chars().skip(2).collect();
|
||||
}
|
||||
|
||||
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
||||
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
||||
for mut node in node_vec {
|
||||
node.unlink();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn strip_id_or_class(context: &Context, id_or_class: &str) -> Result<(), ScraperError> {
|
||||
let xpath = &format!(
|
||||
"//*[contains(@class, '{}') or contains(@id, '{}')]",
|
||||
id_or_class, id_or_class
|
||||
);
|
||||
|
||||
let mut ancestor = xpath.clone();
|
||||
if ancestor.starts_with("//") {
|
||||
ancestor = ancestor.chars().skip(2).collect();
|
||||
}
|
||||
|
||||
let query = &format!("{}[not(ancestor::{})]", xpath, ancestor);
|
||||
let node_vec = Util::evaluate_xpath(context, query, false)?;
|
||||
for mut node in node_vec {
|
||||
node.unlink();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue