1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-07 16:15:32 +02:00

first content extraction kinda working

This commit is contained in:
Jan Lukas Gernert 2023-02-20 00:29:44 +01:00
parent 2c76a869e7
commit cce912c354
8 changed files with 363 additions and 58 deletions

View file

@ -11,14 +11,17 @@ repository = "https://gitlab.com/news-flash/article_scraper"
thiserror = "1.0"
libxml = "0.3"
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
tokio = { version = "1.22", features = ["macros", "fs", "io-util"] }
tokio = { version = "1.25", features = ["macros", "fs", "io-util"] }
url = "2.3"
regex = "1.7"
encoding_rs = "0.8"
chrono = "0.4"
base64 = "0.20"
base64 = "0.21"
image = "0.24"
log = "0.4"
rust-embed="6.4"
once_cell = "1.16"
escaper = "0.1"
[dev-dependencies]
env_logger = "0.10"

View file

@ -0,0 +1,234 @@
<!DOCTYPE html>
<html class="no-js" lang="en">
<head>
<meta charset="utf-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
<title>Get your Frontend JavaScript Code Covered | Code | Nicolas Perriault</title>
<meta
name="description" content="Nicolas Perriault's homepage."/>
<meta name="viewport" content="width=device-width"/>
<link href="//fonts.googleapis.com/css?family=Asap:400,400italic,700,700italic&amp;subset=latin,latin-ext"
rel="stylesheet" type="text/css"/>
<link rel="stylesheet" type="text/css" href="/static/packed.css?1412806084"/>
<link rel="alternate" type="application/rss+xml" href="/code/feed/" title="Code (RSS)"/>
<link rel="alternate" type="application/rss+xml" href="/photography/feed/"
title="Photography (RSS)"/>
<link rel="alternate" type="application/rss+xml" href="/talks/feed/" title="Talks (RSS)"/>
<link rel="alternate" type="application/rss+xml" href="/carnet/feed/"
title="Carnet (RSS)"/>
<link rel="alternate" type="application/rss+xml" href="/feed/" title="Everything (RSS)"/>
<!--[if lt IE 9]>
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
<![endif]-->
</head>
<body class="code " onload="prettyPrint()">
<!--[if lt IE 7]>
<p class="chromeframe">Your browser is <em>ancient!</em> Please <a href="http://www.quirksmode.org/upgrade.html">upgrade</a>.</p>
<![endif]-->
<div class="container">
<header class="main-title">
<h1><a href="/">Hi, I'm <strong>Nicolas.</strong></a></h1>
<small>I code stuff. I take photos. I write rants.</small>
</header>
<main class="contents" role="main">
<article lang="en" class="code" itemscope="" itemtype="http://schema.org/BlogPosting">
<link itemprop="url" href="/code/2013/get-your-frontend-javascript-code-covered/"/>
<header>
<h2><a itemprop="name" href="/code/2013/get-your-frontend-javascript-code-covered/">Get your Frontend JavaScript Code Covered</a></h2>
</header>
<section>
<p><strong>So finally you're <a href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">testing your frontend JavaScript code</a>? Great! The more you
write tests, the more confident you are with your code… but how much precisely?
That's where <a href="http://en.wikipedia.org/wiki/Code_coverage">code coverage</a> might
help.</strong>
</p>
<p>The idea behind code coverage is to record which parts of your code (functions,
statements, conditionals and so on) have been executed by your test suite,
to compute metrics out of these data and usually to provide tools for navigating
and inspecting them.</p>
<p>Not a lot of frontend developers I know actually test their frontend code,
and I can barely imagine how many of them have ever setup code coverage…
Mostly because there are not many frontend-oriented tools in this area
I guess.</p>
<p>Actually I've only found one which provides an adapter for <a href="http://visionmedia.github.io/mocha/">Mocha</a> and
actually works…</p>
<blockquote class="twitter-tweet tw-align-center">
<p>Drinking game for web devs:
<br />(1) Think of a noun
<br />(2) Google "&lt;noun&gt;.js"
<br />(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay)
<a
href="https://twitter.com/ironshay/statuses/370525864523743232">August 22, 2013</a>
</blockquote>
<p><strong><a href="http://blanketjs.org/">Blanket.js</a></strong> is an <em>easy to install, easy to configure,
and easy to use JavaScript code coverage library that works both in-browser and
with nodejs.</em>
</p>
<p>Its use is dead easy, adding Blanket support to your Mocha test suite
is just matter of adding this simple line to your HTML test file:</p>
<pre><code>&lt;script src="vendor/blanket.js"
data-cover-adapter="vendor/mocha-blanket.js"&gt;&lt;/script&gt;
</code></pre>
<p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js">blanket.js</a>,
<a
href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js">mocha-blanket.js</a>
</p>
<p>As an example, let's reuse the silly <code>Cow</code> example we used
<a
href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">in a previous episode</a>:</p>
<pre><code>// cow.js
(function(exports) {
"use strict";
function Cow(name) {
this.name = name || "Anon cow";
}
exports.Cow = Cow;
Cow.prototype = {
greets: function(target) {
if (!target)
throw new Error("missing target");
return this.name + " greets " + target;
}
};
})(this);
</code></pre>
<p>And its test suite, powered by Mocha and <a href="http://chaijs.com/">Chai</a>:</p>
<pre><code>var expect = chai.expect;
describe("Cow", function() {
describe("constructor", function() {
it("should have a default name", function() {
var cow = new Cow();
expect(cow.name).to.equal("Anon cow");
});
it("should set cow's name if provided", function() {
var cow = new Cow("Kate");
expect(cow.name).to.equal("Kate");
});
});
describe("#greets", function() {
it("should greet passed target", function() {
var greetings = (new Cow("Kate")).greets("Baby");
expect(greetings).to.equal("Kate greets Baby");
});
});
});
</code></pre>
<p>Let's create the HTML test file for it, featuring Blanket and its adapter
for Mocha:</p>
<pre><code>&lt;!DOCTYPE html&gt;
&lt;html&gt;
&lt;head&gt;
&lt;meta charset="utf-8"&gt;
&lt;title&gt;Test&lt;/title&gt;
&lt;link rel="stylesheet" media="all" href="vendor/mocha.css"&gt;
&lt;/head&gt;
&lt;body&gt;
&lt;div id="mocha"&gt;&lt;/div&gt;
&lt;div id="messages"&gt;&lt;/div&gt;
&lt;div id="fixtures"&gt;&lt;/div&gt;
&lt;script src="vendor/mocha.js"&gt;&lt;/script&gt;
&lt;script src="vendor/chai.js"&gt;&lt;/script&gt;
&lt;script src="vendor/blanket.js"
data-cover-adapter="vendor/mocha-blanket.js"&gt;&lt;/script&gt;
&lt;script&gt;mocha.setup('bdd');&lt;/script&gt;
&lt;script src="cow.js" data-cover&gt;&lt;/script&gt;
&lt;script src="cow_test.js"&gt;&lt;/script&gt;
&lt;script&gt;mocha.run();&lt;/script&gt;
&lt;/body&gt;
&lt;/html&gt;
</code></pre>
<p><strong>Notes</strong>:</p>
<ul>
<li>Notice the <code>data-cover</code> attribute we added to the script tag
loading the source of our library;</li>
<li>The HTML test file <em>must</em> be served over HTTP for the adapter to
be loaded.</li>
</ul>
<p>Running the tests now gives us something like this:</p>
<p>
<img alt="screenshot" src="/static/code/2013/blanket-coverage.png"/>
</p>
<p>As you can see, the report at the bottom highlights that we haven't actually
tested the case where an error is raised in case a target name is missing.
We've been informed of that, nothing more, nothing less. We simply know
we're missing a test here. Isn't this cool? I think so!</p>
<p>Just remember that code coverage will only <a href="http://codebetter.com/karlseguin/2008/12/09/code-coverage-use-it-wisely/">bring you numbers</a> and
raw information, not actual proofs that the whole of your <em>code logic</em> has
been actually covered. If you ask me, the best inputs you can get about
your code logic and implementation ever are the ones issued out of <a href="http://www.extremeprogramming.org/rules/pair.html">pair programming</a>
sessions
and <a href="http://alexgaynor.net/2013/sep/26/effective-code-review/">code reviews</a>
but that's another story.</p>
<p><strong>So is code coverage silver bullet? No. Is it useful? Definitely. Happy testing!</strong>
</p>
</section>
<aside>
<p> <span class="article-author" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
<span itemprop="name">Nicolas Perriault</span></span>
<time
datetime="2013-09-29" itemprop="datePublished">2013-09-29</time>— in <a href="/code/" itemprop="genre">Code</a>
<a href="/code/2013/get-your-frontend-javascript-code-covered/">Permalink</a>
<a
rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/">License</a><a href="http://flattr.com/submit/auto?url=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/&amp;title=Get your Frontend JavaScript Code Covered&amp;user_id=n1k0&amp;category=software&amp;language=en">flattr this</a>
</p>
</aside>
<hr/>
<nav> <a class="prev" href="/code/2013/functional-javascript-for-crawling-the-web/">Functional JavaScript for crawling the Web</a>
|
<a
class="next" href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">Testing your frontend JavaScript code using mocha, chai, and sinon</a>
</nav>
</article>
</main>
<nav class="sidebar">
<ul>
<li class="home"><a href="/" hreflang="en">Home</a>
</li>
<li class="code"><a href="/code/" hreflang="en">Code</a>
</li>
<li class="photography"><a href="/photography/" hreflang="en">Photography</a>
</li>
<li class="talks"><a href="/talks/" hreflang="en">Talks</a>
</li>
<li class="carnet"><a href="/carnet/" hreflang="fr">Carnet <span>fr</span></a>
</li>
<li class="contact"><a href="/contact/" hreflang="en">Contact</a>
</li>
</ul>
</nav>
<footer class="site-footer">
<p>© 2012 Nicolas Perriault — <a href="https://twitter.com/n1k0">Tweet at me</a>
<a
href="https://github.com/n1k0">Get my code</a><a href="http://500px.com/n1k0">Enjoy my pics</a>
<a href="/contact/">Contact me</a>
</p>
</footer>
</div>
<!-- /container -->
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
<script>
window.jQuery || document.write('&lt;script src="js/libs/jquery-1.7.1.min.js">&lt;\/script>')
</script>
<script type="text/javascript" src="/static/js/libs/prettify/prettify.js"></script>
<script type="text/javascript" src="/static/js/app.js"></script>
<script src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
</body>
</html>

View file

@ -34,6 +34,24 @@ pub struct ConfigEntry {
pub next_page_link: Option<String>,
}
impl Default for ConfigEntry {
fn default() -> Self {
Self {
xpath_title: Vec::new(),
xpath_author: Vec::new(),
xpath_date: Vec::new(),
xpath_body: Vec::new(),
xpath_strip: Vec::new(),
strip_id_or_class: Vec::new(),
strip_image_src: Vec::new(),
replace: Vec::new(),
header: Vec::new(),
single_page_link: None,
next_page_link: None,
}
}
}
impl ConfigEntry {
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
let mut file = fs::File::open(&config_path).await?;

View file

@ -181,8 +181,8 @@ impl FullTextParser {
Self::strip_junk(&xpath_ctx, config, global_config, url);
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
if found_body {
if let Err(error) = Readability::extract_body_readability(document, root) {
if !found_body {
if let Err(error) = Readability::extract_body(document, root) {
log::error!("Both ftr and readability failed to find content: {}", error);
return Err(error);
}

View file

@ -1,9 +1,12 @@
mod constants;
mod state;
#[cfg(test)]
mod tests;
use std::cmp::Ordering;
use libxml::tree::{Document, Node, NodeType};
use libxml::tree::{node, Document, Node, NodeType};
use self::state::State;
use super::error::FullTextParserError;
@ -11,13 +14,12 @@ use super::error::FullTextParserError;
pub struct Readability;
impl Readability {
pub fn extract_body_readability(
document: Document,
root: &mut Node,
) -> Result<bool, FullTextParserError> {
pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
node::set_node_rc_guard(6);
let mut state = State::default();
let mut document = document;
let mut attempts: Vec<(Node, usize)> = Vec::new();
let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
let document_cache = document
.dup()
.map_err(|()| FullTextParserError::Readability)?;
@ -75,7 +77,7 @@ impl Readability {
}
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
if tag_name == "DIV"
if (tag_name == "DIV"
|| tag_name == "SECTION"
|| tag_name == "HEADER"
|| tag_name == "H1"
@ -83,7 +85,8 @@ impl Readability {
|| tag_name == "H3"
|| tag_name == "H4"
|| tag_name == "H5"
|| tag_name == "H6" && Self::is_element_without_content(node_ref)
|| tag_name == "H6")
&& Self::is_element_without_content(node_ref)
{
node = Self::remove_and_next(node_ref);
continue;
@ -159,7 +162,7 @@ impl Readability {
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
// Then add their score to their parent node.
// A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
for element_to_score in elements_to_score {
for element_to_score in elements_to_score.drain(..) {
if element_to_score.get_parent().is_none() {
continue;
}
@ -195,7 +198,7 @@ impl Readability {
}
if Self::get_content_score(&ancestor).is_none() {
Self::initialize_node(&mut ancestor, &state);
Self::initialize_node(&mut ancestor, &state)?;
candidates.push(ancestor.clone());
}
@ -213,7 +216,7 @@ impl Readability {
if let Some(mut score) = Self::get_content_score(&ancestor) {
score += content_score / score_divider;
Self::set_content_score(&mut ancestor, score);
Self::set_content_score(&mut ancestor, score)?;
}
}
}
@ -226,7 +229,7 @@ impl Readability {
// unaffected by this operation.
if let Some(content_score) = Self::get_content_score(candidate) {
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
Self::set_content_score(candidate, candidate_score);
Self::set_content_score(candidate, candidate_score)?;
}
}
@ -244,11 +247,11 @@ impl Readability {
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
Self::initialize_node(root, &state);
let mut rt = document.get_root_element().unwrap();
Self::initialize_node(&mut rt, &state).unwrap();
needed_to_create_top_candidate = true;
root.clone()
rt
});
#[allow(unused_assignments)]
let mut parent_of_top_candidate = None;
let mut alternative_candidate_ancestors = Vec::new();
@ -257,8 +260,9 @@ impl Readability {
for top_candidate in &top_candidates {
if let Some(score) = Self::get_content_score(top_candidate) {
if score >= 0.75 {
alternative_candidate_ancestors
.push(Self::get_node_ancestors(top_candidate, 0));
if let Some(ancestor) = top_candidate.get_parent() {
alternative_candidate_ancestors.push(ancestor);
}
}
}
}
@ -273,15 +277,16 @@ impl Readability {
alternative_candidate_ancestors.len(),
constants::MINIMUM_TOPCANDIDATES,
);
for item in alternative_candidate_ancestors.iter().take(tmp) {
let tmp = item.iter().any(|n| n == parent);
lists_containing_this_ancestor += if tmp { 1 } else { 0 };
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
}
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
top_candidate = parent.clone();
break;
}
} else {
break;
}
parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
@ -289,7 +294,7 @@ impl Readability {
}
if Self::get_content_score(&top_candidate).is_none() {
Self::initialize_node(&mut top_candidate, &state);
Self::initialize_node(&mut top_candidate, &state)?;
}
// Because of our bonus system, parents of candidates might have scores
@ -353,7 +358,7 @@ impl Readability {
}
if Self::get_content_score(&top_candidate).is_none() {
Self::initialize_node(&mut top_candidate, &state);
Self::initialize_node(&mut top_candidate, &state)?;
}
// Now that we have the top candidate, look through its siblings for content
@ -432,6 +437,7 @@ impl Readability {
})?;
}
sibling.unlink();
article_content.add_child(&mut sibling).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
@ -471,6 +477,7 @@ impl Readability {
})?;
for mut child in article_content.get_child_nodes() {
child.unlink();
div.add_child(&mut child).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
@ -489,33 +496,31 @@ impl Readability {
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
let text_length = Self::get_inner_text(&article_content, true).len();
let text = Self::get_inner_text(&article_content, true);
let text_length = text.len();
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
parse_successful = false;
document = document_cache
.dup()
.map_err(|()| FullTextParserError::Readability)?;
if state.strip_unlikely {
state.strip_unlikely = false;
attempts.push((article_content, text_length));
attempts.push((article_content, text_length, document));
} else if state.weigh_classes {
state.weigh_classes = false;
attempts.push((article_content, text_length));
attempts.push((article_content, text_length, document));
} else if state.clean_conditionally {
state.clean_conditionally = false;
attempts.push((article_content, text_length));
attempts.push((article_content, text_length, document));
} else {
attempts.push((article_content, text_length));
attempts.push((article_content, text_length, document));
// No luck after removing flags, just return the longest text we found during the different loops
attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
// But first check if we actually have something
if let Some((best_attempt, _len)) = attempts.first() {
article_content = best_attempt.clone();
root.add_child(&mut article_content).map_err(|error| {
if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
best_attempt.unlink();
root.add_child(&mut best_attempt).map_err(|error| {
log::error!("{error}");
FullTextParserError::Readability
})?;
@ -524,6 +529,10 @@ impl Readability {
return Ok(parse_successful);
}
document = document_cache
.dup()
.map_err(|()| FullTextParserError::Readability)?;
} else {
root.add_child(&mut article_content).map_err(|error| {
log::error!("{error}");
@ -539,9 +548,12 @@ impl Readability {
.and_then(|a| a.parse::<f64>().ok())
}
fn set_content_score(node: &mut Node, score: f64) {
fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
node.set_attribute(constants::SCORE_ATTR, &score.to_string())
.expect("Failed to set content score");
.map_err(|err| {
log::error!("failed to set content score: {err}");
FullTextParserError::Readability
})
}
fn is_probably_visible(node: &Node) -> bool {
@ -580,6 +592,8 @@ impl Readability {
}
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
let mut node = node.clone();
// First check for kids if those aren't being ignored
let first_child = node.get_first_child();
if !ignore_self_and_kids && first_child.is_some() {
@ -602,9 +616,16 @@ impl Readability {
}
if let Some(parent) = parent {
let parent_name = parent.get_name().to_uppercase();
if parent_name == "HTML" {
break;
}
let next_sibling = parent.get_next_sibling();
if next_sibling.is_some() {
return next_sibling;
} else {
node = parent;
}
}
}
@ -649,11 +670,11 @@ impl Readability {
// the same as the article title.
fn header_duplicates_title(node: &Node) -> bool {
let name = node.get_name().to_lowercase();
if name != "h1" || name != "h2" {
if name != "h1" && name != "h2" {
return false;
}
let heading = Self::get_inner_text(node, false);
Self::text_similarity(&heading, "FIXME") > 0.75
Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
}
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
@ -759,10 +780,10 @@ impl Readability {
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
for child in node.get_child_elements() {
if all_tags || child.get_name() == tag {
vec.push(child);
if all_tags || child.get_name().to_uppercase() == tag {
vec.push(child.clone());
}
get_elems(node, tag, vec, all_tags);
get_elems(&child, tag, vec, all_tags);
}
}
@ -823,7 +844,7 @@ impl Readability {
let mut ancestors = Vec::new();
let mut node = node.clone();
for _ in 0..max_depth {
for _ in 0..=max_depth {
let parent = node.get_parent();
match parent {
Some(parent) => {
@ -839,7 +860,7 @@ impl Readability {
// Initialize a node with the readability object. Also checks the
// className/id for special names to add to its score.
fn initialize_node(node: &mut Node, state: &State) {
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
let score = match node.get_name().to_uppercase().as_str() {
"DIV" => 5,
"PRE" | "TD" | "BLOCKQUITE" => 3,
@ -848,7 +869,8 @@ impl Readability {
_ => 0,
};
let score = score + Self::get_class_weight(node, state);
Self::set_content_score(node, score as f64);
Self::set_content_score(node, score as f64)?;
Ok(())
}
fn get_class_weight(node: &Node, state: &State) -> i64 {

View file

@ -0,0 +1,27 @@
use libxml::tree::{Document, Node};
use reqwest::Url;
use crate::full_text_parser::config::ConfigEntry;
/// Parse `html` into a libxml `Document` and run the junk-stripping pass
/// against it, using an empty (default) site config.
///
/// Panics on parse/xpath failure — acceptable in this test-only helper.
async fn prepare(html: &str, url: &Url) -> Document {
    let config = ConfigEntry::default();
    let doc = crate::FullTextParser::parse_html(html, None, &config).unwrap();
    let ctx = crate::FullTextParser::get_xpath_ctx(&doc).unwrap();
    crate::FullTextParser::strip_junk(&ctx, None, &config, url);
    doc
}
/// End-to-end smoke test: load the first readability fixture, prepare the
/// document, and assert that body extraction completes without error.
#[tokio::test]
async fn test_1() {
    // Initialize logging once; ignore the error if another test already did.
    let _ = env_logger::builder().is_test(true).try_init();

    let source = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
        .expect("Failed to read HTML");
    let base_url = Url::parse("http://google.com").unwrap();

    let document = prepare(&source, &base_url).await;
    let mut article_root = Node::new("article", None, &document).unwrap();
    super::Readability::extract_body(document, &mut article_root).unwrap();
}

View file

@ -2,7 +2,7 @@ use super::FullTextParser;
use reqwest::Client;
use std::path::PathBuf;
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn golem() {
let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
@ -26,7 +26,7 @@ async fn golem() {
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
}
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn phoronix() {
let out_path = PathBuf::from(r"./test_output");
let url =
@ -45,7 +45,7 @@ async fn phoronix() {
);
}
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn youtube() {
let out_path = PathBuf::from(r"./test_output");
let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap();
@ -64,7 +64,7 @@ async fn youtube() {
.unwrap_or(false));
}
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn encoding_windows_1252() {
let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new())

View file

@ -1,5 +1,6 @@
pub use self::error::ImageDownloadError;
use crate::util::Util;
use base64::Engine;
use libxml::parser::Parser;
use libxml::tree::{Document, Node, SaveOptions};
use libxml::xpath::Context;
@ -143,8 +144,8 @@ impl ImageDownloader {
}
}
let small_image_base64 = base64::encode(&small_image);
let big_image_base64 = big_image.map(base64::encode);
let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
let big_image_base64 = big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
let small_image_string =
format!("data:{};base64,{}", content_type_small, small_image_base64);
let big_image_string = match big_image_base64 {
@ -290,13 +291,13 @@ mod tests {
use std::fs;
use std::io::Write;
#[tokio::test(flavor = "current_thread")]
#[tokio::test]
async fn close_tags() {
let image_dowloader = ImageDownloader::new((2048, 2048));
let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
.expect("Failed to read HTML");
let result = image_dowloader
.download_images_from_string(&hdyleaflet, &Client::new())
.download_images_from_string(&html, &Client::new())
.await
.expect("Failed to downalod images");
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")