mirror of
https://gitlab.com/news-flash/article_scraper.git
synced 2025-07-07 16:15:32 +02:00
first content extraction kinda working
This commit is contained in:
parent
2c76a869e7
commit
cce912c354
8 changed files with 363 additions and 58 deletions
|
@ -11,14 +11,17 @@ repository = "https://gitlab.com/news-flash/article_scraper"
|
|||
thiserror = "1.0"
|
||||
libxml = "0.3"
|
||||
reqwest = { version = "0.11", features = ["json", "native-tls", "gzip", "brotli"] }
|
||||
tokio = { version = "1.22", features = ["macros", "fs", "io-util"] }
|
||||
tokio = { version = "1.25", features = ["macros", "fs", "io-util"] }
|
||||
url = "2.3"
|
||||
regex = "1.7"
|
||||
encoding_rs = "0.8"
|
||||
chrono = "0.4"
|
||||
base64 = "0.20"
|
||||
base64 = "0.21"
|
||||
image = "0.24"
|
||||
log = "0.4"
|
||||
rust-embed="6.4"
|
||||
once_cell = "1.16"
|
||||
escaper = "0.1"
|
||||
escaper = "0.1"
|
||||
|
||||
[dev-dependencies]
|
||||
env_logger = "0.10"
|
234
resources/tests/readability-test-1.html
Normal file
234
resources/tests/readability-test-1.html
Normal file
|
@ -0,0 +1,234 @@
|
|||
<!DOCTYPE html>
|
||||
<html class="no-js" lang="en">
|
||||
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
|
||||
<title>Get your Frontend JavaScript Code Covered | Code | Nicolas Perriault</title>
|
||||
<meta
|
||||
name="description" content="Nicolas Perriault's homepage."/>
|
||||
<meta name="viewport" content="width=device-width"/>
|
||||
<link href="//fonts.googleapis.com/css?family=Asap:400,400italic,700,700italic&subset=latin,latin-ext"
|
||||
rel="stylesheet" type="text/css"/>
|
||||
<link rel="stylesheet" type="text/css" href="/static/packed.css?1412806084"/>
|
||||
<link rel="alternate" type="application/rss+xml" href="/code/feed/" title="Code (RSS)"/>
|
||||
<link rel="alternate" type="application/rss+xml" href="/photography/feed/"
|
||||
title="Photography (RSS)"/>
|
||||
<link rel="alternate" type="application/rss+xml" href="/talks/feed/" title="Talks (RSS)"/>
|
||||
<link rel="alternate" type="application/rss+xml" href="/carnet/feed/"
|
||||
title="Carnet (RSS)"/>
|
||||
<link rel="alternate" type="application/rss+xml" href="/feed/" title="Everything (RSS)"/>
|
||||
<!--[if lt IE 9]>
|
||||
<script src="//html5shiv.googlecode.com/svn/trunk/html5.js"></script>
|
||||
<![endif]-->
|
||||
</head>
|
||||
|
||||
<body class="code " onload="prettyPrint()">
|
||||
<!--[if lt IE 7]>
|
||||
<p class="chromeframe">Your browser is <em>ancient!</em> Please <a href="http://www.quirksmode.org/upgrade.html">upgrade</a>.</p>
|
||||
<![endif]-->
|
||||
<div class="container">
|
||||
<header class="main-title">
|
||||
<h1><a href="/">Hi, I'm <strong>Nicolas.</strong></a></h1>
|
||||
<small>I code stuff. I take photos. I write rants.</small>
|
||||
|
||||
</header>
|
||||
<main class="contents" role="main">
|
||||
<article lang="en" class="code" itemscope="" itemtype="http://schema.org/BlogPosting">
|
||||
<link itemprop="url" href="/code/2013/get-your-frontend-javascript-code-covered/"/>
|
||||
<header>
|
||||
<h2><a itemprop="name" href="/code/2013/get-your-frontend-javascript-code-covered/">Get your Frontend JavaScript Code Covered</a></h2>
|
||||
</header>
|
||||
<section>
|
||||
<p><strong>So finally you're <a href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">testing your frontend JavaScript code</a>? Great! The more you
|
||||
write tests, the more confident you are with your code… but how much precisely?
|
||||
That's where <a href="http://en.wikipedia.org/wiki/Code_coverage">code coverage</a> might
|
||||
help.</strong>
|
||||
</p>
|
||||
<p>The idea behind code coverage is to record which parts of your code (functions,
|
||||
statements, conditionals and so on) have been executed by your test suite,
|
||||
to compute metrics out of these data and usually to provide tools for navigating
|
||||
and inspecting them.</p>
|
||||
<p>Not a lot of frontend developers I know actually test their frontend code,
|
||||
and I can barely imagine how many of them have ever setup code coverage…
|
||||
Mostly because there are not many frontend-oriented tools in this area
|
||||
I guess.</p>
|
||||
<p>Actually I've only found one which provides an adapter for <a href="http://visionmedia.github.io/mocha/">Mocha</a> and
|
||||
actually works…</p>
|
||||
<blockquote class="twitter-tweet tw-align-center">
|
||||
<p>Drinking game for web devs:
|
||||
<br />(1) Think of a noun
|
||||
<br />(2) Google "<noun>.js"
|
||||
<br />(3) If a library with that name exists - drink</p>— Shay Friedman (@ironshay)
|
||||
<a
|
||||
href="https://twitter.com/ironshay/statuses/370525864523743232">August 22, 2013</a>
|
||||
</blockquote>
|
||||
<p><strong><a href="http://blanketjs.org/">Blanket.js</a></strong> is an <em>easy to install, easy to configure,
|
||||
and easy to use JavaScript code coverage library that works both in-browser and
|
||||
with nodejs.</em>
|
||||
</p>
|
||||
<p>Its use is dead easy, adding Blanket support to your Mocha test suite
|
||||
is just matter of adding this simple line to your HTML test file:</p>
|
||||
<pre><code><script src="vendor/blanket.js"
|
||||
data-cover-adapter="vendor/mocha-blanket.js"></script>
|
||||
</code></pre>
|
||||
|
||||
<p>Source files: <a href="https://raw.github.com/alex-seville/blanket/master/dist/qunit/blanket.min.js">blanket.js</a>,
|
||||
<a
|
||||
href="https://raw.github.com/alex-seville/blanket/master/src/adapters/mocha-blanket.js">mocha-blanket.js</a>
|
||||
</p>
|
||||
<p>As an example, let's reuse the silly <code>Cow</code> example we used
|
||||
<a
|
||||
href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">in a previous episode</a>:</p>
|
||||
<pre><code>// cow.js
|
||||
(function(exports) {
|
||||
"use strict";
|
||||
|
||||
function Cow(name) {
|
||||
this.name = name || "Anon cow";
|
||||
}
|
||||
exports.Cow = Cow;
|
||||
|
||||
Cow.prototype = {
|
||||
greets: function(target) {
|
||||
if (!target)
|
||||
throw new Error("missing target");
|
||||
return this.name + " greets " + target;
|
||||
}
|
||||
};
|
||||
})(this);
|
||||
</code></pre>
|
||||
|
||||
<p>And its test suite, powered by Mocha and <a href="http://chaijs.com/">Chai</a>:</p>
|
||||
<pre><code>var expect = chai.expect;
|
||||
|
||||
describe("Cow", function() {
|
||||
describe("constructor", function() {
|
||||
it("should have a default name", function() {
|
||||
var cow = new Cow();
|
||||
expect(cow.name).to.equal("Anon cow");
|
||||
});
|
||||
|
||||
it("should set cow's name if provided", function() {
|
||||
var cow = new Cow("Kate");
|
||||
expect(cow.name).to.equal("Kate");
|
||||
});
|
||||
});
|
||||
|
||||
describe("#greets", function() {
|
||||
it("should greet passed target", function() {
|
||||
var greetings = (new Cow("Kate")).greets("Baby");
|
||||
expect(greetings).to.equal("Kate greets Baby");
|
||||
});
|
||||
});
|
||||
});
|
||||
</code></pre>
|
||||
|
||||
<p>Let's create the HTML test file for it, featuring Blanket and its adapter
|
||||
for Mocha:</p>
|
||||
<pre><code><!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Test</title>
|
||||
<link rel="stylesheet" media="all" href="vendor/mocha.css">
|
||||
</head>
|
||||
<body>
|
||||
<div id="mocha"></div>
|
||||
<div id="messages"></div>
|
||||
<div id="fixtures"></div>
|
||||
<script src="vendor/mocha.js"></script>
|
||||
<script src="vendor/chai.js"></script>
|
||||
<script src="vendor/blanket.js"
|
||||
data-cover-adapter="vendor/mocha-blanket.js"></script>
|
||||
<script>mocha.setup('bdd');</script>
|
||||
<script src="cow.js" data-cover></script>
|
||||
<script src="cow_test.js"></script>
|
||||
<script>mocha.run();</script>
|
||||
</body>
|
||||
</html>
|
||||
</code></pre>
|
||||
|
||||
<p><strong>Notes</strong>:</p>
|
||||
<ul>
|
||||
<li>Notice the <code>data-cover</code> attribute we added to the script tag
|
||||
loading the source of our library;</li>
|
||||
<li>The HTML test file <em>must</em> be served over HTTP for the adapter to
|
||||
be loaded.</li>
|
||||
</ul>
|
||||
<p>Running the tests now gives us something like this:</p>
|
||||
<p>
|
||||
<img alt="screenshot" src="/static/code/2013/blanket-coverage.png"/>
|
||||
</p>
|
||||
<p>As you can see, the report at the bottom highlights that we haven't actually
|
||||
tested the case where an error is raised in case a target name is missing.
|
||||
We've been informed of that, nothing more, nothing less. We simply know
|
||||
we're missing a test here. Isn't this cool? I think so!</p>
|
||||
<p>Just remember that code coverage will only <a href="http://codebetter.com/karlseguin/2008/12/09/code-coverage-use-it-wisely/">bring you numbers</a> and
|
||||
raw information, not actual proofs that the whole of your <em>code logic</em> has
|
||||
been actually covered. If you ask me, the best inputs you can get about
|
||||
your code logic and implementation ever are the ones issued out of <a href="http://www.extremeprogramming.org/rules/pair.html">pair programming</a>
|
||||
sessions
|
||||
and <a href="http://alexgaynor.net/2013/sep/26/effective-code-review/">code reviews</a> —
|
||||
but that's another story.</p>
|
||||
<p><strong>So is code coverage silver bullet? No. Is it useful? Definitely. Happy testing!</strong>
|
||||
</p>
|
||||
</section>
|
||||
<aside>
|
||||
<p> <span class="article-author" itemprop="author" itemscope="" itemtype="http://schema.org/Person">
|
||||
<span itemprop="name">Nicolas Perriault</span> —</span>
|
||||
<time
|
||||
datetime="2013-09-29" itemprop="datePublished">2013-09-29</time>— in <a href="/code/" itemprop="genre">Code</a>
|
||||
— <a href="/code/2013/get-your-frontend-javascript-code-covered/">Permalink</a>
|
||||
—
|
||||
<a
|
||||
rel="license" href="http://creativecommons.org/licenses/by-sa/3.0/">License</a>— <a href="http://flattr.com/submit/auto?url=https://nicolas.perriault.net/code/2013/get-your-frontend-javascript-code-covered/&title=Get your Frontend JavaScript Code Covered&user_id=n1k0&category=software&language=en">flattr this</a>
|
||||
|
||||
</p>
|
||||
</aside>
|
||||
<hr/>
|
||||
<nav> <a class="prev" href="/code/2013/functional-javascript-for-crawling-the-web/">Functional JavaScript for crawling the Web</a>
|
||||
|
|
||||
<a
|
||||
class="next" href="/code/2013/testing-frontend-javascript-code-using-mocha-chai-and-sinon/">Testing your frontend JavaScript code using mocha, chai, and sinon</a>
|
||||
</nav>
|
||||
</article>
|
||||
</main>
|
||||
<nav class="sidebar">
|
||||
<ul>
|
||||
<li class="home"><a href="/" hreflang="en">Home</a>
|
||||
</li>
|
||||
<li class="code"><a href="/code/" hreflang="en">Code</a>
|
||||
</li>
|
||||
<li class="photography"><a href="/photography/" hreflang="en">Photography</a>
|
||||
</li>
|
||||
<li class="talks"><a href="/talks/" hreflang="en">Talks</a>
|
||||
</li>
|
||||
<li class="carnet"><a href="/carnet/" hreflang="fr">Carnet <span>fr</span></a>
|
||||
</li>
|
||||
<li class="contact"><a href="/contact/" hreflang="en">Contact</a>
|
||||
</li>
|
||||
</ul>
|
||||
</nav>
|
||||
<footer class="site-footer">
|
||||
<p>© 2012 Nicolas Perriault — <a href="https://twitter.com/n1k0">Tweet at me</a>
|
||||
—
|
||||
<a
|
||||
href="https://github.com/n1k0">Get my code</a>— <a href="http://500px.com/n1k0">Enjoy my pics</a>
|
||||
— <a href="/contact/">Contact me</a>
|
||||
|
||||
</p>
|
||||
</footer>
|
||||
</div>
|
||||
<!-- /container -->
|
||||
<script src="//ajax.googleapis.com/ajax/libs/jquery/1.7.1/jquery.min.js"></script>
|
||||
<script>
|
||||
window.jQuery || document.write('<script src="js/libs/jquery-1.7.1.min.js"><\/script>')
|
||||
</script>
|
||||
<script type="text/javascript" src="/static/js/libs/prettify/prettify.js"></script>
|
||||
<script type="text/javascript" src="/static/js/app.js"></script>
|
||||
<script src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
|
||||
</body>
|
||||
|
||||
</html>
|
||||
|
|
@ -34,6 +34,24 @@ pub struct ConfigEntry {
|
|||
pub next_page_link: Option<String>,
|
||||
}
|
||||
|
||||
impl Default for ConfigEntry {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
xpath_title: Vec::new(),
|
||||
xpath_author: Vec::new(),
|
||||
xpath_date: Vec::new(),
|
||||
xpath_body: Vec::new(),
|
||||
xpath_strip: Vec::new(),
|
||||
strip_id_or_class: Vec::new(),
|
||||
strip_image_src: Vec::new(),
|
||||
replace: Vec::new(),
|
||||
header: Vec::new(),
|
||||
single_page_link: None,
|
||||
next_page_link: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ConfigEntry {
|
||||
pub async fn parse_path(config_path: &Path) -> Result<ConfigEntry, ConfigError> {
|
||||
let mut file = fs::File::open(&config_path).await?;
|
||||
|
|
|
@ -181,8 +181,8 @@ impl FullTextParser {
|
|||
Self::strip_junk(&xpath_ctx, config, global_config, url);
|
||||
let found_body = Self::extract_body(&xpath_ctx, root, config, global_config)?;
|
||||
|
||||
if found_body {
|
||||
if let Err(error) = Readability::extract_body_readability(document, root) {
|
||||
if !found_body {
|
||||
if let Err(error) = Readability::extract_body(document, root) {
|
||||
log::error!("Both ftr and readability failed to find content: {}", error);
|
||||
return Err(error);
|
||||
}
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
mod constants;
|
||||
mod state;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use libxml::tree::{Document, Node, NodeType};
|
||||
use libxml::tree::{node, Document, Node, NodeType};
|
||||
|
||||
use self::state::State;
|
||||
use super::error::FullTextParserError;
|
||||
|
@ -11,13 +14,12 @@ use super::error::FullTextParserError;
|
|||
pub struct Readability;
|
||||
|
||||
impl Readability {
|
||||
pub fn extract_body_readability(
|
||||
document: Document,
|
||||
root: &mut Node,
|
||||
) -> Result<bool, FullTextParserError> {
|
||||
pub fn extract_body(document: Document, root: &mut Node) -> Result<bool, FullTextParserError> {
|
||||
node::set_node_rc_guard(6);
|
||||
|
||||
let mut state = State::default();
|
||||
let mut document = document;
|
||||
let mut attempts: Vec<(Node, usize)> = Vec::new();
|
||||
let mut attempts: Vec<(Node, usize, Document)> = Vec::new();
|
||||
let document_cache = document
|
||||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
@ -75,7 +77,7 @@ impl Readability {
|
|||
}
|
||||
|
||||
// Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
|
||||
if tag_name == "DIV"
|
||||
if (tag_name == "DIV"
|
||||
|| tag_name == "SECTION"
|
||||
|| tag_name == "HEADER"
|
||||
|| tag_name == "H1"
|
||||
|
@ -83,7 +85,8 @@ impl Readability {
|
|||
|| tag_name == "H3"
|
||||
|| tag_name == "H4"
|
||||
|| tag_name == "H5"
|
||||
|| tag_name == "H6" && Self::is_element_without_content(node_ref)
|
||||
|| tag_name == "H6")
|
||||
&& Self::is_element_without_content(node_ref)
|
||||
{
|
||||
node = Self::remove_and_next(node_ref);
|
||||
continue;
|
||||
|
@ -159,7 +162,7 @@ impl Readability {
|
|||
// Loop through all paragraphs, and assign a score to them based on how content-y they look.
|
||||
// Then add their score to their parent node.
|
||||
// A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
|
||||
for element_to_score in elements_to_score {
|
||||
for element_to_score in elements_to_score.drain(..) {
|
||||
if element_to_score.get_parent().is_none() {
|
||||
continue;
|
||||
}
|
||||
|
@ -195,7 +198,7 @@ impl Readability {
|
|||
}
|
||||
|
||||
if Self::get_content_score(&ancestor).is_none() {
|
||||
Self::initialize_node(&mut ancestor, &state);
|
||||
Self::initialize_node(&mut ancestor, &state)?;
|
||||
candidates.push(ancestor.clone());
|
||||
}
|
||||
|
||||
|
@ -213,7 +216,7 @@ impl Readability {
|
|||
|
||||
if let Some(mut score) = Self::get_content_score(&ancestor) {
|
||||
score += content_score / score_divider;
|
||||
Self::set_content_score(&mut ancestor, score);
|
||||
Self::set_content_score(&mut ancestor, score)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -226,7 +229,7 @@ impl Readability {
|
|||
// unaffected by this operation.
|
||||
if let Some(content_score) = Self::get_content_score(candidate) {
|
||||
let candidate_score = content_score * (1.0 - Self::get_link_density(candidate));
|
||||
Self::set_content_score(candidate, candidate_score);
|
||||
Self::set_content_score(candidate, candidate_score)?;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -244,11 +247,11 @@ impl Readability {
|
|||
let mut top_candidate = top_candidates.first().cloned().unwrap_or_else(|| {
|
||||
// If we still have no top candidate, just use the body as a last resort.
|
||||
// We also have to copy the body node so it is something we can modify.
|
||||
Self::initialize_node(root, &state);
|
||||
let mut rt = document.get_root_element().unwrap();
|
||||
Self::initialize_node(&mut rt, &state).unwrap();
|
||||
needed_to_create_top_candidate = true;
|
||||
root.clone()
|
||||
rt
|
||||
});
|
||||
#[allow(unused_assignments)]
|
||||
let mut parent_of_top_candidate = None;
|
||||
|
||||
let mut alternative_candidate_ancestors = Vec::new();
|
||||
|
@ -257,8 +260,9 @@ impl Readability {
|
|||
for top_candidate in &top_candidates {
|
||||
if let Some(score) = Self::get_content_score(top_candidate) {
|
||||
if score >= 0.75 {
|
||||
alternative_candidate_ancestors
|
||||
.push(Self::get_node_ancestors(top_candidate, 0));
|
||||
if let Some(ancestor) = top_candidate.get_parent() {
|
||||
alternative_candidate_ancestors.push(ancestor);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -273,15 +277,16 @@ impl Readability {
|
|||
alternative_candidate_ancestors.len(),
|
||||
constants::MINIMUM_TOPCANDIDATES,
|
||||
);
|
||||
for item in alternative_candidate_ancestors.iter().take(tmp) {
|
||||
let tmp = item.iter().any(|n| n == parent);
|
||||
lists_containing_this_ancestor += if tmp { 1 } else { 0 };
|
||||
for ancestor in alternative_candidate_ancestors.iter().take(tmp) {
|
||||
lists_containing_this_ancestor += if ancestor == parent { 1 } else { 0 };
|
||||
}
|
||||
|
||||
if lists_containing_this_ancestor >= constants::MINIMUM_TOPCANDIDATES {
|
||||
top_candidate = parent.clone();
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
parent_of_top_candidate = parent_of_top_candidate.and_then(|n| n.get_parent());
|
||||
|
@ -289,7 +294,7 @@ impl Readability {
|
|||
}
|
||||
|
||||
if Self::get_content_score(&top_candidate).is_none() {
|
||||
Self::initialize_node(&mut top_candidate, &state);
|
||||
Self::initialize_node(&mut top_candidate, &state)?;
|
||||
}
|
||||
|
||||
// Because of our bonus system, parents of candidates might have scores
|
||||
|
@ -353,7 +358,7 @@ impl Readability {
|
|||
}
|
||||
|
||||
if Self::get_content_score(&top_candidate).is_none() {
|
||||
Self::initialize_node(&mut top_candidate, &state);
|
||||
Self::initialize_node(&mut top_candidate, &state)?;
|
||||
}
|
||||
|
||||
// Now that we have the top candidate, look through its siblings for content
|
||||
|
@ -432,6 +437,7 @@ impl Readability {
|
|||
})?;
|
||||
}
|
||||
|
||||
sibling.unlink();
|
||||
article_content.add_child(&mut sibling).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
|
@ -471,6 +477,7 @@ impl Readability {
|
|||
})?;
|
||||
|
||||
for mut child in article_content.get_child_nodes() {
|
||||
child.unlink();
|
||||
div.add_child(&mut child).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
|
@ -489,33 +496,31 @@ impl Readability {
|
|||
// grabArticle with different flags set. This gives us a higher likelihood of
|
||||
// finding the content, and the sieve approach gives us a higher likelihood of
|
||||
// finding the -right- content.
|
||||
let text_length = Self::get_inner_text(&article_content, true).len();
|
||||
let text = Self::get_inner_text(&article_content, true);
|
||||
let text_length = text.len();
|
||||
|
||||
if text_length < constants::DEFAULT_CHAR_THRESHOLD {
|
||||
parse_successful = false;
|
||||
document = document_cache
|
||||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
|
||||
if state.strip_unlikely {
|
||||
state.strip_unlikely = false;
|
||||
attempts.push((article_content, text_length));
|
||||
attempts.push((article_content, text_length, document));
|
||||
} else if state.weigh_classes {
|
||||
state.weigh_classes = false;
|
||||
attempts.push((article_content, text_length));
|
||||
attempts.push((article_content, text_length, document));
|
||||
} else if state.clean_conditionally {
|
||||
state.clean_conditionally = false;
|
||||
attempts.push((article_content, text_length));
|
||||
attempts.push((article_content, text_length, document));
|
||||
} else {
|
||||
attempts.push((article_content, text_length));
|
||||
attempts.push((article_content, text_length, document));
|
||||
// No luck after removing flags, just return the longest text we found during the different loops
|
||||
|
||||
attempts.sort_by(|(_, size_a), (_, size_b)| size_a.cmp(size_b));
|
||||
attempts.sort_by(|(_, size_a, _), (_, size_b, _)| size_a.cmp(size_b));
|
||||
|
||||
// But first check if we actually have something
|
||||
if let Some((best_attempt, _len)) = attempts.first() {
|
||||
article_content = best_attempt.clone();
|
||||
root.add_child(&mut article_content).map_err(|error| {
|
||||
if let Some((mut best_attempt, _len, _document)) = attempts.pop() {
|
||||
best_attempt.unlink();
|
||||
root.add_child(&mut best_attempt).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
FullTextParserError::Readability
|
||||
})?;
|
||||
|
@ -524,6 +529,10 @@ impl Readability {
|
|||
|
||||
return Ok(parse_successful);
|
||||
}
|
||||
|
||||
document = document_cache
|
||||
.dup()
|
||||
.map_err(|()| FullTextParserError::Readability)?;
|
||||
} else {
|
||||
root.add_child(&mut article_content).map_err(|error| {
|
||||
log::error!("{error}");
|
||||
|
@ -539,9 +548,12 @@ impl Readability {
|
|||
.and_then(|a| a.parse::<f64>().ok())
|
||||
}
|
||||
|
||||
fn set_content_score(node: &mut Node, score: f64) {
|
||||
fn set_content_score(node: &mut Node, score: f64) -> Result<(), FullTextParserError> {
|
||||
node.set_attribute(constants::SCORE_ATTR, &score.to_string())
|
||||
.expect("Failed to set content score");
|
||||
.map_err(|err| {
|
||||
log::error!("failed to set content score: {err}");
|
||||
FullTextParserError::Readability
|
||||
})
|
||||
}
|
||||
|
||||
fn is_probably_visible(node: &Node) -> bool {
|
||||
|
@ -580,6 +592,8 @@ impl Readability {
|
|||
}
|
||||
|
||||
fn next_node(node: &Node, ignore_self_and_kids: bool) -> Option<Node> {
|
||||
let mut node = node.clone();
|
||||
|
||||
// First check for kids if those aren't being ignored
|
||||
let first_child = node.get_first_child();
|
||||
if !ignore_self_and_kids && first_child.is_some() {
|
||||
|
@ -602,9 +616,16 @@ impl Readability {
|
|||
}
|
||||
|
||||
if let Some(parent) = parent {
|
||||
let parent_name = parent.get_name().to_uppercase();
|
||||
if parent_name == "HTML" {
|
||||
break;
|
||||
}
|
||||
|
||||
let next_sibling = parent.get_next_sibling();
|
||||
if next_sibling.is_some() {
|
||||
return next_sibling;
|
||||
} else {
|
||||
node = parent;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -649,11 +670,11 @@ impl Readability {
|
|||
// the same as the article title.
|
||||
fn header_duplicates_title(node: &Node) -> bool {
|
||||
let name = node.get_name().to_lowercase();
|
||||
if name != "h1" || name != "h2" {
|
||||
if name != "h1" && name != "h2" {
|
||||
return false;
|
||||
}
|
||||
let heading = Self::get_inner_text(node, false);
|
||||
Self::text_similarity(&heading, "FIXME") > 0.75
|
||||
Self::text_similarity(&heading, "Get your Frontend JavaScript Code Covered") > 0.75
|
||||
}
|
||||
|
||||
fn get_inner_text(node: &Node, normalize_spaces: bool) -> String {
|
||||
|
@ -759,10 +780,10 @@ impl Readability {
|
|||
|
||||
fn get_elems(node: &Node, tag: &str, vec: &mut Vec<Node>, all_tags: bool) {
|
||||
for child in node.get_child_elements() {
|
||||
if all_tags || child.get_name() == tag {
|
||||
vec.push(child);
|
||||
if all_tags || child.get_name().to_uppercase() == tag {
|
||||
vec.push(child.clone());
|
||||
}
|
||||
get_elems(node, tag, vec, all_tags);
|
||||
get_elems(&child, tag, vec, all_tags);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -823,7 +844,7 @@ impl Readability {
|
|||
let mut ancestors = Vec::new();
|
||||
let mut node = node.clone();
|
||||
|
||||
for _ in 0..max_depth {
|
||||
for _ in 0..=max_depth {
|
||||
let parent = node.get_parent();
|
||||
match parent {
|
||||
Some(parent) => {
|
||||
|
@ -839,7 +860,7 @@ impl Readability {
|
|||
|
||||
// Initialize a node with the readability object. Also checks the
|
||||
// className/id for special names to add to its score.
|
||||
fn initialize_node(node: &mut Node, state: &State) {
|
||||
fn initialize_node(node: &mut Node, state: &State) -> Result<(), FullTextParserError> {
|
||||
let score = match node.get_name().to_uppercase().as_str() {
|
||||
"DIV" => 5,
|
||||
"PRE" | "TD" | "BLOCKQUITE" => 3,
|
||||
|
@ -848,7 +869,8 @@ impl Readability {
|
|||
_ => 0,
|
||||
};
|
||||
let score = score + Self::get_class_weight(node, state);
|
||||
Self::set_content_score(node, score as f64);
|
||||
Self::set_content_score(node, score as f64)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn get_class_weight(node: &Node, state: &State) -> i64 {
|
||||
|
|
27
src/full_text_parser/readability/tests.rs
Normal file
27
src/full_text_parser/readability/tests.rs
Normal file
|
@ -0,0 +1,27 @@
|
|||
use libxml::tree::{Document, Node};
|
||||
use reqwest::Url;
|
||||
|
||||
use crate::full_text_parser::config::ConfigEntry;
|
||||
|
||||
async fn prepare(html: &str, url: &Url) -> Document {
|
||||
let empty_config = ConfigEntry::default();
|
||||
let document = crate::FullTextParser::parse_html(html, None, &empty_config).unwrap();
|
||||
let xpath_ctx = crate::FullTextParser::get_xpath_ctx(&document).unwrap();
|
||||
crate::FullTextParser::strip_junk(&xpath_ctx, None, &empty_config, url);
|
||||
document
|
||||
}
|
||||
|
||||
|
||||
#[tokio::test]
|
||||
async fn test_1() {
|
||||
let _ = env_logger::builder().is_test(true).try_init();
|
||||
|
||||
let html = std::fs::read_to_string(r"./resources/tests/readability-test-1.html")
|
||||
.expect("Failed to read HTML");
|
||||
let url = Url::parse("http://google.com").unwrap();
|
||||
let document = prepare(&html, &url).await;
|
||||
|
||||
let mut root = Node::new("article", None, &document).unwrap();
|
||||
|
||||
super::Readability::extract_body(document, &mut root).unwrap();
|
||||
}
|
|
@ -2,7 +2,7 @@ use super::FullTextParser;
|
|||
use reqwest::Client;
|
||||
use std::path::PathBuf;
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn golem() {
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.golem.de/news/http-error-418-fehlercode-ich-bin-eine-teekanne-darf-bleiben-1708-129460.html").unwrap();
|
||||
|
@ -26,7 +26,7 @@ async fn golem() {
|
|||
assert_eq!(article.author, Some(String::from("Hauke Gierow")));
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn phoronix() {
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url =
|
||||
|
@ -45,7 +45,7 @@ async fn phoronix() {
|
|||
);
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn youtube() {
|
||||
let out_path = PathBuf::from(r"./test_output");
|
||||
let url = url::Url::parse("https://www.youtube.com/watch?v=8KjaIumu-jI").unwrap();
|
||||
|
@ -64,7 +64,7 @@ async fn youtube() {
|
|||
.unwrap_or(false));
|
||||
}
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn encoding_windows_1252() {
|
||||
let url = url::Url::parse("https://www.aerzteblatt.de/nachrichten/139511/Scholz-zuversichtlich-mit-Blick-auf-Coronasituation-im-Winter").unwrap();
|
||||
let html = FullTextParser::download(&url, &Client::new(), reqwest::header::HeaderMap::new())
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
pub use self::error::ImageDownloadError;
|
||||
use crate::util::Util;
|
||||
use base64::Engine;
|
||||
use libxml::parser::Parser;
|
||||
use libxml::tree::{Document, Node, SaveOptions};
|
||||
use libxml::xpath::Context;
|
||||
|
@ -143,8 +144,8 @@ impl ImageDownloader {
|
|||
}
|
||||
}
|
||||
|
||||
let small_image_base64 = base64::encode(&small_image);
|
||||
let big_image_base64 = big_image.map(base64::encode);
|
||||
let small_image_base64 = base64::engine::general_purpose::STANDARD.encode(&small_image);
|
||||
let big_image_base64 = big_image.map(|img| base64::engine::general_purpose::STANDARD.encode(img));
|
||||
let small_image_string =
|
||||
format!("data:{};base64,{}", content_type_small, small_image_base64);
|
||||
let big_image_string = match big_image_base64 {
|
||||
|
@ -290,13 +291,13 @@ mod tests {
|
|||
use std::fs;
|
||||
use std::io::Write;
|
||||
|
||||
#[tokio::test(flavor = "current_thread")]
|
||||
#[tokio::test]
|
||||
async fn close_tags() {
|
||||
let image_dowloader = ImageDownloader::new((2048, 2048));
|
||||
let hdyleaflet = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
|
||||
let html = fs::read_to_string(r"./resources/tests/planetGnome/fedora31.html")
|
||||
.expect("Failed to read HTML");
|
||||
let result = image_dowloader
|
||||
.download_images_from_string(&hdyleaflet, &Client::new())
|
||||
.download_images_from_string(&html, &Client::new())
|
||||
.await
|
||||
.expect("Failed to downalod images");
|
||||
let mut file = fs::File::create(r"./test_output/fedora31_images_downloaded.html")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue