From ca1cc47af1f7749ea7d10983e8e269afb0c57daf Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:02:40 +0100 Subject: [PATCH 01/12] update CI image --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 5a57616..7880e5d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,7 +4,7 @@ stages: run-build: stage: build - image: rust:1.79 + image: rust:1.83 before_script: - rustup component add rustfmt - rustup component add clippy From 8cfcd6d9f3636a84336da4d882a9f6db5ce565b4 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 17 Jan 2025 03:05:55 +0100 Subject: [PATCH 02/12] clippy --- article_scraper/src/full_text_parser/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 98e9478..18fc682 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -275,7 +275,7 @@ impl FullTextParser { /// See: /// - /// - - /// These two functions should be removed when the issue is fixed in libxml crate. + /// These two functions should be removed when the issue is fixed in libxml crate. fn try_usize_to_i32(value: usize) -> Result<i32, FullTextParserError> { if cfg!(target_pointer_width = "16") || (value < i32::MAX as usize) { // Cannot safely use our value comparison, but the conversion is always safe. From 9f56ed03b8e384378d92e01c5bc38bf80525760c Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 10 Mar 2025 13:42:31 +0100 Subject: [PATCH 03/12] article_scraper: don't specify reqwest features --- article_scraper/Cargo.toml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index 100766c..e852be9 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,17 +14,17 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } +reqwest = "0.12" tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" -regex = "1.10" +regex = "1.11" encoding_rs = "0.8" chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.4" -once_cell = "1.19" +rust-embed="8.6" +once_cell = "1.20" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" From 0978335d3b73e8049c602713b76ca5e3f038d9ca Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Fri, 28 Mar 2025 17:18:03 +0100 Subject: [PATCH 04/12] [f] ignore url harvest error --- article_scraper/src/images/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index de0f48f..4be98b8 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -219,7 +219,9 @@ impl ImageDownloader { let mut image_urls = Vec::new(); for node in node_vec { - image_urls.push(Self::harvest_image_urls_from_node(node)?); + if let Ok(url) = Self::harvest_image_urls_from_node(node) { + image_urls.push(url); + } } Ok(image_urls) From b92500fca276535b40d1956e71a1cca226d92437 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:45:41 +0200 Subject: [PATCH 05/12] better error messages --- article_scraper/src/error.rs | 6 +- article_scraper/src/full_text_parser/mod.rs | 96 ++++++++++----------- 2 files changed, 50 insertions(+), 52 deletions(-) diff --git
a/article_scraper/src/error.rs b/article_scraper/src/error.rs index 4f915fd..41ac9de 100644 --- a/article_scraper/src/error.rs +++ b/article_scraper/src/error.rs @@ -6,10 +6,10 @@ use thiserror::Error; #[derive(Error, Debug)] pub enum ScraperError { - #[error("")] + #[error("ConfigError {0}")] Config(#[from] ConfigError), - #[error("")] + #[error("ImageDownloadError {0}")] Image(#[from] ImageDownloadError), - #[error("")] + #[error("FullTextParserError {0}")] Scrap(#[from] FullTextParserError), } diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 18fc682..4bb8a30 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -354,63 +354,61 @@ impl FullTextParser { .send() .await .map_err(|err| { - log::error!( - "Downloading HTML failed: GET '{}' - '{}'", - url.as_str(), - err - ); + log::error!("Downloading HTML failed: GET '{url}' - '{err}'"); FullTextParserError::Http })?; Ok(response) } async fn get_body(response: Response) -> Result<String, FullTextParserError> { - if response.status().is_success() { - let headers = response.headers().clone(); - let bytes = response - .bytes() - .await - .map_err(|_| FullTextParserError::Http)?; - - match from_utf8(&bytes) { - Ok(utf8_str) => { - log::debug!("Valid utf-8 string"); - return Ok(utf8_str.into()); - } - Err(error) => { - log::debug!("Invalid utf-8 string"); - let lossy_string = std::string::String::from_utf8_lossy(&bytes); - - if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { - log::debug!("Encoding extracted from HTML: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { - log::debug!("Encoding extracted from headers: '{}'", encoding); - if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { - let decoded_html = decoded_html.replacen( - &format!("charset=\"{encoding}\""), - "charset=\"utf-8\"", - 1, - ); - return Ok(decoded_html); - } - } - - return Err(FullTextParserError::Utf8(error)); - } - } + let status = response.status(); + if !status.is_success() { + log::error!("status code: {status}"); + return Err(FullTextParserError::Http); } - Err(FullTextParserError::Http) + let headers = response.headers().clone(); + let bytes = response + .bytes() + .await + .map_err(|_| FullTextParserError::Http)?; + + match from_utf8(&bytes) { + Ok(utf8_str) => { + log::debug!("Valid utf-8 string"); + Ok(utf8_str.into()) + } + Err(error) => { + log::debug!("Invalid utf-8 string"); + let lossy_string = std::string::String::from_utf8_lossy(&bytes); + + if let Some(encoding) = Self::get_encoding_from_html(&lossy_string) { + log::debug!("Encoding extracted from HTML: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + if let Some(encoding) = Self::get_encoding_from_http_header(&headers) { + log::debug!("Encoding extracted from headers: '{encoding}'"); + if let Some(decoded_html) = Self::decode_html(&bytes, encoding) { + let decoded_html = decoded_html.replacen( + &format!("charset=\"{encoding}\""), + "charset=\"utf-8\"", + 1, + ); + return Ok(decoded_html); + } + } + + Err(FullTextParserError::Utf8(error)) + } + }
} pub async fn download( From 9b374a28c717e57db7341fac4967b9e0114ad455 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sat, 5 Apr 2025 15:47:08 +0200 Subject: [PATCH 06/12] update ftr-site-config --- article_scraper/ftr-site-config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/ftr-site-config b/article_scraper/ftr-site-config index ccde390..69aa220 160000 --- a/article_scraper/ftr-site-config +++ b/article_scraper/ftr-site-config @@ -1 +1 @@ -Subproject commit ccde390b11893cbafdc84f74b449ddc3cc05c024 +Subproject commit 69aa220193d99427d3822fabccdfaeede56cd532 From f361392c04376736ce9ce2d338c7363959135878 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:34:33 +0200 Subject: [PATCH 07/12] check for empty http response and parsed documents without root element --- article_scraper/src/full_text_parser/mod.rs | 35 +++++++++++++++++++-- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index 4bb8a30..ac77bf6 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -69,6 +69,11 @@ impl FullTextParser { let html = Self::get_body(response).await?; + if html.is_empty() { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + // check for fingerprints let config = if config.is_none() { if let Some(url) = Fingerprints::detect(&html) { @@ -264,10 +269,17 @@ impl FullTextParser { } // parse html - Self::parse_html_string_patched(html.as_str()).map_err(|err| { + let document = Self::parse_html_string_patched(html.as_str()).map_err(|err| { log::error!("Parsing HTML failed for downloaded HTML {:?}", err); FullTextParserError::Xml - }) + })?; + + if document.get_root_element().is_none() { + log::error!("document without root"); + Err(FullTextParserError::Xml) + } else { + Ok(document) + } } /// FIXME: Here are some patched functions of libxml crate. 
@@ -368,6 +380,18 @@ impl FullTextParser { } let headers = response.headers().clone(); + + if headers + .get(reqwest::header::CONTENT_LENGTH) + .and_then(|hv| hv.to_str().ok()) + .and_then(|str| str.parse::<usize>().ok()) + .map(|content_length| content_length == 0) + .unwrap_or(false) + { + log::error!("Empty response body"); + return Err(FullTextParserError::Http); + } + let bytes = response .bytes() .await @@ -420,7 +444,12 @@ let headers = Util::generate_headers(config, global_config)?; let response = Self::get_response(url, client, headers).await?; let body = Self::get_body(response).await?; - Ok(body) + if body.is_empty() { + log::error!("Empty response body"); + Err(FullTextParserError::Http) + } else { + Ok(body) + } } fn get_encoding_from_http_header(headers: &reqwest::header::HeaderMap) -> Option<&str> { From 06990acbc0d4cd55a44aeb20e95c1e6216074a16 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:38:46 +0200 Subject: [PATCH 08/12] fix libxml CI build --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 7880e5d..159f07d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -4,12 +4,14 @@ stages: run-build: stage: build - image: rust:1.83 + image: rust:1.86 before_script: - rustup component add rustfmt - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - rustc --version && cargo --version + - echo $LIBXML2 - cargo fmt -- --check - cargo clippy --all-targets --all-features -- -D warnings - cargo build --release From 498008f6307c3faabfd6ac40e820871752b75039 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 17:51:30 +0200 Subject: [PATCH 09/12] bump version --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 99695c8..8569ad0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ members = ["article_scraper", "article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.1" +version = "2.1.2" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" From 9f349f8c6f2a88b277a8d1552d3d84781bdc9363 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Sun, 4 May 2025 18:00:59 +0200 Subject: [PATCH 10/12] need reqwest streams --- article_scraper/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index e852be9..eeed67c 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -14,7 +14,7 @@ exclude = ["resources/tests"] [dependencies] thiserror = "2.0" libxml = "0.3" -reqwest = "0.12" +reqwest = { version = "0.12", features = ["stream"] } tokio = { version = "1", features = ["macros", "fs", "io-util"] } url = "2.5" regex = "1.11" From a23a691c3177c2ecab38c9754c8c17b5387456fc Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 7 Jul 2025 18:03:45 +0200 Subject: [PATCH 11/12] clean html fragment: don't remove same page links & footnotes --- .gitlab-ci.yml | 17 +++++----- article_scraper/Cargo.toml | 4 +-- article_scraper/src/clean.rs | 23 +++++++++++++- .../src/full_text_parser/metadata.rs | 2 +- article_scraper/src/full_text_parser/mod.rs | 31 +++++++++---------- article_scraper/src/images/mod.rs | 2 +- article_scraper/src/util.rs | 8 ++--- article_scraper_cli/Cargo.toml | 21 ++++++++++--- 8 files changed, 67 insertions(+), 41 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index
159f07d..9ed9667 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,17 +1,16 @@ stages: - build - run-build: stage: build image: rust:1.86 before_script: - - rustup component add rustfmt - - rustup component add clippy - - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so + - rustup component add rustfmt + - rustup component add clippy + - export LIBXML2=$(pkg-config libxml-2.0 --variable=libdir)/libxml2.so script: - - rustc --version && cargo --version - - echo $LIBXML2 - - cargo fmt -- --check - - cargo clippy --all-targets --all-features -- -D warnings - - cargo build --release + - rustc --version && cargo --version + - echo $LIBXML2 + - cargo fmt -- --check + - cargo clippy --all-targets --all-features -- -D warnings + - cargo build --release diff --git a/article_scraper/Cargo.toml b/article_scraper/Cargo.toml index eeed67c..c76e90f 100644 --- a/article_scraper/Cargo.toml +++ b/article_scraper/Cargo.toml @@ -23,8 +23,8 @@ chrono = "0.4" base64 = "0.22" image = "0.25" log = "0.4" -rust-embed="8.6" -once_cell = "1.20" +rust-embed = "8.7" +once_cell = "1.21" escaper = "0.1" futures = "0.3" unic-emoji-char = "0.9" diff --git a/article_scraper/src/clean.rs b/article_scraper/src/clean.rs index e44f421..4d5ed6b 100644 --- a/article_scraper/src/clean.rs +++ b/article_scraper/src/clean.rs @@ -45,7 +45,16 @@ pub fn clean_html_fragment( let thumbnail = FullTextParser::check_for_thumbnail(&xpath_ctx); FullTextParser::prep_content(&xpath_ctx, None, &empty_config, base_url, &document, None); if let Some(mut root) = document.get_root_element() { - FullTextParser::post_process_page(&mut root)?; + Util::clean_headers(&mut root); + Util::replace_schema_org_orbjects(&mut root); + Util::clean_conditionally(&mut root, "fieldset"); + Util::clean_conditionally(&mut root, "table"); + + FullTextParser::remove_share_elements(&mut root); + FullTextParser::clean_attributes(&mut root)?; + FullTextParser::remove_single_cell_tables(&mut root); + FullTextParser::remove_extra_p_and_div(&mut root); + FullTextParser::remove_empty_nodes(&mut root); } FullTextParser::prevent_self_closing_tags(&xpath_ctx)?; FullTextParser::post_process_document(&document)?; @@ -180,4 +189,16 @@ mod tests { assert_eq!(res.thumbnail, None); assert!(res.html.contains("iframe")); } + + #[test] + fn cartographerstale() { + let html = r#" +

Subscribe now

To no one's surprise, I love maps. And like any map enthusiast, I've spent a lot of time reading about map projections. Understanding them is key to being able to interpret a map properly.

The earth, with its quasi-spherical shape, faces a mathematical problem that has no solution. The sphere, as a geometric figure, cannot be developed on a plane without incurring some kind of distortion. It is not that we have not found the right way to do it; it is that it is not possible. We will never be able to.

That does not detract from the fact that there can be good approximations, although all of them have some compromise. An easy way to understand it is that there are three fundamental characteristics that cannot be fulfilled simultaneously, so any projection has to reach a compromise between the three:

  • Distance: A world map that preserves true distances along certain privileged lines (for example, along every meridian, or from the centre to any other point) once translated to the plane is considered an equidistant projection.

  • Areas: When the representation on the plane can maintain the areas of any polygon, for example each country, then we speak of an equivalent projection.

  • Angles: This characteristic refers to the angles and, therefore, the shapes of each polygon that we find on the map, for example the outline of a country. If the projection manages to maintain this peculiarity, then we call it a conformal projection.

Geometry tells us that, since the sphere is not developable onto a plane, it is impossible to find a cartographic projection that is simultaneously equidistant, equivalent, and conformal. So we can state categorically that there is no such thing as a correct projection. The most we can aspire to is to have multiple projections available and to use the most relevant ones for each occasion.
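
For the mathematically curious, here is the one-line reason (my summary via Gauss's Theorema Egregium, with R standing for the earth's radius; none of this notation is from the cartographers below): a distortion-free flattening would be a local isometry, and local isometries preserve Gaussian curvature, but

$$K_{\text{sphere}} = \frac{1}{R^{2}} \neq 0 = K_{\text{plane}}$$

so no such flattening can exist, and every projection has to give something up.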

Here are some examples of popular projections, and some information to understand them a little better.

Mercator, a projection with a purpose

This map is a copy of the world map published by Gerardus Mercator in 1569 and nowadays kept in Basel. Mercator was a cartographer who went down in history for introducing what is possibly the most used and popular projection for centuries, as well as the most hated and criticised in recent decades.

It is therefore important to understand this projection in its historical context. In the 16th century, Europe was in the midst of the Age of Discovery. Navigation was a key element for the different countries to continue exploring the world, so it had to be efficient and reliable. And this is precisely where this projection really adds great value.

The Mercator projection is conformal, so angles and shapes are maintained throughout the plane. An indirect quality of this is that every straight line drawn on the map is a line of constant compass bearing that a ship can actually hold, guaranteeing that it will get from one point on the map to the other.

In other words, when James Cook was on his way to the Pacific Ocean with one of these maps in hand, he was sure that by keeping his course along a straight line on the map he could easily get from Madagascar to India. It is important to stress that this does not guarantee that this line will be the shortest one, but it will be the one that requires the least additional navigational calculations1.
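
For reference, these are the standard Mercator equations in modern notation (my addition, not the article's: λ is longitude, φ latitude and R the radius of the globe; Mercator himself constructed the spacing graphically):

$$x = R\lambda, \qquad y = R \ln \tan\!\left(\frac{\pi}{4} + \frac{\varphi}{2}\right)$$

A course of constant bearing β then satisfies dy/dx = cot β on the map, which is a straight line, and that is exactly why those navigational lines come out straight.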

Peters, activism with many grey areas

In the 20th century, the voices against the Mercator projection became louder. The fundamental criticism is that the map designed by Gerardus Mercator had an imperialist intention and that it intentionally under-represented the countries of the Southern Hemisphere, over-representing Europe and North America.

I always like to emphasize that this is a consequence, not the actual purpose of the map. The Mercator projection is conformal, with the great utility we have just seen, but it is not equivalent, so there are large discrepancies in area. Contrary to what is often stated, the deformation mainly affects the areas near the poles, which appear greatly enlarged, and the areas near the equator, which appear at their smallest. And of course, South America and Africa are not wholly in the Southern Hemisphere, as some claim, but are crossed by the equator.

Arno Peters, a German filmmaker with a PhD in political propaganda, saw a great window of opportunity. Possibly drawing on his academic background, he held a series of lectures in which he presented the Peters projection, which he claimed was the most accurate representation of the world. And well, yes, this projection is equivalent, since it respects the areas of the different countries, but as a trade-off it completely destroys shapes and distances.

The first murky point of this story is that the projection he presented in Bonn in 1973 was not his own creation: he was reusing a projection the cartographer James Gall had described in a scholarly paper in 1885. The second murky point is that his work was by no means altruistic, for after convincing several countries and international institutions of the importance of adopting this new map, he began to charge for each reproduction sold. Today, well into 2023, these reproductions still sell at a price from which the heirs of Arno Peters continue to profit.
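
For completeness, the Gall-Peters equations in the same notation (again my addition, in the standard form with standard parallels at 45°):

$$x = \frac{R\lambda}{\sqrt{2}}, \qquad y = R\sqrt{2}\,\sin\varphi$$

A quick check of the area scale, ∂x/∂λ · ∂y/∂φ = R²cos φ, recovers exactly the sphere's own area element, which is why the projection is equivalent, while shapes away from the two 45° parallels are visibly stretched or squashed.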

Winkel-Tripel, a good compromise

Like almost everything in life, this is not a question of extremes. The two projections we have just seen focused on preserving only one of the three fundamental characteristics of maps, but it is possible to find a compromise that allows us to create generic cartographic representations without a specific goal.

This is one of the three projections proposed by the German cartographer Oswald Winkel in 1921. The name tripel comes from the German for triple and refers to the fundamental objective of this projection: precisely to minimise the three fundamental distortions of area, angles and distance. Several studies by mathematicians and cartographers in the following decades ranked this map among the projections that achieve the best overall compromise.
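
For those who want the formulas (the standard modern statement, not Winkel's original notation): the Winkel tripel is simply the arithmetic mean of the equirectangular projection with standard parallel φ₁ = arccos(2/π) and the Aitoff projection:

$$x = \frac{1}{2}\left(\lambda\cos\varphi_1 + \frac{2\cos\varphi\sin(\lambda/2)}{\operatorname{sinc}\alpha}\right), \qquad y = \frac{1}{2}\left(\varphi + \frac{\sin\varphi}{\operatorname{sinc}\alpha}\right)$$

with α = arccos(cos φ · cos(λ/2)) and sinc α = sin α / α. Averaging two components with complementary strengths is precisely what buys the three-way compromise.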

In 1998, thanks to this growing popularity, the Winkel-Tripel projection replaced the Robinson projection as the projection used for maps created by the National Geographic Society. In the past 25 years, more and more administrations, universities and educational institutions have followed the example set by National Geographic and included this projection in their textbooks.

Dymaxion, a projection with a different objective

The problem with the Winkel-Tripel projection is that it seeks compromise above all, so it has no objective beyond being the most faithful general representation possible. Such a map is necessary, but it leaves a wide range of uses for which maps with a more specific goal, and therefore other projections, are better suited.

Perhaps one of the most singular projections is the one created by Buckminster Fuller in 1946, which resulted in the Dymaxion map you can see above. To transfer the information from the sphere to the plane, this projection approximates the earth as an icosahedron and projects independently onto each of its 20 faces. Once this initial step has been completed, all that remains is to unfold the polyhedron onto a plane in whatever arrangement is most useful for the map's purpose.
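
Fuller's exact transformation is rather involved, so take this only as a sketch of the idea (a common gnomonic approximation, not Fuller's published method): a point of the unit sphere, viewed as a vector p, lands on the face whose unit normal n maximises p · n, and is projected onto that face's plane, which sits at distance d from the centre, by

$$p \mapsto \frac{d}{p \cdot n}\, p$$

after which the 20 flat triangles can be unfolded in whatever arrangement one likes.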

In this example, the icosahedron is unfolded with the aim of keeping all continents contiguous. Thanks to this peculiarity of the projection, we can easily see that all the continents of the Earth lie quite close to each other. This is probably the best projection for understanding the migrations from Africa to the rest of the world. The arrows show the direction the migrations followed, and the colours show the thousands of years that have passed since humans first arrived at each of the far ends of the planet.


I cannot conclude an article about projections without mentioning one of xkcd's best works: this detailed, funny and quite accurate explanation of map projections.

Share

1

I’ve already written about geodesics in this other article: Unexpected curves: Drawing straight lines on a map.

+ "#; + + let url = Url::parse("https://www.cartographerstale.com").unwrap(); + let res = clean_html_fragment(html, &url).unwrap(); + + std::fs::write("/var/home/jangernert/test-output.html", &res.html).unwrap(); + } } diff --git a/article_scraper/src/full_text_parser/metadata.rs b/article_scraper/src/full_text_parser/metadata.rs index c05a385..845e215 100644 --- a/article_scraper/src/full_text_parser/metadata.rs +++ b/article_scraper/src/full_text_parser/metadata.rs @@ -158,7 +158,7 @@ fn extract_date( fn get_meta(context: &Context, name: &str) -> Option<String> { Util::get_attribute( context, - &format!("//meta[contains(@name, '{}')]", name), + &format!("//meta[contains(@name, '{name}')]"), "content", ) .ok() diff --git a/article_scraper/src/full_text_parser/mod.rs b/article_scraper/src/full_text_parser/mod.rs index ac77bf6..3fefaa4 100644 --- a/article_scraper/src/full_text_parser/mod.rs +++ b/article_scraper/src/full_text_parser/mod.rs @@ -722,7 +722,7 @@ impl FullTextParser { } fn fix_iframe_size(context: &Context, site_name: &str) -> Result<(), FullTextParserError> { - let xpath = &format!("//iframe[contains(@src, '{}')]", site_name); + let xpath = &format!("//iframe[contains(@src, '{site_name}')]"); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if node.is_null() { @@ -761,7 +761,7 @@ ) -> Result<(), FullTextParserError> { let xpath_tag = tag.unwrap_or("*"); - let xpath = &format!("//{}[@{}]", xpath_tag, attribute); + let xpath = &format!("//{xpath_tag}[@{attribute}]"); let node_vec = Util::evaluate_xpath(context, xpath, false)?; for mut node in node_vec { if let Err(err) = node.remove_property(attribute) { @@ -791,17 +791,16 @@ if let Some(url) = node.get_attribute(attribute) { let trimmed_url = url.trim(); - let is_hash_url = url.starts_with('#'); + if url.starts_with('#') || url.starts_with("\\#") { + continue; + } + let is_relative_url = url::Url::parse(&url) .err() .map(|err| err == url::ParseError::RelativeUrlWithoutBase) .unwrap_or(false); let is_javascript = trimmed_url.contains("javascript:"); - if !is_hash_url && node.get_name().to_uppercase() == "A" { - _ = node.set_attribute("target", "_blank"); - } - if let Some(srcset) = node.get_attribute("srcset") { let res = constants::SRC_SET_URL .captures_iter(&srcset) @@ -832,9 +831,7 @@ _ = node.set_attribute("srcset", res.as_str()); } - if is_hash_url { - _ = node.set_attribute(attribute, trimmed_url); - } else if is_relative_url { + if is_relative_url { let completed_url = match article_url.join(trimmed_url) { Ok(joined_url) => joined_url, Err(_) => continue, @@ -947,7 +944,7 @@ for xpath_strip_img_src in &config.strip_image_src { _ = Util::strip_node( context, - &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), + &format!("//img[contains(@src,'{xpath_strip_img_src}')]"), ); } } @@ -955,7 +952,7 @@ for xpath_strip_img_src in &global_config.strip_image_src { _ = Util::strip_node( context, - &format!("//img[contains(@src,'{}')]", xpath_strip_img_src), + &format!("//img[contains(@src,'{xpath_strip_img_src}')]"), ); } @@ -1270,7 +1267,7 @@ Ok(()) } - fn remove_single_cell_tables(root: &mut Node) { + pub(crate) fn remove_single_cell_tables(root: &mut Node) { let mut node_iter = Some(root.clone()); while let Some(node) = node_iter { @@ -1308,7 +1305,7 @@ } } - fn remove_extra_p_and_div(root: &mut Node) { + pub(crate) fn
remove_extra_p_and_div(root: &mut Node) { let mut node_iter = Some(root.clone()); while let Some(mut node) = node_iter { @@ -1330,7 +1327,7 @@ impl FullTextParser { } } - fn remove_share_elements(root: &mut Node) { + pub(crate) fn remove_share_elements(root: &mut Node) { let mut node_iter = Some(root.clone()); while let Some(mut node) = node_iter { @@ -1350,7 +1347,7 @@ impl FullTextParser { } } - fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> { + pub(crate) fn clean_attributes(root: &mut Node) -> Result<(), FullTextParserError> { let mut node_iter = Some(root.clone()); while let Some(mut node) = node_iter { @@ -1439,7 +1436,7 @@ impl FullTextParser { Ok(()) } - fn remove_empty_nodes(root: &mut Node) { + pub(crate) fn remove_empty_nodes(root: &mut Node) { let mut node_iter = Some(root.clone()); while let Some(mut node) = node_iter { diff --git a/article_scraper/src/images/mod.rs b/article_scraper/src/images/mod.rs index 4be98b8..dd5d018 100644 --- a/article_scraper/src/images/mod.rs +++ b/article_scraper/src/images/mod.rs @@ -276,7 +276,7 @@ impl ImageDownloader { } let image_base64 = base64::engine::general_purpose::STANDARD.encode(&image.data); - let image_string = format!("data:{};base64,{}", content_type, image_base64); + let image_string = format!("data:{content_type};base64,{image_base64}"); let image_data_base64 = ImageDataBase64 { url: image.url, data: image_string, diff --git a/article_scraper/src/util.rs b/article_scraper/src/util.rs index df76ced..cac5103 100644 --- a/article_scraper/src/util.rs +++ b/article_scraper/src/util.rs @@ -264,17 +264,15 @@ impl Util { context: &Context, id_or_class: &str, ) -> Result<(), FullTextParserError> { - let xpath = &format!( - "//*[contains(@class, '{}') or contains(@id, '{}')]", - id_or_class, id_or_class - ); + let xpath = + &format!("//*[contains(@class, '{id_or_class}') or contains(@id, '{id_or_class}')]"); let mut ancestor = xpath.clone(); if ancestor.starts_with("//") { ancestor = ancestor.chars().skip(2).collect(); } - let query = &format!("{}[not(ancestor::{})]", xpath, ancestor); + let query = &format!("{xpath}[not(ancestor::{ancestor})]"); let node_vec = Util::evaluate_xpath(context, query, false)?; for mut node in node_vec { if node.is_null() { diff --git a/article_scraper_cli/Cargo.toml b/article_scraper_cli/Cargo.toml index 22edcf1..9ed13b6 100644 --- a/article_scraper_cli/Cargo.toml +++ b/article_scraper_cli/Cargo.toml @@ -9,11 +9,22 @@ repository.workspace = true [dependencies] -article_scraper = { path = "../article_scraper/" } -clap = { version = "4.5", features = [ "derive" ] } +article_scraper = { path = "../article_scraper/" } +clap = { version = "4.5", features = ["derive"] } simplelog = "0.12" log = "0.4" url = "2.5" -reqwest = { version = "0.12", features = ["json", "native-tls", "gzip", "brotli", "stream"] } -tokio = { version = "1", features = ["macros", "fs", "io-util", "rt-multi-thread" ] } -indicatif = "0.17" \ No newline at end of file +reqwest = { version = "0.12", features = [ + "json", + "native-tls", + "gzip", + "brotli", + "stream", +] } +tokio = { version = "1", features = [ + "macros", + "fs", + "io-util", + "rt-multi-thread", +] } +indicatif = "0.18" From 4c9709e2927b6af4075c7335f9f332f95be6c0d8 Mon Sep 17 00:00:00 2001 From: Jan Lukas Gernert Date: Mon, 7 Jul 2025 18:56:16 +0200 Subject: [PATCH 12/12] bump version --- Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8569ad0..1a881ac 100644 --- a/Cargo.toml +++ 
b/Cargo.toml @@ -3,8 +3,8 @@ members = ["article_scraper", "article_scraper_cli"] resolver = "2" [workspace.package] -version = "2.1.2" +version = "2.1.3" authors = ["Jan Lukas Gernert "] edition = "2021" license = "GPL-3.0-or-later" -repository = "https://gitlab.com/news-flash/article_scraper" \ No newline at end of file +repository = "https://gitlab.com/news-flash/article_scraper"
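
A closing usage sketch from the editor (not part of any patch above): it drives the clean_html_fragment() entry point the same way the new cartographerstale test does. The module path article_scraper::clean and the html/thumbnail result fields are taken from that test; the base URL and the HTML fragment here are made up for illustration.

// Minimal sketch: clean a fragment containing a same-page footnote link,
// which patch 11 now preserves instead of stripping.
use article_scraper::clean::clean_html_fragment;
use url::Url;

fn main() {
    // r##"..."## is used so the '#' of the same-page href can appear inside the literal.
    let html = r##"<p>See the note<a href="#footnote-1">1</a>.</p>"##;
    let base = Url::parse("https://example.com").unwrap();

    match clean_html_fragment(html, &base) {
        // `res.html` holds the cleaned fragment, `res.thumbnail` an optional image URL.
        Ok(res) => println!("cleaned: {}", res.html),
        // Since patch 05 the error's Display output names the failing stage and cause.
        Err(err) => eprintln!("cleaning failed: {err}"),
    }
}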