1
0
Fork 0
mirror of https://gitlab.com/news-flash/article_scraper.git synced 2025-07-13 10:55:31 +02:00
article_scraper/resources/tests/golem/golem.de.txt
Jan Lukas Gernert 4b2e6a24eb initial commit
2018-07-31 16:10:09 +02:00

42 lines
1,022 B
Text

# Author: zinnober
# Site config for golem.de articles.
# Rewrite of the original template, which fetched the printer version without pictures.
# NOTE(review): presumably turns off the HTML tidy pre-pass and the automatic
# pruning of the extracted body for this site - confirm against the parser's docs.
tidy: no
prune: no
# Set full title: the <span> nested directly inside the <h1>
title: //h1/span
# Date is taken from the <time> element
date: //time
# Author is the link carrying rel="author"
author: //a[@rel='author']
# Content is here: the <article> element
body: //article
# Fetch full multipage articles by following the link with id "atoc_next"
next_page_link: //a[@id='atoc_next']
# Remove tracking and ads
# NOTE(review): strip_id_or_class presumably matches on a substring of the
# id/class attribute (see the trailing dash in "social-") - confirm.
strip_id_or_class: iqadtile4
# General cleanup: tables of contents, social buttons, comments, footer,
# job listings and tag lists
strip_id_or_class: list-jtoc
strip_id_or_class: table-jtoc
strip_id_or_class: implied
strip_id_or_class: social-
strip_id_or_class: comments
strip_id_or_class: footer
strip_id_or_class: job-market
strip_id_or_class: tags
# Tidy up galleries (could still be improved, though):
#   - images with an empty src attribute
#   - list items that have no child elements
#   - divs whose inline style mentions "margin"
#   - figures whose id contains "gvideo"
strip: //img[@src='']
strip: //li[not(*)]
strip: //div[contains(@style,'margin')]
strip: //figure[contains(@id,'gvideo')]
# Try it yourself: sample article URLs for checking this config
test_url: http://www.golem.de/news/intel-core-i7-5960x-im-test-die-pc-revolution-beginnt-mit-octacore-und-ddr4-1408-108893.html
test_url: http://www.golem.de/news/test-infamous-first-light-neonbunter-actionspass-1408-108914.html