Update various Hungarian and Czech news sources

This commit is contained in:
Kovid Goyal
2026-05-04 23:24:42 +05:30
parent f3687893aa
commit e098139d38
4 changed files with 140 additions and 106 deletions
+32 -31
View File
@@ -1,28 +1,23 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class aktualneRecipe(BasicNewsRecipe):
__author__ = 'bubak'
__author__ = 'bubak'
title = u'aktualne.cz'
publisher = u'Centrum holdings'
description = 'aktuálně.cz'
oldest_article = 1
max_articles_per_feed = 20
encoding = 'utf-8'
feeds = [
(u'Domácí', u'http://aktualne.centrum.cz/feeds/rss/domaci/?photo=0'),
(u'Zprávy', u'http://aktualne.centrum.cz/feeds/rss/zpravy/?photo=0'),
(u'Praha', u'http://aktualne.centrum.cz/feeds/rss/domaci/regiony/praha/?photo=0'),
(u'Ekonomika', u'http://aktualne.centrum.cz/feeds/rss/ekonomika/?photo=0'),
(u'Finance', u'http://aktualne.centrum.cz/feeds/rss/finance/?photo=0'),
(u'Blogy a názory', u'http://blog.aktualne.centrum.cz/export-all.php')
]
(u'Domácí', u'https://zpravy.aktualne.cz/rss/domaci/'),
(u'Zprávy', u'https://zpravy.aktualne.cz/rss/'),
(u'Zahraniční zprávy', u'https://zpravy.aktualne.cz/rss/zahranici/'),
(u'Ekonomika', u'https://zpravy.aktualne.cz/rss/ekonomika/'),
(u'Blogy a názory', u'https://nazory.aktualne.cz/rss/')
]
language = 'cs'
cover_url = 'http://img.aktualne.centrum.cz/design/akt4/o/l/logo-akt-ciste.png'
@@ -30,31 +25,37 @@ class aktualneRecipe(BasicNewsRecipe):
no_stylesheets = True
remove_attributes = []
remove_tags_before = dict(name='h1', attrs={'class': ['titulek-clanku']})
remove_tags_before = dict(name='h1', attrs={'class':['titulek-clanku']})
remove_tags_after = dict(name='div', attrs={'class':['e-articles-show-body-bottom__share']})
filter_regexps = [r'img.aktualne.centrum.cz']
remove_tags = [dict(name='div', attrs={'id': ['social-bookmark']}),
dict(name='div', attrs={'class': ['box1', 'svazane-tagy']}),
dict(name='div', attrs={'class': 'itemcomment id0'}),
dict(name='div', attrs={'class': 'hlavicka'}),
dict(name='div', attrs={'class': 'hlavni-menu'}),
dict(name='div', attrs={
'class': 'top-standard-brand-obal'}),
dict(name='div', attrs={'class': 'breadcrumb'}),
dict(name='div', attrs={'id': 'start-standard'}),
dict(name='div', attrs={'id': 'forum'}),
dict(name='span', attrs={'class': 'akce'}),
dict(name='span', attrs={'class': 'odrazka vetsi'}),
dict(name='div', attrs={'class': 'boxP'}),
dict(name='div', attrs={'class': 'box2'})]
remove_tags = [dict(name='div', attrs={'id':['social-bookmark']}),
dict(name='div', attrs={'class':['box1', 'svazane-tagy']}),
dict(name='div', attrs={'class':'itemcomment id0'}),
dict(name='div', attrs={'class':'hlavicka'}),
dict(name='div', attrs={'class':'hlavni-menu'}),
dict(name='div', attrs={'class':'top-standard-brand-obal'}),
dict(name='div', attrs={'class':['breadcrumb', 'hlavicka clearfix', 'e-ui-sub-menu',
'e-web-aktualne-articles-show-header__authors-and-share', 'f-tiptap-node',
'e-web-aktualne-articles-show__recommended']}),
dict(name='div', attrs={'id':'start-standard'}),
dict(name='div', attrs={'id':'forum'}),
dict(name='ul', attrs={'id':'nav-breadcrumb'}),
dict(name='header'),
dict(name='footer'),
dict(name='aside'),
dict(name='span', attrs={'class':'akce'}),
dict(name='span', attrs={'class':'odrazka vetsi'}),
dict(name='div', attrs={'class':'boxP'}),
dict(name='div', attrs={'class':'box2'})]
preprocess_regexps = [
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*',
re.DOTALL | re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL | re.IGNORECASE), lambda match: '</body>')]
(re.compile(r'<div class="(contenttitle"|socialni-site|wiki|facebook-promo|facebook-like-button"|meta-akce).*',
re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<div class="[^"]*poutak-clanek-trojka".*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
(re.compile(r'<[aA]\s[^>]*>', re.DOTALL|re.IGNORECASE), lambda match: '')]
keep_only_tags = []
visited_urls = {}
def get_article_url(self, article):
url = BasicNewsRecipe.get_article_url(self, article)
if url in self.visited_urls:
+21 -47
View File
@@ -1,63 +1,37 @@
##########################################################################
# Description: http://hvg.hu/ RSS channel
# Author: Bigpapa (bigpapabig@hotmail.com)
# Date: 2011.12.20. - V1.1
##########################################################################
import re
from calibre.web.feeds.news import BasicNewsRecipe
class hvg(BasicNewsRecipe):
title = u'HVG'
__author__ = 'Bigpapa'
__author__ = 'bubak'
language = 'hu'
oldest_article = 5 # Hany napos legyen a legregebbi cikk amit leszedjen.
# Az adott e-bookban tarolt cikkek feedenkenti maximalis szamat adja meg.
max_articles_per_feed = 5
oldest_article = 1
max_articles_per_feed = 20
no_stylesheets = True
encoding = 'utf8'
extra_css = ' h2 { font:bold 28px} '
remove_attributes = ['style', 'font', 'href']
keep_only_tags = [
dict(name='div', attrs={'id': ['pg-content']})
]
remove_tags_before = dict(name='h1', attrs={'class':['title']})
remove_tags = [
dict(name='div', attrs={'class': [
'box articlemenu', 'bannergoogle468', 'boxcontainer left', 'boxcontainer', 'commentbox']}),
dict(name='table', attrs={'class': ['banner2', 'monocle']}),
dict(name='div', attrs={
'id': ['connect_widget_4cf63ca849ddf4577922632', 'sharetip', 'upprev_box']}),
dict(name='div', attrs={
'style': ['float: right; margin-bottom: 5px;', 'display: none;']}),
dict(name='h3', attrs={'class': ['hthree']}),
dict(name='ul', attrs={'class': ['defaultul']}),
dict(name='form', attrs={'id': ['commentForm']}),
dict(name='h6', attrs={'class': ['hthree']}),
dict(name='h6', attrs={'class': ['more2']}),
dict(name='img', attrs={'class': ['framed']}),
dict(name='td', attrs={'class': [
'greyboxbody', 'embedvideobody', 'embedvideofooter', 'embedvideobottom']}),
dict(name='div', attrs={'class':re.compile(
r'hide-on-mobile|hide-on-tablet|hide-on-desktop|election-popup', re.IGNORECASE)})
,dict(name='section', attrs={'class':re.compile(r'card-section|container-narrow', re.IGNORECASE)})
,dict(name='div', attrs={'class':['Desktop-responsive-display', 'ad-container-wrapper']})
,dict(name='ul', attrs={'class':['tags', 'author-list']})
,dict(name='blockquote', attrs={'class':['embedly-card']})
,dict(name='footer')
,dict(name='iframe')
]
feeds = [
(u'Itthon', 'http://hvg.hu/rss/itthon'),
(u'Vil\xe1g', 'http://hvg.hu/rss/vilag'),
(u'Gazdas\xe1g', 'http://hvg.hu/rss/gazdasag'),
(u'Tudom\xe1ny', 'http://hvg.hu/rss/tudomany'),
(u'Panor\xe1ma', 'http://hvg.hu/rss/panorama'),
(u'Karrier', 'http://hvg.hu/rss/karrier'),
(u'Gasztron\xf3mia', 'http://hvg.hu/rss/gasztronomia'),
(u'Helyi \xe9rt\xe9k', 'http://hvg.hu/rss/helyiertek'),
(u'Kult\xfara', 'http://hvg.hu/rss/kultura'),
(u'C\xe9gaut\xf3', 'http://hvg.hu/rss/cegauto'),
(u'V\xe1llalkoz\xf3 szellem', 'http://hvg.hu/rss/kkv'),
(u'Eg\xe9szs\xe9g', 'http://hvg.hu/rss/egeszseg'),
(u'V\xe9lem\xe9ny', 'http://hvg.hu/rss/velemeny'),
(u'Sport', 'http://hvg.hu/rss/sport')
(u'Világ', 'http://hvg.hu/rss/vilag'),
(u'Gazdaság', 'http://hvg.hu/rss/gazdasag'),
(u'Vélemény', 'http://hvg.hu/rss/velemeny'),
(u'Tudomány', 'http://hvg.hu/rss/tudomany'),
(u'Panoráma', 'http://hvg.hu/rss/panorama'),
(u'Helyi érték', 'http://hvg.hu/rss/helyiertek'),
(u'Kultúra', 'http://hvg.hu/rss/kultura'),
(u'Egészség', 'http://hvg.hu/rss/egeszseg')
]
+51 -12
View File
@@ -1,18 +1,57 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Fetch Magyar Nemzet
'''
import re
from calibre.web.feeds.news import AutomaticNewsRecipe
from calibre.web.feeds.news import BasicNewsRecipe
class BasicUserRecipe1556388146(AutomaticNewsRecipe):
title = 'Magyar Nemzet'
__author__ = 'pofa'
class mn(BasicNewsRecipe):
title = u'Magyar Nemzet'
description = ''
__author__ = 'bubak'
use_embedded_content = False
oldest_article = 2
max_articles_per_feed = 40
no_stylesheets = True
language = 'hu'
oldest_article = 1
max_articles_per_feed = 100
auto_cleanup = True
simultaneous_downloads = 5
remove_javascript = True
cover_url = 'https://magyarnemzet.hu/assets/images/logo-mno-dark.svg'
feeds = [
('Magyar Nemzet', 'https://magyarnemzet.hu/feed/?'),
]
(u'MNO hírek2', u'https://magyarnemzet.hu/publicapi/hu/rss/magyar_nemzet/articles')
]
extra_css = '''
'''
def preprocess_html(self, soup):
    """Rename the first ``<h2>`` element in *soup* to ``badH2``.

    :param soup: parsed document object exposing ``find(name)``
        (presumably a BeautifulSoup tree supplied by calibre -- confirm).
    :return: the same *soup* object, modified in place.
    """
    h2 = soup.find('h2')
    # find() gives None when the page has no <h2>; without this guard the
    # attribute assignment would raise AttributeError for such a page.
    if h2 is not None:
        h2.name = 'badH2'
    return soup
preprocess_regexps = [(re.compile(r'<a\s[^>]*>(.*?)</a>', re.DOTALL|re.IGNORECASE), lambda match: match.group(1))]
remove_attributes = []
remove_tags_before = dict(name='h3', attrs={'class':['cikkcim']})
remove_tags_after = dict(name='mno-promo-block')
remove_tags = [dict(name='div', attrs={'class':['adbox ad_cikk_kozepre']}),
dict(name='div', attrs={'class':['cikk_share_box clr', 'sidebar', 'breaking']}),
dict(name='div', attrs={'class':['cikk-cimkek', 'colwrp', 'article-cikkboxes clearfix']}),
dict(name='div', attrs={'class':['author', 'article-data', 'info-line', 'dossier-link-wrapper']}),
dict(name='div', attrs={'id':['menu', 'article-right-col', 'header']}),
dict(name='header'),
dict(name='app-breadcrumb'),
dict(name='mno-tag-list'),
dict(name='figcaption'),
dict(name='app-comment-section'),
dict(name='div', attrs={'class':['cikk_video_box box clr', 'cikkblock']}),
dict(name='article', attrs={'class':['article-recommender']}),
dict(name='a', attrs={'href':['http://www.mno.hu/sorolo', '/rovat/keresztrejtveny']}),
dict(name='app-recommendation-block'),
dict(name='mno-external-recommendation-block'),
dict(name='mno-opinion-newsletter-box'),
dict(name='mno-promo-block'),
dict(name='mno-tag-list'),
dict(name='div', attrs={'class':['print']})]
+36 -16
View File
@@ -1,35 +1,55 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import unicode_literals
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
class plRecipe(BasicNewsRecipe):
__author__ = 'bubak'
__author__ = 'bubak'
title = u'Parlamentn\u00ed Listy'
publisher = u''
description = ''
oldest_article = 1
max_articles_per_feed = 20
max_articles_per_feed = 40
feeds = [
(u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx')
]
(u'Parlamentní listy.cz', u'http://www.parlamentnilisty.cz/export/rss.aspx')
]
language = 'cs'
cover_url = 'http://www.parlamentnilisty.cz/design/listy-logo2.png'
remove_javascript = True
no_stylesheets = True
use_embedded_content = False
remove_attributes = []
remove_tags = [dict(name='div', attrs={'class': ['articledetailboxin', 'crumbs', 'relatedarticles articledetailbox']}),
dict(name='div', attrs={
'class': ['socialshare-1 noprint', 'socialshare-2 noprint']}),
dict(name='div', attrs={'id': 'widget'}),
dict(name='div', attrs={'class': 'article-discussion-box noprint'})]
preprocess_regexps = [(re.compile(r'<(span|strong)[^>]*>\s*Ptejte se politik.*',
re.DOTALL | re.IGNORECASE), lambda match: '</body>')]
remove_tags_after = [
dict(name='div', attrs={'class':re.compile(r'donate-banner|related-articles-wrap', re.IGNORECASE)})
]
remove_tags = [dict(name='div', attrs={'class':['articledetailboxin','crumbs', 'relatedarticles articledetailbox']}),
dict(name='div', attrs={'class':['socialshare-1 noprint', 'socialshare-2 noprint', 'advert-caption']}),
dict(name='div', attrs={'class':['breadcrumbs', 'marquee', 'navbar', 'header-tags']}),
dict(name='div', attrs={'class':['search-mobile', 'related-articles']}),
dict(name='ul', attrs={'class':['nav navbar-nav']}),
dict(name='ul', attrs={'role':['navigation']}),
dict(name='div', attrs={'id':['widget', 'carousel-marquee', 'search-form-radios', 'search-form-profiles']}),
dict(name='img'),
dict(name='section', attrs={'class':'section-brown section-inarticle related-articles'}),
dict(name='section', attrs={'class':'section-blue section-inarticle profile-thumb-card'}),
dict(name='section', attrs={'class':'related-articles-wrap'}),
dict(name='section', attrs={'class':'section-brown section-inarticle poll'}), # does not work
dict(name='div', attrs={'class':'article-discussion-box noprint'}),
dict(name='div', attrs={'class':re.compile(r'login-buttons', re.IGNORECASE)}),
dict(name='section', attrs={'class':re.compile(r'poll', re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(r'social-media-buttons', re.IGNORECASE)}),
dict(name='div', attrs={'class':re.compile(r'alert', re.IGNORECASE)}),
dict(name='section', attrs={'class':re.compile(r'section-inarticle', re.IGNORECASE)}),
dict(attrs={'class':re.compile(r'twitter-tweet', re.IGNORECASE)}),
dict(name='figure')
]
keep_only_tags = [dict(name='div', attrs={'class': ['article-detail']})]
# (compiled pattern, replacement callable) pairs used to clean up article HTML.
preprocess_regexps = [
    # Cut the page at the "Ptejte se politik..." block: everything from that
    # tag to the end of the input is replaced by a closing </body>.
    (re.compile(r'<(span|strong|p)[^>]*>\s*Ptejte se politik.*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
    # Unwrap links: keep only the anchor's plain text (group 2), dropping the
    # <a> tag itself and any tags directly wrapping that text.
    (re.compile(r'<a\s[^>]*>(<[^>]*>)*([^<]*)(<[^>]*>)*</a>', re.DOTALL|re.IGNORECASE), lambda match: match.group(2)),
    # Cut the page at the "Jste politik?" block. The question mark is escaped
    # so it is matched literally; unescaped it made the "k" optional and the
    # pattern also fired on words such as "politici".
    (re.compile(r'<(span|strong|p)[^>]*>\s*Jste politik\?.*', re.DOTALL|re.IGNORECASE), lambda match: '</body>'),
]
def get_browser(self, *a, **kw):
    """Build the browser via the parent class with SSL verification off.

    Positional arguments are forwarded untouched; the keyword arguments are
    forwarded with ``verify_ssl_certificates`` forced to ``False``,
    overriding any value supplied by the caller.
    """
    # NOTE(review): certificate verification is deliberately disabled here;
    # confirm the site still requires this.
    browser_options = dict(kw, verify_ssl_certificates=False)
    return super().get_browser(*a, **browser_options)