# calibre/recipes/nzherald.recipe

#!/usr/bin/env  python
from calibre.web.feeds.recipes import BasicNewsRecipe


def classes(classes):
    """Build a tag-matching spec that selects elements by CSS class.

    ``classes`` is a space-separated string of class names. The returned
    dict has the shape ``{'attrs': {'class': predicate}}``; the predicate
    receives a ``class`` attribute value and yields a truthy result when
    that value shares at least one name with the requested classes
    (falsy values such as ``None`` or ``''`` are passed straight back).
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Short-circuit on a missing/empty attribute, otherwise report the
        # overlap between the element's class names and the wanted ones.
        return value and frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}


class NewZealandHerald(BasicNewsRecipe):
    """Recipe that builds an e-book from the New Zealand Herald RSS feeds.

    The cover is looked up on frontpages.com, articles are trimmed down to
    their heading, byline and body containers, and lazily loaded images are
    given a concrete ``src`` taken from their ``data-srcset`` attribute.
    """

    # Recipe metadata and fetch options consumed by BasicNewsRecipe.
    title = 'New Zealand Herald'
    __author__ = 'unkn0wn'
    description = 'Daily news'
    timefmt = ' [%d %b, %Y]'
    language = 'en_NZ'
    oldest_article = 1  # NOTE(review): calibre documents this value as days
    remove_attributes = ['style', 'height', 'width']
    use_embedded_content = False
    encoding = 'utf-8'
    ignore_duplicate_articles = {'url'}
    no_stylesheets = True
    resolve_internal_links = True
    remove_empty_feeds = True

    def get_cover_url(self):
        """Return the URL of today's Herald front page image, or None.

        Scans the frontpages.com New Zealand listing for the first ``<img>``
        whose ``src`` mentions ``new-zealand-herald``. The ``src`` is
        resolved against the listing page URL, so root-relative, relative,
        protocol-relative and absolute values all produce a valid URL.
        """
        from urllib.parse import urljoin

        page_url = 'https://www.frontpages.com/new-zealand-newspapers/'
        soup = self.index_to_soup(page_url)
        for img in soup.findAll('img', src=True):
            if 'new-zealand-herald' in img['src']:
                return urljoin(page_url, img['src'])
        # No matching image on the page.
        return None

    extra_css = '''
        [data-test-ui="author--text--body"], .article-media__caption {font-size: small;}
    '''

    # Only the heading, byline and the two body containers are kept.
    keep_only_tags = [
        dict(
            attrs={
                'data-test-ui': [
                    'article__heading',
                    'author--text--body',
                    'article-top-body',
                    'article-bottom-body',
                ]
            }
        ),
    ]

    # Related-article boxes, ad wrappers and the action bar are stripped.
    remove_tags = [classes('related-articles article__ad-wrapper article__action-bar')]

    feeds = [
        ('Business', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000003.xml'),
        ('World', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000002.xml'),
        ('National', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000001.xml'),
        ('Entertainment', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_001501119.xml'),
        ('Travel', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000007.xml'),
        ('Opinion', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000466.xml'),
        ('Life & Style', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000006.xml'),
        ('Technology', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000005.xml'),
        ('Sport', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000004.xml'),
        ('Motoring', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000009.xml'),
        ('Property', 'http://rss.nzherald.co.nz/rss/xml/nzhrsscid_000000008.xml'),
    ]

    def preprocess_html(self, soup, *a):
        """Give every ``data-srcset`` image a concrete ``src`` and return soup.

        The candidate whose width descriptor is exactly ``768w`` is
        preferred; when there is none, the last candidate in the list is
        used. Images whose ``data-srcset`` holds no usable candidate (empty
        attribute, or nothing but commas/whitespace) are left untouched
        instead of raising ``IndexError``.
        """
        for img in soup.findAll('img', attrs={'data-srcset': True}):
            # Each entry is "<url> [<descriptor>...]". Splitting on
            # whitespace turns blank entries (e.g. after a trailing comma)
            # into empty lists, which are discarded here.
            candidates = [
                parts
                for parts in (entry.split() for entry in img['data-srcset'].split(','))
                if parts
            ]
            if not candidates:
                continue
            for parts in candidates:
                # Compare the descriptor token itself so that "1768w" or a
                # URL that merely contains "768w" is not picked by mistake.
                if '768w' in parts[1:]:
                    img['src'] = parts[0]
                    break
            else:
                img['src'] = candidates[-1][0]
        return soup