# calibre/recipes/private_eye.recipe

'''
Fetch Private Eye (Online Edition)
'''

import re
from datetime import datetime, timedelta

from calibre.web.feeds.news import BasicNewsRecipe


class PrivateEyeRecipe(BasicNewsRecipe):
    ##
    # Last Edited:  2026-01-13 by Sophist-UK
    #
    # Remark:   Version 3.4 2026-01-13 by Sophist-UK
    #               Tweak #block-sections spacing
    #           Version 3.3 2026-01-11 by Sophist-UK
    #               Fix recipe after web-site changes
    #                   Get next publication date from home page
    #                   Get articles URL list from sub-menu on News page
    #               Add classified ads from https://www.eyeads.co.uk/
    #           Version 3.2 2025-04-02 by Sophist-UK
    #               Fix recipe after web-site changes
    #           Version 3.1 2023-07-14 by Sophist-UK
    #               Show crossword on right so clues are continuous down left
    #               Link to crossword image removed
    #               Improve many image layouts
    #           Version 3.0 2023-07-01 by Sophist-UK
    #               Rewrite (by Sophist-UK) to fit latest web pages,
    #                   correctly identify pages to include
    #                   and improve formatting.
    #               Added:  inclusion of About page,
    #                       identifying series number and publication date and setting metadata.

    # Recipe metadata.
    # NOTE: `title` is rebound further down by `series = title = ...`, so the
    # value assigned on the next line does not survive class creation.
    title = u'Private Eye (Online Edition)'
    description = u'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
    publication_type = 'magazine'
    language = 'en_GB'
    encoding = 'utf-8'
    oldest_article = 13
    max_articles_per_feed = 100
    remove_javascript = True
    ignore_duplicate_articles = {'url'}

    # Logged at the start of parse_index
    __author__ = u'Martyn Pritchard & Sophist-UK'
    __copyright__ = '© 2020-2026, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>'

    # Site URLs used by the methods of this class
    base_url            = 'https://www.private-eye.co.uk/'
    current_issue       = 'https://www.private-eye.co.uk/news'
    about_page          = 'https://www.private-eye.co.uk/about'
    number_crunching    = 'https://www.private-eye.co.uk/number-crunching'
    masthead_url        = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
    classified_ads      = 'https://www.eyeads.co.uk'
    # NOTE(review): not referenced by any method visible in this file -
    # presumably the sidebar label that precedes the date; confirm it is still needed.
    next_issue_date     = 'Next issue on sale:'

    author = 'Private Eye'
    # This assignment also replaces the `title` defined at the top of the class
    series = title = 'Private Eye Online'
    # Defaults only: get_cover_url overwrites 'series_index', and parse_index
    # overwrites 'title' / 'title_sort' and adds 'pubdate' when a date is found.
    conversion_options = {
        'authors':      author,
        'author_sort':  author,
        'series':       series,
        'series_index': 0,
        'title':        title,
        'title_sort':   title,
    }

    # Attribute filters for the <div> elements whose links parse_index collects
    index_attrs_to_include = [
        {'id': 'sub-nav-box'},
        {'class': 'sub-nav-bar'},
    ]

    # Link texts that parse_index ignores
    titles_to_skip = [
        'Home',
        'more',
        'In This Issue',
    ]

    # Maps the text after the last '/' of a URL to the article title that
    # add_article uses instead of the title it was given.
    url_to_section_name = {
        'news': 'News',
        'street-of-shame': 'Street of Shame',
        'hp-sauce': 'HP Sauce',
        'cartoons': 'Strips and Cartoons',
        'pms-whatsapp': "PM's Whatsapp",
        'mediaballs': 'Commentatorballs',
        'lookalikes': 'Lookalike',
        'crossword': 'Eye Crossword',
        'in-the-back': 'In the Back',
        'media-news': 'Media News',
        'columnists': 'Columnists',
        'rotten-boroughs': 'Rotten Boroughs',
        'number-crunching': 'Number Crunching',
        'www.eyeads.co.uk': 'Private Eye Classified Ads',
    }

    # Greedy span from a digit to the last digit on the same line; applied with
    # .match() in get_next_issue_date, so the text must start with a digit.
    re_extract_date = re.compile(r'\d.*\d')
    # Article dicts and their URLs, read and appended to by add_article
    # through self.articles / self.urls.
    articles = []
    urls = []

    def add_article(self, title, url):
        if url in self.urls:
            return
        known_url = url.rsplit('/',1)[-1]
        if known_url and known_url in self.url_to_section_name:
            title = self.url_to_section_name[known_url]
        if not title:
            return
        self.articles.append({
            'title':    title,
            'url':      url,
        })
        self.urls.append(url)
        self.log('Adding:', title, '(', url, ')')

    def get_cover_url(self):
        '''
        Return the src of the current issue's large cover image, or None.

        The current-issue page is fetched and its <img class="issue-cover">
        tags are scanned for a src ending in '_big.jpg'. The part of the file
        name in front of that suffix must be an integer; it is stored as
        conversion_options['series_index']. Images that fail this test are
        skipped, and None is returned when no image qualifies.
        '''
        suffix = '_big.jpg'
        soup = self.index_to_soup(self.current_issue)

        for img in soup.findAll('img', {'class': 'issue-cover'}):
            # An <img> without a src attribute cannot be the cover
            src = img.get('src') or ''
            if not src.endswith(suffix):
                continue

            # [-1] yields the whole string when src contains no '/'
            file_name = src.rsplit('/', 1)[-1]
            try:
                issue_number = int(file_name[:-len(suffix)])
            except (TypeError, ValueError):
                # wrong big image
                continue

            self.conversion_options.update({'series_index': issue_number})
            self.log('series-index:', issue_number)
            return src
        return None

    def get_next_issue_date(self, soup):
        '''
        Return the first date-like text in the sections sidebar, or None.

        Every stripped string inside <div id="sections-sidebar"> is tested
        against re_extract_date with .match(), so only text that starts with a
        digit qualifies. None is returned when the sidebar is missing or no
        string matches.
        '''
        sidebar = soup.find('div', id='sections-sidebar')
        if sidebar is not None:
            # Lazy, so strings after the first hit are never examined
            attempts = (self.re_extract_date.match(text) for text in sidebar.stripped_strings)
            first_hit = next((hit for hit in attempts if hit), None)
            if first_hit is not None:
                return first_hit.group(0)
        return None

    def parse_index(self):
        '''
        Build the list of pages to download for the current issue.

        Steps:
          1. Log the recipe details.
          2. Derive the publication date (next issue date minus 13 days) and
             store it, with a dated title, in conversion_options. A missing or
             unparsable date is logged and otherwise ignored.
          3. Queue the News page, every link found in the navigation menus
             listed in index_attrs_to_include, then the fixed extra pages.

        Returns a single-feed list: [('Private Eye', [article dicts])].
        Raises ValueError when the navigation menus yield no link at all.
        '''
        self.log('')
        self.log('Title:', self.title)
        self.log('Description:', self.description)
        self.log('Authors:', self.__author__)
        self.log('Copyright:', self.__copyright__)
        self.log('')
        self.log('Support:', u'If this recipe stops working email sophist-uk@sodalis.co.uk to let them know.')
        self.log('')

        # Start each run with fresh lists: the class-level lists are shared by
        # all instances and would carry entries over from an earlier run.
        self.articles = []
        self.urls = []

        soup = self.index_to_soup(self.current_issue)

        # Bound before the try block so the except handler can always log it
        next_issue_date = None
        try:
            # Get publication date - Next issue on sale date - 13 days
            next_issue_date = self.get_next_issue_date(soup)
            self.log('next_issue_date:', next_issue_date)
            if next_issue_date is None:
                # None has no split(); treat a missing date like a bad one
                raise ValueError('next issue date not found')

            # split() with no argument tolerates repeated or odd whitespace
            day, month, year = next_issue_date.split()
            # remove day suffixes e.g. 2nd
            day = ''.join(c for c in day if c.isdigit())
            pub_date = datetime.strptime(' '.join((day, month, year)), '%d %B %Y') - timedelta(days=13)
            self.log('pub-date:', pub_date)

            title = self.title + ' ' + pub_date.strftime('%Y-%m-%d')
            self.conversion_options.update({
                'pubdate':    pub_date.strftime('%d %B %Y').lstrip('0'),
                'title':      title,
                'title_sort': title,
            })
        except (TypeError, ValueError):
            # Bad date
            self.log('Cannot parse next issue date from:', next_issue_date)

        # Get pages from the various contents panels.
        # Duplicates will be eliminated automatically.

        # Since we are now using the first online page to generate the list of pages to include
        # we need to include this one:

        self.add_article('News', self.current_issue)

        # Counted separately because self.articles already holds the News page,
        # so its emptiness cannot tell whether the menus were found.
        menu_links = 0
        site_root = self.base_url.rstrip('/')

        for section_attrs in self.index_attrs_to_include:
            section = soup.find('div', attrs=section_attrs)

            if not section:
                continue

            for a in section.findAll('a', href=True):
                url = a.get('href')
                title = a.getText().rstrip(' »\n')
                if title in self.titles_to_skip:
                    continue
                if not url.startswith('http'):
                    # Exactly one '/' between site and path, whether or not the
                    # href is root-relative, so duplicate detection keeps working
                    url = site_root + '/' + url.lstrip('/')
                self.add_article(title, url)
                menu_links += 1

        if not menu_links:
            raise ValueError('Private-Eye Online index of pages not found')

        # Add the pages with no menu
        self.add_article('Number crunching', self.number_crunching)

        # Add the About page as a final article
        self.add_article('About Private Eye', self.about_page)

        # Add classified adverts - as a fun historical record
        self.add_article('Classified Ads', self.classified_ads)

        self.log('parse_index:', self.articles)

        return [('Private Eye', self.articles)]

    def preprocess_html(self, soup):
        '''
        Tidy the crossword markup of a downloaded page and return the soup.

        Links whose href points into /pictures/crossword/ are unwrapped (their
        contents stay in place), and the align attribute is deleted from
        images whose src points there, so that float right works.
        '''
        crossword_picture = re.compile(r'/pictures/crossword/')

        # Remove <a> tag link to crossword image
        for link in soup.findAll('a', {'href': crossword_picture}):
            self.log('Removing link to crossword image...')
            link.unwrap()

        # Remove align tag in crossword image (so float right works)
        for image in soup.findAll('img', {'src': crossword_picture}):
            if 'align' not in image.attrs:
                continue
            self.log('Removing crossword image align attribute...')
            del image.attrs['align']

        return soup

    # Large parts of every page are not article content and are stripped.
    # These two lists handle sibling content: remove_tags_after starts with the
    # same entries as remove_tags_before and adds three of its own.
    remove_tags_before = [{'name': 'div', 'class': 'article'}] + [
        {'name': 'div', 'id': div_id}
        for div_id in ('page', 'page-wide', 'content')
    ]
    remove_tags_after = remove_tags_before + [
        {'name': 'div', 'id': 'about-covers'},
        {'name': 'script', 'id': 'about-covers'},
        {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}},
        # Disabled: {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/logos/')}},
    ]
    # Remove non-sibling content
    remove_tags = [
        # Top
        {'name': 'div', 'attrs': {'id': 'header-wide'}},
        {'name': 'div', 'attrs': {'id': 'top-bar-sticky'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-mobile'}},
        {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
        {'name': 'div', 'attrs': {'id': 'sub-nav-box'}},
        {'name': 'div', 'attrs': {'class': 'sub-nav-bar'}},
        # Content
        {'name': 'div', 'attrs': {'id': 'why-subscribe'}},
        {'name': 'div', 'attrs': {'id': 'issue-box'}},
        {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
        # Spacers
        {'name': 'div', 'attrs': {'class': 'gap'}},
#        {'name': 'div', 'attrs': {'class': 'gap-small'}},
        {'name': 'div', 'attrs': {'class': 'gap-bigger'}},
        {'name': 'div', 'attrs': {'class': 'gap-biggest'}},
        # Bottom
        {'name': 'div', 'attrs': {'id': 'home-content'}},
        {'name': 'div', 'attrs': {'id': 'home-shop-content-wide'}},
        {'name': 'div', 'attrs': {'id': 'page-94-strip-wide'}},
        {'name': 'div', 'attrs': {'id': 'footer-wide'}},
        {'name': 'div', 'attrs': {'id': 'about-covers'}},
#        {'name': 'div', 'attrs': {'id': 'follow-buttons'}},
#        {'name': 'div', 'attrs': {'id': 'sidebar'}},
        {'name': 'a', '  attrs': {'href': 'https://www.private-eye.co.uk/shop'}},
        {'name': 'a', '  attrs': {'href': 'https://shop.private-eye.co.uk'}},
        {'name': 'a', '  attrs': {'href': 'https://shop.private-eye.co.uk/'}},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}},
        {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}},
        {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/logos/gnitty.gif')}},
        {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/stuff/subscribe_here_gnitty.gif')}},
        {'name': 'iframe'},
        {'name': 'script'},
        {'name': 'style'},
        # Classified site
        {'name': 'a', 'attrs': {'class': 'skip-link'}},
        {'name': 'header', 'attrs': {'id': 'masthead'}},
        {'name': 'div', 'attrs': {'class': 'placeAdIntro'}},
        {'name': 'img', 'attrs': {'class': 'cat-image'}},
        {'name': 'img', 'attrs': {'class': 'catArrow'}},
        {'name': 'div', 'attrs': {'class': 'ad-contact'}},
        {'name': 'div', 'attrs': {'id': re.compile(r'cookie-law-')}},
        {'name': 'div', 'attrs': {'class': re.compile(r'cli-modal')}},
        {'name': 'footer', 'attrs': {'class': 'site-footer'}},
    ]

    # (pattern, replacement function) pairs.
    preprocess_regexps = [
        # Headline span becomes <h1>; an optional strapline element that
        # follows it (a "text" span or a <font> tag) is wrapped in <h4>.
        (
            re.compile(
                r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br/?>\s*)*((?:<span class="text">(.*?)</span>)|(?:<font .*?>(.*?)</font>))?',
                re.IGNORECASE | re.DOTALL
            ),
            lambda m: '<h1 class="myheadline">{}</h1>{}'.format(
                m.group(1),
                '<h4 class="mystrapline">{}</h4>'.format(m.group(2)) if m.group(2) else ''
            )
        ),
        # Links to /issue-... pages are broken: keep only the link content
        (
            re.compile(r'<a href="/issue-.*?".*?>(.*?)</a>', re.IGNORECASE | re.DOTALL),
            lambda m: m.group(1)
        ),
    ]

    # The following extra css is to tweak the formatting of various elements of various article pages.
    # Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
    # Some of these mimic the actual layout.css which does not seem to make it across into the calibre
    # ebook without duplicating it as extra css.
    # However some is new css to tweak output when part of an ebook.
    #
    # The .catTitle rule previously read `color: fff`, which is not a valid colour value;
    # it is now `#fff` so the title text is white on the dark background.
    #
    # NOTE(review): `minimum-width`, `maximum-width` and `width: none` below are not valid CSS
    # (the properties are `min-width` / `max-width`, and `width` takes `auto`), so they presumably
    # have no effect. They are left untouched because correcting them would change the layout of
    # every image they cover - confirm the intended rendering before changing them.
    extra_css = ' \n '.join([
        '.article  {font-family: Merriweather, Georgia, serif; font-size: 1em}',
        '.myheadline {font-family: "Source Sans Pro", Arial, Helvetica, sans-serif; font-size: 2em;}',
        '.mystrapline {font-family: "Source Sans Pro", Arial, Helvetica, sans-serif; font-size: 1.2em;}',
        '#content img {float: right; width: 45%; minimum-width:350px;}',
        '#content img.office {float: right; width: 45%; maximum-width:390px; margin-left: 15px;}',
        '#content img.cartoon-left {float: left; margin-right: 15px; margin-bottom: 15px;}',
        '#content img.cartoon-right {float: none; margin-bottom: 15px;}',
        '#content img.strip {float: none; width: 100%;}',
        '#content img:first-child {float: none;}',
        '#content img.gnitty-right {float: none; width: 160px;}',
        '#content #story > div[align=right] > img:first-child {float: none; width: 15px;}',
        '#content #story > img:first-child {float: none; height: 100px; width: none; minimum-width: none;}',
        '#content #block-sections div.divider {height: 1px; background-color: #eee; margin: 10px 0px; clear: both;}',
        '#content #block-sections img {float: none; width: none;}',
        '#content #block-sections img.lookalike {float: none; width: 100%;}',
        '#content #block-sections img.photo-right {float: right; width: 25%; min-width:120px; margin-left: 15px;}',
        '#content #block-sections > p:last-child > img:first-child {float: none; width: 120px;}',
        '#content #block-sections > p:last-child > img:nth-child(2) {float: none; width: 120px;}',
        '#content #block-sections img.crossword {float: right; width: 40%; margin-left: 15px; min-width: 350px;}',
        '#content #article-caption-box {float: right; background: #222222; display: block; width: 40%; min-width: 250px; font-size: 90%; margin-left: 15px;}',
        '#content #article-caption-box img {float: none; width: 100%; max-width: none;}',
        '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
        '#whatsapp {border-left: 5px #f45e61 solid; border-right: 5px #f45e61 solid; border-bottom: 5px #f45e61 solid; padding: 10px 20px 20px 10px;}',
        '#whatsapp::after {clear:both;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {margin: 20px 0; padding: 15px; border-radius: 10px;}',
        '#whatsapp .whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, "sans-serif"; font-weight: 300; font-size: 18px; line-height: 24px;}',
        '#whatsapp .whatsapp-left {margin-right: 30%; background-color: #eeeeee;}',
        '#whatsapp .whatsapp-right {margin-left: 30%; background-color: #dce5ae;}',
        '#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
        '#whatsapp .whatsapp-left img.emoji, #whatsapp .whatsapp-right img.emoji {max-width: 35px; margin: 0 5px; vertical-align: middle;}',
        '.container .row .catTitle {color: #fff; background-color: #12174a;}',
    ])