Files
calibre/recipes/private_eye.recipe

348 lines
16 KiB
Python

'''
Fetch Private Eye (Online Edition)
'''
import re
from datetime import datetime, timedelta
from calibre.web.feeds.news import BasicNewsRecipe
class PrivateEyeRecipe(BasicNewsRecipe):
##
# Last Edited: 2026-01-13 by Sophist-UK
#
# Remark: Version 3.4 2026-01-13 by Sophist-UK
# Tweak #block-sections spacing
# Version 3.3 2026-01-11 by Sophist-UK
# Fix recipe after web-site changes
# Get next publication date from home page
# Get articles URL list from sub-menu on News page
# Add classified ads from https://www.eyeads.co.uk/
# Version 3.2 2025-04-02 by Sophist-UK
# Fix recipe after web-site changes
# Version 3.1 2023-07-14 by Sophist-UK
# Show crossword on right so clues are continuous down left
# Link to crossword image removed
# Improve many image layouts
# Version 3.0 2023-07-01 by Sophist-UK
# Rewrite (by Sophist-UK) to fit latest web pages,
# correctly identify pages to include
# and improve formatting.
# Added: inclusion of About page,
# identifying series number and publication date and setting metadata.
# --- Book metadata -------------------------------------------------------
# `title` and `series` are bound in a single statement so the book title and
# the series name always carry the same text.
author = 'Private Eye'
series = title = 'Private Eye Online'
description = 'Private Eye is a fortnightly British satirical and current affairs magazine, edited by Ian Hislop'
publication_type = 'magazine'
language = 'en_GB'
encoding = 'utf-8'
oldest_article = 13
max_articles_per_feed = 100
remove_javascript = True
ignore_duplicate_articles = {'url'}
__author__ = 'Martyn Pritchard & Sophist-UK'
__copyright__ = '© 2020-2026, Martyn Pritchard <MPritchard2k9@gmail.com> & Sophist-UK <sophist-uk@sodalis.co.uk>'

# --- Pages fetched by the recipe ------------------------------------------
base_url = 'https://www.private-eye.co.uk/'
current_issue = 'https://www.private-eye.co.uk/news'
about_page = 'https://www.private-eye.co.uk/about'
number_crunching = 'https://www.private-eye.co.uk/number-crunching'
masthead_url = 'https://www.private-eye.co.uk/grfx/logos/logo-new.png'
classified_ads = 'https://www.eyeads.co.uk'
# Label text for the next-issue date; not read by any method shown here.
next_issue_date = 'Next issue on sale:'

# Defaults; get_cover_url() fills in 'series_index' and parse_index() adds
# 'pubdate' and a dated 'title' / 'title_sort' when they can be determined.
conversion_options = {
    'authors': author,
    'author_sort': author,
    'series': series,
    'series_index': 0,
    'title': title,
    'title_sort': title,
}

# Attribute filters for the <div> menus whose links make up the page index.
index_attrs_to_include = [
    {'id': 'sub-nav-box'},
    {'class': 'sub-nav-bar'},
]

# Menu link texts that must not become articles.
titles_to_skip = [
    'Home',
    'more',
    'In This Issue',
]

# Last URL path segment -> article title used in the e-book
# (see add_article(), which performs the lookup).
url_to_section_name = {
    'news': 'News',
    'street-of-shame': 'Street of Shame',
    'hp-sauce': 'HP Sauce',
    'cartoons': 'Strips and Cartoons',
    'pms-whatsapp': "PM's Whatsapp",
    'mediaballs': 'Commentatorballs',
    'lookalikes': 'Lookalike',
    'crossword': 'Eye Crossword',
    'in-the-back': 'In the Back',
    'media-news': 'Media News',
    'columnists': 'Columnists',
    'rotten-boroughs': 'Rotten Boroughs',
    'number-crunching': 'Number Crunching',
    'www.eyeads.co.uk': 'Private Eye Classified Ads',
}

# Matches a string that starts with a digit, up to its last digit
# (e.g. '21st January 2026'); used with .match() in get_next_issue_date().
re_extract_date = re.compile(r'\d.*\d')

# Accumulators filled by add_article(): the article dicts and the URLs
# already queued.
articles = []
urls = []
def add_article(self, title, url):
    """Queue one page for download unless its URL is already queued.

    The last path segment of ``url`` is looked up in
    ``url_to_section_name``; when it is a known key the mapped section name
    replaces ``title``. A page that ends up with an empty title is dropped
    without being recorded.
    """
    if url in self.urls:
        return

    slug = url.rsplit('/', 1)[-1]
    section_names = self.url_to_section_name
    heading = section_names[slug] if slug and slug in section_names else title
    if not heading:
        return

    entry = {'title': heading, 'url': url}
    self.articles.append(entry)
    self.urls.append(url)
    self.log('Adding:', heading, '(', url, ')')
def get_cover_url(self):
    """Return the ``src`` of the current issue's cover image, or ``None``.

    Scans the ``current_issue`` page for ``<img class="issue-cover">`` tags
    whose ``src`` ends in ``_big.jpg``. The file name in front of that
    suffix is expected to be the issue number; when it parses as an integer
    it is stored as ``conversion_options['series_index']`` and the image
    ``src`` is returned. Images with a missing ``src`` or a non-numeric
    file name are skipped.
    """
    suffix = '_big.jpg'
    soup = self.index_to_soup(self.current_issue)
    for img in soup.findAll('img', {'class': 'issue-cover'}):
        # .get() tolerates an <img> without a src attribute.
        src = img.get('src') or ''
        if not src.endswith(suffix):
            continue
        # [-1] also works when src contains no '/' at all.
        file_name = src.rsplit('/', 1)[-1]
        try:
            issue_number = int(file_name[:-len(suffix)])
        except (TypeError, ValueError):
            # wrong big image
            continue
        self.conversion_options.update({'series_index': issue_number})
        self.log('series-index:', self.conversion_options['series_index'])
        return src
    return None
def get_next_issue_date(self, soup):
    """Return the first date-like text in the sections sidebar, or ``None``.

    Each stripped string inside ``<div id="sections-sidebar">`` is tested
    with ``re_extract_date.match``; the matched text of the first hit is
    returned. ``None`` is returned when the sidebar is absent or no string
    matches.
    """
    sidebar = soup.find('div', id='sections-sidebar')
    if sidebar is None:
        return None

    attempts = (self.re_extract_date.match(text) for text in sidebar.stripped_strings)
    first_hit = next((hit for hit in attempts if hit), None)
    return first_hit.group(0) if first_hit else None
def parse_index(self):
    """Build the list of pages that make up the current issue.

    Steps:
      1. Log the recipe's identifying details and a support contact.
      2. Derive the publication date (next on-sale date minus 13 days) and
         store ``pubdate`` plus a dated ``title`` / ``title_sort`` in
         ``conversion_options``. A missing or unparseable date is logged
         and otherwise ignored.
      3. Queue the News page, every link found in the menus listed in
         ``index_attrs_to_include`` (minus ``titles_to_skip``), and finally
         the Number Crunching, About and classified-ads pages.

    Returns:
        ``[('Private Eye', articles)]`` where ``articles`` is the list of
        ``{'title': ..., 'url': ...}`` dicts built by ``add_article``.

    Raises:
        ValueError: if none of the menus yielded a usable link.
    """
    # Work on fresh per-instance lists so the class-level defaults are not
    # mutated and repeated runs do not accumulate stale entries.
    self.articles = []
    self.urls = []

    self.log('')
    self.log('Title:', self.title)
    self.log('Description:', self.description)
    self.log('Authors:', self.__author__)
    self.log('Copyright:', self.__copyright__)
    self.log('')
    self.log('Support:', u'If this recipe stops working email sophist-uk@sodalis.co.uk to let them know.')
    self.log('')

    soup = self.index_to_soup(self.current_issue)

    # Bound up front so the except branch can always report it.
    next_issue_date = None
    try:
        # Get publication date - Next issue on sale date - 13 days
        next_issue_date = self.get_next_issue_date(soup)
        self.log('next_issue_date:', next_issue_date)
        # split() with no argument tolerates repeated whitespace.
        day, month, year = next_issue_date.split()
        # remove day suffixes e.g. 2nd
        day = ''.join(c for c in day if c.isdigit())
        pub_date = datetime.strptime(' '.join((day, month, year)), '%d %B %Y') - timedelta(days=13)
        self.log('pub-date:', pub_date)
        title = self.title + ' ' + pub_date.strftime('%Y-%m-%d')
        self.conversion_options.update({
            'pubdate': pub_date.strftime('%d %B %Y').lstrip('0'),
            'title': title,
            'title_sort': title,
        })
    except (AttributeError, TypeError, ValueError):
        # Bad date. AttributeError covers get_next_issue_date() returning None.
        self.log('Cannot parse next issue date from:', next_issue_date)

    # Get pages from the various contents panels.
    # Duplicates will be eliminated automatically.
    # Since we are now using the first online page to generate the list of pages to include
    # we need to include this one:
    self.add_article('News', self.current_issue)

    # Exactly one '/' between the site root and a relative href, whether or
    # not the href itself starts with '/'.
    site_root = self.base_url.rstrip('/') + '/'
    # The News page is always queued above, so emptiness of self.articles
    # cannot tell us whether the menus were found; track that explicitly.
    menu_links_seen = False
    for section_attrs in self.index_attrs_to_include:
        section = soup.find('div', attrs=section_attrs)
        if not section:
            continue
        for a in section.findAll('a', href=True):
            title = a.getText().rstrip(' »\n')
            if title in self.titles_to_skip:
                continue
            menu_links_seen = True
            url = a.get('href')
            if not url.startswith('http'):
                url = site_root + url.lstrip('/')
            self.add_article(title, url)

    if not menu_links_seen:
        raise ValueError('Private-Eye Online index of pages not found')

    # Add the pages with no menu
    self.add_article('Number crunching', self.number_crunching)
    # Add the About page as a final article
    self.add_article('About Private Eye', self.about_page)
    # Add classified adverts - as a fun historical record
    self.add_article('Classified Ads', self.classified_ads)

    self.log('parse_index:', self.articles)
    return [('Private Eye', self.articles)]
def preprocess_html(self, soup):
    """Tidy the crossword markup in ``soup`` and return the same object.

    * Every ``<a>`` whose ``href`` contains ``/pictures/crossword/`` is
      unwrapped, i.e. the link to the crossword image goes away.
    * Every ``<img>`` whose ``src`` contains ``/pictures/crossword/`` loses
      its ``align`` attribute (so float right works).
    """
    crossword_path = re.compile(r'/pictures/crossword/')

    # Remove <a> tag link to crossword image
    crossword_links = soup.findAll('a', {'href': crossword_path})
    for link in crossword_links:
        self.log('Removing link to crossword image...')
        link.unwrap()

    # Remove align tag in crossword image
    crossword_images = soup.findAll('img', {'src': crossword_path})
    for image in (img for img in crossword_images if 'align' in img.attrs):
        self.log('Removing crossword image align attribute...')
        image.attrs.pop('align')

    return soup
# We remove vast swathes of HTML which is not part of the articles.

# Remove sibling content: candidate containers that hold the article body.
remove_tags_before = [
    {'name': 'div', 'class': 'article'},
    {'name': 'div', 'id': 'page'},
    {'name': 'div', 'id': 'page-wide'},
    {'name': 'div', 'id': 'content'},
]

# Same containers as above, plus elements after which nothing is wanted.
# Built as a new list so remove_tags_before itself is left untouched.
remove_tags_after = remove_tags_before + [
    {'name': 'div', 'id': 'about-covers'},
    {'name': 'script', 'id': 'about-covers'},
    {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}},
    # {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/logos/')}},
]

# Remove non-sibling content.
# Each spec uses exactly the keys 'name' and (optionally) 'attrs'; a key with
# stray whitespace such as ' attrs' would not be read as the attribute
# filter, so the shop-link entries below must be spelled precisely.
remove_tags = [
    # Top
    {'name': 'div', 'attrs': {'id': 'header-wide'}},
    {'name': 'div', 'attrs': {'id': 'top-bar-sticky'}},
    {'name': 'div', 'attrs': {'id': 'nav-box-mobile'}},
    {'name': 'div', 'attrs': {'id': 'nav-box-sections-mobile'}},
    {'name': 'div', 'attrs': {'id': 'sub-nav-box'}},
    {'name': 'div', 'attrs': {'class': 'sub-nav-bar'}},
    # Content
    {'name': 'div', 'attrs': {'id': 'why-subscribe'}},
    {'name': 'div', 'attrs': {'id': 'issue-box'}},
    {'name': 'div', 'attrs': {'id': 'sections-sidebar'}},
    # Spacers
    {'name': 'div', 'attrs': {'class': 'gap'}},
    # {'name': 'div', 'attrs': {'class': 'gap-small'}},
    {'name': 'div', 'attrs': {'class': 'gap-bigger'}},
    {'name': 'div', 'attrs': {'class': 'gap-biggest'}},
    # Bottom
    {'name': 'div', 'attrs': {'id': 'home-content'}},
    {'name': 'div', 'attrs': {'id': 'home-shop-content-wide'}},
    {'name': 'div', 'attrs': {'id': 'page-94-strip-wide'}},
    {'name': 'div', 'attrs': {'id': 'footer-wide'}},
    {'name': 'div', 'attrs': {'id': 'about-covers'}},
    # {'name': 'div', 'attrs': {'id': 'follow-buttons'}},
    # {'name': 'div', 'attrs': {'id': 'sidebar'}},
    # Links to the shop
    {'name': 'a', 'attrs': {'href': 'https://www.private-eye.co.uk/shop'}},
    {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk'}},
    {'name': 'a', 'attrs': {'href': 'https://shop.private-eye.co.uk/'}},
    # Stylesheets and decorative images
    {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/lightbox/')}},
    {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/news_ticker/')}},
    {'name': 'link', 'attrs': {'href': re.compile(r'/javastyle/media-queries-')}},
    {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/logos/gnitty.gif')}},
    {'name': 'img', 'attrs': {'src': re.compile(r'/grfx/stuff/subscribe_here_gnitty.gif')}},
    {'name': 'iframe'},
    {'name': 'script'},
    {'name': 'style'},
    # Classified site
    {'name': 'a', 'attrs': {'class': 'skip-link'}},
    {'name': 'header', 'attrs': {'id': 'masthead'}},
    {'name': 'div', 'attrs': {'class': 'placeAdIntro'}},
    {'name': 'img', 'attrs': {'class': 'cat-image'}},
    {'name': 'img', 'attrs': {'class': 'catArrow'}},
    {'name': 'div', 'attrs': {'class': 'ad-contact'}},
    {'name': 'div', 'attrs': {'id': re.compile(r'cookie-law-')}},
    {'name': 'div', 'attrs': {'class': re.compile(r'cli-modal')}},
    {'name': 'footer', 'attrs': {'class': 'site-footer'}},
]
# Text substitutions, each a (compiled pattern, replacement callable) pair.
preprocess_regexps = [
    # Headline span -> <h1>. An optional strapline right after it (a "text"
    # span or a <font> element, kept together with its own tags) -> <h4>.
    (
        re.compile(
            r'<span class="headline(?:-new)?">(.*?)</span>\s*(?:<br/?>\s*)*((?:<span class="text">(.*?)</span>)|(?:<font .*?>(.*?)</font>))?',
            re.IGNORECASE | re.DOTALL
        ),
        lambda m: ''.join((
            '<h1 class="myheadline">', m[1], '</h1>',
            '<h4 class="mystrapline">{}</h4>'.format(m[2]) if m[2] else '',
        )),
    ),
    # Links whose href starts with /issue- are broken: keep only their text.
    (
        re.compile(r'<a href="/issue-.*?".*?>(.*?)</a>', re.IGNORECASE | re.DOTALL),
        lambda m: m.group(1),
    ),
]
# The following extra css is to tweak the formatting of various elements of various article pages.
# Unfortunately, there are a variety of different pages styles, hence the extended tweak list.
# Some of these mimic the actual layout.css which does not seem to make it across into the calibre
# ebook without duplicating it as extra css.
# However some is new css to tweak output when part of an ebook.
#
# NOTE(review): `minimum-width`, `maximum-width` and `width: none` below are
# not valid CSS and are presumably dropped by renderers. They are left
# untouched on purpose: turning them into `min-width` / `max-width` /
# `width: auto` would start constraining images that more specific rules
# (icons, emoji, logos) currently size by `width` alone. Confirm the intended
# layout before converting them.
extra_css = ' \n '.join([
    '.article {font-family: Merriweather, Georgia, serif; font-size: 1em}',
    '.myheadline {font-family: "Source Sans Pro", Arial, Helvetica, sans-serif; font-size: 2em;}',
    '.mystrapline {font-family: "Source Sans Pro", Arial, Helvetica, sans-serif; font-size: 1.2em;}',
    '#content img {float: right; width: 45%; minimum-width:350px;}',
    '#content img.office {float: right; width: 45%; maximum-width:390px; margin-left: 15px;}',
    '#content img.cartoon-left {float: left; margin-right: 15px; margin-bottom: 15px;}',
    '#content img.cartoon-right {float: none; margin-bottom: 15px;}',
    '#content img.strip {float: none; width: 100%;}',
    '#content img:first-child {float: none;}',
    '#content img.gnitty-right {float: none; width: 160px;}',
    '#content #story > div[align=right] > img:first-child {float: none; width: 15px;}',
    '#content #story > img:first-child {float: none; height: 100px; width: none; minimum-width: none;}',
    '#content #block-sections div.divider {height: 1px; background-color: #eee; margin: 10px 0px; clear: both;}',
    '#content #block-sections img {float: none; width: none;}',
    '#content #block-sections img.lookalike {float: none; width: 100%;}',
    '#content #block-sections img.photo-right {float: right; width: 25%; min-width:120px; margin-left: 15px;}',
    '#content #block-sections > p:last-child > img:first-child {float: none; width: 120px;}',
    '#content #block-sections > p:last-child > img:nth-child(2) {float: none; width: 120px;}',
    '#content #block-sections img.crossword {float: right; width: 40%; margin-left: 15px; min-width: 350px;}',
    '#content #article-caption-box {float: right; background: #222222; display: block; width: 40%; min-width: 250px; font-size: 90%; margin-left: 15px;}',
    '#content #article-caption-box img {float: none; width: 100%; max-width: none;}',
    '#caption-box {color: #ffffff; text-align: center; padding: 5px 20px 15px 20px;}',
    '#whatsapp {border-left: 5px #f45e61 solid; border-right: 5px #f45e61 solid; border-bottom: 5px #f45e61 solid; padding: 10px 20px 20px 10px;}',
    '#whatsapp::after {clear:both;}',
    '#whatsapp .whatsapp-left, .whatsapp-right {margin: 20px 0; padding: 15px; border-radius: 10px;}',
    # The generic family keyword must be unquoted; in quotes it would name a font called "sans-serif".
    '#whatsapp .whatsapp-left, .whatsapp-right {font-family: Helvetica, Arial, sans-serif; font-weight: 300; font-size: 18px; line-height: 24px;}',
    '#whatsapp .whatsapp-left {margin-right: 30%; background-color: #eeeeee;}',
    '#whatsapp .whatsapp-right {margin-left: 30%; background-color: #dce5ae;}',
    '#whatsapp .whatsapp-left img, #whatsapp .whatsapp-right img {width: 35px; margin: 0 10px; vertical-align: middle;}',
    '#whatsapp .whatsapp-left img.emoji, #whatsapp .whatsapp-right img.emoji {max-width: 35px; margin: 0 5px; vertical-align: middle;}',
    # Hex colours need the leading '#'; this keeps the heading text white on its dark background.
    '.container .row .catTitle {color: #fff; background-color: #12174a;}',
])