Files
calibre/recipes/time_magazine.recipe

362 lines
13 KiB
Python

import re
import time as time_mod
from datetime import datetime
from calibre.utils.date import utcnow
from calibre.web.feeds.news import BasicNewsRecipe
class TimeMagazineUSA(BasicNewsRecipe):
    """Calibre recipe for a single US edition of TIME Magazine.

    Only class-level configuration lives here: recipe metadata, the two
    user-tunable options ('edition' and 'res'), the tag filters applied to
    each article page, and the extra stylesheet.
    """

    # --- Recipe metadata ---------------------------------------------------
    title = 'TIME Magazine'
    __author__ = 'Monkfishare'
    description = 'Weekly US magazine.'
    language = 'en'
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b3/Time_Magazine_logo.svg/960px-Time_Magazine_logo.svg.png'
    # Empty here; parse_index() overwrites it with the issue date.
    timefmt = ''

    # --- Download behaviour ------------------------------------------------
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True

    # --- Options the user can set per download -----------------------------
    recipe_specific_options = {
        'edition': {
            'short': 'Edition URL',
            'long': (
                'For example, https://time.com/magazine/us/7362702/february-9th-2026-vol-207-no-3-u-s/\n'
                'Browse past editions at https://time.com/vault/year/2026 (change the year as needed).\n'
                'Leave the URL field blank to automatically download the latest US edition.'
            ),
            'default': '',
        },
        'res': {
            'short': 'Image resolution (width in pixels)',
            'long': (
                'Recommended for e-ink devices: 1200, 1600, or 2000 (default)\n'
                'Use 400 or 800 for smaller file sizes or non e-ink devices.'
            ),
            'default': '2000',
        },
    }

    # --- Content filters ---------------------------------------------------
    # Keep <article> elements plus any <section> whose aria-labelledby is
    # either 'intro-section' or a plain lowercase identifier.
    keep_only_tags = [
        {
            'name': 'section',
            'attrs': {
                'aria-labelledby': lambda label: label and (
                    label == 'intro-section' or re.match(r'^[a-z][a-z0-9_]*$', label)
                ),
            },
        },
        {'name': 'article'},
    ]

    # Drop ads, screen-reader-only text, buttons, icons and a few utility
    # containers recognised purely by their CSS class names.
    remove_tags = [
        {'attrs': {'class': lambda cls: cls and any(marker in cls for marker in (
            'bg-warm-grey-ads', 'native-ad', 'advertisement', 'sr-only',
            'google-source', 'print:hidden', 'rightrail',
        ))}},
        {'attrs': {'data-testid': 'authors-list'}},
        {'name': 'button'},
        {'name': 'svg'},
        {'attrs': {'aria-hidden': 'true'}},
        {'attrs': {'role': 'tooltip'}},
        {'attrs': {'class': lambda cls: cls and 'items-center' in cls and 'flex-row' in cls}},
        {'attrs': {'class': lambda cls: cls and 'relative' in cls and 'inline-flex' in cls and 'w-max' in cls}},
    ]

    # --- Styling -----------------------------------------------------------
    extra_css = '''
h1 { font-size: 1.6em; }
h2, h3 { font-size: 1.2em; }
time { color: #666; font-size: 0.85em; }
img { display: block; max-width: 100%; height: auto;
position: static !important; float: none !important; }
figure { display: block; position: static !important;
margin: 1em 0; overflow: visible; }
'''
def get_browser(self):
    """Return the base recipe browser with extra request headers appended.

    The Accept, Accept-Language, Sec-Fetch-* and Upgrade-Insecure-Requests
    headers are added on top of whatever BasicNewsRecipe.get_browser() set.
    """
    browser = BasicNewsRecipe.get_browser(self)
    navigation_headers = [
        ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
        ('Accept-Language', 'en-US,en;q=0.9'),
        ('Sec-Fetch-Mode', 'navigate'),
        ('Sec-Fetch-Dest', 'document'),
        ('Sec-Fetch-Site', 'none'),
        ('Upgrade-Insecure-Requests', '1'),
    ]
    browser.addheaders += navigation_headers
    return browser
def _img_width(self):
w = self.recipe_specific_options.get('res')
if w and isinstance(w, str) and w.isdigit():
return w
return '2000'
def _set_img_width(self, src):
src = re.sub(r'[?&]width=\d+', '', src).rstrip('?&')
sep = '&' if '?' in src else '?'
return src + sep + 'width=' + self._img_width()
def _sanitize_nested_anchors(self, raw_html):
return re.sub(
r'<a\b([^>]*\bhref="/section/[^"]*"[^>]*)>(.*?)</a>',
r'<span\1>\2</span>',
raw_html,
flags=re.DOTALL,
)
def _find_latest_us_edition(self, soup):
    """Pick the most recent US edition listed on an archive page.

    Every ``<article>`` whose first link contains '/magazine/us/' and which
    carries a ``<time datetime=...>`` tag is a candidate; the one with the
    latest timestamp wins.

    Returns:
        ``(href, link_text)`` of the newest edition, or ``(None, None)``
        when the page lists no usable US edition.
    """
    from datetime import timezone

    candidates = []
    for article in soup.find_all('article'):
        link = article.find('a', href=True)
        if not link or '/magazine/us/' not in link['href']:
            continue
        time_tag = article.find('time', attrs={'datetime': True})
        if not time_tag:
            continue

        stamp = time_tag['datetime'].strip()
        # datetime.fromisoformat() only understands a trailing 'Z' from
        # Python 3.11 on; spell it as an explicit UTC offset instead.
        if stamp[-1:] in ('Z', 'z'):
            stamp = stamp[:-1] + '+00:00'
        try:
            published = datetime.fromisoformat(stamp)
        except ValueError:
            # Best effort: one malformed entry must not abort the search,
            # but it should not vanish silently either.
            self.log('Skipping edition with unparseable date:', stamp)
            continue

        # Naive and aware datetimes cannot be compared with each other, so
        # treat timestamps without an offset as UTC.
        if published.tzinfo is None:
            published = published.replace(tzinfo=timezone.utc)
        candidates.append((published, link['href'], link.get_text(strip=True)))

    if not candidates:
        return None, None
    # max() returns the first of several equally recent entries, matching a
    # stable descending sort.
    _, href, label = max(candidates, key=lambda candidate: candidate[0])
    return href, label
def _parse_toc_from_soup(self, soup):
    """Read the table of contents from the parsed edition page.

    Looks for a ``div`` whose class contains 'columns-1', then for each
    'break-inside-avoid' block inside it takes the 'text-4xl' ``h3`` as the
    section name and every 'font-editorial' ``h3`` wrapped in a link as an
    article.

    Returns:
        A list of ``(section_name, articles)`` tuples, where each article is
        a dict with 'title', 'url' and 'description'; ``None`` when the
        container is missing or no section yields any article.
    """
    def with_class(fragment):
        # Class filter: truthy when the class value contains *fragment*.
        return lambda value: value and fragment in value

    container = soup.find('div', class_=with_class('columns-1'))
    if not container:
        return None

    toc = []
    for block in container.find_all('div', class_=with_class('break-inside-avoid')):
        heading = block.find('h3', class_=with_class('text-4xl'))
        name = heading.get_text(strip=True) if heading else None
        if not name:
            continue

        entries = []
        for headline in block.find_all('h3', class_=with_class('font-editorial')):
            title = headline.get_text(strip=True)
            link = headline.find_parent('a', href=True)
            if not link:
                continue
            href = link['href']
            # Only site-relative article-style paths are accepted.
            if not re.match(r'^(/\d{7,}/|/article/|/collection/)', href):
                continue
            if not title:
                continue
            blurb = link.find('p', class_=with_class('text-t4'))
            self.log(' ', title)
            entries.append({
                'title': title,
                'url': 'https://time.com' + href,
                'description': blurb.get_text(strip=True) if blurb else '',
            })

        if entries:
            self.log(name)
            toc.append((name, entries))

    return toc or None
def _slug_to_title(self, path):
parts = path.strip('/').split('/')
slug = parts[1] if len(parts) >= 2 else parts[0]
return slug.replace('-', ' ').title()
def _parse_toc_from_raw(self, raw_html):
text = raw_html.replace('\\/', '/')
text = re.sub(r'\\u002[Ff]', '/', text)
combined = re.compile(
r'(?:'
r'/\d{7,}/[a-z0-9][a-z0-9\-]*/'
r'|/article/\d{4}/\d{2}/\d{2}/[a-z0-9][a-z0-9\-]*/'
r'|/collection/[a-z0-9\-]+/\d{4}/[a-z0-9\-]*/'
r')'
)
seen = set()
paths = []
for m in combined.finditer(text):
path = m.group(0)
if re.search(r'/magazine/|/section/|/vault/', path):
continue
if path not in seen:
seen.add(path)
paths.append(path)
if not paths:
return None
articles = []
for path in paths:
url = 'https://time.com' + path
try:
art_soup = self.index_to_soup(url)
h1 = art_soup.find('h1')
title = h1.get_text(strip=True) if h1 else self._slug_to_title(path)
except Exception:
title = self._slug_to_title(path)
self.log('\t', title)
articles.append({'title': title, 'url': url})
time_mod.sleep(0.5)
return [('Articles', articles)]
def _fetch_raw(self, url):
    """Fetch *url* with ``self.browser`` and return the body as text.

    The bytes are decoded as UTF-8; undecodable sequences are replaced
    rather than raising.
    """
    payload = self.browser.open(url).read()
    return payload.decode('utf-8', errors='replace')
def _title_from_path(self, path):
slug = path.strip('/').split('/')[-1]
m = re.match(r'([a-z]+)-(\d+)[a-z]*-(\d{4})', slug)
if m:
month, day, year = m.group(1).capitalize(), m.group(2), m.group(3)
return '%s %s, %s' % (month, day, year)
return slug.replace('-', ' ').title()
def parse_index(self):
    """Locate the edition to download and return its table of contents.

    The edition is either the one given in the 'edition' option or the
    newest US edition listed in the vault for the current year (falling
    back to the previous year). The method also sets ``self.timefmt`` to
    the issue date and ``self.cover_url`` to the first figure image.

    Returns:
        A list of ``(section_name, articles)`` tuples.

    Raises:
        ValueError: when no edition can be found, or when neither the HTML
            structure nor the raw payload yields any article.
    """
    from urllib.parse import urljoin

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    site = 'https://time.com/'

    opts = self.recipe_specific_options
    custom_path = opts.get('edition')
    # Anything other than a string is treated like a blank field.
    custom_path = custom_path.strip() if isinstance(custom_path, str) else ''

    if custom_path:
        # urljoin() leaves absolute URLs alone and also copes with paths
        # entered without a leading slash.
        edition_url = urljoin(site, custom_path)
        date_str = self._title_from_path(custom_path)
        self.log('Using custom edition:', edition_url)
    else:
        year = utcnow().year
        edition_url = edition_title = None
        # Early in a year the current vault page may not list a US edition
        # yet, so the previous year is tried as well.
        for archive_year in (year, year - 1):
            archive_soup = self.index_to_soup('https://time.com/vault/year/%d/' % archive_year)
            edition_url, edition_title = self._find_latest_us_edition(archive_soup)
            if edition_url:
                break
        if not edition_url:
            raise ValueError('Could not find latest US edition')
        # The archive link may be site-relative; the browser needs a full URL.
        edition_url = urljoin(site, edition_url)
        self.log('Found edition:', edition_url)
        date_str = edition_title.split('|')[0].strip() if edition_title else 'USA'

    self.title = 'TIME Magazine'
    self.timefmt = ' [' + date_str + ']'

    raw_html = self._fetch_raw(edition_url)
    raw_html = self._sanitize_nested_anchors(raw_html)
    soup = BeautifulSoup(raw_html, 'html.parser')

    # Prefer the issue name printed on the page over the guessed date.
    for span in soup.find_all('span', class_=lambda c: c and 'text-t3' in c):
        if span.get_text(strip=True).lower() != 'issue name':
            continue
        p = span.find_next_sibling('p')
        if not p:
            continue
        full_issue_name = p.get_text(strip=True)
        date_str = full_issue_name.split('|')[0].strip()
        if date_str:
            self.timefmt = ' [' + date_str + ']'
        self.log('Issue name:', full_issue_name)
        break

    # The first <figure> image on the page is used as the cover.
    figure = soup.find('figure')
    img = figure.find('img') if figure else None
    src = img.get('src', '') if img else ''
    if src:  # an <img> without src would only give a bogus cover URL
        self.cover_url = self._set_img_width(src)
        self.log('Cover:', self.cover_url)

    sections = self._parse_toc_from_soup(soup)
    if sections:
        self.log('Parsed TOC from HTML structure (%d sections)' % len(sections))
        return sections

    self.log('TOC not in HTML structure, trying raw payload extraction')
    sections = self._parse_toc_from_raw(raw_html)
    if sections:
        self.log('Extracted %d article URLs from raw payload' % len(sections[0][1]))
        return sections

    raise ValueError(
        'Could not parse edition TOC from either HTML or raw payload. '
        'Response length: %d bytes' % len(raw_html)
    )
def preprocess_html(self, soup):
    """Simplify the article markup and return the same *soup* object.

    Steps, in order: replace ``<picture>`` with its ``<img>``; remove
    'magazineshop' links (with their p/div/span parent); rebuild every
    ``<figure>`` as a bare image plus optional caption; strip style/class
    from all images and request the configured width; unwrap layout-only
    ``div`` containers; collapse lists of section/tag links into one
    paragraph; remove 'Advertisement' placeholders.
    """
    def destroyed(tag):
        # Removing a parent with decompose() also destroys tags that are
        # still waiting in an already collected result list; such tags must
        # not be touched again.
        return getattr(tag, 'decomposed', False)

    def apply_width(img):
        # Images without a src are left alone: appending a query string
        # to nothing would only create a bogus URL.
        src = img.get('src')
        if src:
            img['src'] = self._set_img_width(src)

    for picture in soup.find_all('picture'):
        img = picture.find('img')
        if img:
            for attr in ('srcset', 'sizes', 'class', 'style', 'loading', 'decoding'):
                img.attrs.pop(attr, None)
            apply_width(img)
            picture.replace_with(img)
        else:
            picture.decompose()

    for a in soup.find_all('a', href=True):
        if destroyed(a) or 'magazineshop' not in a['href']:
            continue
        parent = a.find_parent()
        if parent and parent.name in ('p', 'div', 'span'):
            parent.decompose()
        else:
            a.decompose()

    for figure in soup.find_all('figure'):
        if destroyed(figure):
            continue
        figcaption = figure.find('figcaption')
        caption_text = figcaption.get_text(strip=True) if figcaption else ''
        img = figure.find('img')
        if not img:
            figure.decompose()
            continue
        img_tag = soup.new_tag('img')
        src = img.get('src')
        if src:
            img_tag['src'] = self._set_img_width(src)
        img_tag['alt'] = caption_text
        new_fig = soup.new_tag('figure')
        new_fig.append(img_tag)
        if caption_text:
            new_cap = soup.new_tag(
                'figcaption',
                style='font-size:0.8em; color:#555; font-style:italic; text-align:center; margin-top:0.3em;'
            )
            new_cap.string = caption_text
            new_fig.append(new_cap)
        figure.replace_with(new_fig)

    for tag in soup.find_all('img'):
        for attr in ('style', 'class'):
            if tag.has_attr(attr):
                del tag[attr]
        apply_width(tag)

    # Layout wrappers carry no content of their own; keep only the children.
    for div in soup.find_all(
        'div', class_=lambda c: c and 'justify-items-center' in c
    ):
        div.unwrap()
    for div in soup.find_all(
        'div', class_=lambda c: c and 'container' in c and 'mx-auto' in c
    ):
        div.unwrap()
    for div in soup.find_all('div', class_=lambda c: c and 'grid' in c):
        div.unwrap()

    # A list containing section/tag links is flattened to "a | b | c".
    for ul in soup.find_all('ul'):
        tag_links = [a for a in ul.find_all('a', href=True)
                     if '/section/' in a['href'] or '/tag/' in a['href']]
        if tag_links:
            p = soup.new_tag('p')
            for i, a in enumerate(tag_links):
                if i > 0:
                    p.append(' | ')
                p.append(a)
            ul.replace_with(p)

    for p in soup.find_all('p'):
        if destroyed(p) or p.get_text(strip=True) != 'Advertisement':
            continue
        parent = p.find_parent(attrs={'data-ad-wrapper': True}) or p.find_parent(
            class_=lambda c: c and 'bg-warm-grey-ads' in c
        )
        if parent:
            parent.decompose()
        else:
            p.decompose()

    return soup