# calibre/recipes/time_magazine.recipe

import re
import time as time_mod
from datetime import datetime

from calibre.utils.date import utcnow
from calibre.web.feeds.news import BasicNewsRecipe


class TimeMagazineUSA(BasicNewsRecipe):
    """Recipe that downloads one US edition of TIME Magazine.

    ``parse_index`` uses the edition given in the ``edition`` recipe option
    or, when that option is blank, the newest US edition found on the
    time.com vault page for the current (or previous) year.
    """

    title = 'TIME Magazine'
    __author__ = 'Monkfishare'
    description = 'Weekly US magazine.'
    language = 'en'
    # Left empty here; parse_index() sets it to ' [<issue date>]'.
    timefmt = ''
    no_stylesheets = True
    remove_javascript = True
    remove_empty_feeds = True
    masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/b/b3/Time_Magazine_logo.svg/960px-Time_Magazine_logo.svg.png'

    # Option definitions. The methods of this class read the chosen values
    # with ``self.recipe_specific_options.get('edition')`` / ``.get('res')``.
    recipe_specific_options = {
        'edition': {
            'short': 'Edition URL',
            'long': (
                'For example, https://time.com/magazine/us/7362702/february-9th-2026-vol-207-no-3-u-s/\n'
                'Browse past editions at https://time.com/vault/year/2026 (change the year as needed).\n'
                'Leave the URL field blank to automatically download the latest US edition.'
            ),
            'default': '',
        },
        'res': {
            'short': 'Image resolution (width in pixels)',
            'long': (
                'Recommended for e-ink devices: 1200, 1600, or 2000 (default)\n'
                'Use 400 or 800 for smaller file sizes or non e-ink devices.'
            ),
            'default': '2000',
        },
    }

    # Selectors: every <article>, plus each <section> whose aria-labelledby is
    # either 'intro-section' or a lowercase identifier (letters, digits and
    # underscores, starting with a letter).
    keep_only_tags = [
        dict(name='section', attrs={'aria-labelledby': lambda v: v and (
            v == 'intro-section' or re.match(r'^[a-z][a-z0-9_]*$', v)
        )}),
        dict(name='article'),
    ]

    # Selectors for page chrome: ad/utility classes, the authors list,
    # buttons, inline SVGs, aria-hidden nodes, tooltips and two layout-class
    # combinations (substring checks on the class value).
    remove_tags = [
        dict(attrs={'class': lambda c: c and any(x in c for x in [
            'bg-warm-grey-ads', 'native-ad', 'advertisement', 'sr-only',
            'google-source', 'print:hidden', 'rightrail',
        ])}),
        dict(attrs={'data-testid': 'authors-list'}),
        dict(name='button'),
        dict(name='svg'),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'role': 'tooltip'}),
        dict(attrs={'class': lambda c: c and 'items-center' in c and 'flex-row' in c}),
        dict(attrs={'class': lambda c: c and 'relative' in c and 'inline-flex' in c and 'w-max' in c}),
    ]

    # Forces images and figures into normal block flow (static position, no
    # float) and sets heading and <time> sizes.
    extra_css = '''
        h1 { font-size: 1.6em; }
        h2, h3 { font-size: 1.2em; }
        time { color: #666; font-size: 0.85em; }
        img { display: block; max-width: 100%; height: auto;
              position: static !important; float: none !important; }
        figure { display: block; position: static !important;
                 margin: 1em 0; overflow: visible; }
    '''

    def get_browser(self):
        """Return the base recipe browser with extra request headers added.

        The headers appended are Accept, Accept-Language, three Sec-Fetch-*
        values and Upgrade-Insecure-Requests.
        """
        navigation_headers = [
            ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'en-US,en;q=0.9'),
            ('Sec-Fetch-Mode', 'navigate'),
            ('Sec-Fetch-Dest', 'document'),
            ('Sec-Fetch-Site', 'none'),
            ('Upgrade-Insecure-Requests', '1'),
        ]
        br = BasicNewsRecipe.get_browser(self)
        br.addheaders += navigation_headers
        return br

    def _img_width(self):
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str) and w.isdigit():
            return w
        return '2000'

    def _set_img_width(self, src):
        src = re.sub(r'[?&]width=\d+', '', src).rstrip('?&')
        sep = '&' if '?' in src else '?'
        return src + sep + 'width=' + self._img_width()

    def _sanitize_nested_anchors(self, raw_html):
        return re.sub(
            r'<a\b([^>]*\bhref="/section/[^"]*"[^>]*)>(.*?)</a>',
            r'<span\1>\2</span>',
            raw_html,
            flags=re.DOTALL,
        )

    def _find_latest_us_edition(self, soup):
        """Pick the most recent US edition listed in a vault page.

        Each ``<article>`` is considered when its first link points at
        ``/magazine/us/`` and it contains a ``<time datetime=...>`` whose
        value parses as an ISO 8601 date.

        Returns:
            ``(href, link_text)`` of the newest edition, or ``(None, None)``
            when no usable edition is found.
        """
        from datetime import timezone

        candidates = []
        for art in soup.find_all('article'):
            a = art.find('a', href=True)
            if not a or '/magazine/us/' not in a['href']:
                continue
            time_tag = art.find('time', attrs={'datetime': True})
            if not time_tag:
                continue
            stamp = time_tag['datetime'].strip()
            # datetime.fromisoformat() accepts a trailing 'Z' only from
            # Python 3.11 on; spell the UTC offset out for older versions.
            if stamp.endswith('Z'):
                stamp = stamp[:-1] + '+00:00'
            try:
                dt = datetime.fromisoformat(stamp)
            except ValueError:
                self.log('Skipping edition with unparseable date:', stamp)
                continue
            # Naive and aware datetimes cannot be compared with each other.
            # Naive values are treated as UTC here (an assumption about the
            # page's timestamps) so that every candidate is comparable.
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            candidates.append((dt, a['href'], a.get_text(strip=True)))
        if not candidates:
            return None, None
        # max() returns the first of several equally recent candidates, which
        # is also what a stable descending sort would put in front.
        _, href, link_text = max(candidates, key=lambda c: c[0])
        return href, link_text

    def _parse_toc_from_soup(self, soup):
        """Build the table of contents from the edition page's HTML structure.

        Looks for the first ``<div>`` whose class contains ``columns-1`` and,
        inside it, one ``<div>`` per section (class containing
        ``break-inside-avoid``). A section's name comes from its ``<h3>`` with
        ``text-4xl``; its articles are the ``<h3>`` elements with
        ``font-editorial`` that sit inside a link to an article, numeric-id
        or collection path.

        Returns:
            A list of ``(section_name, articles)`` tuples, where each article
            is a dict with ``title``, ``url`` and ``description``; or ``None``
            when the container or any non-empty section is missing.
        """
        toc_container = soup.find('div', class_=lambda c: c and 'columns-1' in c)
        if not toc_container:
            return None

        # Compiled once instead of once per candidate link.
        article_href = re.compile(r'^(/\d{7,}/|/article/|/collection/)')

        sections = []
        for section_div in toc_container.find_all(
            'div', class_=lambda c: c and 'break-inside-avoid' in c
        ):
            h3 = section_div.find('h3', class_=lambda c: c and 'text-4xl' in c)
            section_name = h3.get_text(strip=True) if h3 else None
            if not section_name:
                continue

            articles = []
            for title_h3 in section_div.find_all(
                'h3', class_=lambda c: c and 'font-editorial' in c
            ):
                title = title_h3.get_text(strip=True)
                a = title_h3.find_parent('a', href=True)
                if not a or not title:
                    continue
                href = a['href']
                if not article_href.match(href):
                    continue
                p = a.find('p', class_=lambda c: c and 'text-t4' in c)
                desc = p.get_text(strip=True) if p else ''
                articles.append({'title': title, 'url': 'https://time.com' + href, 'description': desc})

            if articles:
                # Log the heading first so the indented titles appear under
                # the section they belong to.
                self.log(section_name)
                for article in articles:
                    self.log('\t', article['title'])
                sections.append((section_name, articles))

        return sections or None

    def _slug_to_title(self, path):
        parts = path.strip('/').split('/')
        slug = parts[1] if len(parts) >= 2 else parts[0]
        return slug.replace('-', ' ').title()

    def _parse_toc_from_raw(self, raw_html):
        text = raw_html.replace('\\/', '/')
        text = re.sub(r'\\u002[Ff]', '/', text)

        combined = re.compile(
            r'(?:'
            r'/\d{7,}/[a-z0-9][a-z0-9\-]*/'
            r'|/article/\d{4}/\d{2}/\d{2}/[a-z0-9][a-z0-9\-]*/'
            r'|/collection/[a-z0-9\-]+/\d{4}/[a-z0-9\-]*/'
            r')'
        )

        seen = set()
        paths = []
        for m in combined.finditer(text):
            path = m.group(0)
            if re.search(r'/magazine/|/section/|/vault/', path):
                continue
            if path not in seen:
                seen.add(path)
                paths.append(path)

        if not paths:
            return None

        articles = []
        for path in paths:
            url = 'https://time.com' + path
            try:
                art_soup = self.index_to_soup(url)
                h1 = art_soup.find('h1')
                title = h1.get_text(strip=True) if h1 else self._slug_to_title(path)
            except Exception:
                title = self._slug_to_title(path)
            self.log('\t', title)
            articles.append({'title': title, 'url': url})
            time_mod.sleep(0.5)

        return [('Articles', articles)]

    def _fetch_raw(self, url):
        """Fetch *url* with the recipe browser and return the body as text.

        The body is decoded as UTF-8; undecodable bytes are replaced rather
        than raising.
        """
        resp = self.browser.open(url)
        try:
            return resp.read().decode('utf-8', errors='replace')
        finally:
            # Release the response even when read() or decode() fails.
            resp.close()

    def _title_from_path(self, path):
        slug = path.strip('/').split('/')[-1]
        m = re.match(r'([a-z]+)-(\d+)[a-z]*-(\d{4})', slug)
        if m:
            month, day, year = m.group(1).capitalize(), m.group(2), m.group(3)
            return '%s %s, %s' % (month, day, year)
        return slug.replace('-', ' ').title()

    def parse_index(self):
        """Locate the edition to download and return its table of contents.

        The edition comes from the ``edition`` recipe option when set,
        otherwise from the newest US edition on the time.com vault page of
        the current year, falling back to the previous year. ``self.timefmt``
        is set from the issue date and ``self.cover_url`` from the first
        ``<figure>`` image on the edition page.

        Returns:
            A list of ``(section_name, articles)`` tuples, taken from the
            page's HTML structure or, failing that, from article paths found
            in the raw payload.

        Raises:
            ValueError: if no edition can be found, or if neither strategy
                yields any article.
        """
        from urllib.parse import urljoin

        site_root = 'https://time.com/'
        edition_opt = self.recipe_specific_options.get('edition')
        # Same guard as _img_width(): only a string value is usable.
        custom_path = edition_opt.strip() if isinstance(edition_opt, str) else ''

        if custom_path:
            # urljoin() keeps absolute URLs as they are and resolves both
            # '/magazine/...' and 'magazine/...' against the site root.
            edition_url = urljoin(site_root, custom_path)
            date_str = self._title_from_path(custom_path)
            self.log('Using custom edition:', edition_url)
        else:
            year = utcnow().year
            edition_url = edition_title = None
            # Try the current year's vault page first, then the previous one.
            for vault_year in (year, year - 1):
                archive_soup = self.index_to_soup(
                    'https://time.com/vault/year/%d/' % vault_year
                )
                edition_url, edition_title = self._find_latest_us_edition(archive_soup)
                if edition_url:
                    break
            if not edition_url:
                raise ValueError('Could not find latest US edition')
            # The href may be site-relative; the browser needs an absolute URL.
            edition_url = urljoin(site_root, edition_url)
            self.log('Found edition:', edition_url)
            date_str = edition_title.split('|')[0].strip() if edition_title else 'USA'

        self.title = 'TIME Magazine'
        self.timefmt = ' [' + date_str + ']'

        raw_html = self._fetch_raw(edition_url)
        raw_html = self._sanitize_nested_anchors(raw_html)

        from calibre.ebooks.BeautifulSoup import BeautifulSoup
        soup = BeautifulSoup(raw_html, 'html.parser')

        # Prefer the date from the page's "Issue name" field when present.
        for span in soup.find_all('span', class_=lambda c: c and 'text-t3' in c):
            if span.get_text(strip=True).lower() == 'issue name':
                p = span.find_next_sibling('p')
                if p:
                    full_issue_name = p.get_text(strip=True)
                    date_str = full_issue_name.split('|')[0].strip()
                    if date_str:
                        self.timefmt = ' [' + date_str + ']'
                        self.log('Issue name:', full_issue_name)
                break

        figure = soup.find('figure')
        img = figure.find('img') if figure else None
        src = img.get('src', '') if img else ''
        if src:
            # Skip images without a src instead of inventing a cover URL, and
            # resolve a relative src against the edition page.
            self.cover_url = self._set_img_width(urljoin(edition_url, src))
            self.log('Cover:', self.cover_url)

        sections = self._parse_toc_from_soup(soup)
        if sections:
            self.log('Parsed TOC from HTML structure (%d sections)' % len(sections))
            return sections

        self.log('TOC not in HTML structure, trying raw payload extraction')
        sections = self._parse_toc_from_raw(raw_html)
        if sections:
            self.log('Extracted %d article URLs from raw payload' % len(sections[0][1]))
            return sections

        raise ValueError(
            'Could not parse edition TOC from either HTML or raw payload. '
            'Response length: %d bytes' % len(raw_html)
        )

    def preprocess_html(self, soup):
        """Simplify an article's markup before conversion and return the soup.

        Steps, in order:
          1. replace each ``<picture>`` by its ``<img>`` (or drop it);
          2. remove links to the magazine shop, together with the immediate
             ``<p>``/``<div>``/``<span>`` parent when there is one;
          3. rebuild each ``<figure>`` as a bare image plus optional caption;
          4. strip ``style``/``class`` from images and apply the configured
             width;
          5. unwrap layout ``<div>`` wrappers;
          6. collapse ``<ul>`` lists containing section/tag links into one
             ``<p>`` holding only those links;
          7. remove "Advertisement" placeholders.
        """
        def already_destroyed(tag):
            # bs4 flags tags destroyed by decompose(). Tags collected by
            # find_all() earlier can be destroyed when an ancestor is removed,
            # and reading attributes from such a tag fails.
            return bool(getattr(tag, 'decomposed', False))

        def resized(src):
            # Leave a missing/empty src alone rather than turning it into the
            # bogus URL '?width=<n>'.
            return self._set_img_width(src) if src else src

        for picture in soup.find_all('picture'):
            img = picture.find('img')
            if img:
                for attr in ('srcset', 'sizes', 'class', 'style', 'loading', 'decoding'):
                    img.attrs.pop(attr, None)
                if img.get('src'):
                    img['src'] = resized(img['src'])
                picture.replace_with(img)
            else:
                picture.decompose()

        for a in soup.find_all('a', href=True):
            if already_destroyed(a):
                # Removed together with a parent that held another shop link.
                continue
            if 'magazineshop' in a['href']:
                parent = a.find_parent()
                if parent and parent.name in ('p', 'div', 'span'):
                    parent.decompose()
                else:
                    a.decompose()

        for figure in soup.find_all('figure'):
            if already_destroyed(figure):
                continue
            figcaption = figure.find('figcaption')
            caption_text = figcaption.get_text(strip=True) if figcaption else ''
            img = figure.find('img')
            if img:
                img_src = resized(img.get('src', ''))
                img_tag = soup.new_tag('img', src=img_src, alt=caption_text)
                new_fig = soup.new_tag('figure')
                new_fig.append(img_tag)
                if caption_text:
                    new_cap = soup.new_tag(
                        'figcaption',
                        style='font-size:0.8em; color:#555; font-style:italic; text-align:center; margin-top:0.3em;'
                    )
                    new_cap.string = caption_text
                    new_fig.append(new_cap)
                figure.replace_with(new_fig)
            else:
                figure.decompose()

        for tag in soup.find_all(['img']):
            for attr in ('style', 'class'):
                if tag.has_attr(attr):
                    del tag[attr]
            if tag.get('src'):
                tag['src'] = resized(tag['src'])

        # Layout-only wrappers: keep their children, drop the <div> itself.
        for div in soup.find_all(
            'div', class_=lambda c: c and 'justify-items-center' in c
        ):
            div.unwrap()
        for div in soup.find_all(
            'div', class_=lambda c: c and 'container' in c and 'mx-auto' in c
        ):
            div.unwrap()
        for div in soup.find_all('div', class_=lambda c: c and 'grid' in c):
            div.unwrap()

        # NOTE(review): any <ul> with at least one /section/ or /tag/ link is
        # replaced wholesale, so other list content is dropped - confirm this
        # only ever hits tag lists.
        for ul in soup.find_all('ul'):
            tag_links = [a for a in ul.find_all('a', href=True)
                         if '/section/' in a['href'] or '/tag/' in a['href']]
            if tag_links:
                p = soup.new_tag('p')
                for i, a in enumerate(tag_links):
                    if i > 0:
                        p.append(' | ')
                    p.append(a)
                ul.replace_with(p)

        for p in soup.find_all('p'):
            if already_destroyed(p):
                continue
            if p.get_text(strip=True) == 'Advertisement':
                parent = p.find_parent(attrs={'data-ad-wrapper': True}) or p.find_parent(
                    class_=lambda c: c and 'bg-warm-grey-ads' in c
                )
                if parent:
                    parent.decompose()
                else:
                    p.decompose()

        return soup