Update The Week

This commit is contained in:
unkn0w7n
2026-04-06 18:13:59 +05:30
parent 2dc9de23bd
commit 40bfe696f8
2 changed files with 53 additions and 20 deletions
+29 -14
View File
@@ -2,6 +2,8 @@
# vim:fileencoding=utf-8
import json
import re
import urllib.parse
from datetime import datetime, timedelta
from calibre.web.feeds.news import BasicNewsRecipe, classes
@@ -36,11 +38,19 @@ class LiveMint(BasicNewsRecipe):
self.oldest_article = float(d)
def get_cover_url(self):
    """Return the cover image URL of the current Mint e-paper edition.

    Asks the e-paper ``GetAllpages`` endpoint (edition id 1) for the page
    list of today's date, formatted ``dd/mm/YYYY`` and URL-quoted. On a
    Saturday or Sunday the preceding Friday is requested instead.

    Returns the ``MrImageUrl`` of the first listed page whose
    ``SectionName`` contains ``Mint_Front_`` or ``Mint_Cover``, or ``None``
    when no listed page matches.
    """
    # Read the clock once: the weekday test and the date arithmetic must be
    # based on the same instant, otherwise a call that straddles midnight
    # could step back from the wrong day.
    edition_day = datetime.today()
    if edition_day.weekday() in (5, 6):
        # Saturday (5) -> 1 day back, Sunday (6) -> 2 days back: Friday.
        # NOTE(review): presumably there is no weekend edition - confirm.
        edition_day -= timedelta(days=edition_day.weekday() - 4)
    edition_date = edition_day.strftime('%d/%m/%Y')

    # raw=True: the endpoint's payload is handed to json.loads below rather
    # than being parsed as HTML.
    raw = self.index_to_soup(
        'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate='
        + urllib.parse.quote(edition_date, safe=''),
        raw=True,
    )
    for page in json.loads(raw):
        # Tolerate entries with a missing/null section name or image URL
        # instead of aborting the whole cover lookup.
        section = page.get('SectionName') or ''
        if 'Mint_Front_' in section or 'Mint_Cover' in section:
            cover = page.get('MrImageUrl')
            if cover:
                return cover
    return None
extra_css = '''
img {margin:0 auto;}
@@ -58,7 +68,12 @@ class LiveMint(BasicNewsRecipe):
name='article',
attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))},
),
dict(attrs={'class': lambda x: x and x.startswith(('storyPage_storyBox__', 'storyBoxSchema'))}),
dict(
attrs={
'class': lambda x: x
and x.startswith(('storyPage_storyBox__', 'storyBoxSchema'))
}
),
classes('contentSec'),
]
@@ -67,13 +82,11 @@ class LiveMint(BasicNewsRecipe):
dict(
attrs={
'class': lambda x: x
and x.startswith(
(
'storyPage_alsoRead__',
'storyPage_firstPublishDate__',
'storyPage_bcrumb__',
)
)
and x.startswith((
'storyPage_alsoRead__',
'storyPage_firstPublishDate__',
'storyPage_bcrumb__',
))
}
),
dict(attrs={'id': ['faqSection', 'seoText', 'ellipsisId', 'gift_redeemed_box ']}),
@@ -108,7 +121,9 @@ class LiveMint(BasicNewsRecipe):
def preprocess_raw_html(self, raw, *a):
# remove empty p tags
raw = re.sub(
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', re.sub(
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])',
r'\g<2>',
re.sub(
r'(<p>\s*&nbsp;\s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+>&nbsp;\s*<\/p>)', '', raw
),
)
+24 -6
View File
@@ -6,10 +6,11 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes
class TheWeek(BasicNewsRecipe):
title = u'The Week'
title = 'The Week'
description = (
'The Week is the best selling general interest English news magazine. The magazine covers politics, entertainment,'
' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.')
' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.'
)
language = 'en_IN'
__author__ = 'unkn0wn'
encoding = 'utf-8'
@@ -34,7 +35,7 @@ class TheWeek(BasicNewsRecipe):
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYY.MM.DD format)',
'long': 'For example, 2024.06.30'
'long': 'For example, 2024.06.30',
}
}
@@ -44,7 +45,9 @@ class TheWeek(BasicNewsRecipe):
soup = self.index_to_soup(
'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
)
return soup.find('img', id=lambda s: s and 'mgd__lhd__cover' in s.split())['src']
return soup.find('img', id=lambda s: s and 'mgd__lhd__cover' in s.split())[
'src'
]
def parse_index(self):
issue = 'https://www.theweek.in/theweek.html'
@@ -70,6 +73,21 @@ class TheWeek(BasicNewsRecipe):
return [('Articles', ans)]
def preprocess_html(self, soup):
    """Give every ``<img>`` a usable ``src`` and return the same ``soup``.

    Two passes are made over the document:

    1. For each ``<picture>``: if it has both a ``<source>`` and an
       ``<img>``, the image's ``src`` is set to the source's ``srcset``
       cut off right after its first ``.jpg``. All ``<source>`` children
       are then removed from the picture.
    2. For each ``<img>``: if it carries ``data-src-mobile`` or
       ``data-src-web`` (mobile preferred), that value becomes ``src``.
       This pass runs last, so it wins over the value set in pass 1.
    """
    for picture in soup.findAll('picture'):
        source = picture.find('source')
        img = picture.find('img')
        if source and img:
            srcset = source.get('srcset') or ''
            # Only rewrite when there is a '.jpg' to cut at; otherwise
            # appending '.jpg' would fabricate a URL that does not exist.
            if '.jpg' in srcset:
                img['src'] = srcset.split('.jpg')[0] + '.jpg'
        for extra in picture.findAll('source'):
            extra.extract()

    for img in soup.findAll('img'):
        lazy_url = img.get('data-src-mobile') or img.get('data-src-web')
        if lazy_url:
            img['src'] = lazy_url
    return soup
def populate_article_metadata(self, article, soup, first):
    """Use the first ``<p>`` following the page's ``<h1>`` as the summary.

    The paragraph's text (via ``self.tag_to_string``) is stored on both
    ``article.text_summary`` and ``article.summary``. Nothing is changed
    when the page has no ``<h1>`` or no ``<p>`` after it.
    """
    heading = soup.find('h1')
    if not heading:
        return
    lead = heading.findNext('p')
    if not lead:
        return
    summary = self.tag_to_string(lead)
    article.text_summary = summary
    article.summary = summary