mirror of
https://github.com/kovidgoyal/calibre.git
synced 2026-05-08 10:32:28 +00:00
Update The Week
This commit is contained in:
+29
-14
@@ -2,6 +2,8 @@
|
||||
# vim:fileencoding=utf-8
|
||||
import json
|
||||
import re
|
||||
import urllib.parse
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
|
||||
@@ -36,11 +38,19 @@ class LiveMint(BasicNewsRecipe):
|
||||
self.oldest_article = float(d)
|
||||
|
||||
def get_cover_url(self):
    """Return the cover image URL for the current Mint e-paper edition.

    Asks the e-paper ``GetAllpages`` endpoint (edition id 1) for the page
    list of the edition date, then returns the ``MrImageUrl`` of the first
    page whose ``SectionName`` contains ``Mint_Front_`` or ``Mint_Cover``.

    On Saturday and Sunday the preceding Friday's date is requested
    instead (presumably because no weekend edition is published - confirm).

    Returns:
        The image URL string, or ``None`` when no page matches.
    """
    # Sample the clock once so the weekday test and the date arithmetic
    # cannot disagree if the call straddles midnight.
    now = datetime.today()
    if now.weekday() in (5, 6):
        # weekday 5 -> back 1 day, weekday 6 -> back 2 days: lands on Friday (4).
        now -= timedelta(days=now.weekday() - 4)
    edition_date = now.strftime('%d/%m/%Y')

    # safe='' makes quote() percent-encode the '/' separators of the date.
    raw = self.index_to_soup(
        'https://epaper.livemint.com/Home/GetAllpages?editionid=1&editiondate='
        + urllib.parse.quote(edition_date, safe=''),
        raw=True,
    )
    for page in json.loads(raw):
        # Tolerate entries with a missing or null SectionName instead of raising.
        section = page.get('SectionName') or ''
        if 'Mint_Front_' in section or 'Mint_Cover' in section:
            return page['MrImageUrl']
    return None
|
||||
|
||||
extra_css = '''
|
||||
img {margin:0 auto;}
|
||||
@@ -58,7 +68,12 @@ class LiveMint(BasicNewsRecipe):
|
||||
name='article',
|
||||
attrs={'id': lambda x: x and x.startswith(('article_', 'box_'))},
|
||||
),
|
||||
dict(attrs={'class': lambda x: x and x.startswith(('storyPage_storyBox__', 'storyBoxSchema'))}),
|
||||
dict(
|
||||
attrs={
|
||||
'class': lambda x: x
|
||||
and x.startswith(('storyPage_storyBox__', 'storyBoxSchema'))
|
||||
}
|
||||
),
|
||||
classes('contentSec'),
|
||||
]
|
||||
|
||||
@@ -67,13 +82,11 @@ class LiveMint(BasicNewsRecipe):
|
||||
dict(
|
||||
attrs={
|
||||
'class': lambda x: x
|
||||
and x.startswith(
|
||||
(
|
||||
'storyPage_alsoRead__',
|
||||
'storyPage_firstPublishDate__',
|
||||
'storyPage_bcrumb__',
|
||||
)
|
||||
)
|
||||
and x.startswith((
|
||||
'storyPage_alsoRead__',
|
||||
'storyPage_firstPublishDate__',
|
||||
'storyPage_bcrumb__',
|
||||
))
|
||||
}
|
||||
),
|
||||
dict(attrs={'id': ['faqSection', 'seoText', 'ellipsisId', 'gift_redeemed_box ']}),
|
||||
@@ -108,7 +121,9 @@ class LiveMint(BasicNewsRecipe):
|
||||
def preprocess_raw_html(self, raw, *a):
|
||||
# remove empty p tags
|
||||
raw = re.sub(
|
||||
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])', r'\g<2>', re.sub(
|
||||
r'(<p>\s*)(<[^(\/|a|i|b|em|strong)])',
|
||||
r'\g<2>',
|
||||
re.sub(
|
||||
r'(<p>\s* \s*<\/p>)|(<p>\s*<\/p>)|(<p\s*\S+> \s*<\/p>)', '', raw
|
||||
),
|
||||
)
|
||||
|
||||
+24
-6
@@ -6,10 +6,11 @@ from calibre.web.feeds.news import BasicNewsRecipe, classes
|
||||
|
||||
|
||||
class TheWeek(BasicNewsRecipe):
|
||||
title = u'The Week'
|
||||
title = 'The Week'
|
||||
description = (
|
||||
'The Week is the best selling general interest English news magazine. The magazine covers politics, entertainment,'
|
||||
' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.')
|
||||
' social issues, trends, technology, lifestyle and everything else you should be knowing. Best downloaded on Mondays.'
|
||||
)
|
||||
language = 'en_IN'
|
||||
__author__ = 'unkn0wn'
|
||||
encoding = 'utf-8'
|
||||
@@ -34,7 +35,7 @@ class TheWeek(BasicNewsRecipe):
|
||||
recipe_specific_options = {
|
||||
'date': {
|
||||
'short': 'The date of the edition to download (YYYY.MM.DD format)',
|
||||
'long': 'For example, 2024.06.30'
|
||||
'long': 'For example, 2024.06.30',
|
||||
}
|
||||
}
|
||||
|
||||
@@ -44,7 +45,9 @@ class TheWeek(BasicNewsRecipe):
|
||||
soup = self.index_to_soup(
|
||||
'https://www.magzter.com/IN/Malayala_Manorama/THE_WEEK/Business/'
|
||||
)
|
||||
return soup.find('img', id=lambda s: s and 'mgd__lhd__cover' in s.split())['src']
|
||||
return soup.find('img', id=lambda s: s and 'mgd__lhd__cover' in s.split())[
|
||||
'src'
|
||||
]
|
||||
|
||||
def parse_index(self):
|
||||
issue = 'https://www.theweek.in/theweek.html'
|
||||
@@ -70,6 +73,21 @@ class TheWeek(BasicNewsRecipe):
|
||||
return [('Articles', ans)]
|
||||
|
||||
def preprocess_html(self, soup):
    """Point every image at a concrete URL before the article is processed.

    Two passes over ``soup``:

    1. For each ``<picture>``, copy the first ``<source>``'s ``srcset`` URL,
       truncated just after the first ``.jpg``, into the ``<img>``'s ``src``,
       then remove all ``<source>`` children.
    2. For every ``<img>``, set ``src`` from ``data-src-mobile``, falling
       back to ``data-src-web``, when either attribute is present.

    Args:
        soup: parsed article document exposing ``findAll``.

    Returns:
        The same ``soup`` object, modified in place.
    """
    for pic in soup.findAll('picture'):
        source = pic.find('source')
        img = pic.find('img')
        if source and img:
            # A <source> without srcset, or one with no '.jpg' in it, leaves
            # the existing src untouched rather than raising KeyError or
            # producing a URL with '.jpg' appended to descriptor text.
            srcset = source.get('srcset') or ''
            if '.jpg' in srcset:
                img['src'] = srcset.split('.jpg')[0] + '.jpg'
        # NOTE(review): the scraped diff lost indentation; this cleanup is
        # placed at <picture> level - confirm against the upstream recipe.
        for extra in pic.findAll('source'):
            extra.extract()
    for img in soup.findAll('img'):
        lazy = img.get('data-src-mobile') or img.get('data-src-web')
        if lazy:
            img['src'] = lazy
    return soup
|
||||
|
||||
def populate_article_metadata(self, article, soup, first):
    """Use the paragraph after the headline as the article summary.

    Finds the first ``<h1>`` in ``soup`` and the next ``<p>`` after it; when
    both exist, the paragraph's text (via ``self.tag_to_string``) is stored
    in both ``article.summary`` and ``article.text_summary``. Otherwise the
    article is left unchanged.

    Args:
        article: object whose ``summary`` / ``text_summary`` are assigned.
        soup: parsed article document.
        first: unused by this implementation.
    """
    h1 = soup.find('h1')
    if not h1:
        return
    para = h1.findNext('p')
    if not para:
        return
    article.text_summary = article.summary = self.tag_to_string(para)
|
||||
|
||||
Reference in New Issue
Block a user