diff --git a/cps/metadata_provider/kobo.py b/cps/metadata_provider/kobo.py new file mode 100644 index 0000000..4eaaf71 --- /dev/null +++ b/cps/metadata_provider/kobo.py @@ -0,0 +1,1170 @@ +# -*- coding: utf-8 -*- +# Calibre-Web Automated – fork of Calibre-Web +# Copyright (C) 2018-2025 Calibre-Web contributors +# Copyright (C) 2024-2025 Calibre-Web Automated contributors +# SPDX-License-Identifier: GPL-3.0-or-later +# See CONTRIBUTORS for full list of authors. + +import concurrent.futures +import json +import re +from datetime import datetime +from typing import Any, Dict, List, Optional, Tuple, Union +import os +from http.cookies import SimpleCookie + +from curl_cffi import requests as creq # type: ignore +from bs4 import BeautifulSoup as BS +from bs4.element import Tag +from cps import logger +from cps.isoLanguages import get_lang3, get_language_name +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata + +log = logger.create() + + +class Kobo(Metadata): + """Kobo metadata provider via web scraping. + + Accepts a query string and a language ISO-2 code (e.g., "en", "ja"). + Scrapes Kobo search results and follows detail pages to extract: + Title, Authors, Year, Series Name, Series Number, Language, Cover, Synopsis. + """ + + __name__ = "Kobo" + __id__ = "kobo" + + DESCRIPTION = "Kobo Books" + META_URL = "https://www.kobo.com/" + + # Centralized selectors and patterns for maintainability + SERIES_DT_TESTID = "series-product-type-and-number" + SERIES_LINK_SELECTOR = "a[href*='series/']" + TITLE_SELECTORS = [ + "h1[data-testid='title']", + "[data-testid='product-title']", + "[data-testid='product-header-title']", + "[data-testid='title'] .link--label", + "[data-testid='title']", + "h1[itemprop='name']", + "h1", + ] + AUTHOR_SELECTORS = [ + "dd[data-testid='authors'] a[data-testid='book-attribute-link'] .link--label", + "dd[data-testid='authors'] a[href*='author/'] .link--label", + "[data-automation='author-name']", + "a[href*='/search?query='][href*='contributor']", + "a[href*='author/']", + ] + DESC_SELECTORS = [ + "[data-full-synopsis]", + "[data-testid='synopsis']", + "[data-testid='description']", + "[data-automation='synopsis']", + "[data-automation='book-description']", + "[itemprop='description']", + ".text-synopsis", + ] + BOOK_DT_PREFIX_RE = re.compile(r"^\s*Book\b", re.I) + CARD_SELECTORS = ( + "[data-testid=\"book-card-search-result-items\"], [data-testid=\"search-result-widget\"]" + ) + + headers = { + "user-agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/120.0.0.0 Safari/537.36" + ), + "accept": ( + "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8" + ), + "accept-language": "en-US,en;q=0.9", + "upgrade-insecure-requests": "1", + "accept-encoding": "gzip, deflate, br, zstd", + "referer": "https://www.kobo.com/", + "sec-ch-ua": '"Google Chrome";v="120", "Chromium";v="120", "Not:A-Brand";v="99"', + "sec-ch-ua-mobile": "?0", + "sec-ch-ua-platform": '"Windows"', + "sec-ch-ua-full-version-list": '"Google Chrome";v="120.0.0.0", "Chromium";v="120.0.0.0", "Not:A-Brand";v="99.0.0.0"', + "sec-fetch-site": "same-origin", + "sec-fetch-mode": "navigate", + "sec-fetch-dest": "document", + "sec-fetch-user": "?1", + } + session = creq.Session(impersonate="chrome120") + session.headers.update(headers) + + SEARCH_MAX = 5 + DETAIL_TIMEOUT = 12 + + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: + if not self.active: + return [] + + headers = self._headers_for_locale(locale) + headers = self._apply_cookies(headers) + + # Warm up session to pick up cookies that sometimes gate search + try: + self._get(self.META_URL, headers=headers, timeout=8) + except Exception: + pass + + # Build primary and fallback search URLs + primary_url = self._build_search_url(query=query, lang=locale or "en") + simple_q = "+".join(list(self.get_title_tokens(query, strip_joiners=False)) or [query]) + fallback_url = f"https://www.kobo.com/search?query={simple_q}&fcmedia=Book" + + r = None + for url in (primary_url, fallback_url): + try: + r = self._get(url, headers=headers, timeout=10, allow_redirects=True) + if r.status_code == 403: + continue + r.raise_for_status() + break + except Exception as e: + # Treat as a hard failure on this URL and stop trying + log.warning("Kobo search failed for %s: %s", url, e) + return [] + if not r or r.status_code >= 400: + log.warning("Kobo search failed: no usable response (last status %s)", r.status_code if r else None) + return [] + + soup = BS(r.text, "lxml") + next_data = self._get_next_data_json(soup) + # Harvest search-level series hints for fallback (e.g., 'Book 6 -') + search_series_map = self._extract_search_series_map(soup, next_data) + + + links = self._extract_result_links(soup) + links = links[: self.SEARCH_MAX] + + results: List[Tuple[MetaRecord, int]] = [] + + def fetch_and_parse(link: str, index: int): + try: + rec = self._fetch_detail(link, generic_cover, locale) + if rec: + # Backfill series data from search page if missing + slug = self._extract_kobo_id_from_url(link) + ser = search_series_map.get(slug) + if ser: + name, idx = ser + if not rec.series: + rec.series = name + if not rec.series_index: + rec.series_index = idx + return (rec, index) + except Exception as ex: + log.warning("Kobo detail fetch failed for %s: %s", link, ex) + return None + + if not links: + return [] + + with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: + futs = {executor.submit(fetch_and_parse, link, i): i for i, link in enumerate(links)} + try: + for fut in concurrent.futures.as_completed(futs, timeout=self.DETAIL_TIMEOUT): + item = fut.result() + if item: + results.append(item) + except concurrent.futures.TimeoutError: + log.warning("Kobo search detail timeout after %ss", self.DETAIL_TIMEOUT) + + results.sort(key=lambda x: x[1]) + return [x[0] for x in results] + + def _build_search_url(self, query: str, lang: str) -> str: + lang = str(lang or "en").lower() + country = "jp" if lang == "ja" else "us" + path_lang = "ja" if lang == "ja" else "en" + tokens = list(self.get_title_tokens(query, strip_joiners=False)) or [query] + q = "+".join(tokens) + return f"https://www.kobo.com/{country}/{path_lang}/search?query={q}&fcmedia=Book" + + def _headers_for_locale(self, locale: str) -> Dict[str, str]: + h = dict(self.headers) + loc = str(locale or "").lower() + if loc.startswith("ja"): + h["accept-language"] = "ja-JP,ja;q=0.9" + else: + h["accept-language"] = "en-US,en;q=0.9" + return h + + def _get(self, url: str, headers: Optional[Dict[str, str]] = None, timeout: int = 10, allow_redirects: bool = True) -> Any: + resp = self.session.get(url, headers=headers, timeout=timeout, allow_redirects=allow_redirects) + return resp + + def _extract_result_links(self, soup: BS) -> List[str]: + # Prefer product links that contain '/ebook/' and avoid audiobooks + seen = set() + links: List[str] = [] + for a in soup.select("a[href]"): + if not isinstance(a, Tag): + continue + href = str(a.get("href", "")) + if not href: + continue + if "/ebook/" not in href: + continue + if "/audiobook/" in href: + continue + # Make absolute + if href.startswith("/"): + href = f"https://www.kobo.com{href}" + if href not in seen: + seen.add(href) + links.append(href) + return links + + def _extract_search_series_map(self, soup: BS, next_data: Optional[Dict[str, Any]] = None) -> Dict[str, Tuple[str, Union[int, float]]]: + """From the search results page, build a map of product slug -> (series_name, series_index). + Priority is parsing __NEXT_DATA__ JSON, then fall back to DOM. + Only captures 'Book' media to avoid audiobooks. + """ + # 1) Try provided NEXT_DATA JSON + try: + j = next_data or self._get_next_data_json(soup) + if isinstance(j, dict): + items = ( + j.get("props", {}) + .get("pageProps", {}) + .get("searchResultSSR", {}) + .get("Items", []) + ) + out: Dict[str, Tuple[str, Union[int, float]]] = {} + for it in items: + if not isinstance(it, dict): + continue + book = it.get("Book") + if not isinstance(book, dict): + continue + slug = str(book.get("Slug", "")) + # Series fields may be absent for standalones + sname = str(book.get("SeriesName", "")).strip() + snum = book.get("SeriesNumber") or book.get("SeriesNumberFloat") + if not (slug and sname and snum): + continue + idx = self._parse_series_index(snum) + if not idx: + continue + out[slug] = (sname, idx) + if out: + return out + except Exception as e: + log.warning("Kobo: Failed to parse NEXT_DATA search series map; falling back to DOM: %s", e) + + # 2) Fallback to DOM card parsing + out: Dict[str, Tuple[str, Union[int, float]]] = {} + try: + # Each card generally has a title link and a dt/dd pair for series + for card in soup.select(self.CARD_SELECTORS): + if not isinstance(card, Tag): + continue + a = card.select_one("a[data-testid='title'][href]") + if not isinstance(a, Tag): + continue + href = a.get("href") or "" + if not isinstance(href, str): + continue + if "/ebook/" not in href: + continue + # Normalize absolute URL + if href.startswith("/"): + href = f"https://www.kobo.com{href}" + slug = self._extract_kobo_id_from_url(href) + + # Find the series dt/dd inside this card + dd = card.select_one(f"dd[data-testid='{self.SERIES_DT_TESTID}']") + dt = card.select_one(f"dt[data-testid='{self.SERIES_DT_TESTID}']") + if not (isinstance(dd, Tag) and isinstance(dt, Tag)): + continue + dt_text = dt.get_text(" ", strip=True) + if not self.BOOK_DT_PREFIX_RE.search(dt_text): + continue + idx = self._parse_series_index(dt_text) + lbl = dd.select_one("a .link--label") or dd.select_one(".link--label") or dd.select_one("a") + series_name = lbl.get_text(strip=True) if isinstance(lbl, Tag) else dd.get_text(" ", strip=True) + if slug and series_name and idx: + out[slug] = (series_name, idx) + except Exception as e: + log.warning("Kobo: DOM search series map extraction failed: %s", e) + pass + return out + + def _fetch_detail(self, url: str, generic_cover: str, locale: str) -> Optional[MetaRecord]: + # Data precedence: NEXT_DATA > hidden synopsis > DOM > meta/LD-JSON + headers = self._headers_for_locale(locale) + headers = self._apply_cookies(headers) + r = self._get(url, headers=headers, timeout=10) + r.raise_for_status() + soup = BS(r.text, "lxml") + next_data = self._get_next_data_json(soup) + + # 1) Prefer data from Next.js __NEXT_DATA__ first + data = self._parse_next_data_detail(soup, url, next_data) + # 2) Then augment with DOM fallbacks + # Basic meta/OG fallbacks (image, short description, etc.) + meta_data = self._parse_meta_fallbacks(soup) + for k in ("image", "publisher", "publishedDate", "language", "description"): + if meta_data.get(k) and not data.get(k): + data[k] = meta_data[k] + # Published date from DOM if still missing + if not data.get("publishedDate"): + dom_pub = self._parse_published_from_dom(soup) + if dom_pub: + data["publishedDate"] = dom_pub + # Publisher from DOM if still missing (eBook Details or dt/dd) + if not data.get("publisher"): + dom_publisher = self._parse_publisher_from_dom(soup) + if dom_publisher: + data["publisher"] = dom_publisher + if not data.get("authors"): + dom_authors = self._parse_authors_from_dom(soup) + if dom_authors: + data["authors"] = dom_authors + # Series precedence: NEXT_DATA already tried inside; augment from DOM only if missing + if not data.get("series") or not data.get("series_index"): + dom_series_name, dom_series_index = self._parse_series_from_dom(soup) + if not data.get("series") and dom_series_name: + data["series"] = dom_series_name + if not data.get("series_index") and dom_series_index: + data["series_index"] = dom_series_index + # Prefer visible title from DOM if missing or looks like a page title + dom_title = self._parse_title_from_dom(soup) + if dom_title: + prev_title = data.get("title") + data["title"] = dom_title + + # 3) If still thin, fill from JSON-LD/meta as a last resort + if not data or (not data.get("title") and not data.get("authors")): + ld = self._parse_ld_json(soup) + if ld: + for k, v in ld.items(): + data.setdefault(k, v) + + # Prefer a longer synopsis from embedded JSON or DOM + json_desc = self._parse_description_from_embedded_json(soup, next_data) + if json_desc and len(json_desc) > len(data.get("description", "")): + data["description"] = json_desc + # Hidden full synopsis on detail pages + hidden_desc = self._parse_hidden_full_synopsis(soup) + if hidden_desc and len(hidden_desc) > len(data.get("description", "")): + data["description"] = hidden_desc + # General DOM synopsis as another fallback + dom_desc = self._parse_description_from_dom(soup) + if not data.get("description") and dom_desc: + data["description"] = dom_desc + + title = str(data.get("title", "")).strip() + authors = data.get("authors", []) + description = self._clean_description(data.get("description", "")) + language_code = str(data.get("language", locale or "")).lower() + cover = self._normalize_cover_url(data.get("image", generic_cover)) + publisher = data.get("publisher", "") + published = data.get("publishedDate", "") + series = data.get("series", "") + series_index = data.get("series_index", 0) + identifiers: Dict[str, Union[str, int]] = {} + identifiers["kobo"] = self._extract_kobo_id_from_url(url) + + # Normalize languages to display names like other providers + languages: List[str] = [] + if language_code: + try: + languages = [get_language_name(locale or "en", get_lang3(language_code))] + except Exception: + languages = [] + + match = MetaRecord( + id=identifiers.get("kobo") or title, + title=title, + authors=authors, + url=url, + source=MetaSourceInfo(id=self.__id__, description=self.DESCRIPTION, link=self.META_URL), + ) + match.cover = cover or generic_cover + match.description = description + match.languages = languages + match.publisher = publisher + match.publishedDate = published + match.series = series + match.series_index = series_index + match.identifiers = identifiers + match.tags = [] + + # If there is no synopsis, treat as not-a-book and skip + if not match.description: + return None + + return match + + def _parse_ld_json(self, soup: BS) -> Dict[str, Any]: + data: Dict[str, Any] = {} + scripts = soup.find_all("script", attrs={"type": "application/ld+json"}) + for s in scripts: + if not isinstance(s, Tag): + continue + try: + content = s.get_text() or "{}" + j = json.loads(content) + except Exception: + continue + + def extract_from(obj: Dict) -> Optional[Dict]: + if not isinstance(obj, dict): + return None + types = obj.get("@type") + types = [types] if isinstance(types, str) else types or [] + # Some Kobo pages wrap the Book as mainEntity or use Product + if not any(t in ("Book", "Product") for t in types): + # If this object has a mainEntity, try extracting from it + main = obj.get("mainEntity") + if isinstance(main, dict): + return extract_from(main) + return None + + title = obj.get("name") or obj.get("headline") or "" + # Authors can be list of dicts, dict, or string + authors = obj.get("author") + if isinstance(authors, dict): + authors = [authors.get("name", "")] if authors else [] + elif isinstance(authors, list): + authors = [a.get("name", "") if isinstance(a, dict) else str(a) for a in authors] + elif isinstance(authors, str): + authors = [authors] + else: + authors = [] + + # Description may contain HTML; strip tags + desc = obj.get("description", "") + if desc: + desc = BS(desc, "lxml").text.strip() + + lang = (obj.get("inLanguage") or "").lower() + image = obj.get("image") if isinstance(obj.get("image"), str) else obj.get("image", {}).get("url", "") + + publisher = obj.get("publisher", {}) + if isinstance(publisher, dict): + publisher = publisher.get("name", "") + else: + publisher = str(publisher) if publisher else "" + + date_published = obj.get("datePublished", "") + published = self._normalize_date(date_published) + + isbn = obj.get("isbn") or (obj.get("identifier") if isinstance(obj.get("identifier"), str) else None) + + # Series info + series_name = "" + series_index = 0 + is_part_of = obj.get("isPartOf") or obj.get("partOfSeries") or obj.get("series") + if isinstance(is_part_of, dict): + series_name = is_part_of.get("name", "") + series_index = self._parse_series_index(obj.get("position") or is_part_of.get("position")) + elif isinstance(is_part_of, list) and is_part_of: + first = is_part_of[0] + if isinstance(first, dict): + series_name = first.get("name", "") + series_index = self._parse_series_index(first.get("position")) + + # rating ignored + + out = { + "title": title, + "authors": [a for a in authors if a], + "description": desc, + "language": lang, + "image": image, + "publisher": publisher, + "publishedDate": published, + "series": series_name, + "series_index": series_index, + } + # Only accept if it looks like a book + if out["title"] or out["authors"]: + return out + return None + + # The JSON-LD may be an array, an object, or an object with @graph + candidates: List[Dict[str, Any]] = [] + if isinstance(j, list): + for item in j: + ext = extract_from(item) + if ext: + candidates.append(ext) + elif isinstance(j, dict): + # If this is a @graph wrapper, iterate its items + if isinstance(j.get("@graph"), list): + for item in j["@graph"]: + ext = extract_from(item) + if ext: + candidates.append(ext) + else: + ext = extract_from(j) + if ext: + candidates.append(ext) + + # Prefer the first valid candidate + if candidates: + return candidates[0] + return {} + + def _parse_meta_fallbacks(self, soup: BS) -> Dict[str, Any]: + data: Dict[str, Any] = {} + # Basic OpenGraph fallbacks + og_title = soup.find("meta", property="og:title") + if isinstance(og_title, Tag): + content = og_title.get("content") + if isinstance(content, str): + data["title"] = content.strip() + og_image = soup.find("meta", property="og:image") + if isinstance(og_image, Tag): + content = og_image.get("content") + if isinstance(content, str): + data["image"] = content.strip() + # Kobo pages often include description in meta name="description" + meta_desc = soup.find("meta", attrs={"name": "description"}) + if isinstance(meta_desc, Tag): + content = meta_desc.get("content") + if isinstance(content, str): + data["description"] = content.strip() + + # Attempt to extract authors from visible page nodes as a heuristic + authors = self._parse_authors_from_dom(soup) + if authors: + data["authors"] = authors + + # Extract series details from DOM + series_name, series_index = self._parse_series_from_dom(soup) + if series_name: + data["series"] = series_name + if series_index: + data["series_index"] = series_index + + # Language is typically implied by path; leave blank here + data.setdefault("language", "") + data.setdefault("publisher", "") + data.setdefault("publishedDate", "") + data.setdefault("series", "") + data.setdefault("series_index", 0) + return data + + def _parse_authors_from_dom(self, soup: BS) -> List[str]: + authors: List[str] = [] + # Common Kobo patterns for authors + for sel in self.AUTHOR_SELECTORS: + for n in soup.select(sel): + if not isinstance(n, Tag): + continue + t = n.get_text(strip=True) + if t and t not in authors: + authors.append(t) + return authors + + def _parse_publisher_from_dom(self, soup: BS) -> str: + """Extract publisher/imprint from visible DOM on the detail page. + Prefers an explicit Publisher dt/dd row, then falls back to the eBook Details list. + """ + try: + # 1) Look for dt/dd rows with 'publisher' + for dt in soup.select("dt"): + if not isinstance(dt, Tag): + continue + label = (dt.get_text(" ", strip=True) or "").lower() + if "publisher" in label: + dd = dt.find_next_sibling("dd") + if isinstance(dd, Tag): + val = dd.get_text(" ", strip=True) + if val: + return val + + # 2) Fallback to eBook Details list + # Try explicit 'Imprint:' first and use its value as publisher if found + for li in soup.select(".bookitem-secondary-metadata ul li, ul li"): + if not isinstance(li, Tag): + continue + raw = li.get_text(" ", strip=True) or "" + low = raw.lower() + if low.startswith("imprint:"): + # Prefer anchor or span content + el = li.find("a") or li.find("span") + txt = (el.get_text(" ", strip=True) if isinstance(el, Tag) else raw.split(":", 1)[-1]).strip() + if txt: + return txt + + # 3) If no label present, the first list item is often the publisher name + # Skip known labeled rows: release date, imprint, book id, language, download options + for li in soup.select(".bookitem-secondary-metadata ul li"): + if not isinstance(li, Tag): + continue + raw = (li.get_text(" ", strip=True) or "").strip() + low = raw.lower() + if not raw: + continue + if any(x in low for x in ("release date", "imprint:", "book id:", "language:", "download options")): + continue + # Likely the publisher name + return raw + except Exception: + pass + return "" + + def _parse_series_from_dom(self, soup: BS) -> Tuple[str, int]: + # Parse the series name and index from dt/dd pairs + # Example (from search page): + #