Compare commits

..

19 Commits

Author SHA1 Message Date
panni fb8bfeb044 untested WIP. 2020-06-06 04:24:03 +02:00
panni e083e133eb bump dev 2020-04-14 05:06:45 +02:00
panni c787e671c3 providers: addic7ed: enforce limits once they're hit, to avoid unnecessary search queries #723 2020-04-14 05:06:02 +02:00
panni 31ff93c3f1 fix logging; set DownloadLimitPerDayExceeded timeout to 4 hours (was one day); #723 2020-04-13 05:57:41 +02:00
panni 289f174e2b providers: addic7ed: properly implement limits #723 2020-04-13 05:54:04 +02:00
panni ea03f3fc4d providers: addic7ed: properly compare last_dl, add last_reset tracking info to log #723 2020-04-12 22:47:21 +02:00
panni 740fc93c13 bump dev 2020-04-12 06:54:36 +02:00
panni e94bd3fcb9 providers: addic7ed: limit downloads per day; add vip setting 2020-04-12 06:52:45 +02:00
panni dba469750b submod: HI: remove more music tags
core: update pysubs2
providers: opensubtitles: actually use sessions (they're broken) for checking for token state
2020-04-11 05:06:12 +02:00
panni c7fe6076cb bump version 2020-03-08 16:32:40 +01:00
panni 356f578014 remove py3 compat breaking unnecessary change 2020-03-08 16:29:49 +01:00
panni b151ed4c55 core: mods: CM_punctuation_space2: detect AND don't try changing domain/url/host when fixing punctuation
add python-tld; functools_lru_cache
2020-03-08 05:37:32 +01:00
panni 9455e3b52b skip drawing tags for SRT 2020-03-08 05:18:20 +01:00
panni 60e2656541 back to dev 2020-02-16 06:06:23 +01:00
panni fcb1a8a6a7 release 2.6.5.3223 2020-02-16 06:05:04 +01:00
panni 9e5829151d release 2.6.5.3223 2020-02-16 06:03:21 +01:00
panni 1f0a713f9b core: scoring: reorder subtitles based on second non-hash-score if main hash score is the same; morpheus65535/bazarr#821 2020-02-16 05:44:59 +01:00
panni ff49dd4512 providers: bsplayer: verify hash; clean up 2020-02-16 05:08:50 +01:00
panni 3cf83b5bf7 back to dev 2020-02-15 03:17:36 +01:00
68 changed files with 73168 additions and 676 deletions
+6 -2
View File
@@ -15,7 +15,8 @@ import subliminal
import subliminal_patch
import subzero.constants
import lib
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError, \
DownloadLimitPerDayExceeded
from subliminal_patch.core import is_windows_special_path
from whichdb import whichdb
@@ -61,12 +62,14 @@ def int_or_default(s, default):
return default
VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled)
VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, DownloadLimitPerDayExceeded,
ServiceUnavailable, APIThrottled)
PROVIDER_THROTTLE_MAP = {
"default": {
TooManyRequests: (datetime.timedelta(hours=1), "1 hour"),
DownloadLimitExceeded: (datetime.timedelta(hours=3), "3 hours"),
DownloadLimitPerDayExceeded: (datetime.timedelta(hours=4), "4 hours"),
ServiceUnavailable: (datetime.timedelta(minutes=20), "20 minutes"),
APIThrottled: (datetime.timedelta(minutes=10), "10 minutes"),
AuthenticationError: (datetime.timedelta(hours=2), "2 hours"),
@@ -873,6 +876,7 @@ class Config(object):
provider_settings = {'addic7ed': {'username': Prefs['provider.addic7ed.username'],
'password': Prefs['provider.addic7ed.password'],
'is_vip': cast_bool(Prefs['provider.addic7ed.is_vip']),
},
'opensubtitles': {'username': Prefs['provider.opensubtitles.username'],
'password': Prefs['provider.opensubtitles.password'],
+6 -3
View File
@@ -171,12 +171,15 @@ class SubtitleListingMixin(object):
else:
s.wrong_season_ep = True
orig_matches = matches.copy()
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=use_hearing_impaired)
unsorted_subtitles.append(
(s, compute_score(matches, s, video, hearing_impaired=use_hearing_impaired), matches))
scored_subtitles = sorted(unsorted_subtitles, key=operator.itemgetter(1), reverse=True)
(s, score, score_without_hash, matches, orig_matches))
scored_subtitles = sorted(unsorted_subtitles, key=operator.itemgetter(1, 2), reverse=True)
subtitles = []
for subtitle, score, matches in scored_subtitles:
for subtitle, score, score_without_hash, matches, orig_matches in scored_subtitles:
# check score
if score < min_score and not subtitle.wrong_series:
Log.Info(u'%s: Score %d is below min_score (%d)', self.name, score, min_score)
+6
View File
@@ -375,6 +375,12 @@
"default": "",
"secure": "true"
},
{
"id": "provider.addic7ed.is_vip",
"label": "Addic7ed VIP? (80 vs 40 downloads per day)",
"type": "bool",
"default": "false"
},
{
"id": "provider.addic7ed.boost_by2",
"label": "Addic7ed: boost score (if requirements met)",
+3 -3
View File
@@ -13,7 +13,7 @@
<key>CFBundleSignature</key>
<string>????</string>
<key>CFBundleVersion</key>
<string>2.6.5.3217</string>
<string>2.6.5.3237</string>
<key>PlexFrameworkVersion</key>
<string>2</string>
<key>PlexPluginClass</key>
@@ -23,7 +23,7 @@
<key>PlexPluginConsoleLogging</key>
<string>0</string>
<key>PlexPluginDevMode</key>
<string>0</string>
<string>1</string>
<key>PlexPluginCodePolicy</key>
<!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
<string>Elevated</string>
@@ -32,7 +32,7 @@
&lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;
Version 2.6.5.3217
Version 2.6.5.3237 DEV
Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;
@@ -0,0 +1 @@
# Namespace-package shim: pass this package's __path__ through
# pkgutil.extend_path so that same-named package directories found elsewhere
# on sys.path are added to it (see the pkgutil documentation).
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
@@ -0,0 +1,196 @@
from __future__ import absolute_import
import functools
from collections import namedtuple
from threading import RLock
# Record type for cache statistics: hit count, miss count, the configured
# maximum size and the current number of cached entries.
_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])
@functools.wraps(functools.update_wrapper)
def update_wrapper(
    wrapper,
    wrapped,
    assigned=functools.WRAPPER_ASSIGNMENTS,
    updated=functools.WRAPPER_UPDATES,
):
    """
    Patch two bugs in functools.update_wrapper.

    Behaves like ``functools.update_wrapper`` with two additions:

    * only the names in *assigned* that *wrapped* actually has are copied,
      so wrapping an object lacking e.g. ``__name__`` does not fail;
    * ``wrapper.__wrapped__`` is always set to *wrapped*.

    :param wrapper: the callable to update.
    :param wrapped: the original callable whose metadata is copied.
    :param assigned: attribute names copied from *wrapped* to *wrapper*.
    :param updated: attribute names of *wrapper* that are updated from the
        matching attribute of *wrapped*.
    :return: *wrapper*.
    """
    # workaround for http://bugs.python.org/issue3445
    # (skip attributes that the wrapped object does not define)
    assigned = tuple(attr for attr in assigned if hasattr(wrapped, attr))
    wrapper = functools.update_wrapper(wrapper, wrapped, assigned, updated)
    # workaround for https://bugs.python.org/issue17482
    # (make sure __wrapped__ refers to the object wrapped right here)
    wrapper.__wrapped__ = wrapped
    return wrapper
class _HashedSeq(list):
    """List subclass that computes the hash of its source tuple only once.

    The hash is calculated in ``__init__`` and stored in ``hashvalue``;
    ``__hash__`` simply returns the stored value, so repeated dictionary
    lookups with the same key object do not re-hash its elements.
    """

    __slots__ = ('hashvalue',)

    def __init__(self, tup, hash=hash):
        # ``hash`` is a parameter so the builtin is bound as a local default.
        self[:] = tup
        self.hashvalue = hash(tup)

    def __hash__(self):
        return self.hashvalue
def _make_key(
    args,
    kwds,
    typed,
    kwd_mark=(object(),),
    fasttypes={int, str, frozenset, type(None)},
    sorted=sorted,
    tuple=tuple,
    type=type,
    len=len,
):
    """Make a cache key from optionally typed positional and keyword arguments.

    The key starts as the positional-argument tuple. When keyword arguments
    are present, a unique marker followed by the sorted ``(name, value)``
    pairs, flattened, is appended. When *typed* is true, the types of the
    positional values and then of the keyword values are appended as well.

    If the resulting key holds a single element of a type listed in
    *fasttypes* (and *typed* is false), that element itself is returned;
    otherwise the key is wrapped in a ``_HashedSeq``.

    The remaining parameters only bind builtins and constants as locals and
    are not meant to be passed by callers.
    """
    key = args
    if kwds:
        sorted_items = sorted(kwds.items())
        # the marker separates positional values from keyword pairs
        key += kwd_mark
        for item in sorted_items:
            key += item
    if typed:
        key += tuple(type(v) for v in args)
        if kwds:
            key += tuple(type(v) for k, v in sorted_items)
    elif len(key) == 1 and type(key[0]) in fasttypes:
        # a lone value of a simple type can serve as its own key
        return key[0]
    return _HashedSeq(key)
def lru_cache(maxsize=100, typed=False):
    """Least-recently-used cache decorator.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.

    If *maxsize* is 0, nothing is cached and only the miss counter is
    updated. A negative integer *maxsize* is treated as 0.

    If *typed* is True, arguments of different types will be cached separately.
    For example, f(3.0) and f(3) will be treated as distinct calls with
    distinct results.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize) with
    f.cache_info().  Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.

    See:  http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used

    """

    # Users should only access the lru_cache through its public API:
    #       cache_info, cache_clear, and f.__wrapped__
    # The internals of the lru_cache are encapsulated for thread safety and
    # to allow the implementation to change (including a possible C version).

    # A negative bound would send the first call straight into the eviction
    # branch with an empty cache, where deleting the "oldest" key raises
    # KeyError. Treat it as "no caching" instead.
    if isinstance(maxsize, int) and maxsize < 0:
        maxsize = 0

    def decorating_function(user_function):

        cache = dict()
        stats = [0, 0]                  # make statistics updateable non-locally
        HITS, MISSES = 0, 1             # names for the stats fields
        make_key = _make_key
        cache_get = cache.get           # bound method to lookup key or return None
        _len = len                      # localize the global len() function
        lock = RLock()                  # because linkedlist updates aren't threadsafe
        root = []                       # root of the circular doubly linked list
        root[:] = [root, root, None, None]      # initialize by pointing to self
        nonlocal_root = [root]                  # make updateable non-locally
        PREV, NEXT, KEY, RESULT = 0, 1, 2, 3    # names for the link fields

        if maxsize == 0:

            def wrapper(*args, **kwds):
                # no caching, just do a statistics update after a successful call
                result = user_function(*args, **kwds)
                stats[MISSES] += 1
                return result

        elif maxsize is None:

            def wrapper(*args, **kwds):
                # simple caching without ordering or size limit
                key = make_key(args, kwds, typed)
                # root used here as a unique not-found sentinel
                result = cache_get(key, root)
                if result is not root:
                    stats[HITS] += 1
                    return result
                result = user_function(*args, **kwds)
                cache[key] = result
                stats[MISSES] += 1
                return result

        else:

            def wrapper(*args, **kwds):
                # size limited caching that tracks accesses by recency
                key = make_key(args, kwds, typed) if kwds or typed else args
                with lock:
                    link = cache_get(key)
                    if link is not None:
                        # record recent use of the key by moving it
                        # to the front of the list
                        root = nonlocal_root[0]
                        link_prev, link_next, _key, result = link
                        link_prev[NEXT] = link_next
                        link_next[PREV] = link_prev
                        last = root[PREV]
                        last[NEXT] = root[PREV] = link
                        link[PREV] = last
                        link[NEXT] = root
                        stats[HITS] += 1
                        return result
                # the user function runs outside the lock
                result = user_function(*args, **kwds)
                with lock:
                    root = nonlocal_root[0]
                    if key in cache:
                        # getting here means that this same key was added to the
                        # cache while the lock was released.  since the link
                        # update is already done, we need only return the
                        # computed result and update the count of misses.
                        pass
                    elif _len(cache) >= maxsize:
                        # use the old root to store the new key and result
                        oldroot = root
                        oldroot[KEY] = key
                        oldroot[RESULT] = result
                        # empty the oldest link and make it the new root
                        root = nonlocal_root[0] = oldroot[NEXT]
                        oldkey = root[KEY]
                        root[KEY] = root[RESULT] = None
                        # now update the cache dictionary for the new links
                        del cache[oldkey]
                        cache[key] = oldroot
                    else:
                        # put result in a new link at the front of the list
                        last = root[PREV]
                        link = [last, root, key, result]
                        last[NEXT] = root[PREV] = cache[key] = link
                    stats[MISSES] += 1
                return result

        def cache_info():
            """Report cache statistics"""
            with lock:
                return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))

        def cache_clear():
            """Clear the cache and cache statistics"""
            with lock:
                cache.clear()
                root = nonlocal_root[0]
                root[:] = [root, root, None, None]
                stats[:] = [0, 0]

        wrapper.__wrapped__ = user_function
        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return update_wrapper(wrapper, user_function)

    return decorating_function
+517 -153
View File
@@ -2,6 +2,20 @@ import logging
import re
import sys
import ssl
import requests
try:
import copyreg
except ImportError:
import copy_reg as copyreg
try:
from HTMLParser import HTMLParser
except ImportError:
if sys.version_info >= (3, 4):
import html
else:
from html.parser import HTMLParser
from copy import deepcopy
from time import sleep
@@ -9,9 +23,17 @@ from collections import OrderedDict
from requests.sessions import Session
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
from .exceptions import (
CloudflareLoopProtection,
CloudflareCode1020,
CloudflareIUAMError,
CloudflareReCaptchaError,
CloudflareReCaptchaProvider
)
from .interpreters import JavaScriptInterpreter
from .reCaptcha import reCaptcha
from .user_agent import User_Agent
try:
@@ -25,219 +47,540 @@ except ImportError:
pass
try:
from urlparse import urlparse
from urlparse import urlunparse
from urlparse import urlparse, urljoin
except ImportError:
from urllib.parse import urlparse
from urllib.parse import urlunparse
from urllib.parse import urlparse, urljoin
##########################################################################################################################################################
__version__ = '1.1.9'
# ------------------------------------------------------------------------------- #
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
__version__ = '1.2.31'
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
class CipherSuiteAdapter(HTTPAdapter):
def __init__(self, cipherSuite=None, **kwargs):
self.cipherSuite = cipherSuite
__attrs__ = [
'ssl_context',
'max_retries',
'config',
'_pool_connections',
'_pool_maxsize',
'_pool_block'
]
if hasattr(ssl, 'PROTOCOL_TLS'):
self.ssl_context = create_urllib3_context(
ssl_version=getattr(ssl, 'PROTOCOL_TLSv1_3', ssl.PROTOCOL_TLSv1_2),
ciphers=self.cipherSuite
)
else:
self.ssl_context = create_urllib3_context(ssl_version=ssl.PROTOCOL_TLSv1)
def __init__(self, *args, **kwargs):
self.ssl_context = kwargs.pop('ssl_context', None)
self.cipherSuite = kwargs.pop('cipherSuite', None)
if not self.ssl_context:
self.ssl_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
self.ssl_context.set_ciphers(self.cipherSuite)
self.ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
super(CipherSuiteAdapter, self).__init__(**kwargs)
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
def init_poolmanager(self, *args, **kwargs):
kwargs['ssl_context'] = self.ssl_context
return super(CipherSuiteAdapter, self).init_poolmanager(*args, **kwargs)
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
def proxy_manager_for(self, *args, **kwargs):
kwargs['ssl_context'] = self.ssl_context
return super(CipherSuiteAdapter, self).proxy_manager_for(*args, **kwargs)
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
class CloudScraper(Session):
def __init__(self, *args, **kwargs):
self.debug = kwargs.pop('debug', False)
self.delay = kwargs.pop('delay', None)
self.interpreter = kwargs.pop('interpreter', 'js2py')
self.allow_brotli = kwargs.pop('allow_brotli', True if 'brotli' in sys.modules.keys() else False)
self.cipherSuite = None
self.cipherSuite = kwargs.pop('cipherSuite', None)
self.interpreter = kwargs.pop('interpreter', 'native')
self.recaptcha = kwargs.pop('recaptcha', {})
self.allow_brotli = kwargs.pop(
'allow_brotli',
True if 'brotli' in sys.modules.keys() else False
)
self.user_agent = User_Agent(
allow_brotli=self.allow_brotli,
browser=kwargs.pop('browser', None)
)
self._solveDepthCnt = 0
self.solveDepth = kwargs.pop('solveDepth', 3)
super(CloudScraper, self).__init__(*args, **kwargs)
# pylint: disable=E0203
if 'requests' in self.headers['User-Agent']:
# ------------------------------------------------------------------------------- #
# Set a random User-Agent if no custom User-Agent has been set
self.headers = User_Agent(allow_brotli=self.allow_brotli).headers
# ------------------------------------------------------------------------------- #
self.headers = self.user_agent.headers
if not self.cipherSuite:
self.cipherSuite = self.user_agent.cipherSuite
self.mount('https://', CipherSuiteAdapter(self.loadCipherSuite()))
if isinstance(self.cipherSuite, list):
self.cipherSuite = ':'.join(self.cipherSuite)
##########################################################################################################################################################
self.mount(
'https://',
CipherSuiteAdapter(
cipherSuite=self.cipherSuite
)
)
# purely to allow us to pickle dump
copyreg.pickle(ssl.SSLContext, lambda obj: (obj.__class__, (obj.protocol,)))
# ------------------------------------------------------------------------------- #
# Allow us to pickle our session back with all variables
# ------------------------------------------------------------------------------- #
def __getstate__(self):
return self.__dict__
# ------------------------------------------------------------------------------- #
# Raise an Exception with no stacktrace and reset depth counter.
# ------------------------------------------------------------------------------- #
def simpleException(self, exception, msg):
self._solveDepthCnt = 0
sys.tracebacklimit = 0
raise exception(msg)
# ------------------------------------------------------------------------------- #
# debug the request via the response
# ------------------------------------------------------------------------------- #
@staticmethod
def debugRequest(req):
try:
print(dump.dump_all(req).decode('utf-8'))
except: # noqa
pass
except ValueError as e:
print("Debug Error: {}".format(getattr(e, 'message', e)))
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
# Unescape / decode html entities
# ------------------------------------------------------------------------------- #
def loadCipherSuite(self):
if self.cipherSuite:
return self.cipherSuite
@staticmethod
def unescape(html_text):
if sys.version_info >= (3, 0):
if sys.version_info >= (3, 4):
return html.unescape(html_text)
self.cipherSuite = ''
return HTMLParser().unescape(html_text)
if hasattr(ssl, 'PROTOCOL_TLS'):
ciphers = [
'ECDHE-ECDSA-AES128-GCM-SHA256', 'ECDHE-RSA-AES128-GCM-SHA256', 'ECDHE-ECDSA-AES256-GCM-SHA384',
'ECDHE-RSA-AES256-GCM-SHA384', 'ECDHE-ECDSA-CHACHA20-POLY1305-SHA256', 'ECDHE-RSA-CHACHA20-POLY1305-SHA256',
'ECDHE-RSA-AES128-CBC-SHA', 'ECDHE-RSA-AES256-CBC-SHA', 'RSA-AES128-GCM-SHA256', 'RSA-AES256-GCM-SHA384',
'ECDHE-RSA-AES128-GCM-SHA256', 'RSA-AES256-SHA', '3DES-EDE-CBC'
]
return HTMLParser().unescape(html_text)
if hasattr(ssl, 'PROTOCOL_TLSv1_3'):
ciphers.insert(0, ['GREASE_3A', 'GREASE_6A', 'AES128-GCM-SHA256', 'AES256-GCM-SHA256', 'AES256-GCM-SHA384', 'CHACHA20-POLY1305-SHA256'])
# ------------------------------------------------------------------------------- #
# Decode Brotli on older versions of urllib3 manually
# ------------------------------------------------------------------------------- #
ctx = ssl.SSLContext(getattr(ssl, 'PROTOCOL_TLSv1_3', ssl.PROTOCOL_TLSv1_2))
for cipher in ciphers:
try:
ctx.set_ciphers(cipher)
self.cipherSuite = '{}:{}'.format(self.cipherSuite, cipher).rstrip(':')
except ssl.SSLError:
pass
return self.cipherSuite
##########################################################################################################################################################
def request(self, method, url, *args, **kwargs):
ourSuper = super(CloudScraper, self)
resp = ourSuper.request(method, url, *args, **kwargs)
if resp.headers.get('Content-Encoding') == 'br':
def decodeBrotli(self, resp):
if requests.packages.urllib3.__version__ < '1.25.1' and resp.headers.get('Content-Encoding') == 'br':
if self.allow_brotli and resp._content:
resp._content = brotli.decompress(resp.content)
else:
logging.warning('Brotli content detected, But option is disabled, we will not continue.')
return resp
logging.warning(
'You\'re running urllib3 {}, Brotli content detected, '
'Which requires manual decompression, '
'But option allow_brotli is set to False, '
'We will not continue to decompress.'.format(requests.packages.urllib3.__version__)
)
return resp
# ------------------------------------------------------------------------------- #
# Our hijacker request function
# ------------------------------------------------------------------------------- #
def request(self, method, url, *args, **kwargs):
# pylint: disable=E0203
if kwargs.get('proxies') and kwargs.get('proxies') != self.proxies:
self.proxies = kwargs.get('proxies')
resp = self.decodeBrotli(
super(CloudScraper, self).request(method, url, *args, **kwargs)
)
# ------------------------------------------------------------------------------- #
# Debug request
# ------------------------------------------------------------------------------- #
if self.debug:
self.debugRequest(resp)
# Check if Cloudflare anti-bot is on
if self.isChallengeRequest(resp):
if resp.request.method != 'GET':
# Work around if the initial request is not a GET,
# Supersede with a GET then re-request the original METHOD.
self.request('GET', resp.url)
resp = ourSuper.request(method, url, *args, **kwargs)
else:
# Solve Challenge
resp = self.sendChallengeResponse(resp, **kwargs)
if self.is_Challenge_Request(resp):
# ------------------------------------------------------------------------------- #
# Try to solve the challenge and send it back
# ------------------------------------------------------------------------------- #
if self._solveDepthCnt >= self.solveDepth:
_ = self._solveDepthCnt
self.simpleException(
CloudflareLoopProtection,
"!!Loop Protection!! We have tried to solve {} time(s) in a row.".format(_)
)
self._solveDepthCnt += 1
resp = self.Challenge_Response(resp, **kwargs)
else:
if not resp.is_redirect and resp.status_code not in [429, 503]:
self._solveDepthCnt = 0
return resp
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
# check if the response contains a valid Cloudflare challenge
# ------------------------------------------------------------------------------- #
@staticmethod
def isChallengeRequest(resp):
if resp.headers.get('Server', '').startswith('cloudflare'):
if b'why_captcha' in resp.content or b'/cdn-cgi/l/chk_captcha' in resp.content:
raise ValueError('Captcha')
def is_IUAM_Challenge(resp):
try:
return (
resp.status_code in [429, 503]
and all(s in resp.content for s in [b'jschl_vc', b'jschl_answer'])
resp.headers.get('Server', '').startswith('cloudflare')
and resp.status_code in [429, 503]
and re.search(
r'action="/.*?__cf_chl_jschl_tk__=\S+".*?name="jschl_vc"\svalue=.*?',
resp.text,
re.M | re.DOTALL
)
)
except AttributeError:
pass
return False
##########################################################################################################################################################
def sendChallengeResponse(self, resp, **original_kwargs):
body = resp.text
# Cloudflare requires a delay before solving the challenge
if not self.delay:
try:
delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000)
if isinstance(delay, (int, float)):
self.delay = delay
except: # noqa
pass
sleep(self.delay)
parsed_url = urlparse(resp.url)
domain = parsed_url.netloc
submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain)
cloudflare_kwargs = deepcopy(original_kwargs)
# ------------------------------------------------------------------------------- #
# check if the response contains a valid Cloudflare reCaptcha challenge
# ------------------------------------------------------------------------------- #
@staticmethod
def is_reCaptcha_Challenge(resp):
try:
params = OrderedDict()
s = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body)
if s:
params['s'] = s.group('s_value')
params.update(
[
('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)),
('pass', re.search(r'name="pass" value="(.+?)"', body).group(1))
]
)
params = cloudflare_kwargs.setdefault('params', params)
except Exception as e:
raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT))
# Solve the Javascript challenge
params['jschl_answer'] = JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain)
# Requests transforms any request into a GET after a redirect,
# so the redirect has to be handled manually here to allow for
# performing other types of requests even as the first request.
cloudflare_kwargs['allow_redirects'] = False
redirect = self.request(resp.request.method, submit_url, **cloudflare_kwargs)
redirect_location = urlparse(redirect.headers['Location'])
if not redirect_location.netloc:
redirect_url = urlunparse(
(
parsed_url.scheme,
domain,
redirect_location.path,
redirect_location.params,
redirect_location.query,
redirect_location.fragment
return (
resp.headers.get('Server', '').startswith('cloudflare')
and resp.status_code == 403
and re.search(
r'action="/.*?__cf_chl_captcha_tk__=\S+".*?data\-sitekey=.*?',
resp.text,
re.M | re.DOTALL
)
)
return self.request(resp.request.method, redirect_url, **original_kwargs)
except AttributeError:
pass
return self.request(resp.request.method, redirect.headers['Location'], **original_kwargs)
return False
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
# check if the response contains Firewall 1020 Error
# ------------------------------------------------------------------------------- #
@staticmethod
def is_Firewall_Blocked(resp):
try:
return (
resp.headers.get('Server', '').startswith('cloudflare')
and resp.status_code == 403
and re.search(
r'<span class="cf-error-code">1020</span>',
resp.text,
re.M | re.DOTALL
)
)
except AttributeError:
pass
return False
# ------------------------------------------------------------------------------- #
# Wrapper for is_reCaptcha_Challenge, is_IUAM_Challenge, is_Firewall_Blocked
# ------------------------------------------------------------------------------- #
def is_Challenge_Request(self, resp):
if self.is_Firewall_Blocked(resp):
self.simpleException(
CloudflareCode1020,
'Cloudflare has blocked this request (Code 1020 Detected).'
)
if self.is_reCaptcha_Challenge(resp) or self.is_IUAM_Challenge(resp):
return True
return False
# ------------------------------------------------------------------------------- #
# Try to solve cloudflare javascript challenge.
# ------------------------------------------------------------------------------- #
def IUAM_Challenge_Response(self, body, url, interpreter):
try:
formPayload = re.search(
r'<form (?P<form>id="challenge-form" action="(?P<challengeUUID>.*?'
r'__cf_chl_jschl_tk__=\S+)"(.*?)</form>)',
body,
re.M | re.DOTALL
).groupdict()
if not all(key in formPayload for key in ['form', 'challengeUUID']):
self.simpleException(
CloudflareIUAMError,
"Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
)
payload = OrderedDict(
re.findall(
r'name="(r|jschl_vc|pass)"\svalue="(.*?)"',
formPayload['form']
)
)
except AttributeError:
self.simpleException(
CloudflareIUAMError,
"Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
)
hostParsed = urlparse(url)
try:
payload['jschl_answer'] = JavaScriptInterpreter.dynamicImport(
interpreter
).solveChallenge(body, hostParsed.netloc)
except Exception as e:
self.simpleException(
CloudflareIUAMError,
'Unable to parse Cloudflare anti-bots page: {}'.format(
getattr(e, 'message', e)
)
)
return {
'url': '{}://{}{}'.format(
hostParsed.scheme,
hostParsed.netloc,
self.unescape(formPayload['challengeUUID'])
),
'data': payload
}
# ------------------------------------------------------------------------------- #
# Try to solve the reCaptcha challenge via 3rd party.
# ------------------------------------------------------------------------------- #
def reCaptcha_Challenge_Response(self, provider, provider_params, body, url):
try:
formPayload = re.search(
r'<form class="challenge-form" (?P<form>id="challenge-form" '
r'action="(?P<challengeUUID>.*?__cf_chl_captcha_tk__=\S+)"(.*?)</form>)',
body,
re.M | re.DOTALL
).groupdict()
if not all(key in formPayload for key in ['form', 'challengeUUID']):
self.simpleException(
CloudflareReCaptchaError,
"Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
)
payload = OrderedDict(
re.findall(
r'(name="r"\svalue|data-ray|data-sitekey)="(.*?)"',
formPayload['form']
)
)
except (AttributeError):
self.simpleException(
CloudflareReCaptchaError,
"Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
)
hostParsed = urlparse(url)
return {
'url': '{}://{}{}'.format(
hostParsed.scheme,
hostParsed.netloc,
self.unescape(formPayload['challengeUUID'])
),
'data': OrderedDict([
('r', payload.get('name="r" value', '')),
('id', payload.get('data-ray')),
(
'g-recaptcha-response',
reCaptcha.dynamicImport(
provider.lower()
).solveCaptcha(
url,
payload['data-sitekey'],
provider_params
)
)
])
}
# ------------------------------------------------------------------------------- #
# Attempt to handle and send the challenge response back to cloudflare
# ------------------------------------------------------------------------------- #
def Challenge_Response(self, resp, **kwargs):
if self.is_reCaptcha_Challenge(resp):
# ------------------------------------------------------------------------------- #
# double down on the request as some websites are only checking
# if cfuid is populated before issuing reCaptcha.
# ------------------------------------------------------------------------------- #
resp = self.decodeBrotli(
super(CloudScraper, self).request(resp.request.method, resp.url, **kwargs)
)
if not self.is_reCaptcha_Challenge(resp):
return resp
# ------------------------------------------------------------------------------- #
# if no reCaptcha provider raise a runtime error.
# ------------------------------------------------------------------------------- #
if not self.recaptcha or not isinstance(self.recaptcha, dict) or not self.recaptcha.get('provider'):
self.simpleException(
CloudflareReCaptchaProvider,
"Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider "
"correctly via the 'recaptcha' parameter."
)
# ------------------------------------------------------------------------------- #
# if provider is return_response, return the response without doing anything.
# ------------------------------------------------------------------------------- #
if self.recaptcha.get('provider') == 'return_response':
return resp
self.recaptcha['proxies'] = self.proxies
submit_url = self.reCaptcha_Challenge_Response(
self.recaptcha.get('provider'),
self.recaptcha,
resp.text,
resp.url
)
else:
# ------------------------------------------------------------------------------- #
# Cloudflare requires a delay before solving the challenge
# ------------------------------------------------------------------------------- #
if not self.delay:
try:
delay = float(
re.search(
r'submit\(\);\r?\n\s*},\s*([0-9]+)',
resp.text
).group(1)
) / float(1000)
if isinstance(delay, (int, float)):
self.delay = delay
except (AttributeError, ValueError):
self.simpleException(
CloudflareIUAMError,
"Cloudflare IUAM possibility malformed, issue extracing delay value."
)
sleep(self.delay)
# ------------------------------------------------------------------------------- #
submit_url = self.IUAM_Challenge_Response(
resp.text,
resp.url,
self.interpreter
)
# ------------------------------------------------------------------------------- #
# Send the Challenge Response back to Cloudflare
# ------------------------------------------------------------------------------- #
if submit_url:
def updateAttr(obj, name, newValue):
try:
obj[name].update(newValue)
return obj[name]
except (AttributeError, KeyError):
obj[name] = {}
obj[name].update(newValue)
return obj[name]
cloudflare_kwargs = deepcopy(kwargs)
cloudflare_kwargs['allow_redirects'] = False
cloudflare_kwargs['data'] = updateAttr(
cloudflare_kwargs,
'data',
submit_url['data']
)
urlParsed = urlparse(resp.url)
cloudflare_kwargs['headers'] = updateAttr(
cloudflare_kwargs,
'headers',
{
'Origin': '{}://{}'.format(urlParsed.scheme, urlParsed.netloc),
'Referer': resp.url
}
)
challengeSubmitResponse = self.request(
'POST',
submit_url['url'],
**cloudflare_kwargs
)
# ------------------------------------------------------------------------------- #
# Return response if Cloudflare is doing content pass through instead of 3xx
# else request with redirect URL also handle protocol scheme change http -> https
# ------------------------------------------------------------------------------- #
if not challengeSubmitResponse.is_redirect:
return challengeSubmitResponse
else:
cloudflare_kwargs = deepcopy(kwargs)
cloudflare_kwargs['headers'] = updateAttr(
cloudflare_kwargs,
'headers',
{'Referer': challengeSubmitResponse.url}
)
if not urlparse(challengeSubmitResponse.headers['Location']).netloc:
redirect_location = urljoin(
challengeSubmitResponse.url,
challengeSubmitResponse.headers['Location']
)
else:
redirect_location = challengeSubmitResponse.headers['Location']
return self.request(
resp.request.method,
redirect_location,
**cloudflare_kwargs
)
# ------------------------------------------------------------------------------- #
# We shouldn't be here...
# Re-request the original query and/or process again....
# ------------------------------------------------------------------------------- #
return self.request(resp.request.method, resp.url, **kwargs)
# ------------------------------------------------------------------------------- #
@classmethod
def create_scraper(cls, sess=None, **kwargs):
@@ -247,24 +590,30 @@ class CloudScraper(Session):
scraper = cls(**kwargs)
if sess:
attrs = ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']
for attr in attrs:
for attr in ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']:
val = getattr(sess, attr, None)
if val:
setattr(scraper, attr, val)
return scraper
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
# Functions for integrating cloudscraper with other applications and scripts
# ------------------------------------------------------------------------------- #
@classmethod
def get_tokens(cls, url, **kwargs):
scraper = cls.create_scraper(
debug=kwargs.pop('debug', False),
delay=kwargs.pop('delay', None),
interpreter=kwargs.pop('interpreter', 'js2py'),
allow_brotli=kwargs.pop('allow_brotli', True),
**{
field: kwargs.pop(field, None) for field in [
'allow_brotli',
'browser',
'debug',
'delay',
'interpreter',
'recaptcha'
] if field in kwargs
}
)
try:
@@ -283,7 +632,11 @@ class CloudScraper(Session):
cookie_domain = d
break
else:
raise ValueError('Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?')
cls.simpleException(
CloudflareIUAMError,
"Unable to find Cloudflare cookies. Does the site actually "
"have Cloudflare IUAM (I'm Under Attack Mode) enabled?"
)
return (
{
@@ -293,7 +646,7 @@ class CloudScraper(Session):
scraper.headers['User-Agent']
)
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
@classmethod
def get_cookie_string(cls, url, **kwargs):
@@ -304,7 +657,18 @@ class CloudScraper(Session):
return '; '.join('='.join(pair) for pair in tokens.items()), user_agent
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
# Import-time check: print a deprecation notice when the OpenSSL build that
# this interpreter is linked against is older than 1.1.1. The notice is only
# informational; nothing is raised and execution continues.
if ssl.OPENSSL_VERSION_INFO < (1, 1, 1):
    print(
        "DEPRECATION: The OpenSSL being used by this python install ({}) does not meet the minimum supported "
        "version (>= OpenSSL 1.1.1) in order to support TLS 1.3 required by Cloudflare, "
        "You may encounter an unexpected reCaptcha or cloudflare 1020 blocks.".format(
            ssl.OPENSSL_VERSION
        )
    )
# ------------------------------------------------------------------------------- #
create_scraper = CloudScraper.create_scraper
get_tokens = CloudScraper.get_tokens
@@ -0,0 +1,99 @@
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------- #
"""
cloudscraper.exceptions
~~~~~~~~~~~~~~~~~~~
This module contains the set of cloudscraper exceptions.
"""
# ------------------------------------------------------------------------------- #
class CloudflareException(Exception):
    """
    Base exception class for all Cloudflare related cloudscraper errors.
    """


class CloudflareLoopProtection(CloudflareException):
    """
    Raise an exception for recursive depth protection.
    """


class CloudflareCode1020(CloudflareException):
    """
    Raise an exception for a Cloudflare code 1020 block.
    """


class CloudflareIUAMError(CloudflareException):
    """
    Raise an error for a problem extracting IUAM parameters
    from the Cloudflare payload.
    """


class CloudflareReCaptchaError(CloudflareException):
    """
    Raise an error for a problem extracting reCaptcha parameters
    from the Cloudflare payload.
    """


class CloudflareReCaptchaProvider(CloudflareException):
    """
    Raise an exception when no reCaptcha provider is loaded for Cloudflare.
    """
# ------------------------------------------------------------------------------- #
class reCaptchaException(Exception):
    """
    Base exception class for cloudscraper reCaptcha providers.
    """


class reCaptchaServiceUnavailable(reCaptchaException):
    """
    Raise an exception for external services that cannot be reached.
    """


class reCaptchaAPIError(reCaptchaException):
    """
    Raise an error for an error reported in an API response.
    """


class reCaptchaAccountError(reCaptchaException):
    """
    Raise an error for a reCaptcha provider account problem.
    """


class reCaptchaTimeout(reCaptchaException):
    """
    Raise an exception for a reCaptcha provider taking too long.
    """


class reCaptchaParameter(reCaptchaException):
    """
    Raise an exception for a bad or missing parameter.
    """


class reCaptchaBadJobID(reCaptchaException):
    """
    Raise an exception for an invalid job id.
    """


class reCaptchaReportError(reCaptchaException):
    """
    Raise an error when the reCaptcha provider is unable to report a bad solve.
    """
@@ -1,4 +1,3 @@
import re
import sys
import logging
import abc
@@ -8,20 +7,24 @@ if sys.version_info >= (3, 4):
else:
ABC = abc.ABCMeta('ABC', (), {})
##########################################################################################################################################################
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
interpreters = {}
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
# ------------------------------------------------------------------------------- #
class JavaScriptInterpreter(ABC):
# ------------------------------------------------------------------------------- #
@abc.abstractmethod
def __init__(self, name):
interpreters[name] = self
# ------------------------------------------------------------------------------- #
@classmethod
def dynamicImport(cls, name):
if name not in interpreters:
@@ -35,55 +38,17 @@ class JavaScriptInterpreter(ABC):
return interpreters[name]
# ------------------------------------------------------------------------------- #
@abc.abstractmethod
def eval(self, jsEnv, js):
pass
# ------------------------------------------------------------------------------- #
def solveChallenge(self, body, domain):
try:
js = re.search(
r'setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n',
body
).group(1)
except Exception:
raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT))
js = re.sub(r'\s{2,}', ' ', js, flags=re.MULTILINE | re.DOTALL).replace('\'; 121\'', '')
js += '\na.value;'
jsEnv = '''
String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
var document = {{
createElement: function () {{
return {{ firstChild: {{ href: "https://{domain}/" }} }}
}},
getElementById: function () {{
return {{"innerHTML": "{innerHTML}"}};
}}
}};
'''
try:
innerHTML = re.search(
r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)</div>',
body,
re.MULTILINE | re.DOTALL
)
innerHTML = innerHTML.group(2) if innerHTML else ''
except: # noqa
logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
raise
try:
result = self.eval(
re.sub(r'\s{2,}', ' ', jsEnv.format(domain=domain, innerHTML=innerHTML), flags=re.MULTILINE | re.DOTALL),
js
)
float(result)
return float(self.eval(body, domain))
except Exception:
logging.error('Error executing Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
raise
return result
@@ -0,0 +1,103 @@
from __future__ import absolute_import
import os
import sys
import ctypes.util
from ctypes import c_void_p, c_size_t, byref, create_string_buffer, CDLL
from . import JavaScriptInterpreter
from .encapsulated import template
# ------------------------------------------------------------------------------- #
class ChallengeInterpreter(JavaScriptInterpreter):
    """
    JavaScript interpreter backend that evaluates the challenge payload with
    the ChakraCore shared library, driven through ``ctypes``.
    """

    # ------------------------------------------------------------------------------- #

    def __init__(self):
        # Registers this backend under the name 'chakracore'.
        super(ChallengeInterpreter, self).__init__('chakracore')

    # ------------------------------------------------------------------------------- #

    def eval(self, body, domain):
        """
        Run the challenge built by ``template(body, domain)`` in ChakraCore.

        :param body: HTML of the challenge page, passed through to ``template``.
        :param domain: host name, passed through to ``template``.
        :return: the string conversion of the script result, as the raw
            ``value`` of the ctypes buffer it was copied into.
        :raises RuntimeError: when the ChakraCore library cannot be located
            or cannot be loaded.
        """
        chakraCoreLibrary = None

        # Check the current working directory first and stop at the first
        # library file found (the original loop used `continue`, so it kept
        # scanning and a later candidate silently overrode an earlier one).
        for _libraryFile in ['libChakraCore.so', 'libChakraCore.dylib', 'ChakraCore.dll']:
            if os.path.isfile(os.path.join(os.getcwd(), _libraryFile)):
                chakraCoreLibrary = os.path.join(os.getcwd(), _libraryFile)
                break

        # Fall back to the system library search paths.
        if not chakraCoreLibrary:
            chakraCoreLibrary = ctypes.util.find_library('ChakraCore')
            if not chakraCoreLibrary:
                sys.tracebacklimit = 0
                raise RuntimeError(
                    'ChakraCore library not found in current path or any of your system library paths, '
                    'please download from https://www.github.com/VeNoMouS/cloudscraper/tree/ChakraCore/, '
                    'or https://github.com/Microsoft/ChakraCore/'
                )

        try:
            chakraCore = CDLL(chakraCoreLibrary)
        except OSError:
            sys.tracebacklimit = 0
            raise RuntimeError('There was an error loading the ChakraCore library {}'.format(chakraCoreLibrary))

        # On non-Windows platforms DllMain is invoked by hand.
        # NOTE(review): 1 and 2 are presumably the process/thread attach
        # reasons -- confirm against the ChakraCore embedding docs.
        if sys.platform != 'win32':
            chakraCore.DllMain(0, 1, 0)
            chakraCore.DllMain(0, 2, 0)

        script = create_string_buffer(template(body, domain).encode('utf-16'))

        runtime = c_void_p()
        chakraCore.JsCreateRuntime(0, 0, byref(runtime))

        context = c_void_p()
        chakraCore.JsCreateContext(runtime, byref(context))
        chakraCore.JsSetCurrentContext(context)

        fname = c_void_p()
        chakraCore.JsCreateString(
            'iuam-challenge.js',
            len('iuam-challenge.js'),
            byref(fname)
        )

        scriptSource = c_void_p()
        chakraCore.JsCreateExternalArrayBuffer(
            script,
            len(script),
            0,
            0,
            byref(scriptSource)
        )

        # NOTE(review): 0x02 presumably flags the buffer as UTF-16 encoded -- confirm.
        jsResult = c_void_p()
        chakraCore.JsRun(scriptSource, 0, fname, 0x02, byref(jsResult))

        resultJSString = c_void_p()
        chakraCore.JsConvertValueToString(jsResult, byref(resultJSString))

        # First call only queries the length, second call copies the string.
        stringLength = c_size_t()
        chakraCore.JsCopyString(resultJSString, 0, 0, byref(stringLength))

        resultSTR = create_string_buffer(stringLength.value + 1)
        chakraCore.JsCopyString(
            resultJSString,
            byref(resultSTR),
            stringLength.value + 1,
            0
        )

        chakraCore.JsDisposeRuntime(runtime)

        return resultSTR.value
# ------------------------------------------------------------------------------- #
ChallengeInterpreter()
@@ -0,0 +1,58 @@
import logging
import re
# ------------------------------------------------------------------------------- #
def template(body, domain):
    """
    Build the JavaScript payload used to solve a Cloudflare IUAM challenge.

    The challenge script is cut out of ``body``, prefixed with a small stub
    of the ``document`` object it expects, and returned as one string whose
    last statement is ``a.value;``.

    :param body: HTML of the challenge page.
    :param domain: host name substituted into the stubbed ``href``.
    :return: the JavaScript source (environment stub followed by challenge),
        with every run of two or more whitespace characters collapsed to a
        single space.
    :raises ValueError: when the challenge script cannot be found in ``body``.
    """
    BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'

    try:
        js = re.search(
            r'setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n',
            body
        ).group(1)
    except Exception:
        raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT))

    js = re.sub(r'\s{2,}', ' ', js, flags=re.MULTILINE | re.DOTALL).replace('\'; 121\'', '')
    js += '\na.value;'

    # Doubled braces are literal braces for str.format().
    jsEnv = '''
        String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
        var document = {{
            createElement: function () {{
                return {{ firstChild: {{ href: "https://{domain}/" }} }}
            }},
            getElementById: function () {{
                return {{"innerHTML": "{innerHTML}"}};
            }}
        }};
    '''

    try:
        innerHTML = re.search(
            r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)</div>',
            body,
            re.MULTILINE | re.DOTALL
        )
        innerHTML = innerHTML.group(2) if innerHTML else ''
    except Exception:
        logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
        raise

    # The flags must be passed by keyword: the fourth positional argument of
    # re.sub() is `count`, so passing them positionally limited the collapse
    # to the first 24 whitespace runs.
    environment = re.sub(
        r'\s{2,}',
        ' ',
        jsEnv.format(
            domain=domain,
            innerHTML=innerHTML
        ),
        flags=re.MULTILINE | re.DOTALL
    )

    return '{}{}'.format(environment, js)
# ------------------------------------------------------------------------------- #
@@ -6,27 +6,39 @@ import base64
from . import JavaScriptInterpreter
from .encapsulated import template
from .jsunfuck import jsunfuck
# ------------------------------------------------------------------------------- #
class ChallengeInterpreter(JavaScriptInterpreter):
# ------------------------------------------------------------------------------- #
def __init__(self):
super(ChallengeInterpreter, self).__init__('js2py')
def eval(self, jsEnv, js):
# ------------------------------------------------------------------------------- #
def eval(self, body, domain):
jsPayload = template(body, domain)
if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1':
logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.')
js = jsunfuck(js)
jsPayload = jsunfuck(jsPayload)
def atob(s):
return base64.b64decode('{}'.format(s)).decode('utf-8')
js2py.disable_pyimport()
context = js2py.EvalJs({'atob': atob})
result = context.eval('{}{}'.format(jsEnv, js))
result = context.eval(jsPayload)
return result
# ------------------------------------------------------------------------------- #
ChallengeInterpreter()
@@ -0,0 +1,120 @@
from __future__ import absolute_import
import re
import operator as op
from . import JavaScriptInterpreter
# ------------------------------------------------------------------------------- #
class ChallengeInterpreter(JavaScriptInterpreter):
    """
    Pure-Python backend: solves the challenge arithmetic by pattern matching
    on the page instead of executing JavaScript.
    """

    def __init__(self):
        # Registers this backend under the name 'native'.
        super(ChallengeInterpreter, self).__init__('native')

    def eval(self, body, domain):
        """
        Compute the challenge answer from the page HTML.

        :param body: HTML of the challenge page.
        :param domain: host name; its length and individual characters feed
            into the answer for some challenge variants.
        :return: the answer formatted with ten decimal places, as a string.
        """

        # ------------------------------------------------------------------------------- #

        # Maps the operator character of a compound assignment to its function.
        operators = {
            '+': op.add,
            '-': op.sub,
            '*': op.mul,
            '/': op.truediv
        }

        # ------------------------------------------------------------------------------- #

        def jsfuckToNumber(jsFuck):
            # Rewrites the `!+[]` / `!![]` tokens to 1 and `[]` to 0, then for
            # every parenthesised group of digits and signs sums the integers
            # inside it; the sums are concatenated as digits and parsed as int.
            t = ''
            split_numbers = re.compile(r'-?\d+').findall

            for i in re.findall(
                r'\((?:\d|\+|\-)*\)',
                jsFuck.replace('!+[]', '1').replace('!![]', '1').replace('[]', '0').lstrip('+').replace('(+', '(')
            ):
                t = '{}{}'.format(t, sum(int(x) for x in split_numbers(i)))

            return int(t)

        # ------------------------------------------------------------------------------- #

        def divisorMath(payload, needle, domain):
            # Evaluates `numerator / denominator`, split on '/'. When `needle`
            # occurs in the denominator, the denominator is a number combined
            # (via the captured operator) with the character code of the
            # domain character at an index decoded from the payload.
            jsfuckMath = payload.split('/')
            if needle in jsfuckMath[1]:
                expression = re.findall(r"^(.*?)(.)\(function", jsfuckMath[1])[0]

                expression_value = operators[expression[1]](
                    float(jsfuckToNumber(expression[0])),
                    float(ord(domain[jsfuckToNumber(jsfuckMath[1][
                        jsfuckMath[1].find('"("+p+")")}') + len('"("+p+")")}'):-2
                    ])]))
                )
            else:
                expression_value = jsfuckToNumber(jsfuckMath[1])

            expression_value = jsfuckToNumber(jsfuckMath[0]) / float(expression_value)

            return expression_value

        # ------------------------------------------------------------------------------- #

        def challengeSolve(body, domain):
            jschl_answer = 0

            # Named groups: `variable` (name used in the compound assignments),
            # `init` (initial value expression), `challenge` (statements up to
            # `a.value`) and optional `k` (content of the `cf-dn-*` element).
            # NOTE(review): a page that does not match makes re.search return
            # None, so .groupdict() raises AttributeError here.
            jsfuckChallenge = re.search(
                r"setTimeout\(function\(\){\s+var.*?f,\s*(?P<variable>\w+).*?:(?P<init>\S+)};"
                r".*?\('challenge-form'\);\s+;(?P<challenge>.*?a\.value)"
                r"(?:.*id=\"cf-dn-.*?>(?P<k>\S+)<)?",
                body,
                re.DOTALL | re.MULTILINE
            ).groupdict()

            # Replace the raw challenge text by an iterator over its
            # `<variable>... <op>= <expression>;` statements.
            jsfuckChallenge['challenge'] = re.finditer(
                r'{}.*?([+\-*/])=(.*?);(?=a\.value|{})'.format(
                    jsfuckChallenge['variable'],
                    jsfuckChallenge['variable']
                ),
                jsfuckChallenge['challenge']
            )

            # ------------------------------------------------------------------------------- #

            # Seed the answer with the initial value (a plain number or a quotient).
            if '/' in jsfuckChallenge['init']:
                val = jsfuckChallenge['init'].split('/')
                jschl_answer = jsfuckToNumber(val[0]) / float(jsfuckToNumber(val[1]))
            else:
                jschl_answer = jsfuckToNumber(jsfuckChallenge['init'])

            # ------------------------------------------------------------------------------- #

            # Apply each compound assignment in page order.
            for expressionMatch in jsfuckChallenge['challenge']:
                oper, expression = expressionMatch.groups()

                if '/' in expression:
                    expression_value = divisorMath(expression, 'function(p)', domain)
                else:
                    if 'Element' in expression:
                        # The operand is taken from the `cf-dn-*` element content.
                        expression_value = divisorMath(jsfuckChallenge['k'], '"("+p+")")}', domain)
                    else:
                        expression_value = jsfuckToNumber(expression)

                jschl_answer = operators[oper](jschl_answer, expression_value)

            # ------------------------------------------------------------------------------- #

            # Variant without a `cf-dn-*` element: the page adds the domain length.
            if not jsfuckChallenge['k'] and '+ t.length' in body:
                jschl_answer += len(domain)

            # ------------------------------------------------------------------------------- #

            return '{0:.10f}'.format(jschl_answer)

        # ------------------------------------------------------------------------------- #

        return challengeSolve(body, domain)
# ------------------------------------------------------------------------------- #
ChallengeInterpreter()
@@ -1,22 +1,23 @@
import base64
import logging
import subprocess
import sys
from . import JavaScriptInterpreter
from .encapsulated import template
##########################################################################################################################################################
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
class ChallengeInterpreter(JavaScriptInterpreter):
# ------------------------------------------------------------------------------- #
def __init__(self):
super(ChallengeInterpreter, self).__init__('nodejs')
def eval(self, jsEnv, js):
# ------------------------------------------------------------------------------- #
def eval(self, body, domain):
try:
js = 'var atob = function(str) {return Buffer.from(str, "base64").toString("binary");};' \
'var challenge = atob("%s");' \
@@ -24,23 +25,25 @@ class ChallengeInterpreter(JavaScriptInterpreter):
'var options = {filename: "iuam-challenge.js", timeout: 4000};' \
'var answer = require("vm").runInNewContext(challenge, context, options);' \
'process.stdout.write(String(answer));' \
% base64.b64encode('{}{}'.format(jsEnv, js).encode('UTF-8')).decode('ascii')
% base64.b64encode(template(body, domain).encode('UTF-8')).decode('ascii')
return subprocess.check_output(['node', '-e', js])
except OSError as e:
if e.errno == 2:
raise EnvironmentError(
'Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, '
'in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cloudscraper'
' README\'s Dependencies section: https://github.com/VeNoMouS/cloudscraper#dependencies.'
'Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`).\n\n'
'Your Node binary may be called `nodejs` rather than `node`, '
'in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems.\n\n'
'(Please read the cloudscraper README\'s Dependencies section: '
'https://github.com/VeNoMouS/cloudscraper#dependencies.)'
)
raise
except Exception:
logging.error('Error executing Cloudflare IUAM Javascript. %s' % BUG_REPORT)
raise
sys.tracebacklimit = 0
raise RuntimeError('Error executing Cloudflare IUAM Javascript in nodejs')
pass
# ------------------------------------------------------------------------------- #
ChallengeInterpreter()
@@ -0,0 +1,33 @@
from __future__ import absolute_import
import sys
try:
import v8eval
except ImportError:
sys.tracebacklimit = 0
raise RuntimeError('Please install the python module v8eval either via pip or download it from https://github.com/sony/v8eval')
from . import JavaScriptInterpreter
from .encapsulated import template
# ------------------------------------------------------------------------------- #
class ChallengeInterpreter(JavaScriptInterpreter):
    """
    JavaScript interpreter backend that evaluates the challenge payload with
    the ``v8eval`` module.
    """

    def __init__(self):
        # Registers this backend under the name 'v8'.
        super(ChallengeInterpreter, self).__init__('v8')

    # ------------------------------------------------------------------------------- #

    def eval(self, body, domain):
        """
        Run the challenge built by ``template(body, domain)`` in V8.

        :param body: HTML of the challenge page, passed through to ``template``.
        :param domain: host name, passed through to ``template``.
        :return: the value produced by ``v8eval.V8().eval``.
        :raises RuntimeError: when V8 reports an error or rejects the input.
        """
        try:
            return v8eval.V8().eval(template(body, domain))
        except (TypeError, v8eval.V8Error):
            # The exception used to be constructed but never raised, which
            # made failures fall through and return None.
            raise RuntimeError('We encountered an error running the V8 Engine.')
# ------------------------------------------------------------------------------- #
ChallengeInterpreter()
@@ -0,0 +1,236 @@
from __future__ import absolute_import
import requests
from ..exceptions import (
reCaptchaServiceUnavailable,
reCaptchaAPIError,
reCaptchaTimeout,
reCaptchaParameter,
reCaptchaBadJobID,
reCaptchaReportError
)
try:
import polling
except ImportError:
raise ImportError(
"Please install the python module 'polling' via pip or download it from "
"https://github.com/justiniso/polling/"
)
from . import reCaptcha
class captchaSolver(reCaptcha):
    """
    reCaptcha provider backed by the 2captcha.com HTTP API.

    A job is submitted to ``in.php`` and its result is polled from
    ``res.php``; both are requested with ``json=1``.
    """

    def __init__(self):
        # Registers this provider under the name '2captcha'.
        super(captchaSolver, self).__init__('2captcha')
        self.host = 'https://2captcha.com'
        self.session = requests.Session()

    # ------------------------------------------------------------------------------- #

    @staticmethod
    def checkErrorStatus(response, request_type):
        """
        Raise when ``response`` carries a server error or a known API error.

        :param response: HTTP response returned by the 2captcha API.
        :param request_type: ``'in.php'`` or ``'res.php'``; selects which
            table of error codes applies.
        :raises reCaptchaServiceUnavailable: on HTTP 500 or 502.
        :raises reCaptchaAPIError: when the JSON payload has ``status`` 0 and
            ``request`` holds one of the known error codes.
        """
        if response.status_code in [500, 502]:
            raise reCaptchaServiceUnavailable('2Captcha: Server Side Error {}'.format(response.status_code))

        errors = {
            'in.php': {
                "ERROR_WRONG_USER_KEY": "You've provided api_key parameter value is in incorrect format, it should contain 32 symbols.",
                "ERROR_KEY_DOES_NOT_EXIST": "The api_key you've provided does not exists.",
                "ERROR_ZERO_BALANCE": "You don't have sufficient funds on your account.",
                "ERROR_PAGEURL": "pageurl parameter is missing in your request.",
                "ERROR_NO_SLOT_AVAILABLE":
                    "No Slots Available.\nYou can receive this error in two cases:\n"
                    "1. If you solve ReCaptcha: the queue of your captchas that are not distributed to workers is too long. "
                    "Queue limit changes dynamically and depends on total amount of captchas awaiting solution and usually it's between 50 and 100 captchas.\n"
                    "2. If you solve Normal Captcha: your maximum rate for normal captchas is lower than current rate on the server."
                    "You can change your maximum rate in your account's settings.",
                "ERROR_IP_NOT_ALLOWED": "The request is sent from the IP that is not on the list of your allowed IPs.",
                "IP_BANNED": "Your IP address is banned due to many frequent attempts to access the server using wrong authorization keys.",
                "ERROR_BAD_TOKEN_OR_PAGEURL":
                    "You can get this error code when sending ReCaptcha V2. "
                    "That happens if your request contains invalid pair of googlekey and pageurl. "
                    "The common reason for that is that ReCaptcha is loaded inside an iframe hosted on another domain/subdomain.",
                "ERROR_GOOGLEKEY":
                    "You can get this error code when sending ReCaptcha V2. "
                    "That means that sitekey value provided in your request is incorrect: it's blank or malformed.",
                "MAX_USER_TURN": "You made more than 60 requests within 3 seconds.Your account is banned for 10 seconds. Ban will be lifted automatically."
            },
            'res.php': {
                "ERROR_CAPTCHA_UNSOLVABLE":
                    "We are unable to solve your captcha - three of our workers were unable solve it "
                    "or we didn't get an answer within 90 seconds (300 seconds for ReCaptcha V2). "
                    "We will not charge you for that request.",
                "ERROR_WRONG_USER_KEY": "You've provided api_key parameter value in incorrect format, it should contain 32 symbols.",
                "ERROR_KEY_DOES_NOT_EXIST": "The api_key you've provided does not exists.",
                "ERROR_WRONG_ID_FORMAT": "You've provided captcha ID in wrong format. The ID can contain numbers only.",
                "ERROR_WRONG_CAPTCHA_ID": "You've provided incorrect captcha ID.",
                "ERROR_BAD_DUPLICATES":
                    "Error is returned when 100% accuracy feature is enabled. "
                    "The error means that max numbers of tries is reached but min number of matches not found.",
                "REPORT_NOT_RECORDED": "Error is returned to your complain request if you already complained lots of correctly solved captchas.",
                "ERROR_IP_ADDRES":
                    "You can receive this error code when registering a pingback (callback) IP or domain."
                    "That happes if your request is coming from an IP address that doesn't match the IP address of your pingback IP or domain.",
                "ERROR_TOKEN_EXPIRED": "You can receive this error code when sending GeeTest. That error means that challenge value you provided is expired.",
                "ERROR_EMPTY_ACTION": "Action parameter is missing or no value is provided for action parameter."
            }
        }

        payload = response.json()

        # The success checks in this class compare `status` with the integer
        # 1, so a failure is the integer 0. The former `is False` identity
        # test could never match 0, which left API errors unreported.
        if payload.get('status') == 0 and payload.get('request') in errors.get(request_type, {}):
            raise reCaptchaAPIError(
                '{} {}'.format(
                    payload.get('request'),
                    errors.get(request_type).get(payload.get('request'))
                )
            )

    # ------------------------------------------------------------------------------- #

    def reportJob(self, jobID):
        """
        Report job ``jobID`` as badly solved (``action=reportbad``).

        :return: ``True`` once the API acknowledges the report.
        :raises reCaptchaBadJobID: when ``jobID`` is empty.
        :raises reCaptchaReportError: when polling yields no response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "2Captcha: Error bad job id to request reCaptcha."
            )

        def _checkRequest(response):
            if response.ok and response.json().get('status') == 1:
                return response

            self.checkErrorStatus(response, 'res.php')

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/res.php'.format(self.host),
                params={
                    'key': self.api_key,
                    'action': 'reportbad',
                    'id': jobID,
                    'json': '1'
                }
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return True
        else:
            raise reCaptchaReportError(
                "2Captcha: Error - Failed to report bad reCaptcha solve."
            )

    # ------------------------------------------------------------------------------- #

    def requestJob(self, jobID):
        """
        Poll ``res.php`` until job ``jobID`` is solved.

        :return: the ``request`` field of the successful JSON reply.
        :raises reCaptchaBadJobID: when ``jobID`` is empty.
        :raises reCaptchaTimeout: when polling yields no response.
        """
        if not jobID:
            raise reCaptchaBadJobID("2Captcha: Error bad job id to request reCaptcha.")

        def _checkRequest(response):
            if response.ok and response.json().get('status') == 1:
                return response

            self.checkErrorStatus(response, 'res.php')

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/res.php'.format(self.host),
                params={
                    'key': self.api_key,
                    'action': 'get',
                    'id': jobID,
                    'json': '1'
                }
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return response.json().get('request')
        else:
            raise reCaptchaTimeout(
                "2Captcha: Error failed to solve reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestSolve(self, site_url, site_key):
        """
        Submit a new reCaptcha job to ``in.php``.

        :param site_url: URL of the page showing the captcha (``pageurl``).
        :param site_key: site key of the captcha (``googlekey``).
        :return: the ``request`` field of the successful JSON reply, used by
            the caller as the job id.
        :raises reCaptchaBadJobID: when polling yields no response.
        """
        def _checkRequest(response):
            if response.ok and response.json().get("status") == 1 and response.json().get('request'):
                return response

            self.checkErrorStatus(response, 'in.php')

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/in.php'.format(self.host),
                data={
                    'key': self.api_key,
                    'method': 'userrecaptcha',
                    'googlekey': site_key,
                    'pageurl': site_url,
                    'json': '1',
                    'soft_id': '5507698'
                },
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return response.json().get('request')
        else:
            raise reCaptchaBadJobID(
                '2Captcha: Error no job id was returned.'
            )

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """
        Solve a reCaptcha through 2captcha and return the answer.

        :param site_url: URL of the page showing the captcha.
        :param site_key: site key of the captcha.
        :param reCaptchaParams: provider settings; ``api_key`` is required,
            ``proxy`` / ``proxies`` are optional.
        :return: the value returned by ``requestJob``.
        :raises reCaptchaParameter: when ``api_key`` is missing.
        :raises reCaptchaTimeout: when solving times out; a bad-solve report
            is attempted first if a job id had already been obtained.
        """
        jobID = None

        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter(
                "2Captcha: Missing api_key parameter."
            )

        self.api_key = reCaptchaParams.get('api_key')

        # NOTE(review): gated on 'proxy' but reads 'proxies' -- presumably
        # 'proxy' is an opt-in flag supplied by the user; confirm.
        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            try:
                if jobID:
                    self.reportJob(jobID)
            except polling.TimeoutException:
                raise reCaptchaTimeout(
                    "2Captcha: reCaptcha solve took to long and also failed reporting the job the job id {}.".format(jobID)
                )

            raise reCaptchaTimeout(
                "2Captcha: reCaptcha solve took to long to execute job id {}, aborting.".format(jobID)
            )
# ------------------------------------------------------------------------------- #
captchaSolver()
@@ -0,0 +1,207 @@
from __future__ import absolute_import
import re
import requests
try:
import polling
except ImportError:
raise ImportError(
"Please install the python module 'polling' via pip or download it from "
"https://github.com/justiniso/polling/"
)
from ..exceptions import (
reCaptchaServiceUnavailable,
reCaptchaAPIError,
reCaptchaTimeout,
reCaptchaParameter,
reCaptchaBadJobID
)
from . import reCaptcha
class captchaSolver(reCaptcha):
def __init__(self):
    """Register the provider as '9kw' and set its endpoint, timeout and HTTP session."""
    super(captchaSolver, self).__init__('9kw')
    # One session is kept for all API calls made by this provider.
    self.session = requests.Session()
    # Upper bound in seconds; also sent to the API as `maxtimeout`.
    self.maxtimeout = 180
    self.host = 'https://www.9kw.eu/index.cgi'
# ------------------------------------------------------------------------------- #
@staticmethod
def checkErrorStatus(response):
    """
    Raise when ``response`` carries a server error or a 9kw error code.

    JSON replies (text starting with ``{``) are checked for an ``error``
    field; any other reply is checked for a leading ``00<digits>`` code.
    A reply with neither is treated as "no error".

    :param response: HTTP response returned by the 9kw API.
    :raises reCaptchaServiceUnavailable: on HTTP 500 or 502.
    :raises reCaptchaAPIError: when an error code is present.
    """
    if response.status_code in [500, 502]:
        raise reCaptchaServiceUnavailable(
            '9kw: Server Side Error {}'.format(response.status_code)
        )

    error_codes = {
        1: 'No API Key available.',
        2: 'No API key found.',
        3: 'No active API key found.',
        4: 'API Key has been disabled by the operator. ',
        5: 'No user found.',
        6: 'No data found.',
        7: 'Found No ID.',
        8: 'found No captcha.',
        9: 'No image found.',
        10: 'Image size not allowed.',
        11: 'credit is not sufficient.',
        12: 'what was done.',
        13: 'No answer contain.',
        14: 'Captcha already been answered.',
        15: 'Captcha to quickly filed.',
        16: 'JD check active.',
        17: 'Unknown problem.',
        18: 'Found No ID.',
        19: 'Incorrect answer.',
        20: 'Do not timely filed (Incorrect UserID).',
        21: 'Link not allowed.',
        22: 'Prohibited submit.',
        23: 'Entering prohibited.',
        24: 'Too little credit.',
        25: 'No entry found.',
        26: 'No Conditions accepted.',
        27: 'No coupon code found in the database.',
        28: 'Already unused voucher code.',
        29: 'maxTimeout under 60 seconds.',
        30: 'User not found.',
        31: 'An account is not yet 24 hours in system.',
        32: 'An account does not have the full rights.',
        33: 'Plugin needed a update.',
        34: 'No HTTPS allowed.',
        35: 'No HTTP allowed.',
        36: 'Source not allowed.',
        37: 'Transfer denied.',
        38: 'Incorrect answer without space',
        39: 'Incorrect answer with space',
        40: 'Incorrect answer with not only numbers',
        41: 'Incorrect answer with not only A-Z, a-z',
        42: 'Incorrect answer with not only 0-9, A-Z, a-z',
        43: 'Incorrect answer with not only [0-9,- ]',
        44: 'Incorrect answer with not only [0-9A-Za-z,- ]',
        45: 'Incorrect answer with not only coordinates',
        46: 'Incorrect answer with not only multiple coordinates',
        47: 'Incorrect answer with not only data',
        48: 'Incorrect answer with not only rotate number',
        49: 'Incorrect answer with not only text',
        50: 'Incorrect answer with not only text and too short',
        51: 'Incorrect answer with not enough chars',
        52: 'Incorrect answer with too many chars',
        53: 'Incorrect answer without no or yes',
        54: 'Assignment was not found.'
    }

    if response.text.startswith('{'):
        if response.json().get('error'):
            raise reCaptchaAPIError(error_codes.get(int(response.json().get('error'))))
    else:
        # re.search() returns None for text without a leading "00<digits>"
        # code; calling .groupdict() on that used to raise AttributeError.
        match = re.search(r'^00(?P<error_code>\d+)', response.text)
        error_code = int(match.group('error_code')) if match else 0
        if error_code:
            raise reCaptchaAPIError(error_codes.get(error_code))
# ------------------------------------------------------------------------------- #
def requestJob(self, jobID):
    """Poll 9kw for the answer belonging to *jobID* and return it.

    The 'usercaptchacorrectdata' action is requested repeatedly (step of 10,
    timeout of ``self.maxtimeout + 10``) until the reply is OK and its
    'answer' field is something other than 'NO DATA'.

    Raises:
        reCaptchaBadJobID: *jobID* is empty.
        reCaptchaTimeout: polling returned a falsy response.
    """
    if not jobID:
        raise reCaptchaBadJobID(
            "9kw: Error bad job id to request reCaptcha against."
        )

    def _fetchAnswer():
        # The query is rebuilt on every attempt, exactly like the polled request.
        return self.session.get(
            self.host,
            params={
                'apikey': self.api_key,
                'action': 'usercaptchacorrectdata',
                'id': jobID,
                'info': 1,
                'json': 1
            }
        )

    def _isAnswered(reply):
        if not reply.ok or reply.json().get('answer') == 'NO DATA':
            # Not solved yet: surface API errors, otherwise keep polling.
            self.checkErrorStatus(reply)
            return None
        return reply

    reply = polling.poll(
        _fetchAnswer,
        check_success=_isAnswered,
        step=10,
        timeout=(self.maxtimeout + 10)
    )

    if not reply:
        raise reCaptchaTimeout("9kw: Error failed to solve reCaptcha.")

    return reply.json().get('answer')
# ------------------------------------------------------------------------------- #
def requestSolve(self, site_url, site_key):
    """Submit a reCaptcha v2 job to 9kw and return its 'captchaid'.

    The 'usercaptchaupload' request is retried (step of 5, timeout of
    ``self.maxtimeout + 10``) until the reply is OK, looks like JSON and
    carries a 'captchaid'.

    Raises:
        reCaptchaBadJobID: polling returned a falsy response.
    """
    def _submitJob():
        return self.session.post(
            self.host,
            data={
                'apikey': self.api_key,
                'action': 'usercaptchaupload',
                'interactive': 1,
                'file-upload-01': site_key,
                'oldsource': 'recaptchav2',
                'pageurl': site_url,
                'maxtimeout': self.maxtimeout,
                'json': 1
            },
            allow_redirects=False
        )

    def _hasCaptchaId(reply):
        accepted = reply.ok and reply.text.startswith('{') and reply.json().get('captchaid')
        if accepted:
            return reply
        # No job id yet: surface API errors, otherwise keep polling.
        self.checkErrorStatus(reply)
        return None

    reply = polling.poll(
        _submitJob,
        check_success=_hasCaptchaId,
        step=5,
        timeout=(self.maxtimeout + 10)
    )

    if not reply:
        raise reCaptchaBadJobID('9kw: Error no valid job id was returned.')

    return reply.json().get('captchaid')
# ------------------------------------------------------------------------------- #
def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
    """Solve a reCaptcha through 9kw and return the answer token.

    Args:
        site_url: URL of the page showing the captcha.
        site_key: reCaptcha site key of that page.
        reCaptchaParams: provider settings. 'api_key' is required;
            'maxtimeout' and 'proxy' (or 'proxies') are optional.

    Raises:
        reCaptchaParameter: 'api_key' is missing.
        reCaptchaTimeout: a polling loop timed out.
    """
    jobID = None

    if not reCaptchaParams.get('api_key'):
        raise reCaptchaParameter("9kw: Missing api_key parameter.")

    self.api_key = reCaptchaParams.get('api_key')

    if reCaptchaParams.get('maxtimeout'):
        self.maxtimeout = reCaptchaParams.get('maxtimeout')

    if reCaptchaParams.get('proxy'):
        # The gate is on 'proxy' but the value used to be read from 'proxies',
        # which replaced the session proxies with None when only 'proxy' was set.
        # An explicit 'proxies' entry still wins for backward compatibility.
        # NOTE(review): assumes 'proxy' holds a requests-style proxies mapping -- confirm with the caller.
        self.session.proxies = reCaptchaParams.get('proxies') or reCaptchaParams.get('proxy')

    try:
        jobID = self.requestSolve(site_url, site_key)
        return self.requestJob(jobID)
    except polling.TimeoutException:
        raise reCaptchaTimeout(
            "9kw: reCaptcha solve took to long to execute 'captchaid' {}, aborting.".format(jobID)
        )
# ------------------------------------------------------------------------------- #
# Instantiate the solver once at import time.
# NOTE(review): presumably this registers the instance with the provider registry
# through the base-class constructor, as the sibling providers do -- confirm
# against this class's __init__, which is not visible here.
captchaSolver()
@@ -0,0 +1,46 @@
import abc
import logging
import sys
# abc.ABC is only used on Python 3.4+; older interpreters get an equivalent
# empty base class built directly from ABCMeta.
ABC = abc.ABC if sys.version_info >= (3, 4) else abc.ABCMeta('ABC', (), {})  # noqa

# ------------------------------------------------------------------------------- #

# Registry of instantiated solvers, keyed by provider name.
captchaSolvers = {}
# ------------------------------------------------------------------------------- #
class reCaptcha(ABC):
    """Abstract base class for anti-reCaptcha providers.

    Constructing a concrete provider stores the instance in the module-level
    ``captchaSolvers`` registry under the name handed to ``__init__``.
    """

    @abc.abstractmethod
    def __init__(self, name):
        # Self-register so dynamicImport() can hand the instance out by name.
        captchaSolvers[name] = self

    # ------------------------------------------------------------------------------- #

    @classmethod
    def dynamicImport(cls, name):
        """Return the provider registered as *name*, importing its module if needed.

        The module ``<this module>.<name>`` is imported when *name* is not yet
        registered; importing is expected to register an instance.

        Raises:
            ImportError: the module cannot be imported, or importing it did not
                register a ``reCaptcha`` instance under *name*.
        """
        if name in captchaSolvers:
            return captchaSolvers[name]

        module_path = '{}.{}'.format(cls.__module__, name)
        try:
            __import__(module_path)
            registered = captchaSolvers.get(name)
            if not isinstance(registered, reCaptcha):
                raise ImportError('The anti reCaptcha provider was not initialized.')
        except ImportError:
            logging.error("Unable to load {} anti reCaptcha provider".format(name))
            raise

        return captchaSolvers[name]

    # ------------------------------------------------------------------------------- #

    @abc.abstractmethod
    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Provider-specific solving routine; concrete providers must override it."""
        pass

    # ------------------------------------------------------------------------------- #

    def solveCaptcha(self, site_url, site_key, reCaptchaParams):
        """Public entry point: delegate to the provider's ``getCaptchaAnswer``."""
        answer = self.getCaptchaAnswer(site_url, site_key, reCaptchaParams)
        return answer
@@ -0,0 +1,49 @@
from __future__ import absolute_import
from ..exceptions import reCaptchaParameter
try:
from python_anticaptcha import (
AnticaptchaClient,
NoCaptchaTaskProxylessTask
)
except ImportError:
raise ImportError(
"Please install the python module 'python_anticaptcha' via pip or download it from "
"https://github.com/ad-m/python-anticaptcha"
)
from . import reCaptcha
class captchaSolver(reCaptcha):
    """reCaptcha provider that solves through the ``python_anticaptcha`` client."""

    def __init__(self):
        # Registers this instance under the provider name 'anticaptcha'.
        super(captchaSolver, self).__init__('anticaptcha')

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve a reCaptcha and return the solution response.

        Args:
            site_url: URL of the page showing the captcha.
            site_key: reCaptcha site key of that page.
            reCaptchaParams: provider settings. 'api_key' is required;
                'proxy' (or 'proxies') is optional.

        Raises:
            reCaptchaParameter: 'api_key' is missing.
            NotImplementedError: the installed client has no ``createTaskSmee``.
        """
        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("anticaptcha: Missing api_key parameter.")

        client = AnticaptchaClient(reCaptchaParams.get('api_key'))

        if reCaptchaParams.get('proxy'):
            # The gate is on 'proxy' but the value used to be read from 'proxies',
            # which replaced the session proxies with None when only 'proxy' was set.
            # An explicit 'proxies' entry still wins for backward compatibility.
            # NOTE(review): assumes 'proxy' holds a requests-style proxies mapping -- confirm with the caller.
            client.session.proxies = reCaptchaParams.get('proxies') or reCaptchaParams.get('proxy')

        task = NoCaptchaTaskProxylessTask(site_url, site_key)

        if not hasattr(client, 'createTaskSmee'):
            raise NotImplementedError(
                "Please upgrade 'python_anticaptcha' via pip or download it from "
                "https://github.com/ad-m/python-anticaptcha"
            )

        job = client.createTaskSmee(task)
        return job.get_solution_response()


# ------------------------------------------------------------------------------- #

# Instantiate once at import time so the provider registers itself.
captchaSolver()
@@ -0,0 +1,227 @@
from __future__ import absolute_import
import json
import requests
try:
import polling
except ImportError:
raise ImportError(
"Please install the python module 'polling' via pip or download it from "
"https://github.com/justiniso/polling/"
)
from ..exceptions import (
reCaptchaServiceUnavailable,
reCaptchaAccountError,
reCaptchaTimeout,
reCaptchaParameter,
reCaptchaBadJobID,
reCaptchaReportError
)
from . import reCaptcha
class captchaSolver(reCaptcha):
    """reCaptcha provider that solves through the DeathByCaptcha HTTP API."""

    def __init__(self):
        # Registers this instance under the provider name 'deathbycaptcha'.
        super(captchaSolver, self).__init__('deathbycaptcha')
        self.host = 'http://api.dbcapi.me/api'
        self.session = requests.Session()

    # ------------------------------------------------------------------------------- #

    @staticmethod
    def checkErrorStatus(response):
        """Raise reCaptchaServiceUnavailable for the HTTP status codes treated as fatal."""
        errors = {
            400: "DeathByCaptcha: 400 Bad Request",
            403: "DeathByCaptcha: 403 Forbidden - Invalid credentails or insufficient credits.",
            # 500 is intentionally left out, so a 500 reply does not raise here.
            # (500, "DeathByCaptcha: 500 Internal Server Error."),
            503: "DeathByCaptcha: 503 Service Temporarily Unavailable."
        }

        if response.status_code in errors:
            raise reCaptchaServiceUnavailable(errors.get(response.status_code))

    # ------------------------------------------------------------------------------- #

    def login(self, username, password):
        """Store the credentials and check the account via POST ``<host>/user``.

        Raises:
            reCaptchaAccountError: the account is banned or has a balance of 0.
            reCaptchaServiceUnavailable: the API answered with a fatal status code.
        """
        self.username = username
        self.password = password

        def _checkRequest(response):
            if response.ok:
                account = response.json()
                if account.get('is_banned'):
                    raise reCaptchaAccountError('DeathByCaptcha: Your account is banned.')

                # The key used to be misspelled 'balanace', so this check never fired.
                if account.get('balance') == 0:
                    raise reCaptchaAccountError('DeathByCaptcha: insufficient credits.')

                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/user'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=120
        )

        # debugRequest is defined neither here nor on the visible base class, so an
        # unconditional call raised AttributeError after every successful login.
        # Only call it when some subclass or mixin actually provides it.
        debugRequest = getattr(self, 'debugRequest', None)
        if callable(debugRequest):
            debugRequest(response)

    # ------------------------------------------------------------------------------- #

    def reportJob(self, jobID):
        """Report *jobID* as failed via POST ``<host>/captcha/<jobID>/report``.

        Returns:
            True once the API answers with status 200.

        Raises:
            reCaptchaBadJobID: *jobID* is empty.
            reCaptchaReportError: polling returned a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "DeathByCaptcha: Error bad job id to report failed reCaptcha."
            )

        def _checkRequest(response):
            if response.status_code == 200:
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha/{}/report'.format(self.host, jobID),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return True
        else:
            raise reCaptchaReportError(
                "DeathByCaptcha: Error report failed reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestJob(self, jobID):
        """Poll GET ``<host>/captcha/<jobID>`` until it has a 'text' and return that text.

        Raises:
            reCaptchaBadJobID: *jobID* is empty.
            reCaptchaTimeout: polling returned a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "DeathByCaptcha: Error bad job id to request reCaptcha."
            )

        def _checkRequest(response):
            if response.ok and response.json().get('text'):
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/captcha/{}'.format(self.host, jobID),
                headers={'Accept': 'application/json'}
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('text')
        else:
            raise reCaptchaTimeout(
                "DeathByCaptcha: Error failed to solve reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestSolve(self, site_url, site_key):
        """Submit a token captcha (type '4') and return the job id from the 'captcha' field.

        Raises:
            reCaptchaBadJobID: polling returned a falsy response.
        """
        def _checkRequest(response):
            if response.ok and response.json().get("is_correct") and response.json().get('captcha'):
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password,
                    'type': '4',
                    'token_params': json.dumps({
                        'googlekey': site_key,
                        'pageurl': site_url
                    })
                },
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('captcha')
        else:
            raise reCaptchaBadJobID(
                'DeathByCaptcha: Error no job id was returned.'
            )

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve a reCaptcha through DeathByCaptcha and return the answer text.

        Args:
            site_url: URL of the page showing the captcha.
            site_key: reCaptcha site key of that page.
            reCaptchaParams: provider settings. 'username' and 'password' are
                required; 'proxy' (or 'proxies') is optional.

        Raises:
            reCaptchaParameter: 'username' or 'password' is missing.
            reCaptchaTimeout: solving timed out; the job is reported as failed
                first when a job id had already been obtained.
        """
        jobID = None

        for param in ['username', 'password']:
            if not reCaptchaParams.get(param):
                raise reCaptchaParameter(
                    "DeathByCaptcha: Missing '{}' parameter.".format(param)
                )
            setattr(self, param, reCaptchaParams.get(param))

        if reCaptchaParams.get('proxy'):
            # The gate is on 'proxy' but the value used to be read from 'proxies',
            # which replaced the session proxies with None when only 'proxy' was set.
            # An explicit 'proxies' entry still wins for backward compatibility.
            # NOTE(review): assumes 'proxy' holds a requests-style proxies mapping -- confirm with the caller.
            self.session.proxies = reCaptchaParams.get('proxies') or reCaptchaParams.get('proxy')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            try:
                if jobID:
                    self.reportJob(jobID)
            except polling.TimeoutException:
                raise reCaptchaTimeout(
                    "DeathByCaptcha: reCaptcha solve took to long and also failed reporting the job id {}.".format(jobID)
                )

            raise reCaptchaTimeout(
                "DeathByCaptcha: reCaptcha solve took to long to execute job id {}, aborting.".format(jobID)
            )


# ------------------------------------------------------------------------------- #

# Instantiate once at import time so the provider registers itself.
captchaSolver()
@@ -1,40 +1,117 @@
import os
import json
import os
import random
import logging
import re
import sys
import ssl
from collections import OrderedDict
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
class User_Agent():
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
def __init__(self, *args, **kwargs):
    """Start with no headers and no cipher suite, then let loadUserAgent fill them in."""
    self.headers, self.cipherSuite = None, []
    self.loadUserAgent(*args, **kwargs)
##########################################################################################################################################################
# ------------------------------------------------------------------------------- #
def loadHeaders(self, user_agents, user_agent_version):
    """Set ``self.headers`` for the current browser and the given release.

    The release's own 'headers' entry is used when it is truthy; otherwise the
    browser's 'default_headers' entry is used.
    """
    browser_entry = user_agents.get(self.browser)
    release_headers = browser_entry.get('releases').get(user_agent_version).get('headers')
    if release_headers:
        self.headers = release_headers
        return
    self.headers = browser_entry.get('default_headers')
# ------------------------------------------------------------------------------- #
def filterAgents(self, releases):
    """Map each release to the User-Agent strings of the enabled platforms.

    Mobile entries come before desktop entries. A release with no non-empty
    list for any enabled platform is left out of the result.
    """
    selected = {}
    for version, entry in releases.items():
        for enabled, platform in ((self.mobile, 'mobile'), (self.desktop, 'desktop')):
            if not enabled:
                continue
            agents = entry['User-Agent'][platform]
            if agents:
                selected[version] = selected.get(version, []) + agents
    return selected
# ------------------------------------------------------------------------------- #
def tryMatchCustom(self, user_agents):
    """Adopt the first known browser release whose User-Agent list contains ``self.custom``.

    On a match the browser, headers (with 'User-Agent' overridden by the custom
    string) and cipher suite are taken from that browser entry.

    Returns:
        True when a match was found and applied, False otherwise.
    """
    for name, spec in user_agents.items():
        for version, release in spec['releases'].items():
            for platform in ('mobile', 'desktop'):
                needle = re.escape(self.custom)
                known_agents = ' '.join(release['User-Agent'][platform])
                if not re.search(needle, known_agents):
                    continue

                self.browser = name
                self.loadHeaders(user_agents, version)
                self.headers['User-Agent'] = self.custom
                self.cipherSuite = user_agents[self.browser].get('cipherSuite', [])
                return True
    return False
# ------------------------------------------------------------------------------- #
def loadUserAgent(self, *args, **kwargs):
browser = kwargs.pop('browser', 'chrome')
self.browser = kwargs.pop('browser', None)
user_agents = json.load(
open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r'),
object_pairs_hook=OrderedDict
)
if isinstance(self.browser, dict):
self.custom = self.browser.get('custom', None)
self.desktop = self.browser.get('desktop', True)
self.mobile = self.browser.get('mobile', True)
self.browser = self.browser.get('browser', None)
else:
self.custom = kwargs.pop('custom', None)
self.desktop = kwargs.pop('desktop', True)
self.mobile = kwargs.pop('mobile', True)
if not user_agents.get(browser):
logging.error('Sorry "{}" browser User-Agent was not found.'.format(browser))
raise
if not self.desktop and not self.mobile:
sys.tracebacklimit = 0
raise RuntimeError("Sorry you can't have mobile and desktop disabled at the same time.")
user_agent = random.choice(user_agents.get(browser))
with open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r') as fp:
user_agents = json.load(
fp,
object_pairs_hook=OrderedDict
)
self.headers = user_agent.get('headers')
self.headers['User-Agent'] = random.choice(user_agent.get('User-Agent'))
if self.custom:
if not self.tryMatchCustom(user_agents):
self.cipherSuite = [
ssl._DEFAULT_CIPHERS,
'!AES128-SHA',
'!ECDHE-RSA-AES256-SHA',
]
self.headers = OrderedDict([
('User-Agent', self.custom),
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
('Accept-Language', 'en-US,en;q=0.9'),
('Accept-Encoding', 'gzip, deflate, br')
])
else:
if self.browser and not user_agents.get(self.browser):
sys.tracebacklimit = 0
raise RuntimeError('Sorry "{}" browser User-Agent was not found.'.format(self.browser))
if not kwargs.get('allow_brotli', False):
if 'br' in self.headers['Accept-Encoding']:
self.headers['Accept-Encoding'] = ','.join([encoding for encoding in self.headers['Accept-Encoding'].split(',') if encoding.strip() != 'br']).strip()
if not self.browser:
self.browser = random.SystemRandom().choice(list(user_agents))
self.cipherSuite = user_agents.get(self.browser).get('cipherSuite', [])
filteredAgents = self.filterAgents(user_agents.get(self.browser).get('releases'))
user_agent_version = random.SystemRandom().choice(list(filteredAgents))
self.loadHeaders(user_agents, user_agent_version)
self.headers['User-Agent'] = random.SystemRandom().choice(filteredAgents[user_agent_version])
if not kwargs.get('allow_brotli', False) and 'br' in self.headers['Accept-Encoding']:
self.headers['Accept-Encoding'] = ','.join([
encoding for encoding in self.headers['Accept-Encoding'].split(',') if encoding.strip() != 'br'
]).strip()
File diff suppressed because it is too large Load Diff
@@ -43,6 +43,8 @@ python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.get
# subscenter:list
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subliminal_patch.core import SZProviderPool; from babelfish import Language; from subliminal.core import scan_video; print SZProviderPool(providers=['subscenter'], )['subscenter'].list_subtitles(scan_video('FULL_PATH'), languages=[Language('heb')])"
# subscene:list
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subliminal_patch.core import SZProviderPool; from subzero.language import Language; from subzero.video import parse_video; SZProviderPool(providers=['subscene'], provider_configs={'subscene': {'username': 'USERNAME', 'password': 'PASSWORD'}})['subscene'].list_subtitles(parse_video('FILENAME', {}, {'type': 'episode'}, dry_run=True), languages=[Language('eng')])"
# refining
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import os; os.environ['U1pfT01EQl9LRVk'] = '789CF30DAC2C8B0AF433F5C9AD34290A712DF30D7135F12D0FB3E502006FDE081E'; import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subzero.video import parse_video, refine_video; video = parse_video('FILE_NAME', {'type': 'episode'}, dry_run=True); print refine_video(video)"
@@ -12,3 +12,6 @@ class UnknownFormatIdentifierError(Pysubs2Error):
class FormatAutodetectionError(Pysubs2Error):
"""Subtitle format is ambiguous or unknown."""
class ContentNotUsable(Pysubs2Error):
    """Current content is not usable for the specified format.

    The SubRip writer raises this for text with an active drawing style and
    skips the affected line.
    """
@@ -41,6 +41,7 @@ class SSAStyle(object):
self.italic = False #: Italic
self.underline = False #: Underline (ASS only)
self.strikeout = False #: Strikeout (ASS only)
self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags
self.scalex = 100.0 #: Horizontal scaling (ASS only)
self.scaley = 100.0 #: Vertical scaling (ASS only)
self.spacing = 0.0 #: Letter spacing (ASS only)
+6 -1
View File
@@ -5,6 +5,7 @@ from .formatbase import FormatBase
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
from .substation import parse_tags
from .exceptions import ContentNotUsable
from .time import ms_to_times, make_time, TIMESTAMP, timestamp_to_ms
#: Largest timestamp allowed in SubRip, ie. 99:59:59,999.
@@ -81,6 +82,7 @@ class SubripFormat(FormatBase):
if sty.italic: fragment = "<i>%s</i>" % fragment
if sty.underline: fragment = "<u>%s</u>" % fragment
if sty.strikeout: fragment = "<s>%s</s>" % fragment
if sty.drawing: raise ContentNotUsable
body.append(fragment)
return re.sub("\n+", "\n", "".join(body).strip())
@@ -90,7 +92,10 @@ class SubripFormat(FormatBase):
for i, line in enumerate(visible_lines, 1):
start = ms_to_timestamp(line.start)
end = ms_to_timestamp(line.end)
text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
try:
text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
except ContentNotUsable:
continue
print("%d" % i, file=fp) # Python 2.7 compat
print(start, "-->", end, file=fp)
@@ -110,7 +110,7 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
def apply_overrides(all_overrides):
s = style.copy()
for tag in re.findall(r"\\[ibus][10]|\\r[a-zA-Z_0-9 ]*", all_overrides):
for tag in re.findall(r"\\[ibusp][0-9]|\\r[a-zA-Z_0-9 ]*", all_overrides):
if tag == r"\r":
s = style.copy() # reset to original line style
elif tag.startswith(r"\r"):
@@ -122,6 +122,13 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
elif "b" in tag: s.bold = "1" in tag
elif "u" in tag: s.underline = "1" in tag
elif "s" in tag: s.strikeout = "1" in tag
elif "p" in tag:
try:
scale = int(tag[2:])
except (ValueError, IndexError):
continue
s.drawing = scale > 0
return s
overrides = SSAEvent.OVERRIDE_SEQUENCE.findall(text)
@@ -27,3 +27,8 @@ class ServiceUnavailable(ProviderError):
class DownloadLimitExceeded(ProviderError):
"""Exception raised by providers when download limit is exceeded."""
pass
class DownloadLimitPerDayExceeded(ProviderError):
    """Exception raised by providers when the per-day download limit is exceeded."""
    pass
@@ -264,7 +264,7 @@ class SZProviderPool(ProviderPool):
requests.exceptions.SSLError,
requests.Timeout,
socket.timeout):
logger.error('Provider %r connection error', subtitle.provider_name)
logger.exception('Provider %r connection error', subtitle.provider_name)
except ResponseNotReady:
logger.error('Provider %r response error, reinitializing', subtitle.provider_name)
@@ -354,15 +354,16 @@ class SZProviderPool(ProviderPool):
orig_matches = matches.copy()
logger.debug('%r: Found matches %r', s, matches)
score, score_without_hash = compute_score(matches, s, video, hearing_impaired=use_hearing_impaired)
unsorted_subtitles.append(
(s, compute_score(matches, s, video, hearing_impaired=use_hearing_impaired), matches, orig_matches))
(s, score, score_without_hash, matches, orig_matches))
# sort subtitles by score
scored_subtitles = sorted(unsorted_subtitles, key=operator.itemgetter(1), reverse=True)
scored_subtitles = sorted(unsorted_subtitles, key=operator.itemgetter(1, 2), reverse=True)
# download best subtitles, falling back on the next on error
downloaded_subtitles = []
for subtitle, score, matches, orig_matches in scored_subtitles:
for subtitle, score, score_without_hash, matches, orig_matches in scored_subtitles:
# check score
if score < min_score:
logger.info('%r: Score %d is below min_score (%d)', subtitle, score, min_score)
@@ -20,7 +20,7 @@ from exceptions import APIThrottled
from dogpile.cache.api import NO_VALUE
from subliminal.cache import region
from subliminal_patch.pitcher import pitchers
from cloudscraper import CloudScraper
from cloudscraper import CloudScraper, User_Agent
try:
import brotli
@@ -89,7 +89,9 @@ class CFSession(CloudScraper):
# Check if Cloudflare anti-bot is on
try:
if self.isChallengeRequest(resp):
print repr(resp)
if self.is_IUAM_Challenge(resp):
print "TRYYYYYYYYYY"
if resp.request.method != 'GET':
# Work around if the initial request is not a GET,
# Supersede with a GET then re-request the original METHOD.
@@ -97,9 +99,10 @@ class CFSession(CloudScraper):
resp = ourSuper.request(method, url, *args, **kwargs)
else:
# Solve Challenge
resp = self.sendChallengeResponse(resp, **kwargs)
resp = self.Challenge_Response(resp, **kwargs)
except ValueError, e:
print "YEEEEEEEEEEEEEE"
if e.message == "Captcha":
parsed_url = urlparse(url)
domain = parsed_url.netloc
@@ -241,12 +244,20 @@ class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
# change our user agent to reflect Requests
user_agent = "Python XMLRPC with Requests (python-requests.org)"
proxies = None
xm_ver = 1
session_var = "PHPSESSID"
def __init__(self, use_https=True, verify=None, user_agent=None, timeout=10, *args, **kwargs):
self.verify = pem_file if verify is None else verify
self.use_https = use_https
self.user_agent = user_agent if user_agent is not None else self.user_agent
self.timeout = timeout
self.session = requests.Session()
self.session.headers['User-Agent'] = self.user_agent
# if 'requests' in self.session.headers['User-Agent']:
# # Set a random User-Agent if no custom User-Agent has been set
# self.session.headers = User_Agent(allow_brotli=False).headers
proxy = os.environ.get('SZ_HTTP_PROXY')
if proxy:
self.proxies = {
@@ -260,18 +271,40 @@ class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
"""
Make an xmlrpc request.
"""
headers = {'User-Agent': self.user_agent}
url = self._build_url(host, handler)
cache_key = "xm%s_%s" % (self.xm_ver, host)
old_sessvar = self.session.cookies.get(self.session_var, "")
if not old_sessvar:
data = region.get(cache_key)
if data is not NO_VALUE:
logger.debug("Trying to re-use headers/cookies for %s" % host)
self.session.cookies, self.session.headers = data
old_sessvar = self.session.cookies.get(self.session_var, "")
try:
resp = requests.post(url, data=request_body, headers=headers,
stream=True, timeout=self.timeout, proxies=self.proxies,
verify=self.verify)
resp = self.session.post(url, data=request_body,
stream=True, timeout=self.timeout, proxies=self.proxies,
verify=self.verify)
if self.session_var in resp.cookies and resp.cookies[self.session_var] != old_sessvar:
logger.debug("Storing %s cookies" % host)
region.set(cache_key, [self.session.cookies, self.session.headers])
except ValueError:
logger.debug("Wiping cookies/headers cache (VE) for %s" % host)
region.delete(cache_key)
raise
except Exception:
logger.debug("Wiping cookies/headers cache (EX) for %s" % host)
region.delete(cache_key)
raise # something went wrong
else:
resp.raise_for_status()
try:
resp.raise_for_status()
except requests.exceptions.HTTPError:
logger.debug("Wiping cookies/headers cache (RE) for %s" % host)
region.delete(cache_key)
raise
try:
if 'x-ratelimit-remaining' in resp.headers and int(resp.headers['x-ratelimit-remaining']) <= 2:
@@ -2,6 +2,8 @@
import logging
import re
import datetime
import types
import subliminal
import time
@@ -10,7 +12,8 @@ from random import randint
from dogpile.cache.api import NO_VALUE
from requests import Session
from subliminal.cache import region
from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError, ConfigurationError
from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError, ConfigurationError, \
DownloadLimitPerDayExceeded
from subliminal.providers.addic7ed import Addic7edProvider as _Addic7edProvider, \
Addic7edSubtitle as _Addic7edSubtitle, ParserBeautifulSoup
from subliminal.subtitle import fix_line_ending
@@ -64,6 +67,7 @@ class Addic7edProvider(_Addic7edProvider):
'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
]} | {Language.fromietf(l) for l in ["sr-Latn", "sr-Cyrl"]}
vip = False
USE_ADDICTED_RANDOM_AGENTS = False
hearing_impaired_verifiable = True
subtitle_class = Addic7edSubtitle
@@ -72,9 +76,10 @@ class Addic7edProvider(_Addic7edProvider):
sanitize_characters = {'-', ':', '(', ')', '.', '/'}
last_show_ids_fetch_key = "addic7ed_last_id_fetch"
def __init__(self, username=None, password=None, use_random_agents=False):
def __init__(self, username=None, password=None, use_random_agents=False, is_vip=False):
super(Addic7edProvider, self).__init__(username=username, password=password)
self.USE_ADDICTED_RANDOM_AGENTS = use_random_agents
self.vip = is_vip
if not all((username, password)):
raise ConfigurationError('Username and password must be specified')
@@ -397,6 +402,27 @@ class Addic7edProvider(_Addic7edProvider):
return subtitles
def download_subtitle(self, subtitle):
last_dls = region.get("addic7ed_dls")
now = datetime.datetime.now()
one_day = datetime.timedelta(hours=24)
def raise_limit():
logger.info("Addic7ed: Downloads per day exceeded (%s)", cap)
raise DownloadLimitPerDayExceeded
if not isinstance(last_dls, types.ListType):
last_dls = []
else:
# filter all non-expired DLs
last_dls = filter(lambda t: t + one_day > now, last_dls)
region.set("addic7ed_dls", last_dls)
cap = self.vip and 80 or 40
amount = len(last_dls)
if amount >= cap:
raise_limit()
# download the subtitle
r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link},
timeout=10)
@@ -408,7 +434,7 @@ class Addic7edProvider(_Addic7edProvider):
if not r.content:
# Provider wrongful return a status of 304 Not Modified with an empty content
# raise_for_status won't raise exception for that status code
logger.error('Unable to download subtitle. No data returned from provider')
logger.error('Addic7ed: Unable to download subtitle. No data returned from provider')
return
# detect download limit exceeded
@@ -416,3 +442,10 @@ class Addic7edProvider(_Addic7edProvider):
raise DownloadLimitExceeded
subtitle.content = fix_line_ending(r.content)
last_dls.append(datetime.datetime.now())
region.set("addic7ed_dls", last_dls)
logger.info("Addic7ed: Used %s/%s downloads", amount + 1, cap)
if amount + 1 >= cap:
raise_limit()
@@ -19,9 +19,11 @@ from xml.etree import ElementTree
logger = logging.getLogger(__name__)
class BSPlayerSubtitle(Subtitle):
"""BSPlayer Subtitle."""
provider_name = 'bsplayer'
hash_verifiable = True
def __init__(self, language, filename, subtype, video, link):
super(BSPlayerSubtitle, self).__init__(language)
@@ -41,27 +43,12 @@ class BSPlayerSubtitle(Subtitle):
def get_matches(self, video):
matches = set()
video_filename = video.name
video_filename = os.path.basename(video_filename)
video_filename, _ = os.path.splitext(video_filename)
video_filename = sanitize_release_group(video_filename)
subtitle_filename = self.filename
subtitle_filename = os.path.basename(subtitle_filename)
subtitle_filename, _ = os.path.splitext(subtitle_filename)
subtitle_filename = sanitize_release_group(subtitle_filename)
matches |= guess_matches(video, guessit(self.filename))
matches.add(id(self))
matches.add('hash')
return matches
class BSPlayerProvider(Provider):
"""BSPlayer Provider."""
languages = {Language('por', 'BR')} | {Language(l) for l in [
@@ -69,6 +56,7 @@ class BSPlayerProvider(Provider):
'ron', 'rus', 'spa', 'swe', 'tur', 'ukr', 'zho'
]}
SEARCH_THROTTLE = 8
hash_verifiable = True
# blatantly based on kodi's bsplayer plugin
# also took from BSPlayer-Subtitles-Downloader
@@ -108,18 +96,11 @@ class BSPlayerProvider(Provider):
res = self.session.post(self.search_url, data)
return ElementTree.fromstring(res.text)
### with requests
# res = requests.post(
# url=self.search_url,
# data=data,
# headers=headers
# )
# return ElementTree.fromstring(res.text)
except Exception as ex:
logger.info("ERROR: %s." % ex)
if func_name == 'logIn':
self.search_url = self.get_sub_domain()
sleep(1)
logger.info('ERROR: Too many tries (%d)...' % tries)
raise Exception('Too many tries...')
@@ -167,7 +148,6 @@ class BSPlayerProvider(Provider):
# language_ids = 'spa'
language_ids = ','.join(sorted(l.opensubtitles for l in language))
if video.imdb_id is None:
imdbId = '*'
else:
@@ -193,13 +173,13 @@ class BSPlayerProvider(Provider):
if items:
logger.info("Subtitles Found.")
for item in items:
subID=item.find('subID').text
subDownloadLink=item.find('subDownloadLink').text
subLang= Language.fromopensubtitles(item.find('subLang').text)
subName=item.find('subName').text
subFormat=item.find('subFormat').text
subID = item.find('subID').text
subDownloadLink = item.find('subDownloadLink').text
subLang = Language.fromopensubtitles(item.find('subLang').text)
subName = item.find('subName').text
subFormat = item.find('subFormat').text
subtitles.append(
BSPlayerSubtitle(subLang,subName, subFormat, video, subDownloadLink)
BSPlayerSubtitle(subLang, subName, subFormat, video, subDownloadLink)
)
return subtitles
@@ -207,9 +187,9 @@ class BSPlayerProvider(Provider):
return self.query(video, video.hashes['bsplayer'], languages)
def get_sub_domain(self):
# s1-9, s101-109
# s1-9, s101-109
SUB_DOMAINS = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9',
's101', 's102', 's103', 's104', 's105', 's106', 's107', 's108', 's109']
's101', 's102', 's103', 's104', 's105', 's106', 's107', 's108', 's109']
API_URL_TEMPLATE = "http://{sub_domain}.api.bsplayer-subtitles.com/v1.php"
sub_domains_end = len(SUB_DOMAINS) - 1
return API_URL_TEMPLATE.format(sub_domain=SUB_DOMAINS[random.randint(0, sub_domains_end)])
@@ -226,10 +206,8 @@ class BSPlayerProvider(Provider):
raise ValueError('Error 500 on server')
with gzip.GzipFile(fileobj=io.BytesIO(res.content)) as gf:
subtitle.content = gf.read()
subtitle.content = gf.read()
subtitle.normalize()
return subtitle
raise ValueError('Problems conecting to the server')
@@ -105,7 +105,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
def __init__(self, username=None, password=None, use_tag_search=False, only_foreign=False, also_foreign=False,
skip_wrong_fps=True, is_vip=False, use_ssl=True, timeout=15):
if any((username, password)) and not all((username, password)):
if not all((username, password)):
raise ConfigurationError('Username and password must be specified')
self.username = username or ''
@@ -154,6 +154,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
logger.debug('Logged in with token %r', self.token[:10]+"X"*(len(self.token)-10))
region.set("os_token", self.token)
time.sleep(1)
def use_token_or_login(self, func):
if not self.token:
@@ -162,6 +163,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
try:
return func()
except Unauthorized:
logger.debug("Token not valid, logging in again")
self.log_in()
return func()
@@ -197,16 +199,11 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
return
logger.error("Login failed, please check your credentials")
raise
def terminate(self):
if self.token:
try:
checked(lambda: self.server.LogOut(self.token))
except:
logger.error("Logout failed: %s", traceback.format_exc())
self.server = None
self.token = None
#self.token = None
def list_subtitles(self, video, languages):
"""
@@ -60,6 +60,8 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
episode_hash_valid_if = {"series", "season", "episode", "format"}
movie_hash_valid_if = {"video_codec", "format"}
orig_matches = matches.copy()
# on hash match, discard everything else
if subtitle.hash_verifiable:
if 'hash' in matches:
@@ -83,41 +85,47 @@ def compute_score(matches, subtitle, video, hearing_impaired=None):
matches &= {'hash'}
# handle equivalent matches
eq_matches = set()
if is_episode:
if 'title' in matches:
logger.debug('Adding title match equivalent')
matches.add('episode')
eq_matches.add('episode')
if 'series_imdb_id' in matches:
logger.debug('Adding series_imdb_id match equivalent')
matches |= {'series', 'year'}
eq_matches |= {'series', 'year'}
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
matches |= {'series', 'year', 'season', 'episode'}
eq_matches |= {'series', 'year', 'season', 'episode'}
if 'tvdb_id' in matches:
logger.debug('Adding tvdb_id match equivalents')
matches |= {'series', 'year', 'season', 'episode', 'title'}
eq_matches |= {'series', 'year', 'season', 'episode', 'title'}
if 'series_tvdb_id' in matches:
logger.debug('Adding series_tvdb_id match equivalents')
matches |= {'series', 'year'}
eq_matches |= {'series', 'year'}
# specials
if video.is_special and 'title' in matches and 'series' in matches \
and 'year' in matches:
logger.debug('Adding special title match equivalent')
matches |= {'season', 'episode'}
eq_matches |= {'season', 'episode'}
elif is_movie:
if 'imdb_id' in matches:
logger.debug('Adding imdb_id match equivalents')
matches |= {'title', 'year'}
eq_matches |= {'title', 'year'}
matches |= eq_matches
# handle hearing impaired
if hearing_impaired is not None and subtitle.hearing_impaired == hearing_impaired:
logger.debug('Matched hearing_impaired')
matches.add('hearing_impaired')
orig_matches.add('hearing_impaired')
# compute the score
score = sum((scores.get(match, 0) for match in matches))
logger.info('%r: Computed score %r with final matches %r', subtitle, score, matches)
return score
score_without_hash = sum((scores.get(match, 0) for match in orig_matches | eq_matches if match != "hash"))
return score, score_without_hash
@@ -278,6 +278,12 @@ class Subtitle(Subtitle_):
@classmethod
def pysubs2_to_unicode(cls, sub, format="srt"):
"""
this is a modified version of pysubs2.SubripFormat.to_file with special handling for drawing tags in ASS
:param sub:
:param format:
:return:
"""
def ms_to_timestamp(ms, mssep=","):
"""Convert ms to 'HH:MM:SS,mmm'"""
# XXX throw on overflow/underflow?
@@ -289,9 +295,12 @@ class Subtitle(Subtitle_):
def prepare_text(text, style):
body = []
for fragment, sty in parse_tags(text, style, sub.styles):
fragment = fragment.replace(ur"\h", u" ")
fragment = fragment.replace(ur"\n", u"\n")
fragment = fragment.replace(ur"\N", u"\n")
fragment = fragment.replace(r"\h", u" ")
fragment = fragment.replace(r"\n", u"\n")
fragment = fragment.replace(r"\N", u"\n")
if sty.drawing:
raise pysubs2.ContentNotUsable
if format == "srt":
if sty.italic:
fragment = u"<i>%s</i>" % fragment
@@ -323,7 +332,10 @@ class Subtitle(Subtitle_):
for i, line in enumerate(visible_lines, 1):
start = ms_to_timestamp(line.start, mssep=mssep)
end = ms_to_timestamp(line.end, mssep=mssep)
text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
try:
text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
except pysubs2.ContentNotUsable:
continue
out.append(u"%d\n" % i)
out.append(u"%s --> %s\n" % (start, end))
+1
View File
@@ -24,6 +24,7 @@ if debug:
sub = Subtitle(Language.fromietf("eng"), mods=["common", "remove_HI", "OCR_fixes", "fix_uppercase", "shift_offset(ms=0,s=1)"])
sub.content = open(fn).read()
sub.normalize()
sub.is_valid()
content = sub.get_modified_content(debug=True)
#submod = SubMod(debug=debug)
@@ -0,0 +1,7 @@
# coding=utf-8
class EmptyEntryError(Exception):
    """Signal that a modification left a whole subtitle entry empty.

    Processors raise this when their substitution yields an empty result for
    the entry; the modification loop catches it and skips the entry.
    """
class EmptyLineError(Exception):
    """Signal an empty subtitle line.

    NOTE(review): no raiser or handler is visible in this module; presumably
    the line-level counterpart of ``EmptyEntryError`` -- confirm against users.
    """
@@ -6,7 +6,8 @@ import pysubs2
import logging
import time
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
from mods import EMPTY_TAG_PROCESSOR
from exc import EmptyEntryError
from registry import registry
from subzero.language import Language
@@ -300,11 +301,11 @@ class SubtitleModifications(object):
mod = self.initialized_mods[identifier]
try:
line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
line = mod.modify(line.strip(), entry=t, debug=self.debug, parent=self, index=index,
**args)
except EmptyEntryError:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
logger.debug(u"%d: %s: %r -> ''", index, identifier, t)
skip_entry = True
break
@@ -329,11 +330,11 @@ class SubtitleModifications(object):
mod = self.initialized_mods[identifier]
try:
line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
line = mod.modify(line.strip(), entry=t, debug=self.debug, parent=self, index=index,
procs=["last_process"], **args)
except EmptyEntryError:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
logger.debug(u"%d: %s: %r -> ''", index, identifier, t)
skip_entry = True
break
@@ -107,9 +107,3 @@ empty_line_post_processors = [
]
class EmptyEntryError(Exception):
pass
class EmptyLineError(Exception):
pass
@@ -7,6 +7,7 @@ from subzero.modification.mods import SubtitleTextModification, empty_line_post_
from subzero.modification.processors import FuncProcessor
from subzero.modification.processors.re_processor import NReProcessor
from subzero.modification import registry
from tld import get_tld
ENGLISH = Language("eng")
@@ -28,7 +29,7 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1", name="CM_multidash"),
# line = _/-/\s
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"),
# remove >>
NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
@@ -113,7 +114,9 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),
# add space after punctuation
NReProcessor(re.compile(r'(?u)([!?.,:])([A-zÀ-ž]{2,})'), r"\1 \2", name="CM_punctuation_space2"),
NReProcessor(re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'),
lambda match: u"%s%s %s" % (match.group(2), match.group(3), match.group(4)) if not get_tld(match.group(1), fail_silently=True, fix_protocol=True) else match.group(1),
name="CM_punctuation_space2"),
# fix lowercase I in english
NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i",
@@ -1,7 +1,8 @@
# coding=utf-8
import re
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, TAG
from subzero.modification.exc import EmptyEntryError
from subzero.modification.processors.re_processor import NReProcessor
from subzero.modification import registry
@@ -46,7 +47,7 @@ class HearingImpaired(SubtitleTextModification):
name="HI_before_colon_noncaps"),
# brackets (only remove if at least 3 chars in brackets)
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
NReProcessor(re.compile(ur'(?sux)-?%(t)s["\']*[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\']*[\s:]*%(t)s' %
{"t": TAG}), "", name="HI_brackets"),
#NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
@@ -90,8 +91,8 @@ class HearingImpaired(SubtitleTextModification):
"", name="HI_music_symbols_only"),
# remove music entries
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
"", name="HI_music"),
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[*#¶♫♪]+\s*.+|.+\s*[*#¶♫♪]+\s*$)'),
"", name="HI_music", entry=True),
]
@@ -10,7 +10,7 @@ class Processor(object):
supported = None
enabled = True
def __init__(self, name=None, parent=None, supported=None):
def __init__(self, name=None, parent=None, supported=None, **kwargs):
self.name = name
self.parent = parent
self.supported = supported if supported else lambda parent: True
@@ -35,7 +35,7 @@ class Processor(object):
class FuncProcessor(Processor):
func = None
def __init__(self, func, name=None, parent=None, supported=None):
def __init__(self, func, name=None, parent=None, supported=None, **kwargs):
super(FuncProcessor, self).__init__(name=name, supported=supported)
self.func = func
@@ -2,6 +2,7 @@
import re
import logging
from subzero.modification.exc import EmptyEntryError
from subzero.modification.processors import Processor
logger = logging.getLogger(__name__)
@@ -14,13 +15,22 @@ class ReProcessor(Processor):
pattern = None
replace_with = None
def __init__(self, pattern, replace_with, name=None, supported=None):
def __init__(self, pattern, replace_with, name=None, supported=None, entry=False, **kwargs):
super(ReProcessor, self).__init__(name=name, supported=supported)
self.pattern = pattern
self.replace_with = replace_with
self.use_entry = entry
def process(self, content, debug=False, **kwargs):
return self.pattern.sub(self.replace_with, content)
def process(self, content, debug=False, entry=None, **kwargs):
if not self.use_entry:
return self.pattern.sub(self.replace_with, content)
ret = self.pattern.sub(self.replace_with, entry)
if not ret:
raise EmptyEntryError()
elif ret != entry:
return ret
return content
class NReProcessor(ReProcessor):
@@ -36,7 +46,7 @@ class MultipleWordReProcessor(ReProcessor):
}
replaces found key in pattern with the corresponding value in data
"""
def __init__(self, snr_dict, name=None, parent=None, supported=None):
def __init__(self, snr_dict, name=None, parent=None, supported=None, **kwargs):
super(ReProcessor, self).__init__(name=name, supported=supported)
self.snr_dict = snr_dict
@@ -12,7 +12,7 @@ class StringProcessor(Processor):
String replacement processor base
"""
def __init__(self, search, replace, name=None, parent=None, supported=None):
def __init__(self, search, replace, name=None, parent=None, supported=None, **kwargs):
super(StringProcessor, self).__init__(name=name, supported=supported)
self.search = search
self.replace = replace
@@ -31,7 +31,7 @@ class MultipleLineProcessor(Processor):
"data": {"old_value": "new_value"}
}
"""
def __init__(self, snr_dict, name=None, parent=None, supported=None):
def __init__(self, snr_dict, name=None, parent=None, supported=None, **kwargs):
super(MultipleLineProcessor, self).__init__(name=name, supported=supported)
self.snr_dict = snr_dict
+2
View File
@@ -19,6 +19,8 @@ I can't keep running. L can't!
<b>i don't know. Some kind of wrong "1 00" number---
of signal, drawing the Tardis off.... course.</b>
# I'm singing in the rain
www.website.com
www.nowebsite.badlol
4
00:00:16,099 --> 00:00:17,224
+14
View File
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from .utils import get_fld, get_tld, get_tld_names, is_tld, parse_tld, Result, update_tld_names
__title__ = u'tld'
__version__ = u'0.11.10'
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'get_fld', u'get_tld', u'get_tld_names', u'is_tld',
u'parse_tld', u'Result', u'update_tld_names')
+57
View File
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from six import with_metaclass as _py_backwards_six_withmetaclass
from codecs import open as codecs_open
try:
from urllib.request import urlopen
except ImportError:
from six.moves.urllib.request import urlopen as urlopen
from .exceptions import TldIOError, TldImproperlyConfigured
from .helpers import project_dir
from .registry import Registry
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'BaseTLDSourceParser',)
class BaseTLDSourceParser(
        _py_backwards_six_withmetaclass(Registry, *[object])):
    """Base TLD source parser.

    Concrete parsers set ``uid``, ``source_url`` and ``local_path`` and
    implement :meth:`get_tld_names`. The ``Registry`` metaclass is applied
    through the six ``with_metaclass`` helper.
    """

    # Identifier of the parser; must be set by subclasses (see ``validate``).
    uid = None
    # Remote location the TLD list is downloaded from.
    source_url = None
    # Path of the local copy, resolved through ``project_dir``.
    local_path = None

    @classmethod
    def validate(cls):
        """Check that the parser class is properly configured.

        :raises TldImproperlyConfigured: if ``uid`` is not set.
        """
        if not cls.uid:
            raise TldImproperlyConfigured(
                u'The `uid` property of the TLD source parser shall be defined.')

    @classmethod
    def get_tld_names(cls, fail_silently=False, retry_count=0):
        """Get tld names.

        The base implementation only validates the class and then refuses to
        run; subclasses must override it.

        :param fail_silently:
        :param retry_count:
        :raises NotImplementedError: always, after validation.
        """
        cls.validate()
        raise NotImplementedError(
            u'Your TLD source parser shall implement `get_tld_names` method.')

    @classmethod
    def update_tld_names(cls, fail_silently=False):
        """Update the local copy of the TLD file.

        The remote content is downloaded and decoded completely *before* the
        local file is opened for writing, so a failed download no longer
        leaves a truncated local file behind. Both handles are closed even
        when an error occurs.

        :param fail_silently: return ``False`` instead of raising on error.
        :return: ``True`` on success, ``False`` on a silenced failure.
        :raises TldIOError: on any failure when ``fail_silently`` is false.
        """
        # Local import: ``closing`` is needed because the Python 2 urlopen
        # response is not a context manager.
        from contextlib import closing

        try:
            with closing(urlopen(cls.source_url)) as remote_file:
                payload = remote_file.read().decode(u'utf8')
            with codecs_open(project_dir(cls.local_path), u'wb',
                             encoding='utf8') as local_file:
                local_file.write(payload)
        except Exception as err:
            if fail_silently:
                return False
            raise TldIOError(err)
        return True
+45
View File
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from typing import Any
from . import defaults
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'get_setting', u'reset_settings', u'set_setting', u'settings')
class Settings(object):
    """Registry of setting overrides layered on top of the ``defaults`` module."""

    def __init__(self):
        self._settings = {}
        # Bound lookup kept as an attribute, as in the original implementation.
        self._settings_get = self._settings.get

    def set(self, name, value):
        """Override a default setting.

        :param str name: setting name.
        :param value: value to store for ``name``.
        """
        self._settings[name] = value

    def get(self, name, default=None):
        """Return a setting, preferring overrides over packaged defaults.

        :param str name: setting name.
        :param default: returned when neither an override nor a default exists.
        :return: the override, else the ``defaults`` attribute, else ``default``.
        """
        if name in self._settings:
            return self._settings_get(name, default)
        if not hasattr(defaults, name):
            return default
        return getattr(defaults, name, default)

    def reset(self):
        """Re-store every name listed in ``defaults.__all__`` with its default."""
        for key in defaults.__all__:
            self.set(key, getattr(defaults, key))


# Module-level singleton and function-style shortcuts to its methods.
settings = Settings()
get_setting = settings.get
set_setting = settings.set
reset_settings = settings.reset
+13
View File
@@ -0,0 +1,13 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from os.path import dirname
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'DEBUG', u'NAMES_LOCAL_PATH_PARENT')
NAMES_LOCAL_PATH_PARENT = dirname(__file__)
DEBUG = False
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from .conf import get_setting
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'TldBadUrl', u'TldDomainNotFound',
u'TldImproperlyConfigured', u'TldIOError')
class TldIOError(IOError):
    """TldIOError.

    Supposed to be thrown when problems with reading/writing occur.
    """

    def __init__(self, msg=None):
        """Build the error, generating a default message when none is given.

        :param msg: explicit message (or wrapped exception); when ``None`` a
            message naming the local TLD names location is generated.
        """
        if msg is None:
            names_path = get_setting(u'NAMES_LOCAL_PATH')
            if names_path is not None:
                msg = (u"Can't read from or write to the %s file!" %
                       names_path)
            else:
                # NAMES_LOCAL_PATH appears to have no packaged default (only
                # NAMES_LOCAL_PATH_PARENT does), so fall back to the parent
                # directory instead of reporting "the None file".
                msg = (u"Can't read from or write to the TLD names file in %s!" %
                       get_setting(u'NAMES_LOCAL_PATH_PARENT'))
        super(TldIOError, self).__init__(msg)
class TldDomainNotFound(ValueError):
    """TldDomainNotFound.

    Supposed to be thrown when domain name is not found (didn't match) the
    local TLD policy.
    """

    def __init__(self, domain_name):
        """Create the error for the given ``domain_name``."""
        message = u"Domain %s didn't match any existing TLD name!" % domain_name
        super(TldDomainNotFound, self).__init__(message)
class TldBadUrl(ValueError):
    """TldBadUrl.

    Supposed to be thrown when bad URL is given.
    """

    def __init__(self, url):
        """Create the error for the offending ``url``."""
        message = u'Is not a valid URL %s!' % url
        super(TldBadUrl, self).__init__(message)
class TldImproperlyConfigured(Exception):
    """TldImproperlyConfigured.

    Supposed to be thrown when code is improperly configured. Typical use-case
    is when user tries to use `get_tld` function with both `search_public` and
    `search_private` set to False.
    """

    def __init__(self, msg=None):
        """Prefix ``msg`` (if any) with the generic configuration notice."""
        if msg is not None:
            text = u'Improperly configured. %s' % msg
        else:
            text = u'Improperly configured.'
        super(TldImproperlyConfigured, self).__init__(text)
+21
View File
@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from os.path import abspath, join
from .conf import get_setting
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'project_dir', u'PROJECT_DIR')
def project_dir(base):
    """Return the absolute path of ``base`` under the configured parent dir.

    The parent is the ``NAMES_LOCAL_PATH_PARENT`` setting; backslashes in the
    joined path are converted to forward slashes before it is made absolute.
    """
    parent = get_setting(u'NAMES_LOCAL_PATH_PARENT')
    joined = join(parent, base)
    return abspath(joined.replace(u'\\', u'/'))
PROJECT_DIR = project_dir
@@ -0,0 +1,5 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
@@ -0,0 +1,5 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
+41
View File
@@ -0,0 +1,41 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from typing import Type, Dict
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'Registry',)
class Registry(type):
    """Metaclass that records every class it creates under a unique id.

    The id is the class' ``uid`` attribute when present, otherwise the class
    name; classes whose id is falsy (e.g. ``uid = None``) are not recorded.
    """

    REGISTRY = {}

    def __new__(cls, name, bases, attrs):
        klass = super(Registry, cls).__new__(cls, name, bases, attrs)
        key = getattr(klass, u'_uid', None)
        if key:
            cls.REGISTRY[key] = klass
        return klass

    @property
    def _uid(cls):
        # Falls back to the class name only when no ``uid`` attribute exists.
        return getattr(cls, 'uid', cls.__name__)

    @classmethod
    def reset(cls):
        """Replace the registry with a fresh, empty mapping."""
        cls.REGISTRY = {}

    @classmethod
    def get(cls, key, default=None):
        """Return the class registered under ``key`` or ``default``."""
        return cls.REGISTRY.get(key, default)

    @classmethod
    def items(cls):
        """Return the registered ``(uid, class)`` pairs."""
        return cls.REGISTRY.items()
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
+57
View File
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from typing import Dict
try:
from urllib.parse import SplitResult
except ImportError:
from six.moves.urllib_parse import SplitResult
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'Result',)
class Result(object):
    """Container for the parts of a parsed domain."""

    __slots__ = (u'subdomain', u'domain', u'tld', u'__fld', u'parsed_url')

    def __init__(self, tld, domain, subdomain, parsed_url):
        self.tld = tld
        self.subdomain = subdomain
        self.parsed_url = parsed_url
        # An empty domain means the TLD itself acts as the domain.
        if domain != u'':
            self.domain = domain
        else:
            self.domain = tld
        if domain:
            self.__fld = u'{}.{}'.format(self.domain, self.tld)
        else:
            self.__fld = self.tld

    @property
    def extension(self):
        """Alias of ``tld``.

        :return str:
        """
        return self.tld

    suffix = extension

    @property
    def fld(self):
        """First level domain.

        :rtype: str
        """
        return self.__fld

    def __str__(self):
        return self.tld

    __repr__ = __str__

    @property
    def __dict__(self):
        """Mimic ``__dict__`` functionality for this slotted class.

        :rtype: dict
        """
        mapping = {}
        mapping[u'tld'] = self.tld
        mapping[u'domain'] = self.domain
        mapping[u'subdomain'] = self.subdomain
        mapping[u'fld'] = self.fld
        mapping[u'parsed_url'] = self.parsed_url
        return mapping
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import unittest
from .test_core import *
from .test_commands import *
if (__name__ == u'__main__'):
unittest.main()
@@ -0,0 +1,65 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from backports.functools_lru_cache import lru_cache
import logging
import socket
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'internet_available_only', u'log_info')
LOG_INFO = True
LOGGER = logging.getLogger(__name__)


def log_info(func):
    """Decorate a test method so its name, docstring and result are logged.

    When ``LOG_INFO`` is false the function is returned undecorated. The
    wrapper keeps the wrapped function's ``__name__`` and ``__doc__`` so that
    outer decorators and the test runner still see the real test metadata.

    :param func: test method taking ``self`` as first argument.
    :return: the wrapped method (or ``func`` itself when logging is off).
    """
    if not LOG_INFO:
        return func

    from functools import wraps

    @wraps(func)
    def inner(self, *args, **kwargs):
        # Run the test first; logging only happens if it did not raise.
        result = func(self, *args, **kwargs)
        LOGGER.debug(u'\n\n%s', func.__name__)
        LOGGER.debug(u'============================')
        if func.__doc__:
            LOGGER.debug(u'""" %s """', func.__doc__.strip())
        LOGGER.debug(u'----------------------------')
        if result is not None:
            LOGGER.debug(result)
        LOGGER.debug(u'\n++++++++++++++++++++++++++++')
        return result

    return inner
@lru_cache(maxsize=32)
def is_internet_available(host='8.8.8.8', port=53, timeout=3):
    """Check if internet is available by opening a TCP connection.

    Host: 8.8.8.8 (google-public-dns-a.google.com)
    OpenPort: 53/tcp
    Service: domain (DNS/TCP)

    The result is cached per ``(host, port, timeout)`` by ``lru_cache``.

    :return: ``True`` if the connection succeeded, ``False`` on socket error.
    """
    probe = None
    try:
        # NOTE: this changes the process-wide default socket timeout; kept
        # as-is because later network calls may rely on it.
        socket.setdefaulttimeout(timeout)
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        probe.connect((host, port))
        return True
    except socket.error as ex:
        print(ex)
        return False
    finally:
        # Always release the probe socket; previously it was never closed.
        if probe is not None:
            probe.close()
def internet_available_only(func):
    """Decorate a test method so it only runs when the internet is reachable.

    Without connectivity the wrapped method is not called; a skip notice is
    logged and ``None`` is returned. The wrapper keeps the wrapped function's
    ``__name__`` and ``__doc__`` so the test runner reports the real test
    description instead of the wrapper's.

    :param func: test method taking ``self`` as first argument.
    :return: the wrapped method.
    """
    from functools import wraps

    @wraps(func)
    def inner(self, *args, **kwargs):
        if not is_internet_available():
            LOGGER.debug(u'\n\n%s', func.__name__)
            LOGGER.debug(u'============================')
            if func.__doc__:
                LOGGER.debug(u'""" %s """', func.__doc__.strip())
            LOGGER.debug(u'----------------------------')
            LOGGER.debug(u'Skipping because no Internet connection available.')
            LOGGER.debug(u'\n++++++++++++++++++++++++++++')
            return None
        return func(self, *args, **kwargs)

    return inner
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import unittest
import subprocess
from .base import log_info, internet_available_only
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'GPL 2.0/LGPL 2.1'
__all__ = (u'TestCommands',)
LOGGER = logging.getLogger(__name__)
class TestCommands(unittest.TestCase):
    """Tests for the ``update-tld-names`` console command."""

    def setUp(self):
        u'Set up.'

    @internet_available_only
    @log_info
    def test_1_update_tld_names_command(self):
        u'Test updating the tld names (re-fetch mozilla source).'
        # The command is expected to print nothing on success.
        command = [u'update-tld-names']
        output = subprocess.check_output(command).strip()
        self.assertEqual(output, b'')
        return output

    @internet_available_only
    @log_info
    def test_1_update_tld_names_mozilla_command(self):
        u'Test updating the tld names (re-fetch mozilla source).'
        # Same check, but naming the ``mozilla`` source explicitly.
        command = [u'update-tld-names', u'mozilla']
        output = subprocess.check_output(command).strip()
        self.assertEqual(output, b'')
        return output
if (__name__ == u'__main__'):
unittest.main()
@@ -0,0 +1,708 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import logging
from os.path import abspath, join
import unittest
from tempfile import gettempdir
from typing import Type
try:
from urllib.parse import urlsplit
except ImportError:
from six.moves.urllib_parse import urlsplit
from faker import Faker
from .. import defaults
from ..base import BaseTLDSourceParser
from ..conf import get_setting, reset_settings, set_setting
from ..exceptions import TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError
from ..helpers import project_dir
from ..registry import Registry
from ..utils import get_fld, get_tld, get_tld_names, get_tld_names_container, is_tld, MozillaTLDSourceParser, BaseMozillaTLDSourceParser, parse_tld, reset_tld_names, update_tld_names, update_tld_names_cli
from .base import internet_available_only, log_info
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'TestCore',)
LOGGER = logging.getLogger(__name__)
class TestCore(unittest.TestCase):
u'Core tld functionality tests.'
@classmethod
def setUpClass(cls):
cls.faker = Faker()
cls.temp_dir = gettempdir()
def setUp(self):
u'Set up.'
self.good_patterns = [{
u'url': u'http://www.google.co.uk',
u'fld': u'google.co.uk',
u'subdomain': u'www',
u'domain': u'google',
u'suffix': u'co.uk',
u'tld': u'co.uk',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.v2.google.co.uk',
u'fld': u'google.co.uk',
u'subdomain': u'www.v2',
u'domain': u'google',
u'suffix': u'co.uk',
u'tld': u'co.uk',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://хром.гугл.рф',
u'fld': u'гугл.рф',
u'subdomain': u'хром',
u'domain': u'гугл',
u'suffix': u'рф',
u'tld': u'рф',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.google.co.uk:8001/lorem-ipsum/',
u'fld': u'google.co.uk',
u'subdomain': u'www',
u'domain': u'google',
u'suffix': u'co.uk',
u'tld': u'co.uk',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.me.cloudfront.net',
u'fld': u'me.cloudfront.net',
u'subdomain': u'www',
u'domain': u'me',
u'suffix': u'cloudfront.net',
u'tld': u'cloudfront.net',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.v2.forum.tech.google.co.uk:8001/lorem-ipsum/',
u'fld': u'google.co.uk',
u'subdomain': u'www.v2.forum.tech',
u'domain': u'google',
u'suffix': u'co.uk',
u'tld': u'co.uk',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'https://pantheon.io/',
u'fld': u'pantheon.io',
u'subdomain': u'',
u'domain': u'pantheon',
u'suffix': u'io',
u'tld': u'io',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'v2.www.google.com',
u'fld': u'google.com',
u'subdomain': u'v2.www',
u'domain': u'google',
u'suffix': u'com',
u'tld': u'com',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'//v2.www.google.com',
u'fld': u'google.com',
u'subdomain': u'v2.www',
u'domain': u'google',
u'suffix': u'com',
u'tld': u'com',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'http://foo@bar.com',
u'fld': u'bar.com',
u'subdomain': u'',
u'domain': u'bar',
u'suffix': u'com',
u'tld': u'com',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://user:foo@bar.com',
u'fld': u'bar.com',
u'subdomain': u'',
u'domain': u'bar',
u'suffix': u'com',
u'tld': u'com',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'https://faguoren.xn--fiqs8s',
u'fld': u'faguoren.xn--fiqs8s',
u'subdomain': u'',
u'domain': u'faguoren',
u'suffix': u'xn--fiqs8s',
u'tld': u'xn--fiqs8s',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'blogs.lemonde.paris',
u'fld': u'lemonde.paris',
u'subdomain': u'blogs',
u'domain': u'lemonde',
u'suffix': u'paris',
u'tld': u'paris',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'axel.brighton.ac.uk',
u'fld': u'brighton.ac.uk',
u'subdomain': u'axel',
u'domain': u'brighton',
u'suffix': u'ac.uk',
u'tld': u'ac.uk',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'm.fr.blogspot.com.au',
u'fld': u'fr.blogspot.com.au',
u'subdomain': u'm',
u'domain': u'fr',
u'suffix': u'blogspot.com.au',
u'tld': u'blogspot.com.au',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'help.www.福岡.jp',
u'fld': u'www.福岡.jp',
u'subdomain': u'help',
u'domain': u'www',
u'suffix': u'福岡.jp',
u'tld': u'福岡.jp',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'syria.arabic.variant.سوريا',
u'fld': u'variant.سوريا',
u'subdomain': u'syria.arabic',
u'domain': u'variant',
u'suffix': u'سوريا',
u'tld': u'سوريا',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': u'http://www.help.kawasaki.jp',
u'fld': u'www.help.kawasaki.jp',
u'subdomain': u'',
u'domain': u'www',
u'suffix': u'help.kawasaki.jp',
u'tld': u'help.kawasaki.jp',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.city.kawasaki.jp',
u'fld': u'city.kawasaki.jp',
u'subdomain': u'www',
u'domain': u'city',
u'suffix': u'kawasaki.jp',
u'tld': u'kawasaki.jp',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://fedoraproject.org',
u'fld': u'fedoraproject.org',
u'subdomain': u'',
u'domain': u'fedoraproject',
u'suffix': u'org',
u'tld': u'org',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.cloud.fedoraproject.org',
u'fld': u'www.cloud.fedoraproject.org',
u'subdomain': u'',
u'domain': u'www',
u'suffix': u'cloud.fedoraproject.org',
u'tld': u'cloud.fedoraproject.org',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'https://www.john.app.os.fedoraproject.org',
u'fld': u'john.app.os.fedoraproject.org',
u'subdomain': u'www',
u'domain': u'john',
u'suffix': u'app.os.fedoraproject.org',
u'tld': u'app.os.fedoraproject.org',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'ftp://www.xn--mxail5aa.xn--11b4c3d',
u'fld': u'xn--mxail5aa.xn--11b4c3d',
u'subdomain': u'www',
u'domain': u'xn--mxail5aa',
u'suffix': u'xn--11b4c3d',
u'tld': u'xn--11b4c3d',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://cloud.fedoraproject.org',
u'fld': u'cloud.fedoraproject.org',
u'subdomain': u'',
u'domain': u'cloud.fedoraproject.org',
u'suffix': u'cloud.fedoraproject.org',
u'tld': u'cloud.fedoraproject.org',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'github.io',
u'fld': u'github.io',
u'subdomain': u'',
u'domain': u'github.io',
u'suffix': u'github.io',
u'tld': u'github.io',
u'kwargs': {
u'fail_silently': True,
u'fix_protocol': True,
},
}, {
u'url': urlsplit(u'http://lemonde.fr/article.html'),
u'fld': u'lemonde.fr',
u'subdomain': u'',
u'domain': u'lemonde',
u'suffix': u'fr',
u'tld': u'fr',
u'kwargs': {
u'fail_silently': True,
},
}]
self.bad_patterns = {
u'v2.www.google.com': {
u'exception': TldBadUrl,
},
u'/index.php?a=1&b=2': {
u'exception': TldBadUrl,
},
u'http://www.tld.doesnotexist': {
u'exception': TldDomainNotFound,
},
u'https://2001:0db8:0000:85a3:0000:0000:ac1f:8001': {
u'exception': TldDomainNotFound,
},
u'http://192.169.1.1': {
u'exception': TldDomainNotFound,
},
u'http://localhost:8080': {
u'exception': TldDomainNotFound,
},
u'https://localhost': {
u'exception': TldDomainNotFound,
},
u'https://localhost2': {
u'exception': TldImproperlyConfigured,
u'kwargs': {
u'search_public': False,
u'search_private': False,
},
},
}
self.invalid_tlds = {u'v2.www.google.com', u'tld.doesnotexist',
u'2001:0db8:0000:85a3:0000:0000:ac1f', u'192.169.1.1', 'localhost', u'google.com'}
self.tld_names_local_path_custom = project_dir(
join(u'tests', u'res', u'effective_tld_names_custom.dat.txt'))
self.good_patterns_custom_parser = [{
u'url': u'http://www.foreverchild',
u'fld': u'www.foreverchild',
u'subdomain': u'',
u'domain': u'www',
u'suffix': u'foreverchild',
u'tld': u'foreverchild',
u'kwargs': {
u'fail_silently': True,
},
}, {
u'url': u'http://www.v2.foreverchild',
u'fld': u'v2.foreverchild',
u'subdomain': u'www',
u'domain': u'v2',
u'suffix': u'foreverchild',
u'tld': u'foreverchild',
u'kwargs': {
u'fail_silently': True,
},
}]
reset_settings()
def tearDown(self):
    u"""Restore settings and reset the parser registry after each test."""
    # Settings are restored first, then the registry is reset.
    reset_settings()
    Registry.reset()
@property
def good_url(self):
    u"""Return the ``url`` value of the first entry in ``self.good_patterns``."""
    first_pattern = self.good_patterns[0]
    return first_pattern[u'url']
@property
def bad_url(self):
    u"""Return the first key of ``self.bad_patterns``."""
    bad_urls = list(self.bad_patterns)
    return bad_urls[0]
def get_custom_parser_class(self, uid='custom_mozilla', source_url=None, local_path='tests/res/effective_tld_names_custom.dat.txt'):
    u"""Create a ``BaseMozillaTLDSourceParser`` subclass on the fly.

    :param uid: Value of the ``uid`` class attribute.
    :param source_url: Value of the ``source_url`` class attribute.
    :param local_path: Value of the ``local_path`` class attribute.
    :return: The newly created parser class.
    """
    # The attributes are handed to ``type`` so that they are already
    # present while the class object is being created.
    class_attrs = {}
    class_attrs['uid'] = uid
    class_attrs['source_url'] = source_url
    class_attrs['local_path'] = local_path
    bases = (BaseMozillaTLDSourceParser,)
    return type('CustomMozillaTLDSourceParser', bases, class_attrs)
@log_info
def test_0_tld_names_loaded(self):
    u"""Check that ``tld_names`` is non-empty after a ``get_fld`` call."""
    get_fld(u'http://www.google.co.uk')
    # Imported only after the lookup above, as in the original test.
    from ..utils import tld_names
    is_loaded = len(tld_names) > 0
    self.assertTrue(is_loaded)
    return is_loaded
@internet_available_only
@log_info
def test_1_update_tld_names(self):
    u"""Check that ``update_tld_names(fail_silently=False)`` returns a truthy value."""
    updated = update_tld_names(fail_silently=False)
    self.assertTrue(updated)
    return updated
@log_info
def test_2_fld_good_patterns_pass(self):
    u"""Check ``get_fld`` against the expected ``fld`` of every good pattern."""
    collected = []
    for pattern in self.good_patterns:
        fld = get_fld(pattern[u'url'], **pattern[u'kwargs'])
        self.assertEqual(fld, pattern[u'fld'])
        collected.append(fld)
    return collected
@log_info
def test_3_fld_bad_patterns_pass(self):
    u"""Check that ``get_fld`` returns None for every bad URL when failing silently."""
    collected = []
    # Only the URLs (dict keys) are needed here; the per-URL params are unused.
    for bad in self.bad_patterns:
        fld = get_fld(bad, fail_silently=True)
        self.assertEqual(fld, None)
        collected.append(fld)
    return collected
@log_info
def test_4_override_settings(self):
    u"""Check that ``set_setting`` changes what ``get_setting`` reports for ``DEBUG``."""
    def read_debug():
        u"""Return the current value of the ``DEBUG`` setting."""
        return get_setting(u'DEBUG')

    # Before the override the setting equals the packaged default.
    default_debug = defaults.DEBUG
    self.assertEqual(default_debug, read_debug())

    set_setting(u'DEBUG', True)
    self.assertEqual(True, read_debug())
    return read_debug()
@log_info
def test_5_tld_good_patterns_pass_parsed_object(self):
    u"""Check every field of ``get_tld(..., as_object=True)`` for the good patterns."""
    collected = []
    for pattern in self.good_patterns:
        # Copy so the fixture's kwargs are not modified.
        options = copy.copy(pattern[u'kwargs'])
        options[u'as_object'] = True
        parsed = get_tld(pattern[u'url'], **options)

        for field in (u'tld', u'subdomain', u'domain', u'suffix', u'fld'):
            self.assertEqual(getattr(parsed, field), pattern[field])

        self.assertEqual(unicode(parsed).encode(u'utf8'),
                         pattern[u'tld'].encode(u'utf8'))
        expected_state = {
            u'tld': parsed.tld,
            u'domain': parsed.domain,
            u'subdomain': parsed.subdomain,
            u'fld': parsed.fld,
            u'parsed_url': parsed.parsed_url,
        }
        self.assertEqual(parsed.__dict__, expected_state)
        collected.append(parsed)
    return collected
@log_info
def test_6_override_full_names_path(self):
    u"""Check that ``NAMES_LOCAL_PATH_PARENT`` changes what ``project_dir`` returns."""
    path_before = project_dir(u'dummy.txt')
    set_setting(u'NAMES_LOCAL_PATH_PARENT', u'/tmp/test')
    path_after = project_dir(u'dummy.txt')

    self.assertNotEqual(path_before, path_after)
    self.assertEqual(path_after, abspath(u'/tmp/test/dummy.txt'))
@log_info
def test_7_public_private(self):
    u"""Check how ``search_public`` / ``search_private`` affect ``get_fld``."""
    # (url, extra keyword arguments, expected result), evaluated in order.
    cases = (
        (u'http://silly.cc.ua',
         {u'search_private': False}, None),
        (u'http://silly.cc.ua',
         {u'search_private': True}, u'silly.cc.ua'),
        (u'mercy.compute.amazonaws.com',
         {u'search_private': False, u'fix_protocol': True}, None),
        (u'http://whatever.com',
         {u'search_public': False}, None),
    )
    for url, options, expected in cases:
        found = get_fld(url, fail_silently=True, **options)
        self.assertEqual(found, expected)
@log_info
def test_8_fld_bad_patterns_exceptions(self):
    u"""Check that ``get_fld`` raises the expected exception for each bad URL."""
    collected = []
    for url, params in self.bad_patterns.items():
        # Re-uses the fixture's own kwargs dict when one is present.
        options = params.get(u'kwargs', {})
        options[u'fail_silently'] = False
        with self.assertRaises(params[u'exception']):
            fld = get_fld(url, **options)
            # Only reached if no exception was raised.
            collected.append(fld)
    return collected
@log_info
def test_9_tld_good_patterns_pass(self):
    u"""Check ``get_tld`` against the expected ``tld`` of every good pattern."""
    collected = []
    for pattern in self.good_patterns:
        found = get_tld(pattern[u'url'], **pattern[u'kwargs'])
        self.assertEqual(found, pattern[u'tld'])
        collected.append(found)
    return collected
@log_info
def test_10_tld_bad_patterns_pass(self):
    u"""Check that ``get_tld`` returns None for every bad URL when failing silently."""
    collected = []
    # Only the URLs (dict keys) are needed here; the per-URL params are unused.
    for bad in self.bad_patterns:
        found = get_tld(bad, fail_silently=True)
        self.assertEqual(found, None)
        collected.append(found)
    return collected
@log_info
def test_11_parse_tld_good_patterns(self):
    u"""Check the ``(tld, domain, subdomain)`` tuple from ``parse_tld`` for the good patterns."""
    collected = []
    for pattern in self.good_patterns:
        parts = parse_tld(pattern[u'url'], **pattern[u'kwargs'])
        expected = (pattern[u'tld'], pattern[u'domain'], pattern[u'subdomain'])
        self.assertEqual(parts, expected)
        collected.append(parts)
    return collected
@log_info
def test_12_is_tld_good_patterns(self):
    u"""Check that ``is_tld`` accepts the expected ``tld`` of every good pattern."""
    for pattern in self.good_patterns:
        candidate = pattern[u'tld']
        self.assertTrue(is_tld(candidate))
@log_info
def test_13_is_tld_bad_patterns(self):
    u"""Check that ``is_tld`` rejects every value in ``self.invalid_tlds``."""
    for candidate in self.invalid_tlds:
        verdict = is_tld(candidate)
        self.assertFalse(verdict)
@log_info
def test_14_fail_update_tld_names(self):
    u"""Check ``update_tld_names`` failure modes for a parser with a bogus source URL.

    With ``fail_silently=False`` a ``TldIOError`` is expected; with
    ``fail_silently=True`` a falsy return value is expected.
    """
    broken_parser = self.get_custom_parser_class(
        uid='custom_mozilla_2', source_url='i-do-not-exist')
    broken_uid = broken_parser.uid

    with self.assertRaises(TldIOError):
        update_tld_names(fail_silently=False, parser_uid=broken_uid)

    silent_result = update_tld_names(fail_silently=True, parser_uid=broken_uid)
    self.assertFalse(silent_result)
@log_info
def test_15_fail_get_fld_wrong_kwargs(self):
    u"""Check that passing ``as_object`` to ``get_fld`` raises ``TldImproperlyConfigured``."""
    self.assertRaises(
        TldImproperlyConfigured, get_fld, self.good_url, as_object=True)
@log_info
def test_16_fail_parse_tld(self):
    u"""Check that ``parse_tld`` yields ``(None, None, None)`` for a bad URL.

    The call uses ``fail_silently=False`` and a custom parser whose
    ``source_url`` is bogus; the expected outcome is the all-None tuple
    rather than an exception.
    """
    broken_parser = self.get_custom_parser_class(source_url='i-do-not-exist')
    outcome = parse_tld(
        self.bad_url, fail_silently=False, parser_class=broken_parser)
    self.assertEqual(outcome, (None, None, None))
@log_info
def test_17_get_tld_names_and_reset_tld_names(self):
    u"""Check ``get_tld_names`` failure modes after ``reset_tld_names``.

    Both parsers point at a freshly generated temp file name and a bogus
    source URL. The loud variant must raise ``TldIOError``; the silent
    variant must return None.
    """
    def fresh_tmp_path():
        u"""Return a new ``<uuid>.dat.txt`` path inside the temp directory."""
        file_name = u'{}'.format(self.faker.uuid4()) + u'.dat.txt'
        return join(gettempdir(), file_name)

    loud_parser = self.get_custom_parser_class(
        source_url='i-do-not-exist', local_path=fresh_tmp_path())
    reset_tld_names()
    with self.assertRaises(TldIOError):
        get_tld_names(fail_silently=False, parser_class=loud_parser)

    silent_parser = self.get_custom_parser_class(
        source_url='i-do-not-exist-2', local_path=fresh_tmp_path())
    reset_tld_names()
    silent_result = get_tld_names(
        fail_silently=True, parser_class=silent_parser)
    self.assertIsNone(silent_result)
@internet_available_only
@log_info
def test_18_update_tld_names_cli(self):
    u"""Check that ``update_tld_names_cli`` returns 0 after a reset."""
    reset_tld_names()
    exit_code = update_tld_names_cli()
    self.assertEqual(exit_code, 0)
@log_info
def test_19_parse_tld_custom_tld_names_good_patterns(self):
    u"""Check ``parse_tld`` with a custom parser class against the custom good patterns."""
    collected = []
    for pattern in self.good_patterns_custom_parser:
        # Copy so the fixture's kwargs are not modified.
        options = copy.copy(pattern[u'kwargs'])
        options[u'parser_class'] = self.get_custom_parser_class()
        parts = parse_tld(pattern[u'url'], **options)
        expected = (pattern[u'tld'], pattern[u'domain'], pattern[u'subdomain'])
        self.assertEqual(parts, expected)
        collected.append(parts)
    return collected
@log_info
def test_20_tld_custom_tld_names_good_patterns_pass_parsed_object(self):
    u"""Check every field of ``get_tld(..., as_object=True)`` with a custom parser class."""
    collected = []
    for pattern in self.good_patterns_custom_parser:
        # Copy so the fixture's kwargs are not modified.
        options = copy.copy(pattern[u'kwargs'])
        options[u'as_object'] = True
        options[u'parser_class'] = self.get_custom_parser_class()
        parsed = get_tld(pattern[u'url'], **options)

        for field in (u'tld', u'subdomain', u'domain', u'suffix', u'fld'):
            self.assertEqual(getattr(parsed, field), pattern[field])

        self.assertEqual(unicode(parsed).encode(u'utf8'),
                         pattern[u'tld'].encode(u'utf8'))
        expected_state = {
            u'tld': parsed.tld,
            u'domain': parsed.domain,
            u'subdomain': parsed.subdomain,
            u'fld': parsed.fld,
            u'parsed_url': parsed.parsed_url,
        }
        self.assertEqual(parsed.__dict__, expected_state)
        collected.append(parsed)
    return collected
@log_info
def test_21_reset_tld_names_for_custom_parser(self):
    u"""Check that ``reset_tld_names(path)`` removes only that path's entry.

    The custom patterns are parsed first so that the custom parser's
    ``local_path`` is present in the container before the reset.
    """
    collected = []
    reference_parser = self.get_custom_parser_class()
    for pattern in self.good_patterns_custom_parser:
        # Copy so the fixture's kwargs are not modified.
        options = copy.copy(pattern[u'kwargs'])
        options[u'as_object'] = True
        options[u'parser_class'] = self.get_custom_parser_class()
        parsed = get_tld(pattern[u'url'], **options)

        for field in (u'tld', u'subdomain', u'domain', u'suffix', u'fld'):
            self.assertEqual(getattr(parsed, field), pattern[field])

        self.assertEqual(unicode(parsed).encode(u'utf8'),
                         pattern[u'tld'].encode(u'utf8'))
        expected_state = {
            u'tld': parsed.tld,
            u'domain': parsed.domain,
            u'subdomain': parsed.subdomain,
            u'fld': parsed.fld,
            u'parsed_url': parsed.parsed_url,
        }
        self.assertEqual(parsed.__dict__, expected_state)
        collected.append(parsed)

    container = get_tld_names_container()
    custom_path = reference_parser.local_path
    self.assertIn(custom_path, container)
    reset_tld_names(custom_path)
    self.assertNotIn(custom_path, container)
    return collected
@log_info
def test_22_fail_define_custom_parser_class_without_uid(self):
    u"""Check the errors raised by incomplete ``BaseTLDSourceParser`` subclasses.

    A subclass without ``uid`` must raise ``TldImproperlyConfigured`` from
    ``get_tld_names``; a subclass with ``uid`` but no implementation must
    raise ``NotImplementedError``.
    """
    class CustomParser(BaseTLDSourceParser):
        pass

    class AnotherCustomParser(BaseTLDSourceParser):
        uid = u'another-custom-parser'

    self.assertRaises(TldImproperlyConfigured, CustomParser.get_tld_names)
    self.assertRaises(NotImplementedError, AnotherCustomParser.get_tld_names)
@log_info
def test_23_len_trie_nodes(self):
    u"""Check that the default parser's trie reports a positive length after a lookup."""
    get_tld(u'http://delusionalinsanity.com')
    container = get_tld_names_container()
    default_trie = container[MozillaTLDSourceParser.local_path]
    self.assertGreater(len(default_trie), 0)
@log_info
def test_24_get_tld_names_no_arguments(self):
    u"""Check that ``get_tld_names()`` called without arguments returns something non-empty."""
    loaded = get_tld_names()
    self.assertGreater(len(loaded), 0)
# Run the test suite when this module is executed directly.
if __name__ == u'__main__':
    unittest.main()
+54
View File
@@ -0,0 +1,54 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'Trie', u'TrieNode')
class TrieNode(object):
    u"""One node of the TLD trie.

    ``children`` and ``exception`` start as None; ``leaf`` and ``private``
    start as False.
    """
    __slots__ = (u'children', u'exception', u'leaf', u'private')

    def __init__(self):
        # Flags first, then the lazily populated links.
        self.leaf = False
        self.private = False
        self.children = None
        self.exception = None
class Trie(object):
    u"""Trie keyed by domain labels, stored from the right-most label inwards."""

    def __init__(self):
        self.root = TrieNode()
        # Incremented once per ``add`` call; reported by ``len()``.
        self.__nodes = 0

    def __len__(self):
        u"""Return how many times ``add`` has been called."""
        return self.__nodes

    def add(self, tld, private=False):
        u"""Insert ``tld`` into the trie.

        :param tld: Dot-separated rule; a label starting with ``!`` is
            recorded as the current node's ``exception`` and ends the walk.
        :param private: When truthy, the final node is flagged as private.
        """
        current = self.root
        for label in reversed(tld.split(u'.')):
            if label.startswith(u'!'):
                current.exception = label[1:]
                break
            # The children dict is only created once a child is needed.
            if current.children is None:
                current.children = {}
            following = current.children.get(label)
            if following is None:
                following = TrieNode()
                current.children[label] = following
            current = following

        current.leaf = True
        if private:
            current.private = True
        self.__nodes += 1
+271
View File
@@ -0,0 +1,271 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import unicode_literals
import argparse
from codecs import open as codecs_open
from backports.functools_lru_cache import lru_cache
from os.path import isabs
import sys
from typing import Dict, Type, Union, Tuple, List
try:
from urllib.parse import urlsplit, SplitResult
except ImportError:
from six.moves.urllib_parse import urlsplit, SplitResult
from .base import BaseTLDSourceParser
from .exceptions import TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError
from .helpers import project_dir
from .trie import Trie
from .registry import Registry
from .result import Result
__author__ = u'Artur Barseghyan'
__copyright__ = u'2013-2019 Artur Barseghyan'
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
__all__ = (u'BaseMozillaTLDSourceParser', u'get_fld', u'get_tld', u'get_tld_names', u'get_tld_names_container', u'is_tld', u'MozillaTLDSourceParser', u'parse_tld',
u'pop_tld_names_container', u'process_url', u'reset_tld_names', u'Result', u'update_tld_names', u'update_tld_names_cli', u'update_tld_names_container')
# Shared module-level container; entries are added by
# ``update_tld_names_container`` and removed by ``pop_tld_names_container``.
tld_names = {}
def get_tld_names_container():
    u"""Return the module-level ``tld_names`` container (the object itself, not a copy).

    :return: The shared container.
    :rtype: dict
    """
    return tld_names
def update_tld_names_container(tld_names_local_path, trie_obj):
    u"""Store ``trie_obj`` in the ``tld_names`` container.

    :param tld_names_local_path: Key under which the object is stored.
    :param trie_obj: Object to store; replaces any existing entry.
    :return: None.
    """
    tld_names[tld_names_local_path] = trie_obj
def pop_tld_names_container(tld_names_local_path):
    u"""Drop one entry from the ``tld_names`` container, if present.

    :param tld_names_local_path: Key of the entry to remove; a missing key
        is ignored.
    :return: None.
    """
    if tld_names_local_path in tld_names:
        del tld_names[tld_names_local_path]
@lru_cache(maxsize=128, typed=True)
def update_tld_names(fail_silently=False, parser_uid=None):
    u"""Ask the registered parser(s) to update their TLD names.

    :param fail_silently: Forwarded to each parser's ``update_tld_names``.
    :param parser_uid: When given, only the parser registered under this
        uid is considered; otherwise every registered parser is.
    :return: ``all()`` over the individual results. This is True when no
        parser qualified.
    :rtype: bool
    """
    def candidate_parsers():
        u"""Yield the parser classes selected by ``parser_uid``."""
        if parser_uid:
            yield Registry.get(parser_uid, None)
        else:
            for (_uid, registered_cls) in Registry.items():
                yield registered_cls

    # Parsers that are missing or have no ``source_url`` are skipped.
    outcomes = [
        parser_cls.update_tld_names(fail_silently=fail_silently)
        for parser_cls in candidate_parsers()
        if (parser_cls and parser_cls.source_url)
    ]
    return all(outcomes)
def update_tld_names_cli():
    u"""Command-line entry point around ``update_tld_names``.

    Reads an optional positional ``parser_uid`` and a ``--fail-silently``
    flag from ``sys.argv``.

    :return: 0 when ``update_tld_names`` returned a truthy value, else 1.
    :rtype: int
    """
    cli = argparse.ArgumentParser(description='Update TLD names')
    cli.add_argument(
        u'parser_uid',
        nargs='?',
        default=None,
        help='UID of the parser to update TLD names for.',
    )
    cli.add_argument(
        u'--fail-silently',
        dest='fail_silently',
        default=False,
        action='store_true',
        help='Fail silently',
    )
    options = cli.parse_args(sys.argv[1:])
    succeeded = update_tld_names(
        parser_uid=options.parser_uid, fail_silently=options.fail_silently)
    # Success maps to exit status 0, failure to 1.
    return 0 if succeeded else 1
def get_tld_names(fail_silently=False, retry_count=0, parser_class=None):
    u"""Delegate to ``parser_class.get_tld_names``.

    :param fail_silently: Forwarded to the parser class.
    :param retry_count: Forwarded to the parser class.
    :param parser_class: Parser class to use; ``MozillaTLDSourceParser``
        when omitted or falsy.
    :type fail_silently: bool
    :type retry_count: int
    :return: Whatever the parser class's ``get_tld_names`` returns.
    """
    chosen_parser = parser_class or MozillaTLDSourceParser
    return chosen_parser.get_tld_names(
        fail_silently=fail_silently, retry_count=retry_count)
class BaseMozillaTLDSourceParser(BaseTLDSourceParser):
    u"""Parser that builds a ``Trie`` from the file at ``cls.local_path``."""

    @classmethod
    def get_tld_names(cls, fail_silently=False, retry_count=0):
        u"""Load ``cls.local_path`` into the shared ``tld_names`` container.

        If the file cannot be read (``IOError``), ``cls.update_tld_names``
        is called and loading is retried.

        :param fail_silently: If True, return None instead of raising.
        :param retry_count: Number of retries so far; more than one retry
            is treated as a failure.
        :return: The ``tld_names`` container, or None on silent failure.
        :raises TldIOError: When retries are exhausted and
            ``fail_silently`` is False.
        """
        if retry_count > 1:
            if fail_silently:
                return None
            raise TldIOError

        global tld_names
        _tld_names = tld_names

        # Already loaded for this parser: nothing to do.
        if _tld_names.get(cls.local_path) is not None:
            return _tld_names

        try:
            if isabs(cls.local_path):
                local_path = cls.local_path
            else:
                local_path = project_dir(cls.local_path)

            trie = Trie()
            trie_add = trie.add
            private_section = False

            # The ``with`` block closes the file on every exit path.
            with codecs_open(local_path, u'r', encoding='utf8') as local_file:
                for line in local_file:
                    # Everything after this marker is added as private.
                    if u'===BEGIN PRIVATE DOMAINS===' in line:
                        private_section = True

                    # For ``// xn--...`` comment lines, use the second
                    # whitespace-separated token as the rule.
                    if u'// xn--' in line:
                        line = line.split()[1]

                    # Skip remaining comment lines and blank lines.
                    if line[0] in (u'/', u'\n'):
                        continue

                    trie_add(line.strip(), private=private_section)

            update_tld_names_container(cls.local_path, trie)
        except IOError:
            # Local copy unreadable: ask for an update, then retry.
            cls.update_tld_names(fail_silently=fail_silently)
            retry_count += 1
            return cls.get_tld_names(
                fail_silently=fail_silently, retry_count=retry_count)
        except Exception:
            if fail_silently:
                return None
            # Bare ``raise`` keeps the original traceback.
            raise

        return _tld_names
class MozillaTLDSourceParser(BaseMozillaTLDSourceParser):
    u"""Default parser: Mozilla's effective TLD names list."""

    # Identifier used to look this parser up (see ``update_tld_names``).
    uid = u'mozilla'
    # Location of the local copy of the list.
    local_path = u'res/effective_tld_names.dat.txt'
    # Remote location of the list.
    source_url = u'http://mxr.mozilla.org/mozilla/source/netwerk/dns/src/effective_tld_names.dat?raw=1'
def process_url(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    u"""Split a URL's hostname and locate its public-suffix boundary.

    :param url: URL string or an already split ``SplitResult``.
    :param fail_silently: If True, failures are reported through the
        return value instead of an exception.
    :param fix_protocol: If True, ``https://`` is prepended to string URLs
        that do not start with ``//``, ``http://`` or ``https://``.
    :param search_public: If True, public rules may match.
    :param search_private: If True, private rules may match.
    :param parser_class: Parser class whose TLD names are used.
    :return: ``(domain_parts, non_zero_i, parsed_url)``. ``non_zero_i`` is
        -1 when the whole hostname is itself a suffix, otherwise the index
        in ``domain_parts`` at which the suffix starts (at least 1). On
        silent failure ``(None, None, parsed_url)`` is returned.
    :rtype: tuple
    :raises TldImproperlyConfigured: If both search flags are False.
    :raises TldBadUrl: If no hostname can be extracted.
    :raises TldDomainNotFound: If no acceptable suffix rule matches.
    """
    if not (search_public or search_private):
        raise TldImproperlyConfigured(
            u'Either `search_public` or `search_private` (or both) shall be set to True.')

    _tld_names = get_tld_names(
        fail_silently=fail_silently, parser_class=parser_class)

    if not isinstance(url, SplitResult):
        url = url.lower()
        if fix_protocol and not url.startswith((u'//', u'http://', u'https://')):
            url = u''.join([u'https://', u'{}'.format(url)])
        parsed_url = urlsplit(url)
    else:
        parsed_url = url

    domain_name = parsed_url.hostname
    if not domain_name:
        if fail_silently:
            return (None, None, parsed_url)
        raise TldBadUrl(url=url)

    # With ``fail_silently`` the loader may hand back None; honour the
    # silent-failure contract instead of failing on the subscript below.
    if _tld_names is None and fail_silently:
        return (None, None, parsed_url)

    domain_parts = domain_name.split(u'.')
    tld_names_local_path = parser_class.local_path
    node = _tld_names[tld_names_local_path].root

    current_length = 0
    tld_length = 0
    match = None
    len_domain_parts = len(domain_parts)

    # Walk the labels right-to-left, remembering the deepest leaf reached.
    for part in reversed(domain_parts):
        if node.children is None:
            break
        # An exception rule stops the match at the current depth.
        if part == node.exception:
            break
        child = node.children.get(part)
        if child is None:
            # Fall back to a wildcard child when no exact label matches.
            child = node.children.get(u'*')
        if child is None:
            break
        current_length += 1
        node = child
        if node.leaf:
            tld_length = current_length
            match = node

    # Reject a missing match, or one excluded by the public/private flags.
    if ((match is None) or (not match.leaf)
            or ((not search_public) and (not match.private))
            or ((not search_private) and match.private)):
        if fail_silently:
            return (None, None, parsed_url)
        raise TldDomainNotFound(domain_name=domain_name)

    if len_domain_parts == tld_length:
        # The whole hostname is a suffix.
        non_zero_i = -1
    else:
        non_zero_i = max(1, len_domain_parts - tld_length)

    return (domain_parts, non_zero_i, parsed_url)
def get_fld(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser, **kwargs):
    u"""Extract the first level domain of ``url``.

    :param url: URL to extract the first level domain from.
    :param fail_silently: If True, None is returned on failure instead of
        raising.
    :param fix_protocol: Forwarded to ``process_url``.
    :param search_public: Forwarded to ``process_url``.
    :param search_private: Forwarded to ``process_url``.
    :param parser_class: Forwarded to ``process_url``.
    :return: The first level domain as a string, the bare hostname when the
        hostname is itself a suffix, or None on silent failure.
    :raises TldImproperlyConfigured: If ``as_object`` is passed.
    """
    if u'as_object' in kwargs:
        raise TldImproperlyConfigured(
            u'`as_object` argument is deprecated for `get_fld`. Use `get_tld` instead.')

    labels, suffix_start, parsed_url = process_url(
        url=url,
        fail_silently=fail_silently,
        fix_protocol=fix_protocol,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )

    if labels is None:
        return None
    if suffix_start < 0:
        # The whole hostname is a suffix.
        return parsed_url.hostname
    # One label to the left of the suffix, plus the suffix itself.
    return u'.'.join(labels[suffix_start - 1:])
def get_tld(url, fail_silently=False, as_object=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    u"""Extract the top level domain of ``url``.

    :param url: URL to extract the top level domain from.
    :param fail_silently: If True, None is returned on failure instead of
        raising.
    :param as_object: If True, a ``Result`` object is returned instead of
        a string.
    :param fix_protocol: Forwarded to ``process_url``.
    :param search_public: Forwarded to ``process_url``.
    :param search_private: Forwarded to ``process_url``.
    :param parser_class: Forwarded to ``process_url``.
    :return: The TLD string, a ``Result`` when ``as_object`` is True, or
        None on silent failure.
    """
    labels, suffix_start, parsed_url = process_url(
        url=url,
        fail_silently=fail_silently,
        fix_protocol=fix_protocol,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )
    if labels is None:
        return None

    # A negative index means the whole hostname is a suffix.
    hostname_is_suffix = suffix_start < 0

    if not as_object:
        if hostname_is_suffix:
            return parsed_url.hostname
        return u'.'.join(labels[suffix_start:])

    if hostname_is_suffix:
        subdomain = u''
        domain = u''
        _tld = parsed_url.hostname
    else:
        before_suffix = suffix_start - 1
        subdomain = u'.'.join(labels[:before_suffix])
        domain = u'.'.join(labels[before_suffix:suffix_start])
        _tld = u'.'.join(labels[suffix_start:])

    return Result(subdomain=subdomain, domain=domain, tld=_tld, parsed_url=parsed_url)
def parse_tld(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    u"""Parse a URL into ``(tld, domain, subdomain)``.

    Never raises the ``Tld*`` exceptions listed below; any of them is
    reported as ``(None, None, None)``.

    :param url: URL to parse.
    :param fail_silently: Forwarded to ``get_tld``.
    :param fix_protocol: Forwarded to ``get_tld``.
    :param search_public: Forwarded to ``get_tld``.
    :param search_private: Forwarded to ``get_tld``.
    :param parser_class: Forwarded to ``get_tld``.
    :return: ``(tld, domain, subdomain)`` or ``(None, None, None)``.
    :rtype: tuple
    """
    try:
        obj = get_tld(url, fail_silently=fail_silently, as_object=True, fix_protocol=fix_protocol,
                      search_public=search_public, search_private=search_private, parser_class=parser_class)
    except (TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError):
        return (None, None, None)

    # With ``fail_silently=True`` a failed lookup returns None rather than
    # raising; without this guard the attribute access below would raise
    # AttributeError.
    if obj is None:
        return (None, None, None)

    return (obj.tld, obj.domain, obj.subdomain)
def is_tld(value, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    u"""Tell whether ``value`` is itself a top level domain.

    ``value`` is run through ``get_tld`` (silently, with protocol fixing)
    and compared with the result.

    :param value: Candidate string.
    :param search_public: Forwarded to ``get_tld``.
    :param search_private: Forwarded to ``get_tld``.
    :param parser_class: Forwarded to ``get_tld``.
    :type value: str
    :return: True when ``get_tld`` returns exactly ``value``.
    :rtype: bool
    """
    extracted = get_tld(
        url=value,
        fail_silently=True,
        fix_protocol=True,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )
    return value == extracted
def reset_tld_names(tld_names_local_path=None):
    u"""Clear the loaded TLD names, fully or for one path.

    :param tld_names_local_path: When given (and truthy), only this entry
        is removed from ``tld_names``; otherwise ``tld_names`` is rebound
        to a new empty dict.
    :type tld_names_local_path: str
    :return: None.
    """
    global tld_names
    if not tld_names_local_path:
        # Rebinds the module global to a fresh dict.
        tld_names = {}
        return
    pop_tld_names_container(tld_names_local_path)
File diff suppressed because it is too large Load Diff
+9
View File
@@ -94,6 +94,15 @@ the.vbm, mmgoodnow, Vertig0ne, thliu78, tattoomees, ostman, count_confucius, ehe
## Changelog
2.6.5.3223
subscene, addic7ed
- Either of those providers might impose a reCAPTCHA verification. In order to use those providers, please create an account at an AntiCaptcha service ([anti-captcha.com](http://getcaptchasolution.com/kkvviom7nh) or [deathbycaptcha.com](http://deathbycaptcha.com)), add funds, then supply your credentials/API key in the configuration.
Changelog
- core: scoring: reorder subtitles based on second non-hash-score if main hash score is the same; morpheus65535/bazarr#821
- providers: bsplayer: verify hash; clean up
2.6.5.3217