Compare commits
14 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| fb8bfeb044 | |||
| e083e133eb | |||
| c787e671c3 | |||
| 31ff93c3f1 | |||
| 289f174e2b | |||
| ea03f3fc4d | |||
| 740fc93c13 | |||
| e94bd3fcb9 | |||
| dba469750b | |||
| c7fe6076cb | |||
| 356f578014 | |||
| b151ed4c55 | |||
| 9455e3b52b | |||
| 60e2656541 |
@@ -15,7 +15,8 @@ import subliminal
|
||||
import subliminal_patch
|
||||
import subzero.constants
|
||||
import lib
|
||||
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError
|
||||
from subliminal.exceptions import ServiceUnavailable, DownloadLimitExceeded, AuthenticationError, \
|
||||
DownloadLimitPerDayExceeded
|
||||
from subliminal_patch.core import is_windows_special_path
|
||||
from whichdb import whichdb
|
||||
|
||||
@@ -61,12 +62,14 @@ def int_or_default(s, default):
|
||||
return default
|
||||
|
||||
|
||||
VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, ServiceUnavailable, APIThrottled)
|
||||
VALID_THROTTLE_EXCEPTIONS = (TooManyRequests, DownloadLimitExceeded, DownloadLimitPerDayExceeded,
|
||||
ServiceUnavailable, APIThrottled)
|
||||
|
||||
PROVIDER_THROTTLE_MAP = {
|
||||
"default": {
|
||||
TooManyRequests: (datetime.timedelta(hours=1), "1 hour"),
|
||||
DownloadLimitExceeded: (datetime.timedelta(hours=3), "3 hours"),
|
||||
DownloadLimitPerDayExceeded: (datetime.timedelta(hours=4), "4 hours"),
|
||||
ServiceUnavailable: (datetime.timedelta(minutes=20), "20 minutes"),
|
||||
APIThrottled: (datetime.timedelta(minutes=10), "10 minutes"),
|
||||
AuthenticationError: (datetime.timedelta(hours=2), "2 hours"),
|
||||
@@ -873,6 +876,7 @@ class Config(object):
|
||||
|
||||
provider_settings = {'addic7ed': {'username': Prefs['provider.addic7ed.username'],
|
||||
'password': Prefs['provider.addic7ed.password'],
|
||||
'is_vip': cast_bool(Prefs['provider.addic7ed.is_vip']),
|
||||
},
|
||||
'opensubtitles': {'username': Prefs['provider.opensubtitles.username'],
|
||||
'password': Prefs['provider.opensubtitles.password'],
|
||||
|
||||
@@ -375,6 +375,12 @@
|
||||
"default": "",
|
||||
"secure": "true"
|
||||
},
|
||||
{
|
||||
"id": "provider.addic7ed.is_vip",
|
||||
"label": "Addic7ed VIP? (80 vs 40 downloads per day)",
|
||||
"type": "bool",
|
||||
"default": "false"
|
||||
},
|
||||
{
|
||||
"id": "provider.addic7ed.boost_by2",
|
||||
"label": "Addic7ed: boost score (if requirements met)",
|
||||
|
||||
+3
-3
@@ -13,7 +13,7 @@
|
||||
<key>CFBundleSignature</key>
|
||||
<string>????</string>
|
||||
<key>CFBundleVersion</key>
|
||||
<string>2.6.5.3223</string>
|
||||
<string>2.6.5.3237</string>
|
||||
<key>PlexFrameworkVersion</key>
|
||||
<string>2</string>
|
||||
<key>PlexPluginClass</key>
|
||||
@@ -23,7 +23,7 @@
|
||||
<key>PlexPluginConsoleLogging</key>
|
||||
<string>0</string>
|
||||
<key>PlexPluginDevMode</key>
|
||||
<string>0</string>
|
||||
<string>1</string>
|
||||
<key>PlexPluginCodePolicy</key>
|
||||
<!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
|
||||
<string>Elevated</string>
|
||||
@@ -32,7 +32,7 @@
|
||||
|
||||
<h1>Sub-Zero for Plex</h1><i>Subtitles done right</i>
|
||||
|
||||
Version 2.6.5.3223
|
||||
Version 2.6.5.3237 DEV
|
||||
|
||||
Originally based on @bramwalet's awesome <a href="https://github.com/bramwalet/Subliminal.bundle">Subliminal.bundle</a>
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
# pkgutil-style namespace package: extend this package's search path with
# same-named package directories found elsewhere on sys.path, so several
# distributions can each contribute submodules to this one package.
__path__ = __import__('pkgutil').extend_path(__path__, __name__)
|
||||
@@ -0,0 +1,196 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import functools
|
||||
from collections import namedtuple
|
||||
from threading import RLock
|
||||
|
||||
# Record type returned by the cache_info() helper that lru_cache attaches to
# every decorated function: hit count, miss count, configured bound, and the
# number of entries currently stored.
_CacheInfo = namedtuple("CacheInfo", ["hits", "misses", "maxsize", "currsize"])
|
||||
|
||||
|
||||
@functools.wraps(functools.update_wrapper)
def update_wrapper(
    wrapper,
    wrapped,
    assigned=functools.WRAPPER_ASSIGNMENTS,
    updated=functools.WRAPPER_UPDATES,
):
    """
    Replacement for functools.update_wrapper that patches two of its bugs.

    Only the names in *assigned* that *wrapped* actually provides are copied
    onto *wrapper*, and ``wrapper.__wrapped__`` is always set to *wrapped*.
    Returns *wrapper*.
    """
    # http://bugs.python.org/issue3445 -- skip attributes the wrapped
    # object does not have instead of letting the copy fail.
    available = tuple(filter(lambda name: hasattr(wrapped, name), assigned))
    patched = functools.update_wrapper(wrapper, wrapped, available, updated)
    # https://bugs.python.org/issue17482 -- always expose the wrapped callable.
    patched.__wrapped__ = wrapped
    return patched
|
||||
|
||||
|
||||
class _HashedSeq(list):
    """List whose hash is computed once, from the sequence it was built from.

    The stored value is returned by ``__hash__``, so repeated hashing of the
    same key object does not recompute the hash of its contents.
    """

    __slots__ = 'hashvalue'

    def __init__(self, tup, hash=hash):
        # Fill the list first, then remember the hash of the source sequence.
        super(_HashedSeq, self).__init__(tup)
        self.hashvalue = hash(tup)

    def __hash__(self):
        return self.hashvalue
|
||||
|
||||
|
||||
def _make_key(
    args,
    kwds,
    typed,
    kwd_mark=(object(),),
    fasttypes={int, str, frozenset, type(None)},
    sorted=sorted,
    tuple=tuple,
    type=type,
    len=len,
):
    """Make a cache key from optionally typed positional and keyword arguments.

    The key is the positional arguments, followed (when keyword arguments are
    present) by a separator marker and the flattened, sorted keyword items.
    With *typed* true, the types of all argument values are appended as well.
    A lone argument of one of the *fasttypes* is returned as the key itself
    when *typed* is false; every other key is wrapped in a _HashedSeq.
    """
    cache_key = args
    ordered_kwds = sorted(kwds.items()) if kwds else ()
    if kwds:
        cache_key += kwd_mark
        cache_key += tuple(part for pair in ordered_kwds for part in pair)

    if not typed:
        # Shortcut: a single argument of a known type can serve as its own key.
        if len(cache_key) == 1 and type(cache_key[0]) in fasttypes:
            return cache_key[0]
        return _HashedSeq(cache_key)

    cache_key += tuple(type(value) for value in args)
    if kwds:
        cache_key += tuple(type(value) for _, value in ordered_kwds)
    return _HashedSeq(cache_key)
|
||||
|
||||
|
||||
def lru_cache(maxsize=100, typed=False):
    """Least-recently-used cache decorator.

    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.  If *maxsize* is 0 or negative, nothing is cached
    and every call is counted as a miss.

    If *typed* is True, arguments of different types will be cached separately.
    For example, f(3.0) and f(3) will be treated as distinct calls with
    distinct results.

    Arguments to the cached function must be hashable.

    View the cache statistics named tuple (hits, misses, maxsize, currsize) with
    f.cache_info().  Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.

    See:  http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used

    """

    # Users should only access the lru_cache through its public API:
    #       cache_info, cache_clear, and f.__wrapped__
    # The internals of the lru_cache are encapsulated for thread safety and
    # to allow the implementation to change (including a possible C version).

    # A negative bound would send the size-limited wrapper straight into its
    # eviction branch with an empty cache, where `del cache[oldkey]` raises
    # KeyError on the very first call.  Treat it as "no caching" instead,
    # which is what CPython 3.8+ does.
    if isinstance(maxsize, int) and maxsize < 0:
        maxsize = 0

    def decorating_function(user_function):

        cache = dict()
        stats = [0, 0]                  # make statistics updateable non-locally
        HITS, MISSES = 0, 1             # names for the stats fields
        make_key = _make_key
        cache_get = cache.get           # bound method to lookup key or return None
        _len = len                      # localize the global len() function
        lock = RLock()                  # because linkedlist updates aren't threadsafe
        root = []                       # root of the circular doubly linked list
        root[:] = [root, root, None, None]      # initialize by pointing to self
        nonlocal_root = [root]                  # make updateable non-locally
        PREV, NEXT, KEY, RESULT = 0, 1, 2, 3    # names for the link fields

        if maxsize == 0:

            def wrapper(*args, **kwds):
                # no caching, just do a statistics update after a successful call
                result = user_function(*args, **kwds)
                stats[MISSES] += 1
                return result

        elif maxsize is None:

            def wrapper(*args, **kwds):
                # simple caching without ordering or size limit
                key = make_key(args, kwds, typed)
                # root used here as a unique not-found sentinel
                result = cache_get(key, root)
                if result is not root:
                    stats[HITS] += 1
                    return result
                result = user_function(*args, **kwds)
                cache[key] = result
                stats[MISSES] += 1
                return result

        else:

            def wrapper(*args, **kwds):
                # size limited caching that tracks accesses by recency
                key = make_key(args, kwds, typed) if kwds or typed else args
                with lock:
                    link = cache_get(key)
                    if link is not None:
                        # record recent use of the key by moving it
                        # to the front of the list
                        root, = nonlocal_root
                        link_prev, link_next, key, result = link
                        link_prev[NEXT] = link_next
                        link_next[PREV] = link_prev
                        last = root[PREV]
                        last[NEXT] = root[PREV] = link
                        link[PREV] = last
                        link[NEXT] = root
                        stats[HITS] += 1
                        return result
                # the user function runs outside the lock so that slow or
                # re-entrant calls do not block other threads
                result = user_function(*args, **kwds)
                with lock:
                    root, = nonlocal_root
                    if key in cache:
                        # getting here means that this same key was added to the
                        # cache while the lock was released.  since the link
                        # update is already done, we need only return the
                        # computed result and update the count of misses.
                        pass
                    elif _len(cache) >= maxsize:
                        # use the old root to store the new key and result
                        oldroot = root
                        oldroot[KEY] = key
                        oldroot[RESULT] = result
                        # empty the oldest link and make it the new root
                        root = nonlocal_root[0] = oldroot[NEXT]
                        oldkey = root[KEY]
                        root[KEY] = root[RESULT] = None
                        # now update the cache dictionary for the new links
                        del cache[oldkey]
                        cache[key] = oldroot
                    else:
                        # put result in a new link at the front of the list
                        last = root[PREV]
                        link = [last, root, key, result]
                        last[NEXT] = root[PREV] = cache[key] = link
                    stats[MISSES] += 1
                return result

        def cache_info():
            """Report cache statistics"""
            with lock:
                return _CacheInfo(stats[HITS], stats[MISSES], maxsize, len(cache))

        def cache_clear():
            """Clear the cache and cache statistics"""
            with lock:
                cache.clear()
                root = nonlocal_root[0]
                root[:] = [root, root, None, None]
                stats[:] = [0, 0]

        wrapper.__wrapped__ = user_function
        wrapper.cache_info = cache_info
        wrapper.cache_clear = cache_clear
        return update_wrapper(wrapper, user_function)

    return decorating_function
|
||||
@@ -2,6 +2,20 @@ import logging
|
||||
import re
|
||||
import sys
|
||||
import ssl
|
||||
import requests
|
||||
|
||||
try:
|
||||
import copyreg
|
||||
except ImportError:
|
||||
import copy_reg as copyreg
|
||||
|
||||
try:
|
||||
from HTMLParser import HTMLParser
|
||||
except ImportError:
|
||||
if sys.version_info >= (3, 4):
|
||||
import html
|
||||
else:
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from copy import deepcopy
|
||||
from time import sleep
|
||||
@@ -9,9 +23,17 @@ from collections import OrderedDict
|
||||
|
||||
from requests.sessions import Session
|
||||
from requests.adapters import HTTPAdapter
|
||||
from requests.packages.urllib3.util.ssl_ import create_urllib3_context
|
||||
|
||||
from .exceptions import (
|
||||
CloudflareLoopProtection,
|
||||
CloudflareCode1020,
|
||||
CloudflareIUAMError,
|
||||
CloudflareReCaptchaError,
|
||||
CloudflareReCaptchaProvider
|
||||
)
|
||||
|
||||
from .interpreters import JavaScriptInterpreter
|
||||
from .reCaptcha import reCaptcha
|
||||
from .user_agent import User_Agent
|
||||
|
||||
try:
|
||||
@@ -25,219 +47,540 @@ except ImportError:
|
||||
pass
|
||||
|
||||
try:
|
||||
from urlparse import urlparse
|
||||
from urlparse import urlunparse
|
||||
from urlparse import urlparse, urljoin
|
||||
except ImportError:
|
||||
from urllib.parse import urlparse
|
||||
from urllib.parse import urlunparse
|
||||
from urllib.parse import urlparse, urljoin
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
__version__ = '1.1.9'
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
|
||||
__version__ = '1.2.31'
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class CipherSuiteAdapter(HTTPAdapter):
|
||||
|
||||
def __init__(self, cipherSuite=None, **kwargs):
|
||||
self.cipherSuite = cipherSuite
|
||||
__attrs__ = [
|
||||
'ssl_context',
|
||||
'max_retries',
|
||||
'config',
|
||||
'_pool_connections',
|
||||
'_pool_maxsize',
|
||||
'_pool_block'
|
||||
]
|
||||
|
||||
if hasattr(ssl, 'PROTOCOL_TLS'):
|
||||
self.ssl_context = create_urllib3_context(
|
||||
ssl_version=getattr(ssl, 'PROTOCOL_TLSv1_3', ssl.PROTOCOL_TLSv1_2),
|
||||
ciphers=self.cipherSuite
|
||||
)
|
||||
else:
|
||||
self.ssl_context = create_urllib3_context(ssl_version=ssl.PROTOCOL_TLSv1)
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.ssl_context = kwargs.pop('ssl_context', None)
|
||||
self.cipherSuite = kwargs.pop('cipherSuite', None)
|
||||
|
||||
if not self.ssl_context:
|
||||
self.ssl_context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
|
||||
self.ssl_context.set_ciphers(self.cipherSuite)
|
||||
self.ssl_context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1)
|
||||
|
||||
super(CipherSuiteAdapter, self).__init__(**kwargs)
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def init_poolmanager(self, *args, **kwargs):
|
||||
kwargs['ssl_context'] = self.ssl_context
|
||||
return super(CipherSuiteAdapter, self).init_poolmanager(*args, **kwargs)
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def proxy_manager_for(self, *args, **kwargs):
|
||||
kwargs['ssl_context'] = self.ssl_context
|
||||
return super(CipherSuiteAdapter, self).proxy_manager_for(*args, **kwargs)
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class CloudScraper(Session):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.debug = kwargs.pop('debug', False)
|
||||
self.delay = kwargs.pop('delay', None)
|
||||
self.interpreter = kwargs.pop('interpreter', 'js2py')
|
||||
self.allow_brotli = kwargs.pop('allow_brotli', True if 'brotli' in sys.modules.keys() else False)
|
||||
self.cipherSuite = None
|
||||
self.cipherSuite = kwargs.pop('cipherSuite', None)
|
||||
self.interpreter = kwargs.pop('interpreter', 'native')
|
||||
self.recaptcha = kwargs.pop('recaptcha', {})
|
||||
self.allow_brotli = kwargs.pop(
|
||||
'allow_brotli',
|
||||
True if 'brotli' in sys.modules.keys() else False
|
||||
)
|
||||
|
||||
self.user_agent = User_Agent(
|
||||
allow_brotli=self.allow_brotli,
|
||||
browser=kwargs.pop('browser', None)
|
||||
)
|
||||
|
||||
self._solveDepthCnt = 0
|
||||
self.solveDepth = kwargs.pop('solveDepth', 3)
|
||||
|
||||
super(CloudScraper, self).__init__(*args, **kwargs)
|
||||
|
||||
# pylint: disable=E0203
|
||||
if 'requests' in self.headers['User-Agent']:
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Set a random User-Agent if no custom User-Agent has been set
|
||||
self.headers = User_Agent(allow_brotli=self.allow_brotli).headers
|
||||
# ------------------------------------------------------------------------------- #
|
||||
self.headers = self.user_agent.headers
|
||||
if not self.cipherSuite:
|
||||
self.cipherSuite = self.user_agent.cipherSuite
|
||||
|
||||
self.mount('https://', CipherSuiteAdapter(self.loadCipherSuite()))
|
||||
if isinstance(self.cipherSuite, list):
|
||||
self.cipherSuite = ':'.join(self.cipherSuite)
|
||||
|
||||
##########################################################################################################################################################
|
||||
self.mount(
|
||||
'https://',
|
||||
CipherSuiteAdapter(
|
||||
cipherSuite=self.cipherSuite
|
||||
)
|
||||
)
|
||||
|
||||
# purely to allow us to pickle dump
|
||||
copyreg.pickle(ssl.SSLContext, lambda obj: (obj.__class__, (obj.protocol,)))
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Allow us to pickle our session back with all variables
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def __getstate__(self):
|
||||
return self.__dict__
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Raise an Exception with no stacktrace and reset depth counter.
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def simpleException(self, exception, msg):
|
||||
self._solveDepthCnt = 0
|
||||
sys.tracebacklimit = 0
|
||||
raise exception(msg)
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# debug the request via the response
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@staticmethod
|
||||
def debugRequest(req):
|
||||
try:
|
||||
print(dump.dump_all(req).decode('utf-8'))
|
||||
except: # noqa
|
||||
pass
|
||||
except ValueError as e:
|
||||
print("Debug Error: {}".format(getattr(e, 'message', e)))
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Unescape / decode html entities
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def loadCipherSuite(self):
|
||||
if self.cipherSuite:
|
||||
return self.cipherSuite
|
||||
@staticmethod
|
||||
def unescape(html_text):
|
||||
if sys.version_info >= (3, 0):
|
||||
if sys.version_info >= (3, 4):
|
||||
return html.unescape(html_text)
|
||||
|
||||
self.cipherSuite = ''
|
||||
return HTMLParser().unescape(html_text)
|
||||
|
||||
if hasattr(ssl, 'PROTOCOL_TLS'):
|
||||
ciphers = [
|
||||
'ECDHE-ECDSA-AES128-GCM-SHA256', 'ECDHE-RSA-AES128-GCM-SHA256', 'ECDHE-ECDSA-AES256-GCM-SHA384',
|
||||
'ECDHE-RSA-AES256-GCM-SHA384', 'ECDHE-ECDSA-CHACHA20-POLY1305-SHA256', 'ECDHE-RSA-CHACHA20-POLY1305-SHA256',
|
||||
'ECDHE-RSA-AES128-CBC-SHA', 'ECDHE-RSA-AES256-CBC-SHA', 'RSA-AES128-GCM-SHA256', 'RSA-AES256-GCM-SHA384',
|
||||
'ECDHE-RSA-AES128-GCM-SHA256', 'RSA-AES256-SHA', '3DES-EDE-CBC'
|
||||
]
|
||||
return HTMLParser().unescape(html_text)
|
||||
|
||||
if hasattr(ssl, 'PROTOCOL_TLSv1_3'):
|
||||
ciphers.insert(0, ['GREASE_3A', 'GREASE_6A', 'AES128-GCM-SHA256', 'AES256-GCM-SHA256', 'AES256-GCM-SHA384', 'CHACHA20-POLY1305-SHA256'])
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Decode Brotli on older versions of urllib3 manually
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ctx = ssl.SSLContext(getattr(ssl, 'PROTOCOL_TLSv1_3', ssl.PROTOCOL_TLSv1_2))
|
||||
|
||||
for cipher in ciphers:
|
||||
try:
|
||||
ctx.set_ciphers(cipher)
|
||||
self.cipherSuite = '{}:{}'.format(self.cipherSuite, cipher).rstrip(':')
|
||||
except ssl.SSLError:
|
||||
pass
|
||||
|
||||
return self.cipherSuite
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
def request(self, method, url, *args, **kwargs):
|
||||
ourSuper = super(CloudScraper, self)
|
||||
resp = ourSuper.request(method, url, *args, **kwargs)
|
||||
|
||||
if resp.headers.get('Content-Encoding') == 'br':
|
||||
def decodeBrotli(self, resp):
|
||||
if requests.packages.urllib3.__version__ < '1.25.1' and resp.headers.get('Content-Encoding') == 'br':
|
||||
if self.allow_brotli and resp._content:
|
||||
resp._content = brotli.decompress(resp.content)
|
||||
else:
|
||||
logging.warning('Brotli content detected, But option is disabled, we will not continue.')
|
||||
return resp
|
||||
logging.warning(
|
||||
'You\'re running urllib3 {}, Brotli content detected, '
|
||||
'Which requires manual decompression, '
|
||||
'But option allow_brotli is set to False, '
|
||||
'We will not continue to decompress.'.format(requests.packages.urllib3.__version__)
|
||||
)
|
||||
|
||||
return resp
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Our hijacker request function
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def request(self, method, url, *args, **kwargs):
|
||||
# pylint: disable=E0203
|
||||
if kwargs.get('proxies') and kwargs.get('proxies') != self.proxies:
|
||||
self.proxies = kwargs.get('proxies')
|
||||
|
||||
resp = self.decodeBrotli(
|
||||
super(CloudScraper, self).request(method, url, *args, **kwargs)
|
||||
)
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Debug request
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
if self.debug:
|
||||
self.debugRequest(resp)
|
||||
|
||||
# Check if Cloudflare anti-bot is on
|
||||
if self.isChallengeRequest(resp):
|
||||
if resp.request.method != 'GET':
|
||||
# Work around if the initial request is not a GET,
|
||||
# Supersede with a GET then re-request the original METHOD.
|
||||
self.request('GET', resp.url)
|
||||
resp = ourSuper.request(method, url, *args, **kwargs)
|
||||
else:
|
||||
# Solve Challenge
|
||||
resp = self.sendChallengeResponse(resp, **kwargs)
|
||||
if self.is_Challenge_Request(resp):
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Try to solve the challenge and send it back
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
if self._solveDepthCnt >= self.solveDepth:
|
||||
_ = self._solveDepthCnt
|
||||
self.simpleException(
|
||||
CloudflareLoopProtection,
|
||||
"!!Loop Protection!! We have tried to solve {} time(s) in a row.".format(_)
|
||||
)
|
||||
|
||||
self._solveDepthCnt += 1
|
||||
|
||||
resp = self.Challenge_Response(resp, **kwargs)
|
||||
else:
|
||||
if not resp.is_redirect and resp.status_code not in [429, 503]:
|
||||
self._solveDepthCnt = 0
|
||||
|
||||
return resp
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# check if the response contains a valid Cloudflare challenge
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@staticmethod
|
||||
def isChallengeRequest(resp):
|
||||
if resp.headers.get('Server', '').startswith('cloudflare'):
|
||||
if b'why_captcha' in resp.content or b'/cdn-cgi/l/chk_captcha' in resp.content:
|
||||
raise ValueError('Captcha')
|
||||
|
||||
def is_IUAM_Challenge(resp):
|
||||
try:
|
||||
return (
|
||||
resp.status_code in [429, 503]
|
||||
and all(s in resp.content for s in [b'jschl_vc', b'jschl_answer'])
|
||||
resp.headers.get('Server', '').startswith('cloudflare')
|
||||
and resp.status_code in [429, 503]
|
||||
and re.search(
|
||||
r'action="/.*?__cf_chl_jschl_tk__=\S+".*?name="jschl_vc"\svalue=.*?',
|
||||
resp.text,
|
||||
re.M | re.DOTALL
|
||||
)
|
||||
)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
def sendChallengeResponse(self, resp, **original_kwargs):
|
||||
body = resp.text
|
||||
|
||||
# Cloudflare requires a delay before solving the challenge
|
||||
if not self.delay:
|
||||
try:
|
||||
delay = float(re.search(r'submit\(\);\r?\n\s*},\s*([0-9]+)', body).group(1)) / float(1000)
|
||||
if isinstance(delay, (int, float)):
|
||||
self.delay = delay
|
||||
except: # noqa
|
||||
pass
|
||||
|
||||
sleep(self.delay)
|
||||
|
||||
parsed_url = urlparse(resp.url)
|
||||
domain = parsed_url.netloc
|
||||
submit_url = '{}://{}/cdn-cgi/l/chk_jschl'.format(parsed_url.scheme, domain)
|
||||
|
||||
cloudflare_kwargs = deepcopy(original_kwargs)
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# check if the response contains a valid Cloudflare reCaptcha challenge
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@staticmethod
|
||||
def is_reCaptcha_Challenge(resp):
|
||||
try:
|
||||
params = OrderedDict()
|
||||
|
||||
s = re.search(r'name="s"\svalue="(?P<s_value>[^"]+)', body)
|
||||
if s:
|
||||
params['s'] = s.group('s_value')
|
||||
|
||||
params.update(
|
||||
[
|
||||
('jschl_vc', re.search(r'name="jschl_vc" value="(\w+)"', body).group(1)),
|
||||
('pass', re.search(r'name="pass" value="(.+?)"', body).group(1))
|
||||
]
|
||||
)
|
||||
|
||||
params = cloudflare_kwargs.setdefault('params', params)
|
||||
|
||||
except Exception as e:
|
||||
raise ValueError('Unable to parse Cloudflare anti-bots page: {} {}'.format(e.message, BUG_REPORT))
|
||||
|
||||
# Solve the Javascript challenge
|
||||
params['jschl_answer'] = JavaScriptInterpreter.dynamicImport(self.interpreter).solveChallenge(body, domain)
|
||||
|
||||
# Requests transforms any request into a GET after a redirect,
|
||||
# so the redirect has to be handled manually here to allow for
|
||||
# performing other types of requests even as the first request.
|
||||
|
||||
cloudflare_kwargs['allow_redirects'] = False
|
||||
|
||||
redirect = self.request(resp.request.method, submit_url, **cloudflare_kwargs)
|
||||
redirect_location = urlparse(redirect.headers['Location'])
|
||||
if not redirect_location.netloc:
|
||||
redirect_url = urlunparse(
|
||||
(
|
||||
parsed_url.scheme,
|
||||
domain,
|
||||
redirect_location.path,
|
||||
redirect_location.params,
|
||||
redirect_location.query,
|
||||
redirect_location.fragment
|
||||
return (
|
||||
resp.headers.get('Server', '').startswith('cloudflare')
|
||||
and resp.status_code == 403
|
||||
and re.search(
|
||||
r'action="/.*?__cf_chl_captcha_tk__=\S+".*?data\-sitekey=.*?',
|
||||
resp.text,
|
||||
re.M | re.DOTALL
|
||||
)
|
||||
)
|
||||
return self.request(resp.request.method, redirect_url, **original_kwargs)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
return self.request(resp.request.method, redirect.headers['Location'], **original_kwargs)
|
||||
return False
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# check if the response contains Firewall 1020 Error
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@staticmethod
|
||||
def is_Firewall_Blocked(resp):
|
||||
try:
|
||||
return (
|
||||
resp.headers.get('Server', '').startswith('cloudflare')
|
||||
and resp.status_code == 403
|
||||
and re.search(
|
||||
r'<span class="cf-error-code">1020</span>',
|
||||
resp.text,
|
||||
re.M | re.DOTALL
|
||||
)
|
||||
)
|
||||
except AttributeError:
|
||||
pass
|
||||
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Wrapper for is_reCaptcha_Challenge, is_IUAM_Challenge, is_Firewall_Blocked
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def is_Challenge_Request(self, resp):
|
||||
if self.is_Firewall_Blocked(resp):
|
||||
self.simpleException(
|
||||
CloudflareCode1020,
|
||||
'Cloudflare has blocked this request (Code 1020 Detected).'
|
||||
)
|
||||
|
||||
if self.is_reCaptcha_Challenge(resp) or self.is_IUAM_Challenge(resp):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Try to solve cloudflare javascript challenge.
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def IUAM_Challenge_Response(self, body, url, interpreter):
|
||||
try:
|
||||
formPayload = re.search(
|
||||
r'<form (?P<form>id="challenge-form" action="(?P<challengeUUID>.*?'
|
||||
r'__cf_chl_jschl_tk__=\S+)"(.*?)</form>)',
|
||||
body,
|
||||
re.M | re.DOTALL
|
||||
).groupdict()
|
||||
|
||||
if not all(key in formPayload for key in ['form', 'challengeUUID']):
|
||||
self.simpleException(
|
||||
CloudflareIUAMError,
|
||||
"Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
|
||||
)
|
||||
|
||||
payload = OrderedDict(
|
||||
re.findall(
|
||||
r'name="(r|jschl_vc|pass)"\svalue="(.*?)"',
|
||||
formPayload['form']
|
||||
)
|
||||
)
|
||||
|
||||
except AttributeError:
|
||||
self.simpleException(
|
||||
CloudflareIUAMError,
|
||||
"Cloudflare IUAM detected, unfortunately we can't extract the parameters correctly."
|
||||
)
|
||||
|
||||
hostParsed = urlparse(url)
|
||||
|
||||
try:
|
||||
payload['jschl_answer'] = JavaScriptInterpreter.dynamicImport(
|
||||
interpreter
|
||||
).solveChallenge(body, hostParsed.netloc)
|
||||
except Exception as e:
|
||||
self.simpleException(
|
||||
CloudflareIUAMError,
|
||||
'Unable to parse Cloudflare anti-bots page: {}'.format(
|
||||
getattr(e, 'message', e)
|
||||
)
|
||||
)
|
||||
|
||||
return {
|
||||
'url': '{}://{}{}'.format(
|
||||
hostParsed.scheme,
|
||||
hostParsed.netloc,
|
||||
self.unescape(formPayload['challengeUUID'])
|
||||
),
|
||||
'data': payload
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Try to solve the reCaptcha challenge via 3rd party.
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def reCaptcha_Challenge_Response(self, provider, provider_params, body, url):
|
||||
try:
|
||||
formPayload = re.search(
|
||||
r'<form class="challenge-form" (?P<form>id="challenge-form" '
|
||||
r'action="(?P<challengeUUID>.*?__cf_chl_captcha_tk__=\S+)"(.*?)</form>)',
|
||||
body,
|
||||
re.M | re.DOTALL
|
||||
).groupdict()
|
||||
|
||||
if not all(key in formPayload for key in ['form', 'challengeUUID']):
|
||||
self.simpleException(
|
||||
CloudflareReCaptchaError,
|
||||
"Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
|
||||
)
|
||||
|
||||
payload = OrderedDict(
|
||||
re.findall(
|
||||
r'(name="r"\svalue|data-ray|data-sitekey)="(.*?)"',
|
||||
formPayload['form']
|
||||
)
|
||||
)
|
||||
except (AttributeError):
|
||||
self.simpleException(
|
||||
CloudflareReCaptchaError,
|
||||
"Cloudflare reCaptcha detected, unfortunately we can't extract the parameters correctly."
|
||||
)
|
||||
|
||||
hostParsed = urlparse(url)
|
||||
|
||||
return {
|
||||
'url': '{}://{}{}'.format(
|
||||
hostParsed.scheme,
|
||||
hostParsed.netloc,
|
||||
self.unescape(formPayload['challengeUUID'])
|
||||
),
|
||||
'data': OrderedDict([
|
||||
('r', payload.get('name="r" value', '')),
|
||||
('id', payload.get('data-ray')),
|
||||
(
|
||||
'g-recaptcha-response',
|
||||
reCaptcha.dynamicImport(
|
||||
provider.lower()
|
||||
).solveCaptcha(
|
||||
url,
|
||||
payload['data-sitekey'],
|
||||
provider_params
|
||||
)
|
||||
)
|
||||
])
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Attempt to handle and send the challenge response back to cloudflare
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def Challenge_Response(self, resp, **kwargs):
    """Answer the Cloudflare challenge found in ``resp``.

    A reCaptcha challenge is re-requested once and, if still present,
    delegated to ``reCaptcha_Challenge_Response``; any other challenge is
    treated as IUAM: wait ``self.delay`` seconds, then build the answer with
    ``IUAM_Challenge_Response``. The answer is POSTed back and a redirect,
    if any, is followed using the original request method.

    :param resp: the response that contained the challenge.
    :param kwargs: keyword arguments of the original request; they are
        deep-copied before being altered for the follow-up requests.
    :return: the response obtained after handling the challenge, or ``resp``
        itself when the reCaptcha provider is ``'return_response'``.
    """

    def merge_into(container, key, extra):
        # Update container[key] with extra, starting from an empty dict when
        # the entry is missing or has no update() method.
        try:
            container[key].update(extra)
        except (AttributeError, KeyError):
            container[key] = {}
            container[key].update(extra)
        return container[key]

    if self.is_reCaptcha_Challenge(resp):
        # ------------------------------------------------------------------------------- #
        # double down on the request as some websites are only checking
        # if cfuid is populated before issuing reCaptcha.
        # ------------------------------------------------------------------------------- #

        retried = super(CloudScraper, self).request(resp.request.method, resp.url, **kwargs)
        resp = self.decodeBrotli(retried)

        if not self.is_reCaptcha_Challenge(resp):
            return resp

        # ------------------------------------------------------------------------------- #
        # if no reCaptcha provider raise a runtime error.
        # ------------------------------------------------------------------------------- #

        provider_ready = (
            self.recaptcha
            and isinstance(self.recaptcha, dict)
            and self.recaptcha.get('provider')
        )
        if not provider_ready:
            self.simpleException(
                CloudflareReCaptchaProvider,
                "Cloudflare reCaptcha detected, unfortunately you haven't loaded an anti reCaptcha provider "
                "correctly via the 'recaptcha' parameter."
            )

        # ------------------------------------------------------------------------------- #
        # if provider is return_response, return the response without doing anything.
        # ------------------------------------------------------------------------------- #

        if self.recaptcha.get('provider') == 'return_response':
            return resp

        self.recaptcha['proxies'] = self.proxies
        submit_url = self.reCaptcha_Challenge_Response(
            self.recaptcha.get('provider'),
            self.recaptcha,
            resp.text,
            resp.url
        )
    else:
        # ------------------------------------------------------------------------------- #
        # Cloudflare requires a delay before solving the challenge
        # ------------------------------------------------------------------------------- #

        if not self.delay:
            try:
                delay_match = re.search(
                    r'submit\(\);\r?\n\s*},\s*([0-9]+)',
                    resp.text
                )
                # The page states the delay in milliseconds.
                parsed_delay = float(delay_match.group(1)) / float(1000)
                if isinstance(parsed_delay, (int, float)):
                    self.delay = parsed_delay
            except (AttributeError, ValueError):
                self.simpleException(
                    CloudflareIUAMError,
                    "Cloudflare IUAM possibility malformed, issue extracing delay value."
                )

        sleep(self.delay)

        # ------------------------------------------------------------------------------- #

        submit_url = self.IUAM_Challenge_Response(
            resp.text,
            resp.url,
            self.interpreter
        )

    # ------------------------------------------------------------------------------- #
    # We shouldn't be here...
    # Re-request the original query and/or process again....
    # ------------------------------------------------------------------------------- #

    if not submit_url:
        return self.request(resp.request.method, resp.url, **kwargs)

    # ------------------------------------------------------------------------------- #
    # Send the Challenge Response back to Cloudflare
    # ------------------------------------------------------------------------------- #

    submit_kwargs = deepcopy(kwargs)
    submit_kwargs['allow_redirects'] = False
    submit_kwargs['data'] = merge_into(submit_kwargs, 'data', submit_url['data'])

    origin = urlparse(resp.url)
    submit_kwargs['headers'] = merge_into(
        submit_kwargs,
        'headers',
        {
            'Origin': '{}://{}'.format(origin.scheme, origin.netloc),
            'Referer': resp.url
        }
    )

    submit_resp = self.request('POST', submit_url['url'], **submit_kwargs)

    # ------------------------------------------------------------------------------- #
    # Return response if Cloudflare is doing content pass through instead of 3xx
    # else request with redirect URL also handle protocol scheme change http -> https
    # ------------------------------------------------------------------------------- #

    if not submit_resp.is_redirect:
        return submit_resp

    follow_kwargs = deepcopy(kwargs)
    follow_kwargs['headers'] = merge_into(
        follow_kwargs,
        'headers',
        {'Referer': submit_resp.url}
    )

    redirect_location = submit_resp.headers['Location']
    if not urlparse(redirect_location).netloc:
        # Relative redirect: resolve it against the URL we just posted to.
        redirect_location = urljoin(submit_resp.url, redirect_location)

    return self.request(
        resp.request.method,
        redirect_location,
        **follow_kwargs
    )
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@classmethod
|
||||
def create_scraper(cls, sess=None, **kwargs):
|
||||
@@ -247,24 +590,30 @@ class CloudScraper(Session):
|
||||
scraper = cls(**kwargs)
|
||||
|
||||
if sess:
|
||||
attrs = ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']
|
||||
for attr in attrs:
|
||||
for attr in ['auth', 'cert', 'cookies', 'headers', 'hooks', 'params', 'proxies', 'data']:
|
||||
val = getattr(sess, attr, None)
|
||||
if val:
|
||||
setattr(scraper, attr, val)
|
||||
|
||||
return scraper
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
# Functions for integrating cloudscraper with other applications and scripts
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@classmethod
|
||||
def get_tokens(cls, url, **kwargs):
|
||||
scraper = cls.create_scraper(
|
||||
debug=kwargs.pop('debug', False),
|
||||
delay=kwargs.pop('delay', None),
|
||||
interpreter=kwargs.pop('interpreter', 'js2py'),
|
||||
allow_brotli=kwargs.pop('allow_brotli', True),
|
||||
**{
|
||||
field: kwargs.pop(field, None) for field in [
|
||||
'allow_brotli',
|
||||
'browser',
|
||||
'debug',
|
||||
'delay',
|
||||
'interpreter',
|
||||
'recaptcha'
|
||||
] if field in kwargs
|
||||
}
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -283,7 +632,11 @@ class CloudScraper(Session):
|
||||
cookie_domain = d
|
||||
break
|
||||
else:
|
||||
raise ValueError('Unable to find Cloudflare cookies. Does the site actually have Cloudflare IUAM ("I\'m Under Attack Mode") enabled?')
|
||||
cls.simpleException(
|
||||
CloudflareIUAMError,
|
||||
"Unable to find Cloudflare cookies. Does the site actually "
|
||||
"have Cloudflare IUAM (I'm Under Attack Mode) enabled?"
|
||||
)
|
||||
|
||||
return (
|
||||
{
|
||||
@@ -293,7 +646,7 @@ class CloudScraper(Session):
|
||||
scraper.headers['User-Agent']
|
||||
)
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@classmethod
|
||||
def get_cookie_string(cls, url, **kwargs):
|
||||
@@ -304,7 +657,18 @@ class CloudScraper(Session):
|
||||
return '; '.join('='.join(pair) for pair in tokens.items()), user_agent
|
||||
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
# Warn at import time when the linked OpenSSL is older than 1.1.1.
if ssl.OPENSSL_VERSION_INFO < (1, 1, 1):
    print(
        ''.join((
            "DEPRECATION: The OpenSSL being used by this python install ({}) does not meet the minimum supported ",
            "version (>= OpenSSL 1.1.1) in order to support TLS 1.3 required by Cloudflare, ",
            "You may encounter an unexpected reCaptcha or cloudflare 1020 blocks."
        )).format(ssl.OPENSSL_VERSION)
    )
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
create_scraper = CloudScraper.create_scraper
|
||||
get_tokens = CloudScraper.get_tokens
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
"""
|
||||
cloudscraper.exceptions
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
This module contains the set of cloudscraper exceptions.
|
||||
"""
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class CloudflareException(Exception):
    """Root of every Cloudflare-related exception raised by cloudscraper."""


class CloudflareLoopProtection(CloudflareException):
    """Raised by the recursive depth (loop) protection."""


class CloudflareCode1020(CloudflareException):
    """Raised when Cloudflare answers with a code 1020 block."""


class CloudflareIUAMError(CloudflareException):
    """Raised when the IUAM parameters cannot be extracted from the Cloudflare payload."""


class CloudflareReCaptchaError(CloudflareException):
    """Raised when the reCaptcha parameters cannot be extracted from the Cloudflare payload."""


class CloudflareReCaptchaProvider(CloudflareException):
    """Raised when no reCaptcha provider has been loaded for Cloudflare."""
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class reCaptchaException(Exception):
    """Root of every exception raised by cloudscraper reCaptcha providers."""


class reCaptchaServiceUnavailable(reCaptchaException):
    """Raised when an external service cannot be reached."""


class reCaptchaAPIError(reCaptchaException):
    """Raised when the provider's API response reports an error."""


class reCaptchaAccountError(reCaptchaException):
    """Raised for a problem with the reCaptcha provider account."""


class reCaptchaTimeout(reCaptchaException):
    """Raised when the reCaptcha provider takes too long."""


class reCaptchaParameter(reCaptchaException):
    """Raised for a bad or missing parameter."""


class reCaptchaBadJobID(reCaptchaException):
    """Raised for an invalid job id."""


class reCaptchaReportError(reCaptchaException):
    """Raised when the reCaptcha provider is unable to report a bad solve."""
|
||||
@@ -1,4 +1,3 @@
|
||||
import re
|
||||
import sys
|
||||
import logging
|
||||
import abc
|
||||
@@ -8,20 +7,24 @@ if sys.version_info >= (3, 4):
|
||||
else:
|
||||
ABC = abc.ABCMeta('ABC', (), {})
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
interpreters = {}
|
||||
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class JavaScriptInterpreter(ABC):
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@abc.abstractmethod
|
||||
def __init__(self, name):
|
||||
interpreters[name] = self
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@classmethod
|
||||
def dynamicImport(cls, name):
|
||||
if name not in interpreters:
|
||||
@@ -35,55 +38,17 @@ class JavaScriptInterpreter(ABC):
|
||||
|
||||
return interpreters[name]
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
@abc.abstractmethod
|
||||
def eval(self, jsEnv, js):
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def solveChallenge(self, body, domain):
|
||||
try:
|
||||
js = re.search(
|
||||
r'setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n',
|
||||
body
|
||||
).group(1)
|
||||
except Exception:
|
||||
raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT))
|
||||
|
||||
js = re.sub(r'\s{2,}', ' ', js, flags=re.MULTILINE | re.DOTALL).replace('\'; 121\'', '')
|
||||
js += '\na.value;'
|
||||
|
||||
jsEnv = '''
|
||||
String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
|
||||
var document = {{
|
||||
createElement: function () {{
|
||||
return {{ firstChild: {{ href: "https://{domain}/" }} }}
|
||||
}},
|
||||
getElementById: function () {{
|
||||
return {{"innerHTML": "{innerHTML}"}};
|
||||
}}
|
||||
}};
|
||||
'''
|
||||
|
||||
try:
|
||||
innerHTML = re.search(
|
||||
r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)</div>',
|
||||
body,
|
||||
re.MULTILINE | re.DOTALL
|
||||
)
|
||||
innerHTML = innerHTML.group(2) if innerHTML else ''
|
||||
|
||||
except: # noqa
|
||||
logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
|
||||
raise
|
||||
|
||||
try:
|
||||
result = self.eval(
|
||||
re.sub(r'\s{2,}', ' ', jsEnv.format(domain=domain, innerHTML=innerHTML), flags=re.MULTILINE | re.DOTALL),
|
||||
js
|
||||
)
|
||||
|
||||
float(result)
|
||||
return float(self.eval(body, domain))
|
||||
except Exception:
|
||||
logging.error('Error executing Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
|
||||
raise
|
||||
|
||||
return result
|
||||
|
||||
@@ -0,0 +1,103 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import os
|
||||
import sys
|
||||
import ctypes.util
|
||||
|
||||
from ctypes import c_void_p, c_size_t, byref, create_string_buffer, CDLL
|
||||
|
||||
from . import JavaScriptInterpreter
|
||||
from .encapsulated import template
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class ChallengeInterpreter(JavaScriptInterpreter):
    """Challenge interpreter that runs the IUAM script through the ChakraCore library."""

    # ------------------------------------------------------------------------------- #

    def __init__(self):
        super(ChallengeInterpreter, self).__init__('chakracore')

    # ------------------------------------------------------------------------------- #

    def eval(self, body, domain):
        """Run the challenge built by ``template(body, domain)`` in ChakraCore.

        :param body: text of the challenge page, passed to ``template``.
        :param domain: domain of the challenged site, passed to ``template``.
        :return: the value of the string buffer holding the script result.
        :raises RuntimeError: when the ChakraCore library cannot be found or
            cannot be loaded.
        """
        library_path = None

        # check current working directory; the last file found is the one used.
        for library_name in ('libChakraCore.so', 'libChakraCore.dylib', 'ChakraCore.dll'):
            candidate = os.path.join(os.getcwd(), library_name)
            if os.path.isfile(candidate):
                library_path = candidate

        # fall back to the system library search.
        if not library_path:
            library_path = ctypes.util.find_library('ChakraCore')

        if not library_path:
            sys.tracebacklimit = 0
            raise RuntimeError(
                'ChakraCore library not found in current path or any of your system library paths, '
                'please download from https://www.github.com/VeNoMouS/cloudscraper/tree/ChakraCore/, '
                'or https://github.com/Microsoft/ChakraCore/'
            )

        try:
            engine = CDLL(library_path)
        except OSError:
            sys.tracebacklimit = 0
            raise RuntimeError('There was an error loading the ChakraCore library {}'.format(library_path))

        if sys.platform != 'win32':
            engine.DllMain(0, 1, 0)
            engine.DllMain(0, 2, 0)

        source_name = 'iuam-challenge.js'
        source_buffer = create_string_buffer(template(body, domain).encode('utf-16'))

        runtime_handle = c_void_p()
        engine.JsCreateRuntime(0, 0, byref(runtime_handle))

        context_handle = c_void_p()
        engine.JsCreateContext(runtime_handle, byref(context_handle))
        engine.JsSetCurrentContext(context_handle)

        name_handle = c_void_p()
        engine.JsCreateString(source_name, len(source_name), byref(name_handle))

        script_handle = c_void_p()
        engine.JsCreateExternalArrayBuffer(
            source_buffer,
            len(source_buffer),
            0,
            0,
            byref(script_handle)
        )

        run_result = c_void_p()
        engine.JsRun(script_handle, 0, name_handle, 0x02, byref(run_result))

        result_string = c_void_p()
        engine.JsConvertValueToString(run_result, byref(result_string))

        # First call only asks for the length, second call copies the text.
        result_length = c_size_t()
        engine.JsCopyString(result_string, 0, 0, byref(result_length))

        output = create_string_buffer(result_length.value + 1)
        engine.JsCopyString(
            result_string,
            byref(output),
            result_length.value + 1,
            0
        )

        engine.JsDisposeRuntime(runtime_handle)

        return output.value
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ChallengeInterpreter()
|
||||
@@ -0,0 +1,58 @@
|
||||
import logging
|
||||
import re
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
def template(body, domain):
    """Build the JavaScript payload for a Cloudflare IUAM challenge page.

    The challenge script is cut out of ``body`` and prefixed with a small
    ``document`` stub that exposes ``domain`` (through ``createElement``) and
    the text of the first ``<div ... id="...">`` element (through
    ``getElementById``). Runs of two or more whitespace characters are
    collapsed to a single space in both parts.

    :param body: text of the challenge page.
    :param domain: domain of the challenged site.
    :return: the JavaScript source, ending with ``a.value;`` so the
        evaluated answer is the value of the script.
    :raises ValueError: when the challenge script cannot be found in ``body``.
    """
    BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'

    try:
        js = re.search(
            r'setTimeout\(function\(\){\s+(var s,t,o,p,b,r,e,a,k,i,n,g,f.+?\r?\n[\s\S]+?a\.value =.+?)\r?\n',
            body
        ).group(1)
    except Exception:
        raise ValueError('Unable to identify Cloudflare IUAM Javascript on website. {}'.format(BUG_REPORT))

    js = re.sub(r'\s{2,}', ' ', js, flags=re.MULTILINE | re.DOTALL).replace('\'; 121\'', '')
    js += '\na.value;'

    jsEnv = '''
        String.prototype.italics=function(str) {{return "<i>" + this + "</i>";}};
        var document = {{
            createElement: function () {{
                return {{ firstChild: {{ href: "https://{domain}/" }} }}
            }},
            getElementById: function () {{
                return {{"innerHTML": "{innerHTML}"}};
            }}
        }};
    '''

    try:
        innerHTML = re.search(
            r'<div(?: [^<>]*)? id="([^<>]*?)">([^<>]*?)</div>',
            body,
            re.MULTILINE | re.DOTALL
        )
        innerHTML = innerHTML.group(2) if innerHTML else ''
    except Exception:
        logging.error('Error extracting Cloudflare IUAM Javascript. {}'.format(BUG_REPORT))
        raise

    # flags must be passed by keyword: the 4th positional argument of re.sub
    # is ``count``, so the flag value used to cap the number of replacements.
    environment = re.sub(
        r'\s{2,}',
        ' ',
        jsEnv.format(domain=domain, innerHTML=innerHTML),
        flags=re.MULTILINE | re.DOTALL
    )

    return '{}{}'.format(environment, js)
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
@@ -6,27 +6,39 @@ import base64
|
||||
|
||||
from . import JavaScriptInterpreter
|
||||
|
||||
from .encapsulated import template
|
||||
from .jsunfuck import jsunfuck
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class ChallengeInterpreter(JavaScriptInterpreter):
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def __init__(self):
|
||||
super(ChallengeInterpreter, self).__init__('js2py')
|
||||
|
||||
def eval(self, jsEnv, js):
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def eval(self, body, domain):
|
||||
|
||||
jsPayload = template(body, domain)
|
||||
|
||||
if js2py.eval_js('(+(+!+[]+[+!+[]]+(!![]+[])[!+[]+!+[]+!+[]]+[!+[]+!+[]]+[+[]])+[])[+!+[]]') == '1':
|
||||
logging.warning('WARNING - Please upgrade your js2py https://github.com/PiotrDabkowski/Js2Py, applying work around for the meantime.')
|
||||
js = jsunfuck(js)
|
||||
jsPayload = jsunfuck(jsPayload)
|
||||
|
||||
def atob(s):
|
||||
return base64.b64decode('{}'.format(s)).decode('utf-8')
|
||||
|
||||
js2py.disable_pyimport()
|
||||
context = js2py.EvalJs({'atob': atob})
|
||||
result = context.eval('{}{}'.format(jsEnv, js))
|
||||
result = context.eval(jsPayload)
|
||||
|
||||
return result
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ChallengeInterpreter()
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
import operator as op
|
||||
|
||||
from . import JavaScriptInterpreter
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class ChallengeInterpreter(JavaScriptInterpreter):
    """Challenge interpreter that solves the IUAM arithmetic in pure Python."""

    def __init__(self):
        super(ChallengeInterpreter, self).__init__('native')

    def eval(self, body, domain):
        """Compute the IUAM challenge answer without a JavaScript engine.

        :param body: text of the challenge page.
        :param domain: domain of the challenged site; its characters and its
            length can take part in the computation.
        :return: the answer formatted with ten decimal places.
        """

        # ------------------------------------------------------------------------------- #

        arithmetic = {
            '+': op.add,
            '-': op.sub,
            '*': op.mul,
            '/': op.truediv
        }

        # ------------------------------------------------------------------------------- #

        def decode_number(obfuscated):
            # Each parenthesised group sums to one number; the sums are
            # concatenated as text and read back as a single integer.
            simplified = (
                obfuscated
                .replace('!+[]', '1')
                .replace('!![]', '1')
                .replace('[]', '0')
                .lstrip('+')
                .replace('(+', '(')
            )
            pieces = [
                str(sum(int(term) for term in re.findall(r'-?\d+', group)))
                for group in re.findall(r'\((?:\d|\+|\-)*\)', simplified)
            ]
            return int(''.join(pieces))

        # ------------------------------------------------------------------------------- #

        def solve_division(payload, needle, domain):
            # payload is "<numerator>/<denominator>"; the denominator may
            # combine a number with the code of one character of the domain.
            parts = payload.split('/')
            divisor_source = parts[1]

            if needle not in divisor_source:
                divisor = decode_number(divisor_source)
            else:
                left_operand, symbol = re.findall(r"^(.*?)(.)\(function", divisor_source)[0]
                marker = '"("+p+")")}'
                index_source = divisor_source[divisor_source.find(marker) + len(marker):-2]
                divisor = arithmetic[symbol](
                    float(decode_number(left_operand)),
                    float(ord(domain[decode_number(index_source)]))
                )

            return decode_number(parts[0]) / float(divisor)

        # ------------------------------------------------------------------------------- #

        parsed = re.search(
            r"setTimeout\(function\(\){\s+var.*?f,\s*(?P<variable>\w+).*?:(?P<init>\S+)};"
            r".*?\('challenge-form'\);\s+;(?P<challenge>.*?a\.value)"
            r"(?:.*id=\"cf-dn-.*?>(?P<k>\S+)<)?",
            body,
            re.DOTALL | re.MULTILINE
        ).groupdict()

        variable = parsed['variable']
        steps = re.finditer(
            r'{}.*?([+\-*/])=(.*?);(?=a\.value|{})'.format(variable, variable),
            parsed['challenge']
        )

        # ------------------------------------------------------------------------------- #

        initial = parsed['init']
        if '/' in initial:
            initial_parts = initial.split('/')
            answer = decode_number(initial_parts[0]) / float(decode_number(initial_parts[1]))
        else:
            answer = decode_number(initial)

        # ------------------------------------------------------------------------------- #

        for step in steps:
            symbol, expression = step.groups()

            if '/' in expression:
                value = solve_division(expression, 'function(p)', domain)
            elif 'Element' in expression:
                value = solve_division(parsed['k'], '"("+p+")")}', domain)
            else:
                value = decode_number(expression)

            answer = arithmetic[symbol](answer, value)

        # ------------------------------------------------------------------------------- #

        if not parsed['k'] and '+ t.length' in body:
            answer += len(domain)

        # ------------------------------------------------------------------------------- #

        return '{0:.10f}'.format(answer)
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ChallengeInterpreter()
|
||||
@@ -1,22 +1,23 @@
|
||||
import base64
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
from . import JavaScriptInterpreter
|
||||
from .encapsulated import template
|
||||
|
||||
##########################################################################################################################################################
|
||||
|
||||
BUG_REPORT = 'Cloudflare may have changed their technique, or there may be a bug in the script.'
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class ChallengeInterpreter(JavaScriptInterpreter):
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def __init__(self):
|
||||
super(ChallengeInterpreter, self).__init__('nodejs')
|
||||
|
||||
def eval(self, jsEnv, js):
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def eval(self, body, domain):
|
||||
try:
|
||||
js = 'var atob = function(str) {return Buffer.from(str, "base64").toString("binary");};' \
|
||||
'var challenge = atob("%s");' \
|
||||
@@ -24,23 +25,25 @@ class ChallengeInterpreter(JavaScriptInterpreter):
|
||||
'var options = {filename: "iuam-challenge.js", timeout: 4000};' \
|
||||
'var answer = require("vm").runInNewContext(challenge, context, options);' \
|
||||
'process.stdout.write(String(answer));' \
|
||||
% base64.b64encode('{}{}'.format(jsEnv, js).encode('UTF-8')).decode('ascii')
|
||||
% base64.b64encode(template(body, domain).encode('UTF-8')).decode('ascii')
|
||||
|
||||
return subprocess.check_output(['node', '-e', js])
|
||||
|
||||
except OSError as e:
|
||||
if e.errno == 2:
|
||||
raise EnvironmentError(
|
||||
'Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`). Your Node binary may be called `nodejs` rather than `node`, '
|
||||
'in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems. (Please read the cloudscraper'
|
||||
' README\'s Dependencies section: https://github.com/VeNoMouS/cloudscraper#dependencies.'
|
||||
'Missing Node.js runtime. Node is required and must be in the PATH (check with `node -v`).\n\n'
|
||||
'Your Node binary may be called `nodejs` rather than `node`, '
|
||||
'in which case you may need to run `apt-get install nodejs-legacy` on some Debian-based systems.\n\n'
|
||||
'(Please read the cloudscraper README\'s Dependencies section: '
|
||||
'https://github.com/VeNoMouS/cloudscraper#dependencies.)'
|
||||
)
|
||||
raise
|
||||
except Exception:
|
||||
logging.error('Error executing Cloudflare IUAM Javascript. %s' % BUG_REPORT)
|
||||
raise
|
||||
sys.tracebacklimit = 0
|
||||
raise RuntimeError('Error executing Cloudflare IUAM Javascript in nodejs')
|
||||
|
||||
pass
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ChallengeInterpreter()
|
||||
|
||||
@@ -0,0 +1,33 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import sys
|
||||
|
||||
try:
|
||||
import v8eval
|
||||
except ImportError:
|
||||
sys.tracebacklimit = 0
|
||||
raise RuntimeError('Please install the python module v8eval either via pip or download it from https://github.com/sony/v8eval')
|
||||
|
||||
from . import JavaScriptInterpreter
|
||||
from .encapsulated import template
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class ChallengeInterpreter(JavaScriptInterpreter):
    """Challenge interpreter that evaluates the IUAM script with v8eval."""

    def __init__(self):
        super(ChallengeInterpreter, self).__init__('v8')

    # ------------------------------------------------------------------------------- #

    def eval(self, body, domain):
        """Evaluate the challenge built by ``template(body, domain)`` in V8.

        :param body: text of the challenge page, passed to ``template``.
        :param domain: domain of the challenged site, passed to ``template``.
        :return: whatever ``v8eval.V8().eval`` returns for the script.
        :raises RuntimeError: when V8 fails with ``TypeError`` or
            ``v8eval.V8Error``.
        """
        try:
            return v8eval.V8().eval(template(body, domain))
        except (TypeError, v8eval.V8Error):
            # The exception must actually be raised; merely constructing it
            # made failures fall through and return None.
            raise RuntimeError('We encountered an error running the V8 Engine.')
|
||||
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
ChallengeInterpreter()
|
||||
@@ -0,0 +1,236 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
from ..exceptions import (
|
||||
reCaptchaServiceUnavailable,
|
||||
reCaptchaAPIError,
|
||||
reCaptchaTimeout,
|
||||
reCaptchaParameter,
|
||||
reCaptchaBadJobID,
|
||||
reCaptchaReportError
|
||||
)
|
||||
|
||||
try:
|
||||
import polling
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the python module 'polling' via pip or download it from "
|
||||
"https://github.com/justiniso/polling/"
|
||||
)
|
||||
|
||||
from . import reCaptcha
|
||||
|
||||
|
||||
class captchaSolver(reCaptcha):
    """reCaptcha solver that talks to the 2captcha HTTP endpoints ``in.php`` and ``res.php``."""

    def __init__(self):
        super(captchaSolver, self).__init__('2captcha')
        self.host = 'https://2captcha.com'
        self.session = requests.Session()

    # ------------------------------------------------------------------------------- #

    @staticmethod
    def checkErrorStatus(response, request_type):
        """Raise when ``response`` carries a server failure or a known API error code.

        Args:
            response: the HTTP response returned by the session.
            request_type: ``'in.php'`` or ``'res.php'``; selects the error table.

        Raises:
            reCaptchaServiceUnavailable: on HTTP status 500 or 502.
            reCaptchaAPIError: when the JSON body has ``status`` 0 and its
                ``request`` value is one of the error codes listed for
                ``request_type``.
        """
        if response.status_code in [500, 502]:
            raise reCaptchaServiceUnavailable('2Captcha: Server Side Error {}'.format(response.status_code))

        errors = {
            'in.php': {
                "ERROR_WRONG_USER_KEY": "You've provided api_key parameter value is in incorrect format, it should contain 32 symbols.",
                "ERROR_KEY_DOES_NOT_EXIST": "The api_key you've provided does not exists.",
                "ERROR_ZERO_BALANCE": "You don't have sufficient funds on your account.",
                "ERROR_PAGEURL": "pageurl parameter is missing in your request.",
                "ERROR_NO_SLOT_AVAILABLE":
                    "No Slots Available.\nYou can receive this error in two cases:\n"
                    "1. If you solve ReCaptcha: the queue of your captchas that are not distributed to workers is too long. "
                    "Queue limit changes dynamically and depends on total amount of captchas awaiting solution and usually it's between 50 and 100 captchas.\n"
                    "2. If you solve Normal Captcha: your maximum rate for normal captchas is lower than current rate on the server."
                    "You can change your maximum rate in your account's settings.",
                "ERROR_IP_NOT_ALLOWED": "The request is sent from the IP that is not on the list of your allowed IPs.",
                "IP_BANNED": "Your IP address is banned due to many frequent attempts to access the server using wrong authorization keys.",
                "ERROR_BAD_TOKEN_OR_PAGEURL":
                    "You can get this error code when sending ReCaptcha V2. "
                    "That happens if your request contains invalid pair of googlekey and pageurl. "
                    "The common reason for that is that ReCaptcha is loaded inside an iframe hosted on another domain/subdomain.",
                "ERROR_GOOGLEKEY":
                    "You can get this error code when sending ReCaptcha V2. "
                    "That means that sitekey value provided in your request is incorrect: it's blank or malformed.",
                "MAX_USER_TURN": "You made more than 60 requests within 3 seconds.Your account is banned for 10 seconds. Ban will be lifted automatically."
            },
            'res.php': {
                "ERROR_CAPTCHA_UNSOLVABLE":
                    "We are unable to solve your captcha - three of our workers were unable solve it "
                    "or we didn't get an answer within 90 seconds (300 seconds for ReCaptcha V2). "
                    "We will not charge you for that request.",
                "ERROR_WRONG_USER_KEY": "You've provided api_key parameter value in incorrect format, it should contain 32 symbols.",
                "ERROR_KEY_DOES_NOT_EXIST": "The api_key you've provided does not exists.",
                "ERROR_WRONG_ID_FORMAT": "You've provided captcha ID in wrong format. The ID can contain numbers only.",
                "ERROR_WRONG_CAPTCHA_ID": "You've provided incorrect captcha ID.",
                "ERROR_BAD_DUPLICATES":
                    "Error is returned when 100% accuracy feature is enabled. "
                    "The error means that max numbers of tries is reached but min number of matches not found.",
                "REPORT_NOT_RECORDED": "Error is returned to your complain request if you already complained lots of correctly solved captchas.",
                "ERROR_IP_ADDRES":
                    "You can receive this error code when registering a pingback (callback) IP or domain."
                    "That happes if your request is coming from an IP address that doesn't match the IP address of your pingback IP or domain.",
                "ERROR_TOKEN_EXPIRED": "You can receive this error code when sending GeeTest. That error means that challenge value you provided is expired.",
                "ERROR_EMPTY_ACTION": "Action parameter is missing or no value is provided for action parameter."
            }
        }

        payload = response.json()
        error_code = payload.get('request')

        # With json=1 the API reports failure as the integer 0, which an
        # ``is False`` identity test never matches; ``== 0`` covers both the
        # integer and a boolean False.
        if payload.get('status') == 0 and error_code in errors.get(request_type):
            raise reCaptchaAPIError(
                '{} {}'.format(
                    error_code,
                    errors.get(request_type).get(error_code)
                )
            )

    # ------------------------------------------------------------------------------- #

    def reportJob(self, jobID):
        """Report ``jobID`` as badly solved by polling ``res.php`` with ``action=reportbad``.

        Returns:
            True once a response with ``status`` 1 is received.

        Raises:
            reCaptchaBadJobID: if ``jobID`` is falsy.
            reCaptchaReportError: if polling ends with a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "2Captcha: Error bad job id to request reCaptcha."
            )

        def _checkRequest(response):
            # Success ends the poll; anything else is checked for hard errors,
            # and returning None makes the poller try again.
            if response.ok and response.json().get('status') == 1:
                return response

            self.checkErrorStatus(response, 'res.php')

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/res.php'.format(self.host),
                params={
                    'key': self.api_key,
                    'action': 'reportbad',
                    'id': jobID,
                    'json': '1'
                }
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return True
        else:
            raise reCaptchaReportError(
                "2Captcha: Error - Failed to report bad reCaptcha solve."
            )

    # ------------------------------------------------------------------------------- #

    def requestJob(self, jobID):
        """Poll ``res.php`` with ``action=get`` until the solution for ``jobID`` is available.

        Returns:
            The ``request`` field of the successful JSON response.

        Raises:
            reCaptchaBadJobID: if ``jobID`` is falsy.
            reCaptchaTimeout: if polling ends with a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID("2Captcha: Error bad job id to request reCaptcha.")

        def _checkRequest(response):
            if response.ok and response.json().get('status') == 1:
                return response

            self.checkErrorStatus(response, 'res.php')

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/res.php'.format(self.host),
                params={
                    'key': self.api_key,
                    'action': 'get',
                    'id': jobID,
                    'json': '1'
                }
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return response.json().get('request')
        else:
            raise reCaptchaTimeout(
                "2Captcha: Error failed to solve reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestSolve(self, site_url, site_key):
        """Submit a ``userrecaptcha`` job to ``in.php`` for ``site_key`` on ``site_url``.

        Returns:
            The ``request`` field of the successful JSON response (used by the
            caller as the job id).

        Raises:
            reCaptchaBadJobID: if polling ends with a falsy response.
        """
        def _checkRequest(response):
            if response.ok and response.json().get("status") == 1 and response.json().get('request'):
                return response

            self.checkErrorStatus(response, 'in.php')

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/in.php'.format(self.host),
                data={
                    'key': self.api_key,
                    'method': 'userrecaptcha',
                    'googlekey': site_key,
                    'pageurl': site_url,
                    'json': '1',
                    'soft_id': '5507698'
                },
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=5,
            timeout=180
        )

        if response:
            return response.json().get('request')
        else:
            raise reCaptchaBadJobID(
                '2Captcha: Error no job id was returned.'
            )

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve the reCaptcha identified by ``site_key`` on ``site_url``.

        Args:
            site_url: page URL sent to the API as ``pageurl``.
            site_key: site key sent to the API as ``googlekey``.
            reCaptchaParams: dict that must contain ``api_key``; when ``proxy``
                is truthy, its ``proxies`` value is applied to the session.

        Returns:
            The value returned by ``requestJob`` for the submitted job.

        Raises:
            reCaptchaParameter: if ``api_key`` is missing or empty.
            reCaptchaTimeout: if submitting or fetching the job times out.
        """
        jobID = None

        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter(
                "2Captcha: Missing api_key parameter."
            )

        self.api_key = reCaptchaParams.get('api_key')

        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            # Only a job that was actually created can be reported as bad.
            try:
                if jobID:
                    self.reportJob(jobID)
            except polling.TimeoutException:
                raise reCaptchaTimeout(
                    "2Captcha: reCaptcha solve took to long and also failed reporting the job the job id {}.".format(jobID)
                )

            raise reCaptchaTimeout(
                "2Captcha: reCaptcha solve took to long to execute job id {}, aborting.".format(jobID)
            )


# ------------------------------------------------------------------------------- #

# Instantiated once at import time; the base __init__ stores the instance in
# ``captchaSolvers`` under the name passed above.
captchaSolver()
|
||||
@@ -0,0 +1,207 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import re
|
||||
import requests
|
||||
|
||||
try:
|
||||
import polling
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the python module 'polling' via pip or download it from "
|
||||
"https://github.com/justiniso/polling/"
|
||||
)
|
||||
|
||||
from ..exceptions import (
|
||||
reCaptchaServiceUnavailable,
|
||||
reCaptchaAPIError,
|
||||
reCaptchaTimeout,
|
||||
reCaptchaParameter,
|
||||
reCaptchaBadJobID
|
||||
)
|
||||
|
||||
from . import reCaptcha
|
||||
|
||||
|
||||
class captchaSolver(reCaptcha):
    """reCaptcha solver that talks to the 9kw.eu ``index.cgi`` endpoint."""

    def __init__(self):
        super(captchaSolver, self).__init__('9kw')
        self.host = 'https://www.9kw.eu/index.cgi'
        # Sent to the API as ``maxtimeout`` and, plus 10, used as the polling timeout.
        self.maxtimeout = 180
        self.session = requests.Session()

    # ------------------------------------------------------------------------------- #

    @staticmethod
    def checkErrorStatus(response):
        """Raise when ``response`` carries a server failure or a 9kw error code.

        JSON bodies are checked for an ``error`` field; any other body is
        checked for a leading ``00NN`` numeric code.

        Raises:
            reCaptchaServiceUnavailable: on HTTP status 500 or 502.
            reCaptchaAPIError: when a non-zero error code is found.
        """
        if response.status_code in [500, 502]:
            raise reCaptchaServiceUnavailable(
                '9kw: Server Side Error {}'.format(response.status_code)
            )

        error_codes = {
            1: 'No API Key available.',
            2: 'No API key found.',
            3: 'No active API key found.',
            4: 'API Key has been disabled by the operator. ',
            5: 'No user found.',
            6: 'No data found.',
            7: 'Found No ID.',
            8: 'found No captcha.',
            9: 'No image found.',
            10: 'Image size not allowed.',
            11: 'credit is not sufficient.',
            12: 'what was done.',
            13: 'No answer contain.',
            14: 'Captcha already been answered.',
            15: 'Captcha to quickly filed.',
            16: 'JD check active.',
            17: 'Unknown problem.',
            18: 'Found No ID.',
            19: 'Incorrect answer.',
            20: 'Do not timely filed (Incorrect UserID).',
            21: 'Link not allowed.',
            22: 'Prohibited submit.',
            23: 'Entering prohibited.',
            24: 'Too little credit.',
            25: 'No entry found.',
            26: 'No Conditions accepted.',
            27: 'No coupon code found in the database.',
            28: 'Already unused voucher code.',
            29: 'maxTimeout under 60 seconds.',
            30: 'User not found.',
            31: 'An account is not yet 24 hours in system.',
            32: 'An account does not have the full rights.',
            33: 'Plugin needed a update.',
            34: 'No HTTPS allowed.',
            35: 'No HTTP allowed.',
            36: 'Source not allowed.',
            37: 'Transfer denied.',
            38: 'Incorrect answer without space',
            39: 'Incorrect answer with space',
            40: 'Incorrect answer with not only numbers',
            41: 'Incorrect answer with not only A-Z, a-z',
            42: 'Incorrect answer with not only 0-9, A-Z, a-z',
            43: 'Incorrect answer with not only [0-9,- ]',
            44: 'Incorrect answer with not only [0-9A-Za-z,- ]',
            45: 'Incorrect answer with not only coordinates',
            46: 'Incorrect answer with not only multiple coordinates',
            47: 'Incorrect answer with not only data',
            48: 'Incorrect answer with not only rotate number',
            49: 'Incorrect answer with not only text',
            50: 'Incorrect answer with not only text and too short',
            51: 'Incorrect answer with not enough chars',
            52: 'Incorrect answer with too many chars',
            53: 'Incorrect answer without no or yes',
            54: 'Assignment was not found.'
        }

        if response.text.startswith('{'):
            if response.json().get('error'):
                raise reCaptchaAPIError(error_codes.get(int(response.json().get('error'))))
        else:
            match = re.search(r'^00(?P<error_code>\d+)', response.text)
            # re.search returns None for a body without a leading 00NN code;
            # treat that as "no error code" instead of failing on None.
            error_code = int(match.group('error_code')) if match else 0
            if error_code:
                raise reCaptchaAPIError(error_codes.get(error_code))

    # ------------------------------------------------------------------------------- #

    def requestJob(self, jobID):
        """Poll the ``usercaptchacorrectdata`` action until an answer for ``jobID`` exists.

        Returns:
            The ``answer`` field of the successful JSON response.

        Raises:
            reCaptchaBadJobID: if ``jobID`` is falsy.
            reCaptchaTimeout: if polling ends with a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "9kw: Error bad job id to request reCaptcha against."
            )

        def _checkRequest(response):
            # 'NO DATA' is what this check treats as "not solved yet".
            if response.ok and response.json().get('answer') != 'NO DATA':
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.get(
                self.host,
                params={
                    'apikey': self.api_key,
                    'action': 'usercaptchacorrectdata',
                    'id': jobID,
                    'info': 1,
                    'json': 1
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=(self.maxtimeout + 10)
        )

        if response:
            return response.json().get('answer')
        else:
            raise reCaptchaTimeout("9kw: Error failed to solve reCaptcha.")

    # ------------------------------------------------------------------------------- #

    def requestSolve(self, site_url, site_key):
        """Submit a ``usercaptchaupload`` job for ``site_key`` on ``site_url``.

        Returns:
            The ``captchaid`` field of the successful JSON response.

        Raises:
            reCaptchaBadJobID: if polling ends with a falsy response.
        """
        def _checkRequest(response):
            if response.ok and response.text.startswith('{') and response.json().get('captchaid'):
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                self.host,
                data={
                    'apikey': self.api_key,
                    'action': 'usercaptchaupload',
                    'interactive': 1,
                    'file-upload-01': site_key,
                    'oldsource': 'recaptchav2',
                    'pageurl': site_url,
                    'maxtimeout': self.maxtimeout,
                    'json': 1
                },
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=5,
            timeout=(self.maxtimeout + 10)
        )

        if response:
            return response.json().get('captchaid')
        else:
            raise reCaptchaBadJobID('9kw: Error no valid job id was returned.')

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve the reCaptcha identified by ``site_key`` on ``site_url``.

        Args:
            site_url: page URL sent to the API as ``pageurl``.
            site_key: site key sent to the API as ``file-upload-01``.
            reCaptchaParams: dict that must contain ``api_key``; an optional
                ``maxtimeout`` overrides the default, and when ``proxy`` is
                truthy its ``proxies`` value is applied to the session.

        Returns:
            The value returned by ``requestJob`` for the submitted job.

        Raises:
            reCaptchaParameter: if ``api_key`` is missing or empty.
            reCaptchaTimeout: if submitting or fetching the job times out.
        """
        jobID = None

        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("9kw: Missing api_key parameter.")

        self.api_key = reCaptchaParams.get('api_key')

        if reCaptchaParams.get('maxtimeout'):
            self.maxtimeout = reCaptchaParams.get('maxtimeout')

        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            raise reCaptchaTimeout(
                "9kw: reCaptcha solve took to long to execute 'captchaid' {}, aborting.".format(jobID)
            )


# ------------------------------------------------------------------------------- #

# Instantiated once at import time; the base __init__ stores the instance in
# ``captchaSolvers`` under the name passed above.
captchaSolver()
|
||||
@@ -0,0 +1,46 @@
|
||||
import abc
|
||||
import logging
|
||||
import sys
|
||||
|
||||
if sys.version_info >= (3, 4):
|
||||
ABC = abc.ABC # noqa
|
||||
else:
|
||||
ABC = abc.ABCMeta('ABC', (), {})
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
captchaSolvers = {}
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class reCaptcha(ABC):
    """Abstract base for anti-reCaptcha providers.

    Each instance stores itself in the module-level ``captchaSolvers``
    dict under the name given to ``__init__``.
    """

    @abc.abstractmethod
    def __init__(self, name):
        captchaSolvers[name] = self

    # ------------------------------------------------------------------------------- #

    @classmethod
    def dynamicImport(cls, name):
        """Return the provider registered as ``name``, importing its submodule if needed.

        Raises:
            ImportError: if the submodule cannot be imported, or if importing
                it did not leave a ``reCaptcha`` instance registered under
                ``name``. The failure is logged before being re-raised.
        """
        if name in captchaSolvers:
            return captchaSolvers[name]

        module_path = '{}.{}'.format(cls.__module__, name)

        try:
            __import__(module_path)

            provider = captchaSolvers.get(name)
            if not isinstance(provider, reCaptcha):
                raise ImportError('The anti reCaptcha provider was not initialized.')
        except ImportError:
            logging.error("Unable to load {} anti reCaptcha provider".format(name))
            raise

        return captchaSolvers[name]

    # ------------------------------------------------------------------------------- #

    @abc.abstractmethod
    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Provider hook: produce the answer for ``site_key`` on ``site_url``."""
        pass

    # ------------------------------------------------------------------------------- #

    def solveCaptcha(self, site_url, site_key, reCaptchaParams):
        """Delegate to the provider's ``getCaptchaAnswer`` and return its result."""
        answer = self.getCaptchaAnswer(site_url, site_key, reCaptchaParams)
        return answer
|
||||
@@ -0,0 +1,49 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from ..exceptions import reCaptchaParameter
|
||||
|
||||
try:
|
||||
from python_anticaptcha import (
|
||||
AnticaptchaClient,
|
||||
NoCaptchaTaskProxylessTask
|
||||
)
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the python module 'python_anticaptcha' via pip or download it from "
|
||||
"https://github.com/ad-m/python-anticaptcha"
|
||||
)
|
||||
|
||||
from . import reCaptcha
|
||||
|
||||
|
||||
class captchaSolver(reCaptcha):
    """reCaptcha solver that delegates to the ``python_anticaptcha`` client."""

    def __init__(self):
        super(captchaSolver, self).__init__('anticaptcha')

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Create a proxyless NoCaptcha task and return its solution response.

        Args:
            site_url: page URL passed to the task.
            site_key: site key passed to the task.
            reCaptchaParams: dict that must contain ``api_key``; when ``proxy``
                is truthy, its ``proxies`` value is applied to the client session.

        Raises:
            reCaptchaParameter: if ``api_key`` is missing or empty.
            NotImplementedError: if the installed client has no ``createTaskSmee``.
        """
        api_key = reCaptchaParams.get('api_key')
        if not api_key:
            raise reCaptchaParameter("anticaptcha: Missing api_key parameter.")

        client = AnticaptchaClient(api_key)

        if reCaptchaParams.get('proxy'):
            client.session.proxies = reCaptchaParams.get('proxies')

        task = NoCaptchaTaskProxylessTask(site_url, site_key)

        # Installed clients that lack createTaskSmee are rejected.
        if hasattr(client, 'createTaskSmee'):
            return client.createTaskSmee(task).get_solution_response()

        raise NotImplementedError(
            "Please upgrade 'python_anticaptcha' via pip or download it from "
            "https://github.com/ad-m/python-anticaptcha"
        )


# ------------------------------------------------------------------------------- #

# Instantiated once at import time; the base __init__ stores the instance in
# ``captchaSolvers`` under the name passed above.
captchaSolver()
|
||||
@@ -0,0 +1,227 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import json
|
||||
import requests
|
||||
|
||||
try:
|
||||
import polling
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"Please install the python module 'polling' via pip or download it from "
|
||||
"https://github.com/justiniso/polling/"
|
||||
)
|
||||
|
||||
from ..exceptions import (
|
||||
reCaptchaServiceUnavailable,
|
||||
reCaptchaAccountError,
|
||||
reCaptchaTimeout,
|
||||
reCaptchaParameter,
|
||||
reCaptchaBadJobID,
|
||||
reCaptchaReportError
|
||||
)
|
||||
|
||||
from . import reCaptcha
|
||||
|
||||
|
||||
class captchaSolver(reCaptcha):
    """reCaptcha solver that talks to the DeathByCaptcha HTTP API."""

    def __init__(self):
        super(captchaSolver, self).__init__('deathbycaptcha')
        # NOTE(review): username/password are posted to this plain-http
        # endpoint; confirm whether an https endpoint can be used instead.
        self.host = 'http://api.dbcapi.me/api'
        self.session = requests.Session()

    # ------------------------------------------------------------------------------- #

    @staticmethod
    def checkErrorStatus(response):
        """Raise ``reCaptchaServiceUnavailable`` for HTTP status 400, 403 or 503."""
        errors = dict(
            [
                (400, "DeathByCaptcha: 400 Bad Request"),
                (403, "DeathByCaptcha: 403 Forbidden - Invalid credentails or insufficient credits."),
                # (500, "DeathByCaptcha: 500 Internal Server Error."),
                (503, "DeathByCaptcha: 503 Service Temporarily Unavailable.")
            ]
        )

        if response.status_code in errors:
            raise reCaptchaServiceUnavailable(errors.get(response.status_code))

    # ------------------------------------------------------------------------------- #

    def login(self, username, password):
        """Store the credentials and poll the ``/user`` endpoint to validate the account.

        Raises:
            reCaptchaAccountError: if the account is banned or its balance is 0.
            reCaptchaServiceUnavailable: via ``checkErrorStatus`` on HTTP 400/403/503.
        """
        self.username = username
        self.password = password

        def _checkRequest(response):
            if response.ok:
                account = response.json()

                if account.get('is_banned'):
                    raise reCaptchaAccountError('DeathByCaptcha: Your account is banned.')

                # The key was previously misspelled 'balanace', so this check
                # could never fire.
                if account.get('balance') == 0:
                    raise reCaptchaAccountError('DeathByCaptcha: insufficient credits.')

                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/user'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=120
        )

        # Neither this class nor the visible base class defines debugRequest;
        # call it only when something provides it, so a successful login does
        # not end in AttributeError.
        debug_request = getattr(self, 'debugRequest', None)
        if callable(debug_request):
            debug_request(response)

    # ------------------------------------------------------------------------------- #

    def reportJob(self, jobID):
        """Report ``jobID`` as badly solved via ``/captcha/<jobID>/report``.

        Returns:
            True once a response with HTTP status 200 is received.

        Raises:
            reCaptchaBadJobID: if ``jobID`` is falsy.
            reCaptchaReportError: if polling ends with a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "DeathByCaptcha: Error bad job id to report failed reCaptcha."
            )

        def _checkRequest(response):
            if response.status_code == 200:
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha/{}/report'.format(self.host, jobID),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password
                }
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return True
        else:
            raise reCaptchaReportError(
                "DeathByCaptcha: Error report failed reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestJob(self, jobID):
        """Poll ``/captcha/<jobID>`` until the response carries a non-empty ``text``.

        Returns:
            The ``text`` field of the successful JSON response.

        Raises:
            reCaptchaBadJobID: if ``jobID`` is falsy.
            reCaptchaTimeout: if polling ends with a falsy response.
        """
        if not jobID:
            raise reCaptchaBadJobID(
                "DeathByCaptcha: Error bad job id to request reCaptcha."
            )

        def _checkRequest(response):
            if response.ok and response.json().get('text'):
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.get(
                '{}/captcha/{}'.format(self.host, jobID),
                headers={'Accept': 'application/json'}
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('text')
        else:
            raise reCaptchaTimeout(
                "DeathByCaptcha: Error failed to solve reCaptcha."
            )

    # ------------------------------------------------------------------------------- #

    def requestSolve(self, site_url, site_key):
        """Submit a type-4 token job to ``/captcha`` for ``site_key`` on ``site_url``.

        Returns:
            The ``captcha`` field of the successful JSON response (used by the
            caller as the job id).

        Raises:
            reCaptchaBadJobID: if polling ends with a falsy response.
        """
        def _checkRequest(response):
            if response.ok and response.json().get("is_correct") and response.json().get('captcha'):
                return response

            self.checkErrorStatus(response)

            return None

        response = polling.poll(
            lambda: self.session.post(
                '{}/captcha'.format(self.host),
                headers={'Accept': 'application/json'},
                data={
                    'username': self.username,
                    'password': self.password,
                    'type': '4',
                    'token_params': json.dumps({
                        'googlekey': site_key,
                        'pageurl': site_url
                    })
                },
                allow_redirects=False
            ),
            check_success=_checkRequest,
            step=10,
            timeout=180
        )

        if response:
            return response.json().get('captcha')
        else:
            raise reCaptchaBadJobID(
                'DeathByCaptcha: Error no job id was returned.'
            )

    # ------------------------------------------------------------------------------- #

    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        """Solve the reCaptcha identified by ``site_key`` on ``site_url``.

        Args:
            site_url: page URL sent to the API inside ``token_params``.
            site_key: site key sent to the API inside ``token_params``.
            reCaptchaParams: dict that must contain ``username`` and
                ``password``; when ``proxy`` is truthy, its ``proxies`` value
                is applied to the session.

        Returns:
            The value returned by ``requestJob`` for the submitted job.

        Raises:
            reCaptchaParameter: if ``username`` or ``password`` is missing or empty.
            reCaptchaTimeout: if submitting or fetching the job times out.
        """
        jobID = None

        for param in ['username', 'password']:
            if not reCaptchaParams.get(param):
                raise reCaptchaParameter(
                    "DeathByCaptcha: Missing '{}' parameter.".format(param)
                )
            setattr(self, param, reCaptchaParams.get(param))

        if reCaptchaParams.get('proxy'):
            self.session.proxies = reCaptchaParams.get('proxies')

        try:
            jobID = self.requestSolve(site_url, site_key)
            return self.requestJob(jobID)
        except polling.TimeoutException:
            # Only a job that was actually created can be reported as bad.
            try:
                if jobID:
                    self.reportJob(jobID)
            except polling.TimeoutException:
                raise reCaptchaTimeout(
                    "DeathByCaptcha: reCaptcha solve took to long and also failed reporting the job id {}.".format(jobID)
                )

            raise reCaptchaTimeout(
                "DeathByCaptcha: reCaptcha solve took to long to execute job id {}, aborting.".format(jobID)
            )


# ------------------------------------------------------------------------------- #

# Instantiated once at import time; the base __init__ stores the instance in
# ``captchaSolvers`` under the name passed above.
captchaSolver()
|
||||
@@ -1,40 +1,117 @@
|
||||
import os
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import ssl
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
|
||||
class User_Agent():
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.headers = None
|
||||
self.cipherSuite = []
|
||||
self.loadUserAgent(*args, **kwargs)
|
||||
|
||||
##########################################################################################################################################################
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def loadHeaders(self, user_agents, user_agent_version):
    """Set ``self.headers`` for the current browser and the given release.

    Uses the release's own ``headers`` entry when it is truthy, otherwise
    the browser's ``default_headers``.
    """
    browser_entry = user_agents.get(self.browser)
    release_headers = browser_entry.get('releases').get(user_agent_version).get('headers')

    if not release_headers:
        release_headers = user_agents.get(self.browser).get('default_headers')

    self.headers = release_headers
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def filterAgents(self, releases):
    """Collect, per release, the User-Agent strings for the enabled platforms.

    Returns a dict mapping release -> list of User-Agent strings (mobile
    entries first, then desktop). Releases with nothing to offer for the
    enabled platforms are left out.
    """
    selected = {}

    for release, entry in releases.items():
        for platform, enabled in (('mobile', self.mobile), ('desktop', self.desktop)):
            # The platform list is only looked up when that platform is enabled.
            if enabled and entry['User-Agent'][platform]:
                selected[release] = selected.get(release, []) + entry['User-Agent'][platform]

    return selected
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def tryMatchCustom(self, user_agents):
    """Look for ``self.custom`` inside the known User-Agent strings.

    On the first browser/release/platform whose joined User-Agent list
    contains the custom string, adopt that browser, load that release's
    headers, overwrite the ``User-Agent`` header with the custom string,
    take the browser's cipher suite and return True. Return False when
    nothing matches.
    """
    for browser_name in user_agents:
        releases = user_agents[browser_name]['releases']

        for release in releases:
            for platform in ('mobile', 'desktop'):
                haystack = ' '.join(releases[release]['User-Agent'][platform])

                if not re.search(re.escape(self.custom), haystack):
                    continue

                self.browser = browser_name
                self.loadHeaders(user_agents, release)
                self.headers['User-Agent'] = self.custom
                self.cipherSuite = user_agents[self.browser].get('cipherSuite', [])
                return True

    return False
|
||||
|
||||
# ------------------------------------------------------------------------------- #
|
||||
|
||||
def loadUserAgent(self, *args, **kwargs):
|
||||
browser = kwargs.pop('browser', 'chrome')
|
||||
self.browser = kwargs.pop('browser', None)
|
||||
|
||||
user_agents = json.load(
|
||||
open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r'),
|
||||
object_pairs_hook=OrderedDict
|
||||
)
|
||||
if isinstance(self.browser, dict):
|
||||
self.custom = self.browser.get('custom', None)
|
||||
self.desktop = self.browser.get('desktop', True)
|
||||
self.mobile = self.browser.get('mobile', True)
|
||||
self.browser = self.browser.get('browser', None)
|
||||
else:
|
||||
self.custom = kwargs.pop('custom', None)
|
||||
self.desktop = kwargs.pop('desktop', True)
|
||||
self.mobile = kwargs.pop('mobile', True)
|
||||
|
||||
if not user_agents.get(browser):
|
||||
logging.error('Sorry "{}" browser User-Agent was not found.'.format(browser))
|
||||
raise
|
||||
if not self.desktop and not self.mobile:
|
||||
sys.tracebacklimit = 0
|
||||
raise RuntimeError("Sorry you can't have mobile and desktop disabled at the same time.")
|
||||
|
||||
user_agent = random.choice(user_agents.get(browser))
|
||||
with open(os.path.join(os.path.dirname(__file__), 'browsers.json'), 'r') as fp:
|
||||
user_agents = json.load(
|
||||
fp,
|
||||
object_pairs_hook=OrderedDict
|
||||
)
|
||||
|
||||
self.headers = user_agent.get('headers')
|
||||
self.headers['User-Agent'] = random.choice(user_agent.get('User-Agent'))
|
||||
if self.custom:
|
||||
if not self.tryMatchCustom(user_agents):
|
||||
self.cipherSuite = [
|
||||
ssl._DEFAULT_CIPHERS,
|
||||
'!AES128-SHA',
|
||||
'!ECDHE-RSA-AES256-SHA',
|
||||
]
|
||||
self.headers = OrderedDict([
|
||||
('User-Agent', self.custom),
|
||||
('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'),
|
||||
('Accept-Language', 'en-US,en;q=0.9'),
|
||||
('Accept-Encoding', 'gzip, deflate, br')
|
||||
])
|
||||
else:
|
||||
if self.browser and not user_agents.get(self.browser):
|
||||
sys.tracebacklimit = 0
|
||||
raise RuntimeError('Sorry "{}" browser User-Agent was not found.'.format(self.browser))
|
||||
|
||||
if not kwargs.get('allow_brotli', False):
|
||||
if 'br' in self.headers['Accept-Encoding']:
|
||||
self.headers['Accept-Encoding'] = ','.join([encoding for encoding in self.headers['Accept-Encoding'].split(',') if encoding.strip() != 'br']).strip()
|
||||
if not self.browser:
|
||||
self.browser = random.SystemRandom().choice(list(user_agents))
|
||||
|
||||
self.cipherSuite = user_agents.get(self.browser).get('cipherSuite', [])
|
||||
|
||||
filteredAgents = self.filterAgents(user_agents.get(self.browser).get('releases'))
|
||||
|
||||
user_agent_version = random.SystemRandom().choice(list(filteredAgents))
|
||||
|
||||
self.loadHeaders(user_agents, user_agent_version)
|
||||
|
||||
self.headers['User-Agent'] = random.SystemRandom().choice(filteredAgents[user_agent_version])
|
||||
|
||||
if not kwargs.get('allow_brotli', False) and 'br' in self.headers['Accept-Encoding']:
|
||||
self.headers['Accept-Encoding'] = ','.join([
|
||||
encoding for encoding in self.headers['Accept-Encoding'].split(',') if encoding.strip() != 'br'
|
||||
]).strip()
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -43,6 +43,8 @@ python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.get
|
||||
# subscenter:list
|
||||
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subliminal_patch.core import SZProviderPool; from babelfish import Language; from subliminal.core import scan_video; print SZProviderPool(providers=['subscenter'], )['subscenter'].list_subtitles(scan_video('FULL_PATH'), languages=[Language('heb')])"
|
||||
|
||||
# subscene:list
|
||||
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subliminal_patch.core import SZProviderPool; from subzero.language import Language; from subzero.video import parse_video; SZProviderPool(providers=['subscene'], provider_configs={'subscene': {'username': 'USERNAME', 'password': 'PASSWORD'}})['subscene'].list_subtitles(parse_video('FILENAME', {}, {'type': 'episode'}, dry_run=True), languages=[Language('eng')])"
|
||||
|
||||
# refining
|
||||
python -c "import logging; logging.basicConfig(level=logging.DEBUG); logging.getLogger('rebulk').setLevel(logging.WARNING); import os; os.environ['U1pfT01EQl9LRVk'] = '789CF30DAC2C8B0AF433F5C9AD34290A712DF30D7135F12D0FB3E502006FDE081E'; import subliminal_patch, subliminal; subliminal.region.configure('dogpile.cache.memory'); from subzero.video import parse_video, refine_video; video = parse_video('FILE_NAME', {'type': 'episode'}, dry_run=True); print refine_video(video)"
|
||||
|
||||
@@ -12,3 +12,6 @@ class UnknownFormatIdentifierError(Pysubs2Error):
|
||||
|
||||
class FormatAutodetectionError(Pysubs2Error):
|
||||
"""Subtitle format is ambiguous or unknown."""
|
||||
|
||||
class ContentNotUsable(Pysubs2Error):
|
||||
"""Current content not usable for specified format"""
|
||||
|
||||
@@ -41,6 +41,7 @@ class SSAStyle(object):
|
||||
self.italic = False #: Italic
|
||||
self.underline = False #: Underline (ASS only)
|
||||
self.strikeout = False #: Strikeout (ASS only)
|
||||
self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags
|
||||
self.scalex = 100.0 #: Horizontal scaling (ASS only)
|
||||
self.scaley = 100.0 #: Vertical scaling (ASS only)
|
||||
self.spacing = 0.0 #: Letter spacing (ASS only)
|
||||
|
||||
@@ -5,6 +5,7 @@ from .formatbase import FormatBase
|
||||
from .ssaevent import SSAEvent
|
||||
from .ssastyle import SSAStyle
|
||||
from .substation import parse_tags
|
||||
from .exceptions import ContentNotUsable
|
||||
from .time import ms_to_times, make_time, TIMESTAMP, timestamp_to_ms
|
||||
|
||||
#: Largest timestamp allowed in SubRip, ie. 99:59:59,999.
|
||||
@@ -81,6 +82,7 @@ class SubripFormat(FormatBase):
|
||||
if sty.italic: fragment = "<i>%s</i>" % fragment
|
||||
if sty.underline: fragment = "<u>%s</u>" % fragment
|
||||
if sty.strikeout: fragment = "<s>%s</s>" % fragment
|
||||
if sty.drawing: raise ContentNotUsable
|
||||
body.append(fragment)
|
||||
|
||||
return re.sub("\n+", "\n", "".join(body).strip())
|
||||
@@ -90,7 +92,10 @@ class SubripFormat(FormatBase):
|
||||
for i, line in enumerate(visible_lines, 1):
|
||||
start = ms_to_timestamp(line.start)
|
||||
end = ms_to_timestamp(line.end)
|
||||
text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
|
||||
try:
|
||||
text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
|
||||
except ContentNotUsable:
|
||||
continue
|
||||
|
||||
print("%d" % i, file=fp) # Python 2.7 compat
|
||||
print(start, "-->", end, file=fp)
|
||||
|
||||
@@ -110,7 +110,7 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
|
||||
|
||||
def apply_overrides(all_overrides):
|
||||
s = style.copy()
|
||||
for tag in re.findall(r"\\[ibus][10]|\\r[a-zA-Z_0-9 ]*", all_overrides):
|
||||
for tag in re.findall(r"\\[ibusp][0-9]|\\r[a-zA-Z_0-9 ]*", all_overrides):
|
||||
if tag == r"\r":
|
||||
s = style.copy() # reset to original line style
|
||||
elif tag.startswith(r"\r"):
|
||||
@@ -122,6 +122,13 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
|
||||
elif "b" in tag: s.bold = "1" in tag
|
||||
elif "u" in tag: s.underline = "1" in tag
|
||||
elif "s" in tag: s.strikeout = "1" in tag
|
||||
elif "p" in tag:
|
||||
try:
|
||||
scale = int(tag[2:])
|
||||
except (ValueError, IndexError):
|
||||
continue
|
||||
|
||||
s.drawing = scale > 0
|
||||
return s
|
||||
|
||||
overrides = SSAEvent.OVERRIDE_SEQUENCE.findall(text)
|
||||
|
||||
@@ -27,3 +27,8 @@ class ServiceUnavailable(ProviderError):
|
||||
class DownloadLimitExceeded(ProviderError):
|
||||
"""Exception raised by providers when download limit is exceeded."""
|
||||
pass
|
||||
|
||||
|
||||
class DownloadLimitPerDayExceeded(ProviderError):
    """Exception raised by providers when the per-day download limit is exceeded."""
|
||||
|
||||
@@ -264,7 +264,7 @@ class SZProviderPool(ProviderPool):
|
||||
requests.exceptions.SSLError,
|
||||
requests.Timeout,
|
||||
socket.timeout):
|
||||
logger.error('Provider %r connection error', subtitle.provider_name)
|
||||
logger.exception('Provider %r connection error', subtitle.provider_name)
|
||||
|
||||
except ResponseNotReady:
|
||||
logger.error('Provider %r response error, reinitializing', subtitle.provider_name)
|
||||
|
||||
@@ -20,7 +20,7 @@ from exceptions import APIThrottled
|
||||
from dogpile.cache.api import NO_VALUE
|
||||
from subliminal.cache import region
|
||||
from subliminal_patch.pitcher import pitchers
|
||||
from cloudscraper import CloudScraper
|
||||
from cloudscraper import CloudScraper, User_Agent
|
||||
|
||||
try:
|
||||
import brotli
|
||||
@@ -89,7 +89,9 @@ class CFSession(CloudScraper):
|
||||
|
||||
# Check if Cloudflare anti-bot is on
|
||||
try:
|
||||
if self.isChallengeRequest(resp):
|
||||
print repr(resp)
|
||||
if self.is_IUAM_Challenge(resp):
|
||||
print "TRYYYYYYYYYY"
|
||||
if resp.request.method != 'GET':
|
||||
# Work around if the initial request is not a GET,
|
||||
# Supersede with a GET then re-request the original METHOD.
|
||||
@@ -97,9 +99,10 @@ class CFSession(CloudScraper):
|
||||
resp = ourSuper.request(method, url, *args, **kwargs)
|
||||
else:
|
||||
# Solve Challenge
|
||||
resp = self.sendChallengeResponse(resp, **kwargs)
|
||||
resp = self.Challenge_Response(resp, **kwargs)
|
||||
|
||||
except ValueError, e:
|
||||
print "YEEEEEEEEEEEEEE"
|
||||
if e.message == "Captcha":
|
||||
parsed_url = urlparse(url)
|
||||
domain = parsed_url.netloc
|
||||
@@ -241,12 +244,20 @@ class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
|
||||
# change our user agent to reflect Requests
|
||||
user_agent = "Python XMLRPC with Requests (python-requests.org)"
|
||||
proxies = None
|
||||
xm_ver = 1
|
||||
session_var = "PHPSESSID"
|
||||
|
||||
def __init__(self, use_https=True, verify=None, user_agent=None, timeout=10, *args, **kwargs):
|
||||
self.verify = pem_file if verify is None else verify
|
||||
self.use_https = use_https
|
||||
self.user_agent = user_agent if user_agent is not None else self.user_agent
|
||||
self.timeout = timeout
|
||||
self.session = requests.Session()
|
||||
self.session.headers['User-Agent'] = self.user_agent
|
||||
# if 'requests' in self.session.headers['User-Agent']:
|
||||
# # Set a random User-Agent if no custom User-Agent has been set
|
||||
# self.session.headers = User_Agent(allow_brotli=False).headers
|
||||
|
||||
proxy = os.environ.get('SZ_HTTP_PROXY')
|
||||
if proxy:
|
||||
self.proxies = {
|
||||
@@ -260,18 +271,40 @@ class SubZeroRequestsTransport(xmlrpclib.SafeTransport):
|
||||
"""
|
||||
Make an xmlrpc request.
|
||||
"""
|
||||
headers = {'User-Agent': self.user_agent}
|
||||
url = self._build_url(host, handler)
|
||||
cache_key = "xm%s_%s" % (self.xm_ver, host)
|
||||
|
||||
old_sessvar = self.session.cookies.get(self.session_var, "")
|
||||
if not old_sessvar:
|
||||
data = region.get(cache_key)
|
||||
if data is not NO_VALUE:
|
||||
logger.debug("Trying to re-use headers/cookies for %s" % host)
|
||||
self.session.cookies, self.session.headers = data
|
||||
old_sessvar = self.session.cookies.get(self.session_var, "")
|
||||
|
||||
try:
|
||||
resp = requests.post(url, data=request_body, headers=headers,
|
||||
stream=True, timeout=self.timeout, proxies=self.proxies,
|
||||
verify=self.verify)
|
||||
resp = self.session.post(url, data=request_body,
|
||||
stream=True, timeout=self.timeout, proxies=self.proxies,
|
||||
verify=self.verify)
|
||||
|
||||
if self.session_var in resp.cookies and resp.cookies[self.session_var] != old_sessvar:
|
||||
logger.debug("Storing %s cookies" % host)
|
||||
region.set(cache_key, [self.session.cookies, self.session.headers])
|
||||
except ValueError:
|
||||
logger.debug("Wiping cookies/headers cache (VE) for %s" % host)
|
||||
region.delete(cache_key)
|
||||
raise
|
||||
except Exception:
|
||||
logger.debug("Wiping cookies/headers cache (EX) for %s" % host)
|
||||
region.delete(cache_key)
|
||||
raise # something went wrong
|
||||
else:
|
||||
resp.raise_for_status()
|
||||
try:
|
||||
resp.raise_for_status()
|
||||
except requests.exceptions.HTTPError:
|
||||
logger.debug("Wiping cookies/headers cache (RE) for %s" % host)
|
||||
region.delete(cache_key)
|
||||
raise
|
||||
|
||||
try:
|
||||
if 'x-ratelimit-remaining' in resp.headers and int(resp.headers['x-ratelimit-remaining']) <= 2:
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
import logging
|
||||
import re
|
||||
import datetime
|
||||
import types
|
||||
|
||||
import subliminal
|
||||
import time
|
||||
|
||||
@@ -10,7 +12,8 @@ from random import randint
|
||||
from dogpile.cache.api import NO_VALUE
|
||||
from requests import Session
|
||||
from subliminal.cache import region
|
||||
from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError, ConfigurationError
|
||||
from subliminal.exceptions import DownloadLimitExceeded, AuthenticationError, ConfigurationError, \
|
||||
DownloadLimitPerDayExceeded
|
||||
from subliminal.providers.addic7ed import Addic7edProvider as _Addic7edProvider, \
|
||||
Addic7edSubtitle as _Addic7edSubtitle, ParserBeautifulSoup
|
||||
from subliminal.subtitle import fix_line_ending
|
||||
@@ -64,6 +67,7 @@ class Addic7edProvider(_Addic7edProvider):
|
||||
'slk', 'slv', 'spa', 'sqi', 'srp', 'swe', 'tha', 'tur', 'ukr', 'vie', 'zho'
|
||||
]} | {Language.fromietf(l) for l in ["sr-Latn", "sr-Cyrl"]}
|
||||
|
||||
vip = False
|
||||
USE_ADDICTED_RANDOM_AGENTS = False
|
||||
hearing_impaired_verifiable = True
|
||||
subtitle_class = Addic7edSubtitle
|
||||
@@ -72,9 +76,10 @@ class Addic7edProvider(_Addic7edProvider):
|
||||
sanitize_characters = {'-', ':', '(', ')', '.', '/'}
|
||||
last_show_ids_fetch_key = "addic7ed_last_id_fetch"
|
||||
|
||||
def __init__(self, username=None, password=None, use_random_agents=False):
|
||||
def __init__(self, username=None, password=None, use_random_agents=False, is_vip=False):
|
||||
super(Addic7edProvider, self).__init__(username=username, password=password)
|
||||
self.USE_ADDICTED_RANDOM_AGENTS = use_random_agents
|
||||
self.vip = is_vip
|
||||
|
||||
if not all((username, password)):
|
||||
raise ConfigurationError('Username and password must be specified')
|
||||
@@ -397,6 +402,27 @@ class Addic7edProvider(_Addic7edProvider):
|
||||
return subtitles
|
||||
|
||||
def download_subtitle(self, subtitle):
|
||||
last_dls = region.get("addic7ed_dls")
|
||||
now = datetime.datetime.now()
|
||||
one_day = datetime.timedelta(hours=24)
|
||||
|
||||
def raise_limit():
|
||||
logger.info("Addic7ed: Downloads per day exceeded (%s)", cap)
|
||||
raise DownloadLimitPerDayExceeded
|
||||
|
||||
if not isinstance(last_dls, types.ListType):
|
||||
last_dls = []
|
||||
else:
|
||||
# filter all non-expired DLs
|
||||
last_dls = filter(lambda t: t + one_day > now, last_dls)
|
||||
region.set("addic7ed_dls", last_dls)
|
||||
|
||||
cap = self.vip and 80 or 40
|
||||
amount = len(last_dls)
|
||||
|
||||
if amount >= cap:
|
||||
raise_limit()
|
||||
|
||||
# download the subtitle
|
||||
r = self.session.get(self.server_url + subtitle.download_link, headers={'Referer': subtitle.page_link},
|
||||
timeout=10)
|
||||
@@ -408,7 +434,7 @@ class Addic7edProvider(_Addic7edProvider):
|
||||
if not r.content:
|
||||
# Provider wrongful return a status of 304 Not Modified with an empty content
|
||||
# raise_for_status won't raise exception for that status code
|
||||
logger.error('Unable to download subtitle. No data returned from provider')
|
||||
logger.error('Addic7ed: Unable to download subtitle. No data returned from provider')
|
||||
return
|
||||
|
||||
# detect download limit exceeded
|
||||
@@ -416,3 +442,10 @@ class Addic7edProvider(_Addic7edProvider):
|
||||
raise DownloadLimitExceeded
|
||||
|
||||
subtitle.content = fix_line_ending(r.content)
|
||||
last_dls.append(datetime.datetime.now())
|
||||
region.set("addic7ed_dls", last_dls)
|
||||
logger.info("Addic7ed: Used %s/%s downloads", amount + 1, cap)
|
||||
|
||||
if amount + 1 >= cap:
|
||||
raise_limit()
|
||||
|
||||
|
||||
@@ -105,7 +105,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
|
||||
|
||||
def __init__(self, username=None, password=None, use_tag_search=False, only_foreign=False, also_foreign=False,
|
||||
skip_wrong_fps=True, is_vip=False, use_ssl=True, timeout=15):
|
||||
if any((username, password)) and not all((username, password)):
|
||||
if not all((username, password)):
|
||||
raise ConfigurationError('Username and password must be specified')
|
||||
|
||||
self.username = username or ''
|
||||
@@ -154,6 +154,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
|
||||
logger.debug('Logged in with token %r', self.token[:10]+"X"*(len(self.token)-10))
|
||||
|
||||
region.set("os_token", self.token)
|
||||
time.sleep(1)
|
||||
|
||||
def use_token_or_login(self, func):
|
||||
if not self.token:
|
||||
@@ -162,6 +163,7 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
|
||||
try:
|
||||
return func()
|
||||
except Unauthorized:
|
||||
logger.debug("Token not valid, logging in again")
|
||||
self.log_in()
|
||||
return func()
|
||||
|
||||
@@ -197,16 +199,11 @@ class OpenSubtitlesProvider(ProviderRetryMixin, _OpenSubtitlesProvider):
|
||||
return
|
||||
|
||||
logger.error("Login failed, please check your credentials")
|
||||
raise
|
||||
|
||||
def terminate(self):
|
||||
if self.token:
|
||||
try:
|
||||
checked(lambda: self.server.LogOut(self.token))
|
||||
except:
|
||||
logger.error("Logout failed: %s", traceback.format_exc())
|
||||
|
||||
self.server = None
|
||||
self.token = None
|
||||
#self.token = None
|
||||
|
||||
def list_subtitles(self, video, languages):
|
||||
"""
|
||||
|
||||
@@ -278,6 +278,12 @@ class Subtitle(Subtitle_):
|
||||
|
||||
@classmethod
|
||||
def pysubs2_to_unicode(cls, sub, format="srt"):
|
||||
"""
|
||||
this is a modified version of pysubs2.SubripFormat.to_file with special handling for drawing tags in ASS
|
||||
:param sub:
|
||||
:param format:
|
||||
:return:
|
||||
"""
|
||||
def ms_to_timestamp(ms, mssep=","):
|
||||
"""Convert ms to 'HH:MM:SS,mmm'"""
|
||||
# XXX throw on overflow/underflow?
|
||||
@@ -289,9 +295,12 @@ class Subtitle(Subtitle_):
|
||||
def prepare_text(text, style):
|
||||
body = []
|
||||
for fragment, sty in parse_tags(text, style, sub.styles):
|
||||
fragment = fragment.replace(ur"\h", u" ")
|
||||
fragment = fragment.replace(ur"\n", u"\n")
|
||||
fragment = fragment.replace(ur"\N", u"\n")
|
||||
fragment = fragment.replace(r"\h", u" ")
|
||||
fragment = fragment.replace(r"\n", u"\n")
|
||||
fragment = fragment.replace(r"\N", u"\n")
|
||||
if sty.drawing:
|
||||
raise pysubs2.ContentNotUsable
|
||||
|
||||
if format == "srt":
|
||||
if sty.italic:
|
||||
fragment = u"<i>%s</i>" % fragment
|
||||
@@ -323,7 +332,10 @@ class Subtitle(Subtitle_):
|
||||
for i, line in enumerate(visible_lines, 1):
|
||||
start = ms_to_timestamp(line.start, mssep=mssep)
|
||||
end = ms_to_timestamp(line.end, mssep=mssep)
|
||||
text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
|
||||
try:
|
||||
text = prepare_text(line.text, sub.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
|
||||
except pysubs2.ContentNotUsable:
|
||||
continue
|
||||
|
||||
out.append(u"%d\n" % i)
|
||||
out.append(u"%s --> %s\n" % (start, end))
|
||||
|
||||
@@ -24,6 +24,7 @@ if debug:
|
||||
sub = Subtitle(Language.fromietf("eng"), mods=["common", "remove_HI", "OCR_fixes", "fix_uppercase", "shift_offset(ms=0,s=1)"])
|
||||
sub.content = open(fn).read()
|
||||
sub.normalize()
|
||||
sub.is_valid()
|
||||
content = sub.get_modified_content(debug=True)
|
||||
|
||||
#submod = SubMod(debug=debug)
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
# coding=utf-8
|
||||
class EmptyEntryError(Exception):
    """Signals that processing left a whole subtitle entry without any text."""


class EmptyLineError(Exception):
    # NOTE(review): presumably the single-line counterpart of EmptyEntryError;
    # confirm against the code that raises it.
    """Signals an empty line."""
|
||||
@@ -6,7 +6,8 @@ import pysubs2
|
||||
import logging
|
||||
import time
|
||||
|
||||
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
|
||||
from mods import EMPTY_TAG_PROCESSOR
|
||||
from exc import EmptyEntryError
|
||||
from registry import registry
|
||||
from subzero.language import Language
|
||||
|
||||
@@ -300,11 +301,11 @@ class SubtitleModifications(object):
|
||||
mod = self.initialized_mods[identifier]
|
||||
|
||||
try:
|
||||
line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
|
||||
line = mod.modify(line.strip(), entry=t, debug=self.debug, parent=self, index=index,
|
||||
**args)
|
||||
except EmptyEntryError:
|
||||
if self.debug:
|
||||
logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
|
||||
logger.debug(u"%d: %s: %r -> ''", index, identifier, t)
|
||||
skip_entry = True
|
||||
break
|
||||
|
||||
@@ -329,11 +330,11 @@ class SubtitleModifications(object):
|
||||
mod = self.initialized_mods[identifier]
|
||||
|
||||
try:
|
||||
line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
|
||||
line = mod.modify(line.strip(), entry=t, debug=self.debug, parent=self, index=index,
|
||||
procs=["last_process"], **args)
|
||||
except EmptyEntryError:
|
||||
if self.debug:
|
||||
logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
|
||||
logger.debug(u"%d: %s: %r -> ''", index, identifier, t)
|
||||
skip_entry = True
|
||||
break
|
||||
|
||||
|
||||
@@ -107,9 +107,3 @@ empty_line_post_processors = [
|
||||
]
|
||||
|
||||
|
||||
class EmptyEntryError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class EmptyLineError(Exception):
|
||||
pass
|
||||
|
||||
@@ -7,6 +7,7 @@ from subzero.modification.mods import SubtitleTextModification, empty_line_post_
|
||||
from subzero.modification.processors import FuncProcessor
|
||||
from subzero.modification.processors.re_processor import NReProcessor
|
||||
from subzero.modification import registry
|
||||
from tld import get_tld
|
||||
|
||||
|
||||
ENGLISH = Language("eng")
|
||||
@@ -28,7 +29,7 @@ class CommonFixes(SubtitleTextModification):
|
||||
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),
|
||||
|
||||
# line = _/-/\s
|
||||
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
|
||||
NReProcessor(re.compile(r'(?u)(^\W*[-_.:<>~"\']+\W*$)'), "", name="CM_non_word_only"),
|
||||
|
||||
# remove >>
|
||||
NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
|
||||
@@ -113,7 +114,9 @@ class CommonFixes(SubtitleTextModification):
|
||||
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]| \.))'), r"\1", name="CM_punctuation_space"),
|
||||
|
||||
# add space after punctuation
|
||||
NReProcessor(re.compile(r'(?u)([!?.,:])([A-zÀ-ž]{2,})'), r"\1 \2", name="CM_punctuation_space2"),
|
||||
NReProcessor(re.compile(r'(?u)(([^\s]*)([!?.,:])([A-zÀ-ž]{2,}))'),
|
||||
lambda match: u"%s%s %s" % (match.group(2), match.group(3), match.group(4)) if not get_tld(match.group(1), fail_silently=True, fix_protocol=True) else match.group(1),
|
||||
name="CM_punctuation_space2"),
|
||||
|
||||
# fix lowercase I in english
|
||||
NReProcessor(re.compile(r'(?u)(\b)i(\b)'), r"\1I\2", name="CM_EN_lowercase_i",
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
# coding=utf-8
|
||||
import re
|
||||
|
||||
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG
|
||||
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, TAG
|
||||
from subzero.modification.exc import EmptyEntryError
|
||||
from subzero.modification.processors.re_processor import NReProcessor
|
||||
from subzero.modification import registry
|
||||
|
||||
@@ -46,7 +47,7 @@ class HearingImpaired(SubtitleTextModification):
|
||||
name="HI_before_colon_noncaps"),
|
||||
|
||||
# brackets (only remove if at least 3 chars in brackets)
|
||||
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
|
||||
NReProcessor(re.compile(ur'(?sux)-?%(t)s["\']*[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]]["\']*[\s:]*%(t)s' %
|
||||
{"t": TAG}), "", name="HI_brackets"),
|
||||
|
||||
#NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
|
||||
@@ -90,8 +91,8 @@ class HearingImpaired(SubtitleTextModification):
|
||||
"", name="HI_music_symbols_only"),
|
||||
|
||||
# remove music entries
|
||||
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
|
||||
"", name="HI_music"),
|
||||
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[*#¶♫♪]+\s*.+|.+\s*[*#¶♫♪]+\s*$)'),
|
||||
"", name="HI_music", entry=True),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ class Processor(object):
|
||||
supported = None
|
||||
enabled = True
|
||||
|
||||
def __init__(self, name=None, parent=None, supported=None):
|
||||
def __init__(self, name=None, parent=None, supported=None, **kwargs):
|
||||
self.name = name
|
||||
self.parent = parent
|
||||
self.supported = supported if supported else lambda parent: True
|
||||
@@ -35,7 +35,7 @@ class Processor(object):
|
||||
class FuncProcessor(Processor):
|
||||
func = None
|
||||
|
||||
def __init__(self, func, name=None, parent=None, supported=None):
|
||||
def __init__(self, func, name=None, parent=None, supported=None, **kwargs):
|
||||
super(FuncProcessor, self).__init__(name=name, supported=supported)
|
||||
self.func = func
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
import re
|
||||
import logging
|
||||
|
||||
from subzero.modification.exc import EmptyEntryError
|
||||
from subzero.modification.processors import Processor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -14,13 +15,22 @@ class ReProcessor(Processor):
|
||||
pattern = None
|
||||
replace_with = None
|
||||
|
||||
def __init__(self, pattern, replace_with, name=None, supported=None):
|
||||
def __init__(self, pattern, replace_with, name=None, supported=None, entry=False, **kwargs):
|
||||
super(ReProcessor, self).__init__(name=name, supported=supported)
|
||||
self.pattern = pattern
|
||||
self.replace_with = replace_with
|
||||
self.use_entry = entry
|
||||
|
||||
def process(self, content, debug=False, **kwargs):
|
||||
return self.pattern.sub(self.replace_with, content)
|
||||
def process(self, content, debug=False, entry=None, **kwargs):
    """
    Apply the pattern to a single line or, for entry-level processors, to the whole entry.

    :param content: text of the line currently being processed
    :param debug: not used by this method; accepted for signature compatibility
    :param entry: full text of the entry the line belongs to; only consulted when the
        processor was created with ``entry=True``
    :return: the substituted line; for entry-level processors the substituted entry if
        the pattern changed it, otherwise ``content`` unchanged
    :raises EmptyEntryError: if substituting on the entry leaves nothing behind
    """
    if not self.use_entry or entry is None:
        # Line-level processor, or an entry-level one called without an entry:
        # re.sub() cannot operate on None, so fall back to the line itself.
        return self.pattern.sub(self.replace_with, content)

    ret = self.pattern.sub(self.replace_with, entry)
    if not ret:
        # the whole entry matched and was wiped; let the caller drop the entry
        raise EmptyEntryError()
    elif ret != entry:
        return ret
    return content
|
||||
|
||||
|
||||
class NReProcessor(ReProcessor):
|
||||
@@ -36,7 +46,7 @@ class MultipleWordReProcessor(ReProcessor):
|
||||
}
|
||||
replaces found key in pattern with the corresponding value in data
|
||||
"""
|
||||
def __init__(self, snr_dict, name=None, parent=None, supported=None):
|
||||
def __init__(self, snr_dict, name=None, parent=None, supported=None, **kwargs):
|
||||
super(ReProcessor, self).__init__(name=name, supported=supported)
|
||||
self.snr_dict = snr_dict
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ class StringProcessor(Processor):
|
||||
String replacement processor base
|
||||
"""
|
||||
|
||||
def __init__(self, search, replace, name=None, parent=None, supported=None):
|
||||
def __init__(self, search, replace, name=None, parent=None, supported=None, **kwargs):
|
||||
super(StringProcessor, self).__init__(name=name, supported=supported)
|
||||
self.search = search
|
||||
self.replace = replace
|
||||
@@ -31,7 +31,7 @@ class MultipleLineProcessor(Processor):
|
||||
"data": {"old_value": "new_value"}
|
||||
}
|
||||
"""
|
||||
def __init__(self, snr_dict, name=None, parent=None, supported=None):
|
||||
def __init__(self, snr_dict, name=None, parent=None, supported=None, **kwargs):
|
||||
super(MultipleLineProcessor, self).__init__(name=name, supported=supported)
|
||||
self.snr_dict = snr_dict
|
||||
|
||||
|
||||
@@ -19,6 +19,8 @@ I can't keep running. L can't!
|
||||
<b>i don't know. Some kind of wrong "1 00" number---
|
||||
of signal, drawing the Tardis off.... course.</b>
|
||||
# I'm singing in the rain
|
||||
www.website.com
|
||||
www.nowebsite.badlol
|
||||
|
||||
4
|
||||
00:00:16,099 --> 00:00:17,224
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from .utils import get_fld, get_tld, get_tld_names, is_tld, parse_tld, Result, update_tld_names
|
||||
__title__ = u'tld'
|
||||
__version__ = u'0.11.10'
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'get_fld', u'get_tld', u'get_tld_names', u'is_tld',
|
||||
u'parse_tld', u'Result', u'update_tld_names')
|
||||
@@ -0,0 +1,57 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from six import with_metaclass as _py_backwards_six_withmetaclass
|
||||
from codecs import open as codecs_open
|
||||
try:
|
||||
from urllib.request import urlopen
|
||||
except ImportError:
|
||||
from six.moves.urllib.request import urlopen as urlopen
|
||||
from .exceptions import TldIOError, TldImproperlyConfigured
|
||||
from .helpers import project_dir
|
||||
from .registry import Registry
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'BaseTLDSourceParser',)
|
||||
|
||||
|
||||
class BaseTLDSourceParser(
        _py_backwards_six_withmetaclass(Registry, *[object])):
    u"""Base TLD source parser.

    Subclasses are expected to set ``uid``, ``source_url`` and ``local_path``
    and to implement ``get_tld_names``.
    """

    uid = None
    source_url = None
    local_path = None

    @classmethod
    def validate(cls):
        u"""Check that the parser class declares a ``uid``.

        :raises TldImproperlyConfigured: if ``uid`` is empty or unset.
        """
        if not cls.uid:
            raise TldImproperlyConfigured(
                u'The `uid` property of the TLD source parser shall be defined.')

    @classmethod
    def get_tld_names(cls, fail_silently=False, retry_count=0):
        u"""Get tld names.

        The base implementation only validates the class and then refuses to run.

        :param fail_silently:
        :param retry_count:
        :raises NotImplementedError: always, after validation succeeded.
        """
        cls.validate()
        raise NotImplementedError(
            u'Your TLD source parser shall implement `get_tld_names` method.')

    @classmethod
    def update_tld_names(cls, fail_silently=False):
        u"""Update the local copy of the TLD file.

        Downloads ``cls.source_url`` and writes its UTF-8 decoded content to
        ``project_dir(cls.local_path)``.

        :param fail_silently: return ``False`` instead of raising on failure.
        :return: ``True`` on success, ``False`` on failure when ``fail_silently``.
        :raises TldIOError: on any failure when ``fail_silently`` is false.
        """
        try:
            remote_file = urlopen(cls.source_url)
            try:
                # Fetch and decode everything first, so a failed download does
                # not leave a truncated local file behind.
                content = remote_file.read().decode(u'utf8')
            finally:
                remote_file.close()

            local_file = codecs_open(project_dir(
                cls.local_path), u'wb', encoding='utf8')
            try:
                local_file.write(content)
            finally:
                # always release the handle, even if the write fails
                local_file.close()
        except Exception as err:
            if fail_silently:
                return False
            raise TldIOError(err)
        return True
|
||||
@@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from typing import Any
|
||||
from . import defaults
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'get_setting', u'reset_settings', u'set_setting', u'settings')
|
||||
|
||||
|
||||
class Settings(object):
    u"""Settings registry: explicit overrides layered on top of the ``defaults`` module."""

    def __init__(self):
        self._settings = {}
        self._settings_get = self._settings.get

    def set(self, name, value):
        u"""Override a default setting.

        :param str name:
        :param mixed value:
        """
        self._settings[name] = value

    def get(self, name, default=None):
        u"""Look a setting up, preferring overrides over module defaults.

        :param str name:
        :param mixed default: Returned when neither an override nor a default exists.
        :return mixed:
        """
        overrides = self._settings
        if name not in overrides:
            if hasattr(defaults, name):
                return getattr(defaults, name, default)
            return default
        return overrides[name]

    def reset(self):
        u"""Re-apply every value exported by ``defaults`` as the current setting."""
        for default_name in defaults.__all__:
            default_value = getattr(defaults, default_name)
            self.set(default_name, default_value)


# Module-wide singleton and its bound accessors.
settings = Settings()
get_setting = settings.get
set_setting = settings.set
reset_settings = settings.reset
|
||||
@@ -0,0 +1,13 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from os.path import dirname
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'DEBUG', u'NAMES_LOCAL_PATH_PARENT')
|
||||
NAMES_LOCAL_PATH_PARENT = dirname(__file__)
|
||||
DEBUG = False
|
||||
@@ -0,0 +1,49 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from .conf import get_setting
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'TldBadUrl', u'TldDomainNotFound',
|
||||
u'TldImproperlyConfigured', u'TldIOError')
|
||||
|
||||
|
||||
class TldIOError(IOError):
    u"""TldIOError.

    Supposed to be thrown when problems with reading/writing occur.
    """

    def __init__(self, msg=None):
        u"""Build the error.

        :param msg: message (or underlying exception) to carry; when ``None`` a
            default message naming the ``NAMES_LOCAL_PATH`` setting is used.
        """
        if msg is None:
            # Only consult the settings when the default message is really needed.
            msg = (u"Can't read from or write to the %s file!" %
                   get_setting(u'NAMES_LOCAL_PATH'))
        super(TldIOError, self).__init__(msg)
|
||||
|
||||
|
||||
class TldDomainNotFound(ValueError):
    u"""TldDomainNotFound.

    Supposed to be thrown when domain name is not found (didn't match) the
    local TLD policy.
    """

    def __init__(self, domain_name):
        message = u"Domain %s didn't match any existing TLD name!" % domain_name
        super(TldDomainNotFound, self).__init__(message)
|
||||
|
||||
|
||||
class TldBadUrl(ValueError):
    u"""TldBadUrl.

    Supposed to be thrown when bad URL is given.
    """

    def __init__(self, url):
        message = u'Is not a valid URL %s!' % url
        super(TldBadUrl, self).__init__(message)
|
||||
|
||||
|
||||
class TldImproperlyConfigured(Exception):
    u"""TldImproperlyConfigured.

    Supposed to be thrown when code is improperly configured. Typical use-case
    is when user tries to use `get_tld` function with both `search_public` and
    `search_private` set to False.
    """

    def __init__(self, msg=None):
        # Only ``None`` selects the bare message; any other value is appended.
        text = (u'Improperly configured.' if msg is None
                else u'Improperly configured. %s' % msg)
        super(TldImproperlyConfigured, self).__init__(text)
|
||||
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from os.path import abspath, join
|
||||
from .conf import get_setting
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'project_dir', u'PROJECT_DIR')
|
||||
|
||||
|
||||
def project_dir(base):
    u"""Return the absolute path of ``base`` below the ``NAMES_LOCAL_PATH_PARENT`` setting."""
    parent = get_setting(u'NAMES_LOCAL_PATH_PARENT')
    combined = join(parent, base)
    # backslashes are normalised to forward slashes before the path is made absolute
    normalised = combined.replace(u'\\', u'/')
    return abspath(normalised)
|
||||
|
||||
|
||||
PROJECT_DIR = project_dir
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
@@ -0,0 +1,5 @@
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
@@ -0,0 +1,41 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from typing import Type, Dict
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'Registry',)
|
||||
|
||||
|
||||
class Registry(type):
    u"""Metaclass that records every class it creates in ``REGISTRY``.

    Classes are keyed by their ``uid`` attribute, falling back to the class
    name when no ``uid`` is defined. Classes whose key is falsy are not
    recorded.
    """

    REGISTRY = {}

    def __new__(mcs, name, bases, attrs):
        created = type.__new__(mcs, name, bases, attrs)
        key = getattr(created, u'_uid', None)
        if key:
            mcs.REGISTRY[key] = created
        return created

    @property
    def _uid(cls):
        # ``uid`` when the class defines one, otherwise the class name.
        return getattr(cls, 'uid', cls.__name__)

    @classmethod
    def reset(cls):
        # Rebind (rather than clear) so previously handed-out references to
        # the old mapping are left untouched.
        cls.REGISTRY = dict()

    @classmethod
    def get(cls, key, default=None):
        registry = cls.REGISTRY
        return registry.get(key, default)

    @classmethod
    def items(cls):
        registry = cls.REGISTRY
        return registry.items()
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,57 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from typing import Dict
|
||||
try:
|
||||
from urllib.parse import SplitResult
|
||||
except ImportError:
|
||||
from six.moves.urllib_parse import SplitResult
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'Result',)
|
||||
|
||||
|
||||
class Result(object):
    u"""Container holding the pieces of a parsed domain name.

    Attributes are ``tld``, ``domain``, ``subdomain`` and ``parsed_url``,
    plus the read-only ``fld`` (first level domain) computed at
    construction time.
    """

    __slots__ = (u'subdomain', u'domain', u'tld', u'__fld', u'parsed_url')

    def __init__(self, tld, domain, subdomain, parsed_url):
        self.parsed_url = parsed_url
        self.subdomain = subdomain
        self.tld = tld
        # An empty-string domain is replaced by the TLD itself.
        self.domain = tld if domain == u'' else domain

        if not domain:
            # No separate domain part: the first level domain is the TLD.
            self.__fld = self.tld
        else:
            pieces = [u'{}'.format(self.domain), u'{}'.format(self.tld)]
            self.__fld = u'.'.join(pieces)

    @property
    def extension(self):
        u"""Alias of ``tld``.

        :return str:
        """
        return self.tld

    suffix = extension

    @property
    def fld(self):
        u"""First level domain.

        :return:
        :rtype: str
        """
        return self.__fld

    def __str__(self):
        return self.tld

    __repr__ = __str__

    @property
    def __dict__(self):
        u"""Mimic ``__dict__`` functionality (the class uses ``__slots__``).

        :return:
        :rtype: dict
        """
        exposed = (u'tld', u'domain', u'subdomain', u'fld', u'parsed_url')
        return {name: getattr(self, name) for name in exposed}
|
||||
@@ -0,0 +1,11 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
import unittest
|
||||
from .test_core import *
|
||||
from .test_commands import *
|
||||
# Run the collected tests when the package module is executed directly.
if __name__ == u'__main__':
    unittest.main()
|
||||
@@ -0,0 +1,65 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
import logging
|
||||
import socket
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'internet_available_only', u'log_info')
|
||||
LOG_INFO = True
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def log_info(func):
    u"""Decorate a test method so its outcome is written to the debug log.

    When the module-level ``LOG_INFO`` flag is falsy the function is returned
    undecorated. Otherwise the wrapper calls ``func`` and then logs, at DEBUG
    level, the function name, its stripped docstring (if any) and the
    returned value (if it is not ``None``).

    :param func: Method taking ``self`` as its first argument.
    :return: ``func`` itself or a wrapper with the same name and docstring.
    """
    if not LOG_INFO:
        return func

    # Local import keeps this helper self-contained.
    import functools

    # ``wraps`` preserves ``__name__``/``__doc__`` so stacked decorators and
    # test reports see the real test name instead of "inner".
    @functools.wraps(func)
    def inner(self, *args, **kwargs):
        result = func(self, *args, **kwargs)

        LOGGER.debug(u'\n\n%s', func.__name__)
        LOGGER.debug(u'============================')
        if func.__doc__:
            LOGGER.debug(u'""" %s """', func.__doc__.strip())
        LOGGER.debug(u'----------------------------')
        if result is not None:
            LOGGER.debug(result)
        LOGGER.debug(u'\n++++++++++++++++++++++++++++')

        return result

    return inner
|
||||
|
||||
|
||||
@lru_cache(maxsize=32)
def is_internet_available(host='8.8.8.8', port=53, timeout=3):
    u"""Check if internet is available by opening a TCP connection.

    Host: 8.8.8.8 (google-public-dns-a.google.com)
    OpenPort: 53/tcp
    Service: domain (DNS/TCP)

    The outcome is memoised by ``lru_cache`` per argument combination.

    :param host: IPv4 address to connect to.
    :param port: TCP port to connect to.
    :param timeout: Connection timeout in seconds, applied to this socket
        only (the process-wide default timeout is left untouched).
    :return: ``True`` if the connection succeeded, ``False`` on
        ``socket.error``.
    """
    sock = None
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(timeout)
        sock.connect((host, port))
        return True
    except socket.error as ex:
        LOGGER.debug(u'Internet connectivity check failed: %s', ex)
        return False
    finally:
        # Always release the descriptor, whether or not the probe succeeded.
        if sock is not None:
            sock.close()
|
||||
|
||||
|
||||
def internet_available_only(func):
    u"""Decorate a test method so it only runs when the network is reachable.

    The wrapper consults ``is_internet_available()``; when that returns a
    falsy value it logs a "skipping" notice at DEBUG level and returns
    ``None`` without calling ``func``.

    :param func: Method taking ``self`` as its first argument.
    :return: Wrapper with the same name and docstring as ``func``.
    """
    # Local import keeps this helper self-contained.
    import functools

    @functools.wraps(func)
    def inner(self, *args, **kwargs):
        if is_internet_available():
            return func(self, *args, **kwargs)

        LOGGER.debug(u'\n\n%s', func.__name__)
        LOGGER.debug(u'============================')
        if func.__doc__:
            LOGGER.debug(u'""" %s """', func.__doc__.strip())
        LOGGER.debug(u'----------------------------')
        LOGGER.debug(u'Skipping because no Internet connection available.')
        LOGGER.debug(u'\n++++++++++++++++++++++++++++')
        return None

    return inner
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,43 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
import logging
|
||||
import unittest
|
||||
import subprocess
|
||||
from .base import log_info, internet_available_only
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'GPL 2.0/LGPL 2.1'
|
||||
__all__ = (u'TestCommands',)
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestCommands(unittest.TestCase):
    u"""Tests for the ``update-tld-names`` console command."""

    def setUp(self):
        u"""Set up (nothing to prepare)."""

    @internet_available_only
    @log_info
    def test_1_update_tld_names_command(self):
        u"""Run ``update-tld-names`` without arguments; expect no output."""
        output = subprocess.check_output([u'update-tld-names'])
        stripped = output.strip()
        self.assertEqual(stripped, b'')
        return stripped

    @internet_available_only
    @log_info
    def test_1_update_tld_names_mozilla_command(self):
        u"""Run ``update-tld-names mozilla``; expect no output."""
        command = [u'update-tld-names', u'mozilla']
        output = subprocess.check_output(command)
        stripped = output.strip()
        self.assertEqual(stripped, b'')
        return stripped
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly.
if __name__ == u'__main__':
    unittest.main()
|
||||
@@ -0,0 +1,708 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import logging
|
||||
from os.path import abspath, join
|
||||
import unittest
|
||||
from tempfile import gettempdir
|
||||
from typing import Type
|
||||
try:
|
||||
from urllib.parse import urlsplit
|
||||
except ImportError:
|
||||
from six.moves.urllib_parse import urlsplit
|
||||
from faker import Faker
|
||||
from .. import defaults
|
||||
from ..base import BaseTLDSourceParser
|
||||
from ..conf import get_setting, reset_settings, set_setting
|
||||
from ..exceptions import TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError
|
||||
from ..helpers import project_dir
|
||||
from ..registry import Registry
|
||||
from ..utils import get_fld, get_tld, get_tld_names, get_tld_names_container, is_tld, MozillaTLDSourceParser, BaseMozillaTLDSourceParser, parse_tld, reset_tld_names, update_tld_names, update_tld_names_cli
|
||||
from .base import internet_available_only, log_info
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'TestCore',)
|
||||
LOGGER = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TestCore(unittest.TestCase):
    u"""Core tld functionality tests.

    Fixtures built in ``setUp``:

    - ``good_patterns``: list of dicts with the URL, the expected ``fld``,
      ``subdomain``, ``domain``, ``suffix``, ``tld`` and the ``kwargs`` to
      pass to the function under test.
    - ``bad_patterns``: dict mapping a URL to the exception expected when it
      is processed with ``fail_silently=False`` (plus optional ``kwargs``).
    - ``invalid_tlds``: strings for which ``is_tld`` must return false.
    - ``good_patterns_custom_parser``: like ``good_patterns`` but for the
      custom parser returned by ``get_custom_parser_class``.
    """

    @classmethod
    def setUpClass(cls):
        u"""Create the Faker instance and remember the temp directory."""
        cls.faker = Faker()
        cls.temp_dir = gettempdir()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _pattern(url, fld, subdomain, domain, tld, fix_protocol=False):
        u"""Build one "good pattern" fixture dict.

        ``suffix`` always equals ``tld`` in these fixtures, so it is filled
        in from ``tld``. ``kwargs`` always contains ``fail_silently=True``;
        ``fix_protocol=True`` is added only when requested.
        """
        kwargs = {u'fail_silently': True}
        if fix_protocol:
            kwargs[u'fix_protocol'] = True
        return {
            u'url': url,
            u'fld': fld,
            u'subdomain': subdomain,
            u'domain': domain,
            u'suffix': tld,
            u'tld': tld,
            u'kwargs': kwargs,
        }

    @staticmethod
    def _to_text(value):
        u"""Return the text form of ``value`` on both Python 2 and 3."""
        try:
            return unicode(value)
        except NameError:
            # Python 3: ``unicode`` is gone, ``str`` is already text.
            return str(value)

    def _assert_result_matches(self, result, expected):
        u"""Assert that a ``get_tld(..., as_object=True)`` result matches."""
        self.assertEqual(result.tld, expected[u'tld'])
        self.assertEqual(result.subdomain, expected[u'subdomain'])
        self.assertEqual(result.domain, expected[u'domain'])
        self.assertEqual(result.suffix, expected[u'suffix'])
        self.assertEqual(result.fld, expected[u'fld'])
        self.assertEqual(self._to_text(result).encode(u'utf8'),
                         expected[u'tld'].encode(u'utf8'))
        self.assertEqual(result.__dict__, {
            u'tld': result.tld,
            u'domain': result.domain,
            u'subdomain': result.subdomain,
            u'fld': result.fld,
            u'parsed_url': result.parsed_url,
        })

    # ------------------------------------------------------------------
    # Fixtures
    # ------------------------------------------------------------------

    def setUp(self):
        u"""Set up."""
        pattern = self._pattern
        self.good_patterns = [
            pattern(u'http://www.google.co.uk',
                    u'google.co.uk', u'www', u'google', u'co.uk'),
            pattern(u'http://www.v2.google.co.uk',
                    u'google.co.uk', u'www.v2', u'google', u'co.uk'),
            pattern(u'http://хром.гугл.рф',
                    u'гугл.рф', u'хром', u'гугл', u'рф'),
            pattern(u'http://www.google.co.uk:8001/lorem-ipsum/',
                    u'google.co.uk', u'www', u'google', u'co.uk'),
            pattern(u'http://www.me.cloudfront.net',
                    u'me.cloudfront.net', u'www', u'me', u'cloudfront.net'),
            pattern(u'http://www.v2.forum.tech.google.co.uk:8001/lorem-ipsum/',
                    u'google.co.uk', u'www.v2.forum.tech', u'google',
                    u'co.uk'),
            pattern(u'https://pantheon.io/',
                    u'pantheon.io', u'', u'pantheon', u'io'),
            pattern(u'v2.www.google.com',
                    u'google.com', u'v2.www', u'google', u'com',
                    fix_protocol=True),
            pattern(u'//v2.www.google.com',
                    u'google.com', u'v2.www', u'google', u'com',
                    fix_protocol=True),
            pattern(u'http://foo@bar.com',
                    u'bar.com', u'', u'bar', u'com'),
            pattern(u'http://user:foo@bar.com',
                    u'bar.com', u'', u'bar', u'com'),
            pattern(u'https://faguoren.xn--fiqs8s',
                    u'faguoren.xn--fiqs8s', u'', u'faguoren', u'xn--fiqs8s'),
            pattern(u'blogs.lemonde.paris',
                    u'lemonde.paris', u'blogs', u'lemonde', u'paris',
                    fix_protocol=True),
            pattern(u'axel.brighton.ac.uk',
                    u'brighton.ac.uk', u'axel', u'brighton', u'ac.uk',
                    fix_protocol=True),
            pattern(u'm.fr.blogspot.com.au',
                    u'fr.blogspot.com.au', u'm', u'fr', u'blogspot.com.au',
                    fix_protocol=True),
            pattern(u'help.www.福岡.jp',
                    u'www.福岡.jp', u'help', u'www', u'福岡.jp',
                    fix_protocol=True),
            pattern(u'syria.arabic.variant.سوريا',
                    u'variant.سوريا', u'syria.arabic', u'variant', u'سوريا',
                    fix_protocol=True),
            pattern(u'http://www.help.kawasaki.jp',
                    u'www.help.kawasaki.jp', u'', u'www',
                    u'help.kawasaki.jp'),
            pattern(u'http://www.city.kawasaki.jp',
                    u'city.kawasaki.jp', u'www', u'city', u'kawasaki.jp'),
            pattern(u'http://fedoraproject.org',
                    u'fedoraproject.org', u'', u'fedoraproject', u'org'),
            pattern(u'http://www.cloud.fedoraproject.org',
                    u'www.cloud.fedoraproject.org', u'', u'www',
                    u'cloud.fedoraproject.org'),
            pattern(u'https://www.john.app.os.fedoraproject.org',
                    u'john.app.os.fedoraproject.org', u'www', u'john',
                    u'app.os.fedoraproject.org'),
            pattern(u'ftp://www.xn--mxail5aa.xn--11b4c3d',
                    u'xn--mxail5aa.xn--11b4c3d', u'www', u'xn--mxail5aa',
                    u'xn--11b4c3d'),
            pattern(u'http://cloud.fedoraproject.org',
                    u'cloud.fedoraproject.org', u'',
                    u'cloud.fedoraproject.org', u'cloud.fedoraproject.org'),
            pattern(u'github.io',
                    u'github.io', u'', u'github.io', u'github.io',
                    fix_protocol=True),
            # An already-split URL object is accepted as well.
            pattern(urlsplit(u'http://lemonde.fr/article.html'),
                    u'lemonde.fr', u'', u'lemonde', u'fr'),
        ]

        self.bad_patterns = {
            u'v2.www.google.com': {
                u'exception': TldBadUrl,
            },
            u'/index.php?a=1&b=2': {
                u'exception': TldBadUrl,
            },
            u'http://www.tld.doesnotexist': {
                u'exception': TldDomainNotFound,
            },
            u'https://2001:0db8:0000:85a3:0000:0000:ac1f:8001': {
                u'exception': TldDomainNotFound,
            },
            u'http://192.169.1.1': {
                u'exception': TldDomainNotFound,
            },
            u'http://localhost:8080': {
                u'exception': TldDomainNotFound,
            },
            u'https://localhost': {
                u'exception': TldDomainNotFound,
            },
            u'https://localhost2': {
                u'exception': TldImproperlyConfigured,
                u'kwargs': {
                    u'search_public': False,
                    u'search_private': False,
                },
            },
        }

        self.invalid_tlds = {
            u'v2.www.google.com',
            u'tld.doesnotexist',
            u'2001:0db8:0000:85a3:0000:0000:ac1f',
            u'192.169.1.1',
            'localhost',
            u'google.com',
        }

        self.tld_names_local_path_custom = project_dir(
            join(u'tests', u'res', u'effective_tld_names_custom.dat.txt'))

        self.good_patterns_custom_parser = [
            pattern(u'http://www.foreverchild',
                    u'www.foreverchild', u'', u'www', u'foreverchild'),
            pattern(u'http://www.v2.foreverchild',
                    u'v2.foreverchild', u'www', u'v2', u'foreverchild'),
        ]

        reset_settings()

    def tearDown(self):
        u"""Tear down."""
        reset_settings()
        Registry.reset()

    @property
    def good_url(self):
        u"""URL of the first entry in ``good_patterns``."""
        return self.good_patterns[0][u'url']

    @property
    def bad_url(self):
        u"""First key of ``bad_patterns`` (in dict iteration order)."""
        return list(self.bad_patterns.keys())[0]

    def get_custom_parser_class(
            self,
            uid='custom_mozilla',
            source_url=None,
            local_path='tests/res/effective_tld_names_custom.dat.txt'):
        u"""Create a ``BaseMozillaTLDSourceParser`` subclass on the fly.

        :param uid: Value of the ``uid`` class attribute.
        :param source_url: Value of the ``source_url`` class attribute.
        :param local_path: Value of the ``local_path`` class attribute.
        :return: The newly created parser class.
        """
        class_attrs = {
            'uid': uid,
            'source_url': source_url,
            'local_path': local_path,
        }
        return type('CustomMozillaTLDSourceParser',
                    (BaseMozillaTLDSourceParser,),
                    class_attrs)

    # ------------------------------------------------------------------
    # Tests
    # ------------------------------------------------------------------

    @log_info
    def test_0_tld_names_loaded(self):
        u"""Test if tld names are loaded."""
        get_fld(u'http://www.google.co.uk')
        from ..utils import tld_names
        res = len(tld_names) > 0
        self.assertTrue(res)
        return res

    @internet_available_only
    @log_info
    def test_1_update_tld_names(self):
        u"""Test updating the tld names (re-fetch mozilla source)."""
        res = update_tld_names(fail_silently=False)
        self.assertTrue(res)
        return res

    @log_info
    def test_2_fld_good_patterns_pass(self):
        u"""Test `get_fld` good URL patterns."""
        res = []
        for data in self.good_patterns:
            _res = get_fld(data[u'url'], **data[u'kwargs'])
            self.assertEqual(_res, data[u'fld'])
            res.append(_res)
        return res

    @log_info
    def test_3_fld_bad_patterns_pass(self):
        u"""Test `get_fld` bad URL patterns with `fail_silently=True`."""
        res = []
        for url in self.bad_patterns:
            _res = get_fld(url, fail_silently=True)
            self.assertEqual(_res, None)
            res.append(_res)
        return res

    @log_info
    def test_4_override_settings(self):
        u"""Testing settings override."""

        def override_settings():
            u"""Read the current value of the ``DEBUG`` setting."""
            return get_setting(u'DEBUG')

        self.assertEqual(defaults.DEBUG, override_settings())
        set_setting(u'DEBUG', True)
        self.assertEqual(True, override_settings())
        return override_settings()

    @log_info
    def test_5_tld_good_patterns_pass_parsed_object(self):
        u"""Test `get_tld` good URL patterns with `as_object=True`."""
        res = []
        for data in self.good_patterns:
            kwargs = copy.copy(data[u'kwargs'])
            kwargs[u'as_object'] = True
            _res = get_tld(data[u'url'], **kwargs)
            self._assert_result_matches(_res, data)
            res.append(_res)
        return res

    @log_info
    def test_6_override_full_names_path(self):
        u"""Test that `NAMES_LOCAL_PATH_PARENT` changes `project_dir`."""
        default = project_dir(u'dummy.txt')
        override_base = u'/tmp/test'
        set_setting(u'NAMES_LOCAL_PATH_PARENT', override_base)
        modified = project_dir(u'dummy.txt')
        self.assertNotEqual(default, modified)
        self.assertEqual(modified, abspath(u'/tmp/test/dummy.txt'))

    @log_info
    def test_7_public_private(self):
        u"""Test the `search_public` / `search_private` switches."""
        res = get_fld(u'http://silly.cc.ua',
                      fail_silently=True, search_private=False)
        self.assertEqual(res, None)

        res = get_fld(u'http://silly.cc.ua',
                      fail_silently=True, search_private=True)
        self.assertEqual(res, u'silly.cc.ua')

        res = get_fld(u'mercy.compute.amazonaws.com',
                      fail_silently=True, search_private=False,
                      fix_protocol=True)
        self.assertEqual(res, None)

        res = get_fld(u'http://whatever.com',
                      fail_silently=True, search_public=False)
        self.assertEqual(res, None)

    @log_info
    def test_8_fld_bad_patterns_exceptions(self):
        u"""Test that bad URL patterns raise the expected exceptions.

        Returns the list of exceptions that were raised.
        """
        res = []
        for (url, params) in self.bad_patterns.items():
            # Copy so the shared fixture dict is not modified.
            kwargs = dict(params.get(u'kwargs', {}))
            kwargs[u'fail_silently'] = False
            with self.assertRaises(params[u'exception']) as caught:
                get_fld(url, **kwargs)
            # Must happen outside the ``with`` block: nothing after the
            # raising call inside it is ever executed.
            res.append(caught.exception)
        return res

    @log_info
    def test_9_tld_good_patterns_pass(self):
        u"""Test `get_tld` good URL patterns."""
        res = []
        for data in self.good_patterns:
            _res = get_tld(data[u'url'], **data[u'kwargs'])
            self.assertEqual(_res, data[u'tld'])
            res.append(_res)
        return res

    @log_info
    def test_10_tld_bad_patterns_pass(self):
        u"""Test `get_tld` bad URL patterns."""
        res = []
        for url in self.bad_patterns:
            _res = get_tld(url, fail_silently=True)
            self.assertEqual(_res, None)
            res.append(_res)
        return res

    @log_info
    def test_11_parse_tld_good_patterns(self):
        u"""Test `parse_tld` good URL patterns."""
        res = []
        for data in self.good_patterns:
            _res = parse_tld(data[u'url'], **data[u'kwargs'])
            self.assertEqual(
                _res, (data[u'tld'], data[u'domain'], data[u'subdomain']))
            res.append(_res)
        return res

    @log_info
    def test_12_is_tld_good_patterns(self):
        u"""Test `is_tld` good URL patterns."""
        for data in self.good_patterns:
            self.assertTrue(is_tld(data[u'tld']))

    @log_info
    def test_13_is_tld_bad_patterns(self):
        u"""Test `is_tld` bad URL patterns."""
        for _tld in self.invalid_tlds:
            self.assertFalse(is_tld(_tld))

    @log_info
    def test_14_fail_update_tld_names(self):
        u"""Test fail `update_tld_names`."""
        parser_class = self.get_custom_parser_class(
            uid='custom_mozilla_2', source_url='i-do-not-exist')

        with self.assertRaises(TldIOError):
            update_tld_names(fail_silently=False, parser_uid=parser_class.uid)

        self.assertFalse(update_tld_names(
            fail_silently=True, parser_uid=parser_class.uid))

    @log_info
    def test_15_fail_get_fld_wrong_kwargs(self):
        u"""Test fail `get_fld` with wrong kwargs."""
        with self.assertRaises(TldImproperlyConfigured):
            get_fld(self.good_url, as_object=True)

    @log_info
    def test_16_fail_parse_tld(self):
        u"""Test fail `parse_tld`.

        Assert that `parse_tld` returns ``(None, None, None)`` for a bad URL
        when the parser has a non-existent source URL.
        """
        parser_class = self.get_custom_parser_class(
            source_url='i-do-not-exist')
        parsed_tld = parse_tld(
            self.bad_url, fail_silently=False, parser_class=parser_class)
        self.assertEqual(parsed_tld, (None, None, None))

    @log_info
    def test_17_get_tld_names_and_reset_tld_names(self):
        u"""Test fail `get_tld_names` and repair using `reset_tld_names`."""
        tmp_filename = join(gettempdir(), u''.join(
            [u'{}'.format(self.faker.uuid4()), u'.dat.txt']))
        parser_class = self.get_custom_parser_class(
            source_url='i-do-not-exist', local_path=tmp_filename)
        reset_tld_names()

        with self.assertRaises(TldIOError):
            get_tld_names(fail_silently=False, parser_class=parser_class)

        tmp_filename = join(gettempdir(), u''.join(
            [u'{}'.format(self.faker.uuid4()), u'.dat.txt']))
        parser_class_2 = self.get_custom_parser_class(
            source_url='i-do-not-exist-2', local_path=tmp_filename)
        reset_tld_names()

        self.assertIsNone(get_tld_names(
            fail_silently=True, parser_class=parser_class_2))

    @internet_available_only
    @log_info
    def test_18_update_tld_names_cli(self):
        u"""Test the return code of the CLI version of `update_tld_names`."""
        reset_tld_names()
        res = update_tld_names_cli()
        self.assertEqual(res, 0)

    @log_info
    def test_19_parse_tld_custom_tld_names_good_patterns(self):
        u"""Test `parse_tld` good URL patterns for custom tld names."""
        res = []
        for data in self.good_patterns_custom_parser:
            kwargs = copy.copy(data[u'kwargs'])
            kwargs[u'parser_class'] = self.get_custom_parser_class()
            _res = parse_tld(data[u'url'], **kwargs)
            self.assertEqual(
                _res, (data[u'tld'], data[u'domain'], data[u'subdomain']))
            res.append(_res)
        return res

    @log_info
    def test_20_tld_custom_tld_names_good_patterns_pass_parsed_object(self):
        u"""Test `get_tld` good URL patterns for custom tld names."""
        res = []
        for data in self.good_patterns_custom_parser:
            kwargs = copy.copy(data[u'kwargs'])
            kwargs[u'as_object'] = True
            kwargs[u'parser_class'] = self.get_custom_parser_class()
            _res = get_tld(data[u'url'], **kwargs)
            self._assert_result_matches(_res, data)
            res.append(_res)
        return res

    @log_info
    def test_21_reset_tld_names_for_custom_parser(self):
        u"""Test `reset_tld_names` for `tld_names_local_path`."""
        res = []
        parser_class = self.get_custom_parser_class()
        for data in self.good_patterns_custom_parser:
            kwargs = copy.copy(data[u'kwargs'])
            kwargs[u'as_object'] = True
            kwargs[u'parser_class'] = self.get_custom_parser_class()
            _res = get_tld(data[u'url'], **kwargs)
            self._assert_result_matches(_res, data)
            res.append(_res)

        tld_names = get_tld_names_container()
        self.assertIn(parser_class.local_path, tld_names)
        reset_tld_names(parser_class.local_path)
        self.assertNotIn(parser_class.local_path, tld_names)
        return res

    @log_info
    def test_22_fail_define_custom_parser_class_without_uid(self):
        u"""Test fail define custom parser class without `uid`."""

        class CustomParser(BaseTLDSourceParser):
            pass

        class AnotherCustomParser(BaseTLDSourceParser):
            uid = u'another-custom-parser'

        with self.assertRaises(TldImproperlyConfigured):
            CustomParser.get_tld_names()

        with self.assertRaises(NotImplementedError):
            AnotherCustomParser.get_tld_names()

    @log_info
    def test_23_len_trie_nodes(self):
        u"""Test len of the trie nodes."""
        get_tld(u'http://delusionalinsanity.com')
        tld_names = get_tld_names_container()
        self.assertGreater(
            len(tld_names[MozillaTLDSourceParser.local_path]), 0)

    @log_info
    def test_24_get_tld_names_no_arguments(self):
        u"""Test `get_tld_names` called without arguments."""
        tld_names = get_tld_names()
        self.assertGreater(len(tld_names), 0)
|
||||
|
||||
|
||||
# Run this module's tests when it is executed directly.
if __name__ == u'__main__':
    unittest.main()
|
||||
@@ -0,0 +1,54 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'Trie', u'TrieNode')
|
||||
|
||||
|
||||
class TrieNode(object):
    u"""Class representing a single Trie node.

    A fresh node has no ``children`` and no ``exception`` (both ``None``)
    and is neither a ``leaf`` nor ``private`` (both ``False``).
    """

    __slots__ = (u'children', u'exception', u'leaf', u'private')

    def __init__(self):
        self.leaf = self.private = False
        self.children = self.exception = None
|
||||
|
||||
|
||||
class Trie(object):
    u"""An adhoc Trie data structure to store tlds in reverse notation order.

    ``len(trie)`` reports how many times ``add`` has been called.
    """

    def __init__(self):
        self.__nodes = 0
        self.root = TrieNode()

    def __len__(self):
        return self.__nodes

    def add(self, tld, private=False):
        u"""Insert ``tld`` into the trie, walking its labels right to left.

        A label starting with ``!`` stops the walk: the rest of that label
        is stored as ``exception`` on the node reached so far. The node
        where the walk ends is flagged as a leaf, and as private when
        ``private`` is true.
        """
        current = self.root
        for label in reversed(tld.split(u'.')):
            if label.startswith(u'!'):
                current.exception = label[1:]
                break

            if current.children is None:
                current.children = {}

            following = current.children.get(label)
            if following is None:
                following = TrieNode()
            current.children[label] = following
            current = following

        current.leaf = True
        if private:
            current.private = True

        self.__nodes += 1
|
||||
@@ -0,0 +1,271 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
from __future__ import unicode_literals
|
||||
from __future__ import unicode_literals
|
||||
import argparse
|
||||
from codecs import open as codecs_open
|
||||
from backports.functools_lru_cache import lru_cache
|
||||
from os.path import isabs
|
||||
import sys
|
||||
from typing import Dict, Type, Union, Tuple, List
|
||||
try:
|
||||
from urllib.parse import urlsplit, SplitResult
|
||||
except ImportError:
|
||||
from six.moves.urllib_parse import urlsplit, SplitResult
|
||||
from .base import BaseTLDSourceParser
|
||||
from .exceptions import TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError
|
||||
from .helpers import project_dir
|
||||
from .trie import Trie
|
||||
from .registry import Registry
|
||||
from .result import Result
|
||||
__author__ = u'Artur Barseghyan'
|
||||
__copyright__ = u'2013-2019 Artur Barseghyan'
|
||||
__license__ = u'MPL-1.1 OR GPL-2.0-only OR LGPL-2.1-or-later'
|
||||
__all__ = (u'BaseMozillaTLDSourceParser', u'get_fld', u'get_tld', u'get_tld_names', u'get_tld_names_container', u'is_tld', u'MozillaTLDSourceParser', u'parse_tld',
|
||||
u'pop_tld_names_container', u'process_url', u'reset_tld_names', u'Result', u'update_tld_names', u'update_tld_names_cli', u'update_tld_names_container')
|
||||
# Module-level container of loaded TLD data; the helpers below key it by a
# parser's local path and store the corresponding trie object as the value.
tld_names = dict()
|
||||
|
||||
|
||||
def get_tld_names_container():
    """Get container of all tld names.

    :return: The module-level ``tld_names`` mapping itself (not a copy).
    :rtype: dict
    """
    # Reading a module global needs no ``global`` declaration.
    return tld_names
|
||||
|
||||
|
||||
def update_tld_names_container(tld_names_local_path, trie_obj):
    """Update TLD Names container item.

    :param tld_names_local_path: Key under which the entry is stored.
    :param trie_obj: Value to store; replaces any existing entry.
    :return: None
    """
    tld_names[tld_names_local_path] = trie_obj
|
||||
|
||||
|
||||
def pop_tld_names_container(tld_names_local_path):
    """Remove TLD names container item.

    A key that is not present is ignored.

    :param tld_names_local_path: Key of the entry to drop.
    :return: None
    """
    tld_names.pop(tld_names_local_path, None)
|
||||
|
||||
|
||||
@lru_cache(maxsize=128, typed=True)
def update_tld_names(fail_silently=False, parser_uid=None):
    """Update TLD names.

    Results are memoised by ``lru_cache`` per argument combination.

    :param fail_silently: Forwarded to each parser's ``update_tld_names``.
    :param parser_uid: If truthy, only the parser registered under this uid
        is considered; otherwise every registered parser is.
    :return: ``all()`` over the results of the parsers that were updated
        (so ``True`` when none qualified).
    :rtype: bool
    """
    if parser_uid:
        candidates = (Registry.get(parser_uid, None),)
    else:
        candidates = (cls for _uid, cls in Registry.items())

    # A list (not a generator) keeps every update running even after one
    # of them has reported failure.
    outcomes = [
        candidate.update_tld_names(fail_silently=fail_silently)
        for candidate in candidates
        if candidate and candidate.source_url
    ]
    return all(outcomes)
|
||||
|
||||
|
||||
def update_tld_names_cli():
    """CLI wrapper for update_tld_names.

    ``update_tld_names`` returns True on success, so the result is negated
    to produce a process-style status (0 on success, 1 on failure).

    :return: 0 or 1.
    :rtype: int
    """
    cli = argparse.ArgumentParser(description='Update TLD names')
    cli.add_argument(
        u'parser_uid',
        nargs='?',
        default=None,
        help='UID of the parser to update TLD names for.',
    )
    cli.add_argument(
        u'--fail-silently',
        dest='fail_silently',
        default=False,
        action='store_true',
        help='Fail silently',
    )
    options = cli.parse_args(sys.argv[1:])

    succeeded = update_tld_names(
        parser_uid=options.parser_uid,
        fail_silently=options.fail_silently,
    )
    return int(not succeeded)
|
||||
|
||||
|
||||
def get_tld_names(fail_silently=False, retry_count=0, parser_class=None):
    """Build the ``tlds`` list if empty, delegating to a parser class.

    :param fail_silently: Forwarded to the parser's ``get_tld_names``.
    :param retry_count: Forwarded to the parser's ``get_tld_names``.
    :param parser_class: Parser to delegate to; ``MozillaTLDSourceParser``
        is used when this is falsy.
    :type fail_silently: bool
    :type retry_count: int
    :type parser_class: BaseTLDSourceParser
    :return: Whatever ``parser_class.get_tld_names`` returns.
    """
    source = parser_class or MozillaTLDSourceParser
    return source.get_tld_names(
        fail_silently=fail_silently,
        retry_count=retry_count,
    )
|
||||
|
||||
|
||||
class BaseMozillaTLDSourceParser(BaseTLDSourceParser):
    """Parser for line-oriented TLD lists read from ``cls.local_path``."""

    @classmethod
    def get_tld_names(cls, fail_silently=False, retry_count=0):
        """Load the TLD list into the module-level container and return it.

        Lines starting with ``/`` and blank lines are skipped, except that
        a comment line containing ``// xn--`` contributes its second
        whitespace-separated token as an entry. Every entry read after a
        line containing ``===BEGIN PRIVATE DOMAINS===`` is marked private.

        :param fail_silently: If True, return None instead of raising.
        :param retry_count: Number of retries already made; more than one
            retry aborts with ``TldIOError`` (or None if failing silently).
        :return: The ``tld_names`` container (a dict keyed by local path),
            or None on failure when ``fail_silently`` is set.
        :raises TldIOError: When the retry budget is exhausted.
        """
        if retry_count > 1:
            if fail_silently:
                return None
            raise TldIOError

        _tld_names = tld_names

        # Already loaded for this parser: nothing to do.
        if _tld_names.get(cls.local_path) is not None:
            return _tld_names

        try:
            if isabs(cls.local_path):
                local_path = cls.local_path
            else:
                local_path = project_dir(cls.local_path)

            # The context manager closes the file exactly once on every
            # path, including when opening or parsing fails.
            with codecs_open(local_path, u'r', encoding='utf8') as local_file:
                trie = Trie()
                trie_add = trie.add
                private_section = False

                for line in local_file:
                    if u'===BEGIN PRIVATE DOMAINS===' in line:
                        private_section = True

                    # Punycode names are listed inside comment lines.
                    if u'// xn--' in line:
                        line = line.split()[1]

                    # Strip before testing so that CRLF / whitespace-only
                    # blank lines and indented comments are skipped too
                    # instead of being inserted as bogus entries.
                    entry = line.strip()
                    if not entry or entry.startswith(u'/'):
                        continue

                    trie_add(u'{}'.format(entry), private=private_section)

                update_tld_names_container(cls.local_path, trie)
        except IOError:
            # Local copy missing or unreadable: refresh it, then retry.
            cls.update_tld_names(fail_silently=fail_silently)
            return cls.get_tld_names(
                fail_silently=fail_silently,
                retry_count=retry_count + 1,
            )
        except Exception:
            if fail_silently:
                return None
            # Bare ``raise`` keeps the original traceback.
            raise

        return _tld_names
|
||||
|
||||
|
||||
class MozillaTLDSourceParser(BaseMozillaTLDSourceParser):
    """Mozilla TLD source."""

    # Identifier of this parser.
    uid = u'mozilla'
    # Remote location of the list.
    # NOTE(review): plain-HTTP mxr.mozilla.org URL - confirm it still resolves.
    source_url = u'http://mxr.mozilla.org/mozilla/source/netwerk/dns/src/effective_tld_names.dat?raw=1'
    # Local copy; resolved through ``project_dir`` when not absolute.
    local_path = u'res/effective_tld_names.dat.txt'
|
||||
|
||||
|
||||
def process_url(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    """Split a URL's hostname into labels and locate where its TLD starts.

    :param url: URL string, or an already parsed ``SplitResult``.
    :param fail_silently: If True, failures yield ``(None, None,
        parsed_url)`` instead of raising.
    :param fix_protocol: If True, ``https://`` is prepended to a string URL
        that does not start with ``//``, ``http://`` or ``https://``.
    :param search_public: If True, matches in the public section count.
    :param search_private: If True, matches in the private section count.
    :param parser_class: Parser class providing the TLD data.
    :return: ``(domain_parts, non_zero_i, parsed_url)`` where
        ``non_zero_i`` is ``-1`` when the whole hostname is a TLD and
        otherwise the index of the first TLD label (at least 1).
    :rtype: tuple
    :raises TldImproperlyConfigured: Both search flags are False.
    :raises TldBadUrl: The URL has no hostname.
    :raises TldDomainNotFound: No acceptable TLD matched.
    :raises TldIOError: TLD data is unavailable and ``fail_silently`` is
        False.
    """
    if not (search_public or search_private):
        raise TldImproperlyConfigured(
            u'Either `search_public` or `search_private` (or both) shall be set to True.')

    _tld_names = get_tld_names(
        fail_silently=fail_silently, parser_class=parser_class)

    if isinstance(url, SplitResult):
        parsed_url = url
    else:
        url = url.lower()
        if fix_protocol and not url.startswith((u'//', u'http://', u'https://')):
            url = u''.join([u'https://', u'{}'.format(url)])
        parsed_url = urlsplit(url)

    domain_name = parsed_url.hostname
    if not domain_name:
        if fail_silently:
            return (None, None, parsed_url)
        raise TldBadUrl(url=url)

    # The parser returns None when it failed silently; indexing None would
    # raise TypeError and defeat ``fail_silently``.
    if _tld_names is None:
        if fail_silently:
            return (None, None, parsed_url)
        raise TldIOError

    domain_parts = domain_name.split(u'.')
    len_domain_parts = len(domain_parts)
    node = _tld_names[parser_class.local_path].root

    current_length = 0
    tld_length = 0
    match = None

    # Walk the trie from the right-most label towards the left.
    for part in reversed(domain_parts):
        # Cannot descend any further.
        if node.children is None:
            break

        # Exception rule: this label is not part of the TLD.
        if part == node.exception:
            break

        child = node.children.get(part)
        if child is None:
            # Fall back to a wildcard entry.
            child = node.children.get(u'*')
        if child is None:
            break

        current_length += 1
        node = child

        # Remember the deepest leaf seen so far.
        if node.leaf:
            tld_length = current_length
            match = node

    rejected = (
        match is None
        or not match.leaf
        or (not search_public and not match.private)
        or (not search_private and match.private)
    )
    if rejected:
        if fail_silently:
            return (None, None, parsed_url)
        raise TldDomainNotFound(domain_name=domain_name)

    if len_domain_parts == tld_length:
        # The hostname consists of the TLD only.
        non_zero_i = -1
    else:
        non_zero_i = max(1, len_domain_parts - tld_length)

    return (domain_parts, non_zero_i, parsed_url)
|
||||
|
||||
|
||||
def get_fld(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser, **kwargs):
    """Extract the first level domain.

    The result is the TLD plus the single label to its left, based on the
    data of ``parser_class``. All arguments except ``kwargs`` are passed
    on to ``process_url``.

    :param url: URL to get the first level domain from.
    :param fail_silently: If set to True, no exceptions are raised and None
        is returned on failure.
    :param fix_protocol: If set to True, missing or wrong protocol is
        ignored (https is appended instead).
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param parser_class: Parser class providing the TLD data.
    :param kwargs: Only inspected to reject the deprecated ``as_object``.
    :type url: str
    :type fail_silently: bool
    :type fix_protocol: bool
    :type search_public: bool
    :type search_private: bool
    :return: First level domain; the bare hostname when the hostname is
        itself a TLD; None on failure.
    :rtype: str
    :raises TldImproperlyConfigured: ``as_object`` was passed.
    """
    if u'as_object' in kwargs:
        raise TldImproperlyConfigured(
            u'`as_object` argument is deprecated for `get_fld`. Use `get_tld` instead.')

    labels, tld_start, split_url = process_url(
        url=url,
        fail_silently=fail_silently,
        fix_protocol=fix_protocol,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )

    if labels is None:
        return None

    # A negative index signals that the hostname is a TLD on its own.
    if tld_start < 0:
        return split_url.hostname

    return u'.'.join(labels[tld_start - 1:])
|
||||
|
||||
|
||||
def get_tld(url, fail_silently=False, as_object=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    """Extract the top level domain.

    May throw ``TldBadUrl`` or ``TldDomainNotFound`` if a bad URL is
    provided or no TLD match is found, respectively.

    :param url: URL to get top level domain from.
    :param fail_silently: If set to True, no exceptions are raised and None
        is returned on failure.
    :param as_object: If set to True, a ``Result`` object is returned,
        built with ``subdomain``, ``domain``, ``tld`` and ``parsed_url``.
    :param fix_protocol: If set to True, missing or wrong protocol is
        ignored (https is appended instead).
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param parser_class: Parser class providing the TLD data.
    :type url: str
    :type fail_silently: bool
    :type as_object: bool
    :type fix_protocol: bool
    :type search_public: bool
    :type search_private: bool
    :return: String with top level domain (``as_object`` False) or a
        ``Result`` object (``as_object`` True); None on failure.
    """
    labels, tld_start, split_url = process_url(
        url=url,
        fail_silently=fail_silently,
        fix_protocol=fix_protocol,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )

    if labels is None:
        return None

    # A negative index signals that the hostname is a TLD on its own.
    hostname_is_tld = tld_start < 0

    if as_object:
        if hostname_is_tld:
            return Result(
                subdomain=u'',
                domain=u'',
                tld=split_url.hostname,
                parsed_url=split_url,
            )
        return Result(
            subdomain=u'.'.join(labels[:tld_start - 1]),
            domain=u'.'.join(labels[tld_start - 1:tld_start]),
            tld=u'.'.join(labels[tld_start:]),
            parsed_url=split_url,
        )

    if hostname_is_tld:
        return split_url.hostname
    return u'.'.join(labels[tld_start:])
|
||||
|
||||
|
||||
def parse_tld(url, fail_silently=False, fix_protocol=False, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    """Parse TLD into parts.

    :param url: URL to parse.
    :param fail_silently: Forwarded to ``get_tld``.
    :param fix_protocol: Forwarded to ``get_tld``.
    :param search_public: Forwarded to ``get_tld``.
    :param search_private: Forwarded to ``get_tld``.
    :param parser_class: Forwarded to ``get_tld``.
    :return: ``(tld, domain, subdomain)``; ``(None, None, None)`` when the
        URL cannot be resolved.
    :rtype: tuple
    """
    try:
        obj = get_tld(
            url,
            fail_silently=fail_silently,
            as_object=True,
            fix_protocol=fix_protocol,
            search_public=search_public,
            search_private=search_private,
            parser_class=parser_class,
        )
    except (TldBadUrl, TldDomainNotFound, TldImproperlyConfigured, TldIOError):
        return (None, None, None)

    # With ``fail_silently=True`` ``get_tld`` reports failure by returning
    # None rather than raising; reading attributes off it would raise
    # AttributeError.
    if obj is None:
        return (None, None, None)

    return (obj.tld, obj.domain, obj.subdomain)
|
||||
|
||||
|
||||
def is_tld(value, search_public=True, search_private=True, parser_class=MozillaTLDSourceParser):
    """Check if given URL is tld.

    ``value`` is resolved with ``get_tld`` (failing silently, with
    protocol fixing enabled) and compared with the TLD that comes back.

    :param value: URL to get top level domain from.
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param parser_class: Parser class providing the TLD data.
    :type value: str
    :type search_public: bool
    :type search_private: bool
    :return: True when ``value`` equals its own TLD.
    :rtype: bool
    """
    detected = get_tld(
        url=value,
        fail_silently=True,
        fix_protocol=True,
        search_public=search_public,
        search_private=search_private,
        parser_class=parser_class,
    )
    return value == detected
|
||||
|
||||
|
||||
def reset_tld_names(tld_names_local_path=None):
    """Reset the ``tld_names`` to empty value.

    If ``tld_names_local_path`` is given, removes specified
    entry from ``tld_names`` instead.

    :param tld_names_local_path: Key of the single entry to remove.
    :type tld_names_local_path: str
    :return: None
    """
    global tld_names

    if not tld_names_local_path:
        # Rebind the module global to a fresh dict (no in-place clearing).
        tld_names = {}
        return

    pop_tld_names_container(tld_names_local_path)
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user