Compare commits

...

5 Commits

Author SHA1 Message Date
panni f8f99f0fb2 submod retry; WIP 2019-05-19 06:03:55 +02:00
panni f337b53ae3 submod: HI: remove music
submod: common: be less aggressive about music symbols
submod: HI: be less aggressive about brackets
submod: HI: be less aggressive about MAN
2019-05-18 06:23:04 +02:00
panni aea6050d71 subtitle: try decoding with utf-16 by default as well 2019-05-17 23:45:06 +02:00
panni 13d5e0761e providers: subscene: fix endpoint once again 2019-05-13 16:14:26 +02:00
panni ce28d0284c back from dev 2019-05-12 06:17:08 +02:00
10 changed files with 202 additions and 113 deletions
+2 -2
View File
@@ -23,7 +23,7 @@
<key>PlexPluginConsoleLogging</key>
<string>0</string>
<key>PlexPluginDevMode</key>
<string>0</string>
<string>1</string>
<key>PlexPluginCodePolicy</key>
<!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
<string>Elevated</string>
@@ -32,7 +32,7 @@
&lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;
Version 2.6.5.3074
Version 2.6.5.3074 DEV
Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;
@@ -210,7 +210,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
for series in [video.series] + video.alternative_series:
term = u"%s - %s Season" % (series, p.number_to_words("%sth" % video.season).capitalize())
logger.debug('Searching for alternative results: %s', term)
film = search(term, session=self.session, release=False)
film = search(term, session=self.session, release=False, throttle=self.search_throttle)
if film and film.subtitles:
logger.debug('Alternative results found: %s', len(film.subtitles))
subtitles += self.parse_results(video, film)
@@ -222,7 +222,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
term = u"%s S%02i" % (series, video.season)
logger.debug('Searching for packs: %s', term)
time.sleep(self.search_throttle)
film = search(term, session=self.session)
film = search(term, session=self.session, throttle=self.search_throttle)
if film and film.subtitles:
logger.debug('Pack results found: %s', len(film.subtitles))
subtitles += self.parse_results(video, film)
@@ -236,7 +236,8 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
more_than_one = len([video.title] + video.alternative_titles) > 1
for title in [video.title] + video.alternative_titles:
logger.debug('Searching for movie results: %s', title)
film = search(title, year=video.year, session=self.session, limit_to=None, release=False)
film = search(title, year=video.year, session=self.session, limit_to=None, release=False,
throttle=self.search_throttle)
if film and film.subtitles:
subtitles += self.parse_results(video, film)
if more_than_one:
@@ -117,14 +117,14 @@ class Subtitle(Subtitle_):
logger.info('Guessing encoding for language %s', self.language)
encodings = ['utf-8']
encodings = ['utf-8', 'utf-16']
# add language-specific encodings
# http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages
if self.language.alpha3 == 'zho':
encodings.extend(['cp936', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp_2', 'cp950', 'gb18030', 'big5',
'big5hkscs', 'utf-16'])
'big5hkscs'])
elif self.language.alpha3 == 'jpn':
encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ])
@@ -28,6 +28,8 @@ import re
import enum
import sys
import requests
import time
is_PY2 = sys.version_info[0] < 3
if is_PY2:
@@ -55,7 +57,9 @@ def soup_for(url, session=None, user_agent=DEFAULT_USER_AGENT):
r = Request(url, data=None, headers=dict(HEADERS, **{"User-Agent": user_agent}))
html = urlopen(r).read().decode("utf-8")
else:
html = session.get(url).text
ret = session.get(url)
ret.raise_for_status()
html = ret.text
return BeautifulSoup(html, "html.parser")
@@ -243,17 +247,34 @@ def get_first_film(soup, section, year=None, session=None):
return Film.from_url(url, session=session)
def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact):
soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, "release" if release else "search", term), session=session)
def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact, throttle=0):
# note to subscene: if you actually start to randomize the endpoint, we'll have to query your server even more
endpoints = ["searching", "search", "srch", "find"]
if release:
endpoints = ["release"]
if "Subtitle search by" in str(soup):
rows = soup.find("table").tbody.find_all("tr")
subtitles = Subtitle.from_rows(rows)
return Film(term, subtitles=subtitles)
soup = None
for endpoint in endpoints:
try:
soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, endpoint, term),
session=session)
except requests.HTTPError, e:
if e.response.status_code == 404:
time.sleep(throttle)
# fixme: detect endpoint from html
continue
raise
break
for junk, search_type in SearchTypes.__members__.items():
if section_exists(soup, search_type):
return get_first_film(soup, search_type, year=year, session=session)
if soup:
if "Subtitle search by" in str(soup):
rows = soup.find("table").tbody.find_all("tr")
subtitles = Subtitle.from_rows(rows)
return Film(term, subtitles=subtitles)
if limit_to == search_type:
return
for junk, search_type in SearchTypes.__members__.items():
if section_exists(soup, search_type):
return get_first_film(soup, search_type, year=year, session=session)
if limit_to == search_type:
return
@@ -6,7 +6,7 @@ import pysubs2
import logging
import time
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError, FullContentRep
from registry import registry
from subzero.language import Language
@@ -257,7 +257,16 @@ class SubtitleModifications(object):
mod.modify(None, debug=self.debug, parent=self, **args)
def apply_line_mods(self, new_entries, mods):
for index, entry in enumerate(self.f, 1):
index = 1
entries = self.f[:]
entry_count = len(entries)
while 1:
if index > entry_count - 1:
break
entry = entries[index]
applied_mods = []
lines = []
@@ -265,86 +274,110 @@ class SubtitleModifications(object):
start_tags = []
end_tags = []
t = entry.text.strip()
if not t:
text = entry.text.replace(ur"\N", "\n").strip()
if not text:
if self.debug:
logger.debug(u"Skipping empty line: %s", index)
index += 1
continue
skip_entry = False
for line in t.split(ur"\N"):
# don't bother the mods with surrounding tags
old_line = line
line = line.strip()
skip_line = False
line_count += 1
if not line:
continue
# clean {\X0} tags before processing
# fixme: handle nested tags?
start_tag = u""
end_tag = u""
if line.startswith(self.font_style_tag_start):
start_tag = line[:5]
line = line[5:]
if line[-5:-3] == self.font_style_tag_start:
end_tag = line[-5:]
line = line[:-5]
for order, identifier, args in mods:
mod = self.initialized_mods[identifier]
try:
line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
**args)
except EmptyEntryError:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
skip_entry = True
break
try:
for line in text.split("\n"):
# don't bother the mods with surrounding tags
old_line = line
line = line.strip()
skip_line = False
line_count += 1
if not line:
continue
# clean {\X0} tags before processing
# fixme: handle nested tags?
start_tag = u""
end_tag = u""
if line.startswith(self.font_style_tag_start):
start_tag = line[:5]
line = line[5:]
if line[-5:-3] == self.font_style_tag_start:
end_tag = line[-5:]
line = line[:-5]
last_procs_mods = []
# fixme: this double loop is ugly
for order, identifier, args in mods:
mod = self.initialized_mods[identifier]
line = mod.modify(line.strip(), entry=text, debug=self.debug, parent=self, index=index,
**args)
if not line:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
skip_line = True
break
applied_mods.append(identifier)
if mod.last_processors:
last_procs_mods.append([identifier, args])
if skip_line:
continue
for identifier, args in last_procs_mods:
mod = self.initialized_mods[identifier]
line = mod.modify(line.strip(), entry=text, debug=self.debug, parent=self, index=index,
procs=["last_process"], **args)
if not line:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
skip_line = True
break
if skip_line:
continue
if start_tag:
start_tags.append(start_tag)
if end_tag:
end_tags.append(end_tag)
# append new line and clean possibly newly added empty tags
cleaned_line = EMPTY_TAG_PROCESSOR.process(start_tag + line + end_tag, debug=self.debug).strip()
if cleaned_line:
# we may have a single closing tag, if so, try appending it to the previous line
if len(cleaned_line) == 5 and cleaned_line.startswith("{\\") and cleaned_line.endswith("0}"):
if lines:
prev_line = lines.pop()
lines.append(prev_line + cleaned_line)
continue
lines.append(cleaned_line)
else:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
skip_line = True
break
logger.debug(u"%d: Ditching now empty line (%r)", index, line)
applied_mods.append(identifier)
if skip_entry:
lines = []
break
if skip_line:
if not lines:
# don't bother logging when the entry only had one line
if self.debug and line_count > 1:
logger.debug(u"%d: %r -> ''", index, text)
index += 1
continue
except EmptyEntryError, e:
if self.debug:
logger.debug(u"%d: %s: %r -> ''", index, e.mod.identifier, e.entry)
index += 1
continue
if start_tag:
start_tags.append(start_tag)
if end_tag:
end_tags.append(end_tag)
# append new line and clean possibly newly added empty tags
cleaned_line = EMPTY_TAG_PROCESSOR.process(start_tag + line + end_tag, debug=self.debug).strip()
if cleaned_line:
# we may have a single closing tag, if so, try appending it to the previous line
if len(cleaned_line) == 5 and cleaned_line.startswith("{\\") and cleaned_line.endswith("0}"):
if lines:
prev_line = lines.pop()
lines.append(prev_line + cleaned_line)
continue
lines.append(cleaned_line)
else:
if self.debug:
logger.debug(u"%d: Ditching now empty line (%r)", index, line)
if not lines:
# don't bother logging when the entry only had one line
if self.debug and line_count > 1:
logger.debug(u"%d: %r -> ''", index, entry.text)
except FullContentRep, e:
if self.debug:
logger.debug(u"%d: %s: %r -> %r", index, e.mod.identifier, text, e.new_content)
new_entries.append(e.new_content.replace("\n", ur"\N"))
index += 1
continue
new_text = ur"\N".join(lines)
@@ -373,6 +406,8 @@ class SubtitleModifications(object):
entry.text = new_text
new_entries.append(entry)
index += 1
SubMod = SubtitleModifications
@@ -21,6 +21,7 @@ class SubtitleModification(object):
pre_processors = []
processors = []
post_processors = []
last_processors = []
languages = []
def __init__(self, parent):
@@ -46,7 +47,7 @@ class SubtitleModification(object):
continue
old_content = new_content
new_content = processor.process(new_content, debug=debug, **kwargs)
new_content = processor.process(new_content, debug=debug, mod=self, **kwargs)
if not new_content:
if debug:
logger.debug("Processor returned empty line: %s", processor.name)
@@ -67,15 +68,16 @@ class SubtitleModification(object):
def post_process(self, content, debug=False, parent=None, **kwargs):
return self._process(content, self.post_processors, debug=debug, parent=parent, **kwargs)
def modify(self, content, debug=False, parent=None, **kwargs):
def modify(self, content, debug=False, parent=None, procs=None, **kwargs):
if not content:
return
new_content = content
for method in ("pre_process", "process", "post_process"):
for method in procs or ("pre_process", "process", "post_process"):
if not new_content:
return
new_content = getattr(self, method)(new_content, debug=debug, parent=parent, **kwargs)
new_content = self._process(new_content, getattr(self, "%sors" % method),
debug=debug, parent=parent, **kwargs)
return new_content
@@ -105,5 +107,22 @@ empty_line_post_processors = [
]
class EmptyEntryError(Exception):
class ModEvent(Exception):
def __init__(self, *args, **kwargs):
self.mod = kwargs.pop("mod", None)
self.entry = kwargs.pop("entry", None)
super(ModEvent, self).__init__(*args, **kwargs)
class EmptyEntryError(ModEvent):
pass
class EmptyLineError(ModEvent):
pass
class FullContentRep(ModEvent):
def __init__(self, *args, **kwargs):
self.new_content = kwargs.pop("new_content", None)
super(FullContentRep, self).__init__(*args, **kwargs)
@@ -28,7 +28,7 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1", name="CM_multidash"),
# line = _/-/\s
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="CM_non_word_only"),
NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),
# remove >>
NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
@@ -37,7 +37,7 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),
# fix music symbols
NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'),
NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
lambda x: u"" if x.group(1) else u"",
name="CM_music_symbols"),
@@ -1,7 +1,8 @@
# coding=utf-8
import re
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG
from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG, \
FullContentRep
from subzero.modification.processors.re_processor import NReProcessor
from subzero.modification import registry
@@ -10,9 +11,11 @@ class FullBracketEntryProcessor(NReProcessor):
def process(self, content, debug=False, **kwargs):
entry = kwargs.get("entry")
if entry:
rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs)
if not rep_content.strip():
raise EmptyEntryError()
rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs).strip()
if not rep_content:
raise EmptyEntryError(mod=self.mod, entry=entry)
if content != rep_content:
raise FullContentRep(new_content=rep_content, mod=self.mod, entry=entry)
return content
@@ -49,11 +52,11 @@ class HearingImpaired(SubtitleTextModification):
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
{"t": TAG}), "", name="HI_brackets"),
NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
FullBracketEntryProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
"", name="HI_bracket_open_start"),
NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
name="HI_bracket_open_end"),
#NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
# name="HI_bracket_open_end"),
# text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
# NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
@@ -73,7 +76,7 @@ class HearingImpaired(SubtitleTextModification):
supported=lambda p: not p.only_uppercase),
# remove MAN:
NReProcessor(re.compile(ur'(?suxi)(.*MAN:\s*)'), "", name="HI_remove_man"),
NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),
# dash in front
# NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
@@ -81,13 +84,18 @@ class HearingImpaired(SubtitleTextModification):
# all caps at start before new sentence
NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
# remove music symbols
NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
"", name="HI_music_symbols_only"),
]
post_processors = empty_line_post_processors
last_processors = [
# remove music symbols
NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
"", name="HI_music_symbols_only"),
# remove music entries
NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
"", name="HI_music"),
]
registry.register(HearingImpaired)
@@ -7,12 +7,14 @@ class Processor(object):
"""
name = None
parent = None
mod = None
supported = None
enabled = True
def __init__(self, name=None, parent=None, supported=None):
def __init__(self, name=None, parent=None, mod=None, supported=None):
self.name = name
self.parent = parent
self.mod = mod
self.supported = supported if supported else lambda parent: True
@property
@@ -20,6 +22,8 @@ class Processor(object):
return self.name
def process(self, content, debug=False, **kwargs):
if not self.mod:
self.mod = kwargs.get("mod", None)
return content
def __repr__(self):
@@ -14,12 +14,13 @@ class ReProcessor(Processor):
pattern = None
replace_with = None
def __init__(self, pattern, replace_with, name=None, supported=None):
super(ReProcessor, self).__init__(name=name, supported=supported)
def __init__(self, pattern, replace_with, name=None, supported=None, **kwargs):
super(ReProcessor, self).__init__(name=name, supported=supported, **kwargs)
self.pattern = pattern
self.replace_with = replace_with
def process(self, content, debug=False, **kwargs):
super(ReProcessor, self).process(content, debug=debug, **kwargs)
return self.pattern.sub(self.replace_with, content)