submod retry; WIP

submod: HI: remove music
submod: common: be less aggressive about music symbols; submod: HI: be less aggressive about brackets; submod: HI: be less aggressive about MAN
2019-05-19 06:03:55 +02:00 · 2019-05-18 06:23:04 +02:00 · 2019-05-17 23:45:06 +02:00 · 2019-05-13 16:14:26 +02:00 · 2019-05-12 06:17:08 +02:00
10 changed files with 202 additions and 113 deletions
@@ -23,7 +23,7 @@
        <key>PlexPluginConsoleLogging</key>
        <string>0</string>
        <key>PlexPluginDevMode</key>
-        <string>0</string>
+        <string>1</string>
         <key>PlexPluginCodePolicy</key>
            <!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
            <string>Elevated</string>
@@ -32,7 +32,7 @@

 &lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;

-Version 2.6.5.3074
+Version 2.6.5.3074 DEV

 Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;

@@ -210,7 +210,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
            for series in [video.series] + video.alternative_series:
                term = u"%s - %s Season" % (series, p.number_to_words("%sth" % video.season).capitalize())
                logger.debug('Searching for alternative results: %s', term)
-                film = search(term, session=self.session, release=False)
+                film = search(term, session=self.session, release=False, throttle=self.search_throttle)
                if film and film.subtitles:
                    logger.debug('Alternative results found: %s', len(film.subtitles))
                    subtitles += self.parse_results(video, film)
@@ -222,7 +222,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
                    term = u"%s S%02i" % (series, video.season)
                    logger.debug('Searching for packs: %s', term)
                    time.sleep(self.search_throttle)
-                    film = search(term, session=self.session)
+                    film = search(term, session=self.session, throttle=self.search_throttle)
                    if film and film.subtitles:
                        logger.debug('Pack results found: %s', len(film.subtitles))
                        subtitles += self.parse_results(video, film)
@@ -236,7 +236,8 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
            more_than_one = len([video.title] + video.alternative_titles) > 1
            for title in [video.title] + video.alternative_titles:
                logger.debug('Searching for movie results: %s', title)
-                film = search(title, year=video.year, session=self.session, limit_to=None, release=False)
+                film = search(title, year=video.year, session=self.session, limit_to=None, release=False,
+                              throttle=self.search_throttle)
                if film and film.subtitles:
                    subtitles += self.parse_results(video, film)
                if more_than_one:
@@ -117,14 +117,14 @@ class Subtitle(Subtitle_):

        logger.info('Guessing encoding for language %s', self.language)

-        encodings = ['utf-8']
+        encodings = ['utf-8', 'utf-16']

        # add language-specific encodings
        # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages

        if self.language.alpha3 == 'zho':
            encodings.extend(['cp936', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp_2', 'cp950', 'gb18030', 'big5',
-                              'big5hkscs', 'utf-16'])
+                              'big5hkscs'])
        elif self.language.alpha3 == 'jpn':
            encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
                              'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ])
@@ -28,6 +28,8 @@ import re

 import enum
 import sys
+import requests
+import time

 is_PY2 = sys.version_info[0] < 3
 if is_PY2:
@@ -55,7 +57,9 @@ def soup_for(url, session=None, user_agent=DEFAULT_USER_AGENT):
        r = Request(url, data=None, headers=dict(HEADERS, **{"User-Agent": user_agent}))
        html = urlopen(r).read().decode("utf-8")
    else:
-        html = session.get(url).text
+        ret = session.get(url)
+        ret.raise_for_status()
+        html = ret.text
    return BeautifulSoup(html, "html.parser")


@@ -243,17 +247,34 @@ def get_first_film(soup, section, year=None, session=None):
    return Film.from_url(url, session=session)


-def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact):
-    soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, "release" if release else "search", term), session=session)
+def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact, throttle=0):
+    # note to subscene: if you actually start to randomize the endpoint, we'll have to query your server even more
+    endpoints = ["searching", "search", "srch", "find"]
+    if release:
+        endpoints = ["release"]

-    if "Subtitle search by" in str(soup):
-        rows = soup.find("table").tbody.find_all("tr")
-        subtitles = Subtitle.from_rows(rows)
-        return Film(term, subtitles=subtitles)
+    soup = None
+    for endpoint in endpoints:
+        try:
+            soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, endpoint, term),
+                            session=session)
+        except requests.HTTPError, e:
+            if e.response.status_code == 404:
+                time.sleep(throttle)
+                # fixme: detect endpoint from html
+                continue
+            raise
+        break

-    for junk, search_type in SearchTypes.__members__.items():
-        if section_exists(soup, search_type):
-            return get_first_film(soup, search_type, year=year, session=session)
+    if soup:
+        if "Subtitle search by" in str(soup):
+            rows = soup.find("table").tbody.find_all("tr")
+            subtitles = Subtitle.from_rows(rows)
+            return Film(term, subtitles=subtitles)

-        if limit_to == search_type:
-            return
+        for junk, search_type in SearchTypes.__members__.items():
+            if section_exists(soup, search_type):
+                return get_first_film(soup, search_type, year=year, session=session)
+
+            if limit_to == search_type:
+                return
@@ -6,7 +6,7 @@ import pysubs2
 import logging
 import time

-from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
+from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError, FullContentRep
 from registry import registry
 from subzero.language import Language

@@ -257,7 +257,16 @@ class SubtitleModifications(object):
                mod.modify(None, debug=self.debug, parent=self, **args)

    def apply_line_mods(self, new_entries, mods):
-        for index, entry in enumerate(self.f, 1):
+        index = 1
+        entries = self.f[:]
+        entry_count = len(entries)
+
+        while 1:
+            if index > entry_count - 1:
+                break
+
+            entry = entries[index]
+
            applied_mods = []
            lines = []

@@ -265,86 +274,110 @@ class SubtitleModifications(object):
            start_tags = []
            end_tags = []

-            t = entry.text.strip()
-            if not t:
+            text = entry.text.replace(ur"\N", "\n").strip()
+            if not text:
                if self.debug:
                    logger.debug(u"Skipping empty line: %s", index)
+                index += 1
                continue

-            skip_entry = False
-            for line in t.split(ur"\N"):
-                # don't bother the mods with surrounding tags
-                old_line = line
-                line = line.strip()
-                skip_line = False
-                line_count += 1
-
-                if not line:
-                    continue
-
-                # clean {\X0} tags before processing
-                # fixme: handle nested tags?
-                start_tag = u""
-                end_tag = u""
-                if line.startswith(self.font_style_tag_start):
-                    start_tag = line[:5]
-                    line = line[5:]
-                if line[-5:-3] == self.font_style_tag_start:
-                    end_tag = line[-5:]
-                    line = line[:-5]
-
-                for order, identifier, args in mods:
-                    mod = self.initialized_mods[identifier]
-
-                    try:
-                        line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
-                                          **args)
-                    except EmptyEntryError:
-                        if self.debug:
-                            logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
-                        skip_entry = True
-                        break
+            try:
+                for line in text.split("\n"):
+                    # don't bother the mods with surrounding tags
+                    old_line = line
+                    line = line.strip()
+                    skip_line = False
+                    line_count += 1

                    if not line:
+                        continue
+
+                    # clean {\X0} tags before processing
+                    # fixme: handle nested tags?
+                    start_tag = u""
+                    end_tag = u""
+                    if line.startswith(self.font_style_tag_start):
+                        start_tag = line[:5]
+                        line = line[5:]
+                    if line[-5:-3] == self.font_style_tag_start:
+                        end_tag = line[-5:]
+                        line = line[:-5]
+
+                    last_procs_mods = []
+
+                    # fixme: this double loop is ugly
+                    for order, identifier, args in mods:
+                        mod = self.initialized_mods[identifier]
+
+                        line = mod.modify(line.strip(), entry=text, debug=self.debug, parent=self, index=index,
+                                          **args)
+
+                        if not line:
+                            if self.debug:
+                                logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
+                            skip_line = True
+                            break
+
+                        applied_mods.append(identifier)
+                        if mod.last_processors:
+                            last_procs_mods.append([identifier, args])
+
+                    if skip_line:
+                        continue
+
+                    for identifier, args in last_procs_mods:
+                        mod = self.initialized_mods[identifier]
+
+                        line = mod.modify(line.strip(), entry=text, debug=self.debug, parent=self, index=index,
+                                          procs=["last_process"], **args)
+
+                        if not line:
+                            if self.debug:
+                                logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
+                            skip_line = True
+                            break
+
+                    if skip_line:
+                        continue
+
+                    if start_tag:
+                        start_tags.append(start_tag)
+
+                    if end_tag:
+                        end_tags.append(end_tag)
+
+                    # append new line and clean possibly newly added empty tags
+                    cleaned_line = EMPTY_TAG_PROCESSOR.process(start_tag + line + end_tag, debug=self.debug).strip()
+                    if cleaned_line:
+                        # we may have a single closing tag, if so, try appending it to the previous line
+                        if len(cleaned_line) == 5 and cleaned_line.startswith("{\\") and cleaned_line.endswith("0}"):
+                            if lines:
+                                prev_line = lines.pop()
+                                lines.append(prev_line + cleaned_line)
+                                continue
+
+                        lines.append(cleaned_line)
+                    else:
                        if self.debug:
-                            logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
-                        skip_line = True
-                        break
+                            logger.debug(u"%d: Ditching now empty line (%r)", index, line)

-                    applied_mods.append(identifier)
-
-                if skip_entry:
-                    lines = []
-                    break
-
-                if skip_line:
+                if not lines:
+                    # don't bother logging when the entry only had one line
+                    if self.debug and line_count > 1:
+                        logger.debug(u"%d: %r -> ''", index, text)
+                    index += 1
                    continue
+            except EmptyEntryError, e:
+                if self.debug:
+                    logger.debug(u"%d: %s: %r -> ''", index, e.mod.identifier, e.entry)
+                index += 1
+                continue

-                if start_tag:
-                    start_tags.append(start_tag)
-
-                if end_tag:
-                    end_tags.append(end_tag)
-
-                # append new line and clean possibly newly added empty tags
-                cleaned_line = EMPTY_TAG_PROCESSOR.process(start_tag + line + end_tag, debug=self.debug).strip()
-                if cleaned_line:
-                    # we may have a single closing tag, if so, try appending it to the previous line
-                    if len(cleaned_line) == 5 and cleaned_line.startswith("{\\") and cleaned_line.endswith("0}"):
-                        if lines:
-                            prev_line = lines.pop()
-                            lines.append(prev_line + cleaned_line)
-                            continue
-
-                    lines.append(cleaned_line)
-                else:
-                    if self.debug:
-                        logger.debug(u"%d: Ditching now empty line (%r)", index, line)
-
-            if not lines:
-                # don't bother logging when the entry only had one line
-                if self.debug and line_count > 1:
-                    logger.debug(u"%d: %r -> ''", index, entry.text)
+            except FullContentRep, e:
+                if self.debug:
+                    logger.debug(u"%d: %s: %r -> %r", index, e.mod.identifier, text, e.new_content)
+                new_entries.append(e.new_content.replace("\n", ur"\N"))
+                index += 1
                continue

            new_text = ur"\N".join(lines)
@@ -373,6 +406,8 @@ class SubtitleModifications(object):
                entry.text = new_text

            new_entries.append(entry)
+            index += 1
+

 SubMod = SubtitleModifications

@@ -21,6 +21,7 @@ class SubtitleModification(object):
    pre_processors = []
    processors = []
    post_processors = []
+    last_processors = []
    languages = []

    def __init__(self, parent):
@@ -46,7 +47,7 @@ class SubtitleModification(object):
                continue

            old_content = new_content
-            new_content = processor.process(new_content, debug=debug, **kwargs)
+            new_content = processor.process(new_content, debug=debug, mod=self, **kwargs)
            if not new_content:
                if debug:
                    logger.debug("Processor returned empty line: %s", processor.name)
@@ -67,15 +68,16 @@ class SubtitleModification(object):
    def post_process(self, content, debug=False, parent=None, **kwargs):
        return self._process(content, self.post_processors, debug=debug, parent=parent, **kwargs)

-    def modify(self, content, debug=False, parent=None, **kwargs):
+    def modify(self, content, debug=False, parent=None, procs=None, **kwargs):
        if not content:
            return

        new_content = content
-        for method in ("pre_process", "process", "post_process"):
+        for method in procs or ("pre_process", "process", "post_process"):
            if not new_content:
                return
-            new_content = getattr(self, method)(new_content, debug=debug, parent=parent, **kwargs)
+            new_content = self._process(new_content, getattr(self, "%sors" % method),
+                                        debug=debug, parent=parent, **kwargs)

        return new_content

@@ -105,5 +107,22 @@ empty_line_post_processors = [
 ]


-class EmptyEntryError(Exception):
+class ModEvent(Exception):
+    def __init__(self, *args, **kwargs):
+        self.mod = kwargs.pop("mod", None)
+        self.entry = kwargs.pop("entry", None)
+        super(ModEvent, self).__init__(*args, **kwargs)
+
+
+class EmptyEntryError(ModEvent):
    pass
+
+
+class EmptyLineError(ModEvent):
+    pass
+
+
+class FullContentRep(ModEvent):
+    def __init__(self, *args, **kwargs):
+        self.new_content = kwargs.pop("new_content", None)
+        super(FullContentRep, self).__init__(*args, **kwargs)
@@ -28,7 +28,7 @@ class CommonFixes(SubtitleTextModification):
        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),

        # line = _/-/\s
-        NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="CM_non_word_only"),
+        NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),

        # remove >>
        NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
@@ -37,7 +37,7 @@ class CommonFixes(SubtitleTextModification):
        NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),

        # fix music symbols
-        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'),
+        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
                     lambda x: u"♪ " if x.group(1) else u" ♪",
                     name="CM_music_symbols"),

@@ -1,7 +1,8 @@
 # coding=utf-8
 import re

-from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG
+from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, EmptyEntryError, TAG, \
+    FullContentRep
 from subzero.modification.processors.re_processor import NReProcessor
 from subzero.modification import registry

@@ -10,9 +11,11 @@ class FullBracketEntryProcessor(NReProcessor):
    def process(self, content, debug=False, **kwargs):
        entry = kwargs.get("entry")
        if entry:
-            rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs)
-            if not rep_content.strip():
-                raise EmptyEntryError()
+            rep_content = super(FullBracketEntryProcessor, self).process(entry, debug=debug, **kwargs).strip()
+            if not rep_content:
+                raise EmptyEntryError(mod=self.mod, entry=entry)
+            if content != rep_content:
+                raise FullContentRep(new_content=rep_content, mod=self.mod, entry=entry)
        return content


@@ -49,11 +52,11 @@ class HearingImpaired(SubtitleTextModification):
        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
                                {"t": TAG}), "", name="HI_brackets"),

-        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
+        FullBracketEntryProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
                     "", name="HI_bracket_open_start"),

-        NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
-                     name="HI_bracket_open_end"),
+        #NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
+        #             name="HI_bracket_open_end"),

        # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
        # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
@@ -73,7 +76,7 @@ class HearingImpaired(SubtitleTextModification):
                     supported=lambda p: not p.only_uppercase),

        # remove MAN:
-        NReProcessor(re.compile(ur'(?suxi)(.*MAN:\s*)'), "", name="HI_remove_man"),
+        NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),

        # dash in front
        # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
@@ -81,13 +84,18 @@ class HearingImpaired(SubtitleTextModification):
        # all caps at start before new sentence
        NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                     name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
-
-        # remove music symbols
-        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
-                     "", name="HI_music_symbols_only"),
    ]

    post_processors = empty_line_post_processors
+    last_processors = [
+        # remove music symbols
+        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
+                     "", name="HI_music_symbols_only"),
+
+        # remove music entries
+        NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
+                     "", name="HI_music"),
+    ]


 registry.register(HearingImpaired)
@@ -7,12 +7,14 @@ class Processor(object):
    """
    name = None
    parent = None
+    mod = None
    supported = None
    enabled = True

-    def __init__(self, name=None, parent=None, supported=None):
+    def __init__(self, name=None, parent=None, mod=None, supported=None):
        self.name = name
        self.parent = parent
+        self.mod = mod
        self.supported = supported if supported else lambda parent: True

    @property
@@ -20,6 +22,8 @@ class Processor(object):
        return self.name

    def process(self, content, debug=False, **kwargs):
+        if not self.mod:
+            self.mod = kwargs.get("mod", None)
        return content

    def __repr__(self):
@@ -14,12 +14,13 @@ class ReProcessor(Processor):
    pattern = None
    replace_with = None

-    def __init__(self, pattern, replace_with, name=None, supported=None):
-        super(ReProcessor, self).__init__(name=name, supported=supported)
+    def __init__(self, pattern, replace_with, name=None, supported=None, **kwargs):
+        super(ReProcessor, self).__init__(name=name, supported=supported, **kwargs)
        self.pattern = pattern
        self.replace_with = replace_with

    def process(self, content, debug=False, **kwargs):
+        super(ReProcessor, self).process(content, debug=debug, **kwargs)
        return self.pattern.sub(self.replace_with, content)
Author	SHA1	Message	Date
panni	f8f99f0fb2	submod retry; WIP	2019-05-19 06:03:55 +02:00
panni	f337b53ae3	submod: HI: remove music; submod: common: be less aggressive about music symbols; submod: HI: be less aggressive about brackets; submod: HI: be less aggressive about MAN	2019-05-18 06:23:04 +02:00
panni	aea6050d71	subtitle: try decoding with utf-16 by default as well	2019-05-17 23:45:06 +02:00
panni	13d5e0761e	providers: subscene: fix endpoint once again	2019-05-13 16:14:26 +02:00
panni	ce28d0284c	back from dev	2019-05-12 06:17:08 +02:00