submod: HI: remove music

submod: common: be less aggressive about music symbols submod: HI: be less aggressive about brackets submod: HI: be less aggressive about MAN
subtitle: try decoding with utf-16 by default as well
2019-05-18 06:23:04 +02:00 · 2019-05-17 23:45:06 +02:00 · 2019-05-13 16:14:26 +02:00 · 2019-05-12 06:17:08 +02:00
8 changed files with 96 additions and 33 deletions
@@ -23,7 +23,7 @@
        <key>PlexPluginConsoleLogging</key>
        <string>0</string>
        <key>PlexPluginDevMode</key>
-        <string>0</string>
+        <string>1</string>
         <key>PlexPluginCodePolicy</key>
            <!-- this allows channels to access some python methods which are otherwise blocked, as well as import external code libraries, and interact with the PMS HTTP API -->
            <string>Elevated</string>
@@ -32,7 +32,7 @@

 &lt;h1&gt;Sub-Zero for Plex&lt;/h1&gt;&lt;i&gt;Subtitles done right&lt;/i&gt;

-Version 2.6.5.3074
+Version 2.6.5.3074 DEV

 Originally based on @bramwalet's awesome &lt;a href=&quot;https://github.com/bramwalet/Subliminal.bundle&quot;&gt;Subliminal.bundle&lt;/a&gt;

@@ -210,7 +210,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
            for series in [video.series] + video.alternative_series:
                term = u"%s - %s Season" % (series, p.number_to_words("%sth" % video.season).capitalize())
                logger.debug('Searching for alternative results: %s', term)
-                film = search(term, session=self.session, release=False)
+                film = search(term, session=self.session, release=False, throttle=self.search_throttle)
                if film and film.subtitles:
                    logger.debug('Alternative results found: %s', len(film.subtitles))
                    subtitles += self.parse_results(video, film)
@@ -222,7 +222,7 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
                    term = u"%s S%02i" % (series, video.season)
                    logger.debug('Searching for packs: %s', term)
                    time.sleep(self.search_throttle)
-                    film = search(term, session=self.session)
+                    film = search(term, session=self.session, throttle=self.search_throttle)
                    if film and film.subtitles:
                        logger.debug('Pack results found: %s', len(film.subtitles))
                        subtitles += self.parse_results(video, film)
@@ -236,7 +236,8 @@ class SubsceneProvider(Provider, ProviderSubtitleArchiveMixin):
            more_than_one = len([video.title] + video.alternative_titles) > 1
            for title in [video.title] + video.alternative_titles:
                logger.debug('Searching for movie results: %s', title)
-                film = search(title, year=video.year, session=self.session, limit_to=None, release=False)
+                film = search(title, year=video.year, session=self.session, limit_to=None, release=False,
+                              throttle=self.search_throttle)
                if film and film.subtitles:
                    subtitles += self.parse_results(video, film)
                if more_than_one:
@@ -117,14 +117,14 @@ class Subtitle(Subtitle_):

        logger.info('Guessing encoding for language %s', self.language)

-        encodings = ['utf-8']
+        encodings = ['utf-8', 'utf-16']

        # add language-specific encodings
        # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages

        if self.language.alpha3 == 'zho':
            encodings.extend(['cp936', 'gb2312', 'gbk', 'gb18030', 'hz', 'iso2022_jp_2', 'cp950', 'gb18030', 'big5',
-                              'big5hkscs', 'utf-16'])
+                              'big5hkscs'])
        elif self.language.alpha3 == 'jpn':
            encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2',
                              'iso2022_jp_2004', 'iso2022_jp_3', 'iso2022_jp_ext', ])
@@ -28,6 +28,8 @@ import re

 import enum
 import sys
+import requests
+import time

 is_PY2 = sys.version_info[0] < 3
 if is_PY2:
@@ -55,7 +57,9 @@ def soup_for(url, session=None, user_agent=DEFAULT_USER_AGENT):
        r = Request(url, data=None, headers=dict(HEADERS, **{"User-Agent": user_agent}))
        html = urlopen(r).read().decode("utf-8")
    else:
-        html = session.get(url).text
+        ret = session.get(url)
+        ret.raise_for_status()
+        html = ret.text
    return BeautifulSoup(html, "html.parser")


@@ -243,17 +247,34 @@ def get_first_film(soup, section, year=None, session=None):
    return Film.from_url(url, session=session)


-def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact):
-    soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, "release" if release else "search", term), session=session)
+def search(term, release=True, session=None, year=None, limit_to=SearchTypes.Exact, throttle=0):
+    # note to subscene: if you actually start to randomize the endpoint, we'll have to query your server even more
+    endpoints = ["searching", "search", "srch", "find"]
+    if release:
+        endpoints = ["release"]

-    if "Subtitle search by" in str(soup):
-        rows = soup.find("table").tbody.find_all("tr")
-        subtitles = Subtitle.from_rows(rows)
-        return Film(term, subtitles=subtitles)
+    soup = None
+    for endpoint in endpoints:
+        try:
+            soup = soup_for("%s/subtitles/%s?q=%s" % (SITE_DOMAIN, endpoint, term),
+                            session=session)
+        except requests.HTTPError, e:
+            if e.response.status_code == 404:
+                time.sleep(throttle)
+                # fixme: detect endpoint from html
+                continue
+            raise
+        break

-    for junk, search_type in SearchTypes.__members__.items():
-        if section_exists(soup, search_type):
-            return get_first_film(soup, search_type, year=year, session=session)
+    if soup:
+        if "Subtitle search by" in str(soup):
+            rows = soup.find("table").tbody.find_all("tr")
+            subtitles = Subtitle.from_rows(rows)
+            return Film(term, subtitles=subtitles)

-        if limit_to == search_type:
-            return
+        for junk, search_type in SearchTypes.__members__.items():
+            if section_exists(soup, search_type):
+                return get_first_film(soup, search_type, year=year, session=session)
+
+            if limit_to == search_type:
+                return
@@ -293,6 +293,9 @@ class SubtitleModifications(object):
                    end_tag = line[-5:]
                    line = line[:-5]

+                last_procs_mods = []
+
+                # fixme: this double loop is ugly
                for order, identifier, args in mods:
                    mod = self.initialized_mods[identifier]

@@ -312,6 +315,33 @@ class SubtitleModifications(object):
                        break

                    applied_mods.append(identifier)
+                    if mod.last_processors:
+                        last_procs_mods.append([identifier, args])
+
+                if skip_entry:
+                    lines = []
+                    break
+
+                if skip_line:
+                    continue
+
+                for identifier, args in last_procs_mods:
+                    mod = self.initialized_mods[identifier]
+
+                    try:
+                        line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self, index=index,
+                                          procs=["last_process"], **args)
+                    except EmptyEntryError:
+                        if self.debug:
+                            logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
+                        skip_entry = True
+                        break
+
+                    if not line:
+                        if self.debug:
+                            logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
+                        skip_line = True
+                        break

                if skip_entry:
                    lines = []
@@ -21,6 +21,7 @@ class SubtitleModification(object):
    pre_processors = []
    processors = []
    post_processors = []
+    last_processors = []
    languages = []

    def __init__(self, parent):
@@ -67,15 +68,16 @@ class SubtitleModification(object):
    def post_process(self, content, debug=False, parent=None, **kwargs):
        return self._process(content, self.post_processors, debug=debug, parent=parent, **kwargs)

-    def modify(self, content, debug=False, parent=None, **kwargs):
+    def modify(self, content, debug=False, parent=None, procs=None, **kwargs):
        if not content:
            return

        new_content = content
-        for method in ("pre_process", "process", "post_process"):
+        for method in procs or ("pre_process", "process", "post_process"):
            if not new_content:
                return
-            new_content = getattr(self, method)(new_content, debug=debug, parent=parent, **kwargs)
+            new_content = self._process(new_content, getattr(self, "%sors" % method),
+                                        debug=debug, parent=parent, **kwargs)

        return new_content

@@ -107,3 +109,7 @@ empty_line_post_processors = [

 class EmptyEntryError(Exception):
    pass
+
+
+class EmptyLineError(Exception):
+    pass
@@ -28,7 +28,7 @@ class CommonFixes(SubtitleTextModification):
        NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"),

        # line = _/-/\s
-        NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="CM_non_word_only"),
+        NReProcessor(re.compile(r'(?u)(^\W*[-_.:>~]+\W*$)'), "", name="<CM_non_word_only"),

        # remove >>
        NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"),
@@ -37,7 +37,7 @@ class CommonFixes(SubtitleTextModification):
        NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"),

        # fix music symbols
-        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s*)|(\s*[*#¶]+\s*$)'),
+        NReProcessor(re.compile(ur'(?u)(^[-\s>~]*[*#¶]+\s+)|(\s*[*#¶]+\s*$)'),
                     lambda x: u"♪ " if x.group(1) else u" ♪",
                     name="CM_music_symbols"),

@@ -49,11 +49,11 @@ class HearingImpaired(SubtitleTextModification):
        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([][^([)\]]+?(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' %
                                {"t": TAG}), "", name="HI_brackets"),

-        NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
-                     "", name="HI_bracket_open_start"),
+        #NReProcessor(re.compile(ur'(?sux)-?%(t)s[([]%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+%(t)s$' % {"t": TAG}),
+        #             "", name="HI_bracket_open_start"),

-        NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
-                     name="HI_bracket_open_end"),
+        #NReProcessor(re.compile(ur'(?sux)-?%(t)s(?=[A-zÀ-ž"\'.]{3,})[^([)\]]+[)\]][\s:]*%(t)s' % {"t": TAG}), "",
+        #             name="HI_bracket_open_end"),

        # text before colon (and possible dash in front), max 11 chars after the first whitespace (if any)
        # NReProcessor(re.compile(r'(?u)(^[A-z\-\'"_]+[\w\s]{0,11}:[^0-9{2}][\s]*)'), "", name="HI_before_colon"),
@@ -73,7 +73,7 @@ class HearingImpaired(SubtitleTextModification):
                     supported=lambda p: not p.only_uppercase),

        # remove MAN:
-        NReProcessor(re.compile(ur'(?suxi)(.*MAN:\s*)'), "", name="HI_remove_man"),
+        NReProcessor(re.compile(ur'(?suxi)(\b(?:WO)MAN:\s*)'), "", name="HI_remove_man"),

        # dash in front
        # NReProcessor(re.compile(r'(?u)^\s*-\s*'), "", name="HI_starting_dash"),
@@ -81,13 +81,18 @@ class HearingImpaired(SubtitleTextModification):
        # all caps at start before new sentence
        NReProcessor(re.compile(ur'(?u)^(?=[A-ZÀ-Ž]{4,})[A-ZÀ-Ž-_\s]+\s([A-ZÀ-Ž][a-zà-ž].+)'), r"\1",
                     name="HI_starting_upper_then_sentence", supported=lambda p: not p.only_uppercase),
-
-        # remove music symbols
-        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
-                     "", name="HI_music_symbols_only"),
    ]

    post_processors = empty_line_post_processors
+    last_processors = [
+        # remove music symbols
+        NReProcessor(re.compile(ur'(?u)(^%(t)s[*#¶♫♪\s]*%(t)s[*#¶♫♪\s]+%(t)s[*#¶♫♪\s]*%(t)s$)' % {"t": TAG}),
+                     "", name="HI_music_symbols_only"),
+
+        # remove music entries
+        NReProcessor(re.compile(ur'(?ums)(^[-\s>~]*[♫♪]+\s*.+|.+\s*[♫♪]+\s*$)'),
+                     "", name="HI_music"),
+    ]


 registry.register(HearingImpaired)
Author	SHA1	Message	Date
panni	f337b53ae3	submod: HI: remove music submod: common: be less aggressive about music symbols submod: HI: be less aggressive about brackets submod: HI: be less aggressive about MAN	2019-05-18 06:23:04 +02:00
panni	aea6050d71	subtitle: try decoding with utf-16 by default as well	2019-05-17 23:45:06 +02:00
panni	13d5e0761e	providers: subscene: fix endpoint once again	2019-05-13 16:14:26 +02:00
panni	ce28d0284c	back from dev	2019-05-12 06:17:08 +02:00