168 lines
5.7 KiB
Python
168 lines
5.7 KiB
Python
# coding=utf-8
|
|
|
|
import logging
|
|
|
|
import chardet
|
|
import pysrt
|
|
import pysubs2
|
|
from bs4 import UnicodeDammit
|
|
from subliminal.video import Episode, Movie
|
|
from subliminal import Subtitle
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def compute_score(matches, video, scores=None):
    """Sum the scores of the `matches` that still count for the `video`.

    Equivalent match combinations are reduced before summing, so that they are not counted twice:

      * `hash` is only trusted when confirming matches are present as well: `series`, `season` and
        `episode` for an :class:`~subliminal.video.Episode`, `title` and `video_codec` for anything else.

        * a trusted `hash` removes everything except `hash` and `hearing_impaired`
        * an untrusted `hash` is removed itself, all other matches are kept

      * without `hash`, for :class:`~subliminal.video.Episode`

        * `imdb_id` removes `series`, `tvdb_id`, `season`, `episode`, `title` and `year`
        * `tvdb_id` removes `series` and `year`

    The sum is not capped, which leaves room for individual boosts.

    :param set matches: the matches to score; the passed set itself is left untouched.
    :param video: the video to get the score with.
    :type video: :class:`~subliminal.video.Video`
    :param dict scores: scores to use, if `None` (or empty), the :attr:`~subliminal.video.Video.scores`
        from the video are used.
    :return: score of the subtitle.
    :rtype: int

    """
    remaining = matches.copy()
    if not scores:
        scores = video.scores

    logger.info('Computing score for matches %r and %r', matches, video)

    is_episode = isinstance(video, Episode)

    if 'hash' not in remaining:
        # no hash involved: only episodes have identifier matches that supersede others
        if is_episode:
            if 'imdb_id' in remaining:
                remaining = remaining - {'series', 'tvdb_id', 'season', 'episode', 'title', 'year'}
            if 'tvdb_id' in remaining:
                remaining = remaining - {'series', 'year'}
    else:
        # hash is error-prone: require other matches to confirm it before trusting it
        if is_episode:
            required = {"series", "season", "episode"}
        else:
            required = {"title", "video_codec"}

        if required.issubset(remaining):
            # every confirming match is present, the hash supersedes the rest
            logger.debug('Using valid hash, as %s are correct (%r) and (%r)', required, matches, video)
            remaining = remaining & {'hash', 'hearing_impaired'}
        else:
            # confirmation is incomplete, drop the hash and score the other matches
            logger.debug('Ignoring hash as other matches are wrong (missing: %r) and (%r)', required - matches, video)
            remaining = remaining - {"hash"}

    # add up whatever survived the reduction
    logger.debug('Final matches: %r', remaining)
    score = sum([scores[name] for name in remaining])
    logger.info('Computed score %d', score)

    return score
|
|
|
|
|
|
class PatchedSubtitle(Subtitle):
    """:class:`~subliminal.Subtitle` with patched encoding guessing and format validation."""

    # unset by default; NOTE(review): presumably assigned by whatever stores the subtitle -- confirm with callers
    storage_path = None

    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet and then on bs4.

        `utf-8` is always tried first, followed by the encodings commonly used for the subtitle's
        language. The first one that decodes :attr:`content` without error wins. If none does,
        chardet is asked, and if chardet has no answer, bs4's :class:`UnicodeDammit` is.

        :return: the guessed encoding.
        :rtype: str
        :raise ValueError: if neither the language candidates, chardet nor bs4 yield an encoding.

        """
        logger.info('Guessing encoding for language %s', self.language)

        # always try utf-8 first
        encodings = ['utf-8']

        # add language-specific encodings
        alpha3 = self.language.alpha3
        if alpha3 == 'zho':
            encodings.extend(['gb18030', 'big5'])
        elif alpha3 == 'jpn':
            encodings.append('shift-jis')
        elif alpha3 == 'ara':
            encodings.append('windows-1256')
        elif alpha3 == 'heb':
            encodings.append('windows-1255')
        elif alpha3 == 'tur':
            encodings.extend(['iso-8859-9', 'windows-1254'])

        # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
        # Romanian (before 1993 spelling reform) and Albanian
        elif alpha3 in ('pol', 'cze', 'svk', 'hun', 'svn', 'bih', 'hrv', 'srb', 'rou', 'alb'):
            # Eastern European Group 1
            encodings.extend(['windows-1250'])

        # Bulgarian, Serbian and Macedonian
        # NOTE(review): 'srb' is already matched by the branch above, so it never reaches this one
        elif alpha3 in ('bul', 'srb', 'mkd'):
            # Eastern European Group 2
            encodings.extend(['windows-1251'])
        else:
            # Western European (windows-1252)
            encodings.append('latin-1')

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)
            except UnicodeDecodeError:
                # not this one, move on to the next candidate
                continue

            logger.info('Guessed encoding %s', encoding)
            return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)
        if encoding:
            return encoding

        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        dammit = UnicodeDammit(self.content)

        # use the module logger here; no `Log` object is defined in this module
        logger.debug("bs4 detected encoding: %s", dammit.original_encoding)

        if dammit.original_encoding:
            return dammit.original_encoding

        raise ValueError(u"Couldn't guess the proper encoding for %s" % self)

    def is_valid(self):
        """Check if a :attr:`text` is a valid SubRip format.

        If strict SubRip parsing fails, the text is parsed with pysubs2 instead and, on success,
        :attr:`content` is replaced with the SubRip rendering of the parsed subtitle.

        :return: whether or not the subtitle is valid (possibly after conversion to SubRip).
        :rtype: bool

        """
        if not self.text:
            return False

        # valid srt
        try:
            pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
        except Exception as e:
            # any parsing problem means "not plain srt"; give pysubs2 a chance below
            logger.error("PySRT-parsing failed: %s, trying pysubs2", e)
        else:
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            subs = pysubs2.SSAFile.from_string(self.text)
            self.content = subs.to_string("srt")
        except Exception:
            # deliberately broad, but lets KeyboardInterrupt/SystemExit propagate
            logger.exception("Couldn't convert subtitle %s to .srt format", self)
            return False

        return True
|