Files

168 lines
5.7 KiB
Python

# coding=utf-8
import logging
import chardet
import pysrt
import pysubs2
from bs4 import UnicodeDammit
from subliminal.video import Episode, Movie
from subliminal import Subtitle
logger = logging.getLogger(__name__)
def compute_score(matches, video, scores=None):
    """Sum the scores of `matches` for `video` after dropping redundant matches.

    Redundant matches are removed from a copy of `matches` before summing:

      * `hash` is only trusted when the matches that corroborate it are present as well
        (`series`, `season` and `episode` for an :class:`~subliminal.video.Episode`,
        `title` and `video_codec` for anything else). A trusted `hash` removes everything
        except `hearing_impaired`; an untrusted `hash` is dropped and the remaining matches
        are summed without any further reduction.
      * Without a `hash` match, for :class:`~subliminal.video.Episode` only:

        * `imdb_id` removes `series`, `tvdb_id`, `season`, `episode`, `title` and `year`
        * `tvdb_id` removes `series` and `year`

    The resulting score is not capped, so individual boosts can push it above the hash score.

    :param set matches: names of the properties that matched; left unmodified.
    :param video: the video to get the score with.
    :type video: :class:`~subliminal.video.Video`
    :param dict scores: scores to use, if `None`, the :attr:`~subliminal.video.Video.scores` from the video are used.
    :return: score of the subtitle.
    :rtype: int

    """
    remaining = matches.copy()
    score_table = scores or video.scores

    logger.info('Computing score for matches %r and %r', matches, video)

    video_is_episode = isinstance(video, Episode)

    if 'hash' in remaining:
        # hashes are error-prone: only believe one when other matches back it up
        if video_is_episode:
            required = {"series", "season", "episode"}
        else:
            required = {"title", "video_codec"}

        if required.issubset(remaining):
            logger.debug('Using valid hash, as %s are correct (%r) and (%r)', required, matches, video)
            remaining &= {'hash', 'hearing_impaired'}
        else:
            logger.debug('Ignoring hash as other matches are wrong (missing: %r) and (%r)', required - matches, video)
            remaining -= {"hash"}
    elif video_is_episode:
        # an ID match already implies the listed matches; evaluated in order, so a
        # matching imdb_id removes tvdb_id before the tvdb_id rule is looked at
        implied_by = (
            ('imdb_id', {'series', 'tvdb_id', 'season', 'episode', 'title', 'year'}),
            ('tvdb_id', {'series', 'year'}),
        )
        for identifier, implied in implied_by:
            if identifier in remaining:
                remaining -= implied

    logger.debug('Final matches: %r', remaining)
    score = sum(score_table[name] for name in remaining)
    logger.info('Computed score %d', score)

    return score
class PatchedSubtitle(Subtitle):
    """:class:`~subliminal.Subtitle` with extended encoding guessing and format validation.

    :meth:`guess_encoding` tries language-specific encodings before falling back on chardet and bs4;
    :meth:`is_valid` accepts SubRip content and converts other formats pysubs2 can parse to SubRip.

    """
    # not read or written by any method of this class
    storage_path = None

    def guess_encoding(self):
        """Guess encoding using the language, falling back on chardet and then bs4.

        `utf-8` is always tried first, followed by encodings chosen from the subtitle's language.
        The first encoding that decodes :attr:`content` without error wins.

        :return: the guessed encoding.
        :rtype: str
        :raise: :class:`ValueError` if neither the candidates, chardet nor bs4 produce an encoding.

        """
        logger.info('Guessing encoding for language %s', self.language)

        # always try utf-8 first
        encodings = ['utf-8']

        # add language-specific encodings
        alpha3 = self.language.alpha3
        if alpha3 == 'zho':
            encodings.extend(['gb18030', 'big5'])
        elif alpha3 == 'jpn':
            encodings.append('shift-jis')
        elif alpha3 == 'ara':
            encodings.append('windows-1256')
        elif alpha3 == 'heb':
            encodings.append('windows-1255')
        elif alpha3 == 'tur':
            encodings.extend(['iso-8859-9', 'windows-1254'])
        elif alpha3 in ('srb', 'srp'):
            # Serbian is written in both Latin and Cyrillic script. windows-1250 stays first
            # (previous behaviour); windows-1251 gets its turn when the content holds bytes
            # that are undefined in windows-1250.
            encodings.extend(['windows-1250', 'windows-1251'])
        # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian,
        # Romanian (before 1993 spelling reform) and Albanian.
        # The historic codes are kept; their ISO 639-3 equivalents are listed next to them
        # because the other branches ('zho', 'jpn', ...) compare against ISO 639-3 codes.
        elif alpha3 in ('pol', 'cze', 'ces', 'svk', 'slk', 'hun', 'svn', 'slv', 'bih', 'bos', 'hrv',
                        'rou', 'ron', 'alb', 'sqi'):
            # Eastern European Group 1
            encodings.append('windows-1250')
        # Bulgarian and Macedonian
        elif alpha3 in ('bul', 'mkd'):
            # Eastern European Group 2
            encodings.append('windows-1251')
        else:
            # Western European (windows-1252)
            encodings.append('latin-1')

        # try to decode
        logger.debug('Trying encodings %r', encodings)
        for encoding in encodings:
            try:
                self.content.decode(encoding)
            except UnicodeDecodeError:
                pass
            else:
                logger.info('Guessed encoding %s', encoding)
                return encoding

        logger.warning('Could not guess encoding from language')

        # fallback on chardet
        encoding = chardet.detect(self.content)['encoding']
        logger.info('Chardet found encoding %s', encoding)

        if not encoding:
            # fallback on bs4
            logger.info('Falling back to bs4 detection')
            dammit = UnicodeDammit(self.content)

            # use the module logger; `Log` is not defined in this module
            logger.debug("bs4 detected encoding: %s", dammit.original_encoding)

            if dammit.original_encoding:
                return dammit.original_encoding
            raise ValueError(u"Couldn't guess the proper encoding for %s" % self)

        return encoding

    def is_valid(self):
        """Check if a :attr:`text` is valid SubRip, converting it to SubRip if it is another known format.

        When pysrt rejects the text, pysubs2 is asked to parse it; on success :attr:`content` is
        replaced with the SubRip rendering produced by pysubs2.

        :return: whether or not the subtitle is valid (possibly after conversion).
        :rtype: bool

        """
        if not self.text:
            return False

        # valid srt
        try:
            pysrt.from_string(self.text, error_handling=pysrt.ERROR_RAISE)
        except Exception as e:
            # deliberately broad: any parser failure just means "not SubRip, try pysubs2"
            logger.error("PySRT-parsing failed: %s, trying pysubs2", e)
        else:
            return True

        # something else, try to return srt
        try:
            logger.debug("Trying parsing with PySubs2")
            subs = pysubs2.SSAFile.from_string(self.text)
            # NOTE(review): to_string() presumably returns text rather than bytes - confirm that
            # consumers of `content` (e.g. guess_encoding's decode) cope with that
            self.content = subs.to_string("srt")
        except Exception:
            # broad on purpose (unknown parser errors), but no longer swallows
            # KeyboardInterrupt/SystemExit like a bare except would
            logger.exception("Couldn't convert subtitle %s to .srt format", self)
            return False

        return True