submod: common: remove redundant interpunction

This commit is contained in:
panni
2017-12-12 15:02:20 +01:00
parent f6f39b97c8
commit d3ff49ee0c
3 changed files with 11 additions and 6 deletions
+5 -5
View File
@@ -20,7 +20,7 @@ debug = "--debug" in sys.argv
if debug:
logging.basicConfig(level=logging.DEBUG)
sub = Subtitle(Language.fromietf("pol"), mods=["OCR_fixes", "common", "remove_tags", "OCR_fixes", "shift_offset(s=-5, ms=-350)"])
sub = Subtitle(Language.fromietf("eng"), mods=["OCR_fixes", "common", "remove_tags", "OCR_fixes", "shift_offset(s=-5, ms=-350)"])
sub.content = open(fn).read()
sub.normalize()
content = sub.get_modified_content(debug=True)
@@ -35,10 +35,10 @@ content = sub.get_modified_content(debug=True)
#content = fix_text(Subtitle.pysubs2_to_unicode(submod.f, format=format), **ftfy_defaults)\
# .encode(encoding="utf-8")
#print submod.f.to_string("srt", encoding="utf-8")
#print repr(srt)
f = codecs.open("testout.srt", "w+")
f.write(content)
f.close()
#print repr(content)
#f = codecs.open("testout.srt", "w+")
#f.write(content)
#f.close()
#print submod.f.to_string("srt")
#submod.modify("OCR_fixes")
#submod.modify("change_FPS(from=24,to=25)")
@@ -71,6 +71,11 @@ class CommonFixes(SubtitleTextModification):
NReProcessor(re.compile(ur'(?u)((?:[^.\s])+\.\s+)([a-zà-ž])'),
lambda match: ur'%s%s' % (match.group(1), match.group(2).upper()), name="CM_uppercase_after_dot"),
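# remove double interpunction: keep the first , ! or ? and drop the run of punctuation after it (e.g. "this, . is" -> "this, is")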
NReProcessor(re.compile(ur'(?u)(\s*[,!?])\s*([,.!?][,.!?\s]*)'),
lambda match: match.group(1).strip() + (" " if match.group(2).endswith(" ") else ""),
name="CM_double_interpunct"),
# remove spaces before punctuation
NReProcessor(re.compile(r'(?u)(?:(?<=^)|(?<=\w)) +([!?.,](?![!?.,]))'), r"\1", name="CM_punctuation_space"),
]
+1 -1
View File
@@ -22,7 +22,7 @@ Mah numbar is wrong: 1 91 7
"less text before colons: Earth. Utah, North America."
MUSIC PLAYS What is that sound?!
ls it , and a punctuation issue ? lol
take them balls it. L like turtles
take them balls it. L like turtles !! ! this, . is bad .
6
00:00:19,686 --> 00:00:21,103