# coding=utf-8 import re from subzero.language import Language from subzero.modification.mods import SubtitleTextModification, empty_line_post_processors, SubtitleModification from subzero.modification.processors import FuncProcessor from subzero.modification.processors.re_processor import NReProcessor from subzero.modification import registry ENGLISH = Language("eng") class CommonFixes(SubtitleTextModification): identifier = "common" description = "Basic common fixes" exclusive = True order = 40 long_description = "Fix common and whitespace/punctuation issues in subtitles" processors = [ # normalize hyphens NReProcessor(re.compile(ur'(?u)([‑‐﹘﹣])'), u"-", name="CM_hyphens"), # -- = em dash NReProcessor(re.compile(r'(?u)(\w|\b|\s|^)(-\s?-{1,2})'), ur"\1—", name="CM_multidash"), # line = _/-/\s NReProcessor(re.compile(r'(?u)(^\W*[-_.:]+\W*$)'), "", name="CM_non_word_only"), # line = : text NReProcessor(re.compile(r'(?u)(^\W*:\s*(?=\w+))'), "", name="CM_empty_colon_start"), # multi space NReProcessor(re.compile(r'(?u)(\s{2,})'), " ", name="CM_multi_space"), # fix music symbols NReProcessor(re.compile(ur'(?u)(?:^[-\s]*[*#¶]+(?![^\s\-*#¶]))|(?:[*#¶]+\s*$)'), u"♪", name="CM_music_symbols"), # '' = " NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛][\'’ʼ❜‘‛]+)'), u'"', name="CM_double_apostrophe"), # double quotes instead of single quotes inside words NReProcessor(re.compile(ur'(?u)([A-zÀ-ž])"([A-zÀ-ž])'), ur"\1'\2", name="CM_double_as_single"), # normalize quotes NReProcessor(re.compile(ur'(?u)(\s*["”“‟„])\s*(["”“‟„]["”“‟„\s]*)'), lambda match: '"' + (" " if match.group(2).endswith(" ") else ""), name="CM_normalize_quotes"), # normalize single quotes NReProcessor(re.compile(ur'(?u)([\'’ʼ❜‘‛])'), u"'", name="CM_normalize_squotes"), # remove leading ... NReProcessor(re.compile(r'(?u)^\.\.\.[\s]*'), "", name="CM_leading_ellipsis"), # remove "downloaded from" tags NReProcessor(re.compile(r'(?ui).+downloaded\s+from.+'), "", name="CM_crap"), # no space after ellipsis NReProcessor(re.compile(r'(?u)\.\.\.(?![\s.,!?\'"])(?!$)'), "... ", name="CM_ellipsis_no_space"), # no space before spaced ellipsis NReProcessor(re.compile(r'(?u)(?<=[^\s])(?> NReProcessor(re.compile(r'(?u)^\s?>>\s*'), "", name="CM_leading_crocodiles"), # replace uppercase I with lowercase L in words NReProcessor(re.compile(ur'(?u)([a-zà-ž]+)(I+)'), lambda match: ur'%s%s' % (match.group(1), "l" * len(match.group(2))), name="CM_uppercase_i_in_word"), # fix spaces in numbers (allows for punctuation: ,.:' (comma/dot only fixed if after space, those may be # countdowns otherwise); don't break up ellipses NReProcessor( re.compile(r'(?u)(\b[0-9]+[0-9:\']*(?