bazarr/libs/subzero/modification/main.py
2018-10-31 17:08:29 +01:00

381 lines
14 KiB
Python

# coding=utf-8
import traceback
import re
import pysubs2
import logging
import time
from mods import EMPTY_TAG_PROCESSOR, EmptyEntryError
from registry import registry
from subzero.language import Language
logger = logging.getLogger(__name__)


def _build_lowercase_re():
    """
    Build the regular expression that finds a lowercase letter in subtitle text.

    Matches ASCII a-z plus every *lowercase* character between U+00E0 and U+017E. A plain ``à-ž`` range
    must not be used for this: it also spans the division sign and all uppercase Latin Extended-A letters
    (e.g. U+010C, U+0141, U+017D), which would make fully uppercase text look like it contains lowercase.
    The lower bound U+00E0 is kept as it was, so characters below it are still not considered.

    :return: compiled pattern
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3

    candidates = (to_char(code_point) for code_point in range(0xE0, 0x17F))
    lowercase = u"".join(char for char in candidates if char.islower())

    # only letters end up in the character class, so nothing needs escaping
    return re.compile(u"(?sux)[a-z%s]" % lowercase)


# finds the first lowercase letter in a piece of text; no match means the text has no lowercase letter
lowercase_re = _build_lowercase_re()
class SubtitleModifications(object):
    """
    Loads a subtitle and applies registered modifications ("mods") to it.

    As far as this class shows, the flow is: ``load()`` a subtitle, call ``modify()`` with mod identifiers,
    then read the modified file from ``f`` and the identifiers that were actually selected from
    ``mods_used``.
    """

    # log details about skipped/applied mods and timings
    debug = False
    # language of the loaded subtitle, set by load()
    language = None
    # identifier -> instantiated mod; rebound per instance in __init__() and load()
    initialized_mods = {}
    # identifiers/signatures selected by the last modify() call; rebound per instance in __init__()
    mods_used = []
    # result of detect_uppercase() for the loaded subtitle, set by modify()
    only_uppercase = False
    # the loaded subtitle file (pysubs2), or None
    f = None
    # how an inline override tag such as {\i1} starts
    font_style_tag_start = u"{\\"

    def __init__(self, debug=False):
        """
        :param debug: enable debug logging while preparing and applying mods
        """
        self.debug = debug

        # rebind the containers so an instance never mutates the class-level ones
        self.initialized_mods = {}
        self.mods_used = []

    def load(self, fn=None, content=None, language=None, encoding="utf-8"):
        """
        Load a subtitle either from a file or from content; fn wins if both are given.

        :param encoding: used for decoding the content when fn is given, not used in case content is given
        :param language: babelfish.Language language of the subtitle
        :param fn: filename
        :param content: unicode
        :return: bool(self.f), i.e. False when no (or an empty) subtitle is loaded afterwards
        """
        if language:
            self.language = Language.rebuild(language, forced=False)

        self.initialized_mods = {}

        if fn or content:
            # forget the previously loaded file; otherwise a failed load would leave the old file in
            # place and still be reported as a success
            self.f = None

        try:
            if fn:
                self.f = pysubs2.load(fn, encoding=encoding)
            elif content:
                self.f = pysubs2.SSAFile.from_string(content)
        except (IOError,
                UnicodeDecodeError,
                pysubs2.exceptions.UnknownFPSError,
                pysubs2.exceptions.UnknownFormatIdentifierError,
                pysubs2.exceptions.FormatAutodetectionError):
            if fn:
                logger.exception("Couldn't load subtitle: %s: %s", fn, traceback.format_exc())
            elif content:
                logger.exception("Couldn't load subtitle: %s", traceback.format_exc())

        return bool(self.f)

    @classmethod
    def parse_identifier(cls, identifier):
        """
        Split a mod identifier into its name and its parameters.

        Accepts a simple identifier (``name``) or an identifier with params (``name(param=value,...)``).
        Parameter values are returned as the strings found in the identifier.

        :param identifier: mod identifier
        :return: tuple of (name, dict of params)
        :raises ValueError: if a non-empty parameter has no "="
        """
        # simple identifier; this also covers unknown mods, which the callers report themselves
        # instead of getting a ValueError from trying to parse non-existent params
        if "(" not in identifier:
            return identifier, {}

        if identifier in registry.mods:
            return identifier, {}

        # identifier with params; identifier(param=value)
        name, _, raw_params = identifier.partition("(")

        args = {}
        # the last character is the closing parenthesis
        for param in raw_params[:-1].split(","):
            if not param:
                # "name()" or a stray comma
                continue

            # split on the first "=" only, the value itself may contain one
            key, value = param.split("=", 1)
            args[key] = value

        return name, args

    @classmethod
    def get_mod_class(cls, identifier):
        """
        :param identifier: mod identifier, with or without params (params are ignored)
        :return: the mod class registered under the identifier's name
        :raises KeyError: if no such mod is registered
        """
        name, _ = cls.parse_identifier(identifier)
        return registry.mods[name]

    @classmethod
    def get_mod_signature(cls, identifier, **kwargs):
        """
        :param identifier: mod identifier
        :param kwargs: passed on to the mod class' get_signature()
        :return: whatever the mod class' get_signature() returns for kwargs
        """
        return cls.get_mod_class(identifier).get_signature(**kwargs)

    def prepare_mods(self, *mods):
        """
        Select, merge and instantiate the mods that apply to the loaded subtitle.

        Mods are dropped if they aren't registered, don't support self.language, or are uppercase-only
        while the subtitle isn't. Repeated mods with mergeable args are merged into one.

        :param mods: mod identifiers, see parse_identifier()
        :return: tuple of (line_mods, non_line_mods, used_mods); line_mods holds (order, identifier, args),
                 non_line_mods holds (identifier, args), used_mods holds identifiers/signatures
        """
        parsed_mods = [(SubtitleModifications.parse_identifier(mod), mod) for mod in mods]
        final_mods = {}
        line_mods = []
        non_line_mods = []
        used_mods = []
        mods_merged = {}
        mods_merged_log = {}

        for (identifier, args), orig_identifier in parsed_mods:
            if identifier not in registry.mods:
                logger.error("Mod %s not loaded", identifier)
                continue

            mod_cls = registry.mods[identifier]

            # exclusive mod, kill old, use newest
            # NOTE(review): the superseded identifier stays in used_mods - confirm whether that is wanted
            if identifier in final_mods and mod_cls.exclusive:
                final_mods.pop(identifier)

            # language-specific mod, check validity
            if mod_cls.languages and self.language not in mod_cls.languages:
                if self.debug:
                    logger.debug("Skipping %s, because %r is not a valid language for this mod",
                                 identifier, self.language)
                continue

            if mod_cls.only_uppercase and not self.only_uppercase:
                if self.debug:
                    logger.debug("Skipping %s, because the subtitle isn't all uppercase", identifier)
                continue

            if mod_cls.args_mergeable:
                if identifier in mods_merged:
                    # merge args of duplicate mods if possible
                    mods_merged[identifier] = mod_cls.merge_args(mods_merged[identifier], args)
                    mods_merged_log[identifier]["identifiers"].append(orig_identifier)
                    continue

                # first occurrence; remember its position with a placeholder that gets replaced by the
                # merged signature later on
                mods_merged[identifier] = mod_cls.merge_args(args, {})
                mods_merged_log[identifier] = {"identifiers": [orig_identifier],
                                               "final_identifier": orig_identifier}
                used_mods.append("%s_ORIG_POSITION" % identifier)
                continue

            final_mods[identifier] = args
            used_mods.append(orig_identifier)

        # finalize merged mods into final and used mods
        for identifier, args in mods_merged.items():
            # the placeholder is always added together with the mods_merged entry, so it is always found
            pos_preserve_index = used_mods.index("%s_ORIG_POSITION" % identifier)

            # clear empty mods after merging
            if not any(args.values()):
                if self.debug:
                    logger.debug("Skipping %s, empty args", identifier)

                used_mods.pop(pos_preserve_index)
                mods_merged_log.pop(identifier)
                continue

            # clear empty args (for the signature only; the mod itself still gets all merged args)
            final_mod_args = {key: value for key, value in args.items() if value}

            _data = SubtitleModifications.get_mod_signature(identifier, **final_mod_args)
            if _data == mods_merged_log[identifier]["final_identifier"]:
                # nothing changed by merging, nothing to report
                mods_merged_log.pop(identifier)
            else:
                mods_merged_log[identifier]["final_identifier"] = _data

            used_mods[pos_preserve_index] = _data
            final_mods[identifier] = args

        if self.debug:
            for identifier, data in mods_merged_log.items():
                logger.debug("Merged %s to %s", data["identifiers"], data["final_identifier"])

        # separate all mods into line and non-line mods
        for identifier, args in final_mods.items():
            mod_cls = registry.mods[identifier]
            if mod_cls.modifies_whole_file:
                non_line_mods.append((identifier, args))
            else:
                line_mods.append((mod_cls.order, identifier, args))

            # initialize the mods
            if identifier not in self.initialized_mods:
                self.initialized_mods[identifier] = mod_cls(self)

        return line_mods, non_line_mods, used_mods

    def detect_uppercase(self):
        """
        Check whether the loaded subtitle is written in uppercase only.

        Looks at entries until 40 of them contributed text; the first lowercase letter found ends the
        check.

        :return: False if a lowercase letter was found, True otherwise (also if no entry had any text)
        """
        # the first four remove_HI processors are run over every line before it is inspected
        hi_processors = registry.mods["remove_HI"].processors[:4]

        entries_used = 0
        for entry in self.f:
            entry_used = False
            for sub in entry.text.strip().split("\\N"):
                # skip HI bracket entries, those might actually be lowercase
                sub = sub.strip()
                for processor in hi_processors:
                    sub = processor.process(sub)

                if not sub.strip():
                    # skip full entry
                    break

                if lowercase_re.search(sub):
                    return False

                entry_used = True

            if entry_used:
                entries_used += 1

            if entries_used == 40:
                break

        return True

    def modify(self, *mods):
        """
        Apply the given mods to the loaded subtitle (self.f) in place.

        Order of application: whole-file mods without apply_last, then line mods sorted by their order,
        then whole-file mods with apply_last. The selected identifiers end up in self.mods_used. If the
        line mods leave no entry at all, the events of the file are not replaced.

        :param mods: mod identifiers, see parse_identifier()
        """
        new_entries = []
        start = time.time()
        self.only_uppercase = self.detect_uppercase()

        if self.only_uppercase and self.debug:
            logger.debug("Full-uppercase subtitle found")

        line_mods, non_line_mods, mods_used = self.prepare_mods(*mods)
        self.mods_used = mods_used

        # apply non-last file mods
        if non_line_mods:
            non_line_mods_start = time.time()
            self.apply_non_line_mods(non_line_mods)

            if self.debug:
                logger.debug("Non-Line mods took %ss", time.time() - non_line_mods_start)

        # sort line mods
        line_mods.sort(key=lambda x: (x is None, x))

        # apply line mods
        if line_mods:
            line_mods_start = time.time()
            self.apply_line_mods(new_entries, line_mods)

            if self.debug:
                logger.debug("Line mods took %ss", time.time() - line_mods_start)

            if new_entries:
                self.f.events = new_entries

        # apply last file mods
        if non_line_mods:
            non_line_mods_start = time.time()
            self.apply_non_line_mods(non_line_mods, only_last=True)

            if self.debug:
                logger.debug("Final Non-Line mods took %ss", time.time() - non_line_mods_start)

        if self.debug:
            logger.debug("Subtitle Modification took %ss", time.time() - start)
            logger.debug("Mods applied: %s", self.mods_used)

    def apply_non_line_mods(self, mods, only_last=False):
        """
        Run whole-file mods.

        :param mods: list of (identifier, args); the mods have to be in self.initialized_mods
        :param only_last: if set, run only the mods flagged apply_last, otherwise only those that aren't
        """
        for identifier, args in mods:
            mod = self.initialized_mods[identifier]

            # run the mod only in the pass it belongs to
            if bool(only_last) != bool(mod.apply_last):
                continue

            if self.debug:
                logger.debug("Applying %s", identifier)

            mod.modify(None, debug=self.debug, parent=self, **args)

    def _split_style_tags(self, line):
        """
        Cut a leading and a trailing override tag off a line; tags are taken to be 5 characters long.

        :param line: stripped subtitle line
        :return: tuple of (start_tag, line without the tags, end_tag); missing tags are empty strings
        """
        # fixme: handle nested tags?
        start_tag = u""
        end_tag = u""

        if line.startswith(self.font_style_tag_start):
            start_tag, line = line[:5], line[5:]

        if line[-5:-3] == self.font_style_tag_start:
            line, end_tag = line[:-5], line[-5:]

        return start_tag, line, end_tag

    @staticmethod
    def _missing_style_tags(start_tags, end_tags, text):
        """
        Cheap man's approach to avoid open tags: find the counterparts missing for the collected tags.

        :param start_tags: tags that were cut off the start of the lines
        :param end_tags: tags that were cut off the end of the lines
        :param text: the joined text of the entry
        :return: tuple of (start tags to prepend, end tags to append)
        """
        add_start_tags = []
        add_end_tags = []

        # only unbalanced tag counts are looked into
        if len(start_tags) != len(end_tags):
            for tag in start_tags:
                closing = tag.replace("1", "0")
                if closing not in end_tags and text.count(tag) > text.count(closing):
                    add_end_tags.append(closing)

            for tag in end_tags:
                opening = tag.replace("0", "1")
                if opening not in start_tags and text.count(tag) > text.count(opening):
                    add_start_tags.append(opening)

        return add_start_tags, add_end_tags

    def apply_line_mods(self, new_entries, mods):
        """
        Run the line mods over every line of every entry of the loaded subtitle.

        Entries that still have text afterwards get their text replaced and are appended to new_entries;
        entries that end up empty, or for which a mod raised EmptyEntryError, are left out.

        :param new_entries: list that receives the surviving entries
        :param mods: list of (order, identifier, args); the mods have to be in self.initialized_mods
        """
        for index, entry in enumerate(self.f, 1):
            lines = []
            line_count = 0
            start_tags = []
            end_tags = []

            t = entry.text.strip()
            if not t:
                if self.debug:
                    logger.debug(u"Skipping empty line: %s", index)
                continue

            skip_entry = False
            for line in t.split(u"\\N"):
                old_line = line
                line = line.strip()
                skip_line = False
                line_count += 1

                if not line:
                    continue

                # don't bother the mods with surrounding tags
                # clean {\X0} tags before processing
                start_tag, line, end_tag = self._split_style_tags(line)

                for _order, identifier, args in mods:
                    mod = self.initialized_mods[identifier]

                    try:
                        line = mod.modify(line.strip(), entry=entry.text, debug=self.debug, parent=self,
                                          index=index, **args)
                    except EmptyEntryError:
                        # the mod wants the whole entry gone, not just this line
                        if self.debug:
                            logger.debug(u"%d: %s: %r -> ''", index, identifier, entry.text)
                        skip_entry = True
                        break

                    if not line:
                        if self.debug:
                            logger.debug(u"%d: %s: %r -> ''", index, identifier, old_line)
                        skip_line = True
                        break

                if skip_entry:
                    lines = []
                    break

                if skip_line:
                    continue

                if start_tag:
                    start_tags.append(start_tag)

                if end_tag:
                    end_tags.append(end_tag)

                # append new line and clean possibly newly added empty tags
                cleaned_line = EMPTY_TAG_PROCESSOR.process(start_tag + line + end_tag, debug=self.debug).strip()
                if not cleaned_line:
                    if self.debug:
                        logger.debug(u"%d: Ditching now empty line (%r)", index, line)
                    continue

                # we may have a single closing tag, if so, try appending it to the previous line
                if lines and len(cleaned_line) == 5 and cleaned_line.startswith("{\\") \
                        and cleaned_line.endswith("0}"):
                    lines[-1] += cleaned_line
                    continue

                lines.append(cleaned_line)

            if not lines:
                # don't bother logging when the entry only had one line
                if self.debug and line_count > 1:
                    logger.debug(u"%d: %r -> ''", index, entry.text)
                continue

            new_text = u"\\N".join(lines)

            add_start_tags, add_end_tags = self._missing_style_tags(start_tags, end_tags, new_text)
            if add_end_tags or add_start_tags:
                entry.text = u"".join(add_start_tags) + new_text + u"".join(add_end_tags)
                if self.debug:
                    logger.debug(u"Fixing tags: %s (%r -> %r)", str(add_start_tags + add_end_tags), new_text,
                                 entry.text)
            else:
                entry.text = new_text

            new_entries.append(entry)
# short alias for SubtitleModifications
SubMod = SubtitleModifications