bazarr/libs/srt_tools/srt-deduplicate

96 lines
2.9 KiB
Python
Executable file

#!/usr/bin/env python
"""Deduplicate repeated subtitles."""
import datetime
import srt_tools.utils
import logging
import operator
# Module-level logger; deduplicate_subs reports each removal through it.
log = logging.getLogger(__name__)

# Python 2 compatibility shim: rebind ``range`` to ``xrange`` where that name
# exists. On Python 3 ``xrange`` is undefined, so the NameError is swallowed
# and the built-in ``range`` is left untouched.
# NOTE(review): ``range`` is not referenced anywhere else in the visible code.
try:  # Python 2
    range = xrange  # pytype: disable=name-error
except NameError:
    pass
def parse_args():
    """Parse the command line for the deduplicate tool.

    The parser itself comes from ``srt_tools.utils.basic_parser`` (described
    by the module docstring plus the usage examples below); this function only
    adds the ``-t``/``--ms`` option on top of it.

    Returns:
        The namespace produced by ``parser.parse_args()``. Its ``ms``
        attribute is a ``datetime.timedelta`` (5000 ms unless overridden).
    """
    usage_examples = {
        "Remove duplicated subtitles within 5 seconds of each other": "srt deduplicate -i duplicated.srt",
        "Remove duplicated subtitles within 500 milliseconds of each other": "srt deduplicate -t 500 -i duplicated.srt",
        "Remove duplicated subtitles regardless of temporal proximity": "srt deduplicate -t 0 -i duplicated.srt",
    }
    arg_parser = srt_tools.utils.basic_parser(description=__doc__, examples=usage_examples)

    # The command line supplies an integer number of milliseconds; the rest of
    # the program works with timedelta objects, so convert at the boundary.
    window_option = {
        "metavar": "MILLISECONDS",
        "default": datetime.timedelta(milliseconds=5000),
        "type": lambda ms: datetime.timedelta(milliseconds=int(ms)),
        "help": (
            "how many milliseconds distance a subtitle start time "
            "must be within of another to be considered a duplicate"
            " (default: 5000ms)"
        ),
    }
    arg_parser.add_argument("-t", "--ms", **window_option)

    return arg_parser.parse_args()
def deduplicate_subs(orig_subs, acceptable_diff):
    """Remove subtitles with duplicated content from ``orig_subs`` in place.

    Subtitles are ordered by ``(content, start)`` and every entry is compared
    with the one immediately before it in that order. An entry is a duplicate
    when its content equals its predecessor's and it starts no more than
    ``acceptable_diff`` after the predecessor. The earlier subtitle is kept
    and the later one is deleted, so a run of repeats that each fall inside
    the window collapses to its first member.

    Args:
        orig_subs: mutable list of subtitle objects exposing ``content``,
            ``start`` and ``index`` attributes. Modified in place; the
            relative order of the surviving entries is unchanged.
        acceptable_diff: largest allowed gap between the two start times
            (the caller passes a ``datetime.timedelta``). A falsy value, such
            as a zero timedelta, disables the time check so that every repeat
            of a content string is removed regardless of when it starts.

    Returns:
        None.
    """
    indices_to_remove = set()

    # If we only stored the subtitle itself and compared that, we could end up
    # removing not only the duplicate but also the _original_ subtitle when
    # both have the same index/times/etc.
    #
    # So each subtitle is paired with its position in the original list
    # before sorting, and removal is done by position.
    sorted_subs = sorted(
        enumerate(orig_subs), key=lambda sub: (sub[1].content, sub[1].start)
    )

    # Walk adjacent pairs of the sorted list (a sliding window of width two).
    for (cur_idx, cur_sub), (next_idx, next_sub) in zip(
        sorted_subs, sorted_subs[1:]
    ):
        if cur_sub.content == next_sub.content and (
            not acceptable_diff or cur_sub.start + acceptable_diff >= next_sub.start
        ):
            log.debug(
                "Marking l%d/s%d for removal, duplicate of l%d/s%d",
                next_idx,
                next_sub.index,
                cur_idx,
                cur_sub.index,
            )
            indices_to_remove.add(next_idx)

    # Set iteration order is arbitrary, so deleting with a running offset can
    # hit the wrong element. Deleting from the highest position downwards
    # never shifts a position that is still waiting to be deleted.
    for idx in sorted(indices_to_remove, reverse=True):
        del orig_subs[idx]
def main():
    """Entry point: read subtitles, drop duplicates, write the result.

    Parses the command line, configures logging at the requested level, hands
    the parsed arguments to ``srt_tools.utils.set_basic_args``, then
    deduplicates the input subtitles and writes the composed output to
    ``args.output``.
    """
    cli_args = parse_args()
    logging.basicConfig(level=cli_args.log_level)
    srt_tools.utils.set_basic_args(cli_args)

    # Materialise the input so deduplicate_subs can delete entries in place.
    subtitles = list(cli_args.input)
    deduplicate_subs(subtitles, cli_args.ms)

    composed = srt_tools.utils.compose_suggest_on_fail(
        subtitles, strict=cli_args.strict
    )

    sink = cli_args.output
    try:
        sink.write(composed)
    except (UnicodeEncodeError, TypeError):  # Python 2 fallback
        # Retry with the text encoded using the configured encoding.
        sink.write(composed.encode(cli_args.encoding))
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":  # pragma: no cover
    main()