diff --git a/libs/pysubs2/__init__.py b/libs/pysubs2/__init__.py index 55ec2ede5..af37dc98a 100644 --- a/libs/pysubs2/__init__.py +++ b/libs/pysubs2/__init__.py @@ -10,3 +10,6 @@ load = SSAFile.load #: Alias for :meth:`pysubs2.time.make_time()`. make_time = time.make_time + +#: Alias for `pysubs2.common.VERSION`. +__version__ = VERSION diff --git a/libs/pysubs2/cli.py b/libs/pysubs2/cli.py index fc82bf9b5..020f373fb 100644 --- a/libs/pysubs2/cli.py +++ b/libs/pysubs2/cli.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals, print_function import argparse import codecs import os @@ -8,38 +7,39 @@ import io from io import open import sys from textwrap import dedent -from .formats import get_file_extension +from .formats import get_file_extension, FORMAT_IDENTIFIERS from .time import make_time from .ssafile import SSAFile -from .common import PY3, VERSION +from .common import VERSION +import logging -def positive_float(s): +def positive_float(s: str) -> float: x = float(s) if not x > 0: raise argparse.ArgumentTypeError("%r is not a positive number" % s) return x -def character_encoding(s): +def character_encoding(s: str) -> str: try: codecs.lookup(s) return s except LookupError: raise argparse.ArgumentError -def time(s): +def time(s: str): d = {} for v, k in re.findall(r"(\d*\.?\d*)(ms|m|s|h)", s): d[k] = float(v) return make_time(**d) -def change_ext(path, ext): +def change_ext(path: str, ext: str) -> str: base, _ = op.splitext(path) return base + ext -class Pysubs2CLI(object): +class Pysubs2CLI: def __init__(self): parser = self.parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, prog="pysubs2", @@ -50,6 +50,7 @@ class Pysubs2CLI(object): epilog=dedent(""" usage examples: python -m pysubs2 --to srt *.ass + python -m pysubs2 --to srt --clean *.ass python -m pysubs2 --to microdvd --fps 23.976 *.ass python -m pysubs2 --shift 0.3s *.srt python -m pysubs2 --shift 0.3s retimed_file.srt @@ -57,21 +58,21 @@ class Pysubs2CLI(object): python -m pysubs2 --transform-framerate 25 23.976 *.srt""")) parser.add_argument("files", nargs="*", metavar="FILE", - help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt) or " - "MicroDVD (*.sub) formats. When no files are specified, pysubs2 will work as a pipe, " - "reading from standard input and writing to standard output.") + help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt), " + "MicroDVD (*.sub) or other supported format. When no files are specified, " + "pysubs2 will work as a pipe, reading from standard input and writing to standard output.") parser.add_argument("-v", "--version", action="version", version="pysubs2 %s" % VERSION) - parser.add_argument("-f", "--from", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="input_format", + parser.add_argument("-f", "--from", choices=FORMAT_IDENTIFIERS, dest="input_format", help="By default, subtitle format is detected from the file. This option can be used to " "skip autodetection and force specific format. Generally, it should never be needed.") - parser.add_argument("-t", "--to", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="output_format", + parser.add_argument("-t", "--to", choices=FORMAT_IDENTIFIERS, dest="output_format", help="Convert subtitle files to given format. By default, each file is saved in its " "original format.") - parser.add_argument("--input-enc", metavar="ENCODING", default="iso-8859-1", type=character_encoding, - help="Character encoding for input files. By default, ISO-8859-1 is used for both " - "input and output, which should generally work (for 8-bit encodings).") + parser.add_argument("--input-enc", metavar="ENCODING", default="utf-8", type=character_encoding, + help="Character encoding for input files. By default, UTF-8 is used for both " + "input and output.") parser.add_argument("--output-enc", metavar="ENCODING", type=character_encoding, help="Character encoding for output files. By default, it is the same as input encoding. " "If you wish to convert between encodings, make sure --input-enc is set correctly! " @@ -85,6 +86,11 @@ class Pysubs2CLI(object): help="Use this to save all files to given directory. By default, every file is saved to its parent directory, " "ie. unless it's being saved in different subtitle format (and thus with different file extension), " "it overwrites the original file.") + parser.add_argument("--clean", action="store_true", + help="Attempt to remove non-essential subtitles (eg. karaoke, SSA drawing tags), " + "strip styling information when saving to non-SSA formats") + parser.add_argument("--verbose", action="store_true", + help="Print misc logging") group = parser.add_mutually_exclusive_group() @@ -105,6 +111,9 @@ class Pysubs2CLI(object): args = self.parser.parse_args(argv) errors = 0 + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + if args.output_dir and not op.exists(args.output_dir): os.makedirs(args.output_dir) @@ -138,19 +147,15 @@ class Pysubs2CLI(object): outpath = op.join(args.output_dir, filename) with open(outpath, "w", encoding=args.output_enc) as outfile: - subs.to_file(outfile, output_format, args.fps) + subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean) else: - if PY3: - infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc) - outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc) - else: - infile = io.TextIOWrapper(sys.stdin, args.input_enc) - outfile = io.TextIOWrapper(sys.stdout, args.output_enc) + infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc) + outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc) subs = SSAFile.from_file(infile, args.input_format, args.fps) self.process(subs, args) output_format = args.output_format or subs.format - subs.to_file(outfile, output_format, args.fps) + subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean) return (0 if errors == 0 else 1) @@ -164,6 +169,9 @@ class Pysubs2CLI(object): in_fps, out_fps = args.transform_framerate subs.transform_framerate(in_fps, out_fps) + if args.clean: + subs.remove_miscellaneous_events() + def __main__(): cli = Pysubs2CLI() diff --git a/libs/pysubs2/common.py b/libs/pysubs2/common.py index 4688e5df4..fcea1bf13 100644 --- a/libs/pysubs2/common.py +++ b/libs/pysubs2/common.py @@ -1,30 +1,32 @@ -from collections import namedtuple -import sys +from dataclasses import dataclass +from typing import Union -_Color = namedtuple("Color", "r g b a") -class Color(_Color): +@dataclass(init=False) +class Color: """ - (r, g, b, a) namedtuple for 8-bit RGB color with alpha channel. + 8-bit RGB color with alpha channel. All values are ints from 0 to 255. """ - def __new__(cls, r, g, b, a=0): + r: int + g: int + b: int + a: int = 0 + + def __init__(self, r: int, g: int, b: int, a: int = 0): for value in r, g, b, a: if value not in range(256): raise ValueError("Color channels must have values 0-255") - return _Color.__new__(cls, r, g, b, a) + self.r = r + self.g = g + self.b = b + self.a = a + #: Version of the pysubs2 library. -VERSION = "0.2.4" +VERSION = "1.2.0" -PY3 = sys.version_info.major == 3 - -if PY3: - text_type = str - binary_string_type = bytes -else: - text_type = unicode - binary_string_type = str +IntOrFloat = Union[int, float] diff --git a/libs/pysubs2/exceptions.py b/libs/pysubs2/exceptions.py index b9d528524..9568fa52f 100644 --- a/libs/pysubs2/exceptions.py +++ b/libs/pysubs2/exceptions.py @@ -1,17 +1,22 @@ class Pysubs2Error(Exception): """Base class for pysubs2 exceptions.""" + class UnknownFPSError(Pysubs2Error): """Framerate was not specified and couldn't be inferred otherwise.""" + class UnknownFileExtensionError(Pysubs2Error): """File extension does not pertain to any known subtitle format.""" + class UnknownFormatIdentifierError(Pysubs2Error): """Unknown subtitle format identifier (ie. string like ``"srt"``).""" + class FormatAutodetectionError(Pysubs2Error): """Subtitle format is ambiguous or unknown.""" + class ContentNotUsable(Pysubs2Error): """Current content not usable for specified format""" diff --git a/libs/pysubs2/formatbase.py b/libs/pysubs2/formatbase.py index 1f336618a..21ea9c4f8 100644 --- a/libs/pysubs2/formatbase.py +++ b/libs/pysubs2/formatbase.py @@ -1,4 +1,8 @@ -class FormatBase(object): +from typing import Optional +import io + + +class FormatBase: """ Base class for subtitle format implementations. @@ -14,7 +18,7 @@ class FormatBase(object): """ @classmethod - def from_file(cls, subs, fp, format_, **kwargs): + def from_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs): """ Load subtitle file into an empty SSAFile. @@ -37,7 +41,7 @@ class FormatBase(object): raise NotImplementedError("Parsing is not supported for this format") @classmethod - def to_file(cls, subs, fp, format_, **kwargs): + def to_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs): """ Write SSAFile into a file. @@ -62,7 +66,7 @@ class FormatBase(object): raise NotImplementedError("Writing is not supported for this format") @classmethod - def guess_format(self, text): + def guess_format(self, text: str) -> Optional[str]: """ Return format identifier of recognized format, or None. diff --git a/libs/pysubs2/formats.py b/libs/pysubs2/formats.py index 869a3b6c7..7ce3a1cb8 100644 --- a/libs/pysubs2/formats.py +++ b/libs/pysubs2/formats.py @@ -1,3 +1,5 @@ +from typing import Dict, Type + from .formatbase import FormatBase from .microdvd import MicroDVDFormat from .subrip import SubripFormat @@ -5,20 +7,22 @@ from .jsonformat import JSONFormat from .substation import SubstationFormat from .mpl2 import MPL2Format from .tmp import TmpFormat +from .webvtt import WebVTTFormat from .exceptions import * #: Dict mapping file extensions to format identifiers. -FILE_EXTENSION_TO_FORMAT_IDENTIFIER = { +FILE_EXTENSION_TO_FORMAT_IDENTIFIER: Dict[str, str] = { ".srt": "srt", ".ass": "ass", ".ssa": "ssa", ".sub": "microdvd", ".json": "json", ".txt": "tmp", + ".vtt": "vtt", } #: Dict mapping format identifiers to implementations (FormatBase subclasses). -FORMAT_IDENTIFIER_TO_FORMAT_CLASS = { +FORMAT_IDENTIFIER_TO_FORMAT_CLASS: Dict[str, Type[FormatBase]] = { "srt": SubripFormat, "ass": SubstationFormat, "ssa": SubstationFormat, @@ -26,23 +30,29 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = { "json": JSONFormat, "mpl2": MPL2Format, "tmp": TmpFormat, + "vtt": WebVTTFormat, } -def get_format_class(format_): +FORMAT_IDENTIFIERS = list(FORMAT_IDENTIFIER_TO_FORMAT_CLASS.keys()) + + +def get_format_class(format_: str) -> Type[FormatBase]: """Format identifier -> format class (ie. subclass of FormatBase)""" try: return FORMAT_IDENTIFIER_TO_FORMAT_CLASS[format_] except KeyError: raise UnknownFormatIdentifierError(format_) -def get_format_identifier(ext): + +def get_format_identifier(ext: str) -> str: """File extension -> format identifier""" try: return FILE_EXTENSION_TO_FORMAT_IDENTIFIER[ext] except KeyError: raise UnknownFileExtensionError(ext) -def get_file_extension(format_): + +def get_file_extension(format_: str) -> str: """Format identifier -> file extension""" if format_ not in FORMAT_IDENTIFIER_TO_FORMAT_CLASS: raise UnknownFormatIdentifierError(format_) @@ -53,7 +63,8 @@ def get_file_extension(format_): raise RuntimeError("No file extension for format %r" % format_) -def autodetect_format(content): + +def autodetect_format(content: str) -> str: """Return format identifier for given fragment or raise FormatAutodetectionError.""" formats = set() for impl in FORMAT_IDENTIFIER_TO_FORMAT_CLASS.values(): diff --git a/libs/pysubs2/jsonformat.py b/libs/pysubs2/jsonformat.py index cbd8c29c8..df838ee92 100644 --- a/libs/pysubs2/jsonformat.py +++ b/libs/pysubs2/jsonformat.py @@ -1,20 +1,35 @@ -from __future__ import unicode_literals, print_function - +import dataclasses import json -from .common import Color, PY3 +from .common import Color from .ssaevent import SSAEvent from .ssastyle import SSAStyle from .formatbase import FormatBase +# We're using Color dataclass +# https://stackoverflow.com/questions/51286748/make-the-python-json-encoder-support-pythons-new-dataclasses +class EnhancedJSONEncoder(json.JSONEncoder): + def default(self, o): + if dataclasses.is_dataclass(o): + return dataclasses.asdict(o) + return super().default(o) + + class JSONFormat(FormatBase): + """ + Implementation of JSON subtitle pseudo-format (serialized pysubs2 internal representation) + + This is essentially SubStation Alpha as JSON. + """ @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if text.startswith("{\""): return "json" @classmethod def from_file(cls, subs, fp, format_, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.from_file()`""" data = json.load(fp) subs.info.clear() @@ -25,7 +40,7 @@ class JSONFormat(FormatBase): subs.styles[name] = sty = SSAStyle() for k, v in fields.items(): if "color" in k: - setattr(sty, k, Color(*v)) + setattr(sty, k, Color(**v)) else: setattr(sty, k, v) @@ -33,14 +48,11 @@ class JSONFormat(FormatBase): @classmethod def to_file(cls, subs, fp, format_, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.to_file()`""" data = { "info": dict(**subs.info), "styles": {name: sty.as_dict() for name, sty in subs.styles.items()}, "events": [ev.as_dict() for ev in subs.events] } - if PY3: - json.dump(data, fp) - else: - text = json.dumps(data, fp) - fp.write(unicode(text)) + json.dump(data, fp, cls=EnhancedJSONEncoder) diff --git a/libs/pysubs2/microdvd.py b/libs/pysubs2/microdvd.py index 04b769be0..4114b358e 100644 --- a/libs/pysubs2/microdvd.py +++ b/libs/pysubs2/microdvd.py @@ -1,8 +1,5 @@ -from __future__ import unicode_literals, print_function - from functools import partial import re -from .common import text_type from .exceptions import UnknownFPSError from .ssaevent import SSAEvent from .ssastyle import SSAStyle @@ -15,13 +12,16 @@ MICRODVD_LINE = re.compile(r" *\{ *(\d+) *\} *\{ *(\d+) *\}(.+)") class MicroDVDFormat(FormatBase): + """MicroDVD subtitle format implementation""" @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if any(map(MICRODVD_LINE.match, text.splitlines())): return "microdvd" @classmethod def from_file(cls, subs, fp, format_, fps=None, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.from_file()`""" for line in fp: match = MICRODVD_LINE.match(line) if not match: @@ -63,7 +63,18 @@ class MicroDVDFormat(FormatBase): subs.append(ev) @classmethod - def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, **kwargs): + def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, apply_styles=True, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.to_file()` + + The only supported styling is marking whole lines italic. + + Keyword args: + write_fps_declaration: If True, create a zero-duration first subtitle which will contain + the fps. + apply_styles: If False, do not write any styling. + + """ if fps is None: fps = subs.fps @@ -83,11 +94,14 @@ class MicroDVDFormat(FormatBase): # insert an artificial first line telling the framerate if write_fps_declaration: - subs.insert(0, SSAEvent(start=0, end=0, text=text_type(fps))) + subs.insert(0, SSAEvent(start=0, end=0, text=str(fps))) + + for line in subs: + if line.is_comment or line.is_drawing: + continue - for line in (ev for ev in subs if not ev.is_comment): text = "|".join(line.plaintext.splitlines()) - if is_entirely_italic(line): + if apply_styles and is_entirely_italic(line): text = "{Y:i}" + text start, end = map(to_frames, (line.start, line.end)) diff --git a/libs/pysubs2/mpl2.py b/libs/pysubs2/mpl2.py index 5c90bb4f8..3719a2336 100644 --- a/libs/pysubs2/mpl2.py +++ b/libs/pysubs2/mpl2.py @@ -1,6 +1,3 @@ -# coding=utf-8 - -from __future__ import print_function, division, unicode_literals import re from .time import times_to_ms @@ -13,13 +10,16 @@ MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*)") class MPL2Format(FormatBase): + """MPL2 subtitle format implementation""" @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if MPL2_FORMAT.search(text): return "mpl2" @classmethod def from_file(cls, subs, fp, format_, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.from_file()`""" def prepare_text(lines): out = [] for s in lines.split("|"): @@ -37,7 +37,12 @@ class MPL2Format(FormatBase): @classmethod def to_file(cls, subs, fp, format_, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.to_file()` + No styling is supported at the moment. + + """ # TODO handle italics for line in subs: if line.is_comment: diff --git a/libs/pysubs2/ssaevent.py b/libs/pysubs2/ssaevent.py index 4d9dac809..36284c93a 100644 --- a/libs/pysubs2/ssaevent.py +++ b/libs/pysubs2/ssaevent.py @@ -1,10 +1,14 @@ -from __future__ import unicode_literals import re +import warnings +from typing import Optional, Dict, Any, ClassVar +import dataclasses + +from .common import IntOrFloat from .time import ms_to_str, make_time -from .common import PY3 -class SSAEvent(object): +@dataclasses.dataclass(repr=False, eq=False, order=False) +class SSAEvent: """ A SubStation Event, ie. one subtitle. @@ -21,36 +25,29 @@ class SSAEvent(object): >>> ev = SSAEvent(start=make_time(s=1), end=make_time(s=2.5), text="Hello World!") """ - OVERRIDE_SEQUENCE = re.compile(r"{[^}]*}") + OVERRIDE_SEQUENCE: ClassVar = re.compile(r"{[^}]*}") - #: All fields in SSAEvent. - FIELDS = frozenset([ - "start", "end", "text", "marked", "layer", "style", - "name", "marginl", "marginr", "marginv", "effect", "type" - ]) - - def __init__(self, **fields): - self.start = 0 #: Subtitle start time (in milliseconds) - self.end = 10000 #: Subtitle end time (in milliseconds) - self.text = "" #: Text of subtitle (with SubStation override tags) - self.marked = False #: (SSA only) - self.layer = 0 #: Layer number, 0 is the lowest layer (ASS only) - self.style = "Default" #: Style name - self.name = "" #: Actor name - self.marginl = 0 #: Left margin - self.marginr = 0 #: Right margin - self.marginv = 0 #: Vertical margin - self.effect = "" #: Line effect - self.type = "Dialogue" #: Line type (Dialogue/Comment) - - for k, v in fields.items(): - if k in self.FIELDS: - setattr(self, k, v) - else: - raise ValueError("SSAEvent has no field named %r" % k) + start: int = 0 #: Subtitle start time (in milliseconds) + end: int = 10000 #: Subtitle end time (in milliseconds) + text: str = "" #: Text of subtitle (with SubStation override tags) + marked: bool = False #: (SSA only) + layer: int = 0 #: Layer number, 0 is the lowest layer (ASS only) + style: str = "Default" #: Style name + name: str = "" #: Actor name + marginl: int = 0 #: Left margin + marginr: int = 0 #: Right margin + marginv: int = 0 #: Vertical margin + effect: str = "" #: Line effect + type: str = "Dialogue" #: Line type (Dialogue/Comment) @property - def duration(self): + def FIELDS(self): + """All fields in SSAEvent.""" + warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning) + return frozenset(field.name for field in dataclasses.fields(self)) + + @property + def duration(self) -> IntOrFloat: """ Subtitle duration in milliseconds (read/write property). @@ -60,14 +57,14 @@ class SSAEvent(object): return self.end - self.start @duration.setter - def duration(self, ms): + def duration(self, ms: int): if ms >= 0: self.end = self.start + ms else: raise ValueError("Subtitle duration cannot be negative") @property - def is_comment(self): + def is_comment(self) -> bool: """ When true, the subtitle is a comment, ie. not visible (read/write property). @@ -77,14 +74,20 @@ class SSAEvent(object): return self.type == "Comment" @is_comment.setter - def is_comment(self, value): + def is_comment(self, value: bool): if value: self.type = "Comment" else: self.type = "Dialogue" @property - def plaintext(self): + def is_drawing(self) -> bool: + """Returns True if line is SSA drawing tag (ie. not text)""" + from .substation import parse_tags + return any(sty.drawing for _, sty in parse_tags(self.text)) + + @property + def plaintext(self) -> str: """ Subtitle text as multi-line string with no tags (read/write property). @@ -99,10 +102,11 @@ class SSAEvent(object): return text @plaintext.setter - def plaintext(self, text): + def plaintext(self, text: str): self.text = text.replace("\n", r"\N") - def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None): + def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0, + frames: Optional[int]=None, fps: Optional[float]=None): """ Shift start and end times. @@ -113,41 +117,39 @@ class SSAEvent(object): self.start += delta self.end += delta - def copy(self): + def copy(self) -> "SSAEvent": """Return a copy of the SSAEvent.""" return SSAEvent(**self.as_dict()) - def as_dict(self): - return {field: getattr(self, field) for field in self.FIELDS} + def as_dict(self) -> Dict[str, Any]: + # dataclasses.asdict() would recursively dictify Color objects, which we don't want + return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} - def equals(self, other): + def equals(self, other: "SSAEvent") -> bool: """Field-based equality for SSAEvents.""" if isinstance(other, SSAEvent): return self.as_dict() == other.as_dict() else: raise TypeError("Cannot compare to non-SSAEvent object") - def __eq__(self, other): + def __eq__(self, other: "SSAEvent"): # XXX document this return self.start == other.start and self.end == other.end - def __ne__(self, other): + def __ne__(self, other: "SSAEvent"): return self.start != other.start or self.end != other.end - def __lt__(self, other): + def __lt__(self, other: "SSAEvent"): return (self.start, self.end) < (other.start, other.end) - def __le__(self, other): + def __le__(self, other: "SSAEvent"): return (self.start, self.end) <= (other.start, other.end) - def __gt__(self, other): + def __gt__(self, other: "SSAEvent"): return (self.start, self.end) > (other.start, other.end) - def __ge__(self, other): + def __ge__(self, other: "SSAEvent"): return (self.start, self.end) >= (other.start, other.end) def __repr__(self): - s = "".format( - self=self, start=ms_to_str(self.start), end=ms_to_str(self.end)) - if not PY3: s = s.encode("utf-8") - return s + return f"" diff --git a/libs/pysubs2/ssafile.py b/libs/pysubs2/ssafile.py index 390a31b54..0c87812f7 100644 --- a/libs/pysubs2/ssafile.py +++ b/libs/pysubs2/ssafile.py @@ -1,16 +1,17 @@ -from __future__ import print_function, unicode_literals, division -from collections import MutableSequence, OrderedDict +from collections import MutableSequence import io from io import open -from itertools import starmap, chain +from itertools import chain import os.path import logging +from typing import Optional, List, Dict, Iterable, Any + +from .common import IntOrFloat from .formats import autodetect_format, get_format_class, get_format_identifier from .substation import is_valid_field_content from .ssaevent import SSAEvent from .ssastyle import SSAStyle from .time import make_time, ms_to_str -from .common import PY3 class SSAFile(MutableSequence): @@ -31,28 +32,37 @@ class SSAFile(MutableSequence): """ - DEFAULT_INFO = OrderedDict([ - ("WrapStyle", "0"), - ("ScaledBorderAndShadow", "yes"), - ("Collisions", "Normal")]) + DEFAULT_INFO = { + "WrapStyle": "0", + "ScaledBorderAndShadow": "yes", + "Collisions": "Normal" + } def __init__(self): - self.events = [] #: List of :class:`SSAEvent` instances, ie. individual subtitles. - self.styles = OrderedDict([("Default", SSAStyle.DEFAULT_STYLE.copy())]) #: Dict of :class:`SSAStyle` instances. - self.info = self.DEFAULT_INFO.copy() #: Dict with script metadata, ie. ``[Script Info]``. - self.aegisub_project = OrderedDict() #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``. - self.fps = None #: Framerate used when reading the file, if applicable. - self.format = None #: Format of source subtitle file, if applicable, eg. ``"srt"``. + self.events: List[SSAEvent] = [] #: List of :class:`SSAEvent` instances, ie. individual subtitles. + self.styles: Dict[str, SSAStyle] = {"Default": SSAStyle.DEFAULT_STYLE.copy()} #: Dict of :class:`SSAStyle` instances. + self.info: Dict[str, str] = self.DEFAULT_INFO.copy() #: Dict with script metadata, ie. ``[Script Info]``. + self.aegisub_project: Dict[str, str] = {} #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``. + self.fonts_opaque: Dict[str, Any] = {} #: Dict with embedded fonts, ie. ``[Fonts]``. + self.fps: Optional[float] = None #: Framerate used when reading the file, if applicable. + self.format: Optional[str] = None #: Format of source subtitle file, if applicable, eg. ``"srt"``. # ------------------------------------------------------------------------ # I/O methods # ------------------------------------------------------------------------ @classmethod - def load(cls, path, encoding="utf-8", format_=None, fps=None, **kwargs): + def load(cls, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile": """ Load subtitle file from given path. + This method is implemented in terms of :meth:`SSAFile.from_file()`. + + See also: + Specific formats may implement additional loading options, + please refer to documentation of the implementation classes + (eg. :meth:`pysubs2.subrip.SubripFormat.from_file()`) + Arguments: path (str): Path to subtitle file. encoding (str): Character encoding of input file. @@ -66,14 +76,7 @@ class SSAFile(MutableSequence): be detected from the file, in which case you don't need to specify it here (when given, this argument overrides autodetection). - keep_unknown_html_tags (bool): This affects SubRip only (SRT), - for other formats this argument is ignored. - By default, HTML tags are converted to equivalent SubStation tags - (eg. ```` to ``{\\i1}`` and any remaining tags are removed - to keep the text clean. Set this parameter to ``True`` - if you want to pass through these tags (eg. ````). - This is useful if your output format is SRT and your player - supports these tags. + kwargs: Extra options for the reader. Returns: SSAFile @@ -100,7 +103,7 @@ class SSAFile(MutableSequence): return cls.from_file(fp, format_, fps=fps, **kwargs) @classmethod - def from_string(cls, string, format_=None, fps=None, **kwargs): + def from_string(cls, string: str, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile": """ Load subtitle file from string. @@ -126,7 +129,7 @@ class SSAFile(MutableSequence): return cls.from_file(fp, format_, fps=fps, **kwargs) @classmethod - def from_file(cls, fp, format_=None, fps=None, **kwargs): + def from_file(cls, fp: io.TextIOBase, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile": """ Read subtitle file from file object. @@ -160,10 +163,17 @@ class SSAFile(MutableSequence): impl.from_file(subs, fp, format_, fps=fps, **kwargs) return subs - def save(self, path, encoding="utf-8", format_=None, fps=None, **kwargs): + def save(self, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs): """ Save subtitle file to given path. + This method is implemented in terms of :meth:`SSAFile.to_file()`. + + See also: + Specific formats may implement additional saving options, + please refer to documentation of the implementation classes + (eg. :meth:`pysubs2.subrip.SubripFormat.to_file()`) + Arguments: path (str): Path to subtitle file. encoding (str): Character encoding of output file. @@ -197,7 +207,7 @@ class SSAFile(MutableSequence): with open(path, "w", encoding=encoding) as fp: self.to_file(fp, format_, fps=fps, **kwargs) - def to_string(self, format_, fps=None, **kwargs): + def to_string(self, format_: str, fps: Optional[float]=None, **kwargs) -> str: """ Get subtitle file as a string. @@ -211,7 +221,7 @@ class SSAFile(MutableSequence): self.to_file(fp, format_, fps=fps, **kwargs) return fp.getvalue() - def to_file(self, fp, format_, fps=None, **kwargs): + def to_file(self, fp: io.TextIOBase, format_: str, fps: Optional[float]=None, **kwargs): """ Write subtitle file to file object. @@ -233,7 +243,8 @@ class SSAFile(MutableSequence): # Retiming subtitles # ------------------------------------------------------------------------ - def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None): + def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0, + frames: Optional[int]=None, fps: Optional[float]=None): """ Shift all subtitles by constant time amount. @@ -255,7 +266,7 @@ class SSAFile(MutableSequence): line.start += delta line.end += delta - def transform_framerate(self, in_fps, out_fps): + def transform_framerate(self, in_fps: float, out_fps: float): """ Rescale all timestamps by ratio of in_fps/out_fps. @@ -282,7 +293,7 @@ class SSAFile(MutableSequence): # Working with styles # ------------------------------------------------------------------------ - def rename_style(self, old_name, new_name): + def rename_style(self, old_name: str, new_name: str): """ Rename a style, including references to it. @@ -311,7 +322,7 @@ class SSAFile(MutableSequence): if line.style == old_name: line.style = new_name - def import_styles(self, subs, overwrite=True): + def import_styles(self, subs: "SSAFile", overwrite: bool=True): """ Merge in styles from other SSAFile. @@ -332,7 +343,39 @@ class SSAFile(MutableSequence): # Helper methods # ------------------------------------------------------------------------ - def equals(self, other): + def remove_miscellaneous_events(self): + """ + Remove subtitles which appear to be non-essential (the --clean in CLI) + + Currently, this removes events matching any of these criteria: + - SSA event type Comment + - SSA drawing tags + - Less than two characters of text + - Duplicated text with identical time interval (only the first event is kept) + """ + new_events = [] + + duplicate_text_ids = set() + times_to_texts = {} + for i, e in enumerate(self): + tmp = times_to_texts.setdefault((e.start, e.end), []) + if tmp.count(e.plaintext) > 0: + duplicate_text_ids.add(i) + tmp.append(e.plaintext) + + for i, e in enumerate(self): + if e.is_drawing or e.is_comment: + continue + if len(e.plaintext.strip()) < 2: + continue + if i in duplicate_text_ids: + continue + + new_events.append(e) + + self.events = new_events + + def equals(self, other: "SSAFile"): """ Equality of two SSAFiles. @@ -357,6 +400,18 @@ class SSAFile(MutableSequence): logging.debug("info %r differs (self=%r, other=%r)", key, sv, ov) return False + for key in set(chain(self.fonts_opaque.keys(), other.fonts_opaque.keys())): + sv, ov = self.fonts_opaque.get(key), other.fonts_opaque.get(key) + if sv is None: + logging.debug("%r missing in self.fonts_opaque", key) + return False + elif ov is None: + logging.debug("%r missing in other.fonts_opaque", key) + return False + elif sv != ov: + logging.debug("fonts_opaque %r differs (self=%r, other=%r)", key, sv, ov) + return False + for key in set(chain(self.styles.keys(), other.styles.keys())): sv, ov = self.styles.get(key), other.styles.get(key) if sv is None: @@ -389,12 +444,10 @@ class SSAFile(MutableSequence): def __repr__(self): if self.events: max_time = max(ev.end for ev in self) - s = "" % \ - (len(self), len(self.styles), ms_to_str(max_time)) + s = f"" else: - s = "" % len(self.styles) + s = f"" - if not PY3: s = s.encode("utf-8") return s # ------------------------------------------------------------------------ @@ -405,22 +458,25 @@ class SSAFile(MutableSequence): """Sort subtitles time-wise, in-place.""" self.events.sort() - def __getitem__(self, item): + def __iter__(self) -> Iterable[SSAEvent]: + return iter(self.events) + + def __getitem__(self, item: int): return self.events[item] - def __setitem__(self, key, value): + def __setitem__(self, key: int, value: SSAEvent): if isinstance(value, SSAEvent): self.events[key] = value else: raise TypeError("SSAFile.events must contain only SSAEvent objects") - def __delitem__(self, key): + def __delitem__(self, key: int): del self.events[key] def __len__(self): return len(self.events) - def insert(self, index, value): + def insert(self, index: int, value: SSAEvent): if isinstance(value, SSAEvent): self.events.insert(index, value) else: diff --git a/libs/pysubs2/ssastyle.py b/libs/pysubs2/ssastyle.py index b7b4a5ef3..fa6a9ddca 100644 --- a/libs/pysubs2/ssastyle.py +++ b/libs/pysubs2/ssastyle.py @@ -1,8 +1,11 @@ -from __future__ import unicode_literals -from .common import Color, PY3 +import warnings +from typing import Dict, Any, ClassVar +import dataclasses +from .common import Color -class SSAStyle(object): +@dataclasses.dataclass(repr=False) +class SSAStyle: """ A SubStation Style. @@ -17,71 +20,57 @@ class SSAStyle(object): This class defines equality (equality of all fields). """ - DEFAULT_STYLE = None + DEFAULT_STYLE: ClassVar["SSAStyle"] = None - #: All fields in SSAStyle. - FIELDS = frozenset([ - "fontname", "fontsize", "primarycolor", "secondarycolor", - "tertiarycolor", "outlinecolor", "backcolor", - "bold", "italic", "underline", "strikeout", - "scalex", "scaley", "spacing", "angle", "borderstyle", - "outline", "shadow", "alignment", - "marginl", "marginr", "marginv", "alphalevel", "encoding" - ]) + @property + def FIELDS(self): + """All fields in SSAStyle.""" + warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning) + return frozenset(field.name for field in dataclasses.fields(self)) - def __init__(self, **fields): - self.fontname = "Arial" #: Font name - self.fontsize = 20.0 #: Font size (in pixels) - self.primarycolor = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance) - self.secondarycolor = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance) - self.tertiarycolor = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance) - self.outlinecolor = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance) - self.backcolor = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance) - self.bold = False #: Bold - self.italic = False #: Italic - self.underline = False #: Underline (ASS only) - self.strikeout = False #: Strikeout (ASS only) - self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags - self.scalex = 100.0 #: Horizontal scaling (ASS only) - self.scaley = 100.0 #: Vertical scaling (ASS only) - self.spacing = 0.0 #: Letter spacing (ASS only) - self.angle = 0.0 #: Rotation (ASS only) - self.borderstyle = 1 #: Border style - self.outline = 2.0 #: Outline width (in pixels) - self.shadow = 2.0 #: Shadow depth (in pixels) - self.alignment = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics) - self.marginl = 10 #: Left margin (in pixels) - self.marginr = 10 #: Right margin (in pixels) - self.marginv = 10 #: Vertical margin (in pixels) - self.alphalevel = 0 #: Old, unused SSA-only field - self.encoding = 1 #: Charset + fontname: str = "Arial" #: Font name + fontsize: float = 20.0 #: Font size (in pixels) + primarycolor: Color = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance) + secondarycolor: Color = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance) + tertiarycolor: Color = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance) + outlinecolor: Color = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance) + backcolor: Color = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance) + bold: bool = False #: Bold + italic: bool = False #: Italic + underline: bool = False #: Underline (ASS only) + strikeout: bool = False #: Strikeout (ASS only) + scalex: float = 100.0 #: Horizontal scaling (ASS only) + scaley: float = 100.0 #: Vertical scaling (ASS only) + spacing: float = 0.0 #: Letter spacing (ASS only) + angle: float = 0.0 #: Rotation (ASS only) + borderstyle: int = 1 #: Border style + outline: float = 2.0 #: Outline width (in pixels) + shadow: float = 2.0 #: Shadow depth (in pixels) + alignment: int = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics) + marginl: int = 10 #: Left margin (in pixels) + marginr: int = 10 #: Right margin (in pixels) + marginv: int = 10 #: Vertical margin (in pixels) + alphalevel: int = 0 #: Old, unused SSA-only field + encoding: int = 1 #: Charset - for k, v in fields.items(): - if k in self.FIELDS: - setattr(self, k, v) - else: - raise ValueError("SSAStyle has no field named %r" % k) + # The following attributes cannot be defined for SSA styles themselves, + # but can be used in override tags and thus are useful to keep here + # for the `pysubs2.substation.parse_tags()` interface which returns + # SSAStyles for text fragments. + drawing: bool = False #: Indicates that text span is a SSA vector drawing, see `pysubs2.substation.parse_tags()` - def copy(self): + def copy(self) -> "SSAStyle": return SSAStyle(**self.as_dict()) - def as_dict(self): - return {field: getattr(self, field) for field in self.FIELDS} - - def __eq__(self, other): - return self.as_dict() == other.as_dict() - - def __ne__(self, other): - return not self == other + def as_dict(self) -> Dict[str, Any]: + # dataclasses.asdict() would recursively dictify Color objects, which we don't want + return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} def __repr__(self): - s = "" SSAStyle.DEFAULT_STYLE = SSAStyle() diff --git a/libs/pysubs2/subrip.py b/libs/pysubs2/subrip.py index 56055b650..d6ed77b5d 100644 --- a/libs/pysubs2/subrip.py +++ b/libs/pysubs2/subrip.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals - import re from .formatbase import FormatBase from .ssaevent import SSAEvent @@ -21,25 +19,50 @@ def ms_to_timestamp(ms): class SubripFormat(FormatBase): + """SubRip Text (SRT) subtitle format implementation""" + TIMESTAMP = TIMESTAMP + + @staticmethod + def timestamp_to_ms(groups): + return timestamp_to_ms(groups) + @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if "[Script Info]" in text or "[V4+ Styles]" in text: # disambiguation vs. SSA/ASS return None + if text.lstrip().startswith("WEBVTT"): + # disambiguation vs. WebVTT + return None + for line in text.splitlines(): - if len(TIMESTAMP.findall(line)) == 2: + if len(cls.TIMESTAMP.findall(line)) == 2: return "srt" @classmethod def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.from_file()` + + Supported tags: + + - ```` + - ```` + - ```` + + Keyword args: + keep_unknown_html_tags: If True, HTML tags other than i/u/s will be kept as-is. + Otherwise, they will be stripped from input. + """ timestamps = [] # (start, end) following_lines = [] # contains lists of lines following each timestamp for line in fp: - stamps = TIMESTAMP.findall(line) + stamps = cls.TIMESTAMP.findall(line) if len(stamps) == 2: # timestamp line - start, end = map(timestamp_to_ms, stamps) + start, end = map(cls.timestamp_to_ms, stamps) timestamps.append((start, end)) following_lines.append([]) else: @@ -72,16 +95,26 @@ class SubripFormat(FormatBase): for (start, end), lines in zip(timestamps, following_lines)] @classmethod - def to_file(cls, subs, fp, format_, **kwargs): + def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.to_file()` + + Italic, underline and strikeout styling is supported. + + Keyword args: + apply_styles: If False, do not write any styling. + + """ def prepare_text(text, style): body = [] for fragment, sty in parse_tags(text, style, subs.styles): fragment = fragment.replace(r"\h", " ") fragment = fragment.replace(r"\n", "\n") fragment = fragment.replace(r"\N", "\n") - if sty.italic: fragment = "%s" % fragment - if sty.underline: fragment = "%s" % fragment - if sty.strikeout: fragment = "%s" % fragment + if apply_styles: + if sty.italic: fragment = "%s" % fragment + if sty.underline: fragment = "%s" % fragment + if sty.strikeout: fragment = "%s" % fragment if sty.drawing: raise ContentNotUsable body.append(fragment) @@ -89,7 +122,8 @@ class SubripFormat(FormatBase): visible_lines = (line for line in subs if not line.is_comment) - for i, line in enumerate(visible_lines, 1): + lineno = 1 + for line in visible_lines: start = ms_to_timestamp(line.start) end = ms_to_timestamp(line.end) try: @@ -97,6 +131,7 @@ class SubripFormat(FormatBase): except ContentNotUsable: continue - print("%d" % i, file=fp) # Python 2.7 compat + print("%d" % lineno, file=fp) # Python 2.7 compat print(start, "-->", end, file=fp) print(text, end="\n\n", file=fp) + lineno += 1 diff --git a/libs/pysubs2/substation.py b/libs/pysubs2/substation.py index 274075a44..6fcae5fc2 100644 --- a/libs/pysubs2/substation.py +++ b/libs/pysubs2/substation.py @@ -1,10 +1,10 @@ -from __future__ import print_function, division, unicode_literals +import logging import re from numbers import Number from .formatbase import FormatBase from .ssaevent import SSAEvent from .ssastyle import SSAStyle -from .common import text_type, Color, PY3, binary_string_type +from .common import Color from .time import make_time, ms_to_times, timestamp_to_ms, TIMESTAMP SSA_ALIGNMENT = (1, 2, 3, 9, 10, 11, 5, 6, 7) @@ -15,7 +15,14 @@ def ass_to_ssa_alignment(i): def ssa_to_ass_alignment(i): return SSA_ALIGNMENT.index(i) + 1 -SECTION_HEADING = re.compile(r"^.{,3}\[[^\]]+\]") # allow for UTF-8 BOM, which is 3 bytes +SECTION_HEADING = re.compile( + r"^.{,3}" # allow 3 chars at start of line for BOM + r"\[" # open square bracket + r"[^]]*[a-z][^]]*" # inside square brackets, at least one lowercase letter (this guards vs. uuencoded font data) + r"]" # close square bracket +) + +FONT_FILE_HEADING = re.compile(r"fontname:\s+(\S+)") STYLE_FORMAT_LINE = { "ass": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic," @@ -46,7 +53,7 @@ EVENT_FIELDS = { #: Largest timestamp allowed in SubStation, ie. 9:59:59.99. MAX_REPRESENTABLE_TIME = make_time(h=10) - 10 -def ms_to_timestamp(ms): +def ms_to_timestamp(ms: int) -> str: """Convert ms to 'H:MM:SS.cc'""" # XXX throw on overflow/underflow? if ms < 0: ms = 0 @@ -54,28 +61,24 @@ def ms_to_timestamp(ms): h, m, s, ms = ms_to_times(ms) return "%01d:%02d:%02d.%02d" % (h, m, s, ms//10) -def color_to_ass_rgba(c): +def color_to_ass_rgba(c: Color) -> str: return "&H%08X" % ((c.a << 24) | (c.b << 16) | (c.g << 8) | c.r) -def color_to_ssa_rgb(c): +def color_to_ssa_rgb(c: Color) -> str: return "%d" % ((c.b << 16) | (c.g << 8) | c.r) -def ass_rgba_to_color(s): - x = int(s[2:], base=16) +def rgba_to_color(s: str) -> Color: + if s[0] == '&': + x = int(s[2:], base=16) + else: + x = int(s) r = x & 0xff g = (x >> 8) & 0xff b = (x >> 16) & 0xff a = (x >> 24) & 0xff return Color(r, g, b, a) -def ssa_rgb_to_color(s): - x = int(s) - r = x & 0xff - g = (x >> 8) & 0xff - b = (x >> 16) & 0xff - return Color(r, g, b) - -def is_valid_field_content(s): +def is_valid_field_content(s: str) -> bool: """ Returns True if string s can be stored in a SubStation field. @@ -140,8 +143,10 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}): NOTICE = "Script generated by pysubs2\nhttps://pypi.python.org/pypi/pysubs2" class SubstationFormat(FormatBase): + """SubStation Alpha (ASS, SSA) subtitle format implementation""" @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if "V4+ Styles" in text: return "ass" elif "V4 Styles" in text: @@ -149,6 +154,7 @@ class SubstationFormat(FormatBase): @classmethod def from_file(cls, subs, fp, format_, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.from_file()`""" def string_to_field(f, v): if f in {"start", "end"}: @@ -159,10 +165,7 @@ class SubstationFormat(FormatBase): else: return timestamp_to_ms(TIMESTAMP.match(v).groups()) elif "color" in f: - if format_ == "ass": - return ass_rgba_to_color(v) - else: - return ssa_rgb_to_color(v) + return rgba_to_color(v) elif f in {"bold", "underline", "italic", "strikeout"}: return v == "-1" elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}: @@ -183,16 +186,22 @@ class SubstationFormat(FormatBase): subs.info.clear() subs.aegisub_project.clear() subs.styles.clear() + subs.fonts_opaque.clear() inside_info_section = False inside_aegisub_section = False + inside_font_section = False + current_font_name = None + current_font_lines_buffer = [] - for line in fp: + for lineno, line in enumerate(fp, 1): line = line.strip() if SECTION_HEADING.match(line): + logging.debug("at line %d: section heading %s", lineno, line) inside_info_section = "Info" in line inside_aegisub_section = "Aegisub" in line + inside_font_section = "Fonts" in line elif inside_info_section or inside_aegisub_section: if line.startswith(";"): continue # skip comments try: @@ -203,6 +212,24 @@ class SubstationFormat(FormatBase): subs.aegisub_project[k] = v.strip() except ValueError: pass + elif inside_font_section: + m = FONT_FILE_HEADING.match(line) + + if current_font_name and (m or not line): + # flush last font on newline or new font name + font_data = current_font_lines_buffer[:] + subs.fonts_opaque[current_font_name] = font_data + logging.debug("at line %d: finished font definition %s", lineno, current_font_name) + current_font_lines_buffer.clear() + current_font_name = None + + if m: + # start new font + font_name = m.group(1) + current_font_name = font_name + elif line: + # add non-empty line to current buffer + current_font_lines_buffer.append(line) elif line.startswith("Style:"): _, rest = line.split(":", 1) buf = rest.strip().split(",") @@ -218,9 +245,18 @@ class SubstationFormat(FormatBase): ev = SSAEvent(**field_dict) subs.events.append(ev) + # cleanup fonts + if current_font_name: + # flush last font on EOF or new section w/o newline + font_data = current_font_lines_buffer[:] + subs.fonts_opaque[current_font_name] = font_data + logging.debug("at EOF: finished font definition %s", current_font_name) + current_font_lines_buffer.clear() + current_font_name = None @classmethod def to_file(cls, subs, fp, format_, header_notice=NOTICE, **kwargs): + """See :meth:`pysubs2.formats.FormatBase.to_file()`""" print("[Script Info]", file=fp) for line in header_notice.splitlines(False): print(";", line, file=fp) @@ -240,19 +276,11 @@ class SubstationFormat(FormatBase): elif f == "marked": return "Marked=%d" % v elif f == "alignment" and format_ == "ssa": - return text_type(ass_to_ssa_alignment(v)) + return str(ass_to_ssa_alignment(v)) elif isinstance(v, bool): return "-1" if v else "0" - elif isinstance(v, (text_type, Number)): - return text_type(v) - elif not PY3 and isinstance(v, binary_string_type): - # A convenience feature, see issue #12 - accept non-unicode strings - # when they are ASCII; this is useful in Python 2, especially for non-text - # fields like style names, where requiring Unicode type seems too stringent - if all(ord(c) < 128 for c in v): - return text_type(v) - else: - raise TypeError("Encountered binary string with non-ASCII codepoint in SubStation field {!r} for line {!r} - please use unicode string instead of str".format(f, line)) + elif isinstance(v, (str, Number)): + return str(v) elif isinstance(v, Color): if format_ == "ass": return color_to_ass_rgba(v) @@ -267,6 +295,14 @@ class SubstationFormat(FormatBase): fields = [field_to_string(f, getattr(sty, f), sty) for f in STYLE_FIELDS[format_]] print("Style: %s" % name, *fields, sep=",", file=fp) + if subs.fonts_opaque: + print("\n[Fonts]", file=fp) + for font_name, font_lines in sorted(subs.fonts_opaque.items()): + print("fontname: {}".format(font_name), file=fp) + for line in font_lines: + print(line, file=fp) + print(file=fp) + print("\n[Events]", file=fp) print(EVENT_FORMAT_LINE[format_], file=fp) for ev in subs.events: diff --git a/libs/pysubs2/time.py b/libs/pysubs2/time.py index 24e9ec077..828c4063d 100644 --- a/libs/pysubs2/time.py +++ b/libs/pysubs2/time.py @@ -1,15 +1,19 @@ -from __future__ import division - from collections import namedtuple import re #: Pattern that matches both SubStation and SubRip timestamps. +from typing import Optional, List, Tuple, Sequence + +from pysubs2.common import IntOrFloat + TIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[.,](\d{2,3})") Times = namedtuple("Times", ["h", "m", "s", "ms"]) -def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None): + +def make_time(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0, + frames: Optional[int]=None, fps: Optional[float]=None): """ Convert time to milliseconds. @@ -33,7 +37,8 @@ def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None): else: raise ValueError("Both fps and frames must be specified") -def timestamp_to_ms(groups): + +def timestamp_to_ms(groups: Sequence[str]): """ Convert groups from :data:`pysubs2.time.TIMESTAMP` match to milliseconds. @@ -49,7 +54,8 @@ def timestamp_to_ms(groups): ms += h * 3600000 return ms -def tmptimestamp_to_ms(groups): + +def tmptimestamp_to_ms(groups: Sequence[str]): """ Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds. @@ -63,7 +69,9 @@ def tmptimestamp_to_ms(groups): ms += m * 60000 ms += h * 3600000 return ms -def times_to_ms(h=0, m=0, s=0, ms=0): + + +def times_to_ms(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0) -> int: """ Convert hours, minutes, seconds to milliseconds. @@ -79,7 +87,8 @@ def times_to_ms(h=0, m=0, s=0, ms=0): ms += h * 3600000 return int(round(ms)) -def frames_to_ms(frames, fps): + +def frames_to_ms(frames: int, fps: float) -> int: """ Convert frame-based duration to milliseconds. @@ -99,7 +108,8 @@ def frames_to_ms(frames, fps): return int(round(frames * (1000 / fps))) -def ms_to_frames(ms, fps): + +def ms_to_frames(ms: IntOrFloat, fps: float) -> int: """ Convert milliseconds to number of frames. @@ -119,7 +129,8 @@ def ms_to_frames(ms, fps): return int(round((ms / 1000) * fps)) -def ms_to_times(ms): + +def ms_to_times(ms: IntOrFloat) -> Tuple[int, int, int, int]: """ Convert milliseconds to normalized tuple (h, m, s, ms). @@ -138,7 +149,8 @@ def ms_to_times(ms): s, ms = divmod(ms, 1000) return Times(h, m, s, ms) -def ms_to_str(ms, fractions=False): + +def ms_to_str(ms: IntOrFloat, fractions: bool=False) -> str: """ Prettyprint milliseconds to [-]H:MM:SS[.mmm] @@ -156,6 +168,6 @@ def ms_to_str(ms, fractions=False): sgn = "-" if ms < 0 else "" h, m, s, ms = ms_to_times(abs(ms)) if fractions: - return sgn + "{:01d}:{:02d}:{:02d}.{:03d}".format(h, m, s, ms) + return f"{sgn}{h:01d}:{m:02d}:{s:02d}.{ms:03d}" else: - return sgn + "{:01d}:{:02d}:{:02d}".format(h, m, s) + return f"{sgn}{h:01d}:{m:02d}:{s:02d}" diff --git a/libs/pysubs2/tmp.py b/libs/pysubs2/tmp.py index aae55202c..392c8615f 100644 --- a/libs/pysubs2/tmp.py +++ b/libs/pysubs2/tmp.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals - import re from .formatbase import FormatBase from .ssaevent import SSAEvent @@ -15,6 +13,7 @@ TMP_LINE = re.compile(r"(\d{1,2}:\d{2}:\d{2}):(.+)") #: Largest timestamp allowed in Tmp, ie. 99:59:59. MAX_REPRESENTABLE_TIME = make_time(h=100) - 1 + def ms_to_timestamp(ms): """Convert ms to 'HH:MM:SS'""" # XXX throw on overflow/underflow? @@ -25,8 +24,10 @@ def ms_to_timestamp(ms): class TmpFormat(FormatBase): + """TMP subtitle format implementation""" @classmethod def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" if "[Script Info]" in text or "[V4+ Styles]" in text: # disambiguation vs. SSA/ASS return None @@ -37,8 +38,14 @@ class TmpFormat(FormatBase): @classmethod def from_file(cls, subs, fp, format_, **kwargs): - timestamps = [] # (start) - lines = [] # contains lists of lines following each timestamp + """See :meth:`pysubs2.formats.FormatBase.from_file()`""" + events = [] + + def prepare_text(text): + text = text.replace("|", r"\N") # convert newlines + text = re.sub(r"< *u *>", "{\\\\u1}", text) # not r" for Python 2.7 compat, triggers unicodeescape + text = re.sub(r"< */? *[a-zA-Z][^>]*>", "", text) # strip other HTML tags + return text for line in fp: match = TMP_LINE.match(line) @@ -47,42 +54,54 @@ class TmpFormat(FormatBase): start, text = match.groups() start = tmptimestamp_to_ms(TMPTIMESTAMP.match(start).groups()) - #calculate endtime from starttime + 500 miliseconds + 67 miliseconds per each character (15 chars per second) - end = start + 500 + (len(line) * 67) - timestamps.append((start, end)) - lines.append(text) - def prepare_text(lines): - lines = lines.replace("|", r"\N") # convert newlines - lines = re.sub(r"< *u *>", "{\\\\u1}", lines) # not r" for Python 2.7 compat, triggers unicodeescape - lines = re.sub(r"< */? *[a-zA-Z][^>]*>", "", lines) # strip other HTML tags - return lines + # Unfortunately, end timestamp is not given; try to estimate something reasonable: + # start + 500 ms + 67 ms/character (15 chars per second) + end_guess = start + 500 + (len(line) * 67) - subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines)) - for (start, end), lines in zip(timestamps, lines)] + event = SSAEvent(start=start, end=end_guess, text=prepare_text(text)) + events.append(event) + + # correct any overlapping subtitles created by end_guess + for i in range(len(events) - 1): + events[i].end = min(events[i].end, events[i+1].start) + + subs.events = events @classmethod - def to_file(cls, subs, fp, format_, **kwargs): + def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.to_file()` + + Italic, underline and strikeout styling is supported. + + Keyword args: + apply_styles: If False, do not write any styling. + + """ def prepare_text(text, style): body = [] + skip = False for fragment, sty in parse_tags(text, style, subs.styles): fragment = fragment.replace(r"\h", " ") fragment = fragment.replace(r"\n", "\n") fragment = fragment.replace(r"\N", "\n") - if sty.italic: fragment = "%s" % fragment - if sty.underline: fragment = "%s" % fragment - if sty.strikeout: fragment = "%s" % fragment + if apply_styles: + if sty.italic: fragment = "%s" % fragment + if sty.underline: fragment = "%s" % fragment + if sty.strikeout: fragment = "%s" % fragment + if sty.drawing: skip = True body.append(fragment) - return re.sub("\n+", "\n", "".join(body).strip()) + if skip: + return "" + else: + return re.sub("\n+", "\n", "".join(body).strip()) visible_lines = (line for line in subs if not line.is_comment) - for i, line in enumerate(visible_lines, 1): + for line in visible_lines: start = ms_to_timestamp(line.start) - #end = ms_to_timestamp(line.end) text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE)) - #print("%d" % i, file=fp) # Python 2.7 compat print(start + ":" + text, end="\n", file=fp) - #print(text, end="\n\n", file=fp) diff --git a/libs/pysubs2/webvtt.py b/libs/pysubs2/webvtt.py new file mode 100644 index 000000000..cb9bee076 --- /dev/null +++ b/libs/pysubs2/webvtt.py @@ -0,0 +1,36 @@ +import re +from .subrip import SubripFormat +from .time import make_time + + +class WebVTTFormat(SubripFormat): + """ + Web Video Text Tracks (WebVTT) subtitle format implementation + + Currently, this shares implementation with :class:`pysubs2.subrip.SubripFormat`. + """ + TIMESTAMP = re.compile(r"(\d{0,4}:)?(\d{2}):(\d{2})\.(\d{2,3})") + + @staticmethod + def timestamp_to_ms(groups): + _h, _m, _s, _ms = groups + if not _h: + h = 0 + else: + h = int(_h.strip(":")) + m, s, ms = map(int, (_m, _s, _ms)) + return make_time(h=h, m=m, s=s, ms=ms) + + @classmethod + def guess_format(cls, text): + """See :meth:`pysubs2.formats.FormatBase.guess_format()`""" + if text.lstrip().startswith("WEBVTT"): + return "vtt" + + @classmethod + def to_file(cls, subs, fp, format_, **kwargs): + """ + See :meth:`pysubs2.formats.FormatBase.to_file()` + """ + print("WEBVTT\n", file=fp) + return SubripFormat.to_file(subs=subs, fp=fp, format_=format_, **kwargs)