Updated pysubs2 module to support newer SSA files.

This commit is contained in:
morpheus65535 2021-07-14 19:13:28 -04:00
parent 60353c0367
commit 09a8335a03
17 changed files with 548 additions and 299 deletions

View file

@ -10,3 +10,6 @@ load = SSAFile.load
#: Alias for :meth:`pysubs2.time.make_time()`.
make_time = time.make_time
#: Alias for `pysubs2.common.VERSION`.
__version__ = VERSION

View file

@ -1,4 +1,3 @@
from __future__ import unicode_literals, print_function
import argparse
import codecs
import os
@ -8,38 +7,39 @@ import io
from io import open
import sys
from textwrap import dedent
from .formats import get_file_extension
from .formats import get_file_extension, FORMAT_IDENTIFIERS
from .time import make_time
from .ssafile import SSAFile
from .common import PY3, VERSION
from .common import VERSION
import logging
def positive_float(s):
def positive_float(s: str) -> float:
    """
    Convert ``s`` to a float and require the result to be greater than zero.

    Raises:
        argparse.ArgumentTypeError: If the parsed number is not > 0
            (NaN is rejected as well, since ``nan > 0`` is false).
        ValueError: Propagated from ``float()`` when ``s`` is not numeric.
    """
    value = float(s)
    if value > 0:
        return value
    raise argparse.ArgumentTypeError("%r is not a positive number" % s)
def character_encoding(s):
def character_encoding(s: str) -> str:
    """
    Check that ``s`` names a character encoding known to Python.

    Used as an argparse ``type=`` callable for the encoding options.

    Returns:
        ``s`` unchanged when ``codecs.lookup()`` accepts it.

    Raises:
        argparse.ArgumentTypeError: If no codec with that name exists.
    """
    try:
        codecs.lookup(s)
    except LookupError:
        # argparse.ArgumentError cannot be raised bare: its constructor needs
        # an argument and a message, so the old code blew up with TypeError.
        # ArgumentTypeError is what argparse expects from type= callables and
        # lets it print a meaningful message.
        raise argparse.ArgumentTypeError("%r is not a known character encoding" % s) from None
    return s
def time(s):
def time(s: str):
    """
    Parse a time specification such as ``"0.3s"`` or ``"1h30m"``.

    Every ``<number><unit>`` pair found in ``s`` (unit being one of ``ms``,
    ``m``, ``s``, ``h``) becomes a keyword argument for ``make_time()``;
    when a unit occurs more than once, the last occurrence wins. A unit
    with no digits in front of it makes ``float()`` raise ValueError.

    Returns:
        Whatever ``make_time()`` returns for the collected units.
    """
    pairs = re.findall(r"(\d*\.?\d*)(ms|m|s|h)", s)
    units = {unit: float(number) for number, unit in pairs}
    return make_time(**units)
def change_ext(path, ext):
def change_ext(path: str, ext: str) -> str:
    """
    Return ``path`` with its file extension swapped for ``ext``.

    ``ext`` is appended verbatim, so it should include the leading dot.
    """
    stem = op.splitext(path)[0]
    return stem + ext
class Pysubs2CLI(object):
class Pysubs2CLI:
def __init__(self):
parser = self.parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
prog="pysubs2",
@ -50,6 +50,7 @@ class Pysubs2CLI(object):
epilog=dedent("""
usage examples:
python -m pysubs2 --to srt *.ass
python -m pysubs2 --to srt --clean *.ass
python -m pysubs2 --to microdvd --fps 23.976 *.ass
python -m pysubs2 --shift 0.3s *.srt
python -m pysubs2 --shift 0.3s <my_file.srt >retimed_file.srt
@ -57,21 +58,21 @@ class Pysubs2CLI(object):
python -m pysubs2 --transform-framerate 25 23.976 *.srt"""))
parser.add_argument("files", nargs="*", metavar="FILE",
help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt) or "
"MicroDVD (*.sub) formats. When no files are specified, pysubs2 will work as a pipe, "
"reading from standard input and writing to standard output.")
help="Input subtitle files. Can be in SubStation Alpha (*.ass, *.ssa), SubRip (*.srt), "
"MicroDVD (*.sub) or other supported format. When no files are specified, "
"pysubs2 will work as a pipe, reading from standard input and writing to standard output.")
parser.add_argument("-v", "--version", action="version", version="pysubs2 %s" % VERSION)
parser.add_argument("-f", "--from", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="input_format",
parser.add_argument("-f", "--from", choices=FORMAT_IDENTIFIERS, dest="input_format",
help="By default, subtitle format is detected from the file. This option can be used to "
"skip autodetection and force specific format. Generally, it should never be needed.")
parser.add_argument("-t", "--to", choices=["ass", "ssa", "srt", "microdvd", "json"], dest="output_format",
parser.add_argument("-t", "--to", choices=FORMAT_IDENTIFIERS, dest="output_format",
help="Convert subtitle files to given format. By default, each file is saved in its "
"original format.")
parser.add_argument("--input-enc", metavar="ENCODING", default="iso-8859-1", type=character_encoding,
help="Character encoding for input files. By default, ISO-8859-1 is used for both "
"input and output, which should generally work (for 8-bit encodings).")
parser.add_argument("--input-enc", metavar="ENCODING", default="utf-8", type=character_encoding,
help="Character encoding for input files. By default, UTF-8 is used for both "
"input and output.")
parser.add_argument("--output-enc", metavar="ENCODING", type=character_encoding,
help="Character encoding for output files. By default, it is the same as input encoding. "
"If you wish to convert between encodings, make sure --input-enc is set correctly! "
@ -85,6 +86,11 @@ class Pysubs2CLI(object):
help="Use this to save all files to given directory. By default, every file is saved to its parent directory, "
"ie. unless it's being saved in different subtitle format (and thus with different file extension), "
"it overwrites the original file.")
parser.add_argument("--clean", action="store_true",
help="Attempt to remove non-essential subtitles (eg. karaoke, SSA drawing tags), "
"strip styling information when saving to non-SSA formats")
parser.add_argument("--verbose", action="store_true",
help="Print misc logging")
group = parser.add_mutually_exclusive_group()
@ -105,6 +111,9 @@ class Pysubs2CLI(object):
args = self.parser.parse_args(argv)
errors = 0
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
if args.output_dir and not op.exists(args.output_dir):
os.makedirs(args.output_dir)
@ -138,19 +147,15 @@ class Pysubs2CLI(object):
outpath = op.join(args.output_dir, filename)
with open(outpath, "w", encoding=args.output_enc) as outfile:
subs.to_file(outfile, output_format, args.fps)
subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)
else:
if PY3:
infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
else:
infile = io.TextIOWrapper(sys.stdin, args.input_enc)
outfile = io.TextIOWrapper(sys.stdout, args.output_enc)
infile = io.TextIOWrapper(sys.stdin.buffer, args.input_enc)
outfile = io.TextIOWrapper(sys.stdout.buffer, args.output_enc)
subs = SSAFile.from_file(infile, args.input_format, args.fps)
self.process(subs, args)
output_format = args.output_format or subs.format
subs.to_file(outfile, output_format, args.fps)
subs.to_file(outfile, output_format, args.fps, apply_styles=not args.clean)
return (0 if errors == 0 else 1)
@ -164,6 +169,9 @@ class Pysubs2CLI(object):
in_fps, out_fps = args.transform_framerate
subs.transform_framerate(in_fps, out_fps)
if args.clean:
subs.remove_miscellaneous_events()
def __main__():
cli = Pysubs2CLI()

View file

@ -1,30 +1,32 @@
from collections import namedtuple
import sys
from dataclasses import dataclass
from typing import Union
_Color = namedtuple("Color", "r g b a")
class Color(_Color):
@dataclass(init=False)
class Color:
    """
    8-bit RGB color with alpha channel.

    All four channels are ints in the range 0-255; the constructor rejects
    anything else with ValueError. Alpha defaults to 0.
    """
    r: int
    g: int
    b: int
    a: int = 0

    def __init__(self, r: int, g: int, b: int, a: int = 0):
        channels = (r, g, b, a)
        # Validate every channel before any attribute is assigned.
        if any(channel not in range(256) for channel in channels):
            raise ValueError("Color channels must have values 0-255")
        self.r, self.g, self.b, self.a = channels
#: Version of the pysubs2 library.
VERSION = "0.2.4"
VERSION = "1.2.0"
PY3 = sys.version_info.major == 3
if PY3:
text_type = str
binary_string_type = bytes
else:
text_type = unicode
binary_string_type = str
IntOrFloat = Union[int, float]

View file

@ -1,17 +1,22 @@
class Pysubs2Error(Exception):
    """
    Base class for pysubs2 exceptions.

    Every other exception in this module derives from it, so catching
    ``Pysubs2Error`` catches them all.
    """


class UnknownFPSError(Pysubs2Error):
    """Framerate was not specified and couldn't be inferred otherwise."""


class UnknownFileExtensionError(Pysubs2Error):
    """File extension does not pertain to any known subtitle format."""


class UnknownFormatIdentifierError(Pysubs2Error):
    """Unknown subtitle format identifier (ie. string like ``"srt"``)."""


class FormatAutodetectionError(Pysubs2Error):
    """Subtitle format is ambiguous or unknown."""


class ContentNotUsable(Pysubs2Error):
    """Current content is not usable for the specified format."""

View file

@ -1,4 +1,8 @@
class FormatBase(object):
from typing import Optional
import io
class FormatBase:
"""
Base class for subtitle format implementations.
@ -14,7 +18,7 @@ class FormatBase(object):
"""
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
def from_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
"""
Load subtitle file into an empty SSAFile.
@ -37,7 +41,7 @@ class FormatBase(object):
raise NotImplementedError("Parsing is not supported for this format")
@classmethod
def to_file(cls, subs, fp, format_, **kwargs):
def to_file(cls, subs, fp: io.TextIOBase, format_: str, **kwargs):
"""
Write SSAFile into a file.
@ -62,7 +66,7 @@ class FormatBase(object):
raise NotImplementedError("Writing is not supported for this format")
@classmethod
def guess_format(self, text):
def guess_format(self, text: str) -> Optional[str]:
"""
Return format identifier of recognized format, or None.

View file

@ -1,3 +1,5 @@
from typing import Dict, Type
from .formatbase import FormatBase
from .microdvd import MicroDVDFormat
from .subrip import SubripFormat
@ -5,20 +7,22 @@ from .jsonformat import JSONFormat
from .substation import SubstationFormat
from .mpl2 import MPL2Format
from .tmp import TmpFormat
from .webvtt import WebVTTFormat
from .exceptions import *
#: Dict mapping file extensions to format identifiers.
FILE_EXTENSION_TO_FORMAT_IDENTIFIER = {
FILE_EXTENSION_TO_FORMAT_IDENTIFIER: Dict[str, str] = {
".srt": "srt",
".ass": "ass",
".ssa": "ssa",
".sub": "microdvd",
".json": "json",
".txt": "tmp",
".vtt": "vtt",
}
#: Dict mapping format identifiers to implementations (FormatBase subclasses).
FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
FORMAT_IDENTIFIER_TO_FORMAT_CLASS: Dict[str, Type[FormatBase]] = {
"srt": SubripFormat,
"ass": SubstationFormat,
"ssa": SubstationFormat,
@ -26,23 +30,29 @@ FORMAT_IDENTIFIER_TO_FORMAT_CLASS = {
"json": JSONFormat,
"mpl2": MPL2Format,
"tmp": TmpFormat,
"vtt": WebVTTFormat,
}
def get_format_class(format_):
FORMAT_IDENTIFIERS = list(FORMAT_IDENTIFIER_TO_FORMAT_CLASS.keys())
def get_format_class(format_: str) -> Type[FormatBase]:
    """
    Format identifier -> format class (ie. subclass of FormatBase)

    Raises:
        UnknownFormatIdentifierError: If ``format_`` is not a key of
            ``FORMAT_IDENTIFIER_TO_FORMAT_CLASS``.
    """
    try:
        format_class = FORMAT_IDENTIFIER_TO_FORMAT_CLASS[format_]
    except KeyError:
        raise UnknownFormatIdentifierError(format_)
    else:
        return format_class
def get_format_identifier(ext):
def get_format_identifier(ext: str) -> str:
    """
    File extension -> format identifier

    Raises:
        UnknownFileExtensionError: If ``ext`` is not a key of
            ``FILE_EXTENSION_TO_FORMAT_IDENTIFIER``.
    """
    try:
        identifier = FILE_EXTENSION_TO_FORMAT_IDENTIFIER[ext]
    except KeyError:
        raise UnknownFileExtensionError(ext)
    else:
        return identifier
def get_file_extension(format_):
def get_file_extension(format_: str) -> str:
"""Format identifier -> file extension"""
if format_ not in FORMAT_IDENTIFIER_TO_FORMAT_CLASS:
raise UnknownFormatIdentifierError(format_)
@ -53,7 +63,8 @@ def get_file_extension(format_):
raise RuntimeError("No file extension for format %r" % format_)
def autodetect_format(content):
def autodetect_format(content: str) -> str:
"""Return format identifier for given fragment or raise FormatAutodetectionError."""
formats = set()
for impl in FORMAT_IDENTIFIER_TO_FORMAT_CLASS.values():

View file

@ -1,20 +1,35 @@
from __future__ import unicode_literals, print_function
import dataclasses
import json
from .common import Color, PY3
from .common import Color
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
from .formatbase import FormatBase
# We're using Color dataclass
# https://stackoverflow.com/questions/51286748/make-the-python-json-encoder-support-pythons-new-dataclasses
class EnhancedJSONEncoder(json.JSONEncoder):
    """JSON encoder that serializes dataclasses as plain dicts."""

    def default(self, o):
        # Anything that is not a dataclass goes to the stock encoder, which
        # raises TypeError for unsupported objects.
        if not dataclasses.is_dataclass(o):
            return super().default(o)
        return dataclasses.asdict(o)
class JSONFormat(FormatBase):
"""
Implementation of JSON subtitle pseudo-format (serialized pysubs2 internal representation)
This is essentially SubStation Alpha as JSON.
"""
@classmethod
def guess_format(cls, text):
    """
    See :meth:`pysubs2.formats.FormatBase.guess_format()`

    Text beginning with ``{"`` is reported as ``"json"``; anything else
    yields None.
    """
    return "json" if text.startswith("{\"") else None
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
"""See :meth:`pysubs2.formats.FormatBase.from_file()`"""
data = json.load(fp)
subs.info.clear()
@ -25,7 +40,7 @@ class JSONFormat(FormatBase):
subs.styles[name] = sty = SSAStyle()
for k, v in fields.items():
if "color" in k:
setattr(sty, k, Color(*v))
setattr(sty, k, Color(**v))
else:
setattr(sty, k, v)
@ -33,14 +48,11 @@ class JSONFormat(FormatBase):
@classmethod
def to_file(cls, subs, fp, format_, **kwargs):
    """
    See :meth:`pysubs2.formats.FormatBase.to_file()`

    Writes one JSON object with ``info``, ``styles`` and ``events`` keys;
    dataclass values are serialized through ``EnhancedJSONEncoder``.
    """
    info = dict(**subs.info)
    styles = {name: sty.as_dict() for name, sty in subs.styles.items()}
    events = [ev.as_dict() for ev in subs.events]
    data = {"info": info, "styles": styles, "events": events}
    json.dump(data, fp, cls=EnhancedJSONEncoder)

View file

@ -1,8 +1,5 @@
from __future__ import unicode_literals, print_function
from functools import partial
import re
from .common import text_type
from .exceptions import UnknownFPSError
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
@ -15,13 +12,16 @@ MICRODVD_LINE = re.compile(r" *\{ *(\d+) *\} *\{ *(\d+) *\}(.+)")
class MicroDVDFormat(FormatBase):
"""MicroDVD subtitle format implementation"""
@classmethod
def guess_format(cls, text):
    """
    See :meth:`pysubs2.formats.FormatBase.guess_format()`

    Returns ``"microdvd"`` as soon as one line of ``text`` matches
    ``MICRODVD_LINE``; returns None when no line does.
    """
    for line in text.splitlines():
        if MICRODVD_LINE.match(line):
            return "microdvd"
    return None
@classmethod
def from_file(cls, subs, fp, format_, fps=None, **kwargs):
"""See :meth:`pysubs2.formats.FormatBase.from_file()`"""
for line in fp:
match = MICRODVD_LINE.match(line)
if not match:
@ -63,7 +63,18 @@ class MicroDVDFormat(FormatBase):
subs.append(ev)
@classmethod
def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, **kwargs):
def to_file(cls, subs, fp, format_, fps=None, write_fps_declaration=True, apply_styles=True, **kwargs):
"""
See :meth:`pysubs2.formats.FormatBase.to_file()`
The only supported styling is marking whole lines italic.
Keyword args:
write_fps_declaration: If True, create a zero-duration first subtitle which will contain
the fps.
apply_styles: If False, do not write any styling.
"""
if fps is None:
fps = subs.fps
@ -83,11 +94,14 @@ class MicroDVDFormat(FormatBase):
# insert an artificial first line telling the framerate
if write_fps_declaration:
subs.insert(0, SSAEvent(start=0, end=0, text=text_type(fps)))
subs.insert(0, SSAEvent(start=0, end=0, text=str(fps)))
for line in subs:
if line.is_comment or line.is_drawing:
continue
for line in (ev for ev in subs if not ev.is_comment):
text = "|".join(line.plaintext.splitlines())
if is_entirely_italic(line):
if apply_styles and is_entirely_italic(line):
text = "{Y:i}" + text
start, end = map(to_frames, (line.start, line.end))

View file

@ -1,6 +1,3 @@
# coding=utf-8
from __future__ import print_function, division, unicode_literals
import re
from .time import times_to_ms
@ -13,13 +10,16 @@ MPL2_FORMAT = re.compile(r"^(?um)\[(-?\d+)\]\[(-?\d+)\](.*)")
class MPL2Format(FormatBase):
"""MPL2 subtitle format implementation"""
@classmethod
def guess_format(cls, text):
    """
    See :meth:`pysubs2.formats.FormatBase.guess_format()`

    Returns ``"mpl2"`` when ``MPL2_FORMAT`` is found anywhere in ``text``,
    otherwise None.
    """
    return "mpl2" if MPL2_FORMAT.search(text) else None
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
"""See :meth:`pysubs2.formats.FormatBase.from_file()`"""
def prepare_text(lines):
out = []
for s in lines.split("|"):
@ -37,7 +37,12 @@ class MPL2Format(FormatBase):
@classmethod
def to_file(cls, subs, fp, format_, **kwargs):
"""
See :meth:`pysubs2.formats.FormatBase.to_file()`
No styling is supported at the moment.
"""
# TODO handle italics
for line in subs:
if line.is_comment:

View file

@ -1,10 +1,14 @@
from __future__ import unicode_literals
import re
import warnings
from typing import Optional, Dict, Any, ClassVar
import dataclasses
from .common import IntOrFloat
from .time import ms_to_str, make_time
from .common import PY3
class SSAEvent(object):
@dataclasses.dataclass(repr=False, eq=False, order=False)
class SSAEvent:
"""
A SubStation Event, ie. one subtitle.
@ -21,36 +25,29 @@ class SSAEvent(object):
>>> ev = SSAEvent(start=make_time(s=1), end=make_time(s=2.5), text="Hello World!")
"""
OVERRIDE_SEQUENCE = re.compile(r"{[^}]*}")
OVERRIDE_SEQUENCE: ClassVar = re.compile(r"{[^}]*}")
#: All fields in SSAEvent.
FIELDS = frozenset([
"start", "end", "text", "marked", "layer", "style",
"name", "marginl", "marginr", "marginv", "effect", "type"
])
def __init__(self, **fields):
self.start = 0 #: Subtitle start time (in milliseconds)
self.end = 10000 #: Subtitle end time (in milliseconds)
self.text = "" #: Text of subtitle (with SubStation override tags)
self.marked = False #: (SSA only)
self.layer = 0 #: Layer number, 0 is the lowest layer (ASS only)
self.style = "Default" #: Style name
self.name = "" #: Actor name
self.marginl = 0 #: Left margin
self.marginr = 0 #: Right margin
self.marginv = 0 #: Vertical margin
self.effect = "" #: Line effect
self.type = "Dialogue" #: Line type (Dialogue/Comment)
for k, v in fields.items():
if k in self.FIELDS:
setattr(self, k, v)
else:
raise ValueError("SSAEvent has no field named %r" % k)
start: int = 0 #: Subtitle start time (in milliseconds)
end: int = 10000 #: Subtitle end time (in milliseconds)
text: str = "" #: Text of subtitle (with SubStation override tags)
marked: bool = False #: (SSA only)
layer: int = 0 #: Layer number, 0 is the lowest layer (ASS only)
style: str = "Default" #: Style name
name: str = "" #: Actor name
marginl: int = 0 #: Left margin
marginr: int = 0 #: Right margin
marginv: int = 0 #: Vertical margin
effect: str = "" #: Line effect
type: str = "Dialogue" #: Line type (Dialogue/Comment)
@property
def duration(self):
def FIELDS(self):
    """
    All fields in SSAEvent (deprecated).

    Emits a DeprecationWarning and returns a frozenset with the names of
    the dataclass fields.
    """
    warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning)
    names = (f.name for f in dataclasses.fields(self))
    return frozenset(names)
@property
def duration(self) -> IntOrFloat:
"""
Subtitle duration in milliseconds (read/write property).
@ -60,14 +57,14 @@ class SSAEvent(object):
return self.end - self.start
@duration.setter
def duration(self, ms: int):
    # Setting the duration moves ``end``; ``start`` is left untouched.
    if not ms >= 0:
        raise ValueError("Subtitle duration cannot be negative")
    self.end = self.start + ms
@property
def is_comment(self):
def is_comment(self) -> bool:
"""
When true, the subtitle is a comment, ie. not visible (read/write property).
@ -77,14 +74,20 @@ class SSAEvent(object):
return self.type == "Comment"
@is_comment.setter
def is_comment(self, value: bool):
    # A truthy value marks the line as a comment, anything else as dialogue.
    self.type = "Comment" if value else "Dialogue"
@property
def plaintext(self):
def is_drawing(self) -> bool:
    """
    Returns True if line is SSA drawing tag (ie. not text)

    True when any fragment produced by ``parse_tags(self.text)`` carries a
    style whose ``drawing`` attribute is truthy.
    """
    # NOTE(review): imported locally, presumably to avoid a circular import
    # with the substation module -- confirm before moving to module level.
    from .substation import parse_tags
    for _, sty in parse_tags(self.text):
        if sty.drawing:
            return True
    return False
@property
def plaintext(self) -> str:
"""
Subtitle text as multi-line string with no tags (read/write property).
@ -99,10 +102,11 @@ class SSAEvent(object):
return text
@plaintext.setter
def plaintext(self, text: str):
    # Store newlines as the SubStation hard line break ``\N``.
    self.text = r"\N".join(text.split("\n"))
def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
frames: Optional[int]=None, fps: Optional[float]=None):
"""
Shift start and end times.
@ -113,41 +117,39 @@ class SSAEvent(object):
self.start += delta
self.end += delta
def copy(self):
def copy(self) -> "SSAEvent":
    """Return a new SSAEvent built from this event's field values."""
    field_values = self.as_dict()
    return SSAEvent(**field_values)
def as_dict(self):
return {field: getattr(self, field) for field in self.FIELDS}
def as_dict(self) -> Dict[str, Any]:
    """
    Return a dict mapping each dataclass field name to its current value.

    The mapping is shallow: values are the attribute objects themselves.
    """
    # dataclasses.asdict() would recursively dictify Color objects, which we don't want
    result = {}
    for field in dataclasses.fields(self):
        result[field.name] = getattr(self, field.name)
    return result
def equals(self, other):
def equals(self, other: "SSAEvent") -> bool:
    """
    Field-based equality for SSAEvents.

    Raises:
        TypeError: If ``other`` is not an SSAEvent.
    """
    if not isinstance(other, SSAEvent):
        raise TypeError("Cannot compare to non-SSAEvent object")
    return self.as_dict() == other.as_dict()
def __eq__(self, other):
# Rich comparisons look only at the (start, end) time interval, which is what
# makes a list of events sortable time-wise. Use equals() for field-based
# equality. Operands that are not SSAEvents yield NotImplemented so that
# Python applies its normal fallback (``==`` is False, ordering raises
# TypeError) instead of failing with AttributeError on a missing ``start``.

def __eq__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return self.start == other.start and self.end == other.end

def __ne__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return self.start != other.start or self.end != other.end

def __lt__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return (self.start, self.end) < (other.start, other.end)

def __le__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return (self.start, self.end) <= (other.start, other.end)

def __gt__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return (self.start, self.end) > (other.start, other.end)

def __ge__(self, other: "SSAEvent"):
    if not isinstance(other, SSAEvent):
        return NotImplemented
    return (self.start, self.end) >= (other.start, other.end)
def __repr__(self):
    # Timestamps are rendered through ms_to_str(); the text uses repr() so
    # quotes and escapes stay unambiguous.
    start = ms_to_str(self.start)
    end = ms_to_str(self.end)
    return f"<SSAEvent type={self.type} start={start} end={end} text={self.text!r}>"

View file

@ -1,16 +1,17 @@
from __future__ import print_function, unicode_literals, division
from collections import MutableSequence, OrderedDict
from collections.abc import MutableSequence
import io
from io import open
from itertools import starmap, chain
from itertools import chain
import os.path
import logging
from typing import Optional, List, Dict, Iterable, Any
from .common import IntOrFloat
from .formats import autodetect_format, get_format_class, get_format_identifier
from .substation import is_valid_field_content
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
from .time import make_time, ms_to_str
from .common import PY3
class SSAFile(MutableSequence):
@ -31,28 +32,37 @@ class SSAFile(MutableSequence):
"""
DEFAULT_INFO = OrderedDict([
("WrapStyle", "0"),
("ScaledBorderAndShadow", "yes"),
("Collisions", "Normal")])
DEFAULT_INFO = {
"WrapStyle": "0",
"ScaledBorderAndShadow": "yes",
"Collisions": "Normal"
}
def __init__(self):
    #: List of :class:`SSAEvent` instances, ie. individual subtitles.
    self.events: List[SSAEvent] = []
    #: Dict of :class:`SSAStyle` instances.
    self.styles: Dict[str, SSAStyle] = {"Default": SSAStyle.DEFAULT_STYLE.copy()}
    #: Dict with script metadata, ie. ``[Script Info]``.
    self.info: Dict[str, str] = self.DEFAULT_INFO.copy()
    #: Dict with Aegisub project, ie. ``[Aegisub Project Garbage]``.
    self.aegisub_project: Dict[str, str] = {}
    #: Dict with embedded fonts, ie. ``[Fonts]``.
    self.fonts_opaque: Dict[str, Any] = {}
    #: Framerate used when reading the file, if applicable.
    self.fps: Optional[float] = None
    #: Format of source subtitle file, if applicable, eg. ``"srt"``.
    self.format: Optional[str] = None
# ------------------------------------------------------------------------
# I/O methods
# ------------------------------------------------------------------------
@classmethod
def load(cls, path, encoding="utf-8", format_=None, fps=None, **kwargs):
def load(cls, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
"""
Load subtitle file from given path.
This method is implemented in terms of :meth:`SSAFile.from_file()`.
See also:
Specific formats may implement additional loading options,
please refer to documentation of the implementation classes
(eg. :meth:`pysubs2.subrip.SubripFormat.from_file()`)
Arguments:
path (str): Path to subtitle file.
encoding (str): Character encoding of input file.
@ -66,14 +76,7 @@ class SSAFile(MutableSequence):
be detected from the file, in which case you don't need
to specify it here (when given, this argument overrides
autodetection).
keep_unknown_html_tags (bool): This affects SubRip only (SRT),
for other formats this argument is ignored.
By default, HTML tags are converted to equivalent SubStation tags
(eg. ``<i>`` to ``{\\i1}`` and any remaining tags are removed
to keep the text clean. Set this parameter to ``True``
if you want to pass through these tags (eg. ``<sub>``).
This is useful if your output format is SRT and your player
supports these tags.
kwargs: Extra options for the reader.
Returns:
SSAFile
@ -100,7 +103,7 @@ class SSAFile(MutableSequence):
return cls.from_file(fp, format_, fps=fps, **kwargs)
@classmethod
def from_string(cls, string, format_=None, fps=None, **kwargs):
def from_string(cls, string: str, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
"""
Load subtitle file from string.
@ -126,7 +129,7 @@ class SSAFile(MutableSequence):
return cls.from_file(fp, format_, fps=fps, **kwargs)
@classmethod
def from_file(cls, fp, format_=None, fps=None, **kwargs):
def from_file(cls, fp: io.TextIOBase, format_: Optional[str]=None, fps: Optional[float]=None, **kwargs) -> "SSAFile":
"""
Read subtitle file from file object.
@ -160,10 +163,17 @@ class SSAFile(MutableSequence):
impl.from_file(subs, fp, format_, fps=fps, **kwargs)
return subs
def save(self, path, encoding="utf-8", format_=None, fps=None, **kwargs):
def save(self, path: str, encoding: str="utf-8", format_: Optional[str]=None, fps: Optional[float]=None, **kwargs):
"""
Save subtitle file to given path.
This method is implemented in terms of :meth:`SSAFile.to_file()`.
See also:
Specific formats may implement additional saving options,
please refer to documentation of the implementation classes
(eg. :meth:`pysubs2.subrip.SubripFormat.to_file()`)
Arguments:
path (str): Path to subtitle file.
encoding (str): Character encoding of output file.
@ -197,7 +207,7 @@ class SSAFile(MutableSequence):
with open(path, "w", encoding=encoding) as fp:
self.to_file(fp, format_, fps=fps, **kwargs)
def to_string(self, format_, fps=None, **kwargs):
def to_string(self, format_: str, fps: Optional[float]=None, **kwargs) -> str:
"""
Get subtitle file as a string.
@ -211,7 +221,7 @@ class SSAFile(MutableSequence):
self.to_file(fp, format_, fps=fps, **kwargs)
return fp.getvalue()
def to_file(self, fp, format_, fps=None, **kwargs):
def to_file(self, fp: io.TextIOBase, format_: str, fps: Optional[float]=None, **kwargs):
"""
Write subtitle file to file object.
@ -233,7 +243,8 @@ class SSAFile(MutableSequence):
# Retiming subtitles
# ------------------------------------------------------------------------
def shift(self, h=0, m=0, s=0, ms=0, frames=None, fps=None):
def shift(self, h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
frames: Optional[int]=None, fps: Optional[float]=None):
"""
Shift all subtitles by constant time amount.
@ -255,7 +266,7 @@ class SSAFile(MutableSequence):
line.start += delta
line.end += delta
def transform_framerate(self, in_fps, out_fps):
def transform_framerate(self, in_fps: float, out_fps: float):
"""
Rescale all timestamps by ratio of in_fps/out_fps.
@ -282,7 +293,7 @@ class SSAFile(MutableSequence):
# Working with styles
# ------------------------------------------------------------------------
def rename_style(self, old_name, new_name):
def rename_style(self, old_name: str, new_name: str):
"""
Rename a style, including references to it.
@ -311,7 +322,7 @@ class SSAFile(MutableSequence):
if line.style == old_name:
line.style = new_name
def import_styles(self, subs, overwrite=True):
def import_styles(self, subs: "SSAFile", overwrite: bool=True):
"""
Merge in styles from other SSAFile.
@ -332,7 +343,39 @@ class SSAFile(MutableSequence):
# Helper methods
# ------------------------------------------------------------------------
def equals(self, other):
def remove_miscellaneous_events(self):
    """
    Remove subtitles which appear to be non-essential (the --clean in CLI)

    Currently, this removes events matching any of these criteria:
    - SSA event type Comment
    - SSA drawing tags
    - Less than two characters of text
    - Duplicated text with identical time interval (only the first event is kept)

    The surviving events replace ``self.events``; their relative order is
    preserved.
    """
    # Single pass with a set of (start, end, plaintext) keys instead of a
    # list.count() scan per event followed by a second filtering pass.
    seen = set()
    kept = []
    for event in self:
        text = event.plaintext
        key = (event.start, event.end, text)
        is_duplicate = key in seen
        # Every event registers its key, including ones dropped below, so a
        # later event still counts as a duplicate of a dropped one.
        seen.add(key)
        if event.is_drawing or event.is_comment:
            continue
        if len(text.strip()) < 2:
            continue
        if is_duplicate:
            continue
        kept.append(event)

    self.events = kept
def equals(self, other: "SSAFile"):
"""
Equality of two SSAFiles.
@ -357,6 +400,18 @@ class SSAFile(MutableSequence):
logging.debug("info %r differs (self=%r, other=%r)", key, sv, ov)
return False
for key in set(chain(self.fonts_opaque.keys(), other.fonts_opaque.keys())):
sv, ov = self.fonts_opaque.get(key), other.fonts_opaque.get(key)
if sv is None:
logging.debug("%r missing in self.fonts_opaque", key)
return False
elif ov is None:
logging.debug("%r missing in other.fonts_opaque", key)
return False
elif sv != ov:
logging.debug("fonts_opaque %r differs (self=%r, other=%r)", key, sv, ov)
return False
for key in set(chain(self.styles.keys(), other.styles.keys())):
sv, ov = self.styles.get(key), other.styles.get(key)
if sv is None:
@ -389,12 +444,10 @@ class SSAFile(MutableSequence):
def __repr__(self):
    # An empty file has no timestamps to report, so it gets a shorter summary.
    if not self.events:
        return f"<SSAFile with 0 events and {len(self.styles)} styles>"
    max_time = max(ev.end for ev in self)
    return f"<SSAFile with {len(self)} events and {len(self.styles)} styles, last timestamp {ms_to_str(max_time)}>"
# ------------------------------------------------------------------------
@ -405,22 +458,25 @@ class SSAFile(MutableSequence):
"""Sort subtitles time-wise, in-place."""
self.events.sort()
def __getitem__(self, item):
def __iter__(self) -> Iterable[SSAEvent]:
    """Iterate over ``self.events`` in list order."""
    return iter(self.events)
def __getitem__(self, item: int):
    """Return ``self.events[item]``; indexing is delegated to the events list."""
    return self.events[item]
def __setitem__(self, key, value):
def __setitem__(self, key: int, value: SSAEvent):
    """
    Store ``value`` at ``self.events[key]``.

    Raises:
        TypeError: If ``value`` is not an SSAEvent.
    """
    if not isinstance(value, SSAEvent):
        raise TypeError("SSAFile.events must contain only SSAEvent objects")
    self.events[key] = value
def __delitem__(self, key):
def __delitem__(self, key: int):
    """Delete ``self.events[key]``."""
    del self.events[key]
def __len__(self):
    """Return the number of events in ``self.events``."""
    return len(self.events)
def insert(self, index, value):
def insert(self, index: int, value: SSAEvent):
if isinstance(value, SSAEvent):
self.events.insert(index, value)
else:

View file

@ -1,8 +1,11 @@
from __future__ import unicode_literals
from .common import Color, PY3
import warnings
from typing import Dict, Any, ClassVar
import dataclasses
from .common import Color
class SSAStyle(object):
@dataclasses.dataclass(repr=False)
class SSAStyle:
"""
A SubStation Style.
@ -17,71 +20,57 @@ class SSAStyle(object):
This class defines equality (equality of all fields).
"""
DEFAULT_STYLE = None
DEFAULT_STYLE: ClassVar["SSAStyle"] = None
#: All fields in SSAStyle.
FIELDS = frozenset([
"fontname", "fontsize", "primarycolor", "secondarycolor",
"tertiarycolor", "outlinecolor", "backcolor",
"bold", "italic", "underline", "strikeout",
"scalex", "scaley", "spacing", "angle", "borderstyle",
"outline", "shadow", "alignment",
"marginl", "marginr", "marginv", "alphalevel", "encoding"
])
@property
def FIELDS(self):
    """
    All fields in SSAStyle (deprecated).

    Emits a DeprecationWarning and returns a frozenset with the names of
    the dataclass fields.
    """
    warnings.warn("Deprecated in 1.2.0 - it's a dataclass now", DeprecationWarning)
    names = (f.name for f in dataclasses.fields(self))
    return frozenset(names)
def __init__(self, **fields):
self.fontname = "Arial" #: Font name
self.fontsize = 20.0 #: Font size (in pixels)
self.primarycolor = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance)
self.secondarycolor = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance)
self.tertiarycolor = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance)
self.outlinecolor = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance)
self.backcolor = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
self.bold = False #: Bold
self.italic = False #: Italic
self.underline = False #: Underline (ASS only)
self.strikeout = False #: Strikeout (ASS only)
self.drawing = False #: Drawing (ASS only, see http://docs.aegisub.org/3.1/ASS_Tags/#drawing-tags
self.scalex = 100.0 #: Horizontal scaling (ASS only)
self.scaley = 100.0 #: Vertical scaling (ASS only)
self.spacing = 0.0 #: Letter spacing (ASS only)
self.angle = 0.0 #: Rotation (ASS only)
self.borderstyle = 1 #: Border style
self.outline = 2.0 #: Outline width (in pixels)
self.shadow = 2.0 #: Shadow depth (in pixels)
self.alignment = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
self.marginl = 10 #: Left margin (in pixels)
self.marginr = 10 #: Right margin (in pixels)
self.marginv = 10 #: Vertical margin (in pixels)
self.alphalevel = 0 #: Old, unused SSA-only field
self.encoding = 1 #: Charset
fontname: str = "Arial" #: Font name
fontsize: float = 20.0 #: Font size (in pixels)
primarycolor: Color = Color(255, 255, 255, 0) #: Primary color (:class:`pysubs2.Color` instance)
secondarycolor: Color = Color(255, 0, 0, 0) #: Secondary color (:class:`pysubs2.Color` instance)
tertiarycolor: Color = Color(0, 0, 0, 0) #: Tertiary color (:class:`pysubs2.Color` instance)
outlinecolor: Color = Color(0, 0, 0, 0) #: Outline color (:class:`pysubs2.Color` instance)
backcolor: Color = Color(0, 0, 0, 0) #: Back, ie. shadow color (:class:`pysubs2.Color` instance)
bold: bool = False #: Bold
italic: bool = False #: Italic
underline: bool = False #: Underline (ASS only)
strikeout: bool = False #: Strikeout (ASS only)
scalex: float = 100.0 #: Horizontal scaling (ASS only)
scaley: float = 100.0 #: Vertical scaling (ASS only)
spacing: float = 0.0 #: Letter spacing (ASS only)
angle: float = 0.0 #: Rotation (ASS only)
borderstyle: int = 1 #: Border style
outline: float = 2.0 #: Outline width (in pixels)
shadow: float = 2.0 #: Shadow depth (in pixels)
alignment: int = 2 #: Numpad-style alignment, eg. 7 is "top left" (that is, ASS alignment semantics)
marginl: int = 10 #: Left margin (in pixels)
marginr: int = 10 #: Right margin (in pixels)
marginv: int = 10 #: Vertical margin (in pixels)
alphalevel: int = 0 #: Old, unused SSA-only field
encoding: int = 1 #: Charset
for k, v in fields.items():
if k in self.FIELDS:
setattr(self, k, v)
else:
raise ValueError("SSAStyle has no field named %r" % k)
# The following attributes cannot be defined for SSA styles themselves,
# but can be used in override tags and thus are useful to keep here
# for the `pysubs2.substation.parse_tags()` interface which returns
# SSAStyles for text fragments.
drawing: bool = False #: Indicates that text span is a SSA vector drawing, see `pysubs2.substation.parse_tags()`
def copy(self):
def copy(self) -> "SSAStyle":
return SSAStyle(**self.as_dict())
def as_dict(self):
return {field: getattr(self, field) for field in self.FIELDS}
def __eq__(self, other):
return self.as_dict() == other.as_dict()
def __ne__(self, other):
return not self == other
def as_dict(self) -> Dict[str, Any]:
# dataclasses.asdict() would recursively dictify Color objects, which we don't want
return {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
def __repr__(self):
s = "<SSAStyle "
s += "%rpx " % self.fontsize
if self.bold: s += "bold "
if self.italic: s += "italic "
s += "{!r}>".format(self.fontname)
if not PY3: s = s.encode("utf-8")
return s
return f"<SSAStyle {self.fontsize!r}px" \
f"{' bold' if self.bold else ''}" \
f"{' italic' if self.italic else ''}" \
f" {self.fontname!r}>"
SSAStyle.DEFAULT_STYLE = SSAStyle()

View file

@ -1,5 +1,3 @@
from __future__ import print_function, unicode_literals
import re
from .formatbase import FormatBase
from .ssaevent import SSAEvent
@ -21,25 +19,50 @@ def ms_to_timestamp(ms):
class SubripFormat(FormatBase):
"""SubRip Text (SRT) subtitle format implementation"""
TIMESTAMP = TIMESTAMP
@staticmethod
def timestamp_to_ms(groups):
return timestamp_to_ms(groups)
@classmethod
def guess_format(cls, text):
"""See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
if "[Script Info]" in text or "[V4+ Styles]" in text:
# disambiguation vs. SSA/ASS
return None
if text.lstrip().startswith("WEBVTT"):
# disambiguation vs. WebVTT
return None
for line in text.splitlines():
if len(TIMESTAMP.findall(line)) == 2:
if len(cls.TIMESTAMP.findall(line)) == 2:
return "srt"
@classmethod
def from_file(cls, subs, fp, format_, keep_unknown_html_tags=False, **kwargs):
"""
See :meth:`pysubs2.formats.FormatBase.from_file()`
Supported tags:
- ``<i>``
- ``<u>``
- ``<s>``
Keyword args:
keep_unknown_html_tags: If True, HTML tags other than i/u/s will be kept as-is.
Otherwise, they will be stripped from input.
"""
timestamps = [] # (start, end)
following_lines = [] # contains lists of lines following each timestamp
for line in fp:
stamps = TIMESTAMP.findall(line)
stamps = cls.TIMESTAMP.findall(line)
if len(stamps) == 2: # timestamp line
start, end = map(timestamp_to_ms, stamps)
start, end = map(cls.timestamp_to_ms, stamps)
timestamps.append((start, end))
following_lines.append([])
else:
@ -72,16 +95,26 @@ class SubripFormat(FormatBase):
for (start, end), lines in zip(timestamps, following_lines)]
@classmethod
def to_file(cls, subs, fp, format_, **kwargs):
def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
"""
See :meth:`pysubs2.formats.FormatBase.to_file()`
Italic, underline and strikeout styling is supported.
Keyword args:
apply_styles: If False, do not write any styling.
"""
def prepare_text(text, style):
body = []
for fragment, sty in parse_tags(text, style, subs.styles):
fragment = fragment.replace(r"\h", " ")
fragment = fragment.replace(r"\n", "\n")
fragment = fragment.replace(r"\N", "\n")
if sty.italic: fragment = "<i>%s</i>" % fragment
if sty.underline: fragment = "<u>%s</u>" % fragment
if sty.strikeout: fragment = "<s>%s</s>" % fragment
if apply_styles:
if sty.italic: fragment = "<i>%s</i>" % fragment
if sty.underline: fragment = "<u>%s</u>" % fragment
if sty.strikeout: fragment = "<s>%s</s>" % fragment
if sty.drawing: raise ContentNotUsable
body.append(fragment)
@ -89,7 +122,8 @@ class SubripFormat(FormatBase):
visible_lines = (line for line in subs if not line.is_comment)
for i, line in enumerate(visible_lines, 1):
lineno = 1
for line in visible_lines:
start = ms_to_timestamp(line.start)
end = ms_to_timestamp(line.end)
try:
@ -97,6 +131,7 @@ class SubripFormat(FormatBase):
except ContentNotUsable:
continue
print("%d" % i, file=fp) # Python 2.7 compat
print("%d" % lineno, file=fp) # Python 2.7 compat
print(start, "-->", end, file=fp)
print(text, end="\n\n", file=fp)
lineno += 1

View file

@ -1,10 +1,10 @@
from __future__ import print_function, division, unicode_literals
import logging
import re
from numbers import Number
from .formatbase import FormatBase
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
from .common import text_type, Color, PY3, binary_string_type
from .common import Color
from .time import make_time, ms_to_times, timestamp_to_ms, TIMESTAMP
SSA_ALIGNMENT = (1, 2, 3, 9, 10, 11, 5, 6, 7)
@ -15,7 +15,14 @@ def ass_to_ssa_alignment(i):
def ssa_to_ass_alignment(i):
return SSA_ALIGNMENT.index(i) + 1
SECTION_HEADING = re.compile(r"^.{,3}\[[^\]]+\]") # allow for UTF-8 BOM, which is 3 bytes
SECTION_HEADING = re.compile(
r"^.{,3}" # allow 3 chars at start of line for BOM
r"\[" # open square bracket
r"[^]]*[a-z][^]]*" # inside square brackets, at least one lowercase letter (this guards vs. uuencoded font data)
r"]" # close square bracket
)
FONT_FILE_HEADING = re.compile(r"fontname:\s+(\S+)")
STYLE_FORMAT_LINE = {
"ass": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic,"
@ -46,7 +53,7 @@ EVENT_FIELDS = {
#: Largest timestamp allowed in SubStation, ie. 9:59:59.99.
MAX_REPRESENTABLE_TIME = make_time(h=10) - 10
def ms_to_timestamp(ms):
def ms_to_timestamp(ms: int) -> str:
"""Convert ms to 'H:MM:SS.cc'"""
# XXX throw on overflow/underflow?
if ms < 0: ms = 0
@ -54,28 +61,24 @@ def ms_to_timestamp(ms):
h, m, s, ms = ms_to_times(ms)
return "%01d:%02d:%02d.%02d" % (h, m, s, ms//10)
def color_to_ass_rgba(c):
def color_to_ass_rgba(c: Color) -> str:
    """
    Format a color as an ASS color literal.

    The channels are packed into one integer with red in the lowest byte,
    then green, blue and alpha, and written as ``&H`` followed by at least
    eight uppercase hexadecimal digits.
    """
    packed = c.r | (c.g << 8) | (c.b << 16) | (c.a << 24)
    return "&H%08X" % packed
def color_to_ssa_rgb(c):
def color_to_ssa_rgb(c: Color) -> str:
    """
    Format a color as an SSA color field.

    The red, green and blue channels are packed into one integer (red in the
    lowest byte) and written in decimal; the alpha channel is not included.
    """
    packed = c.r | (c.g << 8) | (c.b << 16)
    return "%d" % packed
def ass_rgba_to_color(s):
x = int(s[2:], base=16)
def rgba_to_color(s: str) -> Color:
    """
    Parse a SubStation color field into a :class:`Color`.

    A string starting with ``&`` is read as hexadecimal after skipping its
    first two characters; any other string is read as a decimal integer.
    The resulting integer holds the channels byte by byte, lowest byte
    first: red, green, blue, alpha.
    """
    value = int(s[2:], base=16) if s[0] == '&' else int(s)
    # Peel one byte per channel, starting from the least significant one.
    red, green, blue, alpha = ((value >> shift) & 0xff for shift in (0, 8, 16, 24))
    return Color(red, green, blue, alpha)
def ssa_rgb_to_color(s):
x = int(s)
r = x & 0xff
g = (x >> 8) & 0xff
b = (x >> 16) & 0xff
return Color(r, g, b)
def is_valid_field_content(s):
def is_valid_field_content(s: str) -> bool:
"""
Returns True if string s can be stored in a SubStation field.
@ -140,8 +143,10 @@ def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles={}):
NOTICE = "Script generated by pysubs2\nhttps://pypi.python.org/pypi/pysubs2"
class SubstationFormat(FormatBase):
"""SubStation Alpha (ASS, SSA) subtitle format implementation"""
@classmethod
def guess_format(cls, text):
"""See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
if "V4+ Styles" in text:
return "ass"
elif "V4 Styles" in text:
@ -149,6 +154,7 @@ class SubstationFormat(FormatBase):
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
"""See :meth:`pysubs2.formats.FormatBase.from_file()`"""
def string_to_field(f, v):
if f in {"start", "end"}:
@ -159,10 +165,7 @@ class SubstationFormat(FormatBase):
else:
return timestamp_to_ms(TIMESTAMP.match(v).groups())
elif "color" in f:
if format_ == "ass":
return ass_rgba_to_color(v)
else:
return ssa_rgb_to_color(v)
return rgba_to_color(v)
elif f in {"bold", "underline", "italic", "strikeout"}:
return v == "-1"
elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}:
@ -183,16 +186,22 @@ class SubstationFormat(FormatBase):
subs.info.clear()
subs.aegisub_project.clear()
subs.styles.clear()
subs.fonts_opaque.clear()
inside_info_section = False
inside_aegisub_section = False
inside_font_section = False
current_font_name = None
current_font_lines_buffer = []
for line in fp:
for lineno, line in enumerate(fp, 1):
line = line.strip()
if SECTION_HEADING.match(line):
logging.debug("at line %d: section heading %s", lineno, line)
inside_info_section = "Info" in line
inside_aegisub_section = "Aegisub" in line
inside_font_section = "Fonts" in line
elif inside_info_section or inside_aegisub_section:
if line.startswith(";"): continue # skip comments
try:
@ -203,6 +212,24 @@ class SubstationFormat(FormatBase):
subs.aegisub_project[k] = v.strip()
except ValueError:
pass
elif inside_font_section:
m = FONT_FILE_HEADING.match(line)
if current_font_name and (m or not line):
# flush last font on newline or new font name
font_data = current_font_lines_buffer[:]
subs.fonts_opaque[current_font_name] = font_data
logging.debug("at line %d: finished font definition %s", lineno, current_font_name)
current_font_lines_buffer.clear()
current_font_name = None
if m:
# start new font
font_name = m.group(1)
current_font_name = font_name
elif line:
# add non-empty line to current buffer
current_font_lines_buffer.append(line)
elif line.startswith("Style:"):
_, rest = line.split(":", 1)
buf = rest.strip().split(",")
@ -218,9 +245,18 @@ class SubstationFormat(FormatBase):
ev = SSAEvent(**field_dict)
subs.events.append(ev)
# cleanup fonts
if current_font_name:
# flush last font on EOF or new section w/o newline
font_data = current_font_lines_buffer[:]
subs.fonts_opaque[current_font_name] = font_data
logging.debug("at EOF: finished font definition %s", current_font_name)
current_font_lines_buffer.clear()
current_font_name = None
@classmethod
def to_file(cls, subs, fp, format_, header_notice=NOTICE, **kwargs):
"""See :meth:`pysubs2.formats.FormatBase.to_file()`"""
print("[Script Info]", file=fp)
for line in header_notice.splitlines(False):
print(";", line, file=fp)
@ -240,19 +276,11 @@ class SubstationFormat(FormatBase):
elif f == "marked":
return "Marked=%d" % v
elif f == "alignment" and format_ == "ssa":
return text_type(ass_to_ssa_alignment(v))
return str(ass_to_ssa_alignment(v))
elif isinstance(v, bool):
return "-1" if v else "0"
elif isinstance(v, (text_type, Number)):
return text_type(v)
elif not PY3 and isinstance(v, binary_string_type):
# A convenience feature, see issue #12 - accept non-unicode strings
# when they are ASCII; this is useful in Python 2, especially for non-text
# fields like style names, where requiring Unicode type seems too stringent
if all(ord(c) < 128 for c in v):
return text_type(v)
else:
raise TypeError("Encountered binary string with non-ASCII codepoint in SubStation field {!r} for line {!r} - please use unicode string instead of str".format(f, line))
elif isinstance(v, (str, Number)):
return str(v)
elif isinstance(v, Color):
if format_ == "ass":
return color_to_ass_rgba(v)
@ -267,6 +295,14 @@ class SubstationFormat(FormatBase):
fields = [field_to_string(f, getattr(sty, f), sty) for f in STYLE_FIELDS[format_]]
print("Style: %s" % name, *fields, sep=",", file=fp)
if subs.fonts_opaque:
print("\n[Fonts]", file=fp)
for font_name, font_lines in sorted(subs.fonts_opaque.items()):
print("fontname: {}".format(font_name), file=fp)
for line in font_lines:
print(line, file=fp)
print(file=fp)
print("\n[Events]", file=fp)
print(EVENT_FORMAT_LINE[format_], file=fp)
for ev in subs.events:

View file

@ -1,15 +1,19 @@
from __future__ import division
from collections import namedtuple
import re
#: Pattern that matches both SubStation and SubRip timestamps.
from typing import Optional, List, Tuple, Sequence
from pysubs2.common import IntOrFloat
TIMESTAMP = re.compile(r"(\d{1,2}):(\d{2}):(\d{2})[.,](\d{2,3})")
Times = namedtuple("Times", ["h", "m", "s", "ms"])
def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
def make_time(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0,
frames: Optional[int]=None, fps: Optional[float]=None):
"""
Convert time to milliseconds.
@ -33,7 +37,8 @@ def make_time(h=0, m=0, s=0, ms=0, frames=None, fps=None):
else:
raise ValueError("Both fps and frames must be specified")
def timestamp_to_ms(groups):
def timestamp_to_ms(groups: Sequence[str]):
"""
Convert groups from :data:`pysubs2.time.TIMESTAMP` match to milliseconds.
@ -49,7 +54,8 @@ def timestamp_to_ms(groups):
ms += h * 3600000
return ms
def tmptimestamp_to_ms(groups):
def tmptimestamp_to_ms(groups: Sequence[str]):
"""
Convert groups from :data:`pysubs2.time.TMPTIMESTAMP` match to milliseconds.
@ -63,7 +69,9 @@ def tmptimestamp_to_ms(groups):
ms += m * 60000
ms += h * 3600000
return ms
def times_to_ms(h=0, m=0, s=0, ms=0):
def times_to_ms(h: IntOrFloat=0, m: IntOrFloat=0, s: IntOrFloat=0, ms: IntOrFloat=0) -> int:
"""
Convert hours, minutes, seconds to milliseconds.
@ -79,7 +87,8 @@ def times_to_ms(h=0, m=0, s=0, ms=0):
ms += h * 3600000
return int(round(ms))
def frames_to_ms(frames, fps):
def frames_to_ms(frames: int, fps: float) -> int:
"""
Convert frame-based duration to milliseconds.
@ -99,7 +108,8 @@ def frames_to_ms(frames, fps):
return int(round(frames * (1000 / fps)))
def ms_to_frames(ms, fps):
def ms_to_frames(ms: IntOrFloat, fps: float) -> int:
"""
Convert milliseconds to number of frames.
@ -119,7 +129,8 @@ def ms_to_frames(ms, fps):
return int(round((ms / 1000) * fps))
def ms_to_times(ms):
def ms_to_times(ms: IntOrFloat) -> Tuple[int, int, int, int]:
"""
Convert milliseconds to normalized tuple (h, m, s, ms).
@ -138,7 +149,8 @@ def ms_to_times(ms):
s, ms = divmod(ms, 1000)
return Times(h, m, s, ms)
def ms_to_str(ms, fractions=False):
def ms_to_str(ms: IntOrFloat, fractions: bool=False) -> str:
"""
Prettyprint milliseconds to [-]H:MM:SS[.mmm]
@ -156,6 +168,6 @@ def ms_to_str(ms, fractions=False):
sgn = "-" if ms < 0 else ""
h, m, s, ms = ms_to_times(abs(ms))
if fractions:
return sgn + "{:01d}:{:02d}:{:02d}.{:03d}".format(h, m, s, ms)
return f"{sgn}{h:01d}:{m:02d}:{s:02d}.{ms:03d}"
else:
return sgn + "{:01d}:{:02d}:{:02d}".format(h, m, s)
return f"{sgn}{h:01d}:{m:02d}:{s:02d}"

View file

@ -1,5 +1,3 @@
from __future__ import print_function, unicode_literals
import re
from .formatbase import FormatBase
from .ssaevent import SSAEvent
@ -15,6 +13,7 @@ TMP_LINE = re.compile(r"(\d{1,2}:\d{2}:\d{2}):(.+)")
#: Largest timestamp allowed in Tmp, ie. 99:59:59.
MAX_REPRESENTABLE_TIME = make_time(h=100) - 1
def ms_to_timestamp(ms):
"""Convert ms to 'HH:MM:SS'"""
# XXX throw on overflow/underflow?
@ -25,8 +24,10 @@ def ms_to_timestamp(ms):
class TmpFormat(FormatBase):
"""TMP subtitle format implementation"""
@classmethod
def guess_format(cls, text):
"""See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
if "[Script Info]" in text or "[V4+ Styles]" in text:
# disambiguation vs. SSA/ASS
return None
@ -37,8 +38,14 @@ class TmpFormat(FormatBase):
@classmethod
def from_file(cls, subs, fp, format_, **kwargs):
timestamps = [] # (start)
lines = [] # contains lists of lines following each timestamp
"""See :meth:`pysubs2.formats.FormatBase.from_file()`"""
events = []
def prepare_text(text):
text = text.replace("|", r"\N") # convert newlines
text = re.sub(r"< *u *>", "{\\\\u1}", text) # not r" for Python 2.7 compat, triggers unicodeescape
text = re.sub(r"< */? *[a-zA-Z][^>]*>", "", text) # strip other HTML tags
return text
for line in fp:
match = TMP_LINE.match(line)
@ -47,42 +54,54 @@ class TmpFormat(FormatBase):
start, text = match.groups()
start = tmptimestamp_to_ms(TMPTIMESTAMP.match(start).groups())
#calculate endtime from starttime + 500 miliseconds + 67 miliseconds per each character (15 chars per second)
end = start + 500 + (len(line) * 67)
timestamps.append((start, end))
lines.append(text)
def prepare_text(lines):
lines = lines.replace("|", r"\N") # convert newlines
lines = re.sub(r"< *u *>", "{\\\\u1}", lines) # not r" for Python 2.7 compat, triggers unicodeescape
lines = re.sub(r"< */? *[a-zA-Z][^>]*>", "", lines) # strip other HTML tags
return lines
# Unfortunately, end timestamp is not given; try to estimate something reasonable:
# start + 500 ms + 67 ms/character (15 chars per second)
end_guess = start + 500 + (len(line) * 67)
subs.events = [SSAEvent(start=start, end=end, text=prepare_text(lines))
for (start, end), lines in zip(timestamps, lines)]
event = SSAEvent(start=start, end=end_guess, text=prepare_text(text))
events.append(event)
# correct any overlapping subtitles created by end_guess
for i in range(len(events) - 1):
events[i].end = min(events[i].end, events[i+1].start)
subs.events = events
@classmethod
def to_file(cls, subs, fp, format_, **kwargs):
def to_file(cls, subs, fp, format_, apply_styles=True, **kwargs):
"""
See :meth:`pysubs2.formats.FormatBase.to_file()`
Italic, underline and strikeout styling is supported.
Keyword args:
apply_styles: If False, do not write any styling.
"""
def prepare_text(text, style):
body = []
skip = False
for fragment, sty in parse_tags(text, style, subs.styles):
fragment = fragment.replace(r"\h", " ")
fragment = fragment.replace(r"\n", "\n")
fragment = fragment.replace(r"\N", "\n")
if sty.italic: fragment = "<i>%s</i>" % fragment
if sty.underline: fragment = "<u>%s</u>" % fragment
if sty.strikeout: fragment = "<s>%s</s>" % fragment
if apply_styles:
if sty.italic: fragment = "<i>%s</i>" % fragment
if sty.underline: fragment = "<u>%s</u>" % fragment
if sty.strikeout: fragment = "<s>%s</s>" % fragment
if sty.drawing: skip = True
body.append(fragment)
return re.sub("\n+", "\n", "".join(body).strip())
if skip:
return ""
else:
return re.sub("\n+", "\n", "".join(body).strip())
visible_lines = (line for line in subs if not line.is_comment)
for i, line in enumerate(visible_lines, 1):
for line in visible_lines:
start = ms_to_timestamp(line.start)
#end = ms_to_timestamp(line.end)
text = prepare_text(line.text, subs.styles.get(line.style, SSAStyle.DEFAULT_STYLE))
#print("%d" % i, file=fp) # Python 2.7 compat
print(start + ":" + text, end="\n", file=fp)
#print(text, end="\n\n", file=fp)

36
libs/pysubs2/webvtt.py Normal file
View file

@ -0,0 +1,36 @@
import re
from .subrip import SubripFormat
from .time import make_time
class WebVTTFormat(SubripFormat):
    """
    Web Video Text Tracks (WebVTT) subtitle format implementation

    Currently, this shares implementation with :class:`pysubs2.subrip.SubripFormat`;
    only the timestamp syntax, the format detection and the file header differ.
    """
    #: WebVTT timestamp: optional hours (captured together with the trailing colon),
    #: minutes, seconds and a fraction that is separated by a period.
    TIMESTAMP = re.compile(r"(\d{0,4}:)?(\d{2}):(\d{2})\.(\d{2,3})")

    #: Cue timing line in SubRip notation (``HH:MM:SS,mmm --> HH:MM:SS,mmm``),
    #: used by :meth:`to_file` to find the lines whose decimal comma must be replaced.
    _SRT_TIMING_LINE = re.compile(r"^(\d+:\d{2}:\d{2}),(\d{3}) --> (\d+:\d{2}:\d{2}),(\d{3})$")

    @staticmethod
    def timestamp_to_ms(groups):
        """
        Convert groups from a :attr:`TIMESTAMP` match to milliseconds.

        The hours group is optional; when it is empty or missing,
        the timestamp is taken to have zero hours.
        """
        _h, _m, _s, _ms = groups
        h = int(_h.strip(":")) if _h else 0
        m, s, ms = map(int, (_m, _s, _ms))
        return make_time(h=h, m=m, s=s, ms=ms)

    @classmethod
    def guess_format(cls, text):
        """See :meth:`pysubs2.formats.FormatBase.guess_format()`"""
        if text.lstrip().startswith("WEBVTT"):
            return "vtt"
        return None

    @classmethod
    def to_file(cls, subs, fp, format_, **kwargs):
        """
        See :meth:`pysubs2.formats.FormatBase.to_file()`

        The cues themselves are produced by ``SubripFormat.to_file()``. SubRip
        writes cue timings with a decimal comma, while WebVTT requires a period
        (see :attr:`TIMESTAMP`), so the timing lines are rewritten before they
        reach ``fp``. All other lines are passed through unchanged.
        """
        import io  # only needed for the intermediate buffer below

        print("WEBVTT\n", file=fp)

        # Render into a buffer first: the SubRip writer offers no hook for the
        # timestamp format, so the conversion has to happen on its output.
        srt_output = io.StringIO()
        result = SubripFormat.to_file(subs=subs, fp=srt_output, format_=format_, **kwargs)

        for line in srt_output.getvalue().splitlines(keepends=True):
            # Only a line consisting solely of "start --> end" is touched;
            # if the timings already use periods nothing matches and nothing changes.
            fp.write(cls._SRT_TIMING_LINE.sub(r"\1.\2 --> \3.\4", line))
        return result