bazarr/libs/pysubs2/substation.py

import logging
import re
from numbers import Number
from .formatbase import FormatBase
from .ssaevent import SSAEvent
from .ssastyle import SSAStyle
from .common import Color
from .time import make_time, ms_to_times, timestamp_to_ms, TIMESTAMP, TIMESTAMP_SHORT

# Lookup table between the two alignment numbering schemes used in this module:
# the entry at index ``n - 1`` is the SSA alignment code paired with ASS
# alignment ``n`` (see ass_to_ssa_alignment() and ssa_to_ass_alignment()).
SSA_ALIGNMENT = (1, 2, 3, 9, 10, 11, 5, 6, 7)

def ass_to_ssa_alignment(i):
    """
    Return the SSA alignment code stored for ASS alignment ``i``.

    ``i`` is used as a 1-based position in ``SSA_ALIGNMENT``.

    """
    zero_based = i - 1
    return SSA_ALIGNMENT[zero_based]

def ssa_to_ass_alignment(i):
    """
    Return the ASS alignment that corresponds to SSA alignment code ``i``.

    The result is the 1-based position of ``i`` in ``SSA_ALIGNMENT``;
    ``ValueError`` is raised when ``i`` is not present in the table.

    """
    position = SSA_ALIGNMENT.index(i)
    return position + 1

# Matches a line that opens a new section of the script, e.g. "[Script Info]".
# The parser in SubstationFormat.from_file() uses it to switch section state.
SECTION_HEADING = re.compile(
    r"^.{,3}"  # allow 3 chars at start of line for BOM
    r"\["  # open square bracket
    r"[^]]*[a-z][^]]*"  # inside square brackets, at least one lowercase letter (this guards vs. uuencoded font data)
    r"]"  # close square bracket
)

# Matches the "fontname: <name>" line that starts an embedded font definition;
# group 1 captures the font name (a run of non-whitespace characters).
FONT_FILE_HEADING = re.compile(r"fontname:\s+(\S+)")

# "Format:" header line written right after the styles section heading,
# keyed by format identifier ("ass" or "ssa").
STYLE_FORMAT_LINE = {
    "ass": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic,"
           " Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment,"
           " MarginL, MarginR, MarginV, Encoding",
    "ssa": "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, TertiaryColour, BackColour, Bold, Italic,"
           " BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, AlphaLevel, Encoding"
}

# SSAStyle attribute names, in the order their values follow the style name
# in a "Style:" line, keyed by format identifier. The style name itself is
# handled separately and is therefore not listed here.
STYLE_FIELDS = {
    "ass": ["fontname", "fontsize", "primarycolor", "secondarycolor", "outlinecolor", "backcolor", "bold", "italic",
            "underline", "strikeout", "scalex", "scaley", "spacing", "angle", "borderstyle", "outline", "shadow",
            "alignment", "marginl", "marginr", "marginv", "encoding"],
    "ssa": ["fontname", "fontsize", "primarycolor", "secondarycolor", "tertiarycolor", "backcolor", "bold", "italic",
            "borderstyle", "outline", "shadow", "alignment", "marginl", "marginr", "marginv", "alphalevel", "encoding"]
}

# "Format:" header line written right after the "[Events]" section heading,
# keyed by format identifier ("ass" or "ssa").
EVENT_FORMAT_LINE = {
    "ass": "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text",
    "ssa": "Format: Marked, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text"
}

# SSAEvent attribute names, in the order their values appear in a
# "Dialogue:"/"Comment:" line, keyed by format identifier. "text" comes last,
# so the parser can limit the number of comma splits and keep commas in text.
EVENT_FIELDS = {
    "ass": ["layer", "start", "end", "style", "name", "marginl", "marginr", "marginv", "effect", "text"],
    "ssa": ["marked", "start", "end", "style", "name", "marginl", "marginr", "marginv", "effect", "text"]
}

#: Largest timestamp allowed in SubStation, ie. 9:59:59.99.
#: Computed as ten hours minus 10; SubstationFormat.ms_to_timestamp() clamps
#: larger values down to this bound before formatting.
MAX_REPRESENTABLE_TIME = make_time(h=10) - 10

def color_to_ass_rgba(c: Color) -> str:
    """
    Format color c as an ASS color string.

    The channels are packed into one integer with red in the lowest byte,
    then green, blue and alpha in the highest byte, and rendered as ``&H``
    followed by eight uppercase hexadecimal digits.

    """
    packed = c.r | (c.g << 8) | (c.b << 16) | (c.a << 24)
    return "&H%08X" % packed

def color_to_ssa_rgb(c: Color) -> str:
    """
    Format color c as an SSA color string.

    Red, green and blue are packed into one integer (red in the lowest byte)
    and rendered in decimal; the alpha channel is not included.

    """
    packed = c.r | (c.g << 8) | (c.b << 16)
    return "%d" % packed

def rgba_to_color(s: str) -> Color:
    """
    Parse a SubStation color string into a Color.

    A string starting with ``&`` is read as hexadecimal after skipping its
    first two characters; anything else is read as a decimal integer. The
    resulting number is unpacked bytewise, lowest byte first, into red,
    green, blue and alpha.

    """
    x = int(s[2:], base=16) if s[0] == '&' else int(s)
    r, g, b, a = ((x >> shift) & 0xff for shift in (0, 8, 16, 24))
    return Color(r, g, b, a)

def is_valid_field_content(s: str) -> bool:
    """
    Tell whether string s is safe to store in a SubStation field.

    SubStation fields are serialized in a CSV-like way, so a value that
    contains a comma or a newline cannot be represented and is rejected.

    """
    forbidden = ("\n", ",")
    return not any(ch in s for ch in forbidden)


def parse_tags(text, style=SSAStyle.DEFAULT_STYLE, styles=None):
    """
    Split text into fragments with computed SSAStyles.

    Returns list of tuples (fragment, style), where fragment is a part of text
    between two brace-delimited override sequences, and style is the computed
    styling of the fragment, ie. the original style modified by all override
    sequences before the fragment.

    Newline and non-breakable space overrides are left as-is.

    Supported override tags:

    - i, b, u, s
    - r (with or without style name)
    - p (sets the ``drawing`` flag of the computed style)

    :param text: event text, possibly containing override sequences
    :param style: style of the line; it is copied before being modified,
        but returned as-is when text contains no override sequence
    :param styles: optional mapping of style name to style, used to resolve
        the named form of the r tag; unknown names are ignored. Defaults to
        an empty mapping.

    """
    if styles is None:
        # A fresh mapping per call; a ``{}`` default would be one object
        # shared by every call of this function.
        styles = {}

    fragments = SSAEvent.OVERRIDE_SEQUENCE.split(text)
    if len(fragments) == 1:
        # No override sequence at all -- the whole text uses the line style.
        return [(text, style)]

    def apply_overrides(all_overrides):
        """Return a copy of the line style with all given overrides applied in order."""
        s = style.copy()
        for tag in re.findall(r"\\[ibusp][0-9]|\\r[a-zA-Z_0-9 ]*", all_overrides):
            if tag == r"\r":
                s = style.copy() # reset to original line style
            elif tag.startswith(r"\r"):
                name = tag[2:]
                if name in styles:
                    s = styles[name].copy() # reset to named style
            else:
                # Here the tag is a backslash, one letter and one digit;
                # the flag is switched on only when that digit is "1".
                if "i" in tag: s.italic = "1" in tag
                elif "b" in tag: s.bold = "1" in tag
                elif "u" in tag: s.underline = "1" in tag
                elif "s" in tag: s.strikeout = "1" in tag
                elif "p" in tag:
                    try:
                        scale = int(tag[2:])
                    except (ValueError, IndexError):
                        continue

                    s.drawing = scale > 0
        return s

    overrides = SSAEvent.OVERRIDE_SEQUENCE.findall(text)
    # Fragment number i is styled by the concatenation of the first i override
    # sequences, so each style is recomputed from the line style.
    overrides_prefix_sum = ["".join(overrides[:i]) for i in range(len(overrides) + 1)]
    computed_styles = [apply_overrides(prefix) for prefix in overrides_prefix_sum]
    return list(zip(fragments, computed_styles))


# Default value of the ``header_notice`` argument of SubstationFormat.to_file();
# each of its lines is written as a ";" comment under "[Script Info]".
NOTICE = "Script generated by pysubs2\nhttps://pypi.python.org/pypi/pysubs2"

class SubstationFormat(FormatBase):
    """SubStation Alpha (ASS, SSA) subtitle format implementation"""

    @staticmethod
    def ms_to_timestamp(ms: int) -> str:
        """
        Convert ms to 'H:MM:SS.cc'

        Negative values are clamped to zero and values above
        ``MAX_REPRESENTABLE_TIME`` are clamped to that bound. The last
        component is the millisecond part floor-divided by 10.

        """
        # XXX throw on overflow/underflow?
        if ms < 0:
            ms = 0
        if ms > MAX_REPRESENTABLE_TIME:
            ms = MAX_REPRESENTABLE_TIME
        h, m, s, ms = ms_to_times(ms)
        return "%01d:%02d:%02d.%02d" % (h, m, s, ms//10)

    @classmethod
    def guess_format(cls, text):
        """
        See :meth:`pysubs2.formats.FormatBase.guess_format()`

        Returns "ass" when text contains "V4+ Styles", "ssa" when it contains
        "V4 Styles" (both searched case-insensitively), and None otherwise.

        """
        if re.search(r"V4\+ Styles", text, re.IGNORECASE):
            return "ass"
        elif re.search(r"V4 Styles", text, re.IGNORECASE):
            return "ssa"
        return None

    @classmethod
    def from_file(cls, subs, fp, format_, **kwargs):
        """
        See :meth:`pysubs2.formats.FormatBase.from_file()`

        Clears ``subs.info``, ``subs.aegisub_project``, ``subs.styles`` and
        ``subs.fonts_opaque``, then fills them (and appends to
        ``subs.events``) from the lines of ``fp``. ``format_`` selects the
        field layout ("ass" or "ssa").

        Raises ValueError when a Start/End timestamp cannot be parsed;
        numeric fields are converted with ``int``/``float`` and propagate
        their errors.

        """

        def string_to_field(f: str, v: str):
            """Convert raw string v read for field f into its Python value."""
            # Per issue #45, we should handle the case where there is extra whitespace around the values.
            # Extra whitespace is removed in non-string fields where it would break the parser otherwise,
            # and in font name (where it doesn't really make sense). It is preserved in Dialogue string
            # fields like Text, Name and Effect (to avoid introducing unnecessary change to parser output).

            if f in {"start", "end"}:
                v = v.strip()
                if v.startswith("-"):
                    # handle negative timestamps
                    v = v[1:]
                    sign = -1
                else:
                    sign = 1

                m = TIMESTAMP.match(v)
                if m is None:
                    m = TIMESTAMP_SHORT.match(v)
                    if m is None:
                        raise ValueError("Failed to parse timestamp: {!r}".format(v))

                return sign * timestamp_to_ms(m.groups())
            elif "color" in f:
                return rgba_to_color(v.strip())
            elif f in {"bold", "underline", "italic", "strikeout"}:
                # Strip before comparing: int()/float() below tolerate
                # surrounding whitespace, a plain string comparison does not,
                # so " -1" used to be read as False.
                return v.strip() == "-1"
            elif f in {"borderstyle", "encoding", "marginl", "marginr", "marginv", "layer", "alphalevel"}:
                return int(v)
            elif f in {"fontsize", "scalex", "scaley", "spacing", "angle", "outline", "shadow"}:
                return float(v)
            elif f == "marked":
                # Written as "Marked=<digit>"; ignore whitespace before the comma.
                return v.strip().endswith("1")
            elif f == "alignment":
                i = int(v)
                if format_ == "ass":
                    return i
                else:
                    return ssa_to_ass_alignment(i)
            elif f == "fontname":
                return v.strip()
            else:
                return v

        subs.info.clear()
        subs.aegisub_project.clear()
        subs.styles.clear()
        subs.fonts_opaque.clear()

        # Parser state: which section the current line belongs to, and the
        # embedded font (if any) whose data lines are being collected.
        inside_info_section = False
        inside_aegisub_section = False
        inside_font_section = False
        current_font_name = None
        current_font_lines_buffer = []

        for lineno, line in enumerate(fp, 1):
            line = line.strip()

            if SECTION_HEADING.match(line):
                logging.debug("at line %d: section heading %s", lineno, line)
                inside_info_section = "Info" in line
                inside_aegisub_section = "Aegisub" in line
                inside_font_section = "Fonts" in line
            elif inside_info_section or inside_aegisub_section:
                if line.startswith(";"):
                    continue # skip comments

                k, colon, v = line.partition(":")
                if not colon:
                    # not a "key: value" line (e.g. blank line), ignore it
                    continue

                if inside_info_section:
                    subs.info[k] = v.strip()
                elif inside_aegisub_section:
                    subs.aegisub_project[k] = v.strip()
            elif inside_font_section:
                m = FONT_FILE_HEADING.match(line)

                if current_font_name and (m or not line):
                    # flush last font on newline or new font name
                    font_data = current_font_lines_buffer[:]
                    subs.fonts_opaque[current_font_name] = font_data
                    logging.debug("at line %d: finished font definition %s", lineno, current_font_name)
                    current_font_lines_buffer.clear()
                    current_font_name = None

                if m:
                    # start new font
                    font_name = m.group(1)
                    current_font_name = font_name
                elif line:
                    # add non-empty line to current buffer
                    current_font_lines_buffer.append(line)
            elif line.startswith("Style:"):
                _, rest = line.split(":", 1)
                # first value is the style name, the rest follows STYLE_FIELDS
                name, *raw_fields = rest.strip().split(",")
                field_dict = {f: string_to_field(f, v) for f, v in zip(STYLE_FIELDS[format_], raw_fields)}
                sty = SSAStyle(**field_dict)
                subs.styles[name] = sty
            elif line.startswith("Dialogue:") or line.startswith("Comment:"):
                ev_type, rest = line.split(":", 1)
                # limit the number of splits so that commas in the last
                # field (Text) are kept
                raw_fields = rest.strip().split(",", len(EVENT_FIELDS[format_])-1)
                field_dict = {f: string_to_field(f, v) for f, v in zip(EVENT_FIELDS[format_], raw_fields)}
                field_dict["type"] = ev_type
                ev = SSAEvent(**field_dict)
                subs.events.append(ev)

        # cleanup fonts
        if current_font_name:
            # flush last font on EOF or new section w/o newline
            font_data = current_font_lines_buffer[:]
            subs.fonts_opaque[current_font_name] = font_data
            logging.debug("at EOF: finished font definition %s", current_font_name)
            current_font_lines_buffer.clear()
            current_font_name = None

    @classmethod
    def to_file(cls, subs, fp, format_, header_notice=NOTICE, **kwargs):
        """
        See :meth:`pysubs2.formats.FormatBase.to_file()`

        Writes the script info, optional Aegisub project data, styles,
        optional embedded fonts and events of ``subs`` to ``fp``, using the
        field layout selected by ``format_`` ("ass" or "ssa"). Each line of
        ``header_notice`` is written as a ";" comment in the info section.

        Note that ``subs.info["ScriptType"]`` is overwritten to match
        ``format_``. Raises TypeError when a field holds a value of an
        unsupported type.

        """
        print("[Script Info]", file=fp)
        for line in header_notice.splitlines(False):
            print(";", line, file=fp)

        subs.info["ScriptType"] = "v4.00+" if format_ == "ass" else "v4.00"
        for k, v in subs.info.items():
            print(k, v, sep=": ", file=fp)

        if subs.aegisub_project:
            print("\n[Aegisub Project Garbage]", file=fp)
            for k, v in subs.aegisub_project.items():
                print(k, v, sep=": ", file=fp)

        def field_to_string(f, v, line):
            """Serialize value v of field f; line is only used in the error message."""
            if f in {"start", "end"}:
                return cls.ms_to_timestamp(v)
            elif f == "marked":
                return "Marked=%d" % v
            elif f == "alignment" and format_ == "ssa":
                return str(ass_to_ssa_alignment(v))
            elif isinstance(v, bool):
                # must precede the Number check below, bool is a Number too
                return "-1" if v else "0"
            elif isinstance(v, (str, Number)):
                return str(v)
            elif isinstance(v, Color):
                if format_ == "ass":
                    return color_to_ass_rgba(v)
                else:
                    return color_to_ssa_rgb(v)
            else:
                raise TypeError("Unexpected type when writing a SubStation field {!r} for line {!r}".format(f, line))

        print("\n[V4+ Styles]" if format_ == "ass" else "\n[V4 Styles]", file=fp)
        print(STYLE_FORMAT_LINE[format_], file=fp)
        for name, sty in subs.styles.items():
            fields = [field_to_string(f, getattr(sty, f), sty) for f in STYLE_FIELDS[format_]]
            print("Style: %s" % name, *fields, sep=",", file=fp)

        if subs.fonts_opaque:
            print("\n[Fonts]", file=fp)
            for font_name, font_lines in sorted(subs.fonts_opaque.items()):
                print("fontname: {}".format(font_name), file=fp)
                for line in font_lines:
                    print(line, file=fp)
                # blank line terminates the font definition
                print(file=fp)

        print("\n[Events]", file=fp)
        print(EVENT_FORMAT_LINE[format_], file=fp)
        for ev in subs.events:
            fields = [field_to_string(f, getattr(ev, f), ev) for f in EVENT_FIELDS[format_]]
            print(ev.type, end=": ", file=fp)
            print(*fields, sep=",", file=fp)