2020-06-11 00:04:54 +08:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
from datetime import timedelta
|
|
|
|
import logging
|
2022-01-24 12:07:52 +08:00
|
|
|
from typing import Any, Optional
|
2020-06-11 00:04:54 +08:00
|
|
|
|
2020-06-13 09:57:52 +08:00
|
|
|
try:
|
2020-08-05 10:42:27 +08:00
|
|
|
import cchardet as chardet
|
2020-06-13 09:57:52 +08:00
|
|
|
except ImportError:
|
2022-01-24 12:07:52 +08:00
|
|
|
import chardet # type: ignore
|
2020-06-11 00:04:54 +08:00
|
|
|
import pysubs2
|
2022-01-24 12:07:52 +08:00
|
|
|
from ffsubsync.sklearn_shim import TransformerMixin
|
2020-06-11 00:04:54 +08:00
|
|
|
import srt
|
|
|
|
|
2022-01-24 12:07:52 +08:00
|
|
|
from ffsubsync.constants import *
|
|
|
|
from ffsubsync.file_utils import open_file
|
|
|
|
from ffsubsync.generic_subtitles import GenericSubtitle, GenericSubtitlesFile, SubsMixin
|
2020-06-11 00:04:54 +08:00
|
|
|
|
|
|
|
# Runs at import time: sets up basic logging configuration at INFO level.
logging.basicConfig(level=logging.INFO)
|
2022-01-24 12:07:52 +08:00
|
|
|
# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)
|
2020-06-11 00:04:54 +08:00
|
|
|
|
|
|
|
|
2022-01-24 12:07:52 +08:00
|
|
|
def _preprocess_subs(
    subs,
    max_subtitle_seconds: Optional[int] = None,
    start_seconds: int = 0,
    tolerant: bool = True,
) -> List[GenericSubtitle]:
    """Wrap, filter and clamp raw subtitles into a list of GenericSubtitle.

    Each item pulled from ``subs`` is passed through
    ``GenericSubtitle.wrap_inner_subtitle``. Subtitles that start before
    ``start_seconds`` are dropped, and every kept subtitle has its end
    clamped to at most the maximum duration after its start.

    Args:
        subs: Iterable of raw subtitle objects.
        max_subtitle_seconds: Maximum duration of a single subtitle, in
            seconds. When ``None``, a cap of one day is used.
        start_seconds: Subtitles starting earlier than this offset (in
            seconds) are skipped.
        tolerant: When true, a ``ValueError`` raised while handling an item
            is logged as a warning and that item is skipped; when false the
            error propagates.

    Returns:
        The list of kept subtitles, in iteration order.

    Raises:
        ValueError: If one is raised while handling an item and
            ``tolerant`` is false.
    """
    earliest_start = timedelta(seconds=start_seconds)
    longest_allowed = (
        timedelta(days=1)
        if max_subtitle_seconds is None
        else timedelta(seconds=max_subtitle_seconds)
    )
    kept = []
    source = iter(subs)
    # The iterator is advanced by hand (rather than with a ``for`` loop) so
    # that ``next()`` itself sits inside the ``try`` and a ValueError raised
    # while producing an item can be tolerated per item.
    while True:
        try:
            current = GenericSubtitle.wrap_inner_subtitle(next(source))
            if current.start < earliest_start:
                continue
            current.end = min(current.end, current.start + longest_allowed)
            kept.append(current)
        except StopIteration:
            break
        # We don't catch SRTParseError here b/c that is typically raised when we
        # are trying to parse with the wrong encoding, in which case we might
        # be able to try another one on the *entire* set of subtitles elsewhere.
        except ValueError as err:
            if not tolerant:
                raise
            logger.warning(err)
            continue
    return kept
|
|
|
|
|
|
|
|
|
|
|
|
class GenericSubtitleParser(SubsMixin, TransformerMixin):
    """Transformer that reads a subtitle file and parses it.

    ``fit`` reads and parses the file, storing the result in ``subs_``;
    ``transform`` returns that stored result.
    """

    def __init__(
        self,
        fmt: str = "srt",
        encoding: str = "infer",
        caching: bool = False,
        max_subtitle_seconds: Optional[int] = None,
        start_seconds: int = 0,
        skip_ssa_info: bool = False,
    ) -> None:
        """Store the parsing options.

        Args:
            fmt: Subtitle format; ``fit`` handles "srt", "ass", "ssa" and
                "sub".
            encoding: Text encoding used to decode the file, or "infer" to
                detect it with chardet.
            caching: When true, ``fit`` is a no-op if called again with the
                file name it last parsed successfully.
            max_subtitle_seconds: Forwarded to ``_preprocess_subs``.
            start_seconds: Forwarded to ``_preprocess_subs``.
            skip_ssa_info: When true, the ``info`` of a parsed SSA file is
                not passed on to the resulting subtitles file.
        """
        # Zero-argument super(): ``super(self.__class__, self)`` recurses
        # forever once this class is subclassed, because ``self.__class__``
        # is then the subclass and the lookup lands back on this __init__.
        super().__init__()
        self.sub_format: str = fmt
        self.encoding: str = encoding
        self.caching: bool = caching
        self.fit_fname: Optional[str] = None
        self.detected_encoding_: Optional[str] = None
        self.max_subtitle_seconds: Optional[int] = max_subtitle_seconds
        self.start_seconds: int = start_seconds
        # FIXME: hack to get tests to pass; remove
        self._skip_ssa_info: bool = skip_ssa_info

    def fit(self, fname: str, *_) -> "GenericSubtitleParser":
        """Read ``fname``, decode it and parse it into ``self.subs_``.

        Args:
            fname: Name of the subtitle file, opened through ``open_file``.
                A value of ``None`` is recorded as "<stdin>" (presumably
                ``open_file`` reads standard input in that case -- confirm
                against ``file_utils``).
            *_: Ignored.

        Returns:
            ``self``.

        Raises:
            Exception: The last error hit while decoding or parsing, e.g.
                ``NotImplementedError`` for an unsupported format.
        """
        source_name = "<stdin>" if fname is None else fname
        if self.caching and self.fit_fname == source_name:
            return self
        encodings_to_try = (self.encoding,)
        with open_file(fname, "rb") as f:
            subs = f.read()
        if self.encoding == "infer":
            encodings_to_try = (chardet.detect(subs)["encoding"],)
            self.detected_encoding_ = encodings_to_try[0]
            logger.info("detected encoding: %s", self.detected_encoding_)
        exc = None
        for encoding in encodings_to_try:
            try:
                # Undecodable bytes are replaced rather than raising.
                decoded_subs = subs.decode(encoding, errors="replace").strip()
                if self.sub_format == "srt":
                    parsed_subs = srt.parse(decoded_subs)
                elif self.sub_format in ("ass", "ssa", "sub"):
                    parsed_subs = pysubs2.SSAFile.from_string(decoded_subs)
                else:
                    raise NotImplementedError(
                        "unsupported format: %s" % self.sub_format
                    )
                extra_generic_subtitle_file_kwargs = {}
                if isinstance(parsed_subs, pysubs2.SSAFile):
                    extra_generic_subtitle_file_kwargs.update(
                        dict(
                            styles=parsed_subs.styles,
                            # pysubs2 on Python >= 3.6 doesn't support this
                            fonts_opaque=getattr(parsed_subs, "fonts_opaque", None),
                            info=parsed_subs.info if not self._skip_ssa_info else None,
                        )
                    )
                self.subs_ = GenericSubtitlesFile(
                    _preprocess_subs(
                        parsed_subs,
                        max_subtitle_seconds=self.max_subtitle_seconds,
                        start_seconds=self.start_seconds,
                    ),
                    sub_format=self.sub_format,
                    encoding=encoding,
                    **extra_generic_subtitle_file_kwargs,
                )
                self.fit_fname = source_name
                if len(encodings_to_try) > 1:
                    self.detected_encoding_ = encoding
                    logger.info("detected encoding: %s", self.detected_encoding_)
                return self
            except Exception as e:
                # Remember the failure and move on to the next candidate
                # encoding; the last failure is re-raised after the loop.
                exc = e
                continue
        raise exc

    def transform(self, *_) -> GenericSubtitlesFile:
        """Return the subtitles stored by the last successful ``fit``."""
        return self.subs_
|
2022-01-24 12:07:52 +08:00
|
|
|
|
|
|
|
|
|
|
|
def make_subtitle_parser(
    fmt: str,
    encoding: str = DEFAULT_ENCODING,
    caching: bool = False,
    max_subtitle_seconds: int = DEFAULT_MAX_SUBTITLE_SECONDS,
    start_seconds: int = DEFAULT_START_SECONDS,
    **kwargs: Any,
) -> GenericSubtitleParser:
    """Build a GenericSubtitleParser from the given options.

    Args:
        fmt: Subtitle format handed to the parser.
        encoding: Text encoding handed to the parser.
        caching: Whether the parser should cache by file name.
        max_subtitle_seconds: Maximum duration of a single subtitle.
        start_seconds: Offset before which subtitles are skipped.
        **kwargs: Only the "skip_ssa_info" key is read (default ``False``);
            any other keys are ignored.

    Returns:
        The newly constructed parser.
    """
    skip_info = kwargs.get("skip_ssa_info", False)
    parser_options = {
        "fmt": fmt,
        "encoding": encoding,
        "caching": caching,
        "max_subtitle_seconds": max_subtitle_seconds,
        "start_seconds": start_seconds,
        "skip_ssa_info": skip_info,
    }
    return GenericSubtitleParser(**parser_options)
|