mirror of https://github.com/morpheus65535/bazarr.git
synced 2025-01-01 12:32:25 +08:00
Added experimental Python 3.11 support
This commit is contained in:
parent cd016840f9
commit c92d574bf2
142 changed files with 648 additions and 685 deletions
@@ -20,8 +20,8 @@ def check_python_version():
         print("Python " + minimum_py3_str + " or greater required. "
               "Current version is " + platform.python_version() + ". Please upgrade Python.")
         sys.exit(1)
-    elif int(python_version[0]) == 3 and int(python_version[1]) > 10:
-        print("Python version greater than 3.10.x is unsupported. Current version is " + platform.python_version() +
+    elif int(python_version[0]) == 3 and int(python_version[1]) > 11:
+        print("Python version greater than 3.11.x is unsupported. Current version is " + platform.python_version() +
               ". Keep in mind that even if it works, you're on your own.")
     elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \
             (int(python_version[0]) != minimum_py3_tuple[0]):
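Read in isolation, this gate warns (but continues) on Pythons newer than 3.11 and exits on anything below the supported minimum. A minimal standalone sketch of the same logic, assuming a hypothetical minimum_py3_tuple = (3, 8):

    import platform
    import sys

    minimum_py3_tuple = (3, 8)  # hypothetical; bazarr defines its own minimum
    python_version = platform.python_version_tuple()  # e.g. ('3', '11', '2')

    if int(python_version[0]) == 3 and int(python_version[1]) > 11:
        # newer than 3.11.x: warn but keep running
        print("Python version greater than 3.11.x is unsupported; you're on your own.")
    elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \
            (int(python_version[0]) != minimum_py3_tuple[0]):
        # below the minimum (or not Python 3 at all): refuse to start
        print("Python too old; exiting.")
        sys.exit(1)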
@@ -7,7 +7,7 @@ import re
 from guess_language import guess_language
 from subliminal_patch import core
 from subzero.language import Language
-from charamel import Detector
+from chardet import detect
 
 from app.config import settings
 from constants import hi_regex
@@ -76,7 +76,12 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
             with open(subtitle_path, 'rb') as f:
                 text = f.read()
 
-            try:
+            encoding = detect(text)['encoding']
+            if not encoding:
+                logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
+                              "It's probably a binary file: " + subtitle_path)
+                continue
+            if 'UTF' in encoding:
                 text = text.decode('utf-8')
                 detected_language = guess_language(text)
                 # add simplified and traditional chinese detection
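The new code path asks chardet for an encoding up front instead of decoding on faith and catching failures. A minimal sketch of that API; the sample result values are illustrative, not fixed:

    from chardet import detect

    raw = "Déjà vu, encore une fois, un sous-titre externe".encode("cp1252")
    result = detect(raw)  # e.g. {'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}

    encoding = result['encoding']  # None when chardet can't decide (e.g. binary junk)
    if encoding and 'UTF' in encoding:
        text = raw.decode('utf-8')
    elif encoding:
        text = raw.decode(encoding)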
@@ -86,35 +91,18 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
                                            ".hant", ".big5", ".traditional"]
                     if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(traditional_chinese)) or (str(subtitle_path).lower())[:-5] in traditional_chinese_fuzzy:
                         detected_language == 'zt'
-            except UnicodeDecodeError:
-                detector = Detector()
-                try:
-                    guess = detector.detect(text)
-                except Exception:
-                    logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
-                                  "It's probably a binary file: " + subtitle_path)
-                    continue
-                else:
-                    logging.debug('BAZARR detected encoding %r', guess)
-                    try:
-                        text = text.decode(guess)
-                    except Exception:
-                        logging.debug(
-                            "BAZARR skipping this subtitles because we can't decode the file using the "
-                            "guessed encoding. It's probably a binary file: " + subtitle_path)
-                        continue
-                    detected_language = guess_language(text)
-            except Exception:
-                logging.debug('BAZARR was unable to detect encoding for this subtitles file: %r', subtitle_path)
-            finally:
-                if detected_language:
-                    logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
-                        detected_language))
-                    try:
-                        subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced,
-                                                               hi=False)
-                    except Exception:
-                        pass
-                pass
+            else:
+                text = text.decode(encoding)
+
+            detected_language = guess_language(text)
+            if detected_language:
+                logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
+                    detected_language))
+                try:
+                    subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced,
+                                                           hi=False)
+                except Exception:
+                    pass
 
             # If language is still None (undetected), skip it
             if hasattr(subtitles[subtitle], 'basename') and not subtitles[subtitle].basename:
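Both the old and the new branch end in guess_language, which maps free text to a language code. A small illustration; return values depend on the library's statistical models, so treat them as indicative:

    from guess_language import guess_language

    guess_language("Ceci est un sous-titre en français, avec assez de texte pour trancher.")
    # typically 'fr'

    guess_language("zzq xw 123")
    # too little signal: typically the library's UNKNOWN result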
@@ -139,24 +127,15 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
             with open(subtitle_path, 'rb') as f:
                 text = f.read()
 
-            try:
-                text = text.decode('utf-8')
-            except UnicodeDecodeError:
-                detector = Detector()
-                try:
-                    guess = detector.detect(text)
-                except Exception:
-                    logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
-                                  "It's probably a binary file: " + subtitle_path)
-                    continue
-                else:
-                    logging.debug('BAZARR detected encoding %r', guess)
-                    try:
-                        text = text.decode(guess)
-                    except Exception:
-                        logging.debug("BAZARR skipping this subtitles because we can't decode the file using the "
-                                      "guessed encoding. It's probably a binary file: " + subtitle_path)
-                        continue
+            encoding = detect(text)['encoding']
+            if not encoding:
+                logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
+                              "It's probably a binary file: " + subtitle_path)
+                continue
+            if 'UTF' in encoding:
+                text = text.decode('utf-8')
+            else:
+                text = text.decode(encoding)
 
             if bool(re.search(hi_regex, text)):
                 subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)
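hi_regex is imported from bazarr's constants module and flags hearing-impaired annotations; its exact pattern isn't shown in this diff, so the one below is only an illustrative stand-in:

    import re

    hi_regex = re.compile(r'[\[\(].{2,}[\)\]]')  # hypothetical stand-in, not bazarr's pattern

    bool(re.search(hi_regex, "- [door slams]\n- Who's there?"))  # True: bracketed sound cue
    bool(re.search(hi_regex, "- Who's there?"))                  # False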
@@ -4,7 +4,7 @@ import os
 import logging
 import hashlib
 
-from charamel import Detector
+from chardet import detect
 from bs4 import UnicodeDammit
 
 from app.config import settings
@@ -64,8 +64,7 @@ def force_unicode(s):
     try:
         s = s.decode("utf-8")
     except UnicodeDecodeError:
-        detector = Detector()
-        t = detector.detect(s)
+        t = detect(s)['encoding']
         try:
             s = s.decode(t)
         except UnicodeDecodeError:
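Extracted from its surroundings, the same fallback chain can be exercised directly; force_unicode_sketch and the sample input are illustrative, and note that chardet may return None on very short or binary input:

    from chardet import detect

    def force_unicode_sketch(s):
        # try UTF-8 first, then fall back to whatever chardet guesses
        if isinstance(s, bytes):
            try:
                return s.decode('utf-8')
            except UnicodeDecodeError:
                t = detect(s)['encoding']  # may be None for undecidable input
                return s.decode(t) if t else s.decode('utf-8', errors='replace')
        return s

    force_unicode_sketch("Un sous-titre déjà encodé en latin-1, assez long pour chardet".encode('latin-1'))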
@ -1,20 +0,0 @@
|
|||
"""
|
||||
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Usage:
|
||||
>>> import charamel
|
||||
>>> detector = charamel.Detector()
|
||||
>>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
|
||||
>>> encoding = detector.detect(content)
|
||||
>>> encoding
|
||||
<Encoding.ISO_8859_14: 'iso8859_14'>
|
||||
>>> content.decode(encoding)
|
||||
'El español o castellano del latín hablado'
|
||||
|
||||
Licensed under Apache 2.0
|
||||
"""
|
||||
from .detector import Detector # noqa: F401
|
||||
from .encoding import Encoding # noqa: F401
|
||||
|
||||
__version__ = '1.0.0'
|
|
@@ -1,133 +0,0 @@
-"""
-🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Licensed under Apache 2.0
-"""
-import itertools
-import math
-from typing import Dict, List, Optional, Sequence, Set, Tuple
-
-from charamel.encoding import Encoding
-from charamel.resources import load_biases, load_features, load_weights
-
-
-def _get_features(content: bytes) -> Set[int]:
-    """
-    Extract unique byte uni-grams and bi-grams
-
-    Args:
-        content: Encoded text
-
-    Returns:
-        Set of integers that represent byte n-grams
-    """
-    pairs = zip(content, itertools.islice(content, 1, None))
-    return set(content).union(x * 256 + y for x, y in pairs)
-
-
-def _apply_sigmoid(value: float) -> float:
-    """
-    Apply sigmoid function to given value
-    """
-    return 1 / (1 + math.exp(-value))
-
-
-class Detector:
-    """
-    Universal encoding detector
-    """
-
-    def __init__(
-        self,
-        encodings: Sequence[Encoding] = tuple(Encoding),
-        min_confidence: float = 0.0,
-    ):
-        """
-        Create universal encoding detector for given encodings
-
-        Args:
-            encodings: Encodings that will be supported by this Detector instance,
-                less encodings lead to faster runtime
-            min_confidence: Minimum confidence threshold for encodings
-
-        Example:
-            >>> detector = Detector(
-            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
-            ...     min_confidence=0.7,
-            ... )
-        """
-        if not encodings:
-            raise ValueError('No encodings specified')
-
-        if not 0.0 <= min_confidence <= 1.0:
-            raise ValueError('min_confidence must be in range [0, 1]')
-
-        self._features = load_features()
-        self._weights = load_weights(encodings)
-        self._biases = load_biases(encodings)
-        self._min_confidence = min_confidence
-
-    def _score(self, content: bytes) -> Dict[Encoding, float]:
-        """
-        Compute how likely each encoding is able to decode the content
-
-        Args:
-            content: Encoded text
-
-        Returns:
-            Real-valued score for each encoding
-        """
-        scores = self._biases.copy()
-        features = _get_features(content).intersection(self._features)
-        indices = [self._features[feature] for feature in features]
-        for encoding, weights in self._weights.items():
-            scores[encoding] += sum(weights[index] for index in indices)
-        return scores
-
-    def detect(self, content: bytes) -> Optional[Encoding]:
-        """
-        Detect the most probable encoding for given byte content
-
-        Args:
-            content: Encoded text
-
-        Returns:
-            Encoding or `None` if not confident enough
-
-        Example:
-            >>> detector = Detector()
-            >>> detector.detect(b'\xc4\xe3\xba\xc3')
-            <Encoding.GB_K: 'gbk'>
-        """
-        scores = self._score(content)
-        if scores:
-            encoding, score = max(scores.items(), key=lambda x: x[1])
-            if _apply_sigmoid(score) >= self._min_confidence:
-                return encoding
-        return None
-
-    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
-        """
-        Detect `top` probable encodings with confidences
-
-        Args:
-            content: Encoded text
-            top: How many of the most likely encodings to return
-
-        Example:
-            >>> detector = Detector()
-            >>> detector.probe(b'\xc4\xe3\xba\xc3')
-            [(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
-             (<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
-             (<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
-        """
-        scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True)
-        confidences = [
-            (encoding, _apply_sigmoid(score)) for encoding, score in scores[:top]
-        ]
-        return [
-            (encoding, confidence)
-            for encoding, confidence in confidences
-            if confidence >= self._min_confidence
-        ]
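The feature extraction in the deleted detector is easy to verify by hand: every byte is a uni-gram, and each adjacent byte pair is packed into a single integer as x * 256 + y:

    import itertools

    def _get_features(content):
        # copied from the deleted charamel/detector.py above
        pairs = zip(content, itertools.islice(content, 1, None))
        return set(content).union(x * 256 + y for x, y in pairs)

    _get_features(b'ab')  # {97, 98, 24930}, where 24930 == 97 * 256 + 98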
@@ -1,122 +0,0 @@
-"""
-🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Licensed under Apache 2.0
-"""
-import encodings.aliases
-import enum
-
-
-@enum.unique
-class Encoding(str, enum.Enum):
-    """
-    Python character encodings
-    """
-
-    ASCII = 'ascii'
-    BIG_5 = 'big5'
-    BIG_5_HKSCS = 'big5hkscs'
-    CP_037 = 'cp037'
-    CP_273 = 'cp273'
-    CP_424 = 'cp424'
-    CP_437 = 'cp437'
-    CP_500 = 'cp500'
-    CP_720 = 'cp720'
-    CP_737 = 'cp737'
-    CP_775 = 'cp775'
-    CP_850 = 'cp850'
-    CP_852 = 'cp852'
-    CP_855 = 'cp855'
-    CP_856 = 'cp856'
-    CP_857 = 'cp857'
-    CP_858 = 'cp858'
-    CP_860 = 'cp860'
-    CP_861 = 'cp861'
-    CP_862 = 'cp862'
-    CP_863 = 'cp863'
-    CP_864 = 'cp864'
-    CP_865 = 'cp865'
-    CP_866 = 'cp866'
-    CP_869 = 'cp869'
-    CP_874 = 'cp874'
-    CP_875 = 'cp875'
-    CP_932 = 'cp932'
-    CP_949 = 'cp949'
-    CP_950 = 'cp950'
-    CP_1006 = 'cp1006'
-    CP_1026 = 'cp1026'
-    CP_1125 = 'cp1125'
-    CP_1140 = 'cp1140'
-    CP_1250 = 'cp1250'
-    CP_1251 = 'cp1251'
-    CP_1252 = 'cp1252'
-    CP_1253 = 'cp1253'
-    CP_1254 = 'cp1254'
-    CP_1255 = 'cp1255'
-    CP_1256 = 'cp1256'
-    CP_1257 = 'cp1257'
-    CP_1258 = 'cp1258'
-    EUC_JP = 'euc_jp'
-    EUC_JIS_2004 = 'euc_jis_2004'
-    EUC_JIS_X_0213 = 'euc_jisx0213'
-    EUC_KR = 'euc_kr'
-    GB_2312 = 'gb2312'
-    GB_K = 'gbk'
-    GB_18030 = 'gb18030'
-    HZ = 'hz'
-    ISO_2022_JP = 'iso2022_jp'
-    ISO_2022_JP_1 = 'iso2022_jp_1'
-    ISO_2022_JP_2 = 'iso2022_jp_2'
-    ISO_2022_JP_2004 = 'iso2022_jp_2004'
-    ISO_2022_JP_3 = 'iso2022_jp_3'
-    ISO_2022_JP_EXT = 'iso2022_jp_ext'
-    ISO_2022_KR = 'iso2022_kr'
-    LATIN_1 = 'latin_1'
-    ISO_8859_2 = 'iso8859_2'
-    ISO_8859_3 = 'iso8859_3'
-    ISO_8859_4 = 'iso8859_4'
-    ISO_8859_5 = 'iso8859_5'
-    ISO_8859_6 = 'iso8859_6'
-    ISO_8859_7 = 'iso8859_7'
-    ISO_8859_8 = 'iso8859_8'
-    ISO_8859_9 = 'iso8859_9'
-    ISO_8859_10 = 'iso8859_10'
-    ISO_8859_11 = 'iso8859_11'
-    ISO_8859_13 = 'iso8859_13'
-    ISO_8859_14 = 'iso8859_14'
-    ISO_8859_15 = 'iso8859_15'
-    ISO_8859_16 = 'iso8859_16'
-    JOHAB = 'johab'
-    KOI_8_R = 'koi8_r'
-    KOI_8_T = 'koi8_t'
-    KOI_8_U = 'koi8_u'
-    KZ_1048 = 'kz1048'
-    MAC_CYRILLIC = 'mac_cyrillic'
-    MAC_GREEK = 'mac_greek'
-    MAC_ICELAND = 'mac_iceland'
-    MAC_LATIN_2 = 'mac_latin2'
-    MAC_ROMAN = 'mac_roman'
-    MAC_TURKISH = 'mac_turkish'
-    PTCP_154 = 'ptcp154'
-    SHIFT_JIS = 'shift_jis'
-    SHIFT_JIS_2004 = 'shift_jis_2004'
-    SHIFT_JIS_X_0213 = 'shift_jisx0213'
-    TIS_620 = 'tis_620'
-    UTF_32 = 'utf_32'
-    UTF_32_BE = 'utf_32_be'
-    UTF_32_LE = 'utf_32_le'
-    UTF_16 = 'utf_16'
-    UTF_16_BE = 'utf_16_be'
-    UTF_16_LE = 'utf_16_le'
-    UTF_7 = 'utf_7'
-    UTF_8 = 'utf_8'
-    UTF_8_SIG = 'utf_8_sig'
-
-    @classmethod
-    def _missing_(cls, value):
-        normalized = encodings.normalize_encoding(value).lower()
-        normalized = encodings.aliases.aliases.get(normalized, normalized)
-        if value != normalized:
-            return cls(normalized)
-        return super()._missing_(value)
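The _missing_ hook above let any codec alias resolve to a canonical member, e.g. Encoding('UTF-8') -> Encoding.UTF_8. The normalization chain it relies on is plain stdlib and can be tried on its own:

    import encodings
    import encodings.aliases

    normalized = encodings.normalize_encoding('UTF-8').lower()  # 'utf_8'
    normalized = encodings.aliases.aliases.get(normalized, normalized)
    print(normalized)  # 'utf_8', which is exactly Encoding.UTF_8's value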
@@ -1,72 +0,0 @@
-"""
-🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Licensed under Apache 2.0
-"""
-import gzip
-import pathlib
-import struct
-from typing import Any, Dict, List, Sequence
-
-from charamel.encoding import Encoding
-
-RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
-WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'
-
-
-def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
-    """
-    Unpack struct values from file
-
-    Args:
-        file: File that stores struct-packed values
-        pattern: Struct pattern
-
-    Returns:
-        List of unpacked values
-    """
-    with gzip.open(file, 'rb') as data:
-        return [values[0] for values in struct.iter_unpack(pattern, data.read())]
-
-
-def load_features() -> Dict[int, int]:
-    """
-    Load byte-level feature names and indices
-
-    Returns:
-        Mapping from features to their indices in weight matrix
-    """
-    features = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H')
-    return {feature: index for index, feature in enumerate(features)}
-
-
-def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
-    """
-    Load linear model bias values for given encodings
-
-    Args:
-        encodings: List of encodings
-
-    Returns:
-        Mapping from encodings to their biases
-    """
-    biases = {}
-    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as data:
-        for line in data:
-            encoding, bias = line.decode().split()
-            biases[encoding] = float(bias)
-
-    return {encoding: biases[encoding] for encoding in encodings}
-
-
-def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
-    """
-    Load linear model feature weights for given encodings
-
-    Args:
-        encodings: List of encodings
-
-    Returns:
-        Mapping from encodings to their weight vectors
-    """
-    weights = {}
-    for encoding in encodings:
-        weights[encoding] = _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e')
-    return weights
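The '>e' pattern above is a big-endian half-precision float, one per model weight. A self-contained round trip (file name and values invented for the demo) shows the same pack/unpack shape:

    import gzip
    import pathlib
    import struct
    import tempfile

    path = pathlib.Path(tempfile.gettempdir()) / 'weights_demo.gzip'
    with gzip.open(path, 'wb') as data:
        data.write(struct.pack('>3e', 0.5, -1.0, 2.0))

    with gzip.open(path, 'rb') as data:
        values = [v[0] for v in struct.iter_unpack('>e', data.read())]

    print(values)  # [0.5, -1.0, 2.0]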
Binary files not shown: 93 deleted charamel resource files (features.gzip, biases.gzip, and the per-encoding weight files referenced above).
Some files were not shown because too many files have changed in this diff.