Added experimental Python 3.11 support

This commit is contained in:
morpheus65535 2023-06-23 00:03:23 -04:00
parent cd016840f9
commit c92d574bf2
142 changed files with 648 additions and 685 deletions

View file

@ -20,8 +20,8 @@ def check_python_version():
print("Python " + minimum_py3_str + " or greater required. "
"Current version is " + platform.python_version() + ". Please upgrade Python.")
sys.exit(1)
elif int(python_version[0]) == 3 and int(python_version[1]) > 10:
print("Python version greater than 3.10.x is unsupported. Current version is " + platform.python_version() +
elif int(python_version[0]) == 3 and int(python_version[1]) > 11:
print("Python version greater than 3.11.x is unsupported. Current version is " + platform.python_version() +
". Keep in mind that even if it works, you're on your own.")
elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \
(int(python_version[0]) != minimum_py3_tuple[0]):

View file

@ -7,7 +7,7 @@ import re
from guess_language import guess_language
from subliminal_patch import core
from subzero.language import Language
from charamel import Detector
from chardet import detect
from app.config import settings
from constants import hi_regex
@ -76,7 +76,12 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
with open(subtitle_path, 'rb') as f:
text = f.read()
try:
encoding = detect(text)['encoding']
if not encoding:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
if 'UTF' in encoding:
text = text.decode('utf-8')
detected_language = guess_language(text)
# add simplified and traditional chinese detection
@ -86,35 +91,18 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
".hant", ".big5", ".traditional"]
if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(traditional_chinese)) or (str(subtitle_path).lower())[:-5] in traditional_chinese_fuzzy:
detected_language == 'zt'
except UnicodeDecodeError:
detector = Detector()
else:
text = text.decode(encoding)
detected_language = guess_language(text)
if detected_language:
logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
detected_language))
try:
guess = detector.detect(text)
subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced,
hi=False)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
else:
logging.debug('BAZARR detected encoding %r', guess)
try:
text = text.decode(guess)
except Exception:
logging.debug(
"BAZARR skipping this subtitles because we can't decode the file using the "
"guessed encoding. It's probably a binary file: " + subtitle_path)
continue
detected_language = guess_language(text)
except Exception:
logging.debug('BAZARR was unable to detect encoding for this subtitles file: %r', subtitle_path)
finally:
if detected_language:
logging.debug("BAZARR external subtitles detected and guessed this language: " + str(
detected_language))
try:
subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced,
hi=False)
except Exception:
pass
pass
# If language is still None (undetected), skip it
if hasattr(subtitles[subtitle], 'basename') and not subtitles[subtitle].basename:
@ -139,24 +127,15 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde
with open(subtitle_path, 'rb') as f:
text = f.read()
try:
encoding = detect(text)['encoding']
if not encoding:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
if 'UTF' in encoding:
text = text.decode('utf-8')
except UnicodeDecodeError:
detector = Detector()
try:
guess = detector.detect(text)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. "
"It's probably a binary file: " + subtitle_path)
continue
else:
logging.debug('BAZARR detected encoding %r', guess)
try:
text = text.decode(guess)
except Exception:
logging.debug("BAZARR skipping this subtitles because we can't decode the file using the "
"guessed encoding. It's probably a binary file: " + subtitle_path)
continue
else:
text = text.decode(encoding)
if bool(re.search(hi_regex, text)):
subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True)

View file

@ -4,7 +4,7 @@ import os
import logging
import hashlib
from charamel import Detector
from chardet import detect
from bs4 import UnicodeDammit
from app.config import settings
@ -64,8 +64,7 @@ def force_unicode(s):
try:
s = s.decode("utf-8")
except UnicodeDecodeError:
detector = Detector()
t = detector.detect(s)
t = detect(s)['encoding']
try:
s = s.decode(t)
except UnicodeDecodeError:

View file

@ -1,20 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Usage:
>>> import charamel
>>> detector = charamel.Detector()
>>> content = b'El espa\xf1ol o castellano del lat\xedn hablado'
>>> encoding = detector.detect(content)
>>> encoding
<Encoding.ISO_8859_14: 'iso8859_14'>
>>> content.decode(encoding)
'El español o castellano del latín hablado'
Licensed under Apache 2.0
"""
from .detector import Detector # noqa: F401
from .encoding import Encoding # noqa: F401
__version__ = '1.0.0'

View file

@ -1,133 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import itertools
import math
from typing import Dict, List, Optional, Sequence, Set, Tuple
from charamel.encoding import Encoding
from charamel.resources import load_biases, load_features, load_weights
def _get_features(content: bytes) -> Set[int]:
    """
    Collect the distinct byte uni-grams and bi-grams found in the content

    A uni-gram is represented by the byte value itself, a bi-gram by
    `first * 256 + second`.

    Args:
        content: Encoded text

    Returns:
        Set of integers, one per distinct n-gram
    """
    unigrams = set(content)
    # Pair every byte with its successor without copying the buffer
    successors = itertools.islice(content, 1, None)
    bigrams = {first * 256 + second for first, second in zip(content, successors)}
    return unigrams | bigrams
def _apply_sigmoid(value: float) -> float:
    """
    Apply sigmoid function to given value

    The computation is split by sign so that `math.exp` is only ever called
    with a non-positive argument; the naive `1 / (1 + exp(-value))` raises
    `OverflowError` for strongly negative values.

    Args:
        value: Real-valued score

    Returns:
        Value squashed into the range [0, 1]
    """
    if value >= 0:
        return 1 / (1 + math.exp(-value))
    # exp(value) underflows harmlessly to 0.0 instead of overflowing
    exponent = math.exp(value)
    return exponent / (1 + exponent)
class Detector:
    """
    Encoding detector driven by a linear model over byte n-grams

    Every supported encoding has a bias and a weight vector; the score of an
    encoding is its bias plus the weights of the known n-grams present in the
    content. Scores are turned into confidences with a sigmoid.
    """

    def __init__(
        self,
        encodings: Sequence[Encoding] = tuple(Encoding),
        min_confidence: float = 0.0,
    ):
        """
        Build a detector restricted to the given encodings

        Args:
            encodings: Encodings this instance will consider; weights and
                biases are loaded only for these
            min_confidence: Confidence below which a candidate is discarded

        Raises:
            ValueError: If `encodings` is empty or `min_confidence` is
                outside [0, 1]

        Example:
            >>> detector = Detector(
            ...     encodings=[Encoding.UTF_8, Encoding.BIG_5],
            ...     min_confidence=0.7,
            ... )
        """
        if not encodings:
            raise ValueError('No encodings specified')
        confidence_in_range = 0.0 <= min_confidence <= 1.0
        if not confidence_in_range:
            raise ValueError('min_confidence must be in range [0, 1]')

        self._features = load_features()
        self._weights = load_weights(encodings)
        self._biases = load_biases(encodings)
        self._min_confidence = min_confidence

    def _score(self, content: bytes) -> Dict[Encoding, float]:
        """
        Compute the raw (pre-sigmoid) model score of every encoding

        Args:
            content: Encoded text

        Returns:
            Real-valued score for each encoding
        """
        totals = self._biases.copy()
        # Only n-grams the model knows about contribute to the score
        known_grams = _get_features(content).intersection(self._features)
        columns = [self._features[gram] for gram in known_grams]
        for candidate, row in self._weights.items():
            totals[candidate] += sum(map(row.__getitem__, columns))
        return totals

    def detect(self, content: bytes) -> Optional[Encoding]:
        """
        Pick the single best-scoring encoding for the content

        Args:
            content: Encoded text

        Returns:
            The top encoding, or `None` when its confidence is below the
            configured minimum

        Example:
            >>> detector = Detector()
            >>> detector.detect(b'\xc4\xe3\xba\xc3')
            <Encoding.GB_K: 'gbk'>
        """
        scores = self._score(content)
        if not scores:
            return None
        best = max(scores, key=scores.get)
        if _apply_sigmoid(scores[best]) >= self._min_confidence:
            return best
        return None

    def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]:
        """
        List the `top` best-scoring encodings together with their confidences

        Candidates whose confidence is below the configured minimum are
        dropped, so fewer than `top` entries may be returned.

        Args:
            content: Encoded text
            top: How many of the highest-scoring encodings to consider

        Returns:
            `(encoding, confidence)` pairs, best first

        Example:
            >>> detector = Detector()
            >>> detector.probe(b'\xc4\xe3\xba\xc3')
            [(<Encoding.GB_K: 'gbk'>, 0.6940633812304486),
             (<Encoding.GB_18030: 'gb18030'>, 0.6886364021582343),
             (<Encoding.GB_2312: 'gb2312'>, 0.6707061223726806)]
        """
        ranked = sorted(
            self._score(content).items(), key=lambda item: item[1], reverse=True
        )
        shortlist = [
            (candidate, _apply_sigmoid(raw_score)) for candidate, raw_score in ranked[:top]
        ]
        accepted = []
        for candidate, confidence in shortlist:
            if confidence >= self._min_confidence:
                accepted.append((candidate, confidence))
        return accepted

View file

@ -1,122 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import encodings.aliases
import enum
@enum.unique
class Encoding(str, enum.Enum):
    """
    Python character encodings

    Each member's value is the canonical Python codec name. Members can also
    be looked up by an alias or a differently spelled name (for example
    `Encoding('UTF-8')`), see `_missing_`.
    """

    ASCII = 'ascii'
    BIG_5 = 'big5'
    BIG_5_HKSCS = 'big5hkscs'
    CP_037 = 'cp037'
    CP_273 = 'cp273'
    CP_424 = 'cp424'
    CP_437 = 'cp437'
    CP_500 = 'cp500'
    CP_720 = 'cp720'
    CP_737 = 'cp737'
    CP_775 = 'cp775'
    CP_850 = 'cp850'
    CP_852 = 'cp852'
    CP_855 = 'cp855'
    CP_856 = 'cp856'
    CP_857 = 'cp857'
    CP_858 = 'cp858'
    CP_860 = 'cp860'
    CP_861 = 'cp861'
    CP_862 = 'cp862'
    CP_863 = 'cp863'
    CP_864 = 'cp864'
    CP_865 = 'cp865'
    CP_866 = 'cp866'
    CP_869 = 'cp869'
    CP_874 = 'cp874'
    CP_875 = 'cp875'
    CP_932 = 'cp932'
    CP_949 = 'cp949'
    CP_950 = 'cp950'
    CP_1006 = 'cp1006'
    CP_1026 = 'cp1026'
    CP_1125 = 'cp1125'
    CP_1140 = 'cp1140'
    CP_1250 = 'cp1250'
    CP_1251 = 'cp1251'
    CP_1252 = 'cp1252'
    CP_1253 = 'cp1253'
    CP_1254 = 'cp1254'
    CP_1255 = 'cp1255'
    CP_1256 = 'cp1256'
    CP_1257 = 'cp1257'
    CP_1258 = 'cp1258'
    EUC_JP = 'euc_jp'
    EUC_JIS_2004 = 'euc_jis_2004'
    EUC_JIS_X_0213 = 'euc_jisx0213'
    EUC_KR = 'euc_kr'
    GB_2312 = 'gb2312'
    GB_K = 'gbk'
    GB_18030 = 'gb18030'
    HZ = 'hz'
    ISO_2022_JP = 'iso2022_jp'
    ISO_2022_JP_1 = 'iso2022_jp_1'
    ISO_2022_JP_2 = 'iso2022_jp_2'
    ISO_2022_JP_2004 = 'iso2022_jp_2004'
    ISO_2022_JP_3 = 'iso2022_jp_3'
    ISO_2022_JP_EXT = 'iso2022_jp_ext'
    ISO_2022_KR = 'iso2022_kr'
    LATIN_1 = 'latin_1'
    ISO_8859_2 = 'iso8859_2'
    ISO_8859_3 = 'iso8859_3'
    ISO_8859_4 = 'iso8859_4'
    ISO_8859_5 = 'iso8859_5'
    ISO_8859_6 = 'iso8859_6'
    ISO_8859_7 = 'iso8859_7'
    ISO_8859_8 = 'iso8859_8'
    ISO_8859_9 = 'iso8859_9'
    ISO_8859_10 = 'iso8859_10'
    ISO_8859_11 = 'iso8859_11'
    ISO_8859_13 = 'iso8859_13'
    ISO_8859_14 = 'iso8859_14'
    ISO_8859_15 = 'iso8859_15'
    ISO_8859_16 = 'iso8859_16'
    JOHAB = 'johab'
    KOI_8_R = 'koi8_r'
    KOI_8_T = 'koi8_t'
    KOI_8_U = 'koi8_u'
    KZ_1048 = 'kz1048'
    MAC_CYRILLIC = 'mac_cyrillic'
    MAC_GREEK = 'mac_greek'
    MAC_ICELAND = 'mac_iceland'
    MAC_LATIN_2 = 'mac_latin2'
    MAC_ROMAN = 'mac_roman'
    MAC_TURKISH = 'mac_turkish'
    PTCP_154 = 'ptcp154'
    SHIFT_JIS = 'shift_jis'
    SHIFT_JIS_2004 = 'shift_jis_2004'
    SHIFT_JIS_X_0213 = 'shift_jisx0213'
    TIS_620 = 'tis_620'
    UTF_32 = 'utf_32'
    UTF_32_BE = 'utf_32_be'
    UTF_32_LE = 'utf_32_le'
    UTF_16 = 'utf_16'
    UTF_16_BE = 'utf_16_be'
    UTF_16_LE = 'utf_16_le'
    UTF_7 = 'utf_7'
    UTF_8 = 'utf_8'
    UTF_8_SIG = 'utf_8_sig'

    def __format__(self, format_spec):
        """
        Format the member as its codec name

        Python 3.11 changed the default `Enum.__format__` so that mixed-in
        enums are formatted like `str(member)` ("Encoding.UTF_8") instead of
        their value. Formatting the value explicitly keeps f-strings such as
        `f'{encoding}.gzip'` producing the codec name on every version.
        """
        return format(self.value, format_spec)

    @classmethod
    def _missing_(cls, value):
        """
        Resolve a value that is not an exact member value

        The value is normalized with `encodings.normalize_encoding`,
        lower-cased and passed through the standard alias table; if that
        yields a different name, the lookup is retried with it.

        Args:
            value: The value passed to `Encoding(...)`

        Returns:
            The matching member, or `None` (which makes Enum raise
            `ValueError`) when the value cannot be resolved
        """
        # Only textual names can be normalized; anything else is simply not a
        # member and must end in Enum's regular ValueError, not a TypeError
        # raised from inside the normalization helper.
        if not isinstance(value, (str, bytes)):
            return super()._missing_(value)
        normalized = encodings.normalize_encoding(value).lower()
        normalized = encodings.aliases.aliases.get(normalized, normalized)
        if value != normalized:
            return cls(normalized)
        return super()._missing_(value)

View file

@ -1,72 +0,0 @@
"""
🌏 Charamel: Truly Universal Encoding Detection in Python 🌎
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Licensed under Apache 2.0
"""
import gzip
import pathlib
import struct
from typing import Any, Dict, List, Sequence
from charamel.encoding import Encoding
# Absolute path of the directory containing this module; the model files
# (`features.gzip`, `biases.gzip`) are opened relative to it.
RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute()
# Sub-directory holding the per-encoding weight files read by `load_weights`.
WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights'
def _unpack(file: pathlib.Path, pattern: str) -> List[Any]:
    """
    Read a gzip-compressed file of struct-packed records

    Only the first field of every record is kept.

    Args:
        file: Gzip file that stores struct-packed values
        pattern: Struct format describing one record

    Returns:
        List with the first unpacked value of each record, in file order
    """
    with gzip.open(file, 'rb') as stream:
        payload = stream.read()
        records = struct.iter_unpack(pattern, payload)
        return [record[0] for record in records]
def load_features() -> Dict[int, int]:
    """
    Load the byte-level features known to the model

    The features are read from `features.gzip` as big-endian unsigned 16-bit
    integers; a feature's position in the file is its index.

    Returns:
        Mapping from feature to its index
    """
    feature_file = RESOURCE_DIRECTORY / 'features.gzip'
    ordered = _unpack(feature_file, pattern='>H')
    return dict(zip(ordered, range(len(ordered))))
def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]:
    """
    Load the linear model bias of each requested encoding

    `biases.gzip` is read line by line; every line holds an encoding name and
    its bias separated by whitespace.

    Args:
        encodings: Encodings to return biases for

    Returns:
        Mapping from each requested encoding to its bias

    Raises:
        KeyError: If a requested encoding has no entry in the bias file
    """
    all_biases = {}
    with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as stream:
        for raw_line in stream:
            name, raw_bias = raw_line.decode().split()
            all_biases[name] = float(raw_bias)

    selected = {}
    for requested in encodings:
        selected[requested] = all_biases[requested]
    return selected
def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]:
    """
    Load linear model weights for given encodings

    Each encoding's weights are stored in `<codec name>.gzip` inside the
    weights directory as big-endian half-precision floats.

    Args:
        encodings: List of encodings

    Returns:
        Mapping from encodings to their weight lists
    """
    weights = {}
    for encoding in encodings:
        # Use the raw codec name for the file name. Formatting an enum member
        # directly is Python-version dependent (3.11+ renders a mixed-in enum
        # as "Encoding.UTF_8" rather than "utf_8"); plain strings are used
        # unchanged.
        codec_name = getattr(encoding, 'value', encoding)
        weights[encoding] = _unpack(WEIGHT_DIRECTORY / f'{codec_name}.gzip', pattern='>e')
    return weights

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show more