diff --git a/bazarr.py b/bazarr.py index 5276bca39..c85aee411 100644 --- a/bazarr.py +++ b/bazarr.py @@ -20,8 +20,8 @@ def check_python_version(): print("Python " + minimum_py3_str + " or greater required. " "Current version is " + platform.python_version() + ". Please upgrade Python.") sys.exit(1) - elif int(python_version[0]) == 3 and int(python_version[1]) > 10: - print("Python version greater than 3.10.x is unsupported. Current version is " + platform.python_version() + + elif int(python_version[0]) == 3 and int(python_version[1]) > 11: + print("Python version greater than 3.11.x is unsupported. Current version is " + platform.python_version() + ". Keep in mind that even if it works, you're on your own.") elif (int(python_version[0]) == minimum_py3_tuple[0] and int(python_version[1]) < minimum_py3_tuple[1]) or \ (int(python_version[0]) != minimum_py3_tuple[0]): diff --git a/bazarr/subtitles/indexer/utils.py b/bazarr/subtitles/indexer/utils.py index c40fdff48..f34a26fa6 100644 --- a/bazarr/subtitles/indexer/utils.py +++ b/bazarr/subtitles/indexer/utils.py @@ -7,7 +7,7 @@ import re from guess_language import guess_language from subliminal_patch import core from subzero.language import Language -from charamel import Detector +from chardet import detect from app.config import settings from constants import hi_regex @@ -76,7 +76,12 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde with open(subtitle_path, 'rb') as f: text = f.read() - try: + encoding = detect(text)['encoding'] + if not encoding: + logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. " + "It's probably a binary file: " + subtitle_path) + continue + if 'UTF' in encoding: text = text.decode('utf-8') detected_language = guess_language(text) # add simplified and traditional chinese detection @@ -86,35 +91,18 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde ".hant", ".big5", ".traditional"] if str(os.path.splitext(subtitle)[0]).lower().endswith(tuple(traditional_chinese)) or (str(subtitle_path).lower())[:-5] in traditional_chinese_fuzzy: detected_language == 'zt' - except UnicodeDecodeError: - detector = Detector() + else: + text = text.decode(encoding) + + detected_language = guess_language(text) + if detected_language: + logging.debug("BAZARR external subtitles detected and guessed this language: " + str( + detected_language)) try: - guess = detector.detect(text) + subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced, + hi=False) except Exception: - logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. " - "It's probably a binary file: " + subtitle_path) - continue - else: - logging.debug('BAZARR detected encoding %r', guess) - try: - text = text.decode(guess) - except Exception: - logging.debug( - "BAZARR skipping this subtitles because we can't decode the file using the " - "guessed encoding. It's probably a binary file: " + subtitle_path) - continue - detected_language = guess_language(text) - except Exception: - logging.debug('BAZARR was unable to detect encoding for this subtitles file: %r', subtitle_path) - finally: - if detected_language: - logging.debug("BAZARR external subtitles detected and guessed this language: " + str( - detected_language)) - try: - subtitles[subtitle] = Language.rebuild(Language.fromietf(detected_language), forced=forced, - hi=False) - except Exception: - pass + pass # If language is still None (undetected), skip it if hasattr(subtitles[subtitle], 'basename') and not subtitles[subtitle].basename: @@ -139,24 +127,15 @@ def guess_external_subtitles(dest_folder, subtitles, media_type, previously_inde with open(subtitle_path, 'rb') as f: text = f.read() - try: + encoding = detect(text)['encoding'] + if not encoding: + logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. " + "It's probably a binary file: " + subtitle_path) + continue + if 'UTF' in encoding: text = text.decode('utf-8') - except UnicodeDecodeError: - detector = Detector() - try: - guess = detector.detect(text) - except Exception: - logging.debug("BAZARR skipping this subtitles because we can't guess the encoding. " - "It's probably a binary file: " + subtitle_path) - continue - else: - logging.debug('BAZARR detected encoding %r', guess) - try: - text = text.decode(guess) - except Exception: - logging.debug("BAZARR skipping this subtitles because we can't decode the file using the " - "guessed encoding. It's probably a binary file: " + subtitle_path) - continue + else: + text = text.decode(encoding) if bool(re.search(hi_regex, text)): subtitles[subtitle] = Language.rebuild(subtitles[subtitle], forced=False, hi=True) diff --git a/bazarr/utilities/helper.py b/bazarr/utilities/helper.py index c9e27a58f..bd12630c8 100644 --- a/bazarr/utilities/helper.py +++ b/bazarr/utilities/helper.py @@ -4,7 +4,7 @@ import os import logging import hashlib -from charamel import Detector +from chardet import detect from bs4 import UnicodeDammit from app.config import settings @@ -64,8 +64,7 @@ def force_unicode(s): try: s = s.decode("utf-8") except UnicodeDecodeError: - detector = Detector() - t = detector.detect(s) + t = detect(s)['encoding'] try: s = s.decode(t) except UnicodeDecodeError: diff --git a/libs/charamel/__init__.py b/libs/charamel/__init__.py deleted file mode 100644 index 67d0f36c5..000000000 --- a/libs/charamel/__init__.py +++ /dev/null @@ -1,20 +0,0 @@ -""" -🌏 Charamel: Truly Universal Encoding Detection in Python 🌎 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Usage: - >>> import charamel - >>> detector = charamel.Detector() - >>> content = b'El espa\xf1ol o castellano del lat\xedn hablado' - >>> encoding = detector.detect(content) - >>> encoding - - >>> content.decode(encoding) - 'El español o castellano del latín hablado' - -Licensed under Apache 2.0 -""" -from .detector import Detector # noqa: F401 -from .encoding import Encoding # noqa: F401 - -__version__ = '1.0.0' diff --git a/libs/charamel/detector.py b/libs/charamel/detector.py deleted file mode 100644 index c63d307f4..000000000 --- a/libs/charamel/detector.py +++ /dev/null @@ -1,133 +0,0 @@ -""" -🌏 Charamel: Truly Universal Encoding Detection in Python 🌎 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Licensed under Apache 2.0 -""" -import itertools -import math -from typing import Dict, List, Optional, Sequence, Set, Tuple - -from charamel.encoding import Encoding -from charamel.resources import load_biases, load_features, load_weights - - -def _get_features(content: bytes) -> Set[int]: - """ - Extract unique byte uni-grams and bi-grams - - Args: - content: Encoded text - - Returns: - Set of integers that represent byte n-grams - """ - pairs = zip(content, itertools.islice(content, 1, None)) - return set(content).union(x * 256 + y for x, y in pairs) - - -def _apply_sigmoid(value: float) -> float: - """ - Apply sigmoid function to given value - """ - return 1 / (1 + math.exp(-value)) - - -class Detector: - """ - Universal encoding detector - """ - - def __init__( - self, - encodings: Sequence[Encoding] = tuple(Encoding), - min_confidence: float = 0.0, - ): - """ - Create universal encoding detector for given encodings - - Args: - encodings: Encodings that will be supported by this Detector instance, - less encodings lead to faster runtime - min_confidence: Minimum confidence threshold for encodings - - Example: - >>> detector = Detector( - ... encodings=[Encoding.UTF_8, Encoding.BIG_5], - ... min_confidence=0.7, - ... ) - """ - if not encodings: - raise ValueError('No encodings specified') - - if not 0.0 <= min_confidence <= 1.0: - raise ValueError('min_confidence must be in range [0, 1]') - - self._features = load_features() - self._weights = load_weights(encodings) - self._biases = load_biases(encodings) - self._min_confidence = min_confidence - - def _score(self, content: bytes) -> Dict[Encoding, float]: - """ - Compute how likely each encoding is able to decode the content - - Args: - content: Encoded text - - Returns: - Real-valued score for each encoding - """ - scores = self._biases.copy() - features = _get_features(content).intersection(self._features) - indices = [self._features[feature] for feature in features] - for encoding, weights in self._weights.items(): - scores[encoding] += sum(weights[index] for index in indices) - return scores - - def detect(self, content: bytes) -> Optional[Encoding]: - """ - Detect the most probable encoding for given byte content - - Args: - content: Encoded text - - Returns: - Encoding or `None` if not confident enough - - Example: - >>> detector = Detector() - >>> detector.detect(b'\xc4\xe3\xba\xc3') - - """ - scores = self._score(content) - if scores: - encoding, score = max(scores.items(), key=lambda x: x[1]) - if _apply_sigmoid(score) >= self._min_confidence: - return encoding - return None - - def probe(self, content: bytes, top: int = 3) -> List[Tuple[Encoding, float]]: - """ - Detect `top` probable encodings with confidences - - Args: - content: Encoded text - top: How many of the most likely encodings to return - - Example: - >>> detector = Detector() - >>> detector.probe(b'\xc4\xe3\xba\xc3') - [(, 0.6940633812304486), - (, 0.6886364021582343), - (, 0.6707061223726806)] - """ - scores = sorted(self._score(content).items(), key=lambda x: x[1], reverse=True) - confidences = [ - (encoding, _apply_sigmoid(score)) for encoding, score in scores[:top] - ] - return [ - (encoding, confidence) - for encoding, confidence in confidences - if confidence >= self._min_confidence - ] diff --git a/libs/charamel/encoding.py b/libs/charamel/encoding.py deleted file mode 100644 index 21c5b095f..000000000 --- a/libs/charamel/encoding.py +++ /dev/null @@ -1,122 +0,0 @@ -""" -🌏 Charamel: Truly Universal Encoding Detection in Python 🌎 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Licensed under Apache 2.0 -""" -import encodings.aliases -import enum - - -@enum.unique -class Encoding(str, enum.Enum): - """ - Python character encodings - """ - - ASCII = 'ascii' - BIG_5 = 'big5' - BIG_5_HKSCS = 'big5hkscs' - CP_037 = 'cp037' - CP_273 = 'cp273' - CP_424 = 'cp424' - CP_437 = 'cp437' - CP_500 = 'cp500' - CP_720 = 'cp720' - CP_737 = 'cp737' - CP_775 = 'cp775' - CP_850 = 'cp850' - CP_852 = 'cp852' - CP_855 = 'cp855' - CP_856 = 'cp856' - CP_857 = 'cp857' - CP_858 = 'cp858' - CP_860 = 'cp860' - CP_861 = 'cp861' - CP_862 = 'cp862' - CP_863 = 'cp863' - CP_864 = 'cp864' - CP_865 = 'cp865' - CP_866 = 'cp866' - CP_869 = 'cp869' - CP_874 = 'cp874' - CP_875 = 'cp875' - CP_932 = 'cp932' - CP_949 = 'cp949' - CP_950 = 'cp950' - CP_1006 = 'cp1006' - CP_1026 = 'cp1026' - CP_1125 = 'cp1125' - CP_1140 = 'cp1140' - CP_1250 = 'cp1250' - CP_1251 = 'cp1251' - CP_1252 = 'cp1252' - CP_1253 = 'cp1253' - CP_1254 = 'cp1254' - CP_1255 = 'cp1255' - CP_1256 = 'cp1256' - CP_1257 = 'cp1257' - CP_1258 = 'cp1258' - EUC_JP = 'euc_jp' - EUC_JIS_2004 = 'euc_jis_2004' - EUC_JIS_X_0213 = 'euc_jisx0213' - EUC_KR = 'euc_kr' - GB_2312 = 'gb2312' - GB_K = 'gbk' - GB_18030 = 'gb18030' - HZ = 'hz' - ISO_2022_JP = 'iso2022_jp' - ISO_2022_JP_1 = 'iso2022_jp_1' - ISO_2022_JP_2 = 'iso2022_jp_2' - ISO_2022_JP_2004 = 'iso2022_jp_2004' - ISO_2022_JP_3 = 'iso2022_jp_3' - ISO_2022_JP_EXT = 'iso2022_jp_ext' - ISO_2022_KR = 'iso2022_kr' - LATIN_1 = 'latin_1' - ISO_8859_2 = 'iso8859_2' - ISO_8859_3 = 'iso8859_3' - ISO_8859_4 = 'iso8859_4' - ISO_8859_5 = 'iso8859_5' - ISO_8859_6 = 'iso8859_6' - ISO_8859_7 = 'iso8859_7' - ISO_8859_8 = 'iso8859_8' - ISO_8859_9 = 'iso8859_9' - ISO_8859_10 = 'iso8859_10' - ISO_8859_11 = 'iso8859_11' - ISO_8859_13 = 'iso8859_13' - ISO_8859_14 = 'iso8859_14' - ISO_8859_15 = 'iso8859_15' - ISO_8859_16 = 'iso8859_16' - JOHAB = 'johab' - KOI_8_R = 'koi8_r' - KOI_8_T = 'koi8_t' - KOI_8_U = 'koi8_u' - KZ_1048 = 'kz1048' - MAC_CYRILLIC = 'mac_cyrillic' - MAC_GREEK = 'mac_greek' - MAC_ICELAND = 'mac_iceland' - MAC_LATIN_2 = 'mac_latin2' - MAC_ROMAN = 'mac_roman' - MAC_TURKISH = 'mac_turkish' - PTCP_154 = 'ptcp154' - SHIFT_JIS = 'shift_jis' - SHIFT_JIS_2004 = 'shift_jis_2004' - SHIFT_JIS_X_0213 = 'shift_jisx0213' - TIS_620 = 'tis_620' - UTF_32 = 'utf_32' - UTF_32_BE = 'utf_32_be' - UTF_32_LE = 'utf_32_le' - UTF_16 = 'utf_16' - UTF_16_BE = 'utf_16_be' - UTF_16_LE = 'utf_16_le' - UTF_7 = 'utf_7' - UTF_8 = 'utf_8' - UTF_8_SIG = 'utf_8_sig' - - @classmethod - def _missing_(cls, value): - normalized = encodings.normalize_encoding(value).lower() - normalized = encodings.aliases.aliases.get(normalized, normalized) - if value != normalized: - return cls(normalized) - return super()._missing_(value) diff --git a/libs/charamel/resources/__init__.py b/libs/charamel/resources/__init__.py deleted file mode 100644 index 336c41c21..000000000 --- a/libs/charamel/resources/__init__.py +++ /dev/null @@ -1,72 +0,0 @@ -""" -🌏 Charamel: Truly Universal Encoding Detection in Python 🌎 -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Licensed under Apache 2.0 -""" -import gzip -import pathlib -import struct -from typing import Any, Dict, List, Sequence - -from charamel.encoding import Encoding - -RESOURCE_DIRECTORY = pathlib.Path(__file__).parent.absolute() -WEIGHT_DIRECTORY = RESOURCE_DIRECTORY / 'weights' - - -def _unpack(file: pathlib.Path, pattern: str) -> List[Any]: - """ - Unpack struct values from file - - Args: - file: File that stores struct-packed values - pattern: Struct pattern - - Returns: - List of unpacked values - """ - with gzip.open(file, 'rb') as data: - return [values[0] for values in struct.iter_unpack(pattern, data.read())] - - -def load_features() -> Dict[int, int]: - """ - Load byte-level feature names and indices - - Returns: - Mapping from features to their indices in weight matrix - """ - features = _unpack(RESOURCE_DIRECTORY / 'features.gzip', pattern='>H') - return {feature: index for index, feature in enumerate(features)} - - -def load_biases(encodings: Sequence[Encoding]) -> Dict[Encoding, float]: - """ - Load linear model bias values for given encodings - - Args: - encodings: List of encodings - - Returns: - Mapping from encodings to their biases - """ - biases = {} - with gzip.open(RESOURCE_DIRECTORY / 'biases.gzip', 'rb') as data: - for line in data: - encoding, bias = line.decode().split() - biases[encoding] = float(bias) - - return {encoding: biases[encoding] for encoding in encodings} - - -def load_weights(encodings: Sequence[Encoding]) -> Dict[Encoding, List[float]]: - """ - - :param encodings: - :return: - """ - weights = {} - for encoding in encodings: - weights[encoding] = _unpack(WEIGHT_DIRECTORY / f'{encoding}.gzip', pattern='>e') - return weights diff --git a/libs/charamel/resources/biases.gzip b/libs/charamel/resources/biases.gzip deleted file mode 100644 index ab2692313..000000000 Binary files a/libs/charamel/resources/biases.gzip and /dev/null differ diff --git a/libs/charamel/resources/features.gzip b/libs/charamel/resources/features.gzip deleted file mode 100644 index 281c773f1..000000000 Binary files a/libs/charamel/resources/features.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/ascii.gzip b/libs/charamel/resources/weights/ascii.gzip deleted file mode 100644 index 695e1b11c..000000000 Binary files a/libs/charamel/resources/weights/ascii.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/big5.gzip b/libs/charamel/resources/weights/big5.gzip deleted file mode 100644 index 156368c44..000000000 Binary files a/libs/charamel/resources/weights/big5.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/big5hkscs.gzip b/libs/charamel/resources/weights/big5hkscs.gzip deleted file mode 100644 index 5fe8970a1..000000000 Binary files a/libs/charamel/resources/weights/big5hkscs.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp037.gzip b/libs/charamel/resources/weights/cp037.gzip deleted file mode 100644 index 1fa58d895..000000000 Binary files a/libs/charamel/resources/weights/cp037.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1006.gzip b/libs/charamel/resources/weights/cp1006.gzip deleted file mode 100644 index 09cce0caf..000000000 Binary files a/libs/charamel/resources/weights/cp1006.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1026.gzip b/libs/charamel/resources/weights/cp1026.gzip deleted file mode 100644 index 8fe9bb84c..000000000 Binary files a/libs/charamel/resources/weights/cp1026.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1125.gzip b/libs/charamel/resources/weights/cp1125.gzip deleted file mode 100644 index 9100b6e4b..000000000 Binary files a/libs/charamel/resources/weights/cp1125.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1140.gzip b/libs/charamel/resources/weights/cp1140.gzip deleted file mode 100644 index d8506c1c4..000000000 Binary files a/libs/charamel/resources/weights/cp1140.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1250.gzip b/libs/charamel/resources/weights/cp1250.gzip deleted file mode 100644 index 1b9b1ba5b..000000000 Binary files a/libs/charamel/resources/weights/cp1250.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1251.gzip b/libs/charamel/resources/weights/cp1251.gzip deleted file mode 100644 index a41146444..000000000 Binary files a/libs/charamel/resources/weights/cp1251.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1252.gzip b/libs/charamel/resources/weights/cp1252.gzip deleted file mode 100644 index 3f87769a7..000000000 Binary files a/libs/charamel/resources/weights/cp1252.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1253.gzip b/libs/charamel/resources/weights/cp1253.gzip deleted file mode 100644 index e57a16719..000000000 Binary files a/libs/charamel/resources/weights/cp1253.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1254.gzip b/libs/charamel/resources/weights/cp1254.gzip deleted file mode 100644 index 089f06899..000000000 Binary files a/libs/charamel/resources/weights/cp1254.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1255.gzip b/libs/charamel/resources/weights/cp1255.gzip deleted file mode 100644 index 5c08a1c18..000000000 Binary files a/libs/charamel/resources/weights/cp1255.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1256.gzip b/libs/charamel/resources/weights/cp1256.gzip deleted file mode 100644 index a17d45b13..000000000 Binary files a/libs/charamel/resources/weights/cp1256.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1257.gzip b/libs/charamel/resources/weights/cp1257.gzip deleted file mode 100644 index efd13ef3d..000000000 Binary files a/libs/charamel/resources/weights/cp1257.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp1258.gzip b/libs/charamel/resources/weights/cp1258.gzip deleted file mode 100644 index 8f546a44b..000000000 Binary files a/libs/charamel/resources/weights/cp1258.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp273.gzip b/libs/charamel/resources/weights/cp273.gzip deleted file mode 100644 index cce71cceb..000000000 Binary files a/libs/charamel/resources/weights/cp273.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp424.gzip b/libs/charamel/resources/weights/cp424.gzip deleted file mode 100644 index 5a13c138e..000000000 Binary files a/libs/charamel/resources/weights/cp424.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp437.gzip b/libs/charamel/resources/weights/cp437.gzip deleted file mode 100644 index c60c68667..000000000 Binary files a/libs/charamel/resources/weights/cp437.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp500.gzip b/libs/charamel/resources/weights/cp500.gzip deleted file mode 100644 index 9461df494..000000000 Binary files a/libs/charamel/resources/weights/cp500.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp720.gzip b/libs/charamel/resources/weights/cp720.gzip deleted file mode 100644 index b9d99e803..000000000 Binary files a/libs/charamel/resources/weights/cp720.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp737.gzip b/libs/charamel/resources/weights/cp737.gzip deleted file mode 100644 index 50a6feadd..000000000 Binary files a/libs/charamel/resources/weights/cp737.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp775.gzip b/libs/charamel/resources/weights/cp775.gzip deleted file mode 100644 index 955569ee0..000000000 Binary files a/libs/charamel/resources/weights/cp775.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp850.gzip b/libs/charamel/resources/weights/cp850.gzip deleted file mode 100644 index cf942dbd4..000000000 Binary files a/libs/charamel/resources/weights/cp850.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp852.gzip b/libs/charamel/resources/weights/cp852.gzip deleted file mode 100644 index c8d5cec53..000000000 Binary files a/libs/charamel/resources/weights/cp852.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp855.gzip b/libs/charamel/resources/weights/cp855.gzip deleted file mode 100644 index 228100c4e..000000000 Binary files a/libs/charamel/resources/weights/cp855.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp856.gzip b/libs/charamel/resources/weights/cp856.gzip deleted file mode 100644 index 28e1020f5..000000000 Binary files a/libs/charamel/resources/weights/cp856.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp857.gzip b/libs/charamel/resources/weights/cp857.gzip deleted file mode 100644 index 55bba8210..000000000 Binary files a/libs/charamel/resources/weights/cp857.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp858.gzip b/libs/charamel/resources/weights/cp858.gzip deleted file mode 100644 index 8f279d169..000000000 Binary files a/libs/charamel/resources/weights/cp858.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp860.gzip b/libs/charamel/resources/weights/cp860.gzip deleted file mode 100644 index 0b0914d17..000000000 Binary files a/libs/charamel/resources/weights/cp860.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp861.gzip b/libs/charamel/resources/weights/cp861.gzip deleted file mode 100644 index 2875d51d1..000000000 Binary files a/libs/charamel/resources/weights/cp861.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp862.gzip b/libs/charamel/resources/weights/cp862.gzip deleted file mode 100644 index 963f016b5..000000000 Binary files a/libs/charamel/resources/weights/cp862.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp863.gzip b/libs/charamel/resources/weights/cp863.gzip deleted file mode 100644 index 2ada2067d..000000000 Binary files a/libs/charamel/resources/weights/cp863.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp864.gzip b/libs/charamel/resources/weights/cp864.gzip deleted file mode 100644 index b6c0f573f..000000000 Binary files a/libs/charamel/resources/weights/cp864.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp865.gzip b/libs/charamel/resources/weights/cp865.gzip deleted file mode 100644 index f8c3f1e57..000000000 Binary files a/libs/charamel/resources/weights/cp865.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp866.gzip b/libs/charamel/resources/weights/cp866.gzip deleted file mode 100644 index 82fe8399e..000000000 Binary files a/libs/charamel/resources/weights/cp866.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp869.gzip b/libs/charamel/resources/weights/cp869.gzip deleted file mode 100644 index 52d2ec86c..000000000 Binary files a/libs/charamel/resources/weights/cp869.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp874.gzip b/libs/charamel/resources/weights/cp874.gzip deleted file mode 100644 index e609ca582..000000000 Binary files a/libs/charamel/resources/weights/cp874.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp875.gzip b/libs/charamel/resources/weights/cp875.gzip deleted file mode 100644 index 75846231b..000000000 Binary files a/libs/charamel/resources/weights/cp875.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp932.gzip b/libs/charamel/resources/weights/cp932.gzip deleted file mode 100644 index ac18fe2af..000000000 Binary files a/libs/charamel/resources/weights/cp932.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp949.gzip b/libs/charamel/resources/weights/cp949.gzip deleted file mode 100644 index 1882c6a49..000000000 Binary files a/libs/charamel/resources/weights/cp949.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/cp950.gzip b/libs/charamel/resources/weights/cp950.gzip deleted file mode 100644 index 04188859e..000000000 Binary files a/libs/charamel/resources/weights/cp950.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/euc_jis_2004.gzip b/libs/charamel/resources/weights/euc_jis_2004.gzip deleted file mode 100644 index 21bda22a3..000000000 Binary files a/libs/charamel/resources/weights/euc_jis_2004.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/euc_jisx0213.gzip b/libs/charamel/resources/weights/euc_jisx0213.gzip deleted file mode 100644 index 26eb21868..000000000 Binary files a/libs/charamel/resources/weights/euc_jisx0213.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/euc_jp.gzip b/libs/charamel/resources/weights/euc_jp.gzip deleted file mode 100644 index 7a8ab341f..000000000 Binary files a/libs/charamel/resources/weights/euc_jp.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/euc_kr.gzip b/libs/charamel/resources/weights/euc_kr.gzip deleted file mode 100644 index 2de76ddf2..000000000 Binary files a/libs/charamel/resources/weights/euc_kr.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/gb18030.gzip b/libs/charamel/resources/weights/gb18030.gzip deleted file mode 100644 index 5b714c3d7..000000000 Binary files a/libs/charamel/resources/weights/gb18030.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/gb2312.gzip b/libs/charamel/resources/weights/gb2312.gzip deleted file mode 100644 index a671844be..000000000 Binary files a/libs/charamel/resources/weights/gb2312.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/gbk.gzip b/libs/charamel/resources/weights/gbk.gzip deleted file mode 100644 index 86d2d0b77..000000000 Binary files a/libs/charamel/resources/weights/gbk.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/hz.gzip b/libs/charamel/resources/weights/hz.gzip deleted file mode 100644 index 8d924f6c0..000000000 Binary files a/libs/charamel/resources/weights/hz.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp.gzip b/libs/charamel/resources/weights/iso2022_jp.gzip deleted file mode 100644 index edeef384b..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp_1.gzip b/libs/charamel/resources/weights/iso2022_jp_1.gzip deleted file mode 100644 index f37a21397..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp_1.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp_2.gzip b/libs/charamel/resources/weights/iso2022_jp_2.gzip deleted file mode 100644 index b19b26a86..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp_2.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp_2004.gzip b/libs/charamel/resources/weights/iso2022_jp_2004.gzip deleted file mode 100644 index 62c9c3e48..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp_2004.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp_3.gzip b/libs/charamel/resources/weights/iso2022_jp_3.gzip deleted file mode 100644 index ac3de65e8..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp_3.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_jp_ext.gzip b/libs/charamel/resources/weights/iso2022_jp_ext.gzip deleted file mode 100644 index a10b13048..000000000 Binary files a/libs/charamel/resources/weights/iso2022_jp_ext.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso2022_kr.gzip b/libs/charamel/resources/weights/iso2022_kr.gzip deleted file mode 100644 index 8a2d5e5c3..000000000 Binary files a/libs/charamel/resources/weights/iso2022_kr.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_10.gzip b/libs/charamel/resources/weights/iso8859_10.gzip deleted file mode 100644 index 1caf6ccb2..000000000 Binary files a/libs/charamel/resources/weights/iso8859_10.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_11.gzip b/libs/charamel/resources/weights/iso8859_11.gzip deleted file mode 100644 index 9d068f3e1..000000000 Binary files a/libs/charamel/resources/weights/iso8859_11.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_13.gzip b/libs/charamel/resources/weights/iso8859_13.gzip deleted file mode 100644 index 69fb36144..000000000 Binary files a/libs/charamel/resources/weights/iso8859_13.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_14.gzip b/libs/charamel/resources/weights/iso8859_14.gzip deleted file mode 100644 index decd39764..000000000 Binary files a/libs/charamel/resources/weights/iso8859_14.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_15.gzip b/libs/charamel/resources/weights/iso8859_15.gzip deleted file mode 100644 index 3dd65041b..000000000 Binary files a/libs/charamel/resources/weights/iso8859_15.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_16.gzip b/libs/charamel/resources/weights/iso8859_16.gzip deleted file mode 100644 index 36f6d4874..000000000 Binary files a/libs/charamel/resources/weights/iso8859_16.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_2.gzip b/libs/charamel/resources/weights/iso8859_2.gzip deleted file mode 100644 index c122280f1..000000000 Binary files a/libs/charamel/resources/weights/iso8859_2.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_3.gzip b/libs/charamel/resources/weights/iso8859_3.gzip deleted file mode 100644 index 1aac5dd11..000000000 Binary files a/libs/charamel/resources/weights/iso8859_3.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_4.gzip b/libs/charamel/resources/weights/iso8859_4.gzip deleted file mode 100644 index 3a26bac13..000000000 Binary files a/libs/charamel/resources/weights/iso8859_4.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_5.gzip b/libs/charamel/resources/weights/iso8859_5.gzip deleted file mode 100644 index 251b50988..000000000 Binary files a/libs/charamel/resources/weights/iso8859_5.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_6.gzip b/libs/charamel/resources/weights/iso8859_6.gzip deleted file mode 100644 index 0013b6425..000000000 Binary files a/libs/charamel/resources/weights/iso8859_6.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_7.gzip b/libs/charamel/resources/weights/iso8859_7.gzip deleted file mode 100644 index 7bf14906f..000000000 Binary files a/libs/charamel/resources/weights/iso8859_7.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_8.gzip b/libs/charamel/resources/weights/iso8859_8.gzip deleted file mode 100644 index 9bb4c3120..000000000 Binary files a/libs/charamel/resources/weights/iso8859_8.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/iso8859_9.gzip b/libs/charamel/resources/weights/iso8859_9.gzip deleted file mode 100644 index d176af958..000000000 Binary files a/libs/charamel/resources/weights/iso8859_9.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/johab.gzip b/libs/charamel/resources/weights/johab.gzip deleted file mode 100644 index c669f0f78..000000000 Binary files a/libs/charamel/resources/weights/johab.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/koi8_r.gzip b/libs/charamel/resources/weights/koi8_r.gzip deleted file mode 100644 index 31a59cbac..000000000 Binary files a/libs/charamel/resources/weights/koi8_r.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/koi8_t.gzip b/libs/charamel/resources/weights/koi8_t.gzip deleted file mode 100644 index 2977f2602..000000000 Binary files a/libs/charamel/resources/weights/koi8_t.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/koi8_u.gzip b/libs/charamel/resources/weights/koi8_u.gzip deleted file mode 100644 index c12c7f634..000000000 Binary files a/libs/charamel/resources/weights/koi8_u.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/kz1048.gzip b/libs/charamel/resources/weights/kz1048.gzip deleted file mode 100644 index 4ad027fc6..000000000 Binary files a/libs/charamel/resources/weights/kz1048.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/latin_1.gzip b/libs/charamel/resources/weights/latin_1.gzip deleted file mode 100644 index 3cf042fbe..000000000 Binary files a/libs/charamel/resources/weights/latin_1.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_cyrillic.gzip b/libs/charamel/resources/weights/mac_cyrillic.gzip deleted file mode 100644 index a71344044..000000000 Binary files a/libs/charamel/resources/weights/mac_cyrillic.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_greek.gzip b/libs/charamel/resources/weights/mac_greek.gzip deleted file mode 100644 index 34a1a8275..000000000 Binary files a/libs/charamel/resources/weights/mac_greek.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_iceland.gzip b/libs/charamel/resources/weights/mac_iceland.gzip deleted file mode 100644 index 9bdd1accd..000000000 Binary files a/libs/charamel/resources/weights/mac_iceland.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_latin2.gzip b/libs/charamel/resources/weights/mac_latin2.gzip deleted file mode 100644 index 9771a6956..000000000 Binary files a/libs/charamel/resources/weights/mac_latin2.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_roman.gzip b/libs/charamel/resources/weights/mac_roman.gzip deleted file mode 100644 index cbe6140d0..000000000 Binary files a/libs/charamel/resources/weights/mac_roman.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/mac_turkish.gzip b/libs/charamel/resources/weights/mac_turkish.gzip deleted file mode 100644 index d0ed3d730..000000000 Binary files a/libs/charamel/resources/weights/mac_turkish.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/ptcp154.gzip b/libs/charamel/resources/weights/ptcp154.gzip deleted file mode 100644 index 23605f00b..000000000 Binary files a/libs/charamel/resources/weights/ptcp154.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/shift_jis.gzip b/libs/charamel/resources/weights/shift_jis.gzip deleted file mode 100644 index 713075b49..000000000 Binary files a/libs/charamel/resources/weights/shift_jis.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/shift_jis_2004.gzip b/libs/charamel/resources/weights/shift_jis_2004.gzip deleted file mode 100644 index afc68af72..000000000 Binary files a/libs/charamel/resources/weights/shift_jis_2004.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/shift_jisx0213.gzip b/libs/charamel/resources/weights/shift_jisx0213.gzip deleted file mode 100644 index aa3aa32f7..000000000 Binary files a/libs/charamel/resources/weights/shift_jisx0213.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/tis_620.gzip b/libs/charamel/resources/weights/tis_620.gzip deleted file mode 100644 index bebae49da..000000000 Binary files a/libs/charamel/resources/weights/tis_620.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_16.gzip b/libs/charamel/resources/weights/utf_16.gzip deleted file mode 100644 index 3ac44b14f..000000000 Binary files a/libs/charamel/resources/weights/utf_16.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_16_be.gzip b/libs/charamel/resources/weights/utf_16_be.gzip deleted file mode 100644 index 68e8024c1..000000000 Binary files a/libs/charamel/resources/weights/utf_16_be.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_16_le.gzip b/libs/charamel/resources/weights/utf_16_le.gzip deleted file mode 100644 index 4790b3a6e..000000000 Binary files a/libs/charamel/resources/weights/utf_16_le.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_32.gzip b/libs/charamel/resources/weights/utf_32.gzip deleted file mode 100644 index 4599cac24..000000000 Binary files a/libs/charamel/resources/weights/utf_32.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_32_be.gzip b/libs/charamel/resources/weights/utf_32_be.gzip deleted file mode 100644 index b70600e61..000000000 Binary files a/libs/charamel/resources/weights/utf_32_be.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_32_le.gzip b/libs/charamel/resources/weights/utf_32_le.gzip deleted file mode 100644 index 4ab2e68a5..000000000 Binary files a/libs/charamel/resources/weights/utf_32_le.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_7.gzip b/libs/charamel/resources/weights/utf_7.gzip deleted file mode 100644 index 13a4337ee..000000000 Binary files a/libs/charamel/resources/weights/utf_7.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_8.gzip b/libs/charamel/resources/weights/utf_8.gzip deleted file mode 100644 index 66966287e..000000000 Binary files a/libs/charamel/resources/weights/utf_8.gzip and /dev/null differ diff --git a/libs/charamel/resources/weights/utf_8_sig.gzip b/libs/charamel/resources/weights/utf_8_sig.gzip deleted file mode 100644 index 78567848d..000000000 Binary files a/libs/charamel/resources/weights/utf_8_sig.gzip and /dev/null differ diff --git a/libs/chardet/__init__.py b/libs/chardet/__init__.py index e91ad6182..fe581623d 100644 --- a/libs/chardet/__init__.py +++ b/libs/chardet/__init__.py @@ -15,19 +15,29 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + +from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState +from .resultdict import ResultDict from .universaldetector import UniversalDetector from .version import VERSION, __version__ __all__ = ["UniversalDetector", "detect", "detect_all", "__version__", "VERSION"] -def detect(byte_str): +def detect( + byte_str: Union[bytes, bytearray], should_rename_legacy: bool = False +) -> ResultDict: """ Detect the encoding of the given byte string. :param byte_str: The byte sequence to examine. :type byte_str: ``bytes`` or ``bytearray`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): @@ -35,12 +45,16 @@ def detect(byte_str): f"Expected object of type bytes or bytearray, got: {type(byte_str)}" ) byte_str = bytearray(byte_str) - detector = UniversalDetector() + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) return detector.close() -def detect_all(byte_str, ignore_threshold=False): +def detect_all( + byte_str: Union[bytes, bytearray], + ignore_threshold: bool = False, + should_rename_legacy: bool = False, +) -> List[ResultDict]: """ Detect all the possible encodings of the given byte string. @@ -50,6 +64,9 @@ def detect_all(byte_str, ignore_threshold=False): ``UniversalDetector.MINIMUM_THRESHOLD`` in results. :type ignore_threshold: ``bool`` + :param should_rename_legacy: Should we rename legacy encodings + to their more modern equivalents? + :type should_rename_legacy: ``bool`` """ if not isinstance(byte_str, bytearray): if not isinstance(byte_str, bytes): @@ -58,15 +75,15 @@ def detect_all(byte_str, ignore_threshold=False): ) byte_str = bytearray(byte_str) - detector = UniversalDetector() + detector = UniversalDetector(should_rename_legacy=should_rename_legacy) detector.feed(byte_str) detector.close() if detector.input_state == InputState.HIGH_BYTE: - results = [] - probers = [] + results: List[ResultDict] = [] + probers: List[CharSetProber] = [] for prober in detector.charset_probers: - if hasattr(prober, "probers"): + if isinstance(prober, CharSetGroupProber): probers.extend(p for p in prober.probers) else: probers.append(prober) @@ -80,6 +97,11 @@ def detect_all(byte_str, ignore_threshold=False): charset_name = detector.ISO_WIN_MAP.get( lower_charset_name, charset_name ) + # Rename legacy encodings with superset encodings if asked + if should_rename_legacy: + charset_name = detector.LEGACY_MAP.get( + charset_name.lower(), charset_name + ) results.append( { "encoding": charset_name, diff --git a/libs/chardet/big5prober.py b/libs/chardet/big5prober.py index e4dfa7aa0..ef09c60e3 100644 --- a/libs/chardet/big5prober.py +++ b/libs/chardet/big5prober.py @@ -32,16 +32,16 @@ from .mbcssm import BIG5_SM_MODEL class Big5Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) self.distribution_analyzer = Big5DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "Big5" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/libs/chardet/chardistribution.py b/libs/chardet/chardistribution.py index 27b4a2939..176cb9964 100644 --- a/libs/chardet/chardistribution.py +++ b/libs/chardet/chardistribution.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Tuple, Union + from .big5freq import ( BIG5_CHAR_TO_FREQ_ORDER, BIG5_TABLE_SIZE, @@ -59,22 +61,22 @@ class CharDistributionAnalysis: SURE_NO = 0.01 MINIMUM_DATA_THRESHOLD = 3 - def __init__(self): + def __init__(self) -> None: # Mapping table to get frequency order from char order (get from # GetOrder()) - self._char_to_freq_order = tuple() - self._table_size = None # Size of above table + self._char_to_freq_order: Tuple[int, ...] = tuple() + self._table_size = 0 # Size of above table # This is a constant value which varies from language to language, # used in calculating confidence. See # http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html # for further detail. - self.typical_distribution_ratio = None - self._done = None - self._total_chars = None - self._freq_chars = None + self.typical_distribution_ratio = 0.0 + self._done = False + self._total_chars = 0 + self._freq_chars = 0 self.reset() - def reset(self): + def reset(self) -> None: """reset analyser, clear any state""" # If this flag is set to True, detection is done and conclusion has # been made @@ -83,7 +85,7 @@ class CharDistributionAnalysis: # The number of characters whose frequency order is less than 512 self._freq_chars = 0 - def feed(self, char, char_len): + def feed(self, char: Union[bytes, bytearray], char_len: int) -> None: """feed a character with known length""" if char_len == 2: # we only care about 2-bytes character in our distribution analysis @@ -97,7 +99,7 @@ class CharDistributionAnalysis: if 512 > self._char_to_freq_order[order]: self._freq_chars += 1 - def get_confidence(self): + def get_confidence(self) -> float: """return confidence based on existing data""" # if we didn't receive any character in our consideration range, # return negative answer @@ -114,12 +116,12 @@ class CharDistributionAnalysis: # normalize confidence (we don't want to be 100% sure) return self.SURE_YES - def got_enough_data(self): + def got_enough_data(self) -> bool: # It is not necessary to receive all data to draw conclusion. # For charset detection, certain amount of data is enough return self._total_chars > self.ENOUGH_DATA_THRESHOLD - def get_order(self, _): + def get_order(self, _: Union[bytes, bytearray]) -> int: # We do not handle characters based on the original encoding string, # but convert this encoding string to a number, here called order. # This allows multiple encodings of a language to share one frequency @@ -128,13 +130,13 @@ class CharDistributionAnalysis: class EUCTWDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER self._table_size = EUCTW_TABLE_SIZE self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-TW encoding, we are interested # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -146,13 +148,13 @@ class EUCTWDistributionAnalysis(CharDistributionAnalysis): class EUCKRDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-KR encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -164,13 +166,13 @@ class EUCKRDistributionAnalysis(CharDistributionAnalysis): class JOHABDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: first_char = byte_str[0] if 0x88 <= first_char < 0xD4: code = first_char * 256 + byte_str[1] @@ -179,13 +181,13 @@ class JOHABDistributionAnalysis(CharDistributionAnalysis): class GB2312DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER self._table_size = GB2312_TABLE_SIZE self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for GB2312 encoding, we are interested # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe @@ -197,13 +199,13 @@ class GB2312DistributionAnalysis(CharDistributionAnalysis): class Big5DistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER self._table_size = BIG5_TABLE_SIZE self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for big5 encoding, we are interested # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe @@ -217,13 +219,13 @@ class Big5DistributionAnalysis(CharDistributionAnalysis): class SJISDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for sjis encoding, we are interested # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe @@ -242,13 +244,13 @@ class SJISDistributionAnalysis(CharDistributionAnalysis): class EUCJPDistributionAnalysis(CharDistributionAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> int: # for euc-JP encoding, we are interested # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe diff --git a/libs/chardet/charsetgroupprober.py b/libs/chardet/charsetgroupprober.py index 778ff332b..6def56b4a 100644 --- a/libs/chardet/charsetgroupprober.py +++ b/libs/chardet/charsetgroupprober.py @@ -25,29 +25,30 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Optional, Union + from .charsetprober import CharSetProber -from .enums import ProbingState +from .enums import LanguageFilter, ProbingState class CharSetGroupProber(CharSetProber): - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self._active_num = 0 - self.probers = [] - self._best_guess_prober = None + self.probers: List[CharSetProber] = [] + self._best_guess_prober: Optional[CharSetProber] = None - def reset(self): + def reset(self) -> None: super().reset() self._active_num = 0 for prober in self.probers: - if prober: - prober.reset() - prober.active = True - self._active_num += 1 + prober.reset() + prober.active = True + self._active_num += 1 self._best_guess_prober = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: @@ -55,17 +56,15 @@ class CharSetGroupProber(CharSetProber): return self._best_guess_prober.charset_name @property - def language(self): + def language(self) -> Optional[str]: if not self._best_guess_prober: self.get_confidence() if not self._best_guess_prober: return None return self._best_guess_prober.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for prober in self.probers: - if not prober: - continue if not prober.active: continue state = prober.feed(byte_str) @@ -83,7 +82,7 @@ class CharSetGroupProber(CharSetProber): return self.state return self.state - def get_confidence(self): + def get_confidence(self) -> float: state = self.state if state == ProbingState.FOUND_IT: return 0.99 @@ -92,8 +91,6 @@ class CharSetGroupProber(CharSetProber): best_conf = 0.0 self._best_guess_prober = None for prober in self.probers: - if not prober: - continue if not prober.active: self.logger.debug("%s not active", prober.charset_name) continue diff --git a/libs/chardet/charsetprober.py b/libs/chardet/charsetprober.py index 9f1afd999..a103ca113 100644 --- a/libs/chardet/charsetprober.py +++ b/libs/chardet/charsetprober.py @@ -28,8 +28,9 @@ import logging import re +from typing import Optional, Union -from .enums import ProbingState +from .enums import LanguageFilter, ProbingState INTERNATIONAL_WORDS_PATTERN = re.compile( b"[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?" @@ -40,35 +41,40 @@ class CharSetProber: SHORTCUT_THRESHOLD = 0.95 - def __init__(self, lang_filter=None): - self._state = None + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: + self._state = ProbingState.DETECTING + self.active = True self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - def reset(self): + def reset(self) -> None: self._state = ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return None - def feed(self, byte_str): + @property + def language(self) -> Optional[str]: + raise NotImplementedError + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: raise NotImplementedError @property - def state(self): + def state(self) -> ProbingState: return self._state - def get_confidence(self): + def get_confidence(self) -> float: return 0.0 @staticmethod - def filter_high_byte_only(buf): + def filter_high_byte_only(buf: Union[bytes, bytearray]) -> bytes: buf = re.sub(b"([\x00-\x7F])+", b" ", buf) return buf @staticmethod - def filter_international_words(buf): + def filter_international_words(buf: Union[bytes, bytearray]) -> bytearray: """ We define three types of bytes: alphabet: english alphabets [a-zA-Z] @@ -102,7 +108,7 @@ class CharSetProber: return filtered @staticmethod - def remove_xml_tags(buf): + def remove_xml_tags(buf: Union[bytes, bytearray]) -> bytes: """ Returns a copy of ``buf`` that retains only the sequences of English alphabet and high byte characters that are not between <> characters. @@ -117,10 +123,13 @@ class CharSetProber: for curr, buf_char in enumerate(buf): # Check if we're coming out of or entering an XML tag - if buf_char == b">": + + # https://github.com/python/typeshed/issues/8182 + if buf_char == b">": # type: ignore[comparison-overlap] prev = curr + 1 in_tag = False - elif buf_char == b"<": + # https://github.com/python/typeshed/issues/8182 + elif buf_char == b"<": # type: ignore[comparison-overlap] if curr > prev and not in_tag: # Keep everything after last non-extended-ASCII, # non-alphabetic character diff --git a/libs/chardet/cli/chardetect.py b/libs/chardet/cli/chardetect.py index 7926fa37e..43f6e144f 100644 --- a/libs/chardet/cli/chardetect.py +++ b/libs/chardet/cli/chardetect.py @@ -15,12 +15,18 @@ If no paths are provided, it takes its input from stdin. import argparse import sys +from typing import Iterable, List, Optional from .. import __version__ from ..universaldetector import UniversalDetector -def description_of(lines, name="stdin"): +def description_of( + lines: Iterable[bytes], + name: str = "stdin", + minimal: bool = False, + should_rename_legacy: bool = False, +) -> Optional[str]: """ Return a string describing the probable encoding of a file or list of strings. @@ -29,8 +35,11 @@ def description_of(lines, name="stdin"): :type lines: Iterable of bytes :param name: Name of file or collection of lines :type name: str + :param should_rename_legacy: Should we rename legacy encodings to + their more modern equivalents? + :type should_rename_legacy: ``bool`` """ - u = UniversalDetector() + u = UniversalDetector(should_rename_legacy=should_rename_legacy) for line in lines: line = bytearray(line) u.feed(line) @@ -39,12 +48,14 @@ def description_of(lines, name="stdin"): break u.close() result = u.result + if minimal: + return result["encoding"] if result["encoding"]: return f'{name}: {result["encoding"]} with confidence {result["confidence"]}' return f"{name}: no result" -def main(argv=None): +def main(argv: Optional[List[str]] = None) -> None: """ Handles command line arguments and gets things started. @@ -54,17 +65,28 @@ def main(argv=None): """ # Get command line arguments parser = argparse.ArgumentParser( - description="Takes one or more file paths and reports their detected \ - encodings" + description=( + "Takes one or more file paths and reports their detected encodings" + ) ) parser.add_argument( "input", - help="File whose encoding we would like to determine. \ - (default: stdin)", + help="File whose encoding we would like to determine. (default: stdin)", type=argparse.FileType("rb"), nargs="*", default=[sys.stdin.buffer], ) + parser.add_argument( + "--minimal", + help="Print only the encoding to standard output", + action="store_true", + ) + parser.add_argument( + "-l", + "--legacy", + help="Rename legacy encodings to more modern ones.", + action="store_true", + ) parser.add_argument( "--version", action="version", version=f"%(prog)s {__version__}" ) @@ -79,7 +101,11 @@ def main(argv=None): "--help\n", file=sys.stderr, ) - print(description_of(f, f.name)) + print( + description_of( + f, f.name, minimal=args.minimal, should_rename_legacy=args.legacy + ) + ) if __name__ == "__main__": diff --git a/libs/chardet/codingstatemachine.py b/libs/chardet/codingstatemachine.py index d3e3e825d..8ed4a8773 100644 --- a/libs/chardet/codingstatemachine.py +++ b/libs/chardet/codingstatemachine.py @@ -27,6 +27,7 @@ import logging +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState @@ -53,18 +54,19 @@ class CodingStateMachine: encoding from consideration from here on. """ - def __init__(self, sm): + def __init__(self, sm: CodingStateMachineDict) -> None: self._model = sm self._curr_byte_pos = 0 self._curr_char_len = 0 - self._curr_state = None + self._curr_state = MachineState.START + self.active = True self.logger = logging.getLogger(__name__) self.reset() - def reset(self): + def reset(self) -> None: self._curr_state = MachineState.START - def next_state(self, c): + def next_state(self, c: int) -> int: # for each byte we get its class # if it is first byte, we also get byte length byte_class = self._model["class_table"][c] @@ -77,12 +79,12 @@ class CodingStateMachine: self._curr_byte_pos += 1 return self._curr_state - def get_current_charlen(self): + def get_current_charlen(self) -> int: return self._curr_char_len - def get_coding_state_machine(self): + def get_coding_state_machine(self) -> str: return self._model["name"] @property - def language(self): + def language(self) -> str: return self._model["language"] diff --git a/libs/chardet/codingstatemachinedict.py b/libs/chardet/codingstatemachinedict.py new file mode 100644 index 000000000..7a3c4c7e3 --- /dev/null +++ b/libs/chardet/codingstatemachinedict.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING, Tuple + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class CodingStateMachineDict(TypedDict, total=False): + class_table: Tuple[int, ...] + class_factor: int + state_table: Tuple[int, ...] + char_len_table: Tuple[int, ...] + name: str + language: str # Optional key + +else: + CodingStateMachineDict = dict diff --git a/libs/chardet/cp949prober.py b/libs/chardet/cp949prober.py index 28a1f3dbb..fa7307ed8 100644 --- a/libs/chardet/cp949prober.py +++ b/libs/chardet/cp949prober.py @@ -32,7 +32,7 @@ from .mbcssm import CP949_SM_MODEL class CP949Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(CP949_SM_MODEL) # NOTE: CP949 is a superset of EUC-KR, so the distribution should be @@ -41,9 +41,9 @@ class CP949Prober(MultiByteCharSetProber): self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "CP949" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/libs/chardet/enums.py b/libs/chardet/enums.py index 32a77e76c..5e3e19823 100644 --- a/libs/chardet/enums.py +++ b/libs/chardet/enums.py @@ -4,6 +4,8 @@ All of the Enums that are used throughout the chardet package. :author: Dan Blanchard (dan.blanchard@gmail.com) """ +from enum import Enum, Flag + class InputState: """ @@ -15,12 +17,13 @@ class InputState: HIGH_BYTE = 2 -class LanguageFilter: +class LanguageFilter(Flag): """ This enum represents the different language filters we can apply to a ``UniversalDetector``. """ + NONE = 0x00 CHINESE_SIMPLIFIED = 0x01 CHINESE_TRADITIONAL = 0x02 JAPANESE = 0x04 @@ -31,7 +34,7 @@ class LanguageFilter: CJK = CHINESE | JAPANESE | KOREAN -class ProbingState: +class ProbingState(Enum): """ This enum represents the different states a prober can be in. """ @@ -62,7 +65,7 @@ class SequenceLikelihood: POSITIVE = 3 @classmethod - def get_num_categories(cls): + def get_num_categories(cls) -> int: """:returns: The number of likelihood categories in the enum.""" return 4 diff --git a/libs/chardet/escprober.py b/libs/chardet/escprober.py index d9926115d..fd713830d 100644 --- a/libs/chardet/escprober.py +++ b/libs/chardet/escprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine from .enums import LanguageFilter, MachineState, ProbingState @@ -43,7 +45,7 @@ class EscCharSetProber(CharSetProber): identify these encodings. """ - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self.coding_sm = [] if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: @@ -53,17 +55,15 @@ class EscCharSetProber(CharSetProber): self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL)) if self.lang_filter & LanguageFilter.KOREAN: self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL)) - self.active_sm_count = None - self._detected_charset = None - self._detected_language = None - self._state = None + self.active_sm_count = 0 + self._detected_charset: Optional[str] = None + self._detected_language: Optional[str] = None + self._state = ProbingState.DETECTING self.reset() - def reset(self): + def reset(self) -> None: super().reset() for coding_sm in self.coding_sm: - if not coding_sm: - continue coding_sm.active = True coding_sm.reset() self.active_sm_count = len(self.coding_sm) @@ -71,20 +71,20 @@ class EscCharSetProber(CharSetProber): self._detected_language = None @property - def charset_name(self): + def charset_name(self) -> Optional[str]: return self._detected_charset @property - def language(self): + def language(self) -> Optional[str]: return self._detected_language - def get_confidence(self): + def get_confidence(self) -> float: return 0.99 if self._detected_charset else 0.00 - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: for coding_sm in self.coding_sm: - if not coding_sm or not coding_sm.active: + if not coding_sm.active: continue coding_state = coding_sm.next_state(c) if coding_state == MachineState.ERROR: diff --git a/libs/chardet/escsm.py b/libs/chardet/escsm.py index 3aa0f4d96..11d4adf77 100644 --- a/libs/chardet/escsm.py +++ b/libs/chardet/escsm.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # fmt: off @@ -75,7 +76,7 @@ MachineState.ITS_ME, MachineState.ITS_ME, MachineState.ERROR, MachineState.ERROR HZ_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -HZ_SM_MODEL = { +HZ_SM_MODEL: CodingStateMachineDict = { "class_table": HZ_CLS, "class_factor": 6, "state_table": HZ_ST, @@ -134,7 +135,7 @@ ISO2022CN_ST = ( ISO2022CN_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022CN_SM_MODEL = { +ISO2022CN_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022CN_CLS, "class_factor": 9, "state_table": ISO2022CN_ST, @@ -194,7 +195,7 @@ ISO2022JP_ST = ( ISO2022JP_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0, 0, 0, 0, 0) -ISO2022JP_SM_MODEL = { +ISO2022JP_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022JP_CLS, "class_factor": 10, "state_table": ISO2022JP_ST, @@ -250,7 +251,7 @@ ISO2022KR_ST = ( ISO2022KR_CHAR_LEN_TABLE = (0, 0, 0, 0, 0, 0) -ISO2022KR_SM_MODEL = { +ISO2022KR_SM_MODEL: CodingStateMachineDict = { "class_table": ISO2022KR_CLS, "class_factor": 6, "state_table": ISO2022KR_ST, diff --git a/libs/chardet/eucjpprober.py b/libs/chardet/eucjpprober.py index abf2e66e2..39487f409 100644 --- a/libs/chardet/eucjpprober.py +++ b/libs/chardet/eucjpprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .chardistribution import EUCJPDistributionAnalysis from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,29 @@ from .mbcssm import EUCJP_SM_MODEL class EUCJPProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) self.distribution_analyzer = EUCJPDistributionAnalysis() self.context_analyzer = EUCJPContextAnalysis() self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-JP" @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + for i, byte in enumerate(byte_str): # PY3K: byte_str is a byte array, so byte is an int, not a byte coding_state = self.coding_sm.next_state(byte) @@ -89,7 +94,9 @@ class EUCJPProber(MultiByteCharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/libs/chardet/euckrprober.py b/libs/chardet/euckrprober.py index 154a6d216..1fc5de046 100644 --- a/libs/chardet/euckrprober.py +++ b/libs/chardet/euckrprober.py @@ -32,16 +32,16 @@ from .mbcssm import EUCKR_SM_MODEL class EUCKRProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) self.distribution_analyzer = EUCKRDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-KR" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/libs/chardet/euctwprober.py b/libs/chardet/euctwprober.py index ca10a23ca..a37ab1899 100644 --- a/libs/chardet/euctwprober.py +++ b/libs/chardet/euctwprober.py @@ -32,16 +32,16 @@ from .mbcssm import EUCTW_SM_MODEL class EUCTWProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) self.distribution_analyzer = EUCTWDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "EUC-TW" @property - def language(self): + def language(self) -> str: return "Taiwan" diff --git a/libs/chardet/gb2312prober.py b/libs/chardet/gb2312prober.py index 251c04295..d423e7311 100644 --- a/libs/chardet/gb2312prober.py +++ b/libs/chardet/gb2312prober.py @@ -32,16 +32,16 @@ from .mbcssm import GB2312_SM_MODEL class GB2312Prober(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) self.distribution_analyzer = GB2312DistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "GB2312" @property - def language(self): + def language(self) -> str: return "Chinese" diff --git a/libs/chardet/hebrewprober.py b/libs/chardet/hebrewprober.py index 3ca634bf3..785d0057b 100644 --- a/libs/chardet/hebrewprober.py +++ b/libs/chardet/hebrewprober.py @@ -25,8 +25,11 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + from .charsetprober import CharSetProber from .enums import ProbingState +from .sbcharsetprober import SingleByteCharSetProber # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -127,6 +130,7 @@ from .enums import ProbingState class HebrewProber(CharSetProber): + SPACE = 0x20 # windows-1255 / ISO-8859-8 code points of interest FINAL_KAF = 0xEA NORMAL_KAF = 0xEB @@ -152,31 +156,35 @@ class HebrewProber(CharSetProber): VISUAL_HEBREW_NAME = "ISO-8859-8" LOGICAL_HEBREW_NAME = "windows-1255" - def __init__(self): + def __init__(self) -> None: super().__init__() - self._final_char_logical_score = None - self._final_char_visual_score = None - self._prev = None - self._before_prev = None - self._logical_prober = None - self._visual_prober = None + self._final_char_logical_score = 0 + self._final_char_visual_score = 0 + self._prev = self.SPACE + self._before_prev = self.SPACE + self._logical_prober: Optional[SingleByteCharSetProber] = None + self._visual_prober: Optional[SingleByteCharSetProber] = None self.reset() - def reset(self): + def reset(self) -> None: self._final_char_logical_score = 0 self._final_char_visual_score = 0 # The two last characters seen in the previous buffer, # mPrev and mBeforePrev are initialized to space in order to simulate # a word delimiter at the beginning of the data - self._prev = " " - self._before_prev = " " + self._prev = self.SPACE + self._before_prev = self.SPACE # These probers are owned by the group prober. - def set_model_probers(self, logical_prober, visual_prober): + def set_model_probers( + self, + logical_prober: SingleByteCharSetProber, + visual_prober: SingleByteCharSetProber, + ) -> None: self._logical_prober = logical_prober self._visual_prober = visual_prober - def is_final(self, c): + def is_final(self, c: int) -> bool: return c in [ self.FINAL_KAF, self.FINAL_MEM, @@ -185,7 +193,7 @@ class HebrewProber(CharSetProber): self.FINAL_TSADI, ] - def is_non_final(self, c): + def is_non_final(self, c: int) -> bool: # The normal Tsadi is not a good Non-Final letter due to words like # 'lechotet' (to chat) containing an apostrophe after the tsadi. This # apostrophe is converted to a space in FilterWithoutEnglishLetters @@ -198,7 +206,7 @@ class HebrewProber(CharSetProber): # since these words are quite rare. return c in [self.NORMAL_KAF, self.NORMAL_MEM, self.NORMAL_NUN, self.NORMAL_PE] - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # Final letter analysis for logical-visual decision. # Look for evidence that the received buffer is either logical Hebrew # or visual Hebrew. @@ -232,9 +240,9 @@ class HebrewProber(CharSetProber): byte_str = self.filter_high_byte_only(byte_str) for cur in byte_str: - if cur == " ": + if cur == self.SPACE: # We stand on a space - a word just ended - if self._before_prev != " ": + if self._before_prev != self.SPACE: # next-to-last char was not a space so self._prev is not a # 1 letter word if self.is_final(self._prev): @@ -247,9 +255,9 @@ class HebrewProber(CharSetProber): else: # Not standing on a space if ( - (self._before_prev == " ") + (self._before_prev == self.SPACE) and (self.is_final(self._prev)) - and (cur != " ") + and (cur != self.SPACE) ): # case (3) [-2:space][-1:final letter][cur:not space] self._final_char_visual_score += 1 @@ -261,7 +269,10 @@ class HebrewProber(CharSetProber): return ProbingState.DETECTING @property - def charset_name(self): + def charset_name(self) -> str: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Make the decision: is it Logical or Visual? # If the final letter score distance is dominant enough, rely on it. finalsub = self._final_char_logical_score - self._final_char_visual_score @@ -289,11 +300,14 @@ class HebrewProber(CharSetProber): return self.LOGICAL_HEBREW_NAME @property - def language(self): + def language(self) -> str: return "Hebrew" @property - def state(self): + def state(self) -> ProbingState: + assert self._logical_prober is not None + assert self._visual_prober is not None + # Remain active as long as any of the model probers are active. if (self._logical_prober.state == ProbingState.NOT_ME) and ( self._visual_prober.state == ProbingState.NOT_ME diff --git a/libs/chardet/johabprober.py b/libs/chardet/johabprober.py index 6f359d193..d7364ba61 100644 --- a/libs/chardet/johabprober.py +++ b/libs/chardet/johabprober.py @@ -32,16 +32,16 @@ from .mbcssm import JOHAB_SM_MODEL class JOHABProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(JOHAB_SM_MODEL) self.distribution_analyzer = JOHABDistributionAnalysis() self.reset() @property - def charset_name(self): + def charset_name(self) -> str: return "Johab" @property - def language(self): + def language(self) -> str: return "Korean" diff --git a/libs/chardet/jpcntx.py b/libs/chardet/jpcntx.py index 7a8e5be06..2f53bdda0 100644 --- a/libs/chardet/jpcntx.py +++ b/libs/chardet/jpcntx.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Tuple, Union # This is hiragana 2-char sequence table, the number in each cell represents its frequency category # fmt: off @@ -123,15 +124,15 @@ class JapaneseContextAnalysis: MAX_REL_THRESHOLD = 1000 MINIMUM_DATA_THRESHOLD = 4 - def __init__(self): - self._total_rel = None - self._rel_sample = None - self._need_to_skip_char_num = None - self._last_char_order = None - self._done = None + def __init__(self) -> None: + self._total_rel = 0 + self._rel_sample: List[int] = [] + self._need_to_skip_char_num = 0 + self._last_char_order = -1 + self._done = False self.reset() - def reset(self): + def reset(self) -> None: self._total_rel = 0 # total sequence received # category counters, each integer counts sequence in its category self._rel_sample = [0] * self.NUM_OF_CATEGORY @@ -143,7 +144,7 @@ class JapaneseContextAnalysis: # been made self._done = False - def feed(self, byte_str, num_bytes): + def feed(self, byte_str: Union[bytes, bytearray], num_bytes: int) -> None: if self._done: return @@ -172,29 +173,29 @@ class JapaneseContextAnalysis: ] += 1 self._last_char_order = order - def got_enough_data(self): + def got_enough_data(self) -> bool: return self._total_rel > self.ENOUGH_REL_THRESHOLD - def get_confidence(self): + def get_confidence(self) -> float: # This is just one way to calculate confidence. It works well for me. if self._total_rel > self.MINIMUM_DATA_THRESHOLD: return (self._total_rel - self._rel_sample[0]) / self._total_rel return self.DONT_KNOW - def get_order(self, _): + def get_order(self, _: Union[bytes, bytearray]) -> Tuple[int, int]: return -1, 1 class SJISContextAnalysis(JapaneseContextAnalysis): - def __init__(self): + def __init__(self) -> None: super().__init__() self._charset_name = "SHIFT_JIS" @property - def charset_name(self): + def charset_name(self) -> str: return self._charset_name - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length @@ -216,7 +217,7 @@ class SJISContextAnalysis(JapaneseContextAnalysis): class EUCJPContextAnalysis(JapaneseContextAnalysis): - def get_order(self, byte_str): + def get_order(self, byte_str: Union[bytes, bytearray]) -> Tuple[int, int]: if not byte_str: return -1, 1 # find out current char's byte length diff --git a/libs/chardet/latin1prober.py b/libs/chardet/latin1prober.py index 241f14ab9..59a01d91b 100644 --- a/libs/chardet/latin1prober.py +++ b/libs/chardet/latin1prober.py @@ -26,6 +26,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + from .charsetprober import CharSetProber from .enums import ProbingState @@ -96,26 +98,26 @@ Latin1ClassModel = ( class Latin1Prober(CharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() - self._last_char_class = None - self._freq_counter = None + self._last_char_class = OTH + self._freq_counter: List[int] = [] self.reset() - def reset(self): + def reset(self) -> None: self._last_char_class = OTH self._freq_counter = [0] * FREQ_CAT_NUM super().reset() @property - def charset_name(self): + def charset_name(self) -> str: return "ISO-8859-1" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: byte_str = self.remove_xml_tags(byte_str) for c in byte_str: char_class = Latin1_CharToClass[c] @@ -128,7 +130,7 @@ class Latin1Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: if self.state == ProbingState.NOT_ME: return 0.01 diff --git a/libs/chardet/macromanprober.py b/libs/chardet/macromanprober.py new file mode 100644 index 000000000..1425d10ec --- /dev/null +++ b/libs/chardet/macromanprober.py @@ -0,0 +1,162 @@ +######################## BEGIN LICENSE BLOCK ######################## +# This code was modified from latin1prober.py by Rob Speer . +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Rob Speer - adapt to MacRoman encoding +# Mark Pilgrim - port to Python +# Shy Shalom - original C code +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA +# 02110-1301 USA +######################### END LICENSE BLOCK ######################### + +from typing import List, Union + +from .charsetprober import CharSetProber +from .enums import ProbingState + +FREQ_CAT_NUM = 4 + +UDF = 0 # undefined +OTH = 1 # other +ASC = 2 # ascii capital letter +ASS = 3 # ascii small letter +ACV = 4 # accent capital vowel +ACO = 5 # accent capital other +ASV = 6 # accent small vowel +ASO = 7 # accent small other +ODD = 8 # character that is unlikely to appear +CLASS_NUM = 9 # total classes + +# The change from Latin1 is that we explicitly look for extended characters +# that are infrequently-occurring symbols, and consider them to always be +# improbable. This should let MacRoman get out of the way of more likely +# encodings in most situations. + +# fmt: off +MacRoman_CharToClass = ( + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F + ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87 + ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97 + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7 + OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 + OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF + OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7 + OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7 + ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF + OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7 + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF + ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7 + ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF +) + +# 0 : illegal +# 1 : very unlikely +# 2 : normal +# 3 : very likely +MacRomanClassModel = ( +# UDF OTH ASC ASS ACV ACO ASV ASO ODD + 0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF + 0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC + 0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS + 0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO + 0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV + 0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO + 0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD +) +# fmt: on + + +class MacRomanProber(CharSetProber): + def __init__(self) -> None: + super().__init__() + self._last_char_class = OTH + self._freq_counter: List[int] = [] + self.reset() + + def reset(self) -> None: + self._last_char_class = OTH + self._freq_counter = [0] * FREQ_CAT_NUM + + # express the prior that MacRoman is a somewhat rare encoding; + # this can be done by starting out in a slightly improbable state + # that must be overcome + self._freq_counter[2] = 10 + + super().reset() + + @property + def charset_name(self) -> str: + return "MacRoman" + + @property + def language(self) -> str: + return "" + + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + byte_str = self.remove_xml_tags(byte_str) + for c in byte_str: + char_class = MacRoman_CharToClass[c] + freq = MacRomanClassModel[(self._last_char_class * CLASS_NUM) + char_class] + if freq == 0: + self._state = ProbingState.NOT_ME + break + self._freq_counter[freq] += 1 + self._last_char_class = char_class + + return self.state + + def get_confidence(self) -> float: + if self.state == ProbingState.NOT_ME: + return 0.01 + + total = sum(self._freq_counter) + confidence = ( + 0.0 + if total < 0.01 + else (self._freq_counter[3] - self._freq_counter[1] * 20.0) / total + ) + confidence = max(confidence, 0.0) + # lower the confidence of MacRoman so that other more accurate + # detector can take priority. + confidence *= 0.73 + return confidence diff --git a/libs/chardet/mbcharsetprober.py b/libs/chardet/mbcharsetprober.py index bf96ad5d4..666307e8f 100644 --- a/libs/chardet/mbcharsetprober.py +++ b/libs/chardet/mbcharsetprober.py @@ -27,8 +27,12 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Optional, Union + +from .chardistribution import CharDistributionAnalysis from .charsetprober import CharSetProber -from .enums import MachineState, ProbingState +from .codingstatemachine import CodingStateMachine +from .enums import LanguageFilter, MachineState, ProbingState class MultiByteCharSetProber(CharSetProber): @@ -36,29 +40,24 @@ class MultiByteCharSetProber(CharSetProber): MultiByteCharSetProber """ - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) - self.distribution_analyzer = None - self.coding_sm = None - self._last_char = [0, 0] + self.distribution_analyzer: Optional[CharDistributionAnalysis] = None + self.coding_sm: Optional[CodingStateMachine] = None + self._last_char = bytearray(b"\0\0") - def reset(self): + def reset(self) -> None: super().reset() if self.coding_sm: self.coding_sm.reset() if self.distribution_analyzer: self.distribution_analyzer.reset() - self._last_char = [0, 0] + self._last_char = bytearray(b"\0\0") - @property - def charset_name(self): - raise NotImplementedError + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None - @property - def language(self): - raise NotImplementedError - - def feed(self, byte_str): for i, byte in enumerate(byte_str): coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: @@ -91,5 +90,6 @@ class MultiByteCharSetProber(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None return self.distribution_analyzer.get_confidence() diff --git a/libs/chardet/mbcsgroupprober.py b/libs/chardet/mbcsgroupprober.py index 94488360c..6cb9cc7b3 100644 --- a/libs/chardet/mbcsgroupprober.py +++ b/libs/chardet/mbcsgroupprober.py @@ -30,6 +30,7 @@ from .big5prober import Big5Prober from .charsetgroupprober import CharSetGroupProber from .cp949prober import CP949Prober +from .enums import LanguageFilter from .eucjpprober import EUCJPProber from .euckrprober import EUCKRProber from .euctwprober import EUCTWProber @@ -40,7 +41,7 @@ from .utf8prober import UTF8Prober class MBCSGroupProber(CharSetGroupProber): - def __init__(self, lang_filter=None): + def __init__(self, lang_filter: LanguageFilter = LanguageFilter.NONE) -> None: super().__init__(lang_filter=lang_filter) self.probers = [ UTF8Prober(), diff --git a/libs/chardet/mbcssm.py b/libs/chardet/mbcssm.py index d3b9c4b75..7bbe97e66 100644 --- a/libs/chardet/mbcssm.py +++ b/libs/chardet/mbcssm.py @@ -25,6 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from .codingstatemachinedict import CodingStateMachineDict from .enums import MachineState # BIG5 @@ -74,7 +75,7 @@ BIG5_ST = ( BIG5_CHAR_LEN_TABLE = (0, 1, 1, 2, 0) -BIG5_SM_MODEL = { +BIG5_SM_MODEL: CodingStateMachineDict = { "class_table": BIG5_CLS, "class_factor": 5, "state_table": BIG5_ST, @@ -117,7 +118,7 @@ CP949_ST = ( CP949_CHAR_LEN_TABLE = (0, 1, 2, 0, 1, 1, 2, 2, 0, 2) -CP949_SM_MODEL = { +CP949_SM_MODEL: CodingStateMachineDict = { "class_table": CP949_CLS, "class_factor": 10, "state_table": CP949_ST, @@ -173,7 +174,7 @@ EUCJP_ST = ( EUCJP_CHAR_LEN_TABLE = (2, 2, 2, 3, 1, 0) -EUCJP_SM_MODEL = { +EUCJP_SM_MODEL: CodingStateMachineDict = { "class_table": EUCJP_CLS, "class_factor": 6, "state_table": EUCJP_ST, @@ -226,7 +227,7 @@ EUCKR_ST = ( EUCKR_CHAR_LEN_TABLE = (0, 1, 2, 0) -EUCKR_SM_MODEL = { +EUCKR_SM_MODEL: CodingStateMachineDict = { "class_table": EUCKR_CLS, "class_factor": 4, "state_table": EUCKR_ST, @@ -283,7 +284,7 @@ JOHAB_ST = ( JOHAB_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 0, 0, 2, 2, 2) -JOHAB_SM_MODEL = { +JOHAB_SM_MODEL: CodingStateMachineDict = { "class_table": JOHAB_CLS, "class_factor": 10, "state_table": JOHAB_ST, @@ -340,7 +341,7 @@ EUCTW_ST = ( EUCTW_CHAR_LEN_TABLE = (0, 0, 1, 2, 2, 2, 3) -EUCTW_SM_MODEL = { +EUCTW_SM_MODEL: CodingStateMachineDict = { "class_table": EUCTW_CLS, "class_factor": 7, "state_table": EUCTW_ST, @@ -402,7 +403,7 @@ GB2312_ST = ( # 2 here. GB2312_CHAR_LEN_TABLE = (0, 1, 1, 1, 1, 1, 2) -GB2312_SM_MODEL = { +GB2312_SM_MODEL: CodingStateMachineDict = { "class_table": GB2312_CLS, "class_factor": 7, "state_table": GB2312_ST, @@ -458,7 +459,7 @@ SJIS_ST = ( SJIS_CHAR_LEN_TABLE = (0, 1, 1, 2, 0, 0) -SJIS_SM_MODEL = { +SJIS_SM_MODEL: CodingStateMachineDict = { "class_table": SJIS_CLS, "class_factor": 6, "state_table": SJIS_ST, @@ -516,7 +517,7 @@ UCS2BE_ST = ( UCS2BE_CHAR_LEN_TABLE = (2, 2, 2, 0, 2, 2) -UCS2BE_SM_MODEL = { +UCS2BE_SM_MODEL: CodingStateMachineDict = { "class_table": UCS2BE_CLS, "class_factor": 6, "state_table": UCS2BE_ST, @@ -574,7 +575,7 @@ UCS2LE_ST = ( UCS2LE_CHAR_LEN_TABLE = (2, 2, 2, 2, 2, 2) -UCS2LE_SM_MODEL = { +UCS2LE_SM_MODEL: CodingStateMachineDict = { "class_table": UCS2LE_CLS, "class_factor": 6, "state_table": UCS2LE_ST, @@ -651,7 +652,7 @@ UTF8_ST = ( UTF8_CHAR_LEN_TABLE = (0, 1, 0, 0, 0, 0, 2, 3, 3, 3, 4, 4, 5, 5, 6, 6) -UTF8_SM_MODEL = { +UTF8_SM_MODEL: CodingStateMachineDict = { "class_table": UTF8_CLS, "class_factor": 16, "state_table": UTF8_ST, diff --git a/libs/chardet/metadata/languages.py b/libs/chardet/metadata/languages.py index 1d37884c3..eb40c5f0c 100644 --- a/libs/chardet/metadata/languages.py +++ b/libs/chardet/metadata/languages.py @@ -6,6 +6,7 @@ This code is based on the language metadata from the uchardet project. """ from string import ascii_letters +from typing import List, Optional # TODO: Add Ukrainian (KOI8-U) @@ -33,13 +34,13 @@ class Language: def __init__( self, - name=None, - iso_code=None, - use_ascii=True, - charsets=None, - alphabet=None, - wiki_start_pages=None, - ): + name: Optional[str] = None, + iso_code: Optional[str] = None, + use_ascii: bool = True, + charsets: Optional[List[str]] = None, + alphabet: Optional[str] = None, + wiki_start_pages: Optional[List[str]] = None, + ) -> None: super().__init__() self.name = name self.iso_code = iso_code @@ -55,7 +56,7 @@ class Language: self.alphabet = "".join(sorted(set(alphabet))) if alphabet else None self.wiki_start_pages = wiki_start_pages - def __repr__(self): + def __repr__(self) -> str: param_str = ", ".join( f"{k}={v!r}" for k, v in self.__dict__.items() if not k.startswith("_") ) @@ -103,7 +104,7 @@ LANGUAGES = { name="Danish", iso_code="da", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="æøåÆØÅ", wiki_start_pages=["Forside"], ), @@ -111,8 +112,8 @@ LANGUAGES = { name="German", iso_code="de", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], - alphabet="äöüßÄÖÜ", + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], + alphabet="äöüßẞÄÖÜ", wiki_start_pages=["Wikipedia:Hauptseite"], ), "Greek": Language( @@ -127,7 +128,7 @@ LANGUAGES = { name="English", iso_code="en", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], wiki_start_pages=["Main_Page"], ), "Esperanto": Language( @@ -143,7 +144,7 @@ LANGUAGES = { name="Spanish", iso_code="es", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ñáéíóúüÑÁÉÍÓÚÜ", wiki_start_pages=["Wikipedia:Portada"], ), @@ -161,7 +162,7 @@ LANGUAGES = { name="Finnish", iso_code="fi", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÅÄÖŠŽåäöšž", wiki_start_pages=["Wikipedia:Etusivu"], ), @@ -169,7 +170,7 @@ LANGUAGES = { name="French", iso_code="fr", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ", wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"], ), @@ -203,7 +204,7 @@ LANGUAGES = { name="Italian", iso_code="it", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÀÈÉÌÒÓÙàèéìòóù", wiki_start_pages=["Pagina_principale"], ), @@ -237,7 +238,7 @@ LANGUAGES = { name="Dutch", iso_code="nl", use_ascii=True, - charsets=["ISO-8859-1", "WINDOWS-1252"], + charsets=["ISO-8859-1", "WINDOWS-1252", "MacRoman"], wiki_start_pages=["Hoofdpagina"], ), "Polish": Language( @@ -253,7 +254,7 @@ LANGUAGES = { name="Portuguese", iso_code="pt", use_ascii=True, - charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"], + charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252", "MacRoman"], alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú", wiki_start_pages=["Wikipédia:Página_principal"], ), diff --git a/libs/chardet/py.typed b/libs/chardet/py.typed new file mode 100644 index 000000000..e69de29bb diff --git a/libs/chardet/resultdict.py b/libs/chardet/resultdict.py new file mode 100644 index 000000000..7d36e64c4 --- /dev/null +++ b/libs/chardet/resultdict.py @@ -0,0 +1,16 @@ +from typing import TYPE_CHECKING, Optional + +if TYPE_CHECKING: + # TypedDict was introduced in Python 3.8. + # + # TODO: Remove the else block and TYPE_CHECKING check when dropping support + # for Python 3.7. + from typing import TypedDict + + class ResultDict(TypedDict): + encoding: Optional[str] + confidence: float + language: Optional[str] + +else: + ResultDict = dict diff --git a/libs/chardet/sbcharsetprober.py b/libs/chardet/sbcharsetprober.py index 31d70e154..0ffbcdd2c 100644 --- a/libs/chardet/sbcharsetprober.py +++ b/libs/chardet/sbcharsetprober.py @@ -26,23 +26,20 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from collections import namedtuple +from typing import Dict, List, NamedTuple, Optional, Union from .charsetprober import CharSetProber from .enums import CharacterCategory, ProbingState, SequenceLikelihood -SingleByteCharSetModel = namedtuple( - "SingleByteCharSetModel", - [ - "charset_name", - "language", - "char_to_order_map", - "language_model", - "typical_positive_ratio", - "keep_ascii_letters", - "alphabet", - ], -) + +class SingleByteCharSetModel(NamedTuple): + charset_name: str + language: str + char_to_order_map: Dict[int, int] + language_model: Dict[int, Dict[int, int]] + typical_positive_ratio: float + keep_ascii_letters: bool + alphabet: str class SingleByteCharSetProber(CharSetProber): @@ -51,22 +48,27 @@ class SingleByteCharSetProber(CharSetProber): POSITIVE_SHORTCUT_THRESHOLD = 0.95 NEGATIVE_SHORTCUT_THRESHOLD = 0.05 - def __init__(self, model, is_reversed=False, name_prober=None): + def __init__( + self, + model: SingleByteCharSetModel, + is_reversed: bool = False, + name_prober: Optional[CharSetProber] = None, + ) -> None: super().__init__() self._model = model # TRUE if we need to reverse every pair in the model lookup self._reversed = is_reversed # Optional auxiliary prober for name decision self._name_prober = name_prober - self._last_order = None - self._seq_counters = None - self._total_seqs = None - self._total_char = None - self._control_char = None - self._freq_char = None + self._last_order = 255 + self._seq_counters: List[int] = [] + self._total_seqs = 0 + self._total_char = 0 + self._control_char = 0 + self._freq_char = 0 self.reset() - def reset(self): + def reset(self) -> None: super().reset() # char order of last character self._last_order = 255 @@ -78,18 +80,18 @@ class SingleByteCharSetProber(CharSetProber): self._freq_char = 0 @property - def charset_name(self): + def charset_name(self) -> Optional[str]: if self._name_prober: return self._name_prober.charset_name return self._model.charset_name @property - def language(self): + def language(self) -> Optional[str]: if self._name_prober: return self._name_prober.language return self._model.language - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: # TODO: Make filter_international_words keep things in self.alphabet if not self._model.keep_ascii_letters: byte_str = self.filter_international_words(byte_str) @@ -139,7 +141,7 @@ class SingleByteCharSetProber(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: r = 0.01 if self._total_seqs > 0: r = ( diff --git a/libs/chardet/sbcsgroupprober.py b/libs/chardet/sbcsgroupprober.py index cad001cb1..890ae8465 100644 --- a/libs/chardet/sbcsgroupprober.py +++ b/libs/chardet/sbcsgroupprober.py @@ -48,7 +48,7 @@ from .sbcharsetprober import SingleByteCharSetProber class SBCSGroupProber(CharSetGroupProber): - def __init__(self): + def __init__(self) -> None: super().__init__() hebrew_prober = HebrewProber() logical_hebrew_prober = SingleByteCharSetProber( diff --git a/libs/chardet/sjisprober.py b/libs/chardet/sjisprober.py index 3bcbdb71d..91df07796 100644 --- a/libs/chardet/sjisprober.py +++ b/libs/chardet/sjisprober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .chardistribution import SJISDistributionAnalysis from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,29 @@ from .mbcssm import SJIS_SM_MODEL class SJISProber(MultiByteCharSetProber): - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) self.distribution_analyzer = SJISDistributionAnalysis() self.context_analyzer = SJISContextAnalysis() self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.context_analyzer.reset() @property - def charset_name(self): + def charset_name(self) -> str: return self.context_analyzer.charset_name @property - def language(self): + def language(self) -> str: return "Japanese" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: + assert self.coding_sm is not None + assert self.distribution_analyzer is not None + for i, byte in enumerate(byte_str): coding_state = self.coding_sm.next_state(byte) if coding_state == MachineState.ERROR: @@ -92,7 +97,9 @@ class SJISProber(MultiByteCharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: + assert self.distribution_analyzer is not None + context_conf = self.context_analyzer.get_confidence() distrib_conf = self.distribution_analyzer.get_confidence() return max(context_conf, distrib_conf) diff --git a/libs/chardet/universaldetector.py b/libs/chardet/universaldetector.py index 22fcf8290..30c441dc2 100644 --- a/libs/chardet/universaldetector.py +++ b/libs/chardet/universaldetector.py @@ -39,12 +39,16 @@ class a user of ``chardet`` should use. import codecs import logging import re +from typing import List, Optional, Union from .charsetgroupprober import CharSetGroupProber +from .charsetprober import CharSetProber from .enums import InputState, LanguageFilter, ProbingState from .escprober import EscCharSetProber from .latin1prober import Latin1Prober +from .macromanprober import MacRomanProber from .mbcsgroupprober import MBCSGroupProber +from .resultdict import ResultDict from .sbcsgroupprober import SBCSGroupProber from .utf1632prober import UTF1632Prober @@ -80,34 +84,55 @@ class UniversalDetector: "iso-8859-9": "Windows-1254", "iso-8859-13": "Windows-1257", } + # Based on https://encoding.spec.whatwg.org/#names-and-labels + # but altered to match Python names for encodings and remove mappings + # that break tests. + LEGACY_MAP = { + "ascii": "Windows-1252", + "iso-8859-1": "Windows-1252", + "tis-620": "ISO-8859-11", + "iso-8859-9": "Windows-1254", + "gb2312": "GB18030", + "euc-kr": "CP949", + "utf-16le": "UTF-16", + } - def __init__(self, lang_filter=LanguageFilter.ALL): - self._esc_charset_prober = None - self._utf1632_prober = None - self._charset_probers = [] - self.result = None - self.done = None - self._got_data = None - self._input_state = None - self._last_char = None + def __init__( + self, + lang_filter: LanguageFilter = LanguageFilter.ALL, + should_rename_legacy: bool = False, + ) -> None: + self._esc_charset_prober: Optional[EscCharSetProber] = None + self._utf1632_prober: Optional[UTF1632Prober] = None + self._charset_probers: List[CharSetProber] = [] + self.result: ResultDict = { + "encoding": None, + "confidence": 0.0, + "language": None, + } + self.done = False + self._got_data = False + self._input_state = InputState.PURE_ASCII + self._last_char = b"" self.lang_filter = lang_filter self.logger = logging.getLogger(__name__) - self._has_win_bytes = None + self._has_win_bytes = False + self.should_rename_legacy = should_rename_legacy self.reset() @property - def input_state(self): + def input_state(self) -> int: return self._input_state @property - def has_win_bytes(self): + def has_win_bytes(self) -> bool: return self._has_win_bytes @property - def charset_probers(self): + def charset_probers(self) -> List[CharSetProber]: return self._charset_probers - def reset(self): + def reset(self) -> None: """ Reset the UniversalDetector and all of its probers back to their initial states. This is called by ``__init__``, so you only need to @@ -126,7 +151,7 @@ class UniversalDetector: for prober in self._charset_probers: prober.reset() - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> None: """ Takes a chunk of a document and feeds it through all of the relevant charset probers. @@ -166,6 +191,7 @@ class UniversalDetector: elif byte_str.startswith(b"\xFE\xFF\x00\x00"): # FE FF 00 00 UCS-4, unusual octet order BOM (3412) self.result = { + # TODO: This encoding is not supported by Python. Should remove? "encoding": "X-ISO-10646-UCS-4-3412", "confidence": 1.0, "language": "", @@ -173,6 +199,7 @@ class UniversalDetector: elif byte_str.startswith(b"\x00\x00\xFF\xFE"): # 00 00 FF FE UCS-4, unusual octet order BOM (2143) self.result = { + # TODO: This encoding is not supported by Python. Should remove? "encoding": "X-ISO-10646-UCS-4-2143", "confidence": 1.0, "language": "", @@ -242,6 +269,7 @@ class UniversalDetector: if self.lang_filter & LanguageFilter.NON_CJK: self._charset_probers.append(SBCSGroupProber()) self._charset_probers.append(Latin1Prober()) + self._charset_probers.append(MacRomanProber()) for prober in self._charset_probers: if prober.feed(byte_str) == ProbingState.FOUND_IT: self.result = { @@ -254,7 +282,7 @@ class UniversalDetector: if self.WIN_BYTE_DETECTOR.search(byte_str): self._has_win_bytes = True - def close(self): + def close(self) -> ResultDict: """ Stop analyzing the current document and come up with a final prediction. @@ -288,7 +316,8 @@ class UniversalDetector: max_prober = prober if max_prober and (max_prober_confidence > self.MINIMUM_THRESHOLD): charset_name = max_prober.charset_name - lower_charset_name = max_prober.charset_name.lower() + assert charset_name is not None + lower_charset_name = charset_name.lower() confidence = max_prober.get_confidence() # Use Windows encoding name instead of ISO-8859 if we saw any # extra Windows-specific bytes @@ -297,6 +326,11 @@ class UniversalDetector: charset_name = self.ISO_WIN_MAP.get( lower_charset_name, charset_name ) + # Rename legacy encodings with superset encodings if asked + if self.should_rename_legacy: + charset_name = self.LEGACY_MAP.get( + (charset_name or "").lower(), charset_name + ) self.result = { "encoding": charset_name, "confidence": confidence, diff --git a/libs/chardet/utf1632prober.py b/libs/chardet/utf1632prober.py index 9fd1580b8..6bdec63d6 100644 --- a/libs/chardet/utf1632prober.py +++ b/libs/chardet/utf1632prober.py @@ -18,6 +18,8 @@ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import List, Union + from .charsetprober import CharSetProber from .enums import ProbingState @@ -36,7 +38,7 @@ class UTF1632Prober(CharSetProber): # a fixed constant ratio of expected zeros or non-zeros in modulo-position. EXPECTED_RATIO = 0.94 - def __init__(self): + def __init__(self) -> None: super().__init__() self.position = 0 self.zeros_at_mod = [0] * 4 @@ -51,7 +53,7 @@ class UTF1632Prober(CharSetProber): self.first_half_surrogate_pair_detected_16le = False self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.position = 0 self.zeros_at_mod = [0] * 4 @@ -66,7 +68,7 @@ class UTF1632Prober(CharSetProber): self.quad = [0, 0, 0, 0] @property - def charset_name(self): + def charset_name(self) -> str: if self.is_likely_utf32be(): return "utf-32be" if self.is_likely_utf32le(): @@ -79,16 +81,16 @@ class UTF1632Prober(CharSetProber): return "utf-16" @property - def language(self): + def language(self) -> str: return "" - def approx_32bit_chars(self): + def approx_32bit_chars(self) -> float: return max(1.0, self.position / 4.0) - def approx_16bit_chars(self): + def approx_16bit_chars(self) -> float: return max(1.0, self.position / 2.0) - def is_likely_utf32be(self): + def is_likely_utf32be(self) -> bool: approx_chars = self.approx_32bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO @@ -98,7 +100,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf32be ) - def is_likely_utf32le(self): + def is_likely_utf32le(self) -> bool: approx_chars = self.approx_32bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO @@ -108,7 +110,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf32le ) - def is_likely_utf16be(self): + def is_likely_utf16be(self) -> bool: approx_chars = self.approx_16bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars @@ -118,7 +120,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf16be ) - def is_likely_utf16le(self): + def is_likely_utf16le(self) -> bool: approx_chars = self.approx_16bit_chars() return approx_chars >= self.MIN_CHARS_FOR_DETECTION and ( (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars @@ -128,7 +130,7 @@ class UTF1632Prober(CharSetProber): and not self.invalid_utf16le ) - def validate_utf32_characters(self, quad): + def validate_utf32_characters(self, quad: List[int]) -> None: """ Validate if the quad of bytes is valid UTF-32. @@ -150,7 +152,7 @@ class UTF1632Prober(CharSetProber): ): self.invalid_utf32le = True - def validate_utf16_characters(self, pair): + def validate_utf16_characters(self, pair: List[int]) -> None: """ Validate if the pair of bytes is valid UTF-16. @@ -182,7 +184,7 @@ class UTF1632Prober(CharSetProber): else: self.invalid_utf16le = True - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: mod4 = self.position % 4 self.quad[mod4] = c @@ -198,7 +200,7 @@ class UTF1632Prober(CharSetProber): return self.state @property - def state(self): + def state(self) -> ProbingState: if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}: # terminal, decided states return self._state @@ -210,7 +212,7 @@ class UTF1632Prober(CharSetProber): self._state = ProbingState.NOT_ME return self._state - def get_confidence(self): + def get_confidence(self) -> float: return ( 0.85 if ( diff --git a/libs/chardet/utf8prober.py b/libs/chardet/utf8prober.py index 3aae09e86..d96354d97 100644 --- a/libs/chardet/utf8prober.py +++ b/libs/chardet/utf8prober.py @@ -25,6 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### +from typing import Union + from .charsetprober import CharSetProber from .codingstatemachine import CodingStateMachine from .enums import MachineState, ProbingState @@ -34,26 +36,26 @@ from .mbcssm import UTF8_SM_MODEL class UTF8Prober(CharSetProber): ONE_CHAR_PROB = 0.5 - def __init__(self): + def __init__(self) -> None: super().__init__() self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) - self._num_mb_chars = None + self._num_mb_chars = 0 self.reset() - def reset(self): + def reset(self) -> None: super().reset() self.coding_sm.reset() self._num_mb_chars = 0 @property - def charset_name(self): + def charset_name(self) -> str: return "utf-8" @property - def language(self): + def language(self) -> str: return "" - def feed(self, byte_str): + def feed(self, byte_str: Union[bytes, bytearray]) -> ProbingState: for c in byte_str: coding_state = self.coding_sm.next_state(c) if coding_state == MachineState.ERROR: @@ -72,7 +74,7 @@ class UTF8Prober(CharSetProber): return self.state - def get_confidence(self): + def get_confidence(self) -> float: unlike = 0.99 if self._num_mb_chars < 6: unlike *= self.ONE_CHAR_PROB**self._num_mb_chars diff --git a/libs/chardet/version.py b/libs/chardet/version.py index a08a06b9a..c5e9d85cd 100644 --- a/libs/chardet/version.py +++ b/libs/chardet/version.py @@ -1,9 +1,9 @@ """ This module exists only to simplify retrieving the version number of chardet -from within setup.py and from chardet subpackages. +from within setuptools and from chardet subpackages. :author: Dan Blanchard (dan.blanchard@gmail.com) """ -__version__ = "5.0.0" +__version__ = "5.1.0" VERSION = __version__.split(".") diff --git a/libs/version.txt b/libs/version.txt index 77ca95e1e..6d746375a 100644 --- a/libs/version.txt +++ b/libs/version.txt @@ -4,7 +4,7 @@ argparse==1.4.0 apprise==1.4.0 apscheduler==3.9.1 attrs==22.1.0 -charamel==1.0.0 +chardet==5.1.0 deep-translator==1.9.1 dogpile.cache==1.1.8 fese==0.1.2 @@ -102,7 +102,6 @@ msgpack==1.0.4 appdirs==1.4.4 babelfish==0.6.0 beautifulsoup4==4.11.1 -chardet==5.0.0 pysrt==1.1.2 #stevedore==3.5.2 # Do not upgrade. Version newer than that have issues with importlib on Python 3.7