mirror of
https://github.com/morpheus65535/bazarr.git
synced 2025-01-11 01:07:36 +08:00
improve character encoding detection
This commit is contained in:
parent
92497a8822
commit
be0411d50a
1 changed file with 5 additions and 3 deletions
|
@@ -379,10 +379,12 @@ def guess_external_subtitles(dest_folder, subtitles):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# to improve performance, use only the first 32K to detect encoding
|
# to improve performance, use only the first 32K to detect encoding
|
||||||
if len(text) > 32768: guess = chardet.detect(text[:32768])
|
guess = chardet.detect(text[:32768])
|
||||||
else: guess = chardet.detect(text)
|
logging.debug('BAZARR detected encoding %r', guess)
|
||||||
if guess["confidence"] < 0.8:
|
if guess["confidence"] < 0.6:
|
||||||
raise UnicodeError
|
raise UnicodeError
|
||||||
|
if guess["confidence"] < 0.8 or guess["encoding"] == "ascii":
|
||||||
|
guess["encoding"] = "utf-8"
|
||||||
text = text.decode(guess["encoding"])
|
text = text.decode(guess["encoding"])
|
||||||
detected_language = guess_language(text)
|
detected_language = guess_language(text)
|
||||||
except UnicodeError:
|
except UnicodeError:
|
||||||
|
|
Loading…
Reference in a new issue