bazarr/libs/ftfy/chardata.py
2018-10-31 17:08:29 +01:00

214 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
This gives other modules access to the gritty details about characters and the
encodings that use them.
"""
import re
import zlib
import unicodedata
import itertools
from pkg_resources import resource_string
from ftfy.compatibility import unichr
# These are the encodings we will try to fix in ftfy, in the
# order that they should be tried.
CHARMAP_ENCODINGS = [
u'latin-1',
u'sloppy-windows-1252',
u'sloppy-windows-1250',
u'iso-8859-2',
u'sloppy-windows-1251',
u'macroman',
u'cp437',
]
def _build_regexes():
"""
ENCODING_REGEXES contain reasonably fast ways to detect if we
could represent a given string in a given encoding. The simplest one is
the u'ascii' detector, which of course just determines if all characters
are between U+0000 and U+007F.
"""
# Define a regex that matches ASCII text.
encoding_regexes = {u'ascii': re.compile('^[\x00-\x7f]*$')}
for encoding in CHARMAP_ENCODINGS:
# Make a sequence of characters that bytes \x80 to \xFF decode to
# in each encoding, as well as byte \x1A, which is used to represent
# the replacement character <20> in the sloppy-* encodings.
latin1table = u''.join(unichr(i) for i in range(128, 256)) + '\x1a'
charlist = latin1table.encode(u'latin-1').decode(encoding)
# The rest of the ASCII bytes -- bytes \x00 to \x19 and \x1B
# to \x7F -- will decode as those ASCII characters in any encoding we
# support, so we can just include them as ranges. This also lets us
# not worry about escaping regex special characters, because all of
# them are in the \x1B to \x7F range.
regex = u'^[\x00-\x19\x1b-\x7f{0}]*$'.format(charlist)
encoding_regexes[encoding] = re.compile(regex)
return encoding_regexes
ENCODING_REGEXES = _build_regexes()
def _build_utf8_punct_regex():
"""
Recognize UTF-8 mojibake that's so blatant that we can fix it even when the
rest of the string doesn't decode as UTF-8 -- namely, UTF-8 sequences for
the u'General Punctuation' characters U+2000 to U+2040, re-encoded in
Windows-1252.
These are recognizable by the distinctiveu'â€u' ('\xe2\x80') sequence they
all begin with when decoded as Windows-1252.
"""
# We're making a regex that has all the literal bytes from 0x80 to 0xbf in
# a range. "Couldn't this have just said [\x80-\xbf]?", you might ask.
# However, when we decode the regex as Windows-1252, the resulting
# characters won't even be remotely contiguous.
#
# Unrelatedly, the expression that generates these bytes will be so much
# prettier when we deprecate Python 2.
continuation_char_list = ''.join(
unichr(i) for i in range(0x80, 0xc0)
).encode(u'latin-1')
obvious_utf8 = (u'â€['
+ continuation_char_list.decode(u'sloppy-windows-1252')
+ u']')
return re.compile(obvious_utf8)
PARTIAL_UTF8_PUNCT_RE = _build_utf8_punct_regex()
# Recognize UTF-8 sequences that would be valid if it weren't for a b'\xa0'
# that some Windows-1252 program converted to a plain space.
#
# The smaller values are included on a case-by-case basis, because we don't want
# to decode likely input sequences to unlikely characters. These are the ones
# that *do* form likely characters before 0xa0:
#
# 0xc2 -> U+A0 NO-BREAK SPACE
# 0xc3 -> U+E0 LATIN SMALL LETTER A WITH GRAVE
# 0xc5 -> U+160 LATIN CAPITAL LETTER S WITH CARON
# 0xce -> U+3A0 GREEK CAPITAL LETTER PI
# 0xd0 -> U+420 CYRILLIC CAPITAL LETTER ER
#
# These still need to come with a cost, so that they only get converted when
# there's evidence that it fixes other things. Any of these could represent
# characters that legitimately appear surrounded by spaces, particularly U+C5
# (Å), which is a word in multiple languages!
#
# We should consider checking for b'\x85' being converted to ... in the future.
# I've seen it once, but the text still wasn't recoverable.
ALTERED_UTF8_RE = re.compile(b'[\xc2\xc3\xc5\xce\xd0][ ]'
b'|[\xe0-\xef][ ][\x80-\xbf]'
b'|[\xe0-\xef][\x80-\xbf][ ]'
b'|[\xf0-\xf4][ ][\x80-\xbf][\x80-\xbf]'
b'|[\xf0-\xf4][\x80-\xbf][ ][\x80-\xbf]'
b'|[\xf0-\xf4][\x80-\xbf][\x80-\xbf][ ]')
# This expression matches UTF-8 and CESU-8 sequences where some of the
# continuation bytes have been lost. The byte 0x1a (sometimes written as ^Z) is
# used within ftfy to represent a byte that produced the replacement character
# \ufffd. We don't know which byte it was, but we can at least decode the UTF-8
# sequence as \ufffd instead of failing to re-decode it at all.
LOSSY_UTF8_RE = re.compile(
b'[\xc2-\xdf][\x1a]'
b'|\xed[\xa0-\xaf][\x1a]\xed[\xb0-\xbf][\x1a\x80-\xbf]'
b'|\xed[\xa0-\xaf][\x1a\x80-\xbf]\xed[\xb0-\xbf][\x1a]'
b'|[\xe0-\xef][\x1a][\x1a\x80-\xbf]'
b'|[\xe0-\xef][\x1a\x80-\xbf][\x1a]'
b'|[\xf0-\xf4][\x1a][\x1a\x80-\xbf][\x1a\x80-\xbf]'
b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a][\x1a\x80-\xbf]'
b'|[\xf0-\xf4][\x1a\x80-\xbf][\x1a\x80-\xbf][\x1a]'
b'|\x1a'
)
# These regexes match various Unicode variations on single and double quotes.
SINGLE_QUOTE_RE = re.compile(u'[\u2018-\u201b]')
DOUBLE_QUOTE_RE = re.compile(u'[\u201c-\u201f]')
def possible_encoding(text, encoding):
"""
Given text and a single-byte encoding, check whether that text could have
been decoded from that single-byte encoding.
In other words, check whether it can be encoded in that encoding, possibly
sloppily.
"""
return bool(ENCODING_REGEXES[encoding].match(text))
CHAR_CLASS_STRING = zlib.decompress(
resource_string(__name__, 'char_classes.dat')
).decode(u'ascii')
def chars_to_classes(string):
"""
Convert each Unicode character to a letter indicating which of many
classes it's in.
See build_data.py for where this data comes from and what it means.
"""
return string.translate(CHAR_CLASS_STRING)
def _build_control_char_mapping():
"""
Build a translate mapping that strips likely-unintended control characters.
See :func:`ftfy.fixes.remove_control_chars` for a description of these
codepoint ranges and why they should be removed.
"""
control_chars = {}
for i in itertools.chain(
range(0x00, 0x09), [0x0b],
range(0x0e, 0x20), [0x7f],
range(0x206a, 0x2070),
[0xfeff],
range(0xfff9, 0xfffd),
range(0x1d173, 0x1d17b),
range(0xe0000, 0xe0080)
):
control_chars[i] = None
return control_chars
CONTROL_CHARS = _build_control_char_mapping()
# A translate mapping that breaks ligatures made of Latin letters. While
# ligatures may be important to the representation of other languages, in
# Latin letters they tend to represent a copy/paste error.
#
# Ligatures may also be separated by NFKC normalization, but that is sometimes
# more normalization than you want.
LIGATURES = {
ord(u'IJ'): u'IJ',
ord(u'ij'): u'ij',
ord(u''): u'ff',
ord(u''): u'fi',
ord(u''): u'fl',
ord(u''): u'ffi',
ord(u''): u'ffl',
ord(u''): u'ſt',
ord(u''): u'st'
}
def _build_width_map():
"""
Build a translate mapping that replaces halfwidth and fullwidth forms
with their standard-width forms.
"""
# Though it's not listed as a fullwidth character, we'll want to convert
# U+3000 IDEOGRAPHIC SPACE to U+20 SPACE on the same principle, so start
# with that in the dictionary.
width_map = {0x3000: u' '}
for i in range(0xff01, 0xfff0):
char = unichr(i)
alternate = unicodedata.normalize(u'NFKC', char)
if alternate != char:
width_map[i] = alternate
return width_map
WIDTH_MAP = _build_width_map()