# -*- coding: utf-8 -*-
"""
This module contains the individual fixes that the main fix_text function
can perform.
"""

from __future__ import unicode_literals
import re
import sys
import codecs
import warnings
from ftfy.chardata import (possible_encoding, CHARMAP_ENCODINGS,
                           CONTROL_CHARS, LIGATURES, WIDTH_MAP,
                           PARTIAL_UTF8_PUNCT_RE, ALTERED_UTF8_RE,
                           LOSSY_UTF8_RE, SINGLE_QUOTE_RE, DOUBLE_QUOTE_RE)
from ftfy.badness import text_cost
from ftfy.compatibility import unichr
from html5lib.constants import entities


BYTES_ERROR_TEXT = """Hey wait, this isn't Unicode.

ftfy is designed to fix problems that were introduced by handling Unicode
incorrectly. It might be able to fix the bytes you just handed it, but the
fact that you just gave a pile of bytes to a function that fixes text means
that your code is *also* handling Unicode incorrectly.

ftfy takes Unicode text as input. You should take these bytes and decode
them from the encoding you think they are in. If you're not sure what encoding
they're in:

- First, try to find out. 'utf-8' is a good assumption.
- If the encoding is simply unknowable, try running your bytes through
  ftfy.guess_bytes. As the name implies, this may not always be accurate.

If you're confused by this, please read the Python Unicode HOWTO:

    http://docs.python.org/%d/howto/unicode.html
""" % sys.version_info[0]


def fix_encoding(text):
    r"""
    Fix text with incorrectly-decoded garbage ("mojibake") whenever possible.

    This function looks for the evidence of mojibake, formulates a plan to fix
    it, and applies the plan. It determines whether it should replace nonsense
    sequences of single-byte characters that were really meant to be UTF-8
    characters, and if so, turns them into the correctly-encoded Unicode
    character that they were meant to represent.

    The input to the function must be Unicode. If you don't have Unicode text,
    you're not using the right tool to solve your problem.

    `fix_encoding` decodes text that looks like it was decoded incorrectly. It
    leaves alone text that doesn't.

    >>> print(fix_encoding('Ãºnico'))
    único

    >>> print(fix_encoding('This text is fine already :þ'))
    This text is fine already :þ

    Because these characters often come from Microsoft products, we allow
    for the possibility that we get not just Unicode characters 128-255, but
    also Windows's conflicting idea of what characters 128-160 are.

    >>> print(fix_encoding('This â€” should be an em dash'))
    This — should be an em dash

    We might have to deal with both Windows characters and raw control
    characters at the same time, especially when dealing with characters like
    0x81 that have no mapping in Windows. This is a string that Python's
    standard `.encode` and `.decode` methods cannot correct.

    >>> print(fix_encoding('This text is sad .â\x81”.'))
    This text is sad .⁔.

    However, it has safeguards against fixing sequences of letters and
    punctuation that can occur in valid text. In the following example,
    the last three characters are not replaced with a Korean character,
    even though they could be.

    >>> print(fix_encoding('not such a fan of Charlotte Brontë…”'))
    not such a fan of Charlotte Brontë…”

    This function can now recover some complex manglings of text, such as when
    UTF-8 mojibake has been normalized in a way that replaces U+A0 with a
    space:

    >>> print(fix_encoding('The more you know ðŸŒ '))
    The more you know 🌠

    Cases of genuine ambiguity can sometimes be addressed by finding other
    characters that are not double-encoded, and expecting the encoding to
    be consistent:

    >>> print(fix_encoding('AHÅ™, the new sofa from IKEAÂ®'))
    AHÅ™, the new sofa from IKEA®

    Finally, we handle the case where the text is in a single-byte encoding
    that was intended as Windows-1252 all along but read as Latin-1:

    >>> print(fix_encoding('This text was never UTF-8 at all\x85'))
    This text was never UTF-8 at all…

    The best version of the text is found using
    :func:`ftfy.badness.text_cost`.
    """
    text, _ = fix_encoding_and_explain(text)
    return text


def fix_text_encoding(text):
    """
    A deprecated name for :func:`ftfy.fixes.fix_encoding`.
    """
    warnings.warn('fix_text_encoding is now known as fix_encoding',
                  DeprecationWarning)
    return fix_encoding(text)


# When we support discovering mojibake in more encodings, we run the risk
# of more false positives. We can mitigate false positives by assigning an
# additional cost to using encodings that are rarer than Windows-1252, so
# that these encodings will only be used if they fix multiple problems.
ENCODING_COSTS = {
    'macroman': 2,
    'iso-8859-2': 2,
    'sloppy-windows-1250': 2,
    'sloppy-windows-1251': 3,
    'cp437': 3,
}


def fix_encoding_and_explain(text):
    """
    Re-decodes text that has been decoded incorrectly, and also returns a
    "plan" indicating all the steps required to fix it.

    The resulting plan could be used with :func:`ftfy.fixes.apply_plan`
    to fix additional strings that are broken in the same way.
    """
    best_version = text
    best_cost = text_cost(text)
    best_plan = []
    plan_so_far = []
    while True:
        prevtext = text
        text, plan = fix_one_step_and_explain(text)
        plan_so_far.extend(plan)
        cost = text_cost(text)
        for _, _, step_cost in plan_so_far:
            cost += step_cost

        if cost < best_cost:
            best_cost = cost
            best_version = text
            best_plan = list(plan_so_far)
        if text == prevtext:
            return best_version, best_plan
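# Illustrative sketch (added for explanation; not part of the upstream module):
# the returned plan can be replayed with apply_plan() on other strings that
# were mangled the same way. The mojibake strings here are made-up examples.
def _example_reuse_plan():
    fixed, plan = fix_encoding_and_explain('rÃ©sumÃ©')
    # `plan` is a list of (operation, encoding, cost) steps, likely something
    # like [('encode', 'latin-1', 0), ('decode', 'utf-8', 0)] for this input.
    also_fixed = apply_plan('clichÃ©s', plan)   # should give 'clichés'
    return fixed, also_fixed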


def fix_one_step_and_explain(text):
    """
    Performs a single step of re-decoding text that's been decoded incorrectly.

    Returns the decoded text, plus a "plan" for how to reproduce what it did.
    """
    if isinstance(text, bytes):
        raise UnicodeError(BYTES_ERROR_TEXT)
    if len(text) == 0:
        return text, []

    # The first plan is to return ASCII text unchanged.
    if possible_encoding(text, 'ascii'):
        return text, []

    # As we go through the next step, remember the possible encodings
    # that we encounter but don't successfully fix yet. We may need them
    # later.
    possible_1byte_encodings = []

    # Suppose the text was supposed to be UTF-8, but it was decoded using
    # a single-byte encoding instead. When these cases can be fixed, they
    # are usually the correct thing to do, so try them next.
    for encoding in CHARMAP_ENCODINGS:
        if possible_encoding(text, encoding):
            encoded_bytes = text.encode(encoding)
            encode_step = ('encode', encoding, ENCODING_COSTS.get(encoding, 0))
            transcode_steps = []

            # Now, find out if it's UTF-8 (or close enough). Otherwise,
            # remember the encoding for later.
            try:
                decoding = 'utf-8'
                # Check encoded_bytes for sequences that would be UTF-8,
                # except they have b' ' where b'\xa0' would belong.
                if ALTERED_UTF8_RE.search(encoded_bytes):
                    encoded_bytes = restore_byte_a0(encoded_bytes)
                    cost = encoded_bytes.count(b'\xa0') * 2
                    transcode_steps.append(('transcode', 'restore_byte_a0', cost))

                # Check for the byte 0x1a, which indicates where one of our
                # 'sloppy' codecs found a replacement character.
                if encoding.startswith('sloppy') and b'\x1a' in encoded_bytes:
                    encoded_bytes = replace_lossy_sequences(encoded_bytes)
                    transcode_steps.append(('transcode', 'replace_lossy_sequences', 0))

                if b'\xed' in encoded_bytes or b'\xc0' in encoded_bytes:
                    decoding = 'utf-8-variants'

                decode_step = ('decode', decoding, 0)
                steps = [encode_step] + transcode_steps + [decode_step]
                fixed = encoded_bytes.decode(decoding)
                return fixed, steps

            except UnicodeDecodeError:
                possible_1byte_encodings.append(encoding)

    # Look for a-hat-euro sequences that remain, and fix them in isolation.
    if PARTIAL_UTF8_PUNCT_RE.search(text):
        steps = [('transcode', 'fix_partial_utf8_punct_in_1252', 1)]
        fixed = fix_partial_utf8_punct_in_1252(text)
        return fixed, steps

    # The next most likely case is that this is Latin-1 that was intended to
    # be read as Windows-1252, because those two encodings in particular are
    # easily confused.
    if 'latin-1' in possible_1byte_encodings:
        if 'windows-1252' in possible_1byte_encodings:
            # This text is in the intersection of Latin-1 and
            # Windows-1252, so it's probably legit.
            return text, []
        else:
            # Otherwise, it means we have characters that are in Latin-1 but
            # not in Windows-1252. Those are C1 control characters. Nobody
            # wants those. Assume they were meant to be Windows-1252. Don't
            # use the sloppy codec, because bad Windows-1252 characters are
            # a bad sign.
            encoded = text.encode('latin-1')
            try:
                fixed = encoded.decode('windows-1252')
                steps = []
                if fixed != text:
                    steps = [('encode', 'latin-1', 0),
                             ('decode', 'windows-1252', 1)]
                return fixed, steps
            except UnicodeDecodeError:
                # This text contained characters that don't even make sense
                # if you assume they were supposed to be Windows-1252. In
                # that case, let's not assume anything.
                pass

    # The cases that remain are mixups between two different single-byte
    # encodings, and not the common case of Latin-1 vs. Windows-1252.
    #
    # These cases may be unsolvable without adding false positives, though
    # I have vague ideas about how to optionally address them in the future.

    # Return the text unchanged; the plan is empty.
    return text, []
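# Illustrative sketch (added for explanation; not part of the upstream module):
# a single step either re-decodes the whole string once, or returns it
# unchanged with an empty plan.
def _example_one_step():
    fixed, steps = fix_one_step_and_explain('cafÃ©')
    # likely result: ('café', [('encode', 'latin-1', 0), ('decode', 'utf-8', 0)])
    clean, no_steps = fix_one_step_and_explain('already fine')
    # ASCII text is returned as-is: ('already fine', [])
    return fixed, steps, clean, no_steps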


def apply_plan(text, plan):
    """
    Apply a plan for fixing the encoding of text.

    The plan is a list of tuples of the form (operation, encoding, cost):

    - `operation` is 'encode' if it turns a string into bytes, 'decode' if it
      turns bytes into a string, and 'transcode' if it keeps the type the same.
    - `encoding` is the name of the encoding to use, such as 'utf-8' or
      'latin-1', or the function name in the case of 'transcode'.
    - The `cost` does not affect how the plan itself works. It's used by other
      users of plans, namely `fix_encoding_and_explain`, which has to decide
      *which* plan to use.
    """
    obj = text
    for operation, encoding, _ in plan:
        if operation == 'encode':
            obj = obj.encode(encoding)
        elif operation == 'decode':
            obj = obj.decode(encoding)
        elif operation == 'transcode':
            if encoding in TRANSCODERS:
                obj = TRANSCODERS[encoding](obj)
            else:
                raise ValueError("Unknown transcode operation: %s" % encoding)
        else:
            raise ValueError("Unknown plan step: %s" % operation)

    return obj
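# Illustrative sketch (added for explanation; not part of the upstream module):
# a hand-written plan applied to one piece of classic UTF-8-as-Latin-1 mojibake.
def _example_apply_plan():
    plan = [('encode', 'latin-1', 0), ('decode', 'utf-8', 0)]
    return apply_plan('cafÃ©', plan)   # gives 'café'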


HTML_ENTITY_RE = re.compile(r"&#?\w{0,8};")


def unescape_html(text):
    """
    Decode all three types of HTML entities/character references.

    Code by Fredrik Lundh of effbot.org. Rob Speer made a slight change
    to it for efficiency: it won't match entities longer than 8 characters,
    because there are no valid entities like that.

    >>> print(unescape_html('&lt;tag&gt;'))
    <tag>
    """
    def fixup(match):
        """
        Replace one matched HTML entity with the character it represents,
        if possible.
        """
        text = match.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3] == "&#x":
                    codept = int(text[3:-1], 16)
                else:
                    codept = int(text[2:-1])
                if 0x80 <= codept < 0xa0:
                    # Decode this range of characters as Windows-1252, as Web
                    # browsers do in practice.
                    return unichr(codept).encode('latin-1').decode('sloppy-windows-1252')
                else:
                    return unichr(codept)
            except ValueError:
                pass
        else:
            # named entity
            try:
                text = entities[text[1:]]
            except KeyError:
                pass
        return text  # leave as is
    return HTML_ENTITY_RE.sub(fixup, text)
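# Illustrative sketch (added for explanation; not part of the upstream module):
# numeric references in the range 0x80-0x9F are read as Windows-1252, as the
# code above does, so &#133; becomes an ellipsis rather than a C1 control code.
def _example_unescape_html():
    return unescape_html('&#133; &#x2014; &amp;')   # should give '… — &'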


ANSI_RE = re.compile('\033\\[((?:\\d|;)*)([a-zA-Z])')

def remove_terminal_escapes(text):
    r"""
    Strip out "ANSI" terminal escape sequences, such as those that produce
    colored text on Unix.

    >>> print(remove_terminal_escapes(
    ...     "\033[36;44mI'm blue, da ba dee da ba doo...\033[0m"
    ... ))
    I'm blue, da ba dee da ba doo...
    """
    return ANSI_RE.sub('', text)


def uncurl_quotes(text):
    r"""
    Replace curly quotation marks with straight equivalents.

    >>> print(uncurl_quotes('\u201chere\u2019s a test\u201d'))
    "here's a test"
    """
    return SINGLE_QUOTE_RE.sub("'", DOUBLE_QUOTE_RE.sub('"', text))


def fix_latin_ligatures(text):
    """
    Replace single-character ligatures of Latin letters, such as 'ﬁ', with the
    characters that they contain, as in 'fi'. Latin ligatures are usually not
    intended in text strings (though they're lovely in *rendered* text). If
    you have such a ligature in your string, it is probably a result of a
    copy-and-paste glitch.

    We leave ligatures in other scripts alone to be safe. They may be intended,
    and removing them may lose information. If you want to take apart nearly
    all ligatures, use NFKC normalization.

    >>> print(fix_latin_ligatures("ﬂuﬃest"))
    fluffiest
    """
    return text.translate(LIGATURES)


def fix_character_width(text):
    """
    The ASCII characters, katakana, and Hangul characters have alternate
    "halfwidth" or "fullwidth" forms that help text line up in a grid.

    If you don't need these width properties, you probably want to replace
    these characters with their standard form, which is what this function
    does.

    Note that this replaces the ideographic space, U+3000, with the ASCII
    space, U+20.

    >>> print(fix_character_width("ＬＯＵＤ　ＮＯＩＳＥＳ"))
    LOUD NOISES
    >>> print(fix_character_width("Uﾀｰﾝ"))   # this means "U-turn"
    Uターン
    """
    return text.translate(WIDTH_MAP)


def fix_line_breaks(text):
    r"""
    Convert all line breaks to Unix style.

    This will convert the following sequences into the standard \\n
    line break:

    - CRLF (\\r\\n), used on Windows and in some communication
      protocols
    - CR (\\r), once used on Mac OS Classic, and now kept alive
      by misguided software such as Microsoft Office for Mac
    - LINE SEPARATOR (\\u2028) and PARAGRAPH SEPARATOR (\\u2029),
      defined by Unicode and used to sow confusion and discord
    - NEXT LINE (\\x85), a C1 control character that is certainly
      not what you meant

    The NEXT LINE character is a bit of an odd case, because it
    usually won't show up if `fix_encoding` is also being run.
    \\x85 is very common mojibake for \\u2026, HORIZONTAL ELLIPSIS.

    >>> print(fix_line_breaks(
    ...     "This string is made of two things:\u2029"
    ...     "1. Unicode\u2028"
    ...     "2. Spite"
    ... ))
    This string is made of two things:
    1. Unicode
    2. Spite

    For further testing and examples, let's define a function to make sure
    we can see the control characters in their escaped form:

    >>> def eprint(text):
    ...     print(text.encode('unicode-escape').decode('ascii'))

    >>> eprint(fix_line_breaks("Content-type: text/plain\r\n\r\nHi."))
    Content-type: text/plain\n\nHi.

    >>> eprint(fix_line_breaks("This is how Microsoft \r trolls Mac users"))
    This is how Microsoft \n trolls Mac users

    >>> eprint(fix_line_breaks("What is this \x85 I don't even"))
    What is this \n I don't even
    """
    return text.replace('\r\n', '\n').replace('\r', '\n')\
               .replace('\u2028', '\n').replace('\u2029', '\n')\
               .replace('\u0085', '\n')


SURROGATE_RE = re.compile('[\ud800-\udfff]')
SURROGATE_PAIR_RE = re.compile('[\ud800-\udbff][\udc00-\udfff]')


def convert_surrogate_pair(match):
    """
    Convert a surrogate pair to the single codepoint it represents.

    This implements the formula described at:
    http://en.wikipedia.org/wiki/Universal_Character_Set_characters#Surrogates
    """
    pair = match.group(0)
    codept = 0x10000 + (ord(pair[0]) - 0xd800) * 0x400 + (ord(pair[1]) - 0xdc00)
    return unichr(codept)
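# Worked example (added for clarity; not part of the upstream module): for the
# surrogate pair U+D83D U+DCA9, the formula above gives
#     0x10000 + (0xd83d - 0xd800) * 0x400 + (0xdca9 - 0xdc00)
#     = 0x10000 + 0x3d * 0x400 + 0xa9
#     = 0x1f4a9  (PILE OF POO),
# which matches the doctest in fix_surrogates() below.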


def fix_surrogates(text):
    """
    Replace 16-bit surrogate codepoints with the characters they represent
    (when properly paired), or with \ufffd otherwise.

    >>> high_surrogate = unichr(0xd83d)
    >>> low_surrogate = unichr(0xdca9)
    >>> print(fix_surrogates(high_surrogate + low_surrogate))
    💩
    >>> print(fix_surrogates(low_surrogate + high_surrogate))
    ��

    The above doctest had to be very carefully written, because even putting
    the Unicode escapes of the surrogates in the docstring was causing
    various tools to fail, which I think just goes to show why this fixer is
    necessary.
    """
    if SURROGATE_RE.search(text):
        text = SURROGATE_PAIR_RE.sub(convert_surrogate_pair, text)
        text = SURROGATE_RE.sub('\ufffd', text)
    return text


def remove_control_chars(text):
    """
    Remove various control characters that you probably didn't intend to be in
    your text. Many of these characters appear in the table of "Characters not
    suitable for use with markup" at
    http://www.unicode.org/reports/tr20/tr20-9.html.

    This includes:

    - ASCII control characters, except for the important whitespace characters
      (U+00 to U+08, U+0B, U+0E to U+1F, U+7F)
    - Deprecated Arabic control characters (U+206A to U+206F)
    - Interlinear annotation characters (U+FFF9 to U+FFFB)
    - The Object Replacement Character (U+FFFC)
    - The byte order mark (U+FEFF)
    - Musical notation control characters (U+1D173 to U+1D17A)
    - Tag characters (U+E0000 to U+E007F)

    However, these similar characters are left alone:

    - Control characters that produce whitespace (U+09, U+0A, U+0C, U+0D,
      U+2028, and U+2029)
    - C1 control characters (U+80 to U+9F) -- even though they are basically
      never used intentionally, they are important clues about what mojibake
      has happened
    - Control characters that affect glyph rendering, such as joiners and
      right-to-left marks (U+200C to U+200F, U+202A to U+202E)
    """
    return text.translate(CONTROL_CHARS)
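# Illustrative sketch (added for explanation; not part of the upstream module):
# characters listed above, such as U+0000, U+FEFF and U+FFFC, are dropped,
# while ordinary whitespace is kept.
def _example_remove_control_chars():
    return remove_control_chars('cleaned\x00\ufeff\ufffc text')   # 'cleaned text'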


def remove_bom(text):
    r"""
    Remove a byte-order mark that was accidentally decoded as if it were part
    of the text.

    >>> print(remove_bom("\ufeffWhere do you want to go today?"))
    Where do you want to go today?
    """
    return text.lstrip(unichr(0xfeff))


# Define a regex to match valid escape sequences in Python string literals.
ESCAPE_SEQUENCE_RE = re.compile(r'''
    ( \\U........       # 8-digit hex escapes
    | \\u....           # 4-digit hex escapes
    | \\x..             # 2-digit hex escapes
    | \\[0-7]{1,3}      # Octal escapes
    | \\N\{[^}]+\}      # Unicode characters by name
    | \\[\\'"abfnrtv]   # Single-character escapes
    )''', re.UNICODE | re.VERBOSE)


def decode_escapes(text):
    r"""
    Decode backslashed escape sequences, including \\x, \\u, and \\U character
    references, even in the presence of other Unicode.

    This is what Python's "string-escape" and "unicode-escape" codecs were
    meant to do, but in contrast, this actually works. It will decode the
    string exactly the same way that the Python interpreter decodes its string
    literals.

    >>> factoid = '\\u20a1 is the currency symbol for the colón.'
    >>> print(factoid[1:])
    u20a1 is the currency symbol for the colón.
    >>> print(decode_escapes(factoid))
    ₡ is the currency symbol for the colón.

    Even though Python itself can read string literals with a combination of
    escapes and literal Unicode -- you're looking at one right now -- the
    "unicode-escape" codec doesn't work on literal Unicode. (See
    http://stackoverflow.com/a/24519338/773754 for more details.)

    Instead, this function searches for just the parts of a string that
    represent escape sequences, and decodes them, leaving the rest alone. All
    valid escape sequences are made of ASCII characters, and this allows
    "unicode-escape" to work correctly.

    This fix cannot be automatically applied by the `ftfy.fix_text` function,
    because escaped text is not necessarily a mistake, and there is no way
    to distinguish text that's supposed to be escaped from text that isn't.
    """
    def decode_match(match):
        "Given a regex match, decode the escape sequence it contains."
        return codecs.decode(match.group(0), 'unicode-escape')

    return ESCAPE_SEQUENCE_RE.sub(decode_match, text)


def restore_byte_a0(byts):
    """
    Some mojibake has been additionally altered by a process that said "hmm,
    byte A0, that's basically a space!" and replaced it with an ASCII space.
    When the A0 is part of a sequence that we intend to decode as UTF-8,
    changing byte A0 to 20 would make it fail to decode.

    This process finds sequences that would convincingly decode as UTF-8 if
    byte 20 were changed to A0, and puts back the A0. For the purpose of
    deciding whether this is a good idea, this step gets a cost of twice
    the number of bytes that are changed.

    This is used as a step within `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).replace(b'\x20', b'\xa0')

    return ALTERED_UTF8_RE.sub(replacement, byts)
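# Illustrative sketch (added for explanation; not part of the upstream module):
# the UTF-8 encoding of '🌠' ends in byte A0, so after that byte has been
# turned into a space the sequence no longer decodes. Putting the A0 back makes
# it valid UTF-8 again, assuming ALTERED_UTF8_RE recognizes this sequence.
def _example_restore_byte_a0():
    # expected: b'The more you know \xf0\x9f\x8c\xa0', which decodes to
    # 'The more you know 🌠'
    return restore_byte_a0(b'The more you know \xf0\x9f\x8c ')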


def replace_lossy_sequences(byts):
    """
    This function identifies sequences where information has been lost in
    a "sloppy" codec, indicated by byte 1A, and if they would otherwise look
    like a UTF-8 sequence, it replaces them with the UTF-8 sequence for U+FFFD.

    A further explanation:

    ftfy can now fix text in a few cases that it would previously fix
    incompletely, because of the fact that it can't successfully apply the fix
    to the entire string. A very common case of this is when characters have
    been erroneously decoded as windows-1252, but instead of the "sloppy"
    windows-1252 that passes through unassigned bytes, the unassigned bytes get
    turned into U+FFFD (�), so we can't tell what they were.

    This most commonly happens with curly quotation marks that appear
    ``â€œ like this â€�``.

    We can do better by building on ftfy's "sloppy codecs" to let them handle
    less-sloppy but more-lossy text. When they encounter the character ``�``,
    instead of refusing to encode it, they encode it as byte 1A -- an
    ASCII control code called SUBSTITUTE that once was meant for about the same
    purpose. We can then apply a fixer that looks for UTF-8 sequences where
    some continuation bytes have been replaced by byte 1A, and decode the whole
    sequence as �; if that doesn't work, it'll just turn the byte back into �
    itself.

    As a result, the above text ``â€œ like this â€�`` will decode as
    ``“ like this �``.

    If U+1A was actually in the original string, then the sloppy codecs will
    not be used, and this function will not be run, so your weird control
    character will be left alone but wacky fixes like this won't be possible.

    This is used as a step within `fix_encoding`.
    """
    return LOSSY_UTF8_RE.sub('\ufffd'.encode('utf-8'), byts)
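# Illustrative sketch (added for explanation; not part of the upstream module):
# a bytes-level view of the fix described above. b'\xe2\x80\x1a' looks like a
# UTF-8 sequence whose last continuation byte was replaced by byte 1A, so it
# should become the UTF-8 encoding of U+FFFD, assuming LOSSY_UTF8_RE matches
# this particular pattern.
def _example_replace_lossy_sequences():
    return replace_lossy_sequences(b'like this \xe2\x80\x1a')   # b'like this \xef\xbf\xbd'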


def fix_partial_utf8_punct_in_1252(text):
    """
    Fix particular characters that seem to be found in the wild encoded in
    UTF-8 and decoded in Latin-1 or Windows-1252, even when this fix can't be
    consistently applied.

    For this function, we assume the text has been decoded in Windows-1252.
    If it was decoded in Latin-1, we'll call this right after it goes through
    the Latin-1-to-Windows-1252 fixer.

    This is used as a step within `fix_encoding`.
    """
    def replacement(match):
        "The function to apply when this regex matches."
        return match.group(0).encode('sloppy-windows-1252').decode('utf-8')
    return PARTIAL_UTF8_PUNCT_RE.sub(replacement, text)
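# Illustrative sketch (added for explanation; not part of the upstream module):
# the "a-hat-euro" pattern for common punctuation is fixed in isolation,
# assuming PARTIAL_UTF8_PUNCT_RE covers the em-dash sequence used here.
def _example_fix_partial_utf8_punct():
    # 'â€”' is the Windows-1252 reading of the UTF-8 bytes for an em dash,
    # so this should give 'dashes — everywhere'.
    return fix_partial_utf8_punct_in_1252('dashes â€” everywhere')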


TRANSCODERS = {
    'restore_byte_a0': restore_byte_a0,
    'replace_lossy_sequences': replace_lossy_sequences,
    'fix_partial_utf8_punct_in_1252': fix_partial_utf8_punct_in_1252
}