bazarr/libs/ftfy/__init__.py
2018-10-31 17:08:29 +01:00

411 lines
16 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding: utf-8 -*-
"""
ftfy: fixes text for you
This is a module for making text less broken. See the `fix_text` function
for more information.
"""
from __future__ import unicode_literals
import unicodedata
import ftfy.bad_codecs
from ftfy import fixes
from ftfy.formatting import display_ljust
from ftfy.compatibility import is_printable
__version__ = '4.4.3'
# See the docstring for ftfy.bad_codecs to see what we're doing here.
ftfy.bad_codecs.ok()
def fix_text(text,
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
fix_latin_ligatures=True,
fix_character_width=True,
uncurl_quotes=True,
fix_line_breaks=True,
fix_surrogates=True,
remove_control_chars=True,
remove_bom=True,
normalization='NFC',
max_decode_length=10**6):
r"""
Given Unicode text as input, fix inconsistencies and glitches in it,
such as mojibake.
Let's start with some examples:
>>> print(fix_text('ˆnicode'))
ünicode
>>> print(fix_text('Broken text… it’s flubberific!',
... normalization='NFKC'))
Broken text... it's flubberific!
>>> print(fix_text('HTML entities <3'))
HTML entities <3
>>> print(fix_text('<em>HTML entities &lt;3</em>'))
<em>HTML entities &lt;3</em>
>>> print(fix_text("&macr;\\_(ã\x83\x84)_/&macr;"))
¯\_(ツ)_/¯
>>> # This example string starts with a byte-order mark, even if
>>> # you can't see it on the Web.
>>> print(fix_text('\ufeffParty like\nit&rsquo;s 1999!'))
Party like
it's 1999!
>>> print(fix_text(' '))
LOUD NOISES
>>> len(fix_text('' * 100000))
200000
>>> len(fix_text(''))
0
Based on the options you provide, ftfy applies these steps in order:
- If `remove_terminal_escapes` is True, remove sequences of bytes that are
instructions for Unix terminals, such as the codes that make text appear
in different colors.
- If `fix_encoding` is True, look for common mistakes that come from
encoding or decoding Unicode text incorrectly, and fix them if they are
reasonably fixable. See `fixes.fix_encoding` for details.
- If `fix_entities` is True, replace HTML entities with their equivalent
characters. If it's "auto" (the default), then consider replacing HTML
entities, but don't do so in text where you have seen a pair of actual
angle brackets (that's probably actually HTML and you shouldn't mess
with the entities).
- If `uncurl_quotes` is True, replace various curly quotation marks with
plain-ASCII straight quotes.
- If `fix_latin_ligatures` is True, then ligatures made of Latin letters,
such as `fi`, will be separated into individual letters. These ligatures
are usually not meaningful outside of font rendering, and often represent
copy-and-paste errors.
- If `fix_character_width` is True, half-width and full-width characters
will be replaced by their standard-width form.
- If `fix_line_breaks` is true, convert all line breaks to Unix style
(CRLF and CR line breaks become LF line breaks).
- If `fix_surrogates` is true, ensure that there are no UTF-16 surrogates
in the resulting string, by converting them to the correct characters
when they're appropriately paired, or replacing them with \ufffd
otherwise.
- If `remove_control_chars` is true, remove control characters that
are not suitable for use in text. This includes most of the ASCII control
characters, plus some Unicode controls such as the byte order mark
(U+FEFF). Useful control characters, such as Tab, Line Feed, and
bidirectional marks, are left as they are.
- If `remove_bom` is True, remove the Byte-Order Mark at the start of the
string if it exists. (This is largely redundant, because it's a special
case of `remove_control_characters`. This option will become deprecated
in a later version.)
- If `normalization` is not None, apply the specified form of Unicode
normalization, which can be one of 'NFC', 'NFKC', 'NFD', and 'NFKD'.
- The default normalization, NFC, combines characters and diacritics that
are written using separate code points, such as converting "e" plus an
acute accent modifier into "é", or converting "ka" (か) plus a dakuten
into the single character "ga" (が). Unicode can be converted to NFC
form without any change in its meaning.
- If you ask for NFKC normalization, it will apply additional
normalizations that can change the meanings of characters. For example,
ellipsis characters will be replaced with three periods, all ligatures
will be replaced with the individual characters that make them up,
and characters that differ in font style will be converted to the same
character.
- If anything was changed, repeat all the steps, so that the function is
idempotent. "&amp;amp;" will become "&", for example, not "&amp;".
`fix_text` will work one line at a time, with the possibility that some
lines are in different encodings, allowing it to fix text that has been
concatenated together from different sources.
When it encounters lines longer than `max_decode_length` (1 million
codepoints by default), it will not run the `fix_encoding` step, to avoid
unbounded slowdowns.
If you're certain that any decoding errors in the text would have affected
the entire text in the same way, and you don't mind operations that scale
with the length of the text, you can use `fix_text_segment` directly to
fix the whole string in one batch.
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
out = []
pos = 0
while pos < len(text):
textbreak = text.find('\n', pos) + 1
fix_encoding_this_time = fix_encoding
if textbreak == 0:
textbreak = len(text)
if (textbreak - pos) > max_decode_length:
fix_encoding_this_time = False
substring = text[pos:textbreak]
if fix_entities == 'auto' and '<' in substring and '>' in substring:
# we see angle brackets together; this could be HTML
fix_entities = False
out.append(
fix_text_segment(
substring,
fix_entities=fix_entities,
remove_terminal_escapes=remove_terminal_escapes,
fix_encoding=fix_encoding_this_time,
uncurl_quotes=uncurl_quotes,
fix_latin_ligatures=fix_latin_ligatures,
fix_character_width=fix_character_width,
fix_line_breaks=fix_line_breaks,
fix_surrogates=fix_surrogates,
remove_control_chars=remove_control_chars,
remove_bom=remove_bom,
normalization=normalization
)
)
pos = textbreak
return ''.join(out)
# Some alternate names for the main functions
ftfy = fix_text
fix_encoding = fixes.fix_encoding
fix_text_encoding = fixes.fix_text_encoding # deprecated
def fix_file(input_file,
encoding=None,
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
fix_latin_ligatures=True,
fix_character_width=True,
uncurl_quotes=True,
fix_line_breaks=True,
fix_surrogates=True,
remove_control_chars=True,
remove_bom=True,
normalization='NFC'):
"""
Fix text that is found in a file.
If the file is being read as Unicode text, use that. If it's being read as
bytes, then we hope an encoding was supplied. If not, unfortunately, we
have to guess what encoding it is. We'll try a few common encodings, but we
make no promises. See the `guess_bytes` function for how this is done.
The output is a stream of fixed lines of text.
"""
entities = fix_entities
for line in input_file:
if isinstance(line, bytes):
if encoding is None:
line, encoding = guess_bytes(line)
else:
line = line.decode(encoding)
if fix_entities == 'auto' and '<' in line and '>' in line:
entities = False
yield fix_text_segment(
line,
fix_entities=entities,
remove_terminal_escapes=remove_terminal_escapes,
fix_encoding=fix_encoding,
fix_latin_ligatures=fix_latin_ligatures,
fix_character_width=fix_character_width,
uncurl_quotes=uncurl_quotes,
fix_line_breaks=fix_line_breaks,
fix_surrogates=fix_surrogates,
remove_control_chars=remove_control_chars,
remove_bom=remove_bom,
normalization=normalization
)
def fix_text_segment(text,
fix_entities='auto',
remove_terminal_escapes=True,
fix_encoding=True,
fix_latin_ligatures=True,
fix_character_width=True,
uncurl_quotes=True,
fix_line_breaks=True,
fix_surrogates=True,
remove_control_chars=True,
remove_bom=True,
normalization='NFC'):
"""
Apply fixes to text in a single chunk. This could be a line of text
within a larger run of `fix_text`, or it could be a larger amount
of text that you are certain is in a consistent encoding.
See `fix_text` for a description of the parameters.
"""
if isinstance(text, bytes):
raise UnicodeError(fixes.BYTES_ERROR_TEXT)
if fix_entities == 'auto' and '<' in text and '>' in text:
fix_entities = False
while True:
origtext = text
if remove_terminal_escapes:
text = fixes.remove_terminal_escapes(text)
if fix_encoding:
text = fixes.fix_encoding(text)
if fix_entities:
text = fixes.unescape_html(text)
if fix_latin_ligatures:
text = fixes.fix_latin_ligatures(text)
if fix_character_width:
text = fixes.fix_character_width(text)
if uncurl_quotes:
text = fixes.uncurl_quotes(text)
if fix_line_breaks:
text = fixes.fix_line_breaks(text)
if fix_surrogates:
text = fixes.fix_surrogates(text)
if remove_control_chars:
text = fixes.remove_control_chars(text)
if remove_bom and not remove_control_chars:
# Skip this step if we've already done `remove_control_chars`,
# because it would be redundant.
text = fixes.remove_bom(text)
if normalization is not None:
text = unicodedata.normalize(normalization, text)
if text == origtext:
return text
def guess_bytes(bstring):
"""
NOTE: Using `guess_bytes` is not the recommended way of using ftfy. ftfy
is not designed to be an encoding detector.
In the unfortunate situation that you have some bytes in an unknown
encoding, ftfy can guess a reasonable strategy for decoding them, by trying
a few common encodings that can be distinguished from each other.
Unlike the rest of ftfy, this may not be accurate, and it may *create*
Unicode problems instead of solving them!
It doesn't try East Asian encodings at all, and if you have East Asian text
that you don't know how to decode, you are somewhat out of luck. East
Asian encodings require some serious statistics to distinguish from each
other, so we can't support them without decreasing the accuracy of ftfy.
If you don't know which encoding you have at all, I recommend
trying the 'chardet' module, and being appropriately skeptical about its
results.
The encodings we try here are:
- UTF-16 with a byte order mark, because a UTF-16 byte order mark looks
like nothing else
- UTF-8, because it's the global standard, which has been used by a
majority of the Web since 2008
- "utf-8-variants", because it's what people actually implement when they
think they're doing UTF-8
- MacRoman, because Microsoft Office thinks it's still a thing, and it
can be distinguished by its line breaks. (If there are no line breaks in
the string, though, you're out of luck.)
- "sloppy-windows-1252", the Latin-1-like encoding that is the most common
single-byte encoding
"""
if type(bstring) == type(''):
raise UnicodeError(
"This string was already decoded as Unicode. You should pass "
"bytes to guess_bytes, not Unicode."
)
if bstring.startswith(b'\xfe\xff') or bstring.startswith(b'\xff\xfe'):
return bstring.decode('utf-16'), 'utf-16'
byteset = set(bytes(bstring))
byte_ed, byte_c0, byte_CR, byte_LF = b'\xed\xc0\r\n'
try:
if byte_ed in byteset or byte_c0 in byteset:
# Byte 0xed can be used to encode a range of codepoints that
# are UTF-16 surrogates. UTF-8 does not use UTF-16 surrogates,
# so when we see 0xed, it's very likely we're being asked to
# decode CESU-8, the variant that encodes UTF-16 surrogates
# instead of the original characters themselves.
#
# This will occasionally trigger on standard UTF-8, as there
# are some Korean characters that also use byte 0xed, but that's
# not harmful.
#
# Byte 0xc0 is impossible because, numerically, it would only
# encode characters lower than U+0040. Those already have
# single-byte representations, and UTF-8 requires using the
# shortest possible representation. However, Java hides the null
# codepoint, U+0000, in a non-standard longer representation -- it
# encodes it as 0xc0 0x80 instead of 0x00, guaranteeing that 0x00
# will never appear in the encoded bytes.
#
# The 'utf-8-variants' decoder can handle both of these cases, as
# well as standard UTF-8, at the cost of a bit of speed.
return bstring.decode('utf-8-variants'), 'utf-8-variants'
else:
return bstring.decode('utf-8'), 'utf-8'
except UnicodeDecodeError:
pass
if byte_CR in bstring and byte_LF not in bstring:
return bstring.decode('macroman'), 'macroman'
else:
return bstring.decode('sloppy-windows-1252'), 'sloppy-windows-1252'
def explain_unicode(text):
"""
A utility method that's useful for debugging mysterious Unicode.
It breaks down a string, showing you for each codepoint its number in
hexadecimal, its glyph, its category in the Unicode standard, and its name
in the Unicode standard.
>>> explain_unicode('(╯°□°)╯︵ ┻━┻')
U+0028 ( [Ps] LEFT PARENTHESIS
U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
U+00B0 ° [So] DEGREE SIGN
U+25A1 □ [So] WHITE SQUARE
U+00B0 ° [So] DEGREE SIGN
U+0029 ) [Pe] RIGHT PARENTHESIS
U+256F ╯ [So] BOX DRAWINGS LIGHT ARC UP AND LEFT
U+FE35 ︵ [Ps] PRESENTATION FORM FOR VERTICAL LEFT PARENTHESIS
U+0020 [Zs] SPACE
U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
U+2501 ━ [So] BOX DRAWINGS HEAVY HORIZONTAL
U+253B ┻ [So] BOX DRAWINGS HEAVY UP AND HORIZONTAL
"""
for char in text:
if is_printable(char):
display = char
else:
display = char.encode('unicode-escape').decode('ascii')
print('U+{code:04X} {display} [{category}] {name}'.format(
display=display_ljust(display, 7),
code=ord(char),
category=unicodedata.category(char),
name=unicodedata.name(char, '<unknown>')
))