364 lines
11 KiB
Python
364 lines
11 KiB
Python
#
|
|
# Cython -- encoding related tools
|
|
#
|
|
|
|
from __future__ import absolute_import
|
|
|
|
import re
|
|
import sys
|
|
|
|
if sys.version_info[0] >= 3:
|
|
_unicode, _str, _bytes, _unichr = str, str, bytes, chr
|
|
IS_PYTHON3 = True
|
|
else:
|
|
_unicode, _str, _bytes, _unichr = unicode, str, str, unichr
|
|
IS_PYTHON3 = False
|
|
|
|
empty_bytes = _bytes()
|
|
empty_unicode = _unicode()
|
|
|
|
join_bytes = empty_bytes.join
|
|
|
|
|
|
class UnicodeLiteralBuilder(object):
|
|
"""Assemble a unicode string.
|
|
"""
|
|
def __init__(self):
|
|
self.chars = []
|
|
|
|
def append(self, characters):
|
|
if isinstance(characters, _bytes):
|
|
# this came from a Py2 string literal in the parser code
|
|
characters = characters.decode("ASCII")
|
|
assert isinstance(characters, _unicode), str(type(characters))
|
|
self.chars.append(characters)
|
|
|
|
if sys.maxunicode == 65535:
|
|
def append_charval(self, char_number):
|
|
if char_number > 65535:
|
|
# wide Unicode character on narrow platform => replace
|
|
# by surrogate pair
|
|
char_number -= 0x10000
|
|
self.chars.append( _unichr((char_number // 1024) + 0xD800) )
|
|
self.chars.append( _unichr((char_number % 1024) + 0xDC00) )
|
|
else:
|
|
self.chars.append( _unichr(char_number) )
|
|
else:
|
|
def append_charval(self, char_number):
|
|
self.chars.append( _unichr(char_number) )
|
|
|
|
def append_uescape(self, char_number, escape_string):
|
|
self.append_charval(char_number)
|
|
|
|
def getstring(self):
|
|
return EncodedString(u''.join(self.chars))
|
|
|
|
def getstrings(self):
|
|
return (None, self.getstring())
|
|
|
|
|
|
class BytesLiteralBuilder(object):
|
|
"""Assemble a byte string or char value.
|
|
"""
|
|
def __init__(self, target_encoding):
|
|
self.chars = []
|
|
self.target_encoding = target_encoding
|
|
|
|
def append(self, characters):
|
|
if isinstance(characters, _unicode):
|
|
characters = characters.encode(self.target_encoding)
|
|
assert isinstance(characters, _bytes), str(type(characters))
|
|
self.chars.append(characters)
|
|
|
|
def append_charval(self, char_number):
|
|
self.chars.append( _unichr(char_number).encode('ISO-8859-1') )
|
|
|
|
def append_uescape(self, char_number, escape_string):
|
|
self.append(escape_string)
|
|
|
|
def getstring(self):
|
|
# this *must* return a byte string!
|
|
return bytes_literal(join_bytes(self.chars), self.target_encoding)
|
|
|
|
def getchar(self):
|
|
# this *must* return a byte string!
|
|
return self.getstring()
|
|
|
|
def getstrings(self):
|
|
return (self.getstring(), None)
|
|
|
|
|
|
class StrLiteralBuilder(object):
|
|
"""Assemble both a bytes and a unicode representation of a string.
|
|
"""
|
|
def __init__(self, target_encoding):
|
|
self._bytes = BytesLiteralBuilder(target_encoding)
|
|
self._unicode = UnicodeLiteralBuilder()
|
|
|
|
def append(self, characters):
|
|
self._bytes.append(characters)
|
|
self._unicode.append(characters)
|
|
|
|
def append_charval(self, char_number):
|
|
self._bytes.append_charval(char_number)
|
|
self._unicode.append_charval(char_number)
|
|
|
|
def append_uescape(self, char_number, escape_string):
|
|
self._bytes.append(escape_string)
|
|
self._unicode.append_charval(char_number)
|
|
|
|
def getstrings(self):
|
|
return (self._bytes.getstring(), self._unicode.getstring())
|
|
|
|
|
|
class EncodedString(_unicode):
|
|
# unicode string subclass to keep track of the original encoding.
|
|
# 'encoding' is None for unicode strings and the source encoding
|
|
# otherwise
|
|
encoding = None
|
|
|
|
def __deepcopy__(self, memo):
|
|
return self
|
|
|
|
def byteencode(self):
|
|
assert self.encoding is not None
|
|
return self.encode(self.encoding)
|
|
|
|
def utf8encode(self):
|
|
assert self.encoding is None
|
|
return self.encode("UTF-8")
|
|
|
|
@property
|
|
def is_unicode(self):
|
|
return self.encoding is None
|
|
|
|
def contains_surrogates(self):
|
|
return string_contains_surrogates(self)
|
|
|
|
def as_utf8_string(self):
|
|
return bytes_literal(self.utf8encode(), 'utf8')
|
|
|
|
|
|
def string_contains_surrogates(ustring):
|
|
"""
|
|
Check if the unicode string contains surrogate code points
|
|
on a CPython platform with wide (UCS-4) or narrow (UTF-16)
|
|
Unicode, i.e. characters that would be spelled as two
|
|
separate code units on a narrow platform.
|
|
"""
|
|
for c in map(ord, ustring):
|
|
if c > 65535: # can only happen on wide platforms
|
|
return True
|
|
if 0xD800 <= c <= 0xDFFF:
|
|
return True
|
|
return False
|
|
|
|
|
|
def string_contains_lone_surrogates(ustring):
|
|
"""
|
|
Check if the unicode string contains lone surrogate code points
|
|
on a CPython platform with wide (UCS-4) or narrow (UTF-16)
|
|
Unicode, i.e. characters that would be spelled as two
|
|
separate code units on a narrow platform, but that do not form a pair.
|
|
"""
|
|
last_was_start = False
|
|
unicode_uses_surrogate_encoding = sys.maxunicode == 65535
|
|
for c in map(ord, ustring):
|
|
# surrogates tend to be rare
|
|
if c < 0xD800 or c > 0xDFFF:
|
|
if last_was_start:
|
|
return True
|
|
elif not unicode_uses_surrogate_encoding:
|
|
# on 32bit Unicode platforms, there is never a pair
|
|
return True
|
|
elif c <= 0xDBFF:
|
|
if last_was_start:
|
|
return True # lone start
|
|
last_was_start = True
|
|
else:
|
|
if not last_was_start:
|
|
return True # lone end
|
|
last_was_start = False
|
|
return last_was_start
|
|
|
|
|
|
class BytesLiteral(_bytes):
|
|
# bytes subclass that is compatible with EncodedString
|
|
encoding = None
|
|
|
|
def __deepcopy__(self, memo):
|
|
return self
|
|
|
|
def byteencode(self):
|
|
if IS_PYTHON3:
|
|
return _bytes(self)
|
|
else:
|
|
# fake-recode the string to make it a plain bytes object
|
|
return self.decode('ISO-8859-1').encode('ISO-8859-1')
|
|
|
|
def utf8encode(self):
|
|
assert False, "this is not a unicode string: %r" % self
|
|
|
|
def __str__(self):
|
|
"""Fake-decode the byte string to unicode to support %
|
|
formatting of unicode strings.
|
|
"""
|
|
return self.decode('ISO-8859-1')
|
|
|
|
is_unicode = False
|
|
|
|
def as_c_string_literal(self):
|
|
value = split_string_literal(escape_byte_string(self))
|
|
return '"%s"' % value
|
|
|
|
|
|
def bytes_literal(s, encoding):
|
|
assert isinstance(s, bytes)
|
|
s = BytesLiteral(s)
|
|
s.encoding = encoding
|
|
return s
|
|
|
|
|
|
def encoded_string(s, encoding):
|
|
assert isinstance(s, (_unicode, bytes))
|
|
s = EncodedString(s)
|
|
if encoding is not None:
|
|
s.encoding = encoding
|
|
return s
|
|
|
|
|
|
char_from_escape_sequence = {
|
|
r'\a' : u'\a',
|
|
r'\b' : u'\b',
|
|
r'\f' : u'\f',
|
|
r'\n' : u'\n',
|
|
r'\r' : u'\r',
|
|
r'\t' : u'\t',
|
|
r'\v' : u'\v',
|
|
}.get
|
|
|
|
_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
|
|
|
|
|
|
def _to_escape_sequence(s):
|
|
if s in '\n\r\t':
|
|
return repr(s)[1:-1]
|
|
elif s == '"':
|
|
return r'\"'
|
|
elif s == '\\':
|
|
return r'\\'
|
|
else:
|
|
# within a character sequence, oct passes much better than hex
|
|
return ''.join(['\\%03o' % ord(c) for c in s])
|
|
|
|
|
|
def _build_specials_replacer():
|
|
subexps = []
|
|
replacements = {}
|
|
for special in _c_special:
|
|
regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
|
|
subexps.append(regexp)
|
|
replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
|
|
sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
|
|
def replace_specials(m):
|
|
return replacements[m.group(1)]
|
|
def replace(s):
|
|
return sub(replace_specials, s)
|
|
return replace
|
|
|
|
_replace_specials = _build_specials_replacer()
|
|
|
|
|
|
def escape_char(c):
|
|
if IS_PYTHON3:
|
|
c = c.decode('ISO-8859-1')
|
|
if c in '\n\r\t\\':
|
|
return repr(c)[1:-1]
|
|
elif c == "'":
|
|
return "\\'"
|
|
n = ord(c)
|
|
if n < 32 or n > 127:
|
|
# hex works well for characters
|
|
return "\\x%02X" % n
|
|
else:
|
|
return c
|
|
|
|
def escape_byte_string(s):
|
|
"""Escape a byte string so that it can be written into C code.
|
|
Note that this returns a Unicode string instead which, when
|
|
encoded as ISO-8859-1, will result in the correct byte sequence
|
|
being written.
|
|
"""
|
|
s = _replace_specials(s)
|
|
try:
|
|
return s.decode("ASCII") # trial decoding: plain ASCII => done
|
|
except UnicodeDecodeError:
|
|
pass
|
|
if IS_PYTHON3:
|
|
s_new = bytearray()
|
|
append, extend = s_new.append, s_new.extend
|
|
for b in s:
|
|
if b >= 128:
|
|
extend(('\\%3o' % b).encode('ASCII'))
|
|
else:
|
|
append(b)
|
|
return s_new.decode('ISO-8859-1')
|
|
else:
|
|
l = []
|
|
append = l.append
|
|
for c in s:
|
|
o = ord(c)
|
|
if o >= 128:
|
|
append('\\%3o' % o)
|
|
else:
|
|
append(c)
|
|
return join_bytes(l).decode('ISO-8859-1')
|
|
|
|
def split_string_literal(s, limit=2000):
|
|
# MSVC can't handle long string literals.
|
|
if len(s) < limit:
|
|
return s
|
|
else:
|
|
start = 0
|
|
chunks = []
|
|
while start < len(s):
|
|
end = start + limit
|
|
if len(s) > end-4 and '\\' in s[end-4:end]:
|
|
end -= 4 - s[end-4:end].find('\\') # just before the backslash
|
|
while s[end-1] == '\\':
|
|
end -= 1
|
|
if end == start:
|
|
# must have been a long line of backslashes
|
|
end = start + limit - (limit % 2) - 4
|
|
break
|
|
chunks.append(s[start:end])
|
|
start = end
|
|
return '""'.join(chunks)
|
|
|
|
def encode_pyunicode_string(s):
|
|
"""Create Py_UNICODE[] representation of a given unicode string.
|
|
"""
|
|
s = list(map(ord, s)) + [0]
|
|
|
|
if sys.maxunicode >= 0x10000: # Wide build or Py3.3
|
|
utf16, utf32 = [], s
|
|
for code_point in s:
|
|
if code_point >= 0x10000: # outside of BMP
|
|
high, low = divmod(code_point - 0x10000, 1024)
|
|
utf16.append(high + 0xD800)
|
|
utf16.append(low + 0xDC00)
|
|
else:
|
|
utf16.append(code_point)
|
|
else:
|
|
utf16, utf32 = s, []
|
|
for code_unit in s:
|
|
if 0xDC00 <= code_unit <= 0xDFFF and utf32 and 0xD800 <= utf32[-1] <= 0xDBFF:
|
|
high, low = utf32[-1], code_unit
|
|
utf32[-1] = ((high & 0x3FF) << 10) + (low & 0x3FF) + 0x10000
|
|
else:
|
|
utf32.append(code_unit)
|
|
|
|
if utf16 == utf32:
|
|
utf16 = []
|
|
return ",".join(map(_unicode, utf16)), ",".join(map(_unicode, utf32))
|