Skip to content

Commit d38aa7f

Browse files
[3.14] Improve tests for the PyUnicodeWriter C API (GH-146157)
Add tests for corner cases: NULL pointers and out-of-range values. (cherry picked from commit ab47892) Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent fa3143a commit d38aa7f

2 files changed

Lines changed: 151 additions & 83 deletions

File tree

Lib/test/test_capi/test_unicode.py

Lines changed: 106 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1751,35 +1751,61 @@ def test_basic(self):
17511751
writer.write_utf8(b'var', -1)
17521752

17531753
# test PyUnicodeWriter_WriteChar()
1754-
writer.write_char('=')
1754+
writer.write_char(ord('='))
17551755

17561756
# test PyUnicodeWriter_WriteSubstring()
17571757
writer.write_substring("[long]", 1, 5)
1758+
# CRASHES writer.write_substring(NULL, 0, 0)
17581759

17591760
# test PyUnicodeWriter_WriteStr()
17601761
writer.write_str(" value ")
1762+
# CRASHES writer.write_str(NULL)
17611763

17621764
# test PyUnicodeWriter_WriteRepr()
17631765
writer.write_repr("repr")
17641766

17651767
self.assertEqual(writer.finish(),
17661768
"var=long value 'repr'")
17671769

1770+
def test_repr_null(self):
1771+
writer = self.create_writer(0)
1772+
writer.write_utf8(b'var=', -1)
1773+
writer.write_repr(NULL)
1774+
self.assertEqual(writer.finish(),
1775+
"var=<NULL>")
1776+
1777+
def test_write_char(self):
1778+
writer = self.create_writer(0)
1779+
writer.write_char(0)
1780+
writer.write_char(ord('$'))
1781+
writer.write_char(0x20ac)
1782+
writer.write_char(0x10_ffff)
1783+
self.assertRaises(ValueError, writer.write_char, 0x11_0000)
1784+
self.assertRaises(ValueError, writer.write_char, 0xFFFF_FFFF)
1785+
self.assertEqual(writer.finish(),
1786+
"\0$\u20AC\U0010FFFF")
1787+
17681788
def test_utf8(self):
17691789
writer = self.create_writer(0)
17701790
writer.write_utf8(b"ascii", -1)
1771-
writer.write_char('-')
1791+
writer.write_char(ord('-'))
17721792
writer.write_utf8(b"latin1=\xC3\xA9", -1)
1773-
writer.write_char('-')
1793+
writer.write_char(ord('-'))
17741794
writer.write_utf8(b"euro=\xE2\x82\xAC", -1)
1775-
writer.write_char('.')
1795+
writer.write_char(ord('.'))
1796+
writer.write_utf8(NULL, 0)
1797+
# CRASHES writer.write_utf8(NULL, 1)
1798+
# CRASHES writer.write_utf8(NULL, -1)
17761799
self.assertEqual(writer.finish(),
17771800
"ascii-latin1=\xE9-euro=\u20AC.")
17781801

17791802
def test_ascii(self):
17801803
writer = self.create_writer(0)
17811804
writer.write_ascii(b"Hello ", -1)
17821805
writer.write_ascii(b"", 0)
1806+
writer.write_ascii(NULL, 0)
1807+
# CRASHES writer.write_ascii(NULL, 1)
1808+
# CRASHES writer.write_ascii(NULL, -1)
17831809
writer.write_ascii(b"Python! <truncated>", 6)
17841810
self.assertEqual(writer.finish(), "Hello Python")
17851811

@@ -1796,6 +1822,9 @@ def test_recover_utf8_error(self):
17961822
# write fails with an invalid string
17971823
with self.assertRaises(UnicodeDecodeError):
17981824
writer.write_utf8(b"invalid\xFF", -1)
1825+
with self.assertRaises(UnicodeDecodeError):
1826+
s = "truncated\u20AC".encode()
1827+
writer.write_utf8(s, len(s) - 1)
17991828

18001829
# retry write with a valid string
18011830
writer.write_utf8(b"valid", -1)
@@ -1807,13 +1836,19 @@ def test_decode_utf8(self):
18071836
# test PyUnicodeWriter_DecodeUTF8Stateful()
18081837
writer = self.create_writer(0)
18091838
writer.decodeutf8stateful(b"ign\xFFore", -1, b"ignore")
1810-
writer.write_char('-')
1839+
writer.write_char(ord('-'))
18111840
writer.decodeutf8stateful(b"replace\xFF", -1, b"replace")
1812-
writer.write_char('-')
1841+
writer.write_char(ord('-'))
18131842

18141843
# incomplete trailing UTF-8 sequence
18151844
writer.decodeutf8stateful(b"incomplete\xC3", -1, b"replace")
18161845

1846+
writer.decodeutf8stateful(NULL, 0, b"replace")
1847+
# CRASHES writer.decodeutf8stateful(NULL, 1, b"replace")
1848+
# CRASHES writer.decodeutf8stateful(NULL, -1, b"replace")
1849+
with self.assertRaises(UnicodeDecodeError):
1850+
writer.decodeutf8stateful(b"default\xFF", -1, NULL)
1851+
18171852
self.assertEqual(writer.finish(),
18181853
"ignore-replace\uFFFD-incomplete\uFFFD")
18191854

@@ -1824,12 +1859,12 @@ def test_decode_utf8_consumed(self):
18241859
# valid string
18251860
consumed = writer.decodeutf8stateful(b"text", -1, b"strict", True)
18261861
self.assertEqual(consumed, 4)
1827-
writer.write_char('-')
1862+
writer.write_char(ord('-'))
18281863

18291864
# non-ASCII
18301865
consumed = writer.decodeutf8stateful(b"\xC3\xA9-\xE2\x82\xAC", 6, b"strict", True)
18311866
self.assertEqual(consumed, 6)
1832-
writer.write_char('-')
1867+
writer.write_char(ord('-'))
18331868

18341869
# invalid UTF-8 (consumed is 0 on error)
18351870
with self.assertRaises(UnicodeDecodeError):
@@ -1838,54 +1873,92 @@ def test_decode_utf8_consumed(self):
18381873
# ignore error handler
18391874
consumed = writer.decodeutf8stateful(b"more\xFF", -1, b"ignore", True)
18401875
self.assertEqual(consumed, 5)
1841-
writer.write_char('-')
1876+
writer.write_char(ord('-'))
18421877

18431878
# incomplete trailing UTF-8 sequence
18441879
consumed = writer.decodeutf8stateful(b"incomplete\xC3", -1, b"ignore", True)
18451880
self.assertEqual(consumed, 10)
1881+
writer.write_char(ord('-'))
1882+
1883+
consumed = writer.decodeutf8stateful(NULL, 0, b"replace", True)
1884+
self.assertEqual(consumed, 0)
1885+
# CRASHES writer.decodeutf8stateful(NULL, 1, b"replace", True)
1886+
# CRASHES writer.decodeutf8stateful(NULL, -1, b"replace", True)
1887+
consumed = writer.decodeutf8stateful(b"default\xC3", -1, NULL, True)
1888+
self.assertEqual(consumed, 7)
18461889

1847-
self.assertEqual(writer.finish(), "text-\xE9-\u20AC-more-incomplete")
1890+
self.assertEqual(writer.finish(), "text-\xE9-\u20AC-more-incomplete-default")
18481891

18491892
def test_widechar(self):
1893+
from _testcapi import SIZEOF_WCHAR_T
1894+
1895+
if SIZEOF_WCHAR_T == 2:
1896+
encoding = 'utf-16le' if sys.byteorder == 'little' else 'utf-16be'
1897+
elif SIZEOF_WCHAR_T == 4:
1898+
encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1899+
18501900
writer = self.create_writer(0)
1851-
writer.write_widechar("latin1=\xE9")
1852-
writer.write_widechar("-")
1853-
writer.write_widechar("euro=\u20AC")
1854-
writer.write_char("-")
1855-
writer.write_widechar("max=\U0010ffff")
1856-
writer.write_char('.')
1901+
writer.write_widechar("latin1=\xE9".encode(encoding))
1902+
writer.write_char(ord("-"))
1903+
writer.write_widechar("euro=\u20AC".encode(encoding))
1904+
writer.write_char(ord("-"))
1905+
writer.write_widechar("max=\U0010ffff".encode(encoding))
1906+
writer.write_char(ord("-"))
1907+
writer.write_widechar("zeroes=".encode(encoding).ljust(SIZEOF_WCHAR_T * 10, b'\0'),
1908+
10)
1909+
writer.write_char(ord('.'))
1910+
1911+
if SIZEOF_WCHAR_T == 4:
1912+
invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
1913+
b'\x00\x11\x00\x00')
1914+
with self.assertRaises(ValueError):
1915+
writer.write_widechar("invalid=".encode(encoding) + invalid)
1916+
writer.write_widechar(b'', -5)
1917+
writer.write_widechar(NULL, 0)
1918+
# CRASHES writer.write_widechar(NULL, 1)
1919+
# CRASHES writer.write_widechar(NULL, -1)
1920+
18571921
self.assertEqual(writer.finish(),
1858-
"latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
1922+
"latin1=\xE9-euro=\u20AC-max=\U0010ffff-zeroes=\0\0\0.")
18591923

18601924
def test_ucs4(self):
1925+
encoding = 'utf-32le' if sys.byteorder == 'little' else 'utf-32be'
1926+
18611927
writer = self.create_writer(0)
1862-
writer.write_ucs4("ascii IGNORED", 5)
1863-
writer.write_char("-")
1864-
writer.write_ucs4("latin1=\xe9", 8)
1865-
writer.write_char("-")
1866-
writer.write_ucs4("euro=\u20ac", 6)
1867-
writer.write_char("-")
1868-
writer.write_ucs4("max=\U0010ffff", 5)
1869-
writer.write_char(".")
1928+
writer.write_ucs4("ascii IGNORED".encode(encoding), 5)
1929+
writer.write_char(ord("-"))
1930+
writer.write_ucs4("latin1=\xe9".encode(encoding))
1931+
writer.write_char(ord("-"))
1932+
writer.write_ucs4("euro=\u20ac".encode(encoding))
1933+
writer.write_char(ord("-"))
1934+
writer.write_ucs4("max=\U0010ffff".encode(encoding))
1935+
writer.write_char(ord("."))
18701936
self.assertEqual(writer.finish(),
18711937
"ascii-latin1=\xE9-euro=\u20AC-max=\U0010ffff.")
18721938

18731939
# Test some special characters
18741940
writer = self.create_writer(0)
18751941
# Lone surrogate character
1876-
writer.write_ucs4("lone\uDC80", 5)
1877-
writer.write_char("-")
1942+
writer.write_ucs4("lone\uDC80".encode(encoding, 'surrogatepass'))
1943+
writer.write_char(ord("-"))
18781944
# Surrogate pair
1879-
writer.write_ucs4("pair\uDBFF\uDFFF", 5)
1880-
writer.write_char("-")
1881-
writer.write_ucs4("null[\0]", 7)
1945+
writer.write_ucs4("pair\uD83D\uDC0D".encode(encoding, 'surrogatepass'))
1946+
writer.write_char(ord("-"))
1947+
writer.write_ucs4("null[\0]".encode(encoding), 7)
1948+
invalid = (b'\x00\x00\x11\x00' if sys.byteorder == 'little' else
1949+
b'\x00\x11\x00\x00')
1950+
# CRASHES writer.write_ucs4("invalid".encode(encoding) + invalid)
1951+
writer.write_ucs4(NULL, 0)
1952+
# CRASHES writer.write_ucs4(NULL, 1)
18821953
self.assertEqual(writer.finish(),
1883-
"lone\udc80-pair\udbff-null[\0]")
1954+
"lone\udc80-pair\ud83d\udc0d-null[\x00]")
18841955

18851956
# invalid size
18861957
writer = self.create_writer(0)
18871958
with self.assertRaises(ValueError):
1888-
writer.write_ucs4("text", -1)
1959+
writer.write_ucs4("text".encode(encoding), -1)
1960+
self.assertRaises(ValueError, writer.write_ucs4, b'', -1)
1961+
self.assertRaises(ValueError, writer.write_ucs4, NULL, -1)
18891962

18901963
def test_substring_empty(self):
18911964
writer = self.create_writer(0)
@@ -1911,7 +1984,7 @@ def test_format(self):
19111984
from ctypes import c_int
19121985
writer = self.create_writer(0)
19131986
self.writer_format(writer, b'%s %i', b'abc', c_int(123))
1914-
writer.write_char('.')
1987+
writer.write_char(ord('.'))
19151988
self.assertEqual(writer.finish(), 'abc 123.')
19161989

19171990
def test_recover_error(self):

0 commit comments

Comments
 (0)