@@ -32,6 +32,7 @@ typedef struct {
3232 NRT_MemInfo *buffer;
3333 void *data;
3434 enum PyUnicode_Kind kind;
35+ int is_ascii;
3536 Py_UCS4 maxchar;
3637 Py_ssize_t size;
3738 Py_ssize_t pos;
@@ -62,6 +63,7 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
6263 /* use a value smaller than PyUnicode_1BYTE_KIND() so
6364 _C_UnicodeWriter_PrepareKind() will copy the buffer. */
6465 writer->kind = PyUnicode_WCHAR_KIND;
66+ writer->is_ascii = 0 ;
6567 assert (writer->kind <= PyUnicode_1BYTE_KIND);
6668}
6769
@@ -98,6 +100,10 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
98100
99101#include " stringlib/bytesobject.cpp"
100102
103+ #include " stringlib/asciilib.h"
104+ #include " stringlib/codecs.h"
105+ #include " stringlib/undef.h"
106+
101107#include " stringlib/ucs1lib.h"
102108#include " stringlib/codecs.h"
103109#include " stringlib/undef.h"
@@ -115,14 +121,17 @@ static int _copy_characters(NRT_MemInfo *to, Py_ssize_t to_start,
115121 NRT_MemInfo *from, Py_ssize_t from_start,
116122 Py_ssize_t how_many, unsigned int from_kind, unsigned int to_kind);
117123
124+
125+ // similar to PyUnicode_New()
118126NRT_MemInfo *alloc_writer (_C_UnicodeWriter *writer, Py_ssize_t newlen, Py_UCS4 maxchar)
119127{
120128 enum PyUnicode_Kind kind;
129+ int is_ascii = 0 ;
121130 Py_ssize_t char_size;
122131
123132 if (maxchar < 128 ) {
124- // TODO: anything needed for ASCII?
125133 kind = PyUnicode_1BYTE_KIND;
134+ is_ascii = 1 ;
126135 char_size = 1 ;
127136 }
128137 else if (maxchar < 256 ) {
@@ -156,13 +165,15 @@ NRT_MemInfo *alloc_writer(_C_UnicodeWriter *writer, Py_ssize_t newlen, Py_UCS4 m
156165
157166 if (!writer->readonly ) {
158167 writer->kind = kind;
168+ writer->is_ascii = is_ascii;
159169 writer->size = newlen;
160170 }
161171 else {
162172 /* use a value smaller than PyUnicode_1BYTE_KIND() so
163173 _C_UnicodeWriter_PrepareKind() will copy the buffer. */
164174 writer->kind = PyUnicode_WCHAR_KIND;
165175 assert (writer->kind <= PyUnicode_1BYTE_KIND);
176+ writer->is_ascii = 0 ;
166177
167178 /* Copy-on-write mode: set buffer size to 0 so
168179 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
@@ -311,7 +322,7 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
311322}
312323
313324// ported from CPython PyUnicode_DecodeUTF8Stateful: https://github.com/python/cpython/blob/31e8d69bfe7cf5d4ffe0967cb225d2a8a229cc97/Objects/unicodeobject.c#L4813
314- void decode_utf8 (const char *s, Py_ssize_t size, int * kind, int * length, NRT_MemInfo** meminfo)
325+ void decode_utf8 (const char *s, Py_ssize_t size, int * kind, int *is_ascii, int * length, NRT_MemInfo** meminfo)
315326{
316327 _C_UnicodeWriter writer;
317328 const char *starts = s;
@@ -320,11 +331,13 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
320331 Py_ssize_t startinpos;
321332 Py_ssize_t endinpos;
322333 const char *errmsg = " " ;
334+ *is_ascii = 0 ;
323335
324336 if (size == 0 ) {
325337 (*meminfo) = NRT_MemInfo_alloc_safe (1 );
326338 ((char *)((*meminfo)->data ))[0 ] = 0 ;
327339 *kind = PyUnicode_1BYTE_KIND;
340+ *is_ascii = 1 ;
328341 *length = 0 ;
329342 return ;
330343 }
@@ -336,6 +349,7 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
336349 ((char *)((*meminfo)->data ))[0 ] = s[0 ];
337350 ((char *)((*meminfo)->data ))[1 ] = 0 ;
338351 *kind = PyUnicode_1BYTE_KIND;
352+ *is_ascii = 1 ;
339353 *length = 1 ;
340354 return ;
341355 }
@@ -352,8 +366,10 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
352366 int kind = writer.kind ;
353367
354368 if (kind == PyUnicode_1BYTE_KIND) {
355- // TODO: anything needed for ASCII?
356- ch = ucs1lib_utf8_decode (&s, end, (Py_UCS1*)writer.data , &writer.pos );
369+ if (writer.is_ascii == 1 )
370+ ch = asciilib_utf8_decode (&s, end, (Py_UCS1*)writer.data , &writer.pos );
371+ else
372+ ch = ucs1lib_utf8_decode (&s, end, (Py_UCS1*)writer.data , &writer.pos );
357373 } else if (kind == PyUnicode_2BYTE_KIND) {
358374 ch = ucs2lib_utf8_decode (&s, end, (Py_UCS2*)writer.data , &writer.pos );
359375 } else {
@@ -398,6 +414,7 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
398414End:
399415 (*meminfo) = writer.buffer ;
400416 *kind = writer.kind ;
417+ *is_ascii = writer.is_ascii ;
401418 *length = writer.pos ;
402419 // set null
403420 if (writer.kind == PyUnicode_1BYTE_KIND) {
0 commit comments