Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit d79a316

Browse files
author
Ehsan Totoni
committed
use is_ascii flag in utf8 decode
1 parent 91dfb8f commit d79a316

3 files changed

Lines changed: 52 additions & 4 deletions

File tree

hpat/_str_decode.cpp

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ typedef struct {
3232
NRT_MemInfo *buffer;
3333
void *data;
3434
enum PyUnicode_Kind kind;
35+
int is_ascii;
3536
Py_UCS4 maxchar;
3637
Py_ssize_t size;
3738
Py_ssize_t pos;
@@ -62,6 +63,7 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
6263
/* use a value smaller than PyUnicode_1BYTE_KIND() so
6364
_C_UnicodeWriter_PrepareKind() will copy the buffer. */
6465
writer->kind = PyUnicode_WCHAR_KIND;
66+
writer->is_ascii = 0;
6567
assert(writer->kind <= PyUnicode_1BYTE_KIND);
6668
}
6769

@@ -98,6 +100,10 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
98100

99101
#include "stringlib/bytesobject.cpp"
100102

103+
#include "stringlib/asciilib.h"
104+
#include "stringlib/codecs.h"
105+
#include "stringlib/undef.h"
106+
101107
#include "stringlib/ucs1lib.h"
102108
#include "stringlib/codecs.h"
103109
#include "stringlib/undef.h"
@@ -115,14 +121,17 @@ static int _copy_characters(NRT_MemInfo *to, Py_ssize_t to_start,
115121
NRT_MemInfo *from, Py_ssize_t from_start,
116122
Py_ssize_t how_many, unsigned int from_kind, unsigned int to_kind);
117123

124+
125+
// similar to PyUnicode_New()
118126
NRT_MemInfo *alloc_writer(_C_UnicodeWriter *writer, Py_ssize_t newlen, Py_UCS4 maxchar)
119127
{
120128
enum PyUnicode_Kind kind;
129+
int is_ascii = 0;
121130
Py_ssize_t char_size;
122131

123132
if (maxchar < 128) {
124-
// TODO: anything needed for ASCII?
125133
kind = PyUnicode_1BYTE_KIND;
134+
is_ascii = 1;
126135
char_size = 1;
127136
}
128137
else if (maxchar < 256) {
@@ -156,13 +165,15 @@ NRT_MemInfo *alloc_writer(_C_UnicodeWriter *writer, Py_ssize_t newlen, Py_UCS4 m
156165

157166
if (!writer->readonly) {
158167
writer->kind = kind;
168+
writer->is_ascii = is_ascii;
159169
writer->size = newlen;
160170
}
161171
else {
162172
/* use a value smaller than PyUnicode_1BYTE_KIND() so
163173
_C_UnicodeWriter_PrepareKind() will copy the buffer. */
164174
writer->kind = PyUnicode_WCHAR_KIND;
165175
assert(writer->kind <= PyUnicode_1BYTE_KIND);
176+
writer->is_ascii = 0;
166177

167178
/* Copy-on-write mode: set buffer size to 0 so
168179
* _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
@@ -311,7 +322,7 @@ ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
311322
}
312323

313324
// ported from CPython PyUnicode_DecodeUTF8Stateful: https://github.com/python/cpython/blob/31e8d69bfe7cf5d4ffe0967cb225d2a8a229cc97/Objects/unicodeobject.c#L4813
314-
void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_MemInfo** meminfo)
325+
void decode_utf8(const char *s, Py_ssize_t size, int* kind, int *is_ascii, int* length, NRT_MemInfo** meminfo)
315326
{
316327
_C_UnicodeWriter writer;
317328
const char *starts = s;
@@ -320,11 +331,13 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
320331
Py_ssize_t startinpos;
321332
Py_ssize_t endinpos;
322333
const char *errmsg = "";
334+
*is_ascii = 0;
323335

324336
if (size == 0) {
325337
(*meminfo) = NRT_MemInfo_alloc_safe(1);
326338
((char*)((*meminfo)->data))[0] = 0;
327339
*kind = PyUnicode_1BYTE_KIND;
340+
*is_ascii = 1;
328341
*length = 0;
329342
return;
330343
}
@@ -336,6 +349,7 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
336349
((char*)((*meminfo)->data))[0] = s[0];
337350
((char*)((*meminfo)->data))[1] = 0;
338351
*kind = PyUnicode_1BYTE_KIND;
352+
*is_ascii = 1;
339353
*length = 1;
340354
return;
341355
}
@@ -352,8 +366,10 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
352366
int kind = writer.kind;
353367

354368
if (kind == PyUnicode_1BYTE_KIND) {
355-
// TODO: anything needed for ASCII?
356-
ch = ucs1lib_utf8_decode(&s, end, (Py_UCS1*)writer.data, &writer.pos);
369+
if (writer.is_ascii == 1)
370+
ch = asciilib_utf8_decode(&s, end, (Py_UCS1*)writer.data, &writer.pos);
371+
else
372+
ch = ucs1lib_utf8_decode(&s, end, (Py_UCS1*)writer.data, &writer.pos);
357373
} else if (kind == PyUnicode_2BYTE_KIND) {
358374
ch = ucs2lib_utf8_decode(&s, end, (Py_UCS2*)writer.data, &writer.pos);
359375
} else {
@@ -398,6 +414,7 @@ void decode_utf8(const char *s, Py_ssize_t size, int* kind, int* length, NRT_Mem
398414
End:
399415
(*meminfo) = writer.buffer;
400416
*kind = writer.kind;
417+
*is_ascii = writer.is_ascii;
401418
*length = writer.pos;
402419
// set null
403420
if (writer.kind == PyUnicode_1BYTE_KIND) {

hpat/str_arr_ext.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,12 +1044,14 @@ def codegen(context, builder, sig, args):
10441044
fnty = lir.FunctionType(lir.VoidType(), [lir.IntType(8).as_pointer(),
10451045
lir.IntType(64),
10461046
lir.IntType(32).as_pointer(),
1047+
lir.IntType(32).as_pointer(),
10471048
lir.IntType(64).as_pointer(),
10481049
uni_str.meminfo.type.as_pointer()])
10491050
fn_decode = builder.module.get_or_insert_function(
10501051
fnty, name="decode_utf8")
10511052
builder.call(fn_decode, [ptr, length,
10521053
uni_str._get_ptr_by_name('kind'),
1054+
uni_str._get_ptr_by_name('is_ascii'),
10531055
uni_str._get_ptr_by_name('length'),
10541056
uni_str._get_ptr_by_name('meminfo')])
10551057
uni_str.hash = context.get_constant(_Py_hash_t, -1)

hpat/stringlib/asciilib.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
/*
 * Parameter header for the "asciilib" flavour of the stringlib templates.
 *
 * The file consists solely of macro definitions: STRINGLIB(F) pastes the
 * asciilib_ prefix onto a name, the character type is the one-byte Py_UCS1,
 * and the largest character value is 0x7F.  It has no include guard.
 *
 * NOTE(review): presumably meant to be included immediately before a
 * stringlib template header (e.g. codecs.h) and followed by undef.h, so the
 * template is emitted with asciilib_-prefixed names -- confirm against the
 * including translation unit.
 */
1+
/* this is sort of a hack. there's at least one place (formatting
2+
floats) where some stringlib code takes a different path if it's
3+
compiled as unicode. */
4+
#define STRINGLIB_IS_UNICODE 1
5+
6+
/* Flavour-specific naming: FASTSEARCH and every STRINGLIB(F) use expand to
   identifiers carrying the asciilib_ prefix. */
#define FASTSEARCH asciilib_fastsearch
7+
#define STRINGLIB(F) asciilib_##F
8+
#define STRINGLIB_OBJECT PyUnicodeObject
9+
/* Element layout: one byte per character (Py_UCS1), values capped at 0x7F,
   i.e. the 7-bit ASCII range. */
#define STRINGLIB_SIZEOF_CHAR 1
10+
#define STRINGLIB_MAX_CHAR 0x7Fu
11+
#define STRINGLIB_CHAR Py_UCS1
12+
#define STRINGLIB_TYPE_NAME "unicode"
13+
#define STRINGLIB_PARSE_CODE "U"
14+
/* The aliases from here on name identifiers this header does not define
   (unicode_empty, BLOOM_LINEBREAK, PyUnicode_* ...).  A macro body is only
   resolved where it is expanded, so each one matters only if a template
   actually uses it.
   NOTE(review): whether all of these names exist in this project's build is
   not visible from this header -- confirm before relying on them. */
#define STRINGLIB_EMPTY unicode_empty
15+
#define STRINGLIB_ISSPACE Py_UNICODE_ISSPACE
16+
#define STRINGLIB_ISLINEBREAK BLOOM_LINEBREAK
17+
#define STRINGLIB_ISDECIMAL Py_UNICODE_ISDECIMAL
18+
#define STRINGLIB_TODECIMAL Py_UNICODE_TODECIMAL
19+
#define STRINGLIB_STR PyUnicode_1BYTE_DATA
20+
#define STRINGLIB_LEN PyUnicode_GET_LENGTH
21+
/* STR is cast to char* before being handed to _PyUnicode_FromASCII. */
#define STRINGLIB_NEW(STR,LEN) _PyUnicode_FromASCII((char*)(STR),(LEN))
22+
#define STRINGLIB_CHECK PyUnicode_Check
23+
#define STRINGLIB_CHECK_EXACT PyUnicode_CheckExact
24+
25+
#define STRINGLIB_TOSTR PyObject_Str
26+
#define STRINGLIB_TOASCII PyObject_ASCII
27+
28+
/* Redirects the generic name to the ASCII-specific
   _PyUnicode_ascii_InsertThousandsGrouping. */
#define _Py_InsertThousandsGrouping _PyUnicode_ascii_InsertThousandsGrouping
29+

0 commit comments

Comments
 (0)