Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 91dfb8f

Browse files
author
Ehsan Totoni
authored
Merge pull request #83 from IntelLabs/encode_utf8
Encode utf8
2 parents 9b89199 + 61740ad commit 91dfb8f

15 files changed

Lines changed: 516 additions & 40 deletions

hpat/_str_decode.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
9696
(0xffffU) : \
9797
(0x10ffffU)))
9898

99+
#include "stringlib/bytesobject.cpp"
99100

100101
#include "stringlib/ucs1lib.h"
101102
#include "stringlib/codecs.h"
@@ -547,3 +548,19 @@ static inline int _C_UnicodeWriter_WriteCharInline(_C_UnicodeWriter *writer, Py_
547548
writer->pos++;
548549
return 0;
549550
}
551+
552+
553+
int64_t unicode_to_utf8(char* out_data, char* data, int64_t size, int kind)
554+
{
555+
//
556+
switch (kind) {
557+
default:
558+
Py_UNREACHABLE();
559+
case PyUnicode_1BYTE_KIND:
560+
return ucs1lib_utf8_encoder(out_data, (Py_UCS1 *)data, size);
561+
case PyUnicode_2BYTE_KIND:
562+
return ucs2lib_utf8_encoder(out_data, (Py_UCS2 *)data, size);
563+
case PyUnicode_4BYTE_KIND:
564+
return ucs4lib_utf8_encoder(out_data, (Py_UCS4 *)data, size);
565+
}
566+
}

hpat/_str_ext.cpp

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,8 @@ void* np_array_from_string_array(int64_t no_strings, const uint32_t * offset_tab
6969
void allocate_string_array(uint32_t **offsets, char **data, uint8_t **null_bitmap,
7070
int64_t num_strings, int64_t total_size);
7171

72-
void setitem_string_array(uint32_t *offsets, char *data, char* str, int64_t len, int64_t index);
72+
void setitem_string_array(uint32_t *offsets, char *data, int64_t n_bytes, char* str, int64_t len, int kind, int is_ascii, int64_t index);
73+
int64_t get_utf8_size(char* str, int64_t len, int kind);
7374

7475
void set_string_array_range(uint32_t *out_offsets, char *out_data,
7576
uint32_t *in_offsets, char *in_data,
@@ -216,6 +217,8 @@ PyMODINIT_FUNC PyInit_hstr_ext(void) {
216217
PyLong_FromVoidPtr((void*)(&init_memsys)));
217218
PyObject_SetAttrString(m, "decode_utf8",
218219
PyLong_FromVoidPtr((void*)(&decode_utf8)));
220+
PyObject_SetAttrString(m, "get_utf8_size",
221+
PyLong_FromVoidPtr((void*)(&get_utf8_size)));
219222
return m;
220223
}
221224

@@ -440,17 +443,34 @@ void allocate_string_array(uint32_t **offsets, char **data, uint8_t **null_bitma
440443
return;
441444
}
442445

443-
void setitem_string_array(uint32_t *offsets, char *data, char* str, int64_t len, int64_t index)
446+
void setitem_string_array(uint32_t *offsets, char *data, int64_t n_bytes, char* str, int64_t len, int kind, int is_ascii, int64_t index)
444447
{
448+
#define CHECK(expr, msg) if(!(expr)){std::cerr << msg << std::endl; return;}
445449
// std::cout << "setitem str: " << *str << " " << index << std::endl;
446450
if (index==0)
447451
offsets[index] = 0;
448452
uint32_t start = offsets[index];
453+
int64_t utf8_len = -1;
449454
// std::cout << "start " << start << " len " << len << std::endl;
450-
memcpy(&data[start], str, len);
451-
assert(len < std::numeric_limits<uint32_t>::max());
452-
offsets[index+1] = start+ (uint32_t)len;
455+
456+
if (is_ascii==1) {
457+
memcpy(&data[start], str, len);
458+
utf8_len = len;
459+
}
460+
else {
461+
utf8_len = unicode_to_utf8(&data[start], str, len, kind);
462+
}
463+
464+
CHECK(utf8_len < std::numeric_limits<uint32_t>::max(), "string array too large");
465+
CHECK(start+utf8_len <= n_bytes, "out of bounds string array setitem");
466+
offsets[index+1] = start+ (uint32_t)utf8_len;
453467
return;
468+
#undef CHECK
469+
}
470+
471+
int64_t get_utf8_size(char* str, int64_t len, int kind)
472+
{
473+
return unicode_to_utf8(NULL, str, len, kind);
454474
}
455475

456476
void set_string_array_range(uint32_t *out_offsets, char *out_data,

hpat/hiframes/aggregate.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,7 @@ def alloc_agg_output_overload(n_uniq_keys, out_dummy_tup, key_set,
885885
# string special case
886886
# TODO: handle strings in multi-key case
887887
if key_types == [string_type]:
888+
# TODO: handle unicode length
888889
func_text += " num_total_chars = num_total_chars_set_string(key_set)\n"
889890
func_text += " out_key_0 = pre_alloc_string_array(n_uniq_keys, num_total_chars)\n"
890891
else:

hpat/hiframes/hiframes_typed.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from hpat.str_ext import (string_type, unicode_to_std_str, std_str_to_unicode,
2525
list_string_array_type)
2626
from hpat.str_arr_ext import (string_array_type, StringArrayType,
27-
is_str_arr_typ, pre_alloc_string_array)
27+
is_str_arr_typ, pre_alloc_string_array, get_utf8_size)
2828
from hpat.hiframes.pd_series_ext import (SeriesType, is_str_series_typ,
2929
series_to_array_type, is_dt64_series_typ,
3030
if_series_to_array_type, is_series_type,
@@ -1661,7 +1661,7 @@ def _run_pd_DatetimeIndex(self, assign, lhs, rhs):
16611661

16621662
def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
16631663

1664-
supported_methods = (hpat.hiframes.pd_series_ext.str2str_methods
1664+
supported_methods = (hpat.hiframes.pd_series_ext.str2str_methods
16651665
+ ('len', 'replace', 'split', 'get', 'contains'))
16661666
if func_name not in supported_methods:
16671667
raise NotImplementedError(
@@ -1681,7 +1681,7 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
16811681
else:
16821682
func_text += ' num_chars = 0\n'
16831683
func_text += ' for i in numba.parfor.internal_prange(n):\n'
1684-
func_text += ' num_chars += len(str_arr[i].{}())\n'.format(func_name)
1684+
func_text += ' num_chars += get_utf8_size(str_arr[i].{}())\n'.format(func_name)
16851685
func_text += ' S = hpat.str_arr_ext.pre_alloc_string_array(n, num_chars)\n'
16861686
func_text += ' for i in numba.parfor.internal_prange(n):\n'
16871687
func_text += ' S[i] = str_arr[i].{}()\n'.format(func_name)
@@ -1691,7 +1691,10 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
16911691
exec(func_text, {}, loc_vars)
16921692
f = loc_vars['f']
16931693
return self._replace_func(f, [arr], pre_nodes=nodes,
1694-
extra_globals={'num_total_chars': hpat.str_arr_ext.num_total_chars})
1694+
extra_globals={
1695+
'num_total_chars': hpat.str_arr_ext.num_total_chars,
1696+
'get_utf8_size': hpat.str_arr_ext.get_utf8_size,
1697+
})
16951698

16961699
if func_name == 'contains':
16971700
return self._run_series_str_contains(rhs, arr, nodes)
@@ -1742,7 +1745,8 @@ def _run_series_str_replace(self, assign, lhs, arr, rhs, nodes):
17421745
[arr, rhs.args[0], rhs.args[1]], pre_nodes=nodes,
17431746
extra_globals={'unicode_to_std_str': unicode_to_std_str,
17441747
'std_str_to_unicode': std_str_to_unicode,
1745-
'pre_alloc_string_array': pre_alloc_string_array}
1748+
'pre_alloc_string_array': pre_alloc_string_array,
1749+
'get_utf8_size': get_utf8_size}
17461750
)
17471751

17481752

@@ -1794,7 +1798,7 @@ def _str_get_impl(str_arr, ind):
17941798
in_list_str = str_arr[i]
17951799
out_str = in_list_str[ind]
17961800
str_list[i] = out_str
1797-
n_total_chars += len(out_str)
1801+
n_total_chars += get_utf8_size(out_str)
17981802
numba.parfor.init_prange()
17991803
out_arr = pre_alloc_string_array(n, n_total_chars)
18001804
for i in numba.parfor.internal_prange(n):
@@ -1825,7 +1829,8 @@ def _str_get_impl(arr, ind):
18251829
'get_array_ctypes_ptr': get_array_ctypes_ptr,
18261830
'getitem_c_arr': getitem_c_arr,
18271831
'get_split_view_index': get_split_view_index,
1828-
'get_split_view_data_ptr': get_split_view_data_ptr})
1832+
'get_split_view_data_ptr': get_split_view_data_ptr,
1833+
'get_utf8_size': get_utf8_size})
18291834

18301835
def _is_dt_index_binop(self, rhs):
18311836
if rhs.op != 'binop':

hpat/hiframes/join.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
get_offset_ptr, get_data_ptr, convert_len_arr_to_offset,
1919
pre_alloc_string_array, num_total_chars,
2020
getitem_str_offset, copy_str_arr_slice,
21-
str_copy_ptr,
21+
str_copy_ptr, get_utf8_size,
2222
setitem_str_offset, str_arr_set_na)
2323
from hpat.str_ext import string_type
2424
from hpat.timsort import copyElement_tup, getitem_arr_tup, setitem_arr_tup
@@ -582,7 +582,7 @@ def write_data_buff_overload(meta, node_id, i, val, data):
582582
if not typ in (string_type, string_array_type):
583583
func_text += " meta.send_buff_tup[{}][w_ind] = val_{}\n".format(i, i)
584584
else:
585-
func_text += " n_chars_{} = len(val_{})\n".format(i, i)
585+
func_text += " n_chars_{} = get_utf8_size(val_{})\n".format(i, i)
586586
func_text += " meta.send_arr_lens_tup[{}][w_ind] = n_chars_{}\n".format(n_str, i)
587587
func_text += " indc_{} = meta.send_disp_char_tup[{}][node_id] + meta.tmp_offset_char_tup[{}][node_id]\n".format(i, n_str, n_str)
588588
func_text += " str_copy_ptr(meta.send_arr_chars_tup[{}], indc_{}, val_{}._data, n_chars_{})\n".format(n_str, i, i, i)
@@ -594,7 +594,8 @@ def write_data_buff_overload(meta, node_id, i, val, data):
594594
# print(func_text)
595595

596596
loc_vars = {}
597-
exec(func_text, {'str_copy_ptr': str_copy_ptr}, loc_vars)
597+
exec(func_text, {'str_copy_ptr': str_copy_ptr,
598+
'get_utf8_size': get_utf8_size}, loc_vars)
598599
write_impl = loc_vars['f']
599600
return write_impl
600601

@@ -644,15 +645,16 @@ def write_data_send_buff_overload(meta_tup, node_id, ind, data, key_meta):
644645
else:
645646
# TODO: fix
646647
assert typ == string_array_type
647-
func_text += " n_chars_{} = len(val_{})\n".format(i, i)
648+
func_text += " n_chars_{} = get_utf8_size(val_{})\n".format(i, i)
648649
func_text += " meta_tup[{}].send_arr_lens[ind_{}] = n_chars_{}\n".format(i, i, i)
649650
func_text += " indc_{} = meta_tup[{}].send_disp_char[node_id] + meta_tup[{}].tmp_offset_char[node_id]\n".format(i, i, i)
650651
func_text += " str_copy_ptr(meta_tup[{}].send_arr_chars, indc_{}, val_{}._data, n_chars_{})\n".format(i, i, i, i)
651652
func_text += " meta_tup[{}].tmp_offset_char[node_id] += n_chars_{}\n".format(i, i)
652653

653654
func_text += " return\n"
654655
loc_vars = {}
655-
exec(func_text, {'str_copy_ptr': str_copy_ptr}, loc_vars)
656+
exec(func_text, {'str_copy_ptr': str_copy_ptr,
657+
'get_utf8_size': get_utf8_size}, loc_vars)
656658
write_impl = loc_vars['f']
657659
return write_impl
658660

hpat/hiframes/series_kernels.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import hpat
1010
from hpat.str_ext import string_type, unicode_to_std_str, std_str_to_unicode
1111
from hpat.str_arr_ext import (string_array_type, StringArrayType,
12-
is_str_arr_typ, pre_alloc_string_array)
12+
is_str_arr_typ, pre_alloc_string_array, get_utf8_size)
1313

1414

1515
# float columns can have regular np.nan
@@ -363,7 +363,6 @@ def _series_astype_str_impl(arr):
363363
return hpat.hiframes.api.init_series(A)
364364

365365

366-
# TODO: refactor regex and noregex
367366
def _str_replace_regex_impl(str_arr, pat, val):
368367
numba.parfor.init_prange()
369368
e = hpat.str_ext.compile_regex(unicode_to_std_str(pat))
@@ -372,6 +371,7 @@ def _str_replace_regex_impl(str_arr, pat, val):
372371
n_total_chars = 0
373372
str_list = hpat.str_ext.alloc_str_list(n)
374373
for i in numba.parfor.internal_prange(n):
374+
# TODO: support unicode
375375
in_str = unicode_to_std_str(str_arr[i])
376376
out_str = std_str_to_unicode(
377377
hpat.str_ext.str_replace_regex(in_str, e, val))
@@ -385,19 +385,39 @@ def _str_replace_regex_impl(str_arr, pat, val):
385385
return hpat.hiframes.api.init_series(out_arr)
386386

387387

388+
# TODO: refactor regex and noregex
389+
# implementation using std::string
390+
# def _str_replace_noregex_impl(str_arr, pat, val):
391+
# numba.parfor.init_prange()
392+
# e = unicode_to_std_str(pat)
393+
# val = unicode_to_std_str(val)
394+
# n = len(str_arr)
395+
# n_total_chars = 0
396+
# str_list = hpat.str_ext.alloc_str_list(n)
397+
# for i in numba.parfor.internal_prange(n):
398+
# # TODO: support unicode
399+
# in_str = unicode_to_std_str(str_arr[i])
400+
# out_str = std_str_to_unicode(
401+
# hpat.str_ext.str_replace_noregex(in_str, e, val))
402+
# str_list[i] = out_str
403+
# n_total_chars += len(out_str)
404+
# numba.parfor.init_prange()
405+
# out_arr = pre_alloc_string_array(n, n_total_chars)
406+
# for i in numba.parfor.internal_prange(n):
407+
# _str = str_list[i]
408+
# out_arr[i] = _str
409+
# return hpat.hiframes.api.init_series(out_arr)
410+
411+
388412
def _str_replace_noregex_impl(str_arr, pat, val):
389413
numba.parfor.init_prange()
390-
e = unicode_to_std_str(pat)
391-
val = unicode_to_std_str(val)
392414
n = len(str_arr)
393415
n_total_chars = 0
394416
str_list = hpat.str_ext.alloc_str_list(n)
395417
for i in numba.parfor.internal_prange(n):
396-
in_str = unicode_to_std_str(str_arr[i])
397-
out_str = std_str_to_unicode(
398-
hpat.str_ext.str_replace_noregex(in_str, e, val))
418+
out_str = str_arr[i].replace(pat, val)
399419
str_list[i] = out_str
400-
n_total_chars += len(out_str)
420+
n_total_chars += get_utf8_size(out_str)
401421
numba.parfor.init_prange()
402422
out_arr = pre_alloc_string_array(n, n_total_chars)
403423
for i in numba.parfor.internal_prange(n):

hpat/hiframes/sort.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -401,7 +401,8 @@ def to_string_list_typ(typ):
401401
return typ
402402

403403

404-
@numba.njit(no_cpython_wrapper=True, cache=True)
404+
# TODO: fix cache issue
405+
@numba.njit(no_cpython_wrapper=True, cache=False)
405406
def local_sort(key_arrs, data, ascending=True):
406407
# convert StringArray to list(string) to enable swapping in sort
407408
l_key_arrs = to_string_list(key_arrs)

hpat/set_ext.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ def to_array_overload(A):
170170
if A == set_string_type:
171171
#
172172
def set_string_to_array(A):
173+
# TODO: support unicode
173174
num_total_chars = num_total_chars_set_string(A)
174175
num_strs = len(A)
175176
str_arr = pre_alloc_string_array(num_strs, num_total_chars)

0 commit comments

Comments
 (0)