IntelPython
diff --git a/hpat/_str_decode.cpp b/hpat/_str_decode.cpp
Lines changed: 17 additions & 0 deletions
diff --git a/hpat/_str_ext.cpp b/hpat/_str_ext.cpp
Lines changed: 25 additions & 5 deletions
diff --git a/hpat/hiframes/aggregate.py b/hpat/hiframes/aggregate.py
Lines changed: 1 addition & 0 deletions
diff --git a/hpat/hiframes/hiframes_typed.py b/hpat/hiframes/hiframes_typed.py
Lines changed: 12 additions & 7 deletions
diff --git a/hpat/hiframes/join.py b/hpat/hiframes/join.py
Lines changed: 7 additions & 5 deletions
diff --git a/hpat/hiframes/series_kernels.py b/hpat/hiframes/series_kernels.py
Lines changed: 28 additions & 8 deletions
diff --git a/hpat/hiframes/sort.py b/hpat/hiframes/sort.py
Lines changed: 2 additions & 1 deletion
diff --git a/hpat/set_ext.py b/hpat/set_ext.py
Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ _C_UnicodeWriter_Init(_C_UnicodeWriter *writer)
         (0xffffU) :                                                     \
         (0x10ffffU)))
 
+#include "stringlib/bytesobject.cpp"
 
 #include "stringlib/ucs1lib.h"
 #include "stringlib/codecs.h"
@@ -547,3 +548,19 @@ static inline int _C_UnicodeWriter_WriteCharInline(_C_UnicodeWriter *writer, Py_
     writer->pos++;
     return 0;
 }
+
+
+int64_t unicode_to_utf8(char* out_data, char* data, int64_t size, int kind)
+{
+    // Dispatch on the PyUnicode storage kind: reinterpret `data` as `size` elements of the matching Py_UCS* width and forward to that width's stringlib UTF-8 encoder, returning the encoder's result.
+    switch (kind) {
+    default:
+        Py_UNREACHABLE();  // any kind other than the three handled below is treated as a programming error
+    case PyUnicode_1BYTE_KIND:
+        return ucs1lib_utf8_encoder(out_data, (Py_UCS1 *)data, size);
+    case PyUnicode_2BYTE_KIND:
+        return ucs2lib_utf8_encoder(out_data, (Py_UCS2 *)data, size);
+    case PyUnicode_4BYTE_KIND:
+        return ucs4lib_utf8_encoder(out_data, (Py_UCS4 *)data, size);
+    }
+}
@@ -69,7 +69,8 @@ void* np_array_from_string_array(int64_t no_strings, const uint32_t * offset_tab
 void allocate_string_array(uint32_t **offsets, char **data, uint8_t **null_bitmap,
     int64_t num_strings, int64_t total_size);
 
-void setitem_string_array(uint32_t *offsets, char *data, char* str, int64_t len, int64_t index);
+void setitem_string_array(uint32_t *offsets, char *data, int64_t n_bytes, char* str, int64_t len, int kind, int is_ascii, int64_t index);
+int64_t get_utf8_size(char* str, int64_t len, int kind);
 
 void set_string_array_range(uint32_t *out_offsets, char *out_data,
                             uint32_t *in_offsets, char *in_data,
@@ -216,6 +217,8 @@ PyMODINIT_FUNC PyInit_hstr_ext(void) {
                             PyLong_FromVoidPtr((void*)(&init_memsys)));
     PyObject_SetAttrString(m, "decode_utf8",
                             PyLong_FromVoidPtr((void*)(&decode_utf8)));
+    PyObject_SetAttrString(m, "get_utf8_size",
+                            PyLong_FromVoidPtr((void*)(&get_utf8_size)));
     return m;
 }
 
@@ -440,17 +443,34 @@ void allocate_string_array(uint32_t **offsets, char **data, uint8_t **null_bitma
     return;
 }
 
-void setitem_string_array(uint32_t *offsets, char *data, char* str, int64_t len, int64_t index)
+// Store `str` (len elements of PyUnicode `kind`) as UTF-8 in slot `index`: bytes go to data[offsets[index]..], the end offset to offsets[index+1].
+// n_bytes is the capacity of `data`; on a failed check a message goes to stderr and `data` / offsets[index+1] are left untouched.
+void setitem_string_array(uint32_t *offsets, char *data, int64_t n_bytes, char* str, int64_t len, int kind, int is_ascii, int64_t index)
 {
+#define CHECK(expr, msg) if(!(expr)){std::cerr << msg << std::endl; return;}
     // std::cout << "setitem str: " << *str << " " << index << std::endl;
     if (index==0)
         offsets[index] = 0;
     uint32_t start = offsets[index];
+    // Size first (ASCII: one byte per char; otherwise a NULL-output pass as in get_utf8_size) so the bounds checks run BEFORE anything is written.
+    int64_t utf8_len = (is_ascii==1) ? len : unicode_to_utf8(NULL, str, len, kind);
     // std::cout << "start " << start << " len " << len << std::endl;
-    memcpy(&data[start], str, len);
-    assert(len < std::numeric_limits<uint32_t>::max());
-    offsets[index+1] = start+ (uint32_t)len;
+
+    CHECK(utf8_len >= 0 && utf8_len < std::numeric_limits<uint32_t>::max(), "string array too large");
+    CHECK(start+utf8_len <= n_bytes, "out of bounds string array setitem");
+
+    if (is_ascii==1)
+        memcpy(&data[start], str, len);
+    else
+        unicode_to_utf8(&data[start], str, len, kind);
+    offsets[index+1] = start+ (uint32_t)utf8_len;
     return;
+#undef CHECK
+}
+
+int64_t get_utf8_size(char* str, int64_t len, int kind)
+{   // Returns unicode_to_utf8's result for a NULL output buffer; exported to Python as "get_utf8_size" in PyInit_hstr_ext.
+    return unicode_to_utf8(NULL, str, len, kind);  // NOTE(review): presumably NULL output means "count bytes only" - confirm in the stringlib encoder
 }
 
 void set_string_array_range(uint32_t *out_offsets, char *out_data,
 
@@ -885,6 +885,7 @@ def alloc_agg_output_overload(n_uniq_keys, out_dummy_tup, key_set,
         # string special case
         # TODO: handle strings in multi-key case
         if key_types == [string_type]:
+            # TODO: handle unicode length
             func_text += "  num_total_chars = num_total_chars_set_string(key_set)\n"
             func_text += "  out_key_0 = pre_alloc_string_array(n_uniq_keys, num_total_chars)\n"
         else:
 
@@ -24,7 +24,7 @@
 from hpat.str_ext import (string_type, unicode_to_std_str, std_str_to_unicode,
     list_string_array_type)
 from hpat.str_arr_ext import (string_array_type, StringArrayType,
-    is_str_arr_typ, pre_alloc_string_array)
+    is_str_arr_typ, pre_alloc_string_array, get_utf8_size)
 from hpat.hiframes.pd_series_ext import (SeriesType, is_str_series_typ,
     series_to_array_type, is_dt64_series_typ,
     if_series_to_array_type, is_series_type,
@@ -1661,7 +1661,7 @@ def _run_pd_DatetimeIndex(self, assign, lhs, rhs):
 
     def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
 
-        supported_methods = (hpat.hiframes.pd_series_ext.str2str_methods 
+        supported_methods = (hpat.hiframes.pd_series_ext.str2str_methods
             + ('len', 'replace', 'split', 'get', 'contains'))
         if func_name not in supported_methods:
             raise NotImplementedError(
@@ -1681,7 +1681,7 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
             else:
                 func_text += '    num_chars = 0\n'
                 func_text += '    for i in numba.parfor.internal_prange(n):\n'
-                func_text += '        num_chars += len(str_arr[i].{}())\n'.format(func_name)
+                func_text += '        num_chars += get_utf8_size(str_arr[i].{}())\n'.format(func_name)
             func_text += '    S = hpat.str_arr_ext.pre_alloc_string_array(n, num_chars)\n'
             func_text += '    for i in numba.parfor.internal_prange(n):\n'
             func_text += '        S[i] = str_arr[i].{}()\n'.format(func_name)
@@ -1691,7 +1691,10 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
             exec(func_text, {}, loc_vars)
             f = loc_vars['f']
             return self._replace_func(f, [arr], pre_nodes=nodes,
-                extra_globals={'num_total_chars': hpat.str_arr_ext.num_total_chars})
+                extra_globals={
+                    'num_total_chars': hpat.str_arr_ext.num_total_chars,
+                    'get_utf8_size': hpat.str_arr_ext.get_utf8_size,
+                })
 
         if func_name == 'contains':
             return self._run_series_str_contains(rhs, arr, nodes)
@@ -1742,7 +1745,8 @@ def _run_series_str_replace(self, assign, lhs, arr, rhs, nodes):
             [arr, rhs.args[0], rhs.args[1]], pre_nodes=nodes,
             extra_globals={'unicode_to_std_str': unicode_to_std_str,
                             'std_str_to_unicode': std_str_to_unicode,
-                            'pre_alloc_string_array': pre_alloc_string_array}
+                            'pre_alloc_string_array': pre_alloc_string_array,
+                            'get_utf8_size': get_utf8_size}
         )
 
 
@@ -1794,7 +1798,7 @@ def _str_get_impl(str_arr, ind):
                 in_list_str = str_arr[i]
                 out_str = in_list_str[ind]
                 str_list[i] = out_str
-                n_total_chars += len(out_str)
+                n_total_chars += get_utf8_size(out_str)
             numba.parfor.init_prange()
             out_arr = pre_alloc_string_array(n, n_total_chars)
             for i in numba.parfor.internal_prange(n):
@@ -1825,7 +1829,8 @@ def _str_get_impl(arr, ind):
                 'get_array_ctypes_ptr': get_array_ctypes_ptr,
                 'getitem_c_arr': getitem_c_arr,
                 'get_split_view_index': get_split_view_index,
-                'get_split_view_data_ptr': get_split_view_data_ptr})
+                'get_split_view_data_ptr': get_split_view_data_ptr,
+                'get_utf8_size': get_utf8_size})
 
     def _is_dt_index_binop(self, rhs):
         if rhs.op != 'binop':
 
@@ -18,7 +18,7 @@
                               get_offset_ptr, get_data_ptr, convert_len_arr_to_offset,
                               pre_alloc_string_array, num_total_chars,
                               getitem_str_offset, copy_str_arr_slice,
-                              str_copy_ptr,
+                              str_copy_ptr, get_utf8_size,
                               setitem_str_offset, str_arr_set_na)
 from hpat.str_ext import string_type
 from hpat.timsort import copyElement_tup, getitem_arr_tup, setitem_arr_tup
@@ -582,7 +582,7 @@ def write_data_buff_overload(meta, node_id, i, val, data):
         if not typ in (string_type, string_array_type):
             func_text += "  meta.send_buff_tup[{}][w_ind] = val_{}\n".format(i, i)
         else:
-            func_text += "  n_chars_{} = len(val_{})\n".format(i, i)
+            func_text += "  n_chars_{} = get_utf8_size(val_{})\n".format(i, i)
             func_text += "  meta.send_arr_lens_tup[{}][w_ind] = n_chars_{}\n".format(n_str, i)
             func_text += "  indc_{} = meta.send_disp_char_tup[{}][node_id] + meta.tmp_offset_char_tup[{}][node_id]\n".format(i, n_str, n_str)
             func_text += "  str_copy_ptr(meta.send_arr_chars_tup[{}], indc_{}, val_{}._data, n_chars_{})\n".format(n_str, i, i, i)
@@ -594,7 +594,8 @@ def write_data_buff_overload(meta, node_id, i, val, data):
     # print(func_text)
 
     loc_vars = {}
-    exec(func_text, {'str_copy_ptr': str_copy_ptr}, loc_vars)
+    exec(func_text, {'str_copy_ptr': str_copy_ptr,
+        'get_utf8_size': get_utf8_size}, loc_vars)
     write_impl = loc_vars['f']
     return write_impl
 
@@ -644,15 +645,16 @@ def write_data_send_buff_overload(meta_tup, node_id, ind, data, key_meta):
         else:
             # TODO: fix
             assert typ == string_array_type
-            func_text += "  n_chars_{} = len(val_{})\n".format(i, i)
+            func_text += "  n_chars_{} = get_utf8_size(val_{})\n".format(i, i)
             func_text += "  meta_tup[{}].send_arr_lens[ind_{}] = n_chars_{}\n".format(i, i, i)
             func_text += "  indc_{} = meta_tup[{}].send_disp_char[node_id] + meta_tup[{}].tmp_offset_char[node_id]\n".format(i, i, i)
             func_text += "  str_copy_ptr(meta_tup[{}].send_arr_chars, indc_{}, val_{}._data, n_chars_{})\n".format(i, i, i, i)
             func_text += "  meta_tup[{}].tmp_offset_char[node_id] += n_chars_{}\n".format(i, i)
 
     func_text += "  return\n"
     loc_vars = {}
-    exec(func_text, {'str_copy_ptr': str_copy_ptr}, loc_vars)
+    exec(func_text, {'str_copy_ptr': str_copy_ptr,
+        'get_utf8_size': get_utf8_size}, loc_vars)
     write_impl = loc_vars['f']
     return write_impl
 
 
@@ -9,7 +9,7 @@
 import hpat
 from hpat.str_ext import string_type, unicode_to_std_str, std_str_to_unicode
 from hpat.str_arr_ext import (string_array_type, StringArrayType,
-    is_str_arr_typ, pre_alloc_string_array)
+    is_str_arr_typ, pre_alloc_string_array, get_utf8_size)
 
 
 # float columns can have regular np.nan
@@ -363,7 +363,6 @@ def _series_astype_str_impl(arr):
     return hpat.hiframes.api.init_series(A)
 
 
-# TODO: refactor regex and noregex
 def _str_replace_regex_impl(str_arr, pat, val):
     numba.parfor.init_prange()
     e = hpat.str_ext.compile_regex(unicode_to_std_str(pat))
@@ -372,6 +371,7 @@ def _str_replace_regex_impl(str_arr, pat, val):
     n_total_chars = 0
     str_list = hpat.str_ext.alloc_str_list(n)
     for i in numba.parfor.internal_prange(n):
+        # TODO: support unicode
         in_str = unicode_to_std_str(str_arr[i])
         out_str = std_str_to_unicode(
             hpat.str_ext.str_replace_regex(in_str, e, val))
@@ -385,19 +385,39 @@ def _str_replace_regex_impl(str_arr, pat, val):
     return hpat.hiframes.api.init_series(out_arr)
 
 
+# TODO: refactor regex and noregex
+# implementation using std::string
+# def _str_replace_noregex_impl(str_arr, pat, val):
+#     numba.parfor.init_prange()
+#     e = unicode_to_std_str(pat)
+#     val = unicode_to_std_str(val)
+#     n = len(str_arr)
+#     n_total_chars = 0
+#     str_list = hpat.str_ext.alloc_str_list(n)
+#     for i in numba.parfor.internal_prange(n):
+#         # TODO: support unicode
+#         in_str = unicode_to_std_str(str_arr[i])
+#         out_str = std_str_to_unicode(
+#             hpat.str_ext.str_replace_noregex(in_str, e, val))
+#         str_list[i] = out_str
+#         n_total_chars += len(out_str)
+#     numba.parfor.init_prange()
+#     out_arr = pre_alloc_string_array(n, n_total_chars)
+#     for i in numba.parfor.internal_prange(n):
+#         _str = str_list[i]
+#         out_arr[i] = _str
+#     return hpat.hiframes.api.init_series(out_arr)
+
+
 def _str_replace_noregex_impl(str_arr, pat, val):
     numba.parfor.init_prange()
-    e = unicode_to_std_str(pat)
-    val = unicode_to_std_str(val)
     n = len(str_arr)
     n_total_chars = 0
     str_list = hpat.str_ext.alloc_str_list(n)
     for i in numba.parfor.internal_prange(n):
-        in_str = unicode_to_std_str(str_arr[i])
-        out_str = std_str_to_unicode(
-            hpat.str_ext.str_replace_noregex(in_str, e, val))
+        out_str = str_arr[i].replace(pat, val)
         str_list[i] = out_str
-        n_total_chars += len(out_str)
+        n_total_chars += get_utf8_size(out_str)
     numba.parfor.init_prange()
     out_arr = pre_alloc_string_array(n, n_total_chars)
     for i in numba.parfor.internal_prange(n):
 
@@ -401,7 +401,8 @@ def to_string_list_typ(typ):
     return typ
 
 
-@numba.njit(no_cpython_wrapper=True, cache=True)
+# TODO: fix cache issue
+@numba.njit(no_cpython_wrapper=True, cache=False)
 def local_sort(key_arrs, data, ascending=True):
     # convert StringArray to list(string) to enable swapping in sort
     l_key_arrs = to_string_list(key_arrs)
 
@@ -170,6 +170,7 @@ def to_array_overload(A):
     if A == set_string_type:
         #
         def set_string_to_array(A):
+            # TODO: support unicode
             num_total_chars = num_total_chars_set_string(A)
             num_strs = len(A)
             str_arr = pre_alloc_string_array(num_strs, num_total_chars)
Original file line number	Diff line number	Diff line change
`@@ -170,6 +170,7 @@ def to_array_overload(A):`
`170`	`170`	`if A == set_string_type:`
`171`	`171`	`#`
`172`	`172`	`def set_string_to_array(A):`
	`173`	`+ # TODO: support unicode`
`173`	`174`	`num_total_chars = num_total_chars_set_string(A)`
`174`	`175`	`num_strs = len(A)`
`175`	`176`	`str_arr = pre_alloc_string_array(num_strs, num_total_chars)`