2424from hpat .str_ext import (string_type , unicode_to_std_str , std_str_to_unicode ,
2525 list_string_array_type )
2626from hpat .str_arr_ext import (string_array_type , StringArrayType ,
27- is_str_arr_typ , pre_alloc_string_array )
27+ is_str_arr_typ , pre_alloc_string_array , get_utf8_size )
2828from hpat .hiframes .pd_series_ext import (SeriesType , is_str_series_typ ,
2929 series_to_array_type , is_dt64_series_typ ,
3030 if_series_to_array_type , is_series_type ,
@@ -1661,7 +1661,7 @@ def _run_pd_DatetimeIndex(self, assign, lhs, rhs):
16611661
16621662 def _run_series_str_method (self , assign , lhs , series_var , func_name , rhs ):
16631663
1664- supported_methods = (hpat .hiframes .pd_series_ext .str2str_methods
1664+ supported_methods = (hpat .hiframes .pd_series_ext .str2str_methods
16651665 + ('len' , 'replace' , 'split' , 'get' , 'contains' ))
16661666 if func_name not in supported_methods :
16671667 raise NotImplementedError (
@@ -1681,7 +1681,7 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
16811681 else :
16821682 func_text += ' num_chars = 0\n '
16831683 func_text += ' for i in numba.parfor.internal_prange(n):\n '
1684- func_text += ' num_chars += len (str_arr[i].{}())\n ' .format (func_name )
1684+ func_text += ' num_chars += get_utf8_size (str_arr[i].{}())\n ' .format (func_name )
16851685 func_text += ' S = hpat.str_arr_ext.pre_alloc_string_array(n, num_chars)\n '
16861686 func_text += ' for i in numba.parfor.internal_prange(n):\n '
16871687 func_text += ' S[i] = str_arr[i].{}()\n ' .format (func_name )
@@ -1691,7 +1691,10 @@ def _run_series_str_method(self, assign, lhs, series_var, func_name, rhs):
16911691 exec (func_text , {}, loc_vars )
16921692 f = loc_vars ['f' ]
16931693 return self ._replace_func (f , [arr ], pre_nodes = nodes ,
1694- extra_globals = {'num_total_chars' : hpat .str_arr_ext .num_total_chars })
1694+ extra_globals = {
1695+ 'num_total_chars' : hpat .str_arr_ext .num_total_chars ,
1696+ 'get_utf8_size' : hpat .str_arr_ext .get_utf8_size ,
1697+ })
16951698
16961699 if func_name == 'contains' :
16971700 return self ._run_series_str_contains (rhs , arr , nodes )
@@ -1742,7 +1745,8 @@ def _run_series_str_replace(self, assign, lhs, arr, rhs, nodes):
17421745 [arr , rhs .args [0 ], rhs .args [1 ]], pre_nodes = nodes ,
17431746 extra_globals = {'unicode_to_std_str' : unicode_to_std_str ,
17441747 'std_str_to_unicode' : std_str_to_unicode ,
1745- 'pre_alloc_string_array' : pre_alloc_string_array }
1748+ 'pre_alloc_string_array' : pre_alloc_string_array ,
1749+ 'get_utf8_size' : get_utf8_size }
17461750 )
17471751
17481752
@@ -1794,7 +1798,7 @@ def _str_get_impl(str_arr, ind):
17941798 in_list_str = str_arr [i ]
17951799 out_str = in_list_str [ind ]
17961800 str_list [i ] = out_str
1797- n_total_chars += len (out_str )
1801+ n_total_chars += get_utf8_size (out_str )
17981802 numba .parfor .init_prange ()
17991803 out_arr = pre_alloc_string_array (n , n_total_chars )
18001804 for i in numba .parfor .internal_prange (n ):
@@ -1825,7 +1829,8 @@ def _str_get_impl(arr, ind):
18251829 'get_array_ctypes_ptr' : get_array_ctypes_ptr ,
18261830 'getitem_c_arr' : getitem_c_arr ,
18271831 'get_split_view_index' : get_split_view_index ,
1828- 'get_split_view_data_ptr' : get_split_view_data_ptr })
1832+ 'get_split_view_data_ptr' : get_split_view_data_ptr ,
1833+ 'get_utf8_size' : get_utf8_size })
18291834
18301835 def _is_dt_index_binop (self , rhs ):
18311836 if rhs .op != 'binop' :
0 commit comments