Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 709974a

Browse files
author
Ehsan Totoni
committed
optimized str.get() for split view
1 parent d59346e commit 709974a

2 files changed

Lines changed: 62 additions & 2 deletions

File tree

hpat/hiframes/hiframes_typed.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from hpat.hiframes import series_kernels, split_impl
3838
from hpat.hiframes.series_kernels import series_replace_funcs
3939
from hpat.hiframes.split_impl import (string_array_split_view_type,
40-
StringArraySplitViewType)
40+
StringArraySplitViewType, getitem_c_arr, get_array_ctypes_ptr)
4141

4242

4343
_dt_index_binops = ('==', '!=', '>=', '>', '<=', '<', '-',
@@ -1734,9 +1734,50 @@ def _str_get_impl(str_arr, ind):
17341734
out_arr[i] = _str
17351735
return hpat.hiframes.api.init_series(out_arr)
17361736

1737+
if arr_typ == string_array_split_view_type:
1738+
# TODO: refactor and enable distributed
1739+
def _str_get_impl(arr, ind):
    """Build a string Series holding piece ``ind`` of every row of ``arr``.

    Works in two passes over the split view: the first pass sums the
    character lengths of the selected pieces so the output string array
    can be allocated once; the second pass hands each piece's pointer
    and length to ``setitem_str_arr_ptr`` to fill the output.

    NOTE(review): no check is made that a row actually has more than
    ``ind`` pieces, and NaN rows are not handled (see TODOs below).
    """
    # ---- pass 1: total number of characters in the output ----
    numba.parfor.init_prange()
    num_rows = len(arr)
    char_total = 0
    for row in numba.parfor.internal_prange(num_rows):
        first_piece = getitem_c_arr(arr._index_offsets, row)
        # TODO: check num strings and support NAN
        # end_index = getitem_c_arr(arr._index_offsets, i+1)
        piece = first_piece + ind
        # the stored offset is one before the piece's first character
        begin = getitem_c_arr(arr._data_offsets, piece)
        begin += 1
        # get around -1 storage in uint32 problem
        if piece == 0:
            begin = 0
        stop = getitem_c_arr(arr._data_offsets, piece + 1)
        span = stop - begin
        char_total += span

    # ---- pass 2: allocate once, then fill each output string ----
    numba.parfor.init_prange()
    out_arr = pre_alloc_string_array(num_rows, char_total)
    for row in numba.parfor.internal_prange(num_rows):
        first_piece = getitem_c_arr(arr._index_offsets, row)
        # TODO: check num strings and support NAN
        # end_index = getitem_c_arr(arr._index_offsets, i+1)
        piece = first_piece + ind
        begin = getitem_c_arr(arr._data_offsets, piece)
        begin += 1
        # get around -1 storage in uint32 problem
        if piece == 0:
            begin = 0
        stop = getitem_c_arr(arr._data_offsets, piece + 1)
        span = stop - begin
        # pointer into the view's character buffer at the piece start
        src_ptr = get_array_ctypes_ptr(arr._data, begin)
        hpat.str_arr_ext.setitem_str_arr_ptr(out_arr, row, src_ptr, span)

    return hpat.hiframes.api.init_series(out_arr)
1775+
17371776
return self._replace_func(_str_get_impl, [arr, ind_var],
17381777
pre_nodes=nodes,
1739-
extra_globals={'pre_alloc_string_array': pre_alloc_string_array})
1778+
extra_globals={'pre_alloc_string_array': pre_alloc_string_array,
1779+
'get_array_ctypes_ptr': get_array_ctypes_ptr,
1780+
'getitem_c_arr': getitem_c_arr})
17401781

17411782
def _is_dt_index_binop(self, rhs):
17421783
if rhs.op != 'binop':

hpat/str_arr_ext.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -935,6 +935,25 @@ def setitem_str_arr(context, builder, sig, args):
935935
uni_str.data, uni_str.length, ind])
936936
return context.get_dummy_value()
937937

938+
@intrinsic
def setitem_str_arr_ptr(typingctx, str_arr_t, ind_t, ptr_t, len_t=None):
    """Intrinsic: set item ``ind`` of a string array from a pointer and length.

    Lowers to a call to the external ``setitem_string_array`` function,
    passing the array's offsets and data buffers, the first field
    extracted from the ``ptr`` value, the length and the index.
    The intrinsic is typed as returning void.
    """
    def codegen(context, builder, sig, args):
        arr, ind, ptr, length = args
        str_arr_struct = context.make_helper(builder, string_array_type, arr)

        # void setitem_string_array(i32*, i8*, i8*, i64, i64)
        char_ptr_ty = lir.IntType(8).as_pointer()
        int64_ty = lir.IntType(64)
        param_types = [lir.IntType(32).as_pointer(),
                       char_ptr_ty,
                       char_ptr_ty,
                       int64_ty,
                       int64_ty]
        setitem_fnty = lir.FunctionType(lir.VoidType(), param_types)
        setitem_fn = builder.module.get_or_insert_function(
            setitem_fnty, name="setitem_string_array")

        # field 0 of the ptr value is what gets passed as the i8* source
        raw_ptr = builder.extract_value(ptr, 0)
        call_args = [str_arr_struct.offsets, str_arr_struct.data,
                     raw_ptr, length, ind]
        builder.call(setitem_fn, call_args)
        return context.get_dummy_value()

    return types.void(str_arr_t, ind_t, ptr_t, len_t), codegen
956+
938957
def lower_is_na(context, builder, bull_bitmap, ind):
939958
fnty = lir.FunctionType(lir.IntType(1),
940959
[lir.IntType(8).as_pointer(),

0 commit comments

Comments
 (0)