Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 3918e54

Browse files
author
Ehsan Totoni
committed
bool arr filter for split view
1 parent e7bb1ea commit 3918e54

3 files changed

Lines changed: 122 additions & 1 deletion

File tree

hpat/_str_ext.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ void* init_string_const(char* in_str);
4444
void dtor_string(std::string** in_str, int64_t size, void* in);
4545
void dtor_string_array(str_arr_payload* in_str, int64_t size, void* in);
4646
void dtor_str_arr_split_view(str_arr_split_view_payload* in_str_arr, int64_t size, void* in);
47+
void str_arr_split_view_alloc(str_arr_split_view_payload* out_view, int64_t num_items, int64_t num_offsets);
4748
void str_arr_split_view_impl(str_arr_split_view_payload* out_view, int64_t n_strs, uint32_t* offsets, char* data, char sep);
4849
const char* get_c_str(std::string* s);
4950
const char* get_char_ptr(char c);
@@ -116,6 +117,8 @@ PyMODINIT_FUNC PyInit_hstr_ext(void) {
116117
PyLong_FromVoidPtr((void*)(&dtor_string_array)));
117118
PyObject_SetAttrString(m, "dtor_str_arr_split_view",
118119
PyLong_FromVoidPtr((void*)(&dtor_str_arr_split_view)));
120+
PyObject_SetAttrString(m, "str_arr_split_view_alloc",
121+
PyLong_FromVoidPtr((void*)(&str_arr_split_view_alloc)));
119122
PyObject_SetAttrString(m, "str_arr_split_view_impl",
120123
PyLong_FromVoidPtr((void*)(&str_arr_split_view_impl)));
121124
PyObject_SetAttrString(m, "get_c_str",
@@ -253,6 +256,13 @@ void dtor_str_arr_split_view(str_arr_split_view_payload* in_str_arr, int64_t siz
253256
return;
254257
}
255258

259+
// Allocate the two offset buffers of a split-view payload.
//   out_view    : payload whose index_offsets / data_offsets pointers are set
//   num_items   : number of items; index_offsets receives num_items + 1 slots
//   num_offsets : number of slots allocated for data_offsets
// Both buffers come from new[] and their contents are not initialized here.
// No other field of the payload is written by this function.
void str_arr_split_view_alloc(str_arr_split_view_payload* out_view, int64_t num_items, int64_t num_offsets)
{
    // one extra slot so the last item also has an end offset
    const int64_t n_index_slots = num_items + 1;

    out_view->index_offsets = new uint32_t[n_index_slots];
    out_view->data_offsets = new uint32_t[num_offsets];
}
265+
256266
// example: ['AB,CC', 'C,ABB,D', 'G', '', 'g,f']
257267
// offsets [0, 5, 12, 13, 13, 14, 17]
258268
// data_offsets [-1, 2, 5, 4, 6, 10, 12, 11, 13, 12, 13, 12, 14, 16]

hpat/hiframes/split_impl.py

Lines changed: 102 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from .. import hstr_ext
2828
ll.add_symbol('dtor_str_arr_split_view', hstr_ext.dtor_str_arr_split_view)
2929
ll.add_symbol('str_arr_split_view_impl', hstr_ext.str_arr_split_view_impl)
30+
ll.add_symbol('str_arr_split_view_alloc', hstr_ext.str_arr_split_view_alloc)
3031

3132
char_typ = types.uint8
3233
offset_typ = types.uint32
@@ -167,6 +168,7 @@ def codegen(context, builder, sig, args):
167168
out_view.num_items = in_str_arr.num_items
168169
out_view.index_offsets = view_payload.index_offsets
169170
out_view.data_offsets = view_payload.data_offsets
171+
# TODO: incref?
170172
out_view.data = context.compile_internal(
171173
builder, lambda S: get_data_ptr(S),
172174
data_ctypes_type(string_array_type), [str_arr])
@@ -245,6 +247,58 @@ def box_str_arr_split_view(typ, val, c):
245247
return out_arr
246248

247249

250+
@intrinsic
def pre_alloc_str_arr_view(typingctx, num_items_t, num_offsets_t, data_t=None):
    """Build a string-array split view with newly allocated offset buffers.

    Typing: both size arguments must be ``types.intp``; the third argument's
    type is passed through unchanged into the signature.  The result type is
    ``string_array_split_view_type``.

    Lowering: calls the native ``str_arr_split_view_alloc`` on the payload
    obtained from ``construct_str_arr_split_view``, then fills a view struct
    with the item count, the two offset pointers read back from the payload,
    the caller-supplied data pointer and the meminfo.
    """
    assert num_items_t == types.intp and num_offsets_t == types.intp

    def codegen(context, builder, sig, args):
        n_items, n_offsets, data = args
        meminfo, payload_ptr = construct_str_arr_split_view(context, builder)

        # Native signature:
        #   void str_arr_split_view_alloc(str_arr_split_view_payload* out_view,
        #                                 int64_t num_items,
        #                                 int64_t num_offsets)
        int64_ty = lir.IntType(64)
        alloc_fnty = lir.FunctionType(
            lir.VoidType(), [payload_ptr.type, int64_ty, int64_ty])
        alloc_fn = builder.module.get_or_insert_function(
            alloc_fnty, name="str_arr_split_view_alloc")
        builder.call(alloc_fn, [payload_ptr, n_items, n_offsets])

        # Load the payload only after the call, so the pointers stored by the
        # allocator are the ones read below.
        make_payload = cgutils.create_struct_proxy(
            str_arr_split_view_payload_type)
        payload = make_payload(
            context, builder, value=builder.load(payload_ptr))

        view = context.make_helper(builder, string_array_split_view_type)
        view.num_items = n_items
        view.index_offsets = payload.index_offsets
        view.data_offsets = payload.data_offsets
        # TODO: incref?
        # The data pointer is stored as given; nothing here takes a reference
        # on whatever owns it.
        view.data = data
        # NOTE: null_bitmap is not populated for the view yet.
        view.meminfo = meminfo

        return impl_ret_new_ref(
            context, builder, string_array_split_view_type, view._getvalue())

    view_sig = string_array_split_view_type(types.intp, types.intp, data_t)
    return view_sig, codegen
290+
291+
292+
@intrinsic
def get_c_arr_ptr(typingctx, c_arr, ind_t=None):
    """Return the address of element ``ind`` of a C array as a void pointer.

    The element address is computed with a GEP on the incoming pointer and
    then bitcast to ``i8*``; no memory is read or written.
    """
    def codegen(context, builder, sig, args):
        base_ptr, index = args
        elem_ptr = builder.gep(base_ptr, [index])
        byte_ptr_ty = lir.IntType(8).as_pointer()
        return builder.bitcast(elem_ptr, byte_ptr_ty)

    ptr_sig = types.voidptr(c_arr, ind_t)
    return ptr_sig, codegen
300+
301+
248302
@intrinsic
249303
def getitem_c_arr(typingctx, c_arr, ind_t=None):
250304
def codegen(context, builder, sig, args):
@@ -254,6 +308,16 @@ def codegen(context, builder, sig, args):
254308
return c_arr.dtype(c_arr, ind_t), codegen
255309

256310

311+
@intrinsic
def setitem_c_arr(typingctx, c_arr, ind_t, item_t=None):
    """Store ``item`` at position ``ind`` of a C array.

    The signature types the stored value as ``c_arr.dtype``; ``item_t`` itself
    is not used when building the signature.  The intrinsic returns nothing.
    """
    def codegen(context, builder, sig, args):
        base_ptr, index, value = args
        builder.store(value, builder.gep(base_ptr, [index]))

    store_sig = types.void(c_arr, ind_t, c_arr.dtype)
    return store_sig, codegen
319+
320+
257321
@intrinsic
258322
def get_array_ctypes_ptr(typingctx, arr_ctypes_t, ind_t=None):
259323
def codegen(context, builder, sig, args):
@@ -286,7 +350,6 @@ def _impl(A, ind):
286350
end_index = getitem_c_arr(A._index_offsets, ind+1)
287351
n = end_index - start_index - 1
288352

289-
290353
str_list = hpat.str_ext.alloc_str_list(n)
291354
for i in range(n):
292355
data_start = getitem_c_arr(
@@ -306,3 +369,41 @@ def _impl(A, ind):
306369
return str_list
307370

308371
return _impl
372+
373+
if A == string_array_split_view_type and ind == types.Array(types.bool_, 1, 'C'):
374+
def _impl(A, ind):
375+
n = len(A)
376+
if n != len(ind):
377+
raise IndexError("boolean index did not match indexed array"
378+
" along dimension 0")
379+
380+
num_items = 0
381+
num_offsets = 0
382+
for i in range(n):
383+
if ind[i]:
384+
num_items += 1
385+
start_index = getitem_c_arr(A._index_offsets, i)
386+
end_index = getitem_c_arr(A._index_offsets, i+1)
387+
num_offsets += end_index - start_index
388+
389+
out_arr = pre_alloc_str_arr_view(num_items, num_offsets, A._data)
390+
item_ind = 0
391+
offset_ind = 0
392+
for i in range(n):
393+
if ind[i]:
394+
start_index = getitem_c_arr(A._index_offsets, i)
395+
end_index = getitem_c_arr(A._index_offsets, i+1)
396+
n_offsets = end_index - start_index
397+
398+
setitem_c_arr(out_arr._index_offsets, item_ind, offset_ind)
399+
ptr = get_c_arr_ptr(A._data_offsets, start_index)
400+
out_ptr = get_c_arr_ptr(out_arr._data_offsets, offset_ind)
401+
_memcpy(out_ptr, ptr, n_offsets, 4)
402+
item_ind += 1
403+
offset_ind += n_offsets
404+
405+
# last item
406+
setitem_c_arr(out_arr._index_offsets, item_ind, offset_ind)
407+
return out_arr
408+
409+
return _impl

hpat/tests/test_hiframes.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,16 @@ def test_impl(df):
373373
pd.testing.assert_series_equal(
374374
hpat_func(df), test_impl(df), check_names=False)
375375

376+
def test_str_split_filter(self):
    """Boolean filtering on a split-string column gives the same frame as pandas."""
    def test_impl(df):
        parts = df.A.str.split(',')
        frame = pd.DataFrame({'B': parts})
        return frame[frame.B.str.len() > 1]

    # Mix of multi-part, single-part and empty strings so the filter both
    # keeps and drops rows.
    input_df = pd.DataFrame({'A': ['AB,CC', 'C,ABB,D', 'G', '', 'g,f']})
    jitted = hpat.jit(test_impl)

    actual = jitted(input_df)
    expected = test_impl(input_df)
    pd.testing.assert_frame_equal(actual, expected)
385+
376386
def test_str_split_box_df(self):
377387
def test_impl(df):
378388
return pd.DataFrame({'B': df.A.str.split(',')})

0 commit comments

Comments
 (0)