Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 4630e9a

Browse files
author
Ehsan Totoni
committed
optimized to_numeric() using string array view
1 parent 709974a commit 4630e9a

5 files changed

Lines changed: 117 additions & 4 deletions

File tree

hpat/_str_ext.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ using std::regex;
1616
using std::regex_search;
1717
#endif
1818

19+
#include <boost/lexical_cast.hpp>
20+
1921
#ifndef _WIN32
2022
#include <glob.h>
2123
#endif
@@ -76,6 +78,8 @@ void* getitem_string_array_std(uint32_t *offsets, char *data, int64_t index);
7678
void print_str(std::string* str);
7779
void print_char(char c);
7880
void print_int(int64_t val);
81+
int str_arr_to_int64(int64_t* out, uint32_t *offsets, char *data, int64_t index);
82+
int str_arr_to_float64(double* out, uint32_t *offsets, char *data, int64_t index);
7983
void* compile_regex(std::string* pat);
8084
bool str_contains_regex(std::string* str, regex* e);
8185
bool str_contains_noregex(std::string* str, std::string* pat);
@@ -167,6 +171,10 @@ PyMODINIT_FUNC PyInit_hstr_ext(void) {
167171
PyLong_FromVoidPtr((void*)(&print_char)));
168172
PyObject_SetAttrString(m, "print_int",
169173
PyLong_FromVoidPtr((void*)(&print_int)));
174+
PyObject_SetAttrString(m, "str_arr_to_int64",
175+
PyLong_FromVoidPtr((void*)(&str_arr_to_int64)));
176+
PyObject_SetAttrString(m, "str_arr_to_float64",
177+
PyLong_FromVoidPtr((void*)(&str_arr_to_float64)));
170178
PyObject_SetAttrString(m, "compile_regex",
171179
PyLong_FromVoidPtr((void*)(&compile_regex)));
172180
PyObject_SetAttrString(m, "str_contains_noregex",
@@ -493,6 +501,42 @@ void* getitem_string_array_std(uint32_t *offsets, char *data, int64_t index)
493501
return new std::string(&data[start], size);
494502
}
495503

504+
505+
int str_arr_to_int64(int64_t* out, uint32_t *offsets, char *data, int64_t index)
506+
{
507+
uint32_t size = offsets[index+1]-offsets[index];
508+
uint32_t start = offsets[index];
509+
try
510+
{
511+
*out = boost::lexical_cast<int64_t>(data+start, (std::size_t)size);
512+
return 0;
513+
}
514+
catch(const boost::bad_lexical_cast &)
515+
{
516+
*out = 0;
517+
return -1;
518+
}
519+
return -1;
520+
}
521+
522+
int str_arr_to_float64(double* out, uint32_t *offsets, char *data, int64_t index)
523+
{
524+
uint32_t size = offsets[index+1]-offsets[index];
525+
uint32_t start = offsets[index];
526+
try
527+
{
528+
*out = boost::lexical_cast<double>(data+start, (std::size_t)size);
529+
return 0;
530+
}
531+
catch(const boost::bad_lexical_cast &)
532+
{
533+
*out = std::nan(""); // TODO: numpy NaN
534+
return -1;
535+
}
536+
return -1;
537+
}
538+
539+
496540
void* compile_regex(std::string* pat)
497541
{
498542
// printf("compiling\n");

hpat/hiframes/hiframes_typed.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -762,28 +762,30 @@ def _flatten_impl(A):
762762

763763
if func_name == 'to_numeric':
764764
out_dtype = self.typemap[lhs.name].dtype
765-
conv_func = int
765+
conv_func = hpat.str_arr_ext.str_arr_item_to_int64
766766
if out_dtype == types.float64:
767-
conv_func = float
767+
conv_func = hpat.str_arr_ext.str_arr_item_to_float64
768768
else:
769769
assert out_dtype == types.int64
770770

771771
# TODO: handle non-Series input
772772

773773
def _to_numeric_impl(A):
774+
# TODO: fix distributed
774775
numba.parfor.init_prange()
775776
n = len(A)
776777
B = np.empty(n, out_dtype)
777778
for i in numba.parfor.internal_prange(n):
778-
B[i] = conv_func(A[i])
779+
conv_func(get_c_arr_ptr(B.ctypes, i), A, i)
779780

780781
return hpat.hiframes.api.init_series(B)
781782

782783
nodes = []
783784
data = self._get_series_data(rhs.args[0], nodes)
784785
return self._replace_func(_to_numeric_impl, [data],
785786
pre_nodes=nodes,
786-
extra_globals={'out_dtype': out_dtype, 'conv_func': conv_func})
787+
extra_globals={'out_dtype': out_dtype, 'conv_func': conv_func,
788+
'get_c_arr_ptr': hpat.hiframes.split_impl.get_c_arr_ptr})
787789

788790
if func_name == 'parse_datetimes_from_strings':
789791
nodes = []

hpat/hiframes/split_impl.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,8 +293,12 @@ def codegen(context, builder, sig, args):
293293

294294
@intrinsic
295295
def get_c_arr_ptr(typingctx, c_arr, ind_t=None):
296+
assert isinstance(c_arr, (types.CPointer, types.ArrayCTypes))
296297
def codegen(context, builder, sig, args):
297298
in_arr, ind = args
299+
if isinstance(sig.args[0], types.ArrayCTypes):
300+
in_arr = builder.extract_value(in_arr, 0)
301+
298302
return builder.bitcast(
299303
builder.gep(in_arr, [ind]), lir.IntType(8).as_pointer())
300304

hpat/str_arr_ext.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -593,6 +593,8 @@ def str_arr_len(str_arr):
593593
ll.add_symbol('print_int', hstr_ext.print_int)
594594
ll.add_symbol('convert_len_arr_to_offset', hstr_ext.convert_len_arr_to_offset)
595595
ll.add_symbol('set_string_array_range', hstr_ext.set_string_array_range)
596+
ll.add_symbol('str_arr_to_int64', hstr_ext.str_arr_to_int64)
597+
ll.add_symbol('str_arr_to_float64', hstr_ext.str_arr_to_float64)
596598

597599
convert_len_arr_to_offset = types.ExternalFunction("convert_len_arr_to_offset", types.void(types.voidptr, types.intp))
598600

@@ -1084,6 +1086,56 @@ def str_arr_arr_impl(str_arr, ind_arr):
10841086
res = context.compile_internal(builder, str_arr_arr_impl, sig, args)
10851087
return res
10861088

1089+
@intrinsic
def str_arr_item_to_int64(typingctx, out_ptr_t, str_arr_t, ind_t=None):
    """Intrinsic that calls the native ``str_arr_to_int64`` routine.

    Typing: ``str_arr_t`` must be ``string_array_type`` and ``ind_t`` must
    be ``types.int64`` (both enforced with asserts). The call is typed as
    ``int32(out_ptr_t, string_array_type, int64)``.

    Lowering: the output pointer is bitcast to ``i64*`` and passed, along
    with the string array's ``offsets`` and ``data`` members and the index,
    to the external function; that function's ``i32`` result is returned.
    """
    assert str_arr_t == string_array_type
    assert ind_t == types.int64

    def codegen(context, builder, sig, args):
        dest, str_arr, idx = args
        i64_ptr_ty = lir.IntType(64).as_pointer()
        dest = builder.bitcast(dest, i64_ptr_ty)
        arr_struct = context.make_helper(builder, string_array_type, str_arr)
        # Signature of the external function: i32 (i64*, i32*, i8*, i64)
        c_sig = lir.FunctionType(
            lir.IntType(32),
            [i64_ptr_ty,
             lir.IntType(32).as_pointer(),
             lir.IntType(8).as_pointer(),
             lir.IntType(64)])
        c_func = builder.module.get_or_insert_function(
            c_sig, name="str_arr_to_int64")
        call_args = [dest, arr_struct.offsets, arr_struct.data, idx]
        return builder.call(c_func, call_args)

    signature = types.int32(out_ptr_t, string_array_type, types.int64)
    return signature, codegen
1112+
1113+
1114+
@intrinsic
def str_arr_item_to_float64(typingctx, out_ptr_t, str_arr_t, ind_t=None):
    """Intrinsic that calls the native ``str_arr_to_float64`` routine.

    Typing: ``str_arr_t`` must be ``string_array_type`` and ``ind_t`` must
    be ``types.int64`` (both enforced with asserts). The call is typed as
    ``int32(out_ptr_t, string_array_type, int64)``.

    Lowering: the output pointer is bitcast to ``double*`` and passed, along
    with the string array's ``offsets`` and ``data`` members and the index,
    to the external function; that function's ``i32`` result is returned.
    """
    assert str_arr_t == string_array_type
    assert ind_t == types.int64

    def codegen(context, builder, sig, args):
        dest, str_arr, idx = args
        f64_ptr_ty = lir.DoubleType().as_pointer()
        dest = builder.bitcast(dest, f64_ptr_ty)
        arr_struct = context.make_helper(builder, string_array_type, str_arr)
        # Signature of the external function: i32 (double*, i32*, i8*, i64)
        c_sig = lir.FunctionType(
            lir.IntType(32),
            [f64_ptr_ty,
             lir.IntType(32).as_pointer(),
             lir.IntType(8).as_pointer(),
             lir.IntType(64)])
        c_func = builder.module.get_or_insert_function(
            c_sig, name="str_arr_to_float64")
        call_args = [dest, arr_struct.offsets, arr_struct.data, idx]
        return builder.call(c_func, call_args)

    signature = types.int32(out_ptr_t, string_array_type, types.int64)
    return signature, codegen
1137+
1138+
10871139

10881140
# TODO: support array of strings
10891141
# @typeof_impl.register(np.ndarray)

hpat/tests/test_hiframes.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,17 @@ def test_impl(df):
453453
self.assertEqual(count_array_REPs(), 3)
454454
self.assertEqual(count_parfor_REPs(), 0)
455455

456+
# Compares the hpat-compiled result with plain pandas for the chain
# .str.split(',') -> .str.get(1) -> pd.to_numeric(..., errors='coerce').
def test_str_get_to_numeric(self):
457+
def test_impl(df):
458+
B = df.A.str.split(',')
459+
C = pd.to_numeric(B.str.get(1), errors='coerce')
460+
return C
461+
462+
# Element 1 of each split row is numeric text ('12' and '321').
df = pd.DataFrame({'A': ['AB,12', 'C,321,D']})
463+
# locals pins the variable named C inside test_impl to an int64 array type
# for the compiled version (presumably so the output dtype is int64 -- confirm).
hpat_func = hpat.jit(locals={'C': hpat.int64[:]})(test_impl)
464+
# check_names=False: the Series names are not compared.
pd.testing.assert_series_equal(
465+
hpat_func(df), test_impl(df), check_names=False)
466+
456467
def test_str_flatten(self):
457468
def test_impl(df):
458469
A = df.A.str.split(',')

0 commit comments

Comments
 (0)