Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit f69dbe2

Browse files
author
Ehsan Totoni
committed
refactor df value type inference
1 parent 4630e9a commit f69dbe2

1 file changed

Lines changed: 29 additions & 27 deletions

File tree

hpat/hiframes/boxing.py

Lines changed: 29 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,8 @@
2121
from hpat.str_arr_ext import (string_array_type, unbox_str_series, box_str_arr)
2222
from hpat.hiframes.pd_categorical_ext import (PDCategoricalDtype,
2323
box_categorical_array, unbox_categorical_array)
24-
from hpat.hiframes.pd_series_ext import SeriesType, arr_to_series_type
24+
from hpat.hiframes.pd_series_ext import (SeriesType, arr_to_series_type,
25+
_get_series_array_type)
2526
from hpat.hiframes.split_impl import (string_array_split_view_type,
2627
box_str_arr_split_view)
2728

@@ -103,46 +104,47 @@ def unbox_dataframe(typ, val, c):
103104
def get_hiframes_dtypes(df):
104105
"""get hiframe data types for a pandas dataframe
105106
"""
106-
pd_typ_list = df.dtypes.tolist()
107107
col_names = df.columns.tolist()
108-
hi_typs = []
109-
for cname, typ in zip(col_names, pd_typ_list):
110-
if typ == np.dtype('O'):
111-
# XXX assuming the whole column is strings if 1st val is string
112-
first_val = df[cname].iloc[0]
113-
if isinstance(first_val, list):
114-
typ = _infer_series_list_type(df[cname], cname)
115-
hi_typs.append(typ)
116-
continue
117-
if isinstance(first_val, str):
118-
hi_typs.append(string_array_type)
119-
continue
120-
else:
121-
raise ValueError("data type for column {} not supported".format(cname))
122-
try:
123-
t = numpy_support.from_dtype(typ)
124-
hi_typs.append(types.Array(t, 1, 'C'))
125-
except NotImplementedError:
126-
raise ValueError("data type for column {} not supported".format(cname))
127-
108+
hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname]))
109+
for cname in col_names]
128110
return tuple(hi_typs)
129111

130112

131-
def _infer_series_list_type(S, cname):
113+
def _infer_series_dtype(S):
114+
if S.dtype == np.dtype('O'):
115+
# XXX assuming the whole column is strings if 1st val is string
116+
first_val = S.iloc[0]
117+
if isinstance(first_val, list):
118+
return _infer_series_list_dtype(S)
119+
elif isinstance(first_val, str):
120+
return string_type
121+
else:
122+
raise ValueError(
123+
"data type for column {} not supported".format(S.name))
124+
125+
# regular numpy types
126+
try:
127+
return numpy_support.from_dtype(S.dtype)
128+
except NotImplementedError:
129+
raise ValueError("data type for column {} not supported".format(S.name))
130+
131+
132+
133+
def _infer_series_list_dtype(S):
132134
for i in range(len(S)):
133135
first_val = S.iloc[i]
134136
if not isinstance(first_val, list):
135137
raise ValueError(
136-
"data type for column {} not supported".format(cname))
138+
"data type for column {} not supported".format(S.name))
137139
if len(first_val) > 0:
138140
# TODO: support more types
139141
if isinstance(first_val[0], str):
140-
return list_string_array_type
142+
return types.List(string_type)
141143
else:
142144
raise ValueError(
143-
"data type for column {} not supported".format(cname))
145+
"data type for column {} not supported".format(S.name))
144146
raise ValueError(
145-
"data type for column {} not supported".format(cname))
147+
"data type for column {} not supported".format(S.name))
146148

147149

148150
@box(DataFrameType)

0 commit comments

Comments
 (0)