|
21 | 21 | from hpat.str_arr_ext import (string_array_type, unbox_str_series, box_str_arr) |
22 | 22 | from hpat.hiframes.pd_categorical_ext import (PDCategoricalDtype, |
23 | 23 | box_categorical_array, unbox_categorical_array) |
24 | | -from hpat.hiframes.pd_series_ext import SeriesType, arr_to_series_type |
| 24 | +from hpat.hiframes.pd_series_ext import (SeriesType, arr_to_series_type, |
| 25 | + _get_series_array_type) |
25 | 26 | from hpat.hiframes.split_impl import (string_array_split_view_type, |
26 | 27 | box_str_arr_split_view) |
27 | 28 |
|
@@ -103,46 +104,47 @@ def unbox_dataframe(typ, val, c): |
103 | 104 | def get_hiframes_dtypes(df): |
104 | 105 | """get hiframe data types for a pandas dataframe |
105 | 106 | """ |
106 | | - pd_typ_list = df.dtypes.tolist() |
107 | 107 | col_names = df.columns.tolist() |
108 | | - hi_typs = [] |
109 | | - for cname, typ in zip(col_names, pd_typ_list): |
110 | | - if typ == np.dtype('O'): |
111 | | - # XXX assuming the whole column is strings if 1st val is string |
112 | | - first_val = df[cname].iloc[0] |
113 | | - if isinstance(first_val, list): |
114 | | - typ = _infer_series_list_type(df[cname], cname) |
115 | | - hi_typs.append(typ) |
116 | | - continue |
117 | | - if isinstance(first_val, str): |
118 | | - hi_typs.append(string_array_type) |
119 | | - continue |
120 | | - else: |
121 | | - raise ValueError("data type for column {} not supported".format(cname)) |
122 | | - try: |
123 | | - t = numpy_support.from_dtype(typ) |
124 | | - hi_typs.append(types.Array(t, 1, 'C')) |
125 | | - except NotImplementedError: |
126 | | - raise ValueError("data type for column {} not supported".format(cname)) |
127 | | - |
| 108 | + hi_typs = [_get_series_array_type(_infer_series_dtype(df[cname])) |
| 109 | + for cname in col_names] |
128 | 110 | return tuple(hi_typs) |
129 | 111 |
|
130 | 112 |
|
131 | | -def _infer_series_list_type(S, cname): |
| 113 | +def _infer_series_dtype(S): |
| 114 | + if S.dtype == np.dtype('O'): |
| 115 | + # XXX assuming the whole column is strings if 1st val is string |
| 116 | + first_val = S.iloc[0] |
| 117 | + if isinstance(first_val, list): |
| 118 | + return _infer_series_list_dtype(S) |
| 119 | + elif isinstance(first_val, str): |
| 120 | + return string_type |
| 121 | + else: |
| 122 | + raise ValueError( |
| 123 | + "data type for column {} not supported".format(S.name)) |
| 124 | + |
| 125 | + # regular numpy types |
| 126 | + try: |
| 127 | + return numpy_support.from_dtype(S.dtype) |
| 128 | + except NotImplementedError: |
| 129 | + raise ValueError("data type for column {} not supported".format(S.name)) |
| 130 | + |
| 131 | + |
| 132 | + |
| 133 | +def _infer_series_list_dtype(S): |
132 | 134 | for i in range(len(S)): |
133 | 135 | first_val = S.iloc[i] |
134 | 136 | if not isinstance(first_val, list): |
135 | 137 | raise ValueError( |
136 | | - "data type for column {} not supported".format(cname)) |
| 138 | + "data type for column {} not supported".format(S.name)) |
137 | 139 | if len(first_val) > 0: |
138 | 140 | # TODO: support more types |
139 | 141 | if isinstance(first_val[0], str): |
140 | | - return list_string_array_type |
| 142 | + return types.List(string_type) |
141 | 143 | else: |
142 | 144 | raise ValueError( |
143 | | - "data type for column {} not supported".format(cname)) |
| 145 | + "data type for column {} not supported".format(S.name)) |
144 | 146 | raise ValueError( |
145 | | - "data type for column {} not supported".format(cname)) |
| 147 | + "data type for column {} not supported".format(S.name)) |
146 | 148 |
|
147 | 149 |
|
148 | 150 | @box(DataFrameType) |
|
0 commit comments