Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 5d09505

Browse files
Merge pull request #935 from IntelPython/release_0.36.0
Merge release 0.36.0 into master
2 parents ee80555 + 30122b2 commit 5d09505

7 files changed

Lines changed: 125 additions & 143 deletions

File tree

conda-recipe/meta.yaml

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,8 @@ build:
1616

1717
requirements:
1818
build:
19-
- {{ compiler('c') }}
20-
- {{ compiler('cxx') }}
19+
- {{ compiler('c') }} # [not osx]
20+
- {{ compiler('cxx') }} # [not osx]
2121
- wheel
2222
- python
2323

@@ -51,8 +51,8 @@ outputs:
5151
name: sdc
5252
requirements:
5353
build:
54-
- {{ compiler('c') }}
55-
- {{ compiler('cxx') }}
54+
- {{ compiler('c') }} # [not osx]
55+
- {{ compiler('cxx') }} # [not osx]
5656
- python
5757
- wheel
5858
- setuptools

sdc/datatypes/hpat_pandas_functions.py

Lines changed: 20 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -39,13 +39,12 @@
3939

4040
from sdc.io.csv_ext import (
4141
_gen_csv_reader_py_pyarrow_py_func,
42-
_gen_pandas_read_csv_func_text,
42+
_gen_csv_reader_py_pyarrow_func_text_dataframe,
4343
)
4444
from sdc.str_arr_ext import string_array_type
4545

4646
from sdc.hiframes import join, aggregate, sort
4747
from sdc.types import CategoricalDtypeType, Categorical
48-
from sdc.datatypes.categorical.pdimpl import _reconstruct_CategoricalDtype
4948

5049

5150
def get_numba_array_types_for_csv(df):
@@ -256,69 +255,45 @@ def sdc_pandas_read_csv(
256255
usecols = [col.literal_value for col in usecols]
257256

258257
if infer_from_params:
259-
# dtype is a tuple of format ('A', A_dtype, 'B', B_dtype, ...)
260-
# where column names should be constants and is important only for inference from params
258+
# dtype should be constants and is important only for inference from params
261259
if isinstance(dtype, types.Tuple):
262-
assert all(isinstance(key, types.StringLiteral) for key in dtype[::2])
260+
assert all(isinstance(key, types.Literal) for key in dtype[::2])
263261
keys = (k.literal_value for k in dtype[::2])
264-
values = dtype[1::2]
265-
266-
def _get_df_col_type(dtype):
267-
if isinstance(dtype, types.Function):
268-
if dtype.typing_key == int:
269-
return types.Array(types.int_, 1, 'C')
270-
elif dtype.typing_key == float:
271-
return types.Array(types.float64, 1, 'C')
272-
elif dtype.typing_key == str:
273-
return string_array_type
274-
else:
275-
assert False, f"map_dtype_to_col_type: failing to infer column type for dtype={dtype}"
276-
277-
if isinstance(dtype, types.StringLiteral):
278-
if dtype.literal_value == 'str':
279-
return string_array_type
280-
else:
281-
return types.Array(numba.from_dtype(np.dtype(dtype.literal_value)), 1, 'C')
282262

283-
if isinstance(dtype, types.NumberClass):
284-
return types.Array(dtype.dtype, 1, 'C')
285-
286-
if isinstance(dtype, CategoricalDtypeType):
287-
return Categorical(dtype)
263+
values = dtype[1::2]
264+
values = [v.typing_key if isinstance(v, types.Function) else v for v in values]
265+
values = [types.Array(numba.from_dtype(np.dtype(v.literal_value)), 1, 'C')
266+
if isinstance(v, types.Literal) else v for v in values]
267+
values = [types.Array(types.int_, 1, 'C') if v == int else v for v in values]
268+
values = [types.Array(types.float64, 1, 'C') if v == float else v for v in values]
269+
values = [string_array_type if v == str else v for v in values]
270+
values = [Categorical(v) if isinstance(v, CategoricalDtypeType) else v for v in values]
288271

289-
col_types_map = dict(zip(keys, map(_get_df_col_type, values)))
272+
dtype = dict(zip(keys, values))
290273

291274
# in case of both are available
292275
# inferencing from params has priority over inferencing from file
293276
if infer_from_params:
277+
col_names = names
294278
# all names should be in dtype
295-
col_names = usecols if usecols else names
296-
col_types = [col_types_map[n] for n in col_names]
279+
return_columns = usecols if usecols else names
280+
col_typs = [dtype[n] for n in return_columns]
297281

298282
elif infer_from_file:
299-
col_names, col_types = infer_column_names_and_types_from_constant_filename(
283+
col_names, col_typs = infer_column_names_and_types_from_constant_filename(
300284
filepath_or_buffer, delimiter, names, usecols, skiprows)
301285

302286
else:
303287
return None
304288

305-
def _get_py_col_dtype(ctype):
306-
""" Re-creates column dtype as python type to be used in read_csv call """
307-
dtype = ctype.dtype
308-
if ctype == string_array_type:
309-
return str
310-
if isinstance(ctype, Categorical):
311-
return _reconstruct_CategoricalDtype(ctype.pd_dtype)
312-
return numpy_support.as_dtype(dtype)
313-
314-
py_col_dtypes = {cname: _get_py_col_dtype(ctype) for cname, ctype in zip(col_names, col_types)}
289+
dtype_present = not isinstance(dtype, (types.Omitted, type(None)))
315290

316291
# generate function text with signature and returning DataFrame
317-
func_text, func_name, global_vars = _gen_pandas_read_csv_func_text(
318-
col_names, col_types, py_col_dtypes, usecols, signature)
292+
func_text, func_name = _gen_csv_reader_py_pyarrow_func_text_dataframe(
293+
col_names, col_typs, dtype_present, usecols, signature)
319294

320295
# compile with Python
321-
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name, global_vars)
296+
csv_reader_py = _gen_csv_reader_py_pyarrow_py_func(func_text, func_name)
322297

323298
return csv_reader_py
324299

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,6 +26,7 @@
2626

2727

2828
import operator
29+
from typing import NamedTuple
2930

3031
import numba
3132
from numba import types
@@ -38,7 +39,7 @@
3839
from numba.core.imputils import impl_ret_new_ref, impl_ret_borrowed
3940

4041
from sdc.hiframes.pd_series_ext import SeriesType
41-
from sdc.hiframes.pd_dataframe_type import DataFrameType, ColumnLoc
42+
from sdc.hiframes.pd_dataframe_type import DataFrameType
4243
from sdc.str_ext import string_type
4344

4445

@@ -53,6 +54,10 @@ def generic_resolve(self, df, attr):
5354
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
5455

5556

57+
class ColumnLoc(NamedTuple):
58+
type_id: int
59+
col_id: int
60+
5661

5762
def get_structure_maps(col_types, col_names):
5863
# Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}

sdc/hiframes/pd_dataframe_type.py

Lines changed: 1 addition & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -24,8 +24,6 @@
2424
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
2525
# *****************************************************************************
2626

27-
import re
28-
from typing import NamedTuple
2927

3028
import numba
3129
from numba import types
@@ -50,7 +48,7 @@ def __init__(self, data=None, index=None, columns=None, has_parent=False, column
5048
self.has_parent = has_parent
5149
self.column_loc = column_loc
5250
super(DataFrameType, self).__init__(
53-
name="DataFrameType({}, {}, {}, {})".format(data, index, columns, has_parent))
51+
name="dataframe({}, {}, {}, {})".format(data, index, columns, has_parent))
5452

5553
def copy(self, index=None, has_parent=None):
5654
# XXX is copy necessary?
@@ -85,16 +83,6 @@ def unify(self, typingctx, other):
8583
def is_precise(self):
8684
return all(a.is_precise() for a in self.data) and self.index.is_precise()
8785

88-
def __repr__(self):
89-
90-
# To have correct repr of DataFrame we need some changes to what types.Type gives:
91-
# (1) e.g. array(int64, 1d, C) should be Array(int64, 1, 'C')
92-
# (2) ColumnLoc is not part of DataFrame name, so we need to add it
93-
default_repr = super(DataFrameType, self).__repr__()
94-
res = re.sub(r'array\((\w+), 1d, C\)', r'Array(\1, 1, \'C\')', default_repr)
95-
res = re.sub(r'\)$', f', column_loc={self.column_loc})', res)
96-
return res
97-
9886

9987
@register_model(DataFrameType)
10088
class DataFrameModel(models.StructModel):
@@ -116,15 +104,6 @@ def __init__(self, dmm, fe_type):
116104
super(DataFrameModel, self).__init__(dmm, fe_type, members)
117105

118106

119-
class ColumnLoc(NamedTuple):
120-
type_id: int
121-
col_id: int
122-
123-
124-
# FIXME_Numba#3372: add into numba.types to allow returning from objmode
125-
types.DataFrameType = DataFrameType
126-
types.ColumnLoc = ColumnLoc
127-
128107
make_attribute_wrapper(DataFrameType, 'data', '_data')
129108
make_attribute_wrapper(DataFrameType, 'index', '_index')
130109
make_attribute_wrapper(DataFrameType, 'columns', '_columns')

0 commit comments

Comments
 (0)