Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit d5f706f

Browse files
Refactor df.drop to improve compile times (#945)
1 parent 8520b62 commit d5f706f

3 files changed

Lines changed: 277 additions & 116 deletions

File tree

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 97 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@
5252
from sdc.datatypes.range_index_type import RangeIndexType
5353

5454
from sdc.hiframes.pd_dataframe_type import DataFrameType
55+
from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
5556
from sdc.hiframes.pd_series_type import SeriesType
5657

5758
from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType,
@@ -1337,40 +1338,69 @@ def isna_overload(df):
13371338
return sdc_pandas_dataframe_isna_codegen(df, 'isna')
13381339

13391340

1340-
def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols):
1341+
def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_col_names):
13411342
"""
13421343
Example of generated implementation:
13431344
def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns=None,
1344-
level=None, inplace=False, errors="raise"):
1345-
new_col_0_data_df = df._data[1][0]
1346-
new_col_1_data_df = df._data[0][1]
1347-
return pandas.DataFrame({"B": new_col_0_data_df, "C": new_col_1_data_df}, index=df.index)
1345+
level=None, inplace=False, errors="raise"):
1346+
list_0 = df._data[0].copy()
1347+
for col_id in old_scheme_drop_idxs_0[::-1]:
1348+
list_0.pop(col_id)
1349+
list_1 = df._data[1].copy()
1350+
new_data = (list_1, list_0, )
1351+
return init_dataframe_internal(new_data, df._index, df_type)
13481352
"""
13491353
indent = 4 * ' '
1350-
saved_df_columns = [column for column in df.columns if column not in drop_cols]
1351-
func_definition = [f'def sdc_pandas_dataframe_{func_name}_impl({", ".join(func_args)}):']
1354+
func_definition = [f'def {func_name}({", ".join(func_args)}):']
13521355
func_text = []
1353-
column_list = []
13541356

1355-
for label in drop_cols:
1357+
old_column_loc, old_data_typs_map, old_types_order = get_structure_maps(df.data, df.columns)
1358+
1359+
new_data_typs = tuple(t for i, t in enumerate(df.data) if df.columns[i] not in drop_col_names)
1360+
new_column_names = tuple(c for c in df.columns if c not in drop_col_names)
1361+
new_column_loc, new_data_typs_map, new_types_order = get_structure_maps(new_data_typs, new_column_names)
1362+
1363+
old_types_idxs_map = dict(zip(old_types_order, range(len(old_types_order))))
1364+
reorder_scheme = tuple(old_types_idxs_map[t] for t in new_types_order)
1365+
df_type = DataFrameType(new_data_typs, df.index, new_column_names, column_loc=new_column_loc)
1366+
1367+
old_scheme_drop_idxs = []
1368+
for i, k in enumerate(old_types_order):
1369+
a = [j for j, x in enumerate(old_data_typs_map[k][1]) if df.columns[x] in drop_col_names]
1370+
old_scheme_drop_idxs.append(tuple(a) or None)
1371+
1372+
for label in drop_col_names:
13561373
if label not in df.columns:
13571374
func_text.append(f'if errors == "raise":')
13581375
func_text.append(indent + f'raise ValueError("The label {label} is not found in the selected axis")')
13591376
break
13601377

1361-
for column_id, column_name in enumerate(saved_df_columns):
1362-
col_loc = df.column_loc[column_name]
1363-
type_id, col_id = col_loc.type_id, col_loc.col_id
1364-
func_text.append(f'new_col_{column_id}_data_df = df._data[{type_id}][{col_id}]')
1365-
column_list.append((f'new_col_{column_id}_data_df', column_name))
1366-
1367-
data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list)
1368-
index = 'df.index'
1369-
func_text.append(f"return pandas.DataFrame({{{data}}}, index={index})\n")
1378+
old_ntypes = len(old_types_order)
1379+
for type_id in range(old_ntypes):
1380+
func_text.append(f'list_{type_id} = df._data[{type_id}].copy()')
1381+
if old_scheme_drop_idxs[type_id]:
1382+
func_text.append(f'for col_id in old_scheme_drop_idxs_{type_id}[::-1]:')
1383+
func_text.append(indent + f'list_{type_id}.pop(col_id)')
1384+
1385+
# in new df the order of array lists (i.e. types_order) can be different, so
1386+
# we build a new tuple of lists, reordered as needed
1387+
new_ntypes = len(new_types_order)
1388+
data_lists_reordered = ', '.join(['list_' + str(reorder_scheme[i]) for i in range(new_ntypes)])
1389+
data_val = '(' + data_lists_reordered + ', )' if new_ntypes > 0 else '()'
1390+
1391+
data, index = 'new_data', 'df._index'
1392+
func_text.append(f'{data} = {data_val}')
1393+
func_text.append(f"return init_dataframe_internal({data}, {index}, df_type)\n")
13701394
func_definition.extend([indent + func_line for func_line in func_text])
13711395
func_def = '\n'.join(func_definition)
13721396

1373-
global_vars = {'pandas': pandas}
1397+
global_vars = {
1398+
'pandas': pandas,
1399+
'init_dataframe_internal': init_dataframe_internal,
1400+
'df_type': df_type
1401+
}
1402+
1403+
global_vars.update({f'old_scheme_drop_idxs_{i}': old_scheme_drop_idxs[i] for i in range(old_ntypes)})
13741404

13751405
return func_def, global_vars
13761406

@@ -1387,7 +1417,8 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
13871417
-----------
13881418
- Parameters ``labels``, ``axis``, ``index``, ``level`` and ``inplace`` are currently unsupported.
13891419
- Parameter ``columns`` is required and is expected to be a Literal value with one column name
1390-
or Tuple with columns names.
1420+
or a List of column names. Mutating a list of column names after it was defined and then using it as the
1421+
columns argument results in an SDCLimitation exception at runtime.
13911422
- Supported ``errors`` can be {``raise``, ``ignore``}, default ``raise``. If ``ignore``, suppress error and only
13921423
existing labels are dropped.
13931424
@@ -1420,36 +1451,66 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
14201451
14211452
"""
14221453

1423-
_func_name = 'drop'
1454+
method_name = f'Method drop().'
14241455

1425-
ty_checker = TypeChecker(f'Method {_func_name}().')
1456+
ty_checker = TypeChecker(method_name)
14261457
ty_checker.check(df, DataFrameType)
14271458

1428-
if not isinstance(labels, types.Omitted) and labels is not None:
1459+
if not isinstance(labels, (types.Omitted, types.NoneType)) and labels is not None:
14291460
ty_checker.raise_exc(labels, 'None', 'labels')
14301461

1431-
if not isinstance(axis, (int, types.Omitted)):
1462+
if not isinstance(axis, (types.Omitted, types.Integer)) and axis != 0:
14321463
ty_checker.raise_exc(axis, 'int', 'axis')
14331464

1434-
if not isinstance(index, types.Omitted) and index is not None:
1465+
if not isinstance(index, (types.Omitted, types.NoneType)) and index is not None:
14351466
ty_checker.raise_exc(index, 'None', 'index')
14361467

1437-
if not isinstance(columns, (types.Omitted, types.Tuple, types.Literal)):
1438-
ty_checker.raise_exc(columns, 'str, tuple of str', 'columns')
1468+
if not (isinstance(columns, (types.Omitted, types.StringLiteral))
1469+
or (isinstance(columns, types.Tuple)
1470+
and all(isinstance(c, types.StringLiteral) for c in columns))
1471+
or (isinstance(columns, types.UniTuple) and isinstance(columns.dtype, types.StringLiteral))
1472+
or isinstance(columns, types.List) and isinstance(columns.dtype, types.UnicodeType)
1473+
):
1474+
ty_checker.raise_exc(columns, 'str, list of const str', 'columns')
14391475

1440-
if not isinstance(level, (types.Omitted, types.Literal)) and level is not None:
1476+
if not isinstance(level, (types.Omitted, types.NoneType, types.Literal)) and level is not None:
14411477
ty_checker.raise_exc(level, 'None', 'level')
14421478

1443-
if not isinstance(inplace, (bool, types.Omitted)) and inplace:
1479+
if not isinstance(inplace, (types.Omitted, types.NoneType, types.Boolean)) and inplace:
14441480
ty_checker.raise_exc(inplace, 'bool', 'inplace')
14451481

1446-
if not isinstance(errors, (str, types.Omitted, types.Literal)):
1482+
if not isinstance(errors, (types.Omitted, types.UnicodeType, types.StringLiteral)) and errors != "raise":
14471483
ty_checker.raise_exc(errors, 'str', 'errors')
14481484

1485+
if isinstance(columns, types.List):
1486+
if columns.initial_value is None:
1487+
raise TypingError('{} Unsupported use of parameter columns:'
1488+
' expected list of constant strings. Given: {}'.format(method_name, columns))
1489+
else:
1490+
# this works because global tuple of strings is captured as Tuple of StringLiterals
1491+
columns_as_tuple = tuple(columns.initial_value)
1492+
def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None,
1493+
columns=None, level=None, inplace=False, errors="raise"):
1494+
1495+
# if at runtime columns list differs from its initial value (known at compile time)
1496+
# we cannot tell which columns to drop or what the resulting DataFrameType is, so raise an exception
1497+
if list(columns_as_tuple) != columns:
1498+
raise SDCLimitation("Unsupported use of parameter columns: non-const list was used.")
1499+
1500+
return df.drop(labels=labels,
1501+
axis=axis,
1502+
index=index,
1503+
columns=columns_as_tuple,
1504+
level=level,
1505+
inplace=inplace,
1506+
errors=errors)
1507+
1508+
return _sdc_pandas_dataframe_drop_wrapper_impl
1509+
14491510
args = {'labels': None, 'axis': 0, 'index': None, 'columns': None, 'level': None, 'inplace': False,
14501511
'errors': f'"raise"'}
14511512

1452-
def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
1513+
def sdc_pandas_dataframe_drop_impl(df, args, columns):
14531514
func_args = ['df']
14541515
for key, value in args.items():
14551516
if key not in func_args:
@@ -1459,18 +1520,19 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
14591520

14601521
if isinstance(columns, types.StringLiteral):
14611522
drop_cols = (columns.literal_value,)
1462-
elif isinstance(columns, types.Tuple):
1523+
elif isinstance(columns, (types.Tuple, types.UniTuple)):
14631524
drop_cols = tuple(column.literal_value for column in columns)
14641525
else:
14651526
raise ValueError('Only drop by one column or tuple of columns is currently supported in df.drop()')
14661527

1467-
func_def, global_vars = sdc_pandas_dataframe_drop_codegen(_func_name, func_args, df, drop_cols)
1528+
func_name = 'sdc_pandas_dataframe_drop_impl'
1529+
func_def, global_vars = sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols)
14681530
loc_vars = {}
14691531
exec(func_def, global_vars, loc_vars)
1470-
_drop_impl = loc_vars['sdc_pandas_dataframe_drop_impl']
1532+
_drop_impl = loc_vars[func_name]
14711533
return _drop_impl
14721534

1473-
return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
1535+
return sdc_pandas_dataframe_drop_impl(df, args, columns)
14741536

14751537

14761538
def df_length_expr(self):

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):
5353
return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
5454

5555

56-
5756
def get_structure_maps(col_types, col_names):
5857
# Define a map from column name to column location, e.g. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
5958
column_loc = {}
@@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names):
8079
return column_loc, data_typs_map, types_order
8180

8281

82+
@intrinsic
83+
def init_dataframe_internal(typingctx, data, index, df_type):
84+
85+
ret_type = df_type.instance_type
86+
87+
def codegen(context, builder, sig, args):
88+
data_val, index_val = args[:2]
89+
90+
dataframe = cgutils.create_struct_proxy(
91+
sig.return_type)(context, builder)
92+
dataframe.data = data_val
93+
dataframe.index = index_val
94+
dataframe.parent = context.get_constant_null(types.pyobject)
95+
96+
# increase refcount of stored values
97+
if context.enable_nrt:
98+
context.nrt.incref(builder, sig.args[0], data_val)
99+
context.nrt.incref(builder, sig.args[1], index_val)
100+
101+
return dataframe._getvalue()
102+
103+
sig = signature(ret_type, data, index, df_type)
104+
return sig, codegen
105+
106+
83107
# TODO: alias analysis
84108
# this function should be used for getting df._data for alias analysis to work
85109
# no_cpython_wrapper since Array(DatetimeDate) cannot be boxed

0 commit comments

Comments
 (0)