Refactor df.drop to improve compile times (#945)

kozlov-alexey · web-flow · commit d5f706fe42a4 · 2020-12-07T17:05:29.000+03:00
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -52,6 +52,7 @@
 from sdc.datatypes.range_index_type import RangeIndexType
 
 from sdc.hiframes.pd_dataframe_type import DataFrameType
+from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
 from sdc.hiframes.pd_series_type import SeriesType
 
 from sdc.datatypes.hpat_pandas_dataframe_getitem_types import (DataFrameGetitemAccessorType,
@@ -1337,40 +1338,69 @@ def isna_overload(df):
     return sdc_pandas_dataframe_isna_codegen(df, 'isna')
 
 
-def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols):
+def sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_col_names):
     """
     Example of generated implementation:
         def sdc_pandas_dataframe_drop_impl(df, labels=None, axis=0, index=None, columns=None,
-                                           level=None, inplace=False, errors="raise"):
-            new_col_0_data_df = df._data[1][0]
-            new_col_1_data_df = df._data[0][1]
-            return pandas.DataFrame({"B": new_col_0_data_df, "C": new_col_1_data_df}, index=df.index)
+                                            level=None, inplace=False, errors="raise"):
+            list_0 = df._data[0].copy()
+            for col_id in old_scheme_drop_idxs_0[::-1]:
+                list_0.pop(col_id)
+            list_1 = df._data[1].copy()
+            new_data = (list_1, list_0, )
+            return init_dataframe_internal(new_data, df._index, df_type)
     """
     indent = 4 * ' '
-    saved_df_columns = [column for column in df.columns if column not in drop_cols]
-    func_definition = [f'def sdc_pandas_dataframe_{func_name}_impl({", ".join(func_args)}):']
+    func_definition = [f'def {func_name}({", ".join(func_args)}):']
     func_text = []
-    column_list = []
 
-    for label in drop_cols:
+    old_column_loc, old_data_typs_map, old_types_order = get_structure_maps(df.data, df.columns)
+
+    new_data_typs = tuple(t for i, t in enumerate(df.data) if df.columns[i] not in drop_col_names)
+    new_column_names = tuple(c for c in df.columns if c not in drop_col_names)
+    new_column_loc, new_data_typs_map, new_types_order = get_structure_maps(new_data_typs, new_column_names)
+
+    old_types_idxs_map = dict(zip(old_types_order, range(len(old_types_order))))
+    reorder_scheme = tuple(old_types_idxs_map[t] for t in new_types_order)
+    df_type = DataFrameType(new_data_typs, df.index, new_column_names, column_loc=new_column_loc)
+
+    old_scheme_drop_idxs = []
+    for i, k in enumerate(old_types_order):
+        a = [j for j, x in enumerate(old_data_typs_map[k][1]) if df.columns[x] in drop_col_names]
+        old_scheme_drop_idxs.append(tuple(a) or None)
+
+    for label in drop_col_names:
         if label not in df.columns:
             func_text.append(f'if errors == "raise":')
             func_text.append(indent + f'raise ValueError("The label {label} is not found in the selected axis")')
             break
 
-    for column_id, column_name in enumerate(saved_df_columns):
-        col_loc = df.column_loc[column_name]
-        type_id, col_id = col_loc.type_id, col_loc.col_id
-        func_text.append(f'new_col_{column_id}_data_df = df._data[{type_id}][{col_id}]')
-        column_list.append((f'new_col_{column_id}_data_df', column_name))
-
-    data = ', '.join(f'"{column_name}": {column}' for column, column_name in column_list)
-    index = 'df.index'
-    func_text.append(f"return pandas.DataFrame({{{data}}}, index={index})\n")
+    old_ntypes = len(old_types_order)
+    for type_id in range(old_ntypes):
+        func_text.append(f'list_{type_id} = df._data[{type_id}].copy()')
+        if old_scheme_drop_idxs[type_id]:
+            func_text.append(f'for col_id in old_scheme_drop_idxs_{type_id}[::-1]:')
+            func_text.append(indent + f'list_{type_id}.pop(col_id)')
+
+    # in the new df the order of the array lists (i.e. types_order) can be different, so
+    # build a new tuple of lists, reordered as needed
+    new_ntypes = len(new_types_order)
+    data_lists_reordered = ', '.join(['list_' + str(reorder_scheme[i]) for i in range(new_ntypes)])
+    data_val = '(' + data_lists_reordered + ', )' if new_ntypes > 0 else '()'
+
+    data, index = 'new_data', 'df._index'
+    func_text.append(f'{data} = {data_val}')
+    func_text.append(f"return init_dataframe_internal({data}, {index}, df_type)\n")
     func_definition.extend([indent + func_line for func_line in func_text])
     func_def = '\n'.join(func_definition)
 
-    global_vars = {'pandas': pandas}
+    global_vars = {
+        'pandas': pandas,
+        'init_dataframe_internal': init_dataframe_internal,
+        'df_type': df_type
+    }
+
+    global_vars.update({f'old_scheme_drop_idxs_{i}': old_scheme_drop_idxs[i] for i in range(old_ntypes)})
 
     return func_def, global_vars
 
@@ -1387,7 +1417,8 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
     -----------
     - Parameters ``labels``, ``axis``, ``index``, ``level`` and ``inplace`` are currently unsupported.
     - Parameter ``columns`` is required and is expected to be a Literal value with one column name
-    or Tuple with columns names.
+    or a List of column names. Mutating a list of column names after it was defined and then using it as the
+    ``columns`` argument results in an SDCLimitation exception at runtime.
     - Supported ``errors`` can be {``raise``, ``ignore``}, default ``raise``. If ``ignore``, suppress error and only
     existing labels are dropped.
 
@@ -1420,36 +1451,66 @@ def sdc_pandas_dataframe_drop(df, labels=None, axis=0, index=None, columns=None,
 
     """
 
-    _func_name = 'drop'
+    method_name = f'Method drop().'
 
-    ty_checker = TypeChecker(f'Method {_func_name}().')
+    ty_checker = TypeChecker(method_name)
     ty_checker.check(df, DataFrameType)
 
-    if not isinstance(labels, types.Omitted) and labels is not None:
+    if not isinstance(labels, (types.Omitted, types.NoneType)) and labels is not None:
         ty_checker.raise_exc(labels, 'None', 'labels')
 
-    if not isinstance(axis, (int, types.Omitted)):
+    if not isinstance(axis, (types.Omitted, types.Integer)) and axis != 0:
         ty_checker.raise_exc(axis, 'int', 'axis')
 
-    if not isinstance(index, types.Omitted) and index is not None:
+    if not isinstance(index, (types.Omitted, types.NoneType)) and index is not None:
         ty_checker.raise_exc(index, 'None', 'index')
 
-    if not isinstance(columns, (types.Omitted, types.Tuple, types.Literal)):
-        ty_checker.raise_exc(columns, 'str, tuple of str', 'columns')
+    if not (isinstance(columns, (types.Omitted, types.StringLiteral))
+            or (isinstance(columns, types.Tuple)
+                and all(isinstance(c, types.StringLiteral) for c in columns))
+            or (isinstance(columns, types.UniTuple) and isinstance(columns.dtype, types.StringLiteral))
+            or isinstance(columns, types.List) and isinstance(columns.dtype, types.UnicodeType)
+            ):
+        ty_checker.raise_exc(columns, 'str, list of const str', 'columns')
 
-    if not isinstance(level, (types.Omitted, types.Literal)) and level is not None:
+    if not isinstance(level, (types.Omitted, types.NoneType, types.Literal)) and level is not None:
         ty_checker.raise_exc(level, 'None', 'level')
 
-    if not isinstance(inplace, (bool, types.Omitted)) and inplace:
+    if not isinstance(inplace, (types.Omitted, types.NoneType, types.Boolean)) and inplace:
         ty_checker.raise_exc(inplace, 'bool', 'inplace')
 
-    if not isinstance(errors, (str, types.Omitted, types.Literal)):
+    if not isinstance(errors, (types.Omitted, types.UnicodeType, types.StringLiteral)) and errors != "raise":
         ty_checker.raise_exc(errors, 'str', 'errors')
 
+    if isinstance(columns, types.List):
+        if columns.initial_value is None:
+            raise TypingError('{} Unsupported use of parameter columns:'
+                              ' expected list of constant strings. Given: {}'.format(method_name, columns))
+        else:
+            # this works because a global tuple of strings is captured as a Tuple of StringLiterals
+            columns_as_tuple = tuple(columns.initial_value)
+            def _sdc_pandas_dataframe_drop_wrapper_impl(df, labels=None, axis=0, index=None,
+                                                        columns=None, level=None, inplace=False, errors="raise"):
+
+                # if at runtime the columns list differs from its initial value (known at compile time)
+                # we cannot tell which columns to drop or what the resulting DataFrameType is, so raise an exception
+                if list(columns_as_tuple) != columns:
+                    raise SDCLimitation("Unsupported use of parameter columns: non-const list was used.")
+
+                return df.drop(labels=labels,
+                               axis=axis,
+                               index=index,
+                               columns=columns_as_tuple,
+                               level=level,
+                               inplace=inplace,
+                               errors=errors)
+
+            return _sdc_pandas_dataframe_drop_wrapper_impl
+
     args = {'labels': None, 'axis': 0, 'index': None, 'columns': None, 'level': None, 'inplace': False,
             'errors': f'"raise"'}
 
-    def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
+    def sdc_pandas_dataframe_drop_impl(df, args, columns):
         func_args = ['df']
         for key, value in args.items():
             if key not in func_args:
@@ -1459,18 +1520,19 @@ def sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns):
 
         if isinstance(columns, types.StringLiteral):
             drop_cols = (columns.literal_value,)
-        elif isinstance(columns, types.Tuple):
+        elif isinstance(columns, (types.Tuple, types.UniTuple)):
             drop_cols = tuple(column.literal_value for column in columns)
         else:
             raise ValueError('Only drop by one column or tuple of columns is currently supported in df.drop()')
 
-        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(_func_name, func_args, df, drop_cols)
+        func_name = 'sdc_pandas_dataframe_drop_impl'
+        func_def, global_vars = sdc_pandas_dataframe_drop_codegen(func_name, func_args, df, drop_cols)
         loc_vars = {}
         exec(func_def, global_vars, loc_vars)
-        _drop_impl = loc_vars['sdc_pandas_dataframe_drop_impl']
+        _drop_impl = loc_vars[func_name]
         return _drop_impl
 
-    return sdc_pandas_dataframe_drop_impl(df, _func_name, args, columns)
+    return sdc_pandas_dataframe_drop_impl(df, args, columns)
 
 
 def df_length_expr(self):
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -53,7 +53,6 @@ def generic_resolve(self, df, attr):
             return SeriesType(arr_typ.dtype, arr_typ, df.index, True)
 
 
-
 def get_structure_maps(col_types, col_names):
     # Define map column name to column location ex. {'A': (0,0), 'B': (1,0), 'C': (0,1)}
     column_loc = {}
@@ -80,6 +79,31 @@ def get_structure_maps(col_types, col_names):
     return column_loc, data_typs_map, types_order
 
 
+@intrinsic
+def init_dataframe_internal(typingctx, data, index, df_type):
+
+    ret_type = df_type.instance_type
+
+    def codegen(context, builder, sig, args):
+        data_val, index_val = args[:2]
+
+        dataframe = cgutils.create_struct_proxy(
+            sig.return_type)(context, builder)
+        dataframe.data = data_val
+        dataframe.index = index_val
+        dataframe.parent = context.get_constant_null(types.pyobject)
+
+        # increase refcount of stored values
+        if context.enable_nrt:
+            context.nrt.incref(builder, sig.args[0], data_val)
+            context.nrt.incref(builder, sig.args[1], index_val)
+
+        return dataframe._getvalue()
+
+    sig = signature(ret_type, data, index, df_type)
+    return sig, codegen
+
+
 # TODO: alias analysis
 # this function should be used for getting df._data for alias analysis to work
 # no_cpython_wrapper since Array(DatetimeDate) cannot be boxed
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py