Remove column names from DataFrame model (#944)

kozlov-alexey · web-flow · commit ce4142a11bb2 · 2020-11-12T17:23:02.000+03:00
Motivation: before this change, column names were passed to the DF ctor as
arguments of LiteralString types (each name having its own type), which
seems to add to the linear dependency of LLVM IR size and hence to impact
DF ctor compile time. Since this information is saved in the DF type
itself and can be captured in any DF method at typing time, it is proposed
to remove columns from the DF model struct as redundant.
diff --git a/examples/dataframe/dataframe_columns.py b/examples/dataframe/dataframe_columns.py
@@ -0,0 +1,39 @@
+# *****************************************************************************
+# Copyright (c) 2020, Intel Corporation All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+#     Redistributions of source code must retain the above copyright notice,
+#     this list of conditions and the following disclaimer.
+#
+#     Redistributions in binary form must reproduce the above copyright notice,
+#     this list of conditions and the following disclaimer in the documentation
+#     and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# *****************************************************************************
+
+import pandas as pd
+from numba import njit
+
+
+@njit
+def dataframe_columns():
+    df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]}, index=['a', 'b'])
+    result = df.columns
+
+    return result  # A tuple of column names ('A', 'AA', 'B'), rather than a pandas Index
+
+
+print(dataframe_columns())
diff --git a/sdc/datatypes/hpat_pandas_dataframe_functions.py b/sdc/datatypes/hpat_pandas_dataframe_functions.py
@@ -116,6 +116,44 @@ def hpat_pandas_df_index_impl(df):
         return hpat_pandas_df_index_impl
 
 
+@sdc_overload_attribute(DataFrameType, 'columns')
+def hpat_pandas_dataframe_columns(df):
+    """
+    Intel Scalable Dataframe Compiler User Guide
+    ********************************************
+    Pandas API: pandas.DataFrame.columns
+
+    Examples
+    --------
+    .. literalinclude:: ../../../examples/dataframe/dataframe_columns.py
+        :language: python
+        :lines: 27-
+        :caption: The column names of the DataFrame.
+        :name: ex_dataframe_columns
+
+    .. command-output:: python ./dataframe/dataframe_columns.py
+        :cwd: ../../../examples
+
+    Intel Scalable Dataframe Compiler Developer Guide
+    *************************************************
+    Pandas DataFrame attribute :attr:`pandas.DataFrame.columns` implementation.
+
+    .. only:: developer
+        Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_dataframe_columns*
+    """
+
+    ty_checker = TypeChecker('Attribute columns.')
+    ty_checker.check(df, DataFrameType)
+
+    # DF model stores no column names (keeps DF ctor IR small): read them from the DF type at typing time
+    df_columns = df.columns
+
+    def hpat_pandas_df_columns_impl(df):
+        return df_columns
+
+    return hpat_pandas_df_columns_impl
+
+
 def sdc_pandas_dataframe_values_codegen(self, numba_common_dtype):
     """
     Example of generated implementation:
diff --git a/sdc/datatypes/hpat_pandas_groupby_functions.py b/sdc/datatypes/hpat_pandas_groupby_functions.py
@@ -160,6 +160,8 @@ def sdc_pandas_dataframe_getitem(self, idx):
             target_col_loc = self.parent.column_loc[self.parent.columns[target_col_id_literal]]
             target_type_id, target_col_id = target_col_loc.type_id, target_col_loc.col_id
 
+        parent_df_col_names = self.parent.columns
+
         def sdc_pandas_dataframe_getitem_common_impl(self, idx):
 
             # calling getitem twice raises IndexError, just as in pandas
@@ -170,7 +172,7 @@ def sdc_pandas_dataframe_getitem_common_impl(self, idx):
                 # no need to pass index into this series, as we group by array
                 target_series = pandas.Series(
                     data=self._parent._data[target_type_id][target_col_id],
-                    name=self._parent._columns[target_col_id_literal]
+                    name=parent_df_col_names[target_col_id_literal]
                 )
                 by_arr_data = self._parent._data[by_type_id][by_col_id]
                 return init_series_groupby(target_series, by_arr_data, self._data, self._sort)
diff --git a/sdc/hiframes/boxing.py b/sdc/hiframes/boxing.py
@@ -88,30 +88,11 @@ def unbox_dataframe(typ, val, c):
     columns will be extracted later if necessary.
     """
     n_cols = len(typ.columns)
-    column_strs = [numba.cpython.unicode.make_string_from_constant(
-        c.context, c.builder, string_type, a) for a in typ.columns]
     # create dataframe struct and store values
     dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)
 
     errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)
 
-    col_list_type = types.List(string_type)
-    ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols)
-
-    with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
-        with if_ok:
-            inst.size = c.context.get_constant(types.intp, n_cols)
-            for i, column_str in enumerate(column_strs):
-                inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False)
-            dataframe.columns = inst.value
-
-        with if_not_ok:
-            c.builder.store(cgutils.true_bit, errorptr)
-
-    # If an error occurred, drop the whole native list
-    with c.builder.if_then(c.builder.load(errorptr)):
-        c.context.nrt.decref(c.builder, col_list_type, inst.value)
-
     _, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)
 
     for col_typ in types_order:
@@ -150,12 +131,6 @@ def unbox_dataframe(typ, val, c):
 
     dataframe.parent = val
 
-    # increase refcount of stored values
-    if c.context.enable_nrt:
-        # TODO: other objects?
-        for var in column_strs:
-            c.context.nrt.incref(c.builder, string_type, var)
-
     return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
 
 
diff --git a/sdc/hiframes/pd_dataframe_ext.py b/sdc/hiframes/pd_dataframe_ext.py
@@ -100,8 +100,7 @@ def codegen(context, builder, signature, args):
         in_tup = args[0]
         data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
         index = builder.extract_value(in_tup, n_cols)
-        column_strs = [numba.cpython.unicode.make_string_from_constant(
-            context, builder, string_type, c) for c in column_names]
+
         # create dataframe struct and store values
         dataframe = cgutils.create_struct_proxy(
             signature.return_type)(context, builder)
@@ -117,21 +116,15 @@ def codegen(context, builder, signature, args):
         data_tup = context.make_tuple(
             builder, types.Tuple(data_list_type), data_lists)
 
-        col_list_type = types.List(string_type)
-        column_list = context.build_list(builder, col_list_type, column_strs)
-
         dataframe.data = data_tup
         dataframe.index = index
-        dataframe.columns = column_list
         dataframe.parent = context.get_constant_null(types.pyobject)
 
         # increase refcount of stored values
         if context.enable_nrt:
             context.nrt.incref(builder, index_typ, index)
             for var, typ in zip(data_arrs, data_typs):
                 context.nrt.incref(builder, typ, var)
-            for var in column_strs:
-                context.nrt.incref(builder, string_type, var)
 
         return dataframe._getvalue()
 
diff --git a/sdc/hiframes/pd_dataframe_type.py b/sdc/hiframes/pd_dataframe_type.py
@@ -110,7 +110,6 @@ def __init__(self, dmm, fe_type):
         members = [
             ('data', types.Tuple([types.List(typ) for typ in df_types])),
             ('index', fe_type.index),
-            ('columns', types.List(string_type)),
             ('parent', types.pyobject),
         ]
         super(DataFrameModel, self).__init__(dmm, fe_type, members)
@@ -127,6 +126,5 @@ class ColumnLoc(NamedTuple):
 
 make_attribute_wrapper(DataFrameType, 'data', '_data')
 make_attribute_wrapper(DataFrameType, 'index', '_index')
-make_attribute_wrapper(DataFrameType, 'columns', '_columns')
 make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed')
 make_attribute_wrapper(DataFrameType, 'parent', '_parent')
diff --git a/sdc/tests/test_dataframe.py b/sdc/tests/test_dataframe.py
@@ -242,7 +242,6 @@ def test_impl(df):
         hpat_func = self.jit(test_impl)
         pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
 
-    @skip_numba_jit
     def test_box1(self):
         def test_impl(n):
             df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
@@ -2557,7 +2556,6 @@ def test_impl():
         hpat_func = self.jit(test_impl)
         pd.testing.assert_series_equal(hpat_func(), test_impl(), check_names=False)
 
-    @unittest.skip("Implement getting columns attribute")
     def test_dataframe_columns_attribute(self):
         def test_impl():
             df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
@@ -2566,7 +2564,6 @@ def test_impl():
         hpat_func = self.jit(test_impl)
         np.testing.assert_array_equal(hpat_func(), test_impl())
 
-    @unittest.skip("Implement getting columns attribute")
     def test_dataframe_columns_iterator(self):
         def test_impl():
             df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})