Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit ce4142a

Browse files
Remove column names from DataFrame model (#944)
Motivation: before this change, column names were passed to the DF ctor as arguments of LiteralString types (each name having its own type), which seems to add to the linear growth of the LLVM IR size and hence impacts DF ctor compile time. Since this information is already saved in the DF type itself and can be captured at typing time in any of the DF methods, it is proposed to remove the columns from the DF model struct as redundant.
1 parent 9deb029 commit ce4142a

7 files changed

Lines changed: 81 additions & 39 deletions

File tree

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
import pandas as pd
28+
from numba import njit
29+
30+
31+
@njit
32+
def dataframe_columns():
33+
df = pd.DataFrame({'A': [1, 2], 'AA': [3, 4], 'B': [5, 6]}, index=['a', 'b'])
34+
result = df.columns
35+
36+
return result # A tuple of column names ('A', 'AA', 'B')
37+
38+
39+
print(dataframe_columns())

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,44 @@ def hpat_pandas_df_index_impl(df):
116116
return hpat_pandas_df_index_impl
117117

118118

119+
@sdc_overload_attribute(DataFrameType, 'columns')
120+
def hpat_pandas_dataframe_columns(df):
121+
"""
122+
Intel Scalable Dataframe Compiler User Guide
123+
********************************************
124+
Pandas API: pandas.DataFrame.columns
125+
126+
Examples
127+
--------
128+
.. literalinclude:: ../../../examples/dataframe/dataframe_columns.py
129+
:language: python
130+
:lines: 27-
131+
:caption: The column names of the DataFrame.
132+
:name: ex_dataframe_columns
133+
134+
.. command-output:: python ./dataframe/dataframe_columns.py
135+
:cwd: ../../../examples
136+
137+
Intel Scalable Dataframe Compiler Developer Guide
138+
*************************************************
139+
Pandas DataFrame attribute :attr:`pandas.DataFrame.columns` implementation.
140+
141+
.. only:: developer
142+
Test: python -m sdc.runtests -k sdc.tests.test_dataframe.TestDataFrame.test_dataframe_columns*
143+
"""
144+
145+
ty_checker = TypeChecker('Attribute columns.')
146+
ty_checker.check(df, DataFrameType)
147+
148+
# no columns in DF model to avoid impact on DF ctor IR size (captured when needed only)
149+
df_columns = df.columns
150+
151+
def hpat_pandas_df_columns_impl(df):
152+
return df_columns
153+
154+
return hpat_pandas_df_columns_impl
155+
156+
119157
def sdc_pandas_dataframe_values_codegen(self, numba_common_dtype):
120158
"""
121159
Example of generated implementation:

sdc/datatypes/hpat_pandas_groupby_functions.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,8 @@ def sdc_pandas_dataframe_getitem(self, idx):
160160
target_col_loc = self.parent.column_loc[self.parent.columns[target_col_id_literal]]
161161
target_type_id, target_col_id = target_col_loc.type_id, target_col_loc.col_id
162162

163+
parent_df_col_names = self.parent.columns
164+
163165
def sdc_pandas_dataframe_getitem_common_impl(self, idx):
164166

165167
# calling getitem twice raises IndexError, just as in pandas
@@ -170,7 +172,7 @@ def sdc_pandas_dataframe_getitem_common_impl(self, idx):
170172
# no need to pass index into this series, as we group by array
171173
target_series = pandas.Series(
172174
data=self._parent._data[target_type_id][target_col_id],
173-
name=self._parent._columns[target_col_id_literal]
175+
name=parent_df_col_names[target_col_id_literal]
174176
)
175177
by_arr_data = self._parent._data[by_type_id][by_col_id]
176178
return init_series_groupby(target_series, by_arr_data, self._data, self._sort)

sdc/hiframes/boxing.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -88,30 +88,11 @@ def unbox_dataframe(typ, val, c):
8888
columns will be extracted later if necessary.
8989
"""
9090
n_cols = len(typ.columns)
91-
column_strs = [numba.cpython.unicode.make_string_from_constant(
92-
c.context, c.builder, string_type, a) for a in typ.columns]
9391
# create dataframe struct and store values
9492
dataframe = cgutils.create_struct_proxy(typ)(c.context, c.builder)
9593

9694
errorptr = cgutils.alloca_once_value(c.builder, cgutils.false_bit)
9795

98-
col_list_type = types.List(string_type)
99-
ok, inst = listobj.ListInstance.allocate_ex(c.context, c.builder, col_list_type, n_cols)
100-
101-
with c.builder.if_else(ok, likely=True) as (if_ok, if_not_ok):
102-
with if_ok:
103-
inst.size = c.context.get_constant(types.intp, n_cols)
104-
for i, column_str in enumerate(column_strs):
105-
inst.setitem(c.context.get_constant(types.intp, i), column_str, incref=False)
106-
dataframe.columns = inst.value
107-
108-
with if_not_ok:
109-
c.builder.store(cgutils.true_bit, errorptr)
110-
111-
# If an error occurred, drop the whole native list
112-
with c.builder.if_then(c.builder.load(errorptr)):
113-
c.context.nrt.decref(c.builder, col_list_type, inst.value)
114-
11596
_, data_typs_map, types_order = get_structure_maps(typ.data, typ.columns)
11697

11798
for col_typ in types_order:
@@ -150,12 +131,6 @@ def unbox_dataframe(typ, val, c):
150131

151132
dataframe.parent = val
152133

153-
# increase refcount of stored values
154-
if c.context.enable_nrt:
155-
# TODO: other objects?
156-
for var in column_strs:
157-
c.context.nrt.incref(c.builder, string_type, var)
158-
159134
return NativeValue(dataframe._getvalue(), is_error=c.builder.load(errorptr))
160135

161136

sdc/hiframes/pd_dataframe_ext.py

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,7 @@ def codegen(context, builder, signature, args):
100100
in_tup = args[0]
101101
data_arrs = [builder.extract_value(in_tup, i) for i in range(n_cols)]
102102
index = builder.extract_value(in_tup, n_cols)
103-
column_strs = [numba.cpython.unicode.make_string_from_constant(
104-
context, builder, string_type, c) for c in column_names]
103+
105104
# create dataframe struct and store values
106105
dataframe = cgutils.create_struct_proxy(
107106
signature.return_type)(context, builder)
@@ -117,21 +116,15 @@ def codegen(context, builder, signature, args):
117116
data_tup = context.make_tuple(
118117
builder, types.Tuple(data_list_type), data_lists)
119118

120-
col_list_type = types.List(string_type)
121-
column_list = context.build_list(builder, col_list_type, column_strs)
122-
123119
dataframe.data = data_tup
124120
dataframe.index = index
125-
dataframe.columns = column_list
126121
dataframe.parent = context.get_constant_null(types.pyobject)
127122

128123
# increase refcount of stored values
129124
if context.enable_nrt:
130125
context.nrt.incref(builder, index_typ, index)
131126
for var, typ in zip(data_arrs, data_typs):
132127
context.nrt.incref(builder, typ, var)
133-
for var in column_strs:
134-
context.nrt.incref(builder, string_type, var)
135128

136129
return dataframe._getvalue()
137130

sdc/hiframes/pd_dataframe_type.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ def __init__(self, dmm, fe_type):
110110
members = [
111111
('data', types.Tuple([types.List(typ) for typ in df_types])),
112112
('index', fe_type.index),
113-
('columns', types.List(string_type)),
114113
('parent', types.pyobject),
115114
]
116115
super(DataFrameModel, self).__init__(dmm, fe_type, members)
@@ -127,6 +126,5 @@ class ColumnLoc(NamedTuple):
127126

128127
make_attribute_wrapper(DataFrameType, 'data', '_data')
129128
make_attribute_wrapper(DataFrameType, 'index', '_index')
130-
make_attribute_wrapper(DataFrameType, 'columns', '_columns')
131129
make_attribute_wrapper(DataFrameType, 'unboxed', '_unboxed')
132130
make_attribute_wrapper(DataFrameType, 'parent', '_parent')

sdc/tests/test_dataframe.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,6 @@ def test_impl(df):
242242
hpat_func = self.jit(test_impl)
243243
pd.testing.assert_frame_equal(hpat_func(df), test_impl(df))
244244

245-
@skip_numba_jit
246245
def test_box1(self):
247246
def test_impl(n):
248247
df = pd.DataFrame({'A': np.ones(n), 'B': np.arange(n)})
@@ -2557,7 +2556,6 @@ def test_impl():
25572556
hpat_func = self.jit(test_impl)
25582557
pd.testing.assert_series_equal(hpat_func(), test_impl(), check_names=False)
25592558

2560-
@unittest.skip("Implement getting columns attribute")
25612559
def test_dataframe_columns_attribute(self):
25622560
def test_impl():
25632561
df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})
@@ -2566,7 +2564,6 @@ def test_impl():
25662564
hpat_func = self.jit(test_impl)
25672565
np.testing.assert_array_equal(hpat_func(), test_impl())
25682566

2569-
@unittest.skip("Implement getting columns attribute")
25702567
def test_dataframe_columns_iterator(self):
25712568
def test_impl():
25722569
df = pd.DataFrame({'A': [1, 2, 3], 'B': [2, 3, 4]})

0 commit comments

Comments
 (0)