Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit a91d01c

Browse files
Refactor df.getitem by slice idx to improve compile time (#949)
Motivation: when idx is a slice, df[idx] produces a DataFrame with the same internal structure as the original one (only the index type may change). Hence the original's internal data lists can be copied and filled with column[idx] slices, which improves compilation time.
1 parent d5f706f commit a91d01c

1 file changed

Lines changed: 42 additions & 19 deletions

File tree

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1554,24 +1554,33 @@ def df_index_expr(self, length_expr=None):
15541554

15551555
def df_getitem_slice_idx_main_codelines(self, idx):
15561556
"""Generate main code lines for df.getitem with idx of slice"""
1557+
1558+
types_order = get_structure_maps(self.data, self.columns)[2]
1559+
n_lists = len(types_order)
1560+
15571561
results = []
1558-
func_lines = [f' res_index = self.index[idx]']
1559-
for i, col in enumerate(self.columns):
1560-
col_loc = self.column_loc[col]
1561-
type_id, col_id = col_loc.type_id, col_loc.col_id
1562-
res_data = f'res_data_{i}'
1562+
func_lines = []
1563+
for i in range(n_lists):
15631564
func_lines += [
1564-
f' data_{i} = self._data[{type_id}][{col_id}][idx]',
1565-
f' {res_data} = data_{i}'
1565+
f' list_{i} = self._data[{i}].copy()',
1566+
f' for i, item in enumerate(list_{i}):',
1567+
f' list_{i}[i] = item[idx]'
15661568
]
1567-
results.append((col, res_data))
15681569

1569-
data = ', '.join(f'"{col}": {data}' for col, data in results)
1570-
func_lines += [f' return pandas.DataFrame({{{data}}}, index=res_index)']
1570+
all_lists_joined = ', '.join([f'list_{i}' for i in range(n_lists)]) + ', '
1571+
res_data = f'({all_lists_joined})' if n_lists > 0 else '()'
1572+
func_lines += [
1573+
f' if self_index_is_none == True:',
1574+
f' old_index = pandas.RangeIndex(len(self))',
1575+
f' else:',
1576+
f' old_index = self._index',
1577+
f' res_data = {res_data}',
1578+
f' res_index = old_index[idx]',
1579+
f' return init_dataframe_internal(res_data, res_index, df_type)'
1580+
]
15711581

15721582
return func_lines
15731583

1574-
15751584
def df_getitem_tuple_idx_main_codelines(self, literal_idx):
15761585
"""Generate main code lines for df.getitem with idx of tuple"""
15771586
results = []
@@ -1686,13 +1695,17 @@ def df_getitem_key_error_codelines():
16861695
def df_getitem_slice_idx_codegen(self, idx):
16871696
"""
16881697
Example of generated implementation with provided index:
1689-
def _df_getitem_slice_idx_impl(self, idx)
1690-
res_index = self._index
1691-
data_0 = self._data[0]
1692-
res_data_0 = pandas.Series(data_0[idx], index=res_index[idx], name="A")
1693-
data_1 = self._data [1]
1694-
res_data_1 = pandas.Series(data_1[idx], index=res_index, name="B")
1695-
return pandas.DataFrame({"A": res_data_0, "B": res_data_1}, index=res_index[idx])
1698+
def _df_getitem_slice_idx_impl(self, idx):
1699+
list_0 = self._data[0].copy()
1700+
for i, item in enumerate(list_0):
1701+
list_0[i] = item[idx]
1702+
if self_index_is_none == True:
1703+
old_index = pandas.RangeIndex(len(self))
1704+
else:
1705+
old_index = self._index
1706+
res_data = (list_0, )
1707+
res_index = old_index[idx]
1708+
return init_dataframe_internal(res_data, res_index, df_type)
16961709
"""
16971710
func_lines = ['def _df_getitem_slice_idx_impl(self, idx):']
16981711
if self.columns:
@@ -1701,7 +1714,17 @@ def _df_getitem_slice_idx_impl(self, idx)
17011714
# raise KeyError if input DF is empty
17021715
func_lines += df_getitem_key_error_codelines()
17031716
func_text = '\n'.join(func_lines)
1704-
global_vars = {'pandas': pandas, 'numpy': numpy}
1717+
1718+
# TO-DO: need DefaultIndex to handle self.index[idx] construct inside func
1719+
self_index_is_none = isinstance(self.index, types.NoneType)
1720+
new_index_type = RangeIndexType(False) if self_index_is_none else self.index
1721+
df_type = DataFrameType(self.data, new_index_type, self.columns, column_loc=self.column_loc)
1722+
1723+
global_vars = {'pandas': pandas,
1724+
'numpy': numpy,
1725+
'df_type': df_type,
1726+
'init_dataframe_internal': init_dataframe_internal,
1727+
'self_index_is_none': self_index_is_none}
17051728

17061729
return func_text, global_vars
17071730

0 commit comments

Comments
 (0)