Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit db4f431

Browse files
Fix for supporting DF categoricals unboxing (#923)
1 parent 8c0914a commit db4f431

4 files changed

Lines changed: 132 additions & 5 deletions

File tree

sdc/hiframes/api.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@
4343
SeriesType,
4444
if_series_to_array_type)
4545
from numba.core.errors import TypingError
46+
from sdc.datatypes.categorical.types import Categorical
4647

4748

4849
def isna(arr, i):
@@ -162,7 +163,7 @@ def fix_df_array_list_str_impl(column): # pragma: no cover
162163
if isinstance(column, RangeIndexType):
163164
return lambda column: np.array(column)
164165

165-
if isinstance(column, (types.Array, StringArrayType)):
166+
if isinstance(column, (types.Array, StringArrayType, Categorical)):
166167
return lambda column: column
167168

168169

sdc/hiframes/boxing.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,10 +127,11 @@ def unbox_dataframe(typ, val, c):
127127
series_obj = c.pyapi.object_getattr_string(val, typ.columns[col_idx])
128128
arr_obj = c.pyapi.object_getattr_string(series_obj, "values")
129129
ty_series = typ.data[col_idx]
130-
if isinstance(ty_series, types.Array):
131-
native_val = unbox_array(typ.data[col_idx], arr_obj, c)
132-
elif ty_series == string_array_type:
133-
native_val = unbox_str_series(string_array_type, series_obj, c)
130+
131+
# FIXME: CategoricalType has wrong dtype attribute value (i.e. dtype of codes)
132+
# current implementation offers pd_dtype for this purpose, so use it
133+
column_dtype = ty_series.pd_dtype if isinstance(ty_series, Categorical) else ty_series.dtype
134+
native_val = _unbox_series_data(column_dtype, ty_series, arr_obj, c)
134135

135136
inst.setitem(c.context.get_constant(types.intp, i), native_val.value, incref=False)
136137

sdc/tests/categorical/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,3 +27,4 @@
2727
from . import test_categorical
2828
from . import test_categoricaldtype
2929
from . import test_series_category
30+
from . import test_df_category
sdc/tests/categorical/test_df_category.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# *****************************************************************************
2+
# Copyright (c) 2020, Intel Corporation All rights reserved.
3+
#
4+
# Redistribution and use in source and binary forms, with or without
5+
# modification, are permitted provided that the following conditions are met:
6+
#
7+
# Redistributions of source code must retain the above copyright notice,
8+
# this list of conditions and the following disclaimer.
9+
#
10+
# Redistributions in binary form must reproduce the above copyright notice,
11+
# this list of conditions and the following disclaimer in the documentation
12+
# and/or other materials provided with the distribution.
13+
#
14+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
16+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
18+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
21+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
22+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
23+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
24+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25+
# *****************************************************************************
26+
27+
from sdc.tests.test_base import TestCase
28+
29+
import numpy as np
30+
import pandas as pd
31+
import numba as nb
32+
from numba import types
33+
34+
from sdc.types import (
35+
CategoricalDtypeType,
36+
Categorical,
37+
)
38+
39+
from sdc.hiframes.pd_dataframe_type import DataFrameType
40+
from sdc.tests.test_utils import skip_numba_jit
41+
42+
43+
class DFCategoryTest(TestCase):
44+
"""
45+
Test for pandas DataFrames with CategoricalDtype.
46+
"""
47+
48+
def _pd_value(self):
49+
return pd.DataFrame({'A': pd.Categorical([1, 2, 3, 2, 1])})
50+
51+
def test_typeof(self):
52+
pd_value = self._pd_value()
53+
nb_type = nb.typeof(pd_value)
54+
55+
assert(isinstance(nb_type, DataFrameType))
56+
assert(nb_type.columns == ('A',))
57+
assert(nb_type.index == types.none)
58+
assert(nb_type.data[0].pd_dtype == CategoricalDtypeType(categories=[1, 2, 3], ordered=False))
59+
assert(nb_type.data[0] == Categorical(CategoricalDtypeType(categories=[1, 2, 3], ordered=False)))
60+
61+
def test_unboxing(self):
62+
@nb.njit
63+
def func(c):
64+
pass
65+
66+
pd_value = self._pd_value()
67+
func(pd_value)
68+
69+
def test_boxing(self):
70+
@nb.njit
71+
def func(c):
72+
return c
73+
74+
pd_value = self._pd_value()
75+
boxed = func(pd_value)
76+
assert(boxed.equals(pd_value))
77+
78+
@skip_numba_jit("capturing DFs (not only categoricals) as freevar not working")
79+
def test_lowering(self):
80+
pd_value = self._pd_value()
81+
82+
@nb.njit
83+
def func():
84+
return pd_value
85+
86+
boxed = func()
87+
assert(boxed.equals(pd_value))
88+
89+
def test_constructor(self):
90+
@nb.njit
91+
def func():
92+
return pd.DataFrame({'A': pd.Categorical([1, 2, 3, 2, 1])})
93+
94+
boxed = func()
95+
assert(boxed.equals(self._pd_value()))
96+
97+
@skip_numba_jit("compiles, but category dtype not supported by df ctor")
98+
def test_constructor_list(self):
99+
@nb.njit
100+
def func():
101+
return pd.DataFrame({'A': list("12321")}, dtype='category')
102+
103+
boxed = func()
104+
assert(boxed.equals(self._pd_value()))
105+
106+
@skip_numba_jit
107+
def test_constructor_CategoricalDtype(self):
108+
@nb.njit
109+
def func():
110+
return pd.DataFrame(data={'A': np.array([1, 2, 3, 2, 1])},
111+
dtype=pd.CategoricalDtype(categories=[1, 2, 3]))
112+
113+
boxed = func()
114+
assert(boxed.equals(self._pd_value()))
115+
116+
@skip_numba_jit
117+
def test_constructor_CategoricalDtype_list(self):
118+
@nb.njit
119+
def func():
120+
return pd.DataFrame(data={'A': [1, 2, 3, 2, 1]},
121+
dtype=pd.CategoricalDtype(categories=[1, 2, 3]))
122+
123+
boxed = func()
124+
assert(boxed.equals(self._pd_value()))

0 commit comments

Comments
 (0)