Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 97cff23

Browse files
Adds Int64Index type and updates Series and DF methods to use it (#950)
* Adds Int64Index type and updates Series and DF methods to use it

  Motivation: as part of the work on supporting common pandas indexes, a new type (Int64IndexType) representing pandas.Int64Index is added. Boxing/unboxing of Series and DataFrames, as well as common numpy-like functions, are changed accordingly to handle it.

* Fixing DateTime tests and PEP remarks

* Fixing review comments #1
1 parent 79fb01b commit 97cff23

22 files changed

Lines changed: 1727 additions & 316 deletions

sdc/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
import sdc.datatypes.series.init
4949

5050
import sdc.extensions.indexes.range_index_ext
51+
import sdc.extensions.indexes.int64_index_ext
5152

5253
from ._version import get_versions
5354

sdc/datatypes/common_functions.py

Lines changed: 39 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,17 @@
4848
from sdc.functions import numpy_like
4949
from sdc.str_arr_type import string_array_type, StringArrayType
5050
from sdc.datatypes.range_index_type import RangeIndexType
51+
from sdc.datatypes.int64_index_type import Int64IndexType
5152
from sdc.str_arr_ext import (num_total_chars, append_string_array_to,
5253
str_arr_is_na, pre_alloc_string_array, str_arr_set_na, string_array_type,
5354
cp_str_list_to_array, create_str_arr_from_list, get_utf8_size,
5455
str_arr_set_na_by_mask)
5556
from sdc.utilities.prange_utils import parallel_chunks
5657
from sdc.utilities.utils import sdc_overload, sdc_register_jitable
57-
from sdc.utilities.sdc_typing_utils import (find_common_dtype_from_numpy_dtypes,
58-
TypeChecker)
58+
from sdc.utilities.sdc_typing_utils import (
59+
find_common_dtype_from_numpy_dtypes,
60+
TypeChecker)
61+
from sdc.utilities.sdc_typing_utils import sdc_pandas_index_types
5962

6063

6164
class SDCLimitation(Exception):
@@ -71,18 +74,20 @@ def hpat_arrays_append(A, B):
7174
def hpat_arrays_append_overload(A, B):
7275
"""Function for appending underlying arrays (A and B) or list/tuple of arrays B to an array A"""
7376

74-
A_is_range_index = isinstance(A, RangeIndexType)
75-
B_is_range_index = isinstance(B, RangeIndexType)
76-
if isinstance(A, (types.Array, RangeIndexType)):
77-
if isinstance(B, (types.Array, RangeIndexType)):
77+
use_A_array = isinstance(A, (RangeIndexType, Int64IndexType))
78+
use_B_array = isinstance(B, (RangeIndexType, Int64IndexType))
79+
if isinstance(A, (types.Array, RangeIndexType, Int64IndexType)):
80+
if isinstance(B, (types.Array, RangeIndexType, Int64IndexType)):
7881
def _append_single_numeric_impl(A, B):
79-
_A = A.values if A_is_range_index == True else A # noqa
80-
_B = B.values if B_is_range_index == True else B # noqa
82+
_A = A.values if use_A_array == True else A # noqa
83+
_B = B.values if use_B_array == True else B # noqa
8184
return numpy.concatenate((_A, _B,))
8285

8386
return _append_single_numeric_impl
84-
elif isinstance(B, (types.UniTuple, types.List)) and isinstance(B.dtype, (types.Array, RangeIndexType)):
85-
B_dtype_is_range_index = isinstance(B.dtype, RangeIndexType)
87+
88+
elif (isinstance(B, (types.UniTuple, types.List))
89+
and isinstance(B.dtype, (types.Array, RangeIndexType, Int64IndexType))):
90+
B_dtype_is_index = isinstance(B.dtype, (RangeIndexType, Int64IndexType))
8691
numba_common_dtype = find_common_dtype_from_numpy_dtypes([A.dtype, B.dtype.dtype], [])
8792

8893
# TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
@@ -92,10 +97,10 @@ def _append_list_numeric_impl(A, B):
9297
new_data = numpy.empty(total_length, numba_common_dtype)
9398

9499
stop = len(A)
95-
_A = numpy.array(A) if A_is_range_index == True else A # noqa
100+
_A = numpy.array(A) if use_A_array == True else A # noqa
96101
new_data[:stop] = _A
97102
for arr in B:
98-
_arr = numpy.array(arr) if B_dtype_is_range_index == True else arr # noqa
103+
_arr = arr.values if B_dtype_is_index == True else arr # noqa
99104
start = stop
100105
stop = start + len(_arr)
101106
new_data[start:stop] = _arr
@@ -218,12 +223,13 @@ def sdc_join_series_indexes_overload(left, right):
218223
"""Function for joining arrays left and right in a way similar to pandas.join 'outer' algorithm"""
219224

220225
# check that both operands are of types used for representing Pandas indexes
221-
if not (isinstance(left, (types.Array, StringArrayType, RangeIndexType))
222-
and isinstance(right, (types.Array, StringArrayType, RangeIndexType))):
226+
if not (isinstance(left, sdc_pandas_index_types) and isinstance(right, sdc_pandas_index_types)
227+
and not isinstance(left, types.NoneType)
228+
and not isinstance(right, types.NoneType)):
223229
return None
224230

225-
convert_left = isinstance(left, RangeIndexType)
226-
convert_right = isinstance(right, RangeIndexType)
231+
convert_left = isinstance(left, (RangeIndexType, Int64IndexType))
232+
convert_right = isinstance(right, (RangeIndexType, Int64IndexType))
227233

228234
def _convert_to_arrays_impl(left, right):
229235
_left = left.values if convert_left == True else left # noqa
@@ -243,10 +249,9 @@ def sdc_join_range_indexes_impl(left, right):
243249

244250
return sdc_join_range_indexes_impl
245251

246-
elif isinstance(left, RangeIndexType) and isinstance(right, types.Array):
247-
return _convert_to_arrays_impl
248-
249-
elif isinstance(left, types.Array) and isinstance(right, RangeIndexType):
252+
elif (isinstance(left, (RangeIndexType, Int64IndexType, types.Array))
253+
and isinstance(right, (RangeIndexType, Int64IndexType, types.Array))
254+
and not (isinstance(left, types.Array) and isinstance(right, types.Array))):
250255
return _convert_to_arrays_impl
251256

252257
# TODO: remove code duplication below and merge numeric and StringArray impls into one
@@ -618,13 +623,16 @@ def _sdc_take(data, indexes):
618623
@sdc_overload(_sdc_take)
619624
def _sdc_take_overload(data, indexes):
620625

621-
if not isinstance(data, (types.Array, StringArrayType, RangeIndexType)):
626+
valid_data_types = (types.Array,) + sdc_pandas_index_types
627+
if not (isinstance(data, valid_data_types) and not isinstance(data, types.NoneType)):
622628
return None
623-
if not (isinstance(indexes, (types.Array, types.List))
629+
630+
if not (isinstance(indexes, (types.Array, types.List, Int64IndexType))
624631
and isinstance(indexes.dtype, (types.Integer, types.ListType))):
625632
return None
626633

627-
if isinstance(indexes.dtype, types.ListType) and isinstance(data, (types.Array, types.List, RangeIndexType)):
634+
if (isinstance(indexes.dtype, types.ListType)
635+
and isinstance(data, (types.Array, types.List, RangeIndexType, Int64IndexType))):
628636
arr_dtype = data.dtype
629637

630638
def _sdc_take_list_impl(data, indexes):
@@ -677,7 +685,7 @@ def _sdc_take_list_str_impl(data, indexes):
677685

678686
return _sdc_take_list_str_impl
679687

680-
elif isinstance(data, (types.Array, RangeIndexType)):
688+
elif isinstance(data, (types.Array, RangeIndexType, Int64IndexType)):
681689
arr_dtype = data.dtype
682690

683691
def _sdc_take_array_impl(data, indexes):
@@ -740,6 +748,7 @@ def sdc_reindex_series_overload(arr, index, name, by_index):
740748
""" Reindexes series data by new index following the logic of pandas.core.indexing.check_bool_indexer """
741749

742750
range_indexes = isinstance(index, RangeIndexType) and isinstance(by_index, RangeIndexType)
751+
int64_indexes = isinstance(index, Int64IndexType) and isinstance(by_index, Int64IndexType)
743752
data_dtype, index_dtype = arr.dtype, index.dtype
744753
data_is_str_arr = isinstance(arr.dtype, types.UnicodeType)
745754

@@ -748,6 +757,8 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
748757
# no reindexing is needed if indexes are equal
749758
if range_indexes == True: # noqa
750759
equal_indexes = numpy_like.array_equal(index, by_index)
760+
elif int64_indexes == True: # noqa
761+
equal_indexes = numpy_like.array_equal(index, by_index)
751762
else:
752763
equal_indexes = False
753764
if (index is by_index or equal_indexes):
@@ -772,10 +783,10 @@ def sdc_reindex_series_impl(arr, index, name, by_index):
772783
map_index_to_position[value] = i
773784

774785
index_mismatch = 0
775-
# FIXME: TypingError in parfor step (wrong promotion to float64?) if prange is used
776-
for i in numpy.arange(len(by_index)):
777-
if by_index[i] in map_index_to_position:
778-
pos_in_self = map_index_to_position[by_index[i]]
786+
for i in numba.prange(len(by_index)):
787+
val = by_index[i]
788+
if val in map_index_to_position:
789+
pos_in_self = map_index_to_position[val]
779790
_res_data[i] = arr[pos_in_self]
780791
if data_is_str_arr == True: # noqa
781792
res_data_nan_mask[i] = isna(arr, i)

sdc/datatypes/hpat_pandas_dataframe_functions.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
gen_impl_generator, find_common_dtype_from_numpy_dtypes)
5151
from sdc.str_arr_ext import StringArrayType
5252
from sdc.datatypes.range_index_type import RangeIndexType
53+
from sdc.datatypes.int64_index_type import Int64IndexType
5354

5455
from sdc.hiframes.pd_dataframe_type import DataFrameType
5556
from sdc.hiframes.pd_dataframe_ext import init_dataframe_internal, get_structure_maps
@@ -2257,7 +2258,7 @@ def sdc_pandas_dataframe_accessor_getitem(self, idx):
22572258

22582259
if accessor == 'at':
22592260
num_idx = (isinstance(idx[0], types.Number)
2260-
and isinstance(self.dataframe.index, (types.Array, types.NoneType, RangeIndexType)))
2261+
and isinstance(self.dataframe.index, (types.NoneType, RangeIndexType, Int64IndexType)))
22612262
str_idx = (isinstance(idx[0], (types.UnicodeType, types.StringLiteral))
22622263
and isinstance(self.dataframe.index, StringArrayType))
22632264
if isinstance(idx, types.Tuple) and isinstance(idx[1], types.StringLiteral):

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
find_common_dtype_from_numpy_dtypes, has_literal_value,
5454
has_python_value)
5555
from sdc.datatypes.range_index_type import RangeIndexType
56+
from sdc.datatypes.int64_index_type import Int64IndexType
5657
from sdc.datatypes.common_functions import (sdc_join_series_indexes, sdc_arrays_argsort, sdc_reindex_series)
5758
from sdc.datatypes.hpat_pandas_rolling_types import (
5859
gen_sdc_pandas_rolling_overload_body, sdc_pandas_rolling_docstring_tmpl)
@@ -618,7 +619,7 @@ def sdc_pandas_series_setitem(self, idx, value):
618619
def sdc_pandas_series_setitem_no_reindexing_impl(self, idx, value):
619620

620621
if assign_via_idx_mask == True: # noqa
621-
# FIXME_Numba#5157: using asarray since eq impl for RangeIndexType returns list
622+
# FIXME_Numba#5157: using asarray since eq impl for index types returns list
622623
_idx = numpy.asarray(self._index == idx)
623624
elif assign_via_idx_data == True: # noqa
624625
_idx = idx._data
@@ -4034,7 +4035,8 @@ def hpat_pandas_series_dropna(self, axis=0, inplace=False):
40344035
ty_checker.raise_exc(inplace, 'bool', 'inplace')
40354036

40364037
if (isinstance(self.data.dtype, types.Number)
4037-
and isinstance(self.index, (types.Number, types.NoneType, RangeIndexType))):
4038+
and (isinstance(self.index, types.NoneType)
4039+
or isinstance(self.index.dtype, types.Number))):
40384040
def hpat_pandas_series_dropna_impl(self, axis=0, inplace=False):
40394041
index = self.index
40404042
return numpy_like.dropna(self._data, index, self._name)

sdc/datatypes/int64_index_type.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2020, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
from numba import types
29+
from numba.extending import (
30+
models,
31+
register_model,
32+
make_attribute_wrapper
33+
)
34+
35+
36+
class Int64IndexType(types.IterableType):
37+
dtype = types.int64
38+
39+
def __init__(self, data, is_named=False):
40+
self.data = data
41+
self.is_named = is_named
42+
super(Int64IndexType, self).__init__(
43+
name='Int64IndexType({}, {})'.format(data, is_named))
44+
45+
@property
46+
def iterator_type(self):
47+
res = self.data.iterator_type
48+
return res
49+
50+
51+
@register_model(Int64IndexType)
52+
class Int64IndexModel(models.StructModel):
53+
def __init__(self, dmm, fe_type):
54+
55+
data_type = fe_type.data
56+
name_type = types.unicode_type if fe_type.is_named else types.none
57+
members = [
58+
('data', data_type),
59+
('name', name_type),
60+
]
61+
models.StructModel.__init__(self, dmm, fe_type, members)
62+
63+
64+
make_attribute_wrapper(Int64IndexType, 'data', '_data')
65+
make_attribute_wrapper(Int64IndexType, 'name', '_name')
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# -*- coding: utf-8 -*-
2+
# *****************************************************************************
3+
# Copyright (c) 2019-2020, Intel Corporation All rights reserved.
4+
#
5+
# Redistribution and use in source and binary forms, with or without
6+
# modification, are permitted provided that the following conditions are met:
7+
#
8+
# Redistributions of source code must retain the above copyright notice,
9+
# this list of conditions and the following disclaimer.
10+
#
11+
# Redistributions in binary form must reproduce the above copyright notice,
12+
# this list of conditions and the following disclaimer in the documentation
13+
# and/or other materials provided with the distribution.
14+
#
15+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
17+
# THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18+
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
19+
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20+
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21+
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
22+
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23+
# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
24+
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25+
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26+
# *****************************************************************************
27+
28+
import numba
29+
import numpy as np
30+
import pandas as pd
31+
32+
from numba import types
33+
34+
35+
def _check_dtype_param_type(dtype):
36+
""" Returns True if dtype is a valid type for dtype parameter and False otherwise.
37+
Used in RangeIndex ctor and other methods that take dtype parameter. """
38+
39+
valid_dtype_types = (types.NoneType, types.Omitted, types.UnicodeType, types.NumberClass)
40+
return isinstance(dtype, valid_dtype_types) or dtype is None

0 commit comments

Comments
 (0)