Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit aff4c75

Browse files
Implement parallel argsort (#910)
1 parent ae3233b commit aff4c75

11 files changed

Lines changed: 907 additions & 618 deletions

File tree

sdc/datatypes/common_functions.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,8 @@ def sdc_join_series_indexes_impl(left, right):
277277
right_nan.append(i)
278278

279279
# sort arrays saving the old positions
280-
sorted_left = numpy.argsort(left, kind='mergesort')
281-
sorted_right = numpy.argsort(right, kind='mergesort')
280+
sorted_left = numpy_like.argsort(left, kind='mergesort')
281+
sorted_right = numpy_like.argsort(right, kind='mergesort')
282282
# put the position of the nans in an increasing sequence
283283
sorted_left[lsize-len(left_nan):] = left_nan
284284
sorted_right[rsize-len(right_nan):] = right_nan
@@ -523,7 +523,7 @@ def sdc_arrays_argsort_overload(A, kind='quicksort'):
523523
if isinstance(A, types.Array):
524524
def _sdc_arrays_argsort_array_impl(A, kind='quicksort'):
525525
_kind = 'quicksort' if kind_is_default == True else kind # noqa
526-
return numpy.argsort(A, kind=_kind)
526+
return numpy_like.argsort(A, kind=_kind)
527527

528528
return _sdc_arrays_argsort_array_impl
529529

sdc/datatypes/hpat_pandas_series_functions.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1137,7 +1137,7 @@ def hpat_pandas_series_nsmallest_impl(self, n=5, keep='first'):
11371137
raise ValueError("Method nsmallest(). Unsupported parameter. Given 'keep' != 'first'")
11381138

11391139
# mergesort is used for stable sorting of repeated values
1140-
indices = self._data.argsort(kind='mergesort')[:max(n, 0)]
1140+
indices = numpy_like.argsort(self._data, kind='mergesort')[:max(n, 0)]
11411141

11421142
return self.take(indices)
11431143

@@ -1207,7 +1207,7 @@ def hpat_pandas_series_nlargest_impl(self, n=5, keep='first'):
12071207
# data: [0, 1, -1, 1, 0] -> [1, 1, 0, 0, -1]
12081208
# index: [0, 1, 2, 3, 4] -> [1, 3, 0, 4, 2] (not [3, 1, 4, 0, 2])
12091209
# subtract 1 to ensure reverse ordering at boundaries
1210-
indices = (-self._data - 1).argsort(kind='mergesort')[:max(n, 0)]
1210+
indices = numpy_like.argsort(-self._data - 1, kind='mergesort')[:max(n, 0)]
12111211

12121212
return self.take(indices)
12131213

@@ -1457,7 +1457,7 @@ def hpat_pandas_series_value_counts_str_impl(
14571457
counts = numpy.asarray(counts_as_list, dtype=numpy.intp)
14581458
indexes_order = numpy.arange(values_len)
14591459
if sort:
1460-
indexes_order = counts.argsort()
1460+
indexes_order = numpy_like.argsort(counts)
14611461
if not ascending:
14621462
indexes_order = indexes_order[::-1]
14631463

@@ -1521,7 +1521,7 @@ def hpat_pandas_series_value_counts_number_impl(
15211521

15221522
indexes_order = numpy.arange(len(value_counts))
15231523
if sort:
1524-
indexes_order = value_counts.argsort()
1524+
indexes_order = numpy_like.argsort(value_counts)
15251525
if not ascending:
15261526
indexes_order = indexes_order[::-1]
15271527

@@ -3808,16 +3808,16 @@ def hpat_pandas_series_argsort_idx_impl(self, axis=0, kind='quicksort', order=No
38083808
raise ValueError("Method argsort(). Unsupported parameter. Given 'kind' != 'quicksort' or 'mergesort'")
38093809
if kind == 'mergesort':
38103810
# It is impossible to use numpy.argsort(self._data, kind=kind) since numba gives a typing error
3811-
sort = numpy.argsort(self._data, kind='mergesort')
3811+
sort = numpy_like.argsort(self._data, kind='mergesort')
38123812
else:
3813-
sort = numpy.argsort(self._data)
3813+
sort = numpy_like.argsort(self._data)
38143814
na = self.isna().sum()
38153815
result = numpy.empty(len(self._data), dtype=numpy.int64)
38163816
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
38173817
if kind == 'mergesort':
3818-
sort_nona = numpy.argsort(self._data[~na_data_arr], kind='mergesort')
3818+
sort_nona = numpy_like.argsort(self._data[~na_data_arr], kind='mergesort')
38193819
else:
3820-
sort_nona = numpy.argsort(self._data[~na_data_arr])
3820+
sort_nona = numpy_like.argsort(self._data[~na_data_arr])
38213821
q = 0
38223822
for id, i in enumerate(sort):
38233823
if id in set(sort[len(self._data) - na:]):
@@ -3835,16 +3835,16 @@ def hpat_pandas_series_argsort_noidx_impl(self, axis=0, kind='quicksort', order=
38353835
if kind != 'quicksort' and kind != 'mergesort':
38363836
raise ValueError("Method argsort(). Unsupported parameter. Given 'kind' != 'quicksort' or 'mergesort'")
38373837
if kind == 'mergesort':
3838-
sort = numpy.argsort(self._data, kind='mergesort')
3838+
sort = numpy_like.argsort(self._data, kind='mergesort')
38393839
else:
3840-
sort = numpy.argsort(self._data)
3840+
sort = numpy_like.argsort(self._data)
38413841
na = self.isna().sum()
38423842
result = numpy.empty(len(self._data), dtype=numpy.int64)
38433843
na_data_arr = sdc.hiframes.api.get_nan_mask(self._data)
38443844
if kind == 'mergesort':
3845-
sort_nona = numpy.argsort(self._data[~na_data_arr], kind='mergesort')
3845+
sort_nona = numpy_like.argsort(self._data[~na_data_arr], kind='mergesort')
38463846
else:
3847-
sort_nona = numpy.argsort(self._data[~na_data_arr])
3847+
sort_nona = numpy_like.argsort(self._data[~na_data_arr])
38483848
q = 0
38493849
for id, i in enumerate(sort):
38503850
if id in set(sort[len(self._data) - na:]):

sdc/functions/numpy_like.py

Lines changed: 109 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@
4747
from sdc.functions.statistics import skew_formula
4848
from sdc.hiframes.api import isna
4949
from sdc.datatypes.range_index_type import RangeIndexType
50-
from sdc.utilities.sdc_typing_utils import TypeChecker
50+
from sdc.utilities.sdc_typing_utils import TypeChecker, is_default
5151
from sdc.utilities.utils import (sdc_overload, sdc_register_jitable,
5252
min_dtype_int_val, max_dtype_int_val, min_dtype_float_val,
5353
max_dtype_float_val)
@@ -56,6 +56,7 @@
5656
num_total_chars, str_arr_is_na)
5757
from sdc.utilities.prange_utils import parallel_chunks
5858
from sdc.utilities.sdc_typing_utils import check_types_comparable
59+
from sdc.functions.sort import parallel_sort, parallel_stable_sort, parallel_argsort, parallel_stable_argsort
5960

6061
def astype(self, dtype):
6162
pass
@@ -1154,3 +1155,110 @@ def f(A):
11541155
i += 1
11551156
return arr
11561157
return f
1158+
1159+
1160+
def sort(a, axis=-1, kind=None, order=None):
1161+
"""
1162+
Sort the input array in place.
1163+
1164+
Parameters
1165+
-----------
1166+
a: :obj:`Array`
1167+
Input array
1168+
axis: Unsupported
1169+
kind: {'quicksort', 'mergesort'}, optional
1170+
Sorting algorithm. Default is 'quicksort'.
1171+
In fact, the sorting algorithm used is neither 'quicksort' nor 'mergesort'.
1172+
The `kind` value only selects between a non-stable and a stable sort.
1173+
order: Unsupported
1174+
1175+
Returns
1176+
-------
1177+
None
1178+
"""
1179+
1180+
pass
1181+
1182+
1183+
@sdc_overload(sort)
1184+
def sort_overload(a, axis=-1, kind=None, order=None):
1185+
_func_name = 'sort'
1186+
ty_checker = TypeChecker(_func_name)
1187+
1188+
ty_checker.check(a, types.Array)
1189+
1190+
if not is_default(axis, -1):
1191+
raise TypingError(f'{_func_name} Unsupported parameter axis')
1192+
1193+
if not is_default(order, None):
1194+
raise TypingError(f'{_func_name} Unsupported parameter order')
1195+
1196+
def sort_impl(a, axis=-1, kind=None, order=None):
1197+
_kind = 'quicksort'
1198+
if kind is not None:
1199+
_kind = kind
1200+
1201+
if _kind == 'quicksort':
1202+
return parallel_sort(a)
1203+
elif _kind == 'mergesort':
1204+
return parallel_stable_sort(a)
1205+
else:
1206+
raise ValueError("Unsupported value of 'kind' parameter")
1207+
1208+
return sort_impl
1209+
1210+
1211+
def argsort(a, axis=-1, kind=None, order=None):
1212+
"""
1213+
Returns the indices that would sort an array.
1214+
1215+
Perform an indirect sort along the given axis using the algorithm specified
1216+
by the `kind` keyword. It returns an array of indices of the same shape as
1217+
`a` that index data along the given axis in sorted order.
1218+
1219+
Parameters
1220+
----------
1221+
a : :obj:`Array`
1222+
Array to sort.
1223+
axis: Unsupported
1224+
kind: {'quicksort', 'mergesort'}, optional
1225+
Sorting algorithm. Default is 'quicksort'.
1226+
In fact, the sorting algorithm used is neither 'quicksort' nor 'mergesort'.
1227+
The `kind` value only selects between a non-stable and a stable sort.
1228+
order: Unsupported
1229+
1230+
Returns
1231+
-------
1232+
index_array : ndarray, int
1233+
Array of indices that sort `a`.
1234+
"""
1235+
1236+
pass
1237+
1238+
1239+
@sdc_overload(argsort)
1240+
def argsort_overload(a, axis=-1, kind=None, order=None):
1241+
_func_name = 'argsort'
1242+
ty_checker = TypeChecker(_func_name)
1243+
1244+
ty_checker.check(a, types.Array)
1245+
1246+
if not is_default(axis, -1):
1247+
raise TypingError(f'{_func_name} Unsupported parameter axis')
1248+
1249+
if not is_default(order, None):
1250+
raise TypingError(f'{_func_name} Unsupported parameter order')
1251+
1252+
def argsort_impl(a, axis=-1, kind=None, order=None):
1253+
_kind = 'quicksort'
1254+
if kind is not None:
1255+
_kind = kind
1256+
1257+
if _kind == 'quicksort':
1258+
return parallel_argsort(a)
1259+
elif _kind == 'mergesort':
1260+
return parallel_stable_argsort(a)
1261+
else:
1262+
raise ValueError("Unsupported value of 'kind' parameter")
1263+
1264+
return argsort_impl

0 commit comments

Comments
 (0)