diff --git a/CHANGELOG.md b/CHANGELOG.md index ef8b88bc9e3..76dd093fa08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ This release is compatible with NumPy 2.4.5. * Updated tests to align with NumPy 2.4.5 compatibility [gh-2920](https://github.com/IntelPython/dpnp/pull/2920) * Replaced `.pxi` includes in `dpnp.tensor` with modular `.pxd`/`.pyx` Cython imports [#2913](https://github.com/IntelPython/dpnp/pull/2913) * Reimplemented `dpnp.eye` and `dpnp.tensor.eye` with a branchless kernel [gh-2937](https://github.com/IntelPython/dpnp/pull/2937) +* Improved performance of `dpnp.fft` functions for complex strided input by avoiding oversized allocations and extra copies [#2939](https://github.com/IntelPython/dpnp/pull/2939) ### Deprecated diff --git a/dpnp/fft/dpnp_utils_fft.py b/dpnp/fft/dpnp_utils_fft.py index 733436ab988..dadf8b2fc7f 100644 --- a/dpnp/fft/dpnp_utils_fft.py +++ b/dpnp/fft/dpnp_utils_fft.py @@ -408,12 +408,27 @@ def _fft(a, norm, out, forward, in_place, c2c, axes, batch_fft=True): a = dpnp.reshape(a, local_shape) index = 1 + if not a.flags.c_contiguous: # cuFFT requires input arrays to be C-contiguous (row-major) # for correct execution - if ( - dpnp.is_cuda_backend(a) and not a.flags.c_contiguous - ): # pragma: no cover + if dpnp.is_cuda_backend(a): # pragma: no cover a = dpnp.ascontiguousarray(a) + else: + # Check if the memory footprint of the strides exceeds + # the number of elements. + # If so, copy to contiguous to avoid oversized allocation + # for the output array and unnecessary copy to contiguous + # after oneMKL FFT + _strides = dpnp.get_usm_ndarray(a).strides + _shape = a.shape + # Max element displacement reachable by the strides. + # Negative strides are handled by _copy_array, so only + # positive strides are possible here + max_disp = sum( + st * (sh - 1) for st, sh in zip(_strides, _shape) if st > 0 + ) + if (max_disp + 1) > a.size: + a = dpnp.ascontiguousarray(a) # w/a for cuFFT to avoid "Invalid strides" error when # the last dimension is 1 and there are multiple axes