Skip to content
This repository was archived by the owner on Feb 2, 2024. It is now read-only.

Commit 517cbb1

Browse files
author
Ehsan Totoni
committed
use re for str.replace with regex
1 parent 16fb932 commit 517cbb1

4 files changed

Lines changed: 32 additions & 10 deletions

File tree

hpat/hiframes/hiframes_typed.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import operator
22
from collections import defaultdict, namedtuple
3+
import re
34
import numpy as np
45
import pandas as pd
56
import warnings
@@ -1746,7 +1747,8 @@ def _run_series_str_replace(self, assign, lhs, arr, rhs, nodes):
17461747
extra_globals={'unicode_to_std_str': unicode_to_std_str,
17471748
'std_str_to_unicode': std_str_to_unicode,
17481749
'pre_alloc_string_array': pre_alloc_string_array,
1749-
'get_utf8_size': get_utf8_size}
1750+
'get_utf8_size': get_utf8_size,
1751+
're': re}
17501752
)
17511753

17521754

hpat/hiframes/series_kernels.py

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from collections import defaultdict
22
import numpy as np
3+
import re
34

45
import numba
56
from numba import types
@@ -363,20 +364,38 @@ def _series_astype_str_impl(arr):
363364
return hpat.hiframes.api.init_series(A)
364365

365366

367+
# def _str_replace_regex_impl(str_arr, pat, val):
368+
# numba.parfor.init_prange()
369+
# e = hpat.str_ext.compile_regex(unicode_to_std_str(pat))
370+
# val = unicode_to_std_str(val)
371+
# n = len(str_arr)
372+
# n_total_chars = 0
373+
# str_list = hpat.str_ext.alloc_str_list(n)
374+
# for i in numba.parfor.internal_prange(n):
375+
# # TODO: support unicode
376+
# in_str = unicode_to_std_str(str_arr[i])
377+
# out_str = std_str_to_unicode(
378+
# hpat.str_ext.str_replace_regex(in_str, e, val))
379+
# str_list[i] = out_str
380+
# n_total_chars += len(out_str)
381+
# numba.parfor.init_prange()
382+
# out_arr = pre_alloc_string_array(n, n_total_chars)
383+
# for i in numba.parfor.internal_prange(n):
384+
# _str = str_list[i]
385+
# out_arr[i] = _str
386+
# return hpat.hiframes.api.init_series(out_arr)
387+
388+
366389
def _str_replace_regex_impl(str_arr, pat, val):
367390
numba.parfor.init_prange()
368-
e = hpat.str_ext.compile_regex(unicode_to_std_str(pat))
369-
val = unicode_to_std_str(val)
391+
e = re.compile(pat)
370392
n = len(str_arr)
371393
n_total_chars = 0
372394
str_list = hpat.str_ext.alloc_str_list(n)
373395
for i in numba.parfor.internal_prange(n):
374-
# TODO: support unicode
375-
in_str = unicode_to_std_str(str_arr[i])
376-
out_str = std_str_to_unicode(
377-
hpat.str_ext.str_replace_regex(in_str, e, val))
396+
out_str = e.sub(val, str_arr[i])
378397
str_list[i] = out_str
379-
n_total_chars += len(out_str)
398+
n_total_chars += get_utf8_size(out_str)
380399
numba.parfor.init_prange()
381400
out_arr = pre_alloc_string_array(n, n_total_chars)
382401
for i in numba.parfor.internal_prange(n):

hpat/str_ext.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ def unbox_re_pattern(typ, obj, c):
105105
c.pyapi.incref(obj)
106106
return NativeValue(obj)
107107

108-
@overload(re.compile)
108+
109+
# workaround: pass jit_options explicitly until numba #4020 is resolved
110+
@overload(re.compile, jit_options={'no_cpython_wrapper': False})
109111
def re_compile_overload(pattern, flags=0):
110112
def _re_compile_impl(pattern, flags=0):
111113
with numba.objmode(pat='re_pattern_type'):

hpat/tests/test_strings.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,6 @@ def test_impl(a):
106106
# XXX: use startswith since hpat output can have extra characters
107107
self.assertTrue(h_res.startswith(py_res))
108108

109-
@unittest.skip("pending numba #4020")
110109
def test_re_sub(self):
111110
def test_impl(_str):
112111
p = re.compile('ab*')

0 commit comments

Comments
 (0)