Skip to content

Commit 8fa9f94

Browse files
ianyu93 (Ian Yu) and pre-commit-ci[bot]
authored
Updated apply_regex_anonymization (#404)
* Updated ac_dc
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci
* Update .gitmodules
  Updating submodule config
* updated test case
  Updated test case for faker address to be street_address rather than full address
* Updated apply_regex_anonymization
* [pre-commit.ci] auto fixes from pre-commit.com hooks
  for more information, see https://pre-commit.ci

Co-authored-by: Ian Yu <ianyu@MBPC02FC7Z5MD6R.phub.net.cable.rogers.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent cb05d64 commit 8fa9f94

2 files changed

Lines changed: 9 additions & 4 deletions

File tree

ac_dc/anonymization.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from muliwai.pii_regexes import detect_ner_with_regex_and_context
2+
from muliwai.pii_regexes import regex_rulebase
23

34
trannum = str.maketrans("0123456789", "1111111111")
45

@@ -184,7 +185,12 @@ def apply_regex_anonymization(
184185
sentence: str, lang_id: str, context_window: int = 20, anonymize_condition=None
185186
) -> str:
186187
lang_id = lang_id.split("_")[0]
187-
ner = detect_ner_with_regex_and_context(sentence, lang_id)
188+
ner = detect_ner_with_regex_and_context(
189+
sentence=sentence,
190+
src_lang=lang_id,
191+
context_window=context_window,
192+
tag_type=regex_rulebase.keys(),
193+
)
188194
if anonymize_condition:
189195
for (ent, start, end, tag) in ner:
190196
# we need to actually walk through and replace by start, end span.

ac_dc/test_anonymization.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import random
2-
32
from anonymization import apply_regex_anonymization
43
from faker import Faker
54
from num2words import num2words
@@ -20,7 +19,7 @@ def test_en():
2019
fake = Faker("en_US")
2120
sentences = [
2221
f"I am {num2words(random.randint(0,120))} years old, and she is {random.randint(0,120)} year-old", # Age
23-
f"Sherry lives at {fake.address()}", # Address
22+
f"Sherry lives at {fake.street_address()}", # Address
2423
f"My dad is a cancer fighter. Her grandma is suffering from alzheimer's", # Disease
2524
f"Let me tell you a secret, Mr. Nguyen's SSN is {fake.ssn() if random.choice([True, False]) else fake.ssn().replace('-', '')}.", # Government ID
2625
f"Dear Ian, the payment through {fake.credit_card_number()} has been successfully executed.", # Credit card
@@ -37,7 +36,7 @@ def test_zh():
3736
fake = Faker("zh_CN")
3837
sentences = [
3938
f'我今年{num2words(random.randint(0,120), lang="ja")}歲, 而她去年{random.randint(0,120)}岁', # Age
40-
f"我住在{fake.address()}", # Address
39+
f"我住在{fake.street_address()}", # Address
4140
f"我爸是抗癌戰士。她奶奶有老人癡呆", # Disease
4241
f"李雪妮小姐331125198402010129", # Government ID
4342
f"先生,信用卡号{fake.credit_card_number()}已缴费成功", # Credit card

0 commit comments

Comments
 (0)