Skip to content

Commit db1017e

Browse files
authored
Create person_and_id_anonymization.py
1 parent b185a7a commit db1017e

1 file changed

Lines changed: 46 additions & 0 deletions

File tree

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
from muliwai.regex_manager import detect_ner_with_regex_and_context
2+
from muliwai.pii_regexes_rulebase import regex_rulebase
3+
from muliwai.ner_manager import detect_ner_with_hf_model
4+
from muliwai.faker_manager import augment_anonymize
5+
6+
def apply_anonymization(
    sentence: str,
    lang_id: str,
    context_window: int = 20,
    anonymize_condition=None,
    # NOTE: this default set is shared across calls; it is never mutated in
    # this function, only forwarded to the regex-based detector.
    tag_type={'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'},
    device: str = "cpu",
) -> tuple:
    """
    Detect entities in a sentence with regex rules and an HF NER model, and
    optionally anonymize the sentence.

    Params:
    ==================
    sentence: str, the sentence to be anonymized
    lang_id: str, the language id of the sentence. Only the part before the
        first "_" is used (e.g. "zh_CN" is treated as "zh").
    context_window: int, the context window size forwarded to the regex-based detector
    anonymize_condition: only tested for truthiness here (it is never called).
        When truthy, the sentence is passed through augment_anonymize;
        otherwise the sentence is returned unchanged.
    tag_type: iterable, the tag types of the anonymization. By default: {'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'}.
        Pass None to use every key of regex_rulebase.
    device: cpu or cuda:{device_id}

    Returns:
    ==================
    A 2-tuple (new_sentence, doc):
    new_sentence: the anonymized sentence when anonymize_condition is truthy,
        otherwise the input sentence.
    doc: dict with keys 'text' and 'ner'; when anonymization ran it also
        carries 'orig_text' and 'orig_ner' with the pre-anonymization values.
    """
    # None means "no filtering by caller": fall back to every tag in the rulebase.
    if tag_type is None:
        tag_type = regex_rulebase.keys()

    # Drop any region/script suffix so both detectors receive the bare language code.
    lang_id = lang_id.split("_")[0]

    ner_ids = detect_ner_with_regex_and_context(
        sentence=sentence,
        src_lang=lang_id,
        context_window=context_window,
        tag_type=tag_type,
    )
    ner_persons = detect_ner_with_hf_model(
        sentence=sentence,
        src_lang=lang_id,
        device=device,
    )
    # Combine both detectors' results; regex matches first, then model matches.
    # NOTE(review): relies on both detectors returning types that support `+`
    # with each other (presumably lists) -- confirm against muliwai.
    ner = ner_ids + ner_persons

    if anonymize_condition:
        new_sentence, new_ner = augment_anonymize(sentence, lang_id, ner)
        doc = {'text': new_sentence, 'ner': new_ner, 'orig_text': sentence, 'orig_ner': ner}
    else:
        new_sentence = sentence
        doc = {'text': new_sentence, 'ner': ner}
    return new_sentence, doc

0 commit comments

Comments
 (0)