|
| 1 | +from muliwai.regex_manager import detect_ner_with_regex_and_context |
| 2 | +from muliwai.pii_regexes_rulebase import regex_rulebase |
| 3 | +from muliwai.ner_manager import detect_ner_with_hf_model |
| 4 | +from muliwai.faker_manager import augment_anonymize |
| 5 | + |
def apply_anonymization(
    sentence: str,
    lang_id: str,
    context_window: int = 20,
    anonymize_condition=None,
    tag_type={'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'},
    device: str = "cpu",
) -> tuple:
    """Detect PII entities in a sentence and optionally anonymize them.

    Entities are collected from two detectors -- the regex/context detector
    and the HF-model detector -- and their results are concatenated.

    Params:
    ==================
    sentence: str, the sentence to be anonymized
    lang_id: str, the language id of the sentence. Only the part before the
        first "_" is used.
    context_window: int, the context window size forwarded to the regex
        detector
    anonymize_condition: flag controlling whether anonymization is applied.
        It is only tested for truthiness here; it is never called.
    tag_type: iterable, the tag types of the anonymization. By default:
        {'IP_ADDRESS', 'KEY', 'ID', 'PHONE', 'USER', 'EMAIL', 'LICENSE_PLATE', 'PERSON'}.
        Pass None to use every key of ``regex_rulebase``. It is forwarded
        only to the regex detector, not to the HF-model detector.
    device: cpu or cuda:{device_id}, forwarded to the HF-model detector

    Returns:
    ==================
    A ``(new_sentence, doc)`` tuple.
    With a truthy ``anonymize_condition``, ``new_sentence`` is the text
    returned by ``augment_anonymize`` and ``doc`` has the keys 'text',
    'ner', 'orig_text' and 'orig_ner'.
    Otherwise ``new_sentence`` is the unchanged input and ``doc`` has the
    keys 'text' and 'ner'.
    """
    # Identity check: None is the sentinel meaning "use every known tag".
    if tag_type is None:
        tag_type = regex_rulebase.keys()

    # Keep only the part of the language id before the first underscore.
    lang_id = lang_id.split("_")[0]

    ner_ids = detect_ner_with_regex_and_context(
        sentence=sentence,
        src_lang=lang_id,
        context_window=context_window,
        tag_type=tag_type,
    )
    ner_persons = detect_ner_with_hf_model(
        sentence=sentence,
        src_lang=lang_id,
        device=device,
    )
    # Both detector results are joined with `+` (presumably lists -- confirm
    # against the detector implementations).
    ner = ner_ids + ner_persons

    if anonymize_condition:
        new_sentence, new_ner = augment_anonymize(sentence, lang_id, ner)
        doc = {'text': new_sentence, 'ner': new_ner, 'orig_text': sentence, 'orig_ner': ner}
    else:
        new_sentence = sentence
        doc = {'text': new_sentence, 'ner': ner}
    return new_sentence, doc
0 commit comments