Skip to content

Commit 2905aff

Browse files
authored
Update anonymization.py
Deleting all the IBAN patterns as we now account for them in the regexes.
1 parent db1017e commit 2905aff

1 file changed

Lines changed: 0 additions & 178 deletions

File tree

ac_dc/anonymization.py

Lines changed: 0 additions & 178 deletions
Original file line numberDiff line numberDiff line change
@@ -3,184 +3,6 @@
33

44
trannum = str.maketrans("0123456789", "1111111111")
55

6-
7-
# Will we cover IBAN??
8-
# IBAN - see https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/iban_patterns.py which is under MIT
9-
10-
# IBAN parts format
11-
CC = "[A-Z]{2}" # country code
12-
CK = "[0-9]{2}[ ]?" # checksum
13-
BOS = "^"
14-
EOS = "$" # end of string
15-
16-
A = "[A-Z][ ]?"
17-
A2 = "([A-Z][ ]?){2}"
18-
A3 = "([A-Z][ ]?){3}"
19-
A4 = "([A-Z][ ]?){4}"
20-
21-
C = "[a-zA-Z0-9][ ]?"
22-
C2 = "([a-zA-Z0-9][ ]?){2}"
23-
C3 = "([a-zA-Z0-9][ ]?){3}"
24-
C4 = "([a-zA-Z0-9][ ]?){4}"
25-
26-
N = "[0-9][ ]?"
27-
N2 = "([0-9][ ]?){2}"
28-
N3 = "([0-9][ ]?){3}"
29-
N4 = "([0-9][ ]?){4}"
30-
31-
# WIP - fix the country codes and group by languages
32-
# move this to financial record
33-
iban_regex = {
34-
# Albania (8n, 16c) ALkk bbbs sssx cccc cccc cccc cccc
35-
"al_AL": "(AL)" + CK + N4 + N4 + C4 + C4 + C4 + C4,
36-
# Andorra (8n, 12c) ADkk bbbb ssss cccc cccc cccc
37-
"ad_AD": "(AD)" + CK + N4 + N4 + C4 + C4 + C4,
38-
# Austria (16n) ATkk bbbb bccc cccc cccc
39-
"en_AT": "(AT)" + CK + N4 + N4 + N4 + N4,
40-
# Azerbaijan (4c,20n) AZkk bbbb cccc cccc cccc cccc cccc
41-
"az_AZ": "(AZ)" + CK + C4 + N4 + N4 + N4 + N4 + N4,
42-
# Bahrain (4a,14c) BHkk bbbb cccc cccc cccc cc
43-
"ar_BH": "(BH)" + CK + A4 + C4 + C4 + C4 + C2,
44-
# Belarus (4c, 4n, 16c) BYkk bbbb aaaa cccc cccc cccc cccc
45-
"bel_BY": "(BY)" + CK + C4 + N4 + C4 + C4 + C4 + C4,
46-
# Belgium (12n) BEkk bbbc cccc ccxx
47-
"fr_BE": "(BE)" + CK + N4 + N4 + N4,
48-
# Bosnia and Herzegovina (16n) BAkk bbbs sscc cccc ccxx
49-
"bos_BA": "(BA)" + CK + N4 + N4 + N4 + N4,
50-
# Brazil (23n,1a,1c) BRkk bbbb bbbb ssss sccc cccc ccct n
51-
"pt_BR": "(BR)" + CK + N4 + N4 + N4 + N4 + N4 + N3 + A + C,
52-
# Bulgaria (4a,6n,8c) BGkk bbbb ssss ttcc cccc cc
53-
"bg_BG": "(BG)" + CK + A4 + N4 + N + N + C2 + C4 + C2,
54-
# Costa Rica (18n) CRkk 0bbb cccc cccc cccc cc (0 = always zero)
55-
"es_CR": "(CR)" + CK + "[0]" + N3 + N4 + N4 + N4 + N2,
56-
# Croatia (17n) HRkk bbbb bbbc cccc cccc c
57-
"hr_HR": "(HR)" + CK + N4 + N4 + N4 + N4 + N,
58-
# Cyprus (8n,16c) CYkk bbbs ssss cccc cccc cccc cccc
59-
"el_CY": "(CY)" + CK + N4 + N4 + C4 + C4 + C4 + C4,
60-
# Czech Republic (20n) CZkk bbbb ssss sscc cccc cccc
61-
"cz_CZ": "(CZ)" + CK + N4 + N4 + N4 + N4 + N4,
62-
# Denmark (14n) DKkk bbbb cccc cccc cc
63-
"dan_DK": "(DK)" + CK + N4 + N4 + N4 + N2,
64-
# Dominican Republic (4a,20n) DOkk bbbb cccc cccc cccc cccc cccc
65-
"es_DO": "(DO)" + CK + A4 + N4 + N4 + N4 + N4 + N4,
66-
# East Timor (19n) TLkk bbbc cccc cccc cccc cxx
67-
"tl_TL": "(TL)" + CK + N4 + N4 + N4 + N4 + N3,
68-
# Estonia (16n) EEkk bbss cccc cccc cccx
69-
"ee_EE": "(EE)" + CK + N4 + N4 + N4 + N4,
70-
# Faroe Islands (14n) FOkk bbbb cccc cccc cx
71-
"FO": "(FO)" + CK + N4 + N4 + N4 + N2,
72-
# Finland (14n) FIkk bbbb bbcc cccc cx
73-
"fi_FI": "(FI)" + CK + N4 + N4 + N4 + N2,
74-
# France (10n,11c,2n) FRkk bbbb bsss sscc cccc cccc cxx
75-
"fr_FR": "(FR)" + CK + N4 + N4 + N2 + C2 + C4 + C4 + C + N2,
76-
# Georgia (2c,16n) GEkk bbcc cccc cccc cccc cc
77-
"ge_GE": "(GE)" + CK + C2 + N2 + N4 + N4 + N4 + N2,
78-
# Germany (18n) DEkk bbbb bbbb cccc cccc cc
79-
"de_DE": "(DE)" + CK + N4 + N4 + N4 + N4 + N2,
80-
# Gibraltar (4a,15c) GIkk bbbb cccc cccc cccc ccc
81-
"GI": "(GI)" + CK + A4 + C4 + C4 + C4 + C3,
82-
# Greece (7n,16c) GRkk bbbs sssc cccc cccc cccc ccc
83-
"el_GR": "(GR)" + CK + N4 + N3 + C + C4 + C4 + C4 + C3,
84-
# Greenland (14n) GLkk bbbb cccc cccc cc
85-
"kl_GL": "(GL)" + CK + N4 + N4 + N4 + N2,
86-
# Guatemala (4c,20c) GTkk bbbb mmtt cccc cccc cccc cccc
87-
"es_GT": "(GT)" + CK + C4 + C4 + C4 + C4 + C4 + C4,
88-
# Hungary (24n) HUkk bbbs sssx cccc cccc cccc cccx
89-
"hu_HU": "(HU)" + CK + N4 + N4 + N4 + N4 + N4 + N4,
90-
# Iceland (22n) ISkk bbbb sscc cccc iiii iiii ii
91-
"is_IS": "(IS)" + CK + N4 + N4 + N4 + N4 + N4 + N2,
92-
# Ireland (4c,14n) IEkk aaaa bbbb bbcc cccc cc
93-
"en_IE": "(IE)" + CK + C4 + N4 + N4 + N4 + N2,
94-
# Israel (19n) ILkk bbbn nncc cccc cccc ccc
95-
"hb_IL": "(IL)" + CK + N4 + N4 + N4 + N4 + N3,
96-
# Italy (1a,10n,12c) ITkk xbbb bbss sssc cccc cccc ccc
97-
"it_IT": "(IT)" + CK + A + N3 + N4 + N3 + C + C3 + C + C4 + C3,
98-
# Jordan (4a,22n) JOkk bbbb ssss cccc cccc cccc cccc cc
99-
"ar_JO": "(JO)" + CK + A4 + N4 + N4 + N4 + N4 + N4 + N2,
100-
# Kazakhstan (3n,13c) KZkk bbbc cccc cccc cccc
101-
"kz_KZ": "(KZ)" + CK + N3 + C + C4 + C4 + C4,
102-
# Kosovo (4n,10n,2n) XKkk bbbb cccc cccc cccc
103-
"xk_XK": "(XK)" + CK + N4 + N4 + N4 + N4,
104-
# Kuwait (4a,22c) KWkk bbbb cccc cccc cccc cccc cccc cc
105-
"ar_KW": "(KW)" + CK + A4 + C4 + C4 + C4 + C4 + C4 + C2,
106-
# Latvia (4a,13c) LVkk bbbb cccc cccc cccc c
107-
"lv_LV": "(LV)" + CK + A4 + C4 + C4 + C4 + C,
108-
# Lebanon (4n,20c) LBkk bbbb cccc cccc cccc cccc cccc
109-
"lb_LB": "(LB)" + CK + N4 + C4 + C4 + C4 + C4 + C4,
110-
# Liechtenstein (5n,12c) LIkk bbbb bccc cccc cccc c
111-
"li_LI": "(LI)" + CK + N4 + N + C3 + C4 + C4 + C,
112-
# Lithuania (16n) LTkk bbbb bccc cccc cccc
113-
"lt_LT": "(LT)" + CK + N4 + N4 + N4 + N4,
114-
# Luxembourg (3n,13c) LUkk bbbc cccc cccc cccc
115-
"lu_LU": "(LU)" + CK + N3 + C + C4 + C4 + C4,
116-
# Malta (4a,5n,18c) MTkk bbbb ssss sccc cccc cccc cccc ccc
117-
"mt_MT": "(MT)" + CK + A4 + N4 + N + C3 + C4 + C4 + C4 + C3,
118-
# Mauritania (23n) MRkk bbbb bsss sscc cccc cccc cxx
119-
"mr_MR": "(MR)" + CK + N4 + N4 + N4 + N4 + N4 + N3,
120-
# Mauritius (4a,19n,3a) MUkk bbbb bbss cccc cccc cccc 000m mm
121-
"mu_MU": "(MU)" + CK + A4 + N4 + N4 + N4 + N4 + N3 + A,
122-
# Moldova (2c,18c) MDkk bbcc cccc cccc cccc cccc
123-
"md_MD": "(MD)" + CK + C4 + C4 + C4 + C4 + C4,
124-
# Monaco (10n,11c,2n) MCkk bbbb bsss sscc cccc cccc cxx
125-
"mc_MC": "(MC)" + CK + N4 + N4 + N2 + C2 + C4 + C4 + C + N2,
126-
# Montenegro (18n) MEkk bbbc cccc cccc cccc xx
127-
"me_ME": "(ME)" + CK + N4 + N4 + N4 + N4 + N2,
128-
# Netherlands (4a,10n) NLkk bbbb cccc cccc cc
129-
"nl_NL": "(NL)" + CK + A4 + N4 + N4 + N2,
130-
# North Macedonia (3n,10c,2n) MKkk bbbc cccc cccc cxx
131-
"mk_MK": "(MK)" + CK + N3 + C + C4 + C4 + C + N2,
132-
# Norway (11n) NOkk bbbb cccc ccx
133-
"no_NO": "(NO)" + CK + N4 + N4 + N3,
134-
# Pakistan (4c,16n) PKkk bbbb cccc cccc cccc cccc
135-
"pk_PK": "(PK)" + CK + C4 + N4 + N4 + N4 + N4,
136-
# Palestinian territories (4c,21n) PSkk bbbb xxxx xxxx xccc cccc cccc c
137-
"ps_PS": "(PS)" + CK + C4 + N4 + N4 + N4 + N4 + N,
138-
# Poland (24n) PLkk bbbs sssx cccc cccc cccc cccc
139-
"pl_PL": "(PL)" + CK + N4 + N4 + N4 + N4 + N4 + N4,
140-
# Portugal (21n) PTkk bbbb ssss cccc cccc cccx x
141-
"pt_PT": "(PT)" + CK + N4 + N4 + N4 + N4 + N,
142-
# Qatar (4a,21c) QAkk bbbb cccc cccc cccc cccc cccc c
143-
"ar_QA": "(QA)" + CK + A4 + C4 + C4 + C4 + C4 + C,
144-
# Romania (4a,16c) ROkk bbbb cccc cccc cccc cccc
145-
"ro_RO": "(RO)" + CK + A4 + C4 + C4 + C4 + C4,
146-
# San Marino (1a,10n,12c) SMkk xbbb bbss sssc cccc cccc ccc
147-
"sm": "(SM)" + CK + A + N3 + N4 + N3 + C + C4 + C4 + C3,
148-
# Saudi Arabia (2n,18c) SAkk bbcc cccc cccc cccc cccc
149-
"ar_SA": "(SA)" + CK + N2 + C2 + C4 + C4 + C4 + C4,
150-
# Serbia (18n) RSkk bbbc cccc cccc cccc xx
151-
"rs_RS": "(RS)" + CK + N4 + N4 + N4 + N4 + N2,
152-
# Slovakia (20n) SKkk bbbb ssss sscc cccc cccc
153-
"sk_SK": "(SK)" + CK + N4 + N4 + N4 + N4 + N4,
154-
# Slovenia (15n) SIkk bbss sccc cccc cxx
155-
"si_SI": "(SI)" + CK + N4 + N4 + N4 + N3,
156-
# Spain (20n) ESkk bbbb ssss xxcc cccc cccc
157-
"es_ES": "(ES)" + CK + N4 + N4 + N4 + N4 + N4,
158-
# Sweden (20n) SEkk bbbc cccc cccc cccc cccc
159-
"se_SE": "(SE)" + CK + N4 + N4 + N4 + N4 + N4,
160-
# Switzerland (5n,12c) CHkk bbbb bccc cccc cccc c
161-
"gsw_CH": "(CH)" + CK + N4 + N + C3 + C4 + C4 + C,
162-
# Tunisia (20n) TNkk bbss sccc cccc cccc cccc
163-
"ar_TN": "(TN)" + CK + N4 + N4 + N4 + N4 + N4,
164-
# Turkey (5n,17c) TRkk bbbb bxcc cccc cccc cccc cc
165-
"tr_TR": "(TR)" + CK + N4 + N + C3 + C4 + C4 + C4 + C2,
166-
# United Arab Emirates (3n,16n) AEkk bbbc cccc cccc cccc ccc
167-
"ar_AE": "(AE)" + CK + N4 + N4 + N4 + N4 + N3,
168-
# United Kingdom (4a,14n) GBkk bbbb ssss sscc cccc cc
169-
"en_GB": "(GB)" + CK + A4 + N4 + N4 + N4 + N2,
170-
# Vatican City (3n,15n) VAkk bbbc cccc cccc cccc cc
171-
"it_VA": "(VA)" + CK + N4 + N4 + N4 + N4 + N2,
172-
# Virgin Islands, British (4c,16n) VGkk bbbb cccc cccc cccc cccc
173-
"en_VG": "(VG)" + CK + C4 + N4 + N4 + N4 + N4,
174-
}
175-
176-
177-
# should we move license plate to govt_id?
178-
# ("LICENSE_PLATE", regex.compile('^(?:[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领 A-Z]{1}[A-HJ-NP-Z]{1}(?:(?:[0-9]{5}[DF])|(?:[DF](?:[A-HJ-NP-Z0-9])[0-9]{4})))|(?:[京津沪渝冀豫云辽黑湘皖鲁新苏浙赣鄂桂甘晋蒙陕吉闽贵粤青藏川宁琼使领 A-Z]{1}[A-Z]{1}[A-HJ-NP-Z0-9]{4}[A-HJ-NP-Z0-9 挂学警港澳]{1})$'), None, None, None),
179-
# ("LICENSE_PLATE", regex.compile('\b[A-Z]{3}-\d{4}\b'), None, None, None),
180-
181-
# Code below needs to be updated/completed.
182-
183-
1846
def apply_regex_anonymization(
1857
sentence: str,
1868
lang_id: str,

0 commit comments

Comments
 (0)