Skip to content

Commit 0567dad

Browse files
committed
Update parameters for Arabic
1 parent 425d0ae commit 0567dad

1 file changed

Lines changed: 8 additions & 9 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -94,28 +94,28 @@
9494
"cond_check_number_words": True,
9595
"tokenization": False,
9696
"strip_characters": special_characters_default,
97-
"number_words_min_cutoff": 1,
97+
"number_words_min_cutoff": 20,
9898
"number_words_max_cutoff": 100000,
9999
"cond_check_character_repetition_removal": True,
100100
"character_repetition_length": 10,
101-
"character_repetition_max_cutoff": 0.106,
101+
"character_repetition_max_cutoff": 0.20,
102102
"cond_check_word_repetition_removal": True,
103103
"word_repetition_length": 5,
104-
"word_repetition_max_cutoff": 0.19,
104+
"word_repetition_max_cutoff": 0.34,
105105
"cond_check_special_characters": True,
106106
"special_characters": special_characters_default,
107-
"special_characters_max_cutoff": 0.45,
107+
"special_characters_max_cutoff": 0.30,
108108
"cond_words_augmentation": False,
109109
"words_augmentation_group_sizes": [],
110110
"words_augmentation_join_char": "",
111111
"cond_check_stopwords": True,
112-
"stopwords_min_cutoff": 0,
112+
"stopwords_min_cutoff": 0.07,
113113
"cond_check_flagged_words": True,
114-
"flagged_words_max_cutoff": 0.2,
114+
"flagged_words_max_cutoff": 0.03,
115115
"cond_check_lang_id": True,
116-
"lang_id_min_cutoff": 0.75,
116+
"lang_id_min_cutoff": 0.81,
117117
"cond_check_perplexity": True,
118-
"perplexity_max_cutoff": 1000000,
118+
"perplexity_max_cutoff": 2300,
119119
}
120120

121121
parameters_filtering_arz = {
@@ -868,7 +868,6 @@
868868

869869
parameters_filtering = {
870870
"default": parameters_filtering_default,
871-
"af": parameters_filtering_af,
872871
"ar": parameters_filtering_ar,
873872
"arz": parameters_filtering_arz,
874873
"as": parameters_filtering_as,

0 commit comments

Comments
 (0)