Skip to content

Commit 4976e99

Browse files
committed
Update parameters for Bengali
1 parent 8e91629 commit 4976e99

1 file changed

Lines changed: 9 additions & 9 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -192,32 +192,32 @@
192192
"cond_remove_words_with_incorrect_substrings": True,
193193
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
194194
"cond_remove_long_words": True,
195-
"length_word_max_cutoff": 30,
195+
"length_word_max_cutoff": 20,
196196
"cond_check_number_words": True,
197197
"tokenization": False,
198198
"strip_characters": special_characters_default,
199-
"number_words_min_cutoff": 1,
199+
"number_words_min_cutoff": 33,
200200
"number_words_max_cutoff": 100000,
201201
"cond_check_character_repetition_removal": True,
202202
"character_repetition_length": 10,
203-
"character_repetition_max_cutoff": 0.106,
203+
"character_repetition_max_cutoff": 0.13,
204204
"cond_check_word_repetition_removal": True,
205205
"word_repetition_length": 5,
206-
"word_repetition_max_cutoff": 0.19,
206+
"word_repetition_max_cutoff": 0.21,
207207
"cond_check_special_characters": True,
208208
"special_characters": special_characters_default,
209-
"special_characters_max_cutoff": 0.275,
209+
"special_characters_max_cutoff": 0.45,
210210
"cond_words_augmentation": False,
211211
"words_augmentation_group_sizes": [],
212212
"words_augmentation_join_char": "",
213213
"cond_check_stopwords": True,
214-
"stopwords_min_cutoff": 0.05,
214+
"stopwords_min_cutoff": 0.002,
215215
"cond_check_flagged_words": True,
216-
"flagged_words_max_cutoff": 0.2,
216+
"flagged_words_max_cutoff": 0.001,
217217
"cond_check_lang_id": True,
218-
"lang_id_min_cutoff": 0.75,
218+
"lang_id_min_cutoff": 0.95,
219219
"cond_check_perplexity": True,
220-
"perplexity_max_cutoff": 575000,
220+
"perplexity_max_cutoff": 2000,
221221
}
222222

223223
parameters_filtering_ca = {

0 commit comments

Comments (0)