|
94 | 94 | "cond_check_number_words": True, |
95 | 95 | "tokenization": False, |
96 | 96 | "strip_characters": special_characters_default, |
97 | | - "number_words_min_cutoff": 1, |
| 97 | + "number_words_min_cutoff": 20, |
98 | 98 | "number_words_max_cutoff": 100000, |
99 | 99 | "cond_check_character_repetition_removal": True, |
100 | 100 | "character_repetition_length": 10, |
101 | | - "character_repetition_max_cutoff": 0.106, |
| 101 | + "character_repetition_max_cutoff": 0.20, |
102 | 102 | "cond_check_word_repetition_removal": True, |
103 | 103 | "word_repetition_length": 5, |
104 | | - "word_repetition_max_cutoff": 0.19, |
| 104 | + "word_repetition_max_cutoff": 0.34, |
105 | 105 | "cond_check_special_characters": True, |
106 | 106 | "special_characters": special_characters_default, |
107 | | - "special_characters_max_cutoff": 0.45, |
| 107 | + "special_characters_max_cutoff": 0.30, |
108 | 108 | "cond_words_augmentation": False, |
109 | 109 | "words_augmentation_group_sizes": [], |
110 | 110 | "words_augmentation_join_char": "", |
111 | 111 | "cond_check_stopwords": True, |
112 | | - "stopwords_min_cutoff": 0, |
| 112 | + "stopwords_min_cutoff": 0.07, |
113 | 113 | "cond_check_flagged_words": True, |
114 | | - "flagged_words_max_cutoff": 0.2, |
| 114 | + "flagged_words_max_cutoff": 0.03, |
115 | 115 | "cond_check_lang_id": True, |
116 | | - "lang_id_min_cutoff": 0.75, |
| 116 | + "lang_id_min_cutoff": 0.81, |
117 | 117 | "cond_check_perplexity": True, |
118 | | - "perplexity_max_cutoff": 1000000, |
| 118 | + "perplexity_max_cutoff": 2300, |
119 | 119 | } |
120 | 120 |
|
121 | 121 | parameters_filtering_arz = { |
|
868 | 868 |
|
869 | 869 | parameters_filtering = { |
870 | 870 | "default": parameters_filtering_default, |
871 | | - "af": parameters_filtering_af, |
872 | 871 | "ar": parameters_filtering_ar, |
873 | 872 | "arz": parameters_filtering_arz, |
874 | 873 | "as": parameters_filtering_as, |
|
0 commit comments