|
192 | 192 | "cond_remove_words_with_incorrect_substrings": True, |
193 | 193 | "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], |
194 | 194 | "cond_remove_long_words": True, |
195 | | - "length_word_max_cutoff": 30, |
| 195 | + "length_word_max_cutoff": 20, |
196 | 196 | "cond_check_number_words": True, |
197 | 197 | "tokenization": False, |
198 | 198 | "strip_characters": special_characters_default, |
199 | | - "number_words_min_cutoff": 1, |
| 199 | + "number_words_min_cutoff": 33, |
200 | 200 | "number_words_max_cutoff": 100000, |
201 | 201 | "cond_check_character_repetition_removal": True, |
202 | 202 | "character_repetition_length": 10, |
203 | | - "character_repetition_max_cutoff": 0.106, |
| 203 | + "character_repetition_max_cutoff": 0.13, |
204 | 204 | "cond_check_word_repetition_removal": True, |
205 | 205 | "word_repetition_length": 5, |
206 | | - "word_repetition_max_cutoff": 0.19, |
| 206 | + "word_repetition_max_cutoff": 0.21, |
207 | 207 | "cond_check_special_characters": True, |
208 | 208 | "special_characters": special_characters_default, |
209 | | - "special_characters_max_cutoff": 0.275, |
| 209 | + "special_characters_max_cutoff": 0.45, |
210 | 210 | "cond_words_augmentation": False, |
211 | 211 | "words_augmentation_group_sizes": [], |
212 | 212 | "words_augmentation_join_char": "", |
213 | 213 | "cond_check_stopwords": True, |
214 | | - "stopwords_min_cutoff": 0.05, |
| 214 | + "stopwords_min_cutoff": 0.002, |
215 | 215 | "cond_check_flagged_words": True, |
216 | | - "flagged_words_max_cutoff": 0.2, |
| 216 | + "flagged_words_max_cutoff": 0.001, |
217 | 217 | "cond_check_lang_id": True, |
218 | | - "lang_id_min_cutoff": 0.75, |
| 218 | + "lang_id_min_cutoff": 0.95, |
219 | 219 | "cond_check_perplexity": True, |
220 | | - "perplexity_max_cutoff": 575000, |
| 220 | + "perplexity_max_cutoff": 2000, |
221 | 221 | } |
222 | 222 |
|
223 | 223 | parameters_filtering_ca = { |
|
0 commit comments