|
294 | 294 | "cond_remove_words_with_incorrect_substrings": True, |
295 | 295 | "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], |
296 | 296 | "cond_remove_long_words": True, |
297 | | - "length_word_max_cutoff": 30, |
| 297 | + "length_word_max_cutoff": 25, |
298 | 298 | "cond_check_number_words": True, |
299 | 299 | "tokenization": False, |
300 | 300 | "strip_characters": special_characters_default, |
301 | | - "number_words_min_cutoff": 1, |
| 301 | + "number_words_min_cutoff": 16, |
302 | 302 | "number_words_max_cutoff": 100000, |
303 | 303 | "cond_check_character_repetition_removal": True, |
304 | 304 | "character_repetition_length": 10, |
305 | | - "character_repetition_max_cutoff": 0.106, |
| 305 | + "character_repetition_max_cutoff": 0.14, |
306 | 306 | "cond_check_word_repetition_removal": True, |
307 | 307 | "word_repetition_length": 5, |
308 | | - "word_repetition_max_cutoff": 0.19, |
| 308 | + "word_repetition_max_cutoff": 0.25, |
309 | 309 | "cond_check_special_characters": True, |
310 | 310 | "special_characters": special_characters_default, |
311 | | - "special_characters_max_cutoff": 0.3, |
| 311 | + "special_characters_max_cutoff": 0.34, |
312 | 312 | "cond_words_augmentation": False, |
313 | 313 | "words_augmentation_group_sizes": [], |
314 | 314 | "words_augmentation_join_char": "", |
315 | 315 | "cond_check_stopwords": True, |
316 | | - "stopwords_min_cutoff": 0.2, |
| 316 | + "stopwords_min_cutoff": 0.4, |
317 | 317 | "cond_check_flagged_words": True, |
318 | | - "flagged_words_max_cutoff": 0.2, |
| 318 | + "flagged_words_max_cutoff": 0.01, |
319 | 319 | "cond_check_lang_id": True, |
320 | | - "lang_id_min_cutoff": 0.75, |
| 320 | + "lang_id_min_cutoff": 0.8, |
321 | 321 | "cond_check_perplexity": True, |
322 | | - "perplexity_max_cutoff": 2500000, |
| 322 | + "perplexity_max_cutoff": 1300, |
323 | 323 | } |
324 | 324 |
|
325 | 325 | parameters_filtering_eu = { |
|
0 commit comments