|
328 | 328 | "cond_remove_words_with_incorrect_substrings": True, |
329 | 329 | "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], |
330 | 330 | "cond_remove_long_words": True, |
331 | | - "length_word_max_cutoff": 35, |
| 331 | + "length_word_max_cutoff": 28, |
332 | 332 | "cond_check_number_words": True, |
333 | 333 | "tokenization": False, |
334 | 334 | "strip_characters": special_characters_default, |
335 | | - "number_words_min_cutoff": 1, |
| 335 | + "number_words_min_cutoff": 8, |
336 | 336 | "number_words_max_cutoff": 100000, |
337 | 337 | "cond_check_character_repetition_removal": True, |
338 | 338 | "character_repetition_length": 10, |
339 | | - "character_repetition_max_cutoff": 0.106, |
| 339 | + "character_repetition_max_cutoff": 0.20, |
340 | 340 | "cond_check_word_repetition_removal": True, |
341 | 341 | "word_repetition_length": 5, |
342 | | - "word_repetition_max_cutoff": 0.19, |
| 342 | + "word_repetition_max_cutoff": 0.40, |
343 | 343 | "cond_check_special_characters": True, |
344 | 344 | "special_characters": special_characters_default, |
345 | | - "special_characters_max_cutoff": 0.3, |
| 345 | + "special_characters_max_cutoff": 0.31, |
346 | 346 | "cond_words_augmentation": False, |
347 | 347 | "words_augmentation_group_sizes": [], |
348 | 348 | "words_augmentation_join_char": "", |
349 | 349 | "cond_check_stopwords": True, |
350 | | - "stopwords_min_cutoff": 0, |
| 350 | + "stopwords_min_cutoff": 0.05, |
351 | 351 | "cond_check_flagged_words": True, |
352 | | - "flagged_words_max_cutoff": 0.2, |
| 352 | + "flagged_words_max_cutoff": 0.1, |
353 | 353 | "cond_check_lang_id": True, |
354 | | - "lang_id_min_cutoff": 0.75, |
| 354 | + "lang_id_min_cutoff": 0.70, |
355 | 355 | "cond_check_perplexity": True, |
356 | | - "perplexity_max_cutoff": 3000000, |
| 356 | + "perplexity_max_cutoff": 3000, |
357 | 357 | } |
358 | 358 |
|
359 | 359 | parameters_filtering_fr = { |
|
0 commit comments