|
434 | 434 | "cond_check_number_words": True, |
435 | 435 | "tokenization": False, |
436 | 436 | "strip_characters": special_characters_default, |
437 | | - "number_words_min_cutoff": 1, |
| 437 | + "number_words_min_cutoff": 38, |
438 | 438 | "number_words_max_cutoff": 100000, |
439 | 439 | "cond_check_character_repetition_removal": True, |
440 | 440 | "character_repetition_length": 10, |
441 | | - "character_repetition_max_cutoff": 0.106, |
| 441 | + "character_repetition_max_cutoff": 0.18, |
442 | 442 | "cond_check_word_repetition_removal": True, |
443 | 443 | "word_repetition_length": 5, |
444 | | - "word_repetition_max_cutoff": 0.19, |
| 444 | + "word_repetition_max_cutoff": 0.47, |
445 | 445 | "cond_check_special_characters": True, |
446 | 446 | "special_characters": special_characters_default, |
447 | | - "special_characters_max_cutoff": 0.35, |
| 447 | + "special_characters_max_cutoff": 0.45, |
448 | 448 | "cond_words_augmentation": False, |
449 | 449 | "words_augmentation_group_sizes": [], |
450 | 450 | "words_augmentation_join_char": "", |
451 | 451 | "cond_check_stopwords": True, |
452 | | - "stopwords_min_cutoff": 0, |
| 452 | + "stopwords_min_cutoff": 0.01, |
453 | 453 | "cond_check_flagged_words": True, |
454 | | - "flagged_words_max_cutoff": 0.2, |
| 454 | + "flagged_words_max_cutoff": 0.005, |
455 | 455 | "cond_check_lang_id": True, |
456 | | - "lang_id_min_cutoff": 0.75, |
| 456 | + "lang_id_min_cutoff": 0.90, |
457 | 457 | "cond_check_perplexity": True, |
458 | | - "perplexity_max_cutoff": 600000, |
| 458 | + "perplexity_max_cutoff": 1517, |
459 | 459 | } |
460 | 460 |
|
461 | 461 | parameters_filtering_id = { |
|
0 commit comments