|
843 | 843 | "tokenization": True, |
844 | 844 | "strip_characters": special_characters_default, |
845 | 845 | "number_words_min_cutoff": 1, |
846 | | - "number_words_max_cutoff": 100000, |
| 846 | + "number_words_max_cutoff": 1000000, |
847 | 847 | "cond_check_character_repetition_removal": True, |
848 | | - "character_repetition_length": 10, |
849 | | - "character_repetition_max_cutoff": 0.106, |
| 848 | + "character_repetition_length": 3, |
| 849 | + "character_repetition_max_cutoff": 0.2, |
850 | 850 | "cond_check_word_repetition_removal": True, |
851 | 851 | "word_repetition_length": 5, |
852 | | - "word_repetition_max_cutoff": 0.19, |
| 852 | + "word_repetition_max_cutoff": 0.96, |
853 | 853 | "cond_check_special_characters": True, |
854 | 854 | "special_characters": special_characters_default, |
855 | | - "special_characters_max_cutoff": 0.4, |
| 855 | + "special_characters_max_cutoff": 0.3, |
856 | 856 | "cond_words_augmentation": True, |
857 | 857 | "words_augmentation_group_sizes": [2], |
858 | 858 | "words_augmentation_join_char": "", |
859 | 859 | "cond_check_stopwords": True, |
860 | | - "stopwords_min_cutoff": 0, |
| 860 | + "stopwords_min_cutoff": 0.1691, |
861 | 861 | "cond_check_flagged_words": True, |
862 | | - "flagged_words_max_cutoff": 0.2, |
| 862 | + "flagged_words_max_cutoff": 0.001, |
863 | 863 | "cond_check_lang_id": True, |
864 | | - "lang_id_min_cutoff": 0.75, |
| 864 | + "lang_id_min_cutoff": 0.8, |
865 | 865 | "cond_check_perplexity": False, |
866 | | - "perplexity_max_cutoff": 3000000, |
| 866 | + "perplexity_max_cutoff": 2095, |
867 | 867 | } |
868 | 868 |
|
869 | 869 | parameters_filtering = { |
|
0 commit comments