|
46 | 46 | "flagged_words_max_cutoff": 0.1, |
47 | 47 | "cond_check_lang_id": True, |
48 | 48 | "lang_id_min_cutoff": 0.70, |
49 | | - "cond_check_perplexity": False, |
| 49 | + "cond_check_perplexity": True, |
50 | 50 | "perplexity_max_cutoff": 10000, |
51 | 51 | } |
52 | 52 |
|
|
148 | 148 | "flagged_words_max_cutoff": 0.2, |
149 | 149 | "cond_check_lang_id": True, |
150 | 150 | "lang_id_min_cutoff": 0.75, |
151 | | - "cond_check_perplexity": False, |
| 151 | + "cond_check_perplexity": True, |
152 | 152 | "perplexity_max_cutoff": 3000000, |
153 | 153 | } |
154 | 154 |
|
|
182 | 182 | "flagged_words_max_cutoff": 0.2, |
183 | 183 | "cond_check_lang_id": True, |
184 | 184 | "lang_id_min_cutoff": 0.75, |
185 | | - "cond_check_perplexity": False, |
| 185 | + "cond_check_perplexity": True, |
186 | 186 | "perplexity_max_cutoff": 3000000, |
187 | 187 | } |
188 | 188 |
|
|
216 | 216 | "flagged_words_max_cutoff": 0.2, |
217 | 217 | "cond_check_lang_id": True, |
218 | 218 | "lang_id_min_cutoff": 0.75, |
219 | | - "cond_check_perplexity": False, |
| 219 | + "cond_check_perplexity": True, |
220 | 220 | "perplexity_max_cutoff": 575000, |
221 | 221 | } |
222 | 222 |
|
|
352 | 352 | "flagged_words_max_cutoff": 0.2, |
353 | 353 | "cond_check_lang_id": True, |
354 | 354 | "lang_id_min_cutoff": 0.75, |
355 | | - "cond_check_perplexity": False, |
| 355 | + "cond_check_perplexity": True, |
356 | 356 | "perplexity_max_cutoff": 3000000, |
357 | 357 | } |
358 | 358 |
|
|
658 | 658 | "flagged_words_max_cutoff": 0.2, |
659 | 659 | "cond_check_lang_id": True, |
660 | 660 | "lang_id_min_cutoff": 0.75, |
661 | | - "cond_check_perplexity": False, |
| 661 | + "cond_check_perplexity": True, |
662 | 662 | "perplexity_max_cutoff": 3000000, |
663 | 663 | } |
664 | 664 |
|
|
692 | 692 | "flagged_words_max_cutoff": 0.2, |
693 | 693 | "cond_check_lang_id": True, |
694 | 694 | "lang_id_min_cutoff": 0.75, |
695 | | - "cond_check_perplexity": False, |
| 695 | + "cond_check_perplexity": True, |
696 | 696 | "perplexity_max_cutoff": 3000000, |
697 | 697 | } |
698 | 698 |
|
|
726 | 726 | "flagged_words_max_cutoff": 0.2, |
727 | 727 | "cond_check_lang_id": True, |
728 | 728 | "lang_id_min_cutoff": 0.75, |
729 | | - "cond_check_perplexity": False, |
| 729 | + "cond_check_perplexity": True, |
730 | 730 | "perplexity_max_cutoff": 3000000, |
731 | 731 | } |
732 | 732 |
|
|
760 | 760 | "flagged_words_max_cutoff": 0.2, |
761 | 761 | "cond_check_lang_id": True, |
762 | 762 | "lang_id_min_cutoff": 0.75, |
763 | | - "cond_check_perplexity": False, |
| 763 | + "cond_check_perplexity": True, |
764 | 764 | "perplexity_max_cutoff": 3000000, |
765 | 765 | } |
766 | 766 |
|
|
770 | 770 | "cond_remove_words_with_incorrect_substrings": True, |
771 | 771 | "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], |
772 | 772 | "cond_remove_long_words": True, |
773 | | - "length_word_max_cutoff": 30, |
| 773 | + "length_word_max_cutoff": 25, |
774 | 774 | "cond_check_number_words": True, |
775 | 775 | "tokenization": False, |
776 | 776 | "strip_characters": special_characters_default, |
777 | | - "number_words_min_cutoff": 1, |
| 777 | + "number_words_min_cutoff": 30, |
778 | 778 | "number_words_max_cutoff": 100000, |
779 | 779 | "cond_check_character_repetition_removal": True, |
780 | 780 | "character_repetition_length": 10, |
781 | | - "character_repetition_max_cutoff": 0.106, |
| 781 | + "character_repetition_max_cutoff": 0.15, |
782 | 782 | "cond_check_word_repetition_removal": True, |
783 | 783 | "word_repetition_length": 5, |
784 | | - "word_repetition_max_cutoff": 0.19, |
| 784 | + "word_repetition_max_cutoff": 0.20, |
785 | 785 | "cond_check_special_characters": True, |
786 | 786 | "special_characters": special_characters_default, |
787 | | - "special_characters_max_cutoff": 0.35, |
| 787 | + "special_characters_max_cutoff": 0.34, |
788 | 788 | "cond_words_augmentation": True, |
789 | 789 | "words_augmentation_group_sizes": [2], |
790 | 790 | "words_augmentation_join_char": " ", |
791 | 791 | "cond_check_stopwords": True, |
792 | | - "stopwords_min_cutoff": 0, |
| 792 | + "stopwords_min_cutoff": 0.08, |
793 | 793 | "cond_check_flagged_words": True, |
794 | | - "flagged_words_max_cutoff": 0.2, |
| 794 | + "flagged_words_max_cutoff": 0.005, |
795 | 795 | "cond_check_lang_id": True, |
796 | | - "lang_id_min_cutoff": 0.75, |
797 | | - "cond_check_perplexity": False, |
798 | | - "perplexity_max_cutoff": 3000000, |
| 796 | + "lang_id_min_cutoff": 0.90, |
| 797 | + "cond_check_perplexity": True, |
| 798 | + "perplexity_max_cutoff": 1600, |
799 | 799 | } |
800 | 800 |
|
801 | 801 | parameters_filtering_yo = { |
|
828 | 828 | "flagged_words_max_cutoff": 0.2, |
829 | 829 | "cond_check_lang_id": True, |
830 | 830 | "lang_id_min_cutoff": 0.75, |
831 | | - "cond_check_perplexity": False, |
| 831 | + "cond_check_perplexity": True, |
832 | 832 | "perplexity_max_cutoff": 3000000, |
833 | 833 | } |
834 | 834 |
|
|
862 | 862 | "flagged_words_max_cutoff": 0.001, |
863 | 863 | "cond_check_lang_id": True, |
864 | 864 | "lang_id_min_cutoff": 0.85, |
865 | | - "cond_check_perplexity": False, |
| 865 | + "cond_check_perplexity": True, |
866 | 866 | "perplexity_max_cutoff": 2095, |
867 | 867 | } |
868 | 868 |
|
869 | 869 | parameters_filtering = { |
870 | 870 | "default": parameters_filtering_default, |
| 871 | + "af": parameters_filtering_af, |
871 | 872 | "ar": parameters_filtering_ar, |
872 | 873 | "arz": parameters_filtering_arz, |
873 | 874 | "as": parameters_filtering_as, |
|
0 commit comments