|
736 | 736 | "cond_remove_words_with_incorrect_substrings": True, |
737 | 737 | "incorrect_word_substrings": ["http", "www", ".com", "href", "//"], |
738 | 738 | "cond_remove_long_words": True, |
739 | | - "length_word_max_cutoff": 30, |
| 739 | + "length_word_max_cutoff": 20, |
740 | 740 | "cond_check_number_words": True, |
741 | 741 | "tokenization": False, |
742 | 742 | "strip_characters": special_characters_default, |
743 | | - "number_words_min_cutoff": 1, |
| 743 | + "number_words_min_cutoff": 25, |
744 | 744 | "number_words_max_cutoff": 100000, |
745 | 745 | "cond_check_character_repetition_removal": True, |
746 | 746 | "character_repetition_length": 10, |
747 | | - "character_repetition_max_cutoff": 0.106, |
| 747 | + "character_repetition_max_cutoff": 0.19, |
748 | 748 | "cond_check_word_repetition_removal": True, |
749 | | - "word_repetition_length": 5, |
750 | | - "word_repetition_max_cutoff": 0.19, |
| 749 | + "word_repetition_length": 7, |
| 750 | + "word_repetition_max_cutoff": 0.5, |
751 | 751 | "cond_check_special_characters": True, |
752 | 752 | "special_characters": special_characters_default, |
753 | | - "special_characters_max_cutoff": 0.4, |
| 753 | + "special_characters_max_cutoff": 0.25, |
754 | 754 | "cond_words_augmentation": False, |
755 | 755 | "words_augmentation_group_sizes": [], |
756 | 756 | "words_augmentation_join_char": "", |
757 | 757 | "cond_check_stopwords": True, |
758 | | - "stopwords_min_cutoff": 0, |
| 758 | + "stopwords_min_cutoff": 0.01, |
759 | 759 | "cond_check_flagged_words": True, |
760 | | - "flagged_words_max_cutoff": 0.2, |
| 760 | + "flagged_words_max_cutoff": 0.025, |
761 | 761 | "cond_check_lang_id": True, |
762 | | - "lang_id_min_cutoff": 0.75, |
| 762 | + "lang_id_min_cutoff": 0.90, |
763 | 763 | "cond_check_perplexity": True, |
764 | | - "perplexity_max_cutoff": 3000000, |
| 764 | + "perplexity_max_cutoff": 1495, |
765 | 765 | } |
766 | 766 |
|
767 | 767 | parameters_filtering_vi = { |
|
0 commit comments