Skip to content

Commit 4d81fa3

Browse files
committed
Update parameters for Urdu
1 parent c6a91d9 commit 4d81fa3

1 file changed

Lines changed: 10 additions & 10 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -736,32 +736,32 @@
736736
"cond_remove_words_with_incorrect_substrings": True,
737737
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
738738
"cond_remove_long_words": True,
739-
"length_word_max_cutoff": 30,
739+
"length_word_max_cutoff": 20,
740740
"cond_check_number_words": True,
741741
"tokenization": False,
742742
"strip_characters": special_characters_default,
743-
"number_words_min_cutoff": 1,
743+
"number_words_min_cutoff": 25,
744744
"number_words_max_cutoff": 100000,
745745
"cond_check_character_repetition_removal": True,
746746
"character_repetition_length": 10,
747-
"character_repetition_max_cutoff": 0.106,
747+
"character_repetition_max_cutoff": 0.19,
748748
"cond_check_word_repetition_removal": True,
749-
"word_repetition_length": 5,
750-
"word_repetition_max_cutoff": 0.19,
749+
"word_repetition_length": 7,
750+
"word_repetition_max_cutoff": 0.5,
751751
"cond_check_special_characters": True,
752752
"special_characters": special_characters_default,
753-
"special_characters_max_cutoff": 0.4,
753+
"special_characters_max_cutoff": 0.25,
754754
"cond_words_augmentation": False,
755755
"words_augmentation_group_sizes": [],
756756
"words_augmentation_join_char": "",
757757
"cond_check_stopwords": True,
758-
"stopwords_min_cutoff": 0,
758+
"stopwords_min_cutoff": 0.01,
759759
"cond_check_flagged_words": True,
760-
"flagged_words_max_cutoff": 0.2,
760+
"flagged_words_max_cutoff": 0.025,
761761
"cond_check_lang_id": True,
762-
"lang_id_min_cutoff": 0.75,
762+
"lang_id_min_cutoff": 0.90,
763763
"cond_check_perplexity": True,
764-
"perplexity_max_cutoff": 3000000,
764+
"perplexity_max_cutoff": 1495,
765765
}
766766

767767
parameters_filtering_vi = {

0 commit comments

Comments (0)