Skip to content

Commit cb05d64

Browse files
committed
Update parameters for Chinese
1 parent 9e9c65d commit cb05d64

2 files changed

Lines changed: 9 additions & 10 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -843,27 +843,27 @@
843843
"tokenization": True,
844844
"strip_characters": special_characters_default,
845845
"number_words_min_cutoff": 1,
846-
"number_words_max_cutoff": 100000,
846+
"number_words_max_cutoff": 1000000,
847847
"cond_check_character_repetition_removal": True,
848-
"character_repetition_length": 10,
849-
"character_repetition_max_cutoff": 0.106,
848+
"character_repetition_length": 3,
849+
"character_repetition_max_cutoff": 0.2,
850850
"cond_check_word_repetition_removal": True,
851851
"word_repetition_length": 5,
852-
"word_repetition_max_cutoff": 0.19,
852+
"word_repetition_max_cutoff": 0.96,
853853
"cond_check_special_characters": True,
854854
"special_characters": special_characters_default,
855-
"special_characters_max_cutoff": 0.4,
855+
"special_characters_max_cutoff": 0.3,
856856
"cond_words_augmentation": True,
857857
"words_augmentation_group_sizes": [2],
858858
"words_augmentation_join_char": "",
859859
"cond_check_stopwords": True,
860-
"stopwords_min_cutoff": 0,
860+
"stopwords_min_cutoff": 0.1691,
861861
"cond_check_flagged_words": True,
862-
"flagged_words_max_cutoff": 0.2,
862+
"flagged_words_max_cutoff": 0.001,
863863
"cond_check_lang_id": True,
864-
"lang_id_min_cutoff": 0.75,
864+
"lang_id_min_cutoff": 0.8,
865865
"cond_check_perplexity": False,
866-
"perplexity_max_cutoff": 3000000,
866+
"perplexity_max_cutoff": 2095,
867867
}
868868

869869
parameters_filtering = {

ac_dc/stopwords.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5602,7 +5602,6 @@
56025602
"ọ̀pọ̀lọpọ̀",
56035603
],
56045604
"zh": [
5605-
"",
56065605
"一",
56075606
"一争",
56085607
"一些",

0 commit comments

Comments
 (0)