Skip to content

Commit 8e91629

Browse files
committed
Update parameters for Vietnamese
1 parent 0567dad commit 8e91629

1 file changed

Lines changed: 22 additions & 21 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 22 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@
4646
"flagged_words_max_cutoff": 0.1,
4747
"cond_check_lang_id": True,
4848
"lang_id_min_cutoff": 0.70,
49-
"cond_check_perplexity": False,
49+
"cond_check_perplexity": True,
5050
"perplexity_max_cutoff": 10000,
5151
}
5252

@@ -148,7 +148,7 @@
148148
"flagged_words_max_cutoff": 0.2,
149149
"cond_check_lang_id": True,
150150
"lang_id_min_cutoff": 0.75,
151-
"cond_check_perplexity": False,
151+
"cond_check_perplexity": True,
152152
"perplexity_max_cutoff": 3000000,
153153
}
154154

@@ -182,7 +182,7 @@
182182
"flagged_words_max_cutoff": 0.2,
183183
"cond_check_lang_id": True,
184184
"lang_id_min_cutoff": 0.75,
185-
"cond_check_perplexity": False,
185+
"cond_check_perplexity": True,
186186
"perplexity_max_cutoff": 3000000,
187187
}
188188

@@ -216,7 +216,7 @@
216216
"flagged_words_max_cutoff": 0.2,
217217
"cond_check_lang_id": True,
218218
"lang_id_min_cutoff": 0.75,
219-
"cond_check_perplexity": False,
219+
"cond_check_perplexity": True,
220220
"perplexity_max_cutoff": 575000,
221221
}
222222

@@ -352,7 +352,7 @@
352352
"flagged_words_max_cutoff": 0.2,
353353
"cond_check_lang_id": True,
354354
"lang_id_min_cutoff": 0.75,
355-
"cond_check_perplexity": False,
355+
"cond_check_perplexity": True,
356356
"perplexity_max_cutoff": 3000000,
357357
}
358358

@@ -658,7 +658,7 @@
658658
"flagged_words_max_cutoff": 0.2,
659659
"cond_check_lang_id": True,
660660
"lang_id_min_cutoff": 0.75,
661-
"cond_check_perplexity": False,
661+
"cond_check_perplexity": True,
662662
"perplexity_max_cutoff": 3000000,
663663
}
664664

@@ -692,7 +692,7 @@
692692
"flagged_words_max_cutoff": 0.2,
693693
"cond_check_lang_id": True,
694694
"lang_id_min_cutoff": 0.75,
695-
"cond_check_perplexity": False,
695+
"cond_check_perplexity": True,
696696
"perplexity_max_cutoff": 3000000,
697697
}
698698

@@ -726,7 +726,7 @@
726726
"flagged_words_max_cutoff": 0.2,
727727
"cond_check_lang_id": True,
728728
"lang_id_min_cutoff": 0.75,
729-
"cond_check_perplexity": False,
729+
"cond_check_perplexity": True,
730730
"perplexity_max_cutoff": 3000000,
731731
}
732732

@@ -760,7 +760,7 @@
760760
"flagged_words_max_cutoff": 0.2,
761761
"cond_check_lang_id": True,
762762
"lang_id_min_cutoff": 0.75,
763-
"cond_check_perplexity": False,
763+
"cond_check_perplexity": True,
764764
"perplexity_max_cutoff": 3000000,
765765
}
766766

@@ -770,32 +770,32 @@
770770
"cond_remove_words_with_incorrect_substrings": True,
771771
"incorrect_word_substrings": ["http", "www", ".com", "href", "//"],
772772
"cond_remove_long_words": True,
773-
"length_word_max_cutoff": 30,
773+
"length_word_max_cutoff": 25,
774774
"cond_check_number_words": True,
775775
"tokenization": False,
776776
"strip_characters": special_characters_default,
777-
"number_words_min_cutoff": 1,
777+
"number_words_min_cutoff": 30,
778778
"number_words_max_cutoff": 100000,
779779
"cond_check_character_repetition_removal": True,
780780
"character_repetition_length": 10,
781-
"character_repetition_max_cutoff": 0.106,
781+
"character_repetition_max_cutoff": 0.15,
782782
"cond_check_word_repetition_removal": True,
783783
"word_repetition_length": 5,
784-
"word_repetition_max_cutoff": 0.19,
784+
"word_repetition_max_cutoff": 0.20,
785785
"cond_check_special_characters": True,
786786
"special_characters": special_characters_default,
787-
"special_characters_max_cutoff": 0.35,
787+
"special_characters_max_cutoff": 0.34,
788788
"cond_words_augmentation": True,
789789
"words_augmentation_group_sizes": [2],
790790
"words_augmentation_join_char": " ",
791791
"cond_check_stopwords": True,
792-
"stopwords_min_cutoff": 0,
792+
"stopwords_min_cutoff": 0.08,
793793
"cond_check_flagged_words": True,
794-
"flagged_words_max_cutoff": 0.2,
794+
"flagged_words_max_cutoff": 0.005,
795795
"cond_check_lang_id": True,
796-
"lang_id_min_cutoff": 0.75,
797-
"cond_check_perplexity": False,
798-
"perplexity_max_cutoff": 3000000,
796+
"lang_id_min_cutoff": 0.90,
797+
"cond_check_perplexity": True,
798+
"perplexity_max_cutoff": 1600,
799799
}
800800

801801
parameters_filtering_yo = {
@@ -828,7 +828,7 @@
828828
"flagged_words_max_cutoff": 0.2,
829829
"cond_check_lang_id": True,
830830
"lang_id_min_cutoff": 0.75,
831-
"cond_check_perplexity": False,
831+
"cond_check_perplexity": True,
832832
"perplexity_max_cutoff": 3000000,
833833
}
834834

@@ -862,12 +862,13 @@
862862
"flagged_words_max_cutoff": 0.001,
863863
"cond_check_lang_id": True,
864864
"lang_id_min_cutoff": 0.85,
865-
"cond_check_perplexity": False,
865+
"cond_check_perplexity": True,
866866
"perplexity_max_cutoff": 2095,
867867
}
868868

869869
parameters_filtering = {
870870
"default": parameters_filtering_default,
871+
"af": parameters_filtering_af,
871872
"ar": parameters_filtering_ar,
872873
"arz": parameters_filtering_arz,
873874
"as": parameters_filtering_as,

0 commit comments

Comments
 (0)