Skip to content

Commit c6a91d9

Browse files
committed
Updated parameters for Hindi
1 parent a590896 commit c6a91d9

1 file changed

Lines changed: 8 additions & 8 deletions

File tree

ac_dc/parameters_filtering.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -434,28 +434,28 @@
434434
"cond_check_number_words": True,
435435
"tokenization": False,
436436
"strip_characters": special_characters_default,
437-
"number_words_min_cutoff": 1,
437+
"number_words_min_cutoff": 38,
438438
"number_words_max_cutoff": 100000,
439439
"cond_check_character_repetition_removal": True,
440440
"character_repetition_length": 10,
441-
"character_repetition_max_cutoff": 0.106,
441+
"character_repetition_max_cutoff": 0.18,
442442
"cond_check_word_repetition_removal": True,
443443
"word_repetition_length": 5,
444-
"word_repetition_max_cutoff": 0.19,
444+
"word_repetition_max_cutoff": 0.47,
445445
"cond_check_special_characters": True,
446446
"special_characters": special_characters_default,
447-
"special_characters_max_cutoff": 0.35,
447+
"special_characters_max_cutoff": 0.45,
448448
"cond_words_augmentation": False,
449449
"words_augmentation_group_sizes": [],
450450
"words_augmentation_join_char": "",
451451
"cond_check_stopwords": True,
452-
"stopwords_min_cutoff": 0,
452+
"stopwords_min_cutoff": 0.01,
453453
"cond_check_flagged_words": True,
454-
"flagged_words_max_cutoff": 0.2,
454+
"flagged_words_max_cutoff": 0.005,
455455
"cond_check_lang_id": True,
456-
"lang_id_min_cutoff": 0.75,
456+
"lang_id_min_cutoff": 0.90,
457457
"cond_check_perplexity": True,
458-
"perplexity_max_cutoff": 600000,
458+
"perplexity_max_cutoff": 1517,
459459
}
460460

461461
parameters_filtering_id = {

0 commit comments

Comments (0)