Skip to content

Commit 3fc7bb6

Browse files
authored
[runtime] support english tn (#219)
1 parent 9d6bb56 commit 3fc7bb6

3 files changed

Lines changed: 25 additions & 16 deletions

File tree

runtime/processor/wetext_processor.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@ Processor::Processor(const std::string& tagger_path,
2424
compiler_ = std::make_shared<StringCompiler<StdArc>>(StringTokenType::BYTE);
2525
printer_ = std::make_shared<StringPrinter<StdArc>>(StringTokenType::BYTE);
2626

27-
if (tagger_path.find("_tn_") != tagger_path.npos) {
28-
parse_type_ = ParseType::kTN;
29-
} else if (tagger_path.find("_itn_") != tagger_path.npos) {
30-
parse_type_ = ParseType::kITN;
27+
if (tagger_path.find("zh_tn_") != tagger_path.npos) {
28+
parse_type_ = ParseType::kZH_TN;
29+
} else if (tagger_path.find("zh_itn_") != tagger_path.npos) {
30+
parse_type_ = ParseType::kZH_ITN;
31+
} else if (tagger_path.find("en_tn_") != tagger_path.npos) {
32+
parse_type_ = ParseType::kEN_TN;
3133
} else {
3234
LOG(FATAL) << "Invalid fst prefix, prefix should contain"
3335
<< " one of \"zh_tn_\", \"zh_itn_\" or \"en_tn_\".";

runtime/processor/wetext_token_parser.cc

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,24 +26,30 @@ const std::set<std::string> ASCII_LETTERS = {
2626
"o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "A", "B",
2727
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
2828
"Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "_"};
29-
const std::unordered_map<std::string, std::vector<std::string>> TN_ORDERS = {
29+
const std::unordered_map<std::string, std::vector<std::string>> ZH_TN_ORDERS = {
3030
{"date", {"year", "month", "day"}},
3131
{"fraction", {"denominator", "numerator"}},
3232
{"measure", {"denominator", "numerator", "value"}},
3333
{"money", {"value", "currency"}},
3434
{"time", {"noon", "hour", "minute", "second"}}};
35-
const std::unordered_map<std::string, std::vector<std::string>> ITN_ORDERS = {
36-
{"date", {"year", "month", "day"}},
37-
{"fraction", {"sign", "numerator", "denominator"}},
38-
{"measure", {"numerator", "denominator", "value"}},
39-
{"money", {"currency", "value", "decimal"}},
40-
{"time", {"hour", "minute", "second", "noon"}}};
35+
const std::unordered_map<std::string, std::vector<std::string>> EN_TN_ORDERS = {
36+
{"date", {"preserve_order", "text", "day", "month", "year"}}};
37+
const std::unordered_map<std::string, std::vector<std::string>> ZH_ITN_ORDERS =
38+
{{"date", {"year", "month", "day"}},
39+
{"fraction", {"sign", "numerator", "denominator"}},
40+
{"measure", {"numerator", "denominator", "value"}},
41+
{"money", {"currency", "value", "decimal"}},
42+
{"time", {"hour", "minute", "second", "noon"}}};
4143

4244
TokenParser::TokenParser(ParseType type) {
43-
if (type == ParseType::kTN) {
44-
orders_ = TN_ORDERS;
45+
if (type == ParseType::kZH_TN) {
46+
orders_ = ZH_TN_ORDERS;
47+
} else if (type == ParseType::kZH_ITN) {
48+
orders_ = ZH_ITN_ORDERS;  // ITN must use the inverse-normalization field order, not the TN one
49+
} else if (type == ParseType::kEN_TN) {
50+
orders_ = EN_TN_ORDERS;
4551
} else {
46-
orders_ = ITN_ORDERS;
52+
orders_ = ZH_ITN_ORDERS;
4753
}
4854
}
4955

runtime/processor/wetext_token_parser.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,9 @@ struct Token {
6060
};
6161

6262
enum ParseType {
63-
kTN = 0x00, // Text Normalization
64-
kITN = 0x01 // Inverse Text Normalization
63+
kZH_TN = 0x00, // Chinese Text Normalization
64+
kZH_ITN = 0x01, // Chinese Inverse Text Normalization
65+
kEN_TN = 0x02 // English Text Normalization
6566
};
6667

6768
class TokenParser {

0 commit comments

Comments
 (0)