Skip to content

Commit d96d2b2

Browse files
authored
[tn] english tn, support range (#233)
* [tn] english tn, support range * [tn] english tn, support range * [tn] english tn, support range
1 parent 13fc9a3 commit d96d2b2

6 files changed

Lines changed: 170 additions & 3 deletions

File tree

tn/english/normalizer.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from tn.english.rules.electronic import Electronic
2828
from tn.english.rules.whitelist import WhiteList
2929
from tn.english.rules.punctuation import Punctuation
30+
from tn.english.rules.range import Range
3031

3132
from pynini.lib.pynutil import add_weight, delete
3233
from importlib_resources import files
@@ -54,14 +55,16 @@ def build_tagger(self):
5455
word = add_weight(Word().tagger, 100)
5556
whitelist = add_weight(WhiteList().tagger, 1.00)
5657
punct = add_weight(Punctuation().tagger, 2.00)
58+
rang = add_weight(Range().tagger, 1.01)
5759
# TODO(xcsong): add roman
5860
tagger = punct.star + \
5961
(cardinal | ordinal | word
6062
| date | decimal | fraction
6163
| time | measure | money
6264
| telephone | electronic
6365
| whitelist
64-
| punct).optimize() + (punct.plus | self.DELETE_SPACE)
66+
| punct
67+
| rang).optimize() + (punct.plus | self.DELETE_SPACE)
6568
# delete the last space
6669
self.tagger = tagger.star @ self.build_rule(delete(' '), r='[EOS]')
6770

@@ -79,6 +82,7 @@ def build_verbalizer(self):
7982
electronic = Electronic().verbalizer
8083
whitelist = WhiteList().verbalizer
8184
punct = Punctuation().verbalizer
85+
rang = Range().verbalizer
8286
verbalizer = \
8387
(cardinal | ordinal | word
8488
| date | decimal
@@ -87,6 +91,7 @@ def build_verbalizer(self):
8791
| telephone
8892
| electronic
8993
| whitelist
90-
| punct).optimize() + punct.ques + self.INSERT_SPACE
94+
| punct
95+
| rang).optimize() + punct.ques + self.INSERT_SPACE
9196
self.verbalizer = verbalizer.star @ self.build_rule(delete(' '),
9297
r='[EOS]')

tn/english/rules/range.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2+
# Copyright (c) 2024, WENET COMMUNITY. Xingchen Song (sxc19@tsinghua.org.cn).
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
import pynini
17+
from pynini.lib import pynutil
18+
19+
from tn.processor import Processor
20+
from tn.english.rules.cardinal import Cardinal
21+
from tn.english.rules.time import Time
22+
from tn.english.rules.date import Date
23+
24+
25+
class Range(Processor):
    """English text-normalization rule for ranges and simple arithmetic.

    The tagger rewrites written forms such as ``2-3`` into their spoken
    form (``two to three``) and stores the result in a ``value`` field.
    """

    # NOTE(review): the nesting of the non-deterministic rules below was
    # reconstructed from syntax; confirm against the repository file.

    def __init__(self, deterministic: bool = False):
        """
        Args:
            deterministic: forwarded to the Time and Date sub-rules; when
                False, the additional cardinal-range, multiplication,
                division, "No." and percent rules are added to the tagger
                as well (used for audio-based normalization).
        """
        super().__init__('range', ordertype="en_tn")
        self.deterministic = deterministic
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """
        Finite state transducer for tagging ranges, e.g. the written
        form "2-3" is rewritten to "two to three" and emitted as
            value: "two to three"
        before being passed through ``self.add_tokens``.
        """
        # The cardinal graph is always taken from a deterministic Cardinal.
        number = Cardinal(deterministic=True).graph_with_and

        # Time and Date are composed tagger-then-verbalizer.
        time_rule = Time(deterministic=self.deterministic)
        spoken_time = time_rule.tagger @ time_rule.verbalizer
        date_rule = Date(deterministic=self.deterministic)
        spoken_date = date_rule.tagger @ date_rule.verbalizer

        # At most one space may be dropped around a separator.
        optional_space = pynini.closure(pynutil.delete(" "), 0, 1)
        approximately = pynini.cross("~", "approximately")

        combined = self._time_graph(spoken_time, optional_space,
                                    approximately)
        year_to_year, mid_year = self._year_graphs(spoken_date, number,
                                                   optional_space)
        combined |= year_to_year
        combined |= mid_year
        combined |= self._arithmetic_graph(number, optional_space,
                                           approximately)
        self.graph = combined

        quoted_value = (pynutil.insert("value: \"") + self.graph +
                        pynutil.insert("\""))
        self.tagger = self.add_tokens(quoted_value)

    def _time_graph(self, spoken_time, optional_space, approximately):
        """Return the "<time>-<time>" and "~<time>" graph."""
        time_to_time = (spoken_time + optional_space +
                        pynini.cross("-", " to ") + optional_space +
                        spoken_time)
        return time_to_time | (approximately + spoken_time)

    def _year_graphs(self, spoken_date, number, optional_space):
        """Return the (year-to-year, "mid-<year>") pair of graphs."""
        plural_s = pynini.closure(pynini.accep("s"), 0, 1)
        four_digit_year = (self.DIGIT**4 + plural_s) @ spoken_date
        plural_s = pynini.closure(pynini.accep("s"), 0, 1)
        two_digit_year = (self.DIGIT**2 + plural_s) @ spoken_date

        # The right-hand side may also be a plain two-digit cardinal.
        right_hand_side = (four_digit_year | two_digit_year |
                           (self.DIGIT**2 @ number))
        year_to_year = (four_digit_year + optional_space +
                        pynini.cross("-", " to ") + optional_space +
                        right_hand_side)

        mid_year = (pynini.accep("mid") + pynini.cross("-", " ") +
                    (four_digit_year | two_digit_year))
        return year_to_year, mid_year

    def _arithmetic_graph(self, number, optional_space, approximately):
        """Return the cardinal-based graph (sums, "~N", "N...M", and more).

        The rules after the guard clause are only added when
        ``self.deterministic`` is False.
        """
        # ADDITION: "1+2" and "1 + 2", chained one or more times.
        graph = number + pynini.closure(
            pynini.cross("+", " plus ") + number, 1)
        graph |= number + pynini.closure(
            pynini.cross(" + ", " plus ") + number, 1)
        graph |= approximately + number
        ellipsis = pynini.cross("...", " ... ") | pynini.accep(" ... ")
        graph |= number + ellipsis + number

        if self.deterministic:
            return graph

        # "2-3" may be read as either "to" or "minus"; "2:3" reads as "to".
        dash_range = (number + optional_space +
                      pynini.cross("-", pynini.union(" to ", " minus ")) +
                      optional_space + number)
        colon_range = (number + optional_space + pynini.cross(":", " to ") +
                       optional_space + number)
        graph |= dash_range | colon_range

        # MULTIPLY: "3x4" / "3 x 4" -> "by" or "times".
        for sign in (" x ", "x"):
            by_or_times = pynini.cross(sign, pynini.union(" by ", " times "))
            graph |= number + by_or_times + number

        for suffix in (" x", "x"):
            # 40x -> "40 times"
            graph |= number + pynini.cross(suffix, " times")

            # 5x to 7x -> five to seven x/times
            x_or_times = pynini.cross(suffix, pynini.union(" x", " times"))
            graph |= (number + pynutil.delete(suffix) +
                      pynini.union(" to ", "-", " - ") + number + x_or_times)

        for sign in ("*", " * "):
            graph |= number + pynini.closure(
                pynini.cross(sign, " times ") + number, 1)

        # "No"/"NO"/"no" followed by a cardinal -> "Number"/"number".
        # The optional ". " or " " separator is accepted unchanged.
        number_word = (pynini.cross(pynini.union("NO", "No"), "Number")
                       | pynini.cross("no", "number"))
        separator = pynini.closure(pynini.union(". ", " "), 0, 1)
        graph |= (number_word + separator + number)

        for sign in ("/", " / "):
            graph |= number + pynini.closure(
                pynini.cross(sign, " divided by ") + number, 1)

        # 10% to 20% -> ten to twenty percent
        first_percent = pynini.closure(
            pynini.cross("%", " percent") | pynutil.delete("%"), 0, 1)
        graph |= (number + first_percent +
                  pynini.union(" to ", "-", " - ") + number +
                  pynini.cross("%", " percent"))
        return graph
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. ¾ people like chattts, let's eat at 03:43 p.m. run 10 km, give me $12.345 please, call 123-123-5678-1 Mt Hill "HAHAHA" billion 4 March => this is twelfth game, number two hundred and fifty six, the sixth of may twenty twenty four, the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts, let' s eat at three forty three PM run ten kilometers, give me twelve point three four five dollars please, call one two three, one two three, five six seven eight, one Mt Hill" HAHAHA" billion the fourth of march
22
The National Map, accessed April 1, 2011" Site Description of Koppers Co. From the quartet's recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War II. 1988 (1988) ( 1988) ( 1988). Starling, Arthur E.( 1988 ). this is 12th game, number 256, 2024-05-06, 2021-03-07 31.990 billion. 3/4 people like chattts Retrieved December 2011. Information on Album" Thepodule.com"" Biography by Amy Hanson". => The National Map, accessed the first of april , twenty eleven" Site Description of Koppers company From the quartet' s recording" Jefferson Friedman: Quartets,"" String Quartet no, Riots again broke out, Atassi resigned, and Syrian independence was deferred until after World War two nineteen eighty eight( nineteen eighty eight )( nineteen eighty eight )( nineteen eighty eight). Starling, Arthur E.( nineteen eighty eight). this is twelfth game, number two fifty six, the sixth of may twenty twenty four, the seventh of march twenty twenty one thirty one point nine nine oh billion. three quarters people like chattts Retrieved december twenty eleven. Information on Album" Thepodule dot com"" Biography by Amy Hanson".
33
.345" and ".456" "9.456" or 6.7890" => point three four five" and". four hundred and fifty six" " nine point four five six" or six point seven eight nine oh"
4+
The museum is open Mon.-Sun. children of 3-4 years 123 The plan will help you lose 3-4 pounds the first week, and 1-2 pounds the weeks thereafter. => The museum is open Monday.- Sunday. children of three to four years one hundred and twenty three The plan will help you lose three to four pounds the first week, and one to two pounds the weeks thereafter.

tn/english/test/data/range.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
2-3 => two to three

tn/english/test/range_test.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import pytest
16+
17+
from tn.english.rules.range import Range
18+
from tn.english.test.utils import parse_test_case
19+
20+
21+
class TestRange:
    """Data-driven test for the English ``Range`` rule."""

    # Both attributes are evaluated once, when the class body runs, and are
    # shared by every parametrized case.
    range = Range(deterministic=False)
    range_cases = parse_test_case('data/range.txt')

    @pytest.mark.parametrize(("written", "spoken"), range_cases)
    def test_range(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.range.normalize(written)
        assert normalized == spoken

tn/processor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def build_fst(self, prefix, cache_dir, overwrite_cache):
101101
self.verbalizer.optimize().write(verbalizer_path)
102102
logging.info("done")
103103
logging.info("fst path: {}".format(tagger_path))
104-
logging.info(" {}".format(tagger_path))
104+
logging.info(" {}".format(verbalizer_path))
105105

106106
def tag(self, input):
107107
if len(input) == 0:

0 commit comments

Comments
 (0)