NVIDIA · shrpawar-alt · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026 · Jun 23, 2026
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -28,7 +28,7 @@ pipeline {
     MR_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/03-12-24-1'
     JA_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/10-17-24-1'
     KO_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/04-23-26-0'
-    HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-17-26-0'
+    HI_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-25-26-0'
     DEFAULT_TN_CACHE='/home/jenkins/TestData/text_norm/ci/grammars/06-08-23-0'
   }
   stages {

diff --git a/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv b/nemo_text_processing/text_normalization/hi/data/ordinal/suffixes_map.tsv
@@ -1,2 +1 @@
-वे	वें
-
+वे	वें
diff --git a/nemo_text_processing/text_normalization/hi/data/roman/__init__.py b/nemo_text_processing/text_normalization/hi/data/roman/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/hi/data/roman/roman_ordinal_exceptions.tsv b/nemo_text_processing/text_normalization/hi/data/roman/roman_ordinal_exceptions.tsv
@@ -0,0 +1,10 @@
+Iला	पहला
+Iली	पहली
+IIरा	दूसरा
+IIरी	दूसरी
+IIIरा	तीसरा
+IIIरी	तीसरी
+IVथा	चौथा
+IVथी	चौथी
+VIठा	छठा
+VIठी	छठी
diff --git a/nemo_text_processing/text_normalization/hi/data/roman/roman_to_spoken.tsv b/nemo_text_processing/text_normalization/hi/data/roman/roman_to_spoken.tsv
@@ -0,0 +1,100 @@
+I	एक
+II	दो
+III	तीन
+IV	चार
+V	पाँच
+VI	छह
+VII	सात
+VIII	आठ
+IX	नौ
+X	दस
+XI	ग्यारह
+XII	बारह
+XIII	तेरह
+XIV	चौदह
+XV	पंद्रह
+XVI	सोलह
+XVII	सत्रह
+XVIII	अठारह
+XIX	उन्नीस
+XX	बीस
+XXI	इक्कीस
+XXII	बाईस
+XXIII	तेईस
+XXIV	चौबीस
+XXV	पच्चीस
+XXVI	छब्बीस
+XXVII	सत्ताईस
+XXVIII	अट्ठाईस
+XXIX	उनतीस
+XXX	तीस
+XXXI	इकतीस
+XXXII	बत्तीस
+XXXIII	तैंतीस
+XXXIV	चौंतीस
+XXXV	पैंतीस
+XXXVI	छत्तीस
+XXXVII	सैंतीस
+XXXVIII	अड़तीस
+XXXIX	उनतालीस
+XL	चालीस
+XLI	इकतालीस
+XLII	बयालीस
+XLIII	तैंतालीस
+XLIV	चौंतालीस
+XLV	पैंतालीस
+XLVI	छियालीस
+XLVII	सैंतालीस
+XLVIII	अड़तालीस
+XLIX	उनचास
+L	पचास
+LI	इक्यावन
+LII	बावन
+LIII	तिरपन
+LIV	चौवन
+LV	पचपन
+LVI	छप्पन
+LVII	सत्तावन
+LVIII	अट्ठावन
+LIX	उनसठ
+LX	साठ
+LXI	इकसठ
+LXII	बासठ
+LXIII	तिरसठ
+LXIV	चौंसठ
+LXV	पैंसठ
+LXVI	छियासठ
+LXVII	सड़सठ
+LXVIII	अड़सठ
+LXIX	उनहत्तर
+LXX	सत्तर
+LXXI	इकहत्तर
+LXXII	बहत्तर
+LXXIII	तिहत्तर
+LXXIV	चौहत्तर
+LXXV	पचहत्तर
+LXXVI	छिहत्तर
+LXXVII	सतहत्तर
+LXXVIII	अठहत्तर
+LXXIX	उनासी
+LXXX	अस्सी
+LXXXI	इक्यासी
+LXXXII	बयासी
+LXXXIII	तिरासी
+LXXXIV	चौरासी
+LXXXV	पचासी
+LXXXVI	छियासी
+LXXXVII	सत्तासी
+LXXXVIII	अट्ठासी
+LXXXIX	नवासी
+XC	नब्बे
+XCI	इक्यानवे
+XCII	बानवे
+XCIII	तिरानवे
+XCIV	चौरानवे
+XCV	पचानवे
+XCVI	छियानवे
+XCVII	सत्तानवे
+XCVIII	अट्ठानवे
+XCIX	निन्यानवे
+C	एक सौ
diff --git a/nemo_text_processing/text_normalization/hi/data/serial/__init__.py b/nemo_text_processing/text_normalization/hi/data/serial/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo_text_processing/text_normalization/hi/taggers/roman.py b/nemo_text_processing/text_normalization/hi/taggers/roman.py
@@ -0,0 +1,138 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pynini
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, convert_space, insert_space
+from nemo_text_processing.text_normalization.hi.utils import get_abs_path, load_labels
+
+
+class RomanFst(GraphFst):
+    """Finite state transducer for classifying Roman numerals (I through C) in Hindi text.
+    Every analysis starts with `preserve_order: true`; the examples below omit it for brevity.
+        e.g. भास्कर-II    -> tokens { roman { key_cardinal: "भास्कर" integer: "II" } }
+        e.g. कक्षा XII    -> tokens { roman { key_cardinal: "कक्षा" integer: "XII" } }
+        e.g. XIIवीं कक्षा -> tokens { roman { integer: "XII" default_ordinal: "बारहवीं" key_cardinal: "कक्षा" } }
+        e.g. IVथी कक्षा   -> tokens { roman { integer: "IV" default_ordinal: "चौथी" key_cardinal: "कक्षा" } }
+
+    Args:
+        deterministic: forwarded to GraphFst unchanged; this tagger builds the same graph for either value
+            (by the original note: True = single transduction, False = multiple, for audio-based normalization)
+    """
+
+    def __init__(self, deterministic: bool = True):
+        super().__init__(name="roman", kind="classify", deterministic=deterministic)
+        # Roman numeral -> spoken Hindi cardinal, one row per numeral (I..C in the bundled TSV).
+        roman_graph = pynini.string_file(get_abs_path("data/roman/roman_to_spoken.tsv")).optimize()
+        roman_numeral_only = pynini.project(roman_graph, "input").optimize()  # accepts the numerals only
+        # Key-phrase alphabet: input side of data/serial/chars.tsv (presumably Devanagari; file not visible here).
+        devanagari_chars = pynini.project(
+            pynini.string_file(get_abs_path("data/serial/chars.tsv")), "input"
+        ).optimize()
+        # A word is one or more of those characters.
+        devanagari_word = pynini.closure(devanagari_chars, 1).optimize()
+        # A phrase is one or more words, consecutive words joined by exactly one space or hyphen.
+        devanagari_phrase = (
+            devanagari_word + pynini.closure((pynini.accep(" ") | pynini.accep("-")) + devanagari_word)
+        ).optimize()
+        # Single space or hyphen allowed between the key phrase and the numeral; it is deleted below.
+        separator = (pynini.accep("-") | pynini.accep(" ")).optimize()
+        # "<phrase><sep><numeral>": emit key_cardinal first, then integer (the raw numeral, not its spoken form).
+        key_before_numeral = (
+            pynutil.insert("preserve_order: true ")
+            + pynutil.insert('key_cardinal: "')
+            + convert_space(devanagari_phrase)  # graph_utils helper; presumably rewrites spaces - confirm
+            + pynutil.insert('"')
+            + pynutil.delete(separator)
+            + insert_space
+            + pynutil.insert('integer: "')
+            + roman_numeral_only
+            + pynutil.insert('"')
+        ).optimize()
+        # "<numeral><sep><phrase>": same two fields emitted in the opposite order.
+        numeral_before_key = (
+            pynutil.insert("preserve_order: true ")
+            + pynutil.insert('integer: "')
+            + roman_numeral_only
+            + pynutil.insert('"')
+            + pynutil.delete(separator)
+            + insert_space
+            + pynutil.insert('key_cardinal: "')
+            + convert_space(devanagari_phrase)
+            + pynutil.insert('"')
+        ).optimize()
+        # Longest numerals first, so the prefix search below resolves "VIठा" to "VI" rather than "V".
+        roman_rows = load_labels(get_abs_path("data/roman/roman_to_spoken.tsv"))
+        numerals_by_len_desc = sorted((n for n, _ in roman_rows), key=len, reverse=True)
+        # Irregular fused ordinals (e.g. "IVथा" -> "चौथा"); the set lets the regular loop skip them.
+        exception_rows = load_labels(get_abs_path("data/roman/roman_ordinal_exceptions.tsv"))
+        exception_fused_set = {fused for fused, _ in exception_rows}
+        # Ordinal suffix rows from both files; a row is either (suffix,) or (written suffix, spoken suffix).
+        suffix_rows_raw = load_labels(get_abs_path("data/ordinal/suffixes.tsv")) + load_labels(
+            get_abs_path("data/ordinal/suffixes_map.tsv")
+        )
+        # One path per exception row: delete the fused text, emit integer + the listed spoken ordinal.
+        exception_graphs = []
+        for fused, spoken_word in exception_rows:  # next() raises StopIteration if no numeral is a prefix
+            matched_numeral = next(c for c in numerals_by_len_desc if fused.startswith(c))
+            exception_graphs.append(
+                pynutil.insert('integer: "' + matched_numeral + '"')
+                + insert_space
+                + pynutil.insert('default_ordinal: "' + spoken_word + '"')
+                + pynutil.delete(fused)
+            )
+        glued_ordinal_exceptions_graph = pynini.union(*exception_graphs).optimize()
+        # Regular ordinals: every numeral x every suffix, spoken form = spoken cardinal + spoken suffix.
+        regular_row_graphs = []
+        for numeral, spoken in roman_rows:
+            for row in suffix_rows_raw:
+                # Single-column rows reuse the suffix itself as the spoken suffix.
+                suffix_input = row[0]
+                suffix_output = row[1] if len(row) > 1 else row[0]
+
+                fused = numeral + suffix_input
+                if fused in exception_fused_set:
+                    continue  # already covered by the exception graph
+                spoken_ordinal = spoken + suffix_output
+                regular_row_graphs.append(
+                    pynutil.insert('integer: "' + numeral + '"')
+                    + insert_space
+                    + pynutil.insert('default_ordinal: "' + spoken_ordinal + '"')
+                    + pynutil.delete(fused)
+                )
+        glued_ordinal_regular_graph = pynini.union(*regular_row_graphs).optimize()
+        # Negative weight favours the exception paths (lower cost wins in pynini's default tropical semiring).
+        roman_glued_ordinal_fields = pynini.union(
+            pynutil.add_weight(glued_ordinal_exceptions_graph, -0.1),
+            glued_ordinal_regular_graph,
+        ).optimize()
+        # Fused ordinal, optionally followed by one space and a key phrase (closure bounds 0..1).
+        roman_glued_ordinal = (
+            pynutil.insert("preserve_order: true ")
+            + roman_glued_ordinal_fields
+            + pynini.closure(
+                pynutil.delete(" ")
+                + insert_space
+                + pynutil.insert('key_cardinal: "')
+                + convert_space(devanagari_phrase)
+                + pynutil.insert('"'),
+                0,
+                1,
+            )
+        ).optimize()
+        # Any one of the three layouts above is accepted.
+        graph = pynini.union(key_before_numeral, numeral_before_key, roman_glued_ordinal).optimize()
+        # add_tokens comes from GraphFst; presumably wraps the graph as `roman { ... }` - confirm in graph_utils.
+        self.fst = self.add_tokens(graph).optimize()
diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
@@ -35,6 +35,7 @@
 from nemo_text_processing.text_normalization.hi.taggers.money import MoneyFst
 from nemo_text_processing.text_normalization.hi.taggers.ordinal import OrdinalFst
 from nemo_text_processing.text_normalization.hi.taggers.punctuation import PunctuationFst
+from nemo_text_processing.text_normalization.hi.taggers.roman import RomanFst
 from nemo_text_processing.text_normalization.hi.taggers.serial import SerialFst
 from nemo_text_processing.text_normalization.hi.taggers.telephone import TelephoneFst
 from nemo_text_processing.text_normalization.hi.taggers.time import TimeFst
@@ -115,6 +116,9 @@ def __init__(
             word = WordFst(punctuation=punctuation, deterministic=deterministic)
             word_graph = word.fst
 
+            roman = RomanFst(deterministic=deterministic)
+            roman_graph = roman.fst
+
             telephone = TelephoneFst()
             telephone_graph = telephone.fst
 
@@ -137,6 +141,7 @@ def __init__(
                 | pynutil.add_weight(ordinal_graph, 1.1)
                 | pynutil.add_weight(electronic_graph, 1.1)
                 | pynutil.add_weight(serial_graph, 1.11)
+                | pynutil.add_weight(roman_graph, 1.1)
             )
 
             punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=2.1) + pynutil.insert(" }")