NVIDIA · bbae0312 · Dec 15, 2025 · Dec 16, 2025 · Dec 18, 2025 · Dec 18, 2025
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv
@@ -1,6 +1,6 @@
 카드 끝자리	카드 끝자리
 카드 마지막 네자리	카드 마지막 네자리
-카드 마지막 4자리	카드 마지막 4자리
+카드 마지막 4자리	카드 마지막 네자리
 신용카드 번호	신용카드 번호
 신용카드	신용카드
 체크카드 번호	체크카드 번호

diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv
@@ -26,7 +26,4 @@
 .uk	닷 유케이
 .br	닷 비알
 .in	닷 아이엔
-.ru	닷 알유
-.jpg	닷 제이피지
-.png	닷 피엔지
-.pdf	닷 피디에프
+.ru	닷 알유
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv
@@ -0,0 +1,6 @@
+.jpg	닷 제이피지
+.png	닷 피엔지
+.pdf	닷 피디에프
+.JPG	닷 제이피지
+.PNG	닷 피엔지
+.PDF	닷 피디에프
diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -16,13 +16,19 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
 class CardinalFst(GraphFst):
     def __init__(self, deterministic: bool = True):
         super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+
+        # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP)
+        SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f")
+        # Up to two optional spaces, allowed inside accounting parentheses (see paren_negative)
+        WS = pynini.closure(pynini.accep(" "), 0, 2)
+
         # Load base .tsv files
         graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
         graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
@@ -53,7 +59,9 @@ def __init__(self, deterministic: bool = True):
         graph_thousand = thousands @ graph_thousand_component
 
         ten_thousands = NEMO_DIGIT**5
-        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
+        graph_ten_thousand_component = (
+            pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))
+        ) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
@@ -268,8 +276,38 @@ def __init__(self, deterministic: bool = True):
         ).optimize()
 
         # Sign and final formatting
-        optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
-        final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
+        # Delete group separators when they appear between digits (e.g., "1,234" -> "1234")
+        delete_sep_between_digits = pynini.cdrewrite(
+            pynutil.delete(SEP),
+            NEMO_DIGIT,
+            NEMO_DIGIT,
+            NEMO_SIGMA,
+        )
+
+        # Let the number graph accept numbers with separators
+        graph_num_accepting_separators = delete_sep_between_digits @ graph_num
+
+        # Build the integer token (integer: "...")
+        integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"')
+
+        # Sign handling:
+        #  - minus sets negative flag
+        #  - plus is ignored (positive number)
+        minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
+        plus_prefix = pynutil.delete("+")
+
+        # Accounting negative: "( 1,234 )" -> negative + integer:"1234"
+        paren_negative = (
+            pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")")
+        )
+
+        # Signed number: optional (+|-) + integer
+        signed_integer = (minus_prefix | plus_prefix).ques + integer_token
+
+        # Accept either the accounting form or the signed form (plain union; no priority is implied)
+        final_graph = paren_negative | signed_integer
+
+        # Wrap with class tokens and finalize
         final_graph = self.add_tokens(final_graph)
         self.fst = final_graph.optimize()
-        self.graph = graph_num.optimize()
+        self.graph = graph_num_accepting_separators.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py
@@ -15,7 +15,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
@@ -32,7 +32,14 @@ class DecimalFst(GraphFst):
     def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         super().__init__(name="decimal", kind="classify", deterministic=deterministic)
 
-        cardinal_before_decimal = cardinal.graph
+        # Use the base cardinal graph for the integer part
+        base_integer_graph = cardinal.graph
+        # Only special-case 10000 -> 만 for decimal integer part (if needed)
+        specials_input = pynini.cross("10000", "만")
+
+        # Union of the special mapping and the normal cardinal graph (unweighted; no ordering is implied)
+        cardinal_before_decimal = (specials_input | base_integer_graph).optimize()
+
         cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv"))
         zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
 

diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py
@@ -121,11 +121,24 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         dollar_accep = pynini.accep("$")
         excluded_symbols = DOT | dollar_accep | AT
         filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols)
-        accepted_characters = ASCII_ALNUM | filtered_symbols
         # Domain core graph
         graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize()
         graph |= graph_domain
 
+        known_extensions = pynini.project(
+            pynini.string_file(get_abs_path("data/electronic/extensions.tsv")),
+            "input",
+        )
+
+        filename_stem = pynini.closure(
+            pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)),
+            1,
+        )
+
+        file_with_extension = filename_stem + known_extensions
+
+        graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize()
+
         # (3) URL with protocol
         graph |= protocol + insert_space + domain_graph_with_class_tags
 
@@ -144,9 +157,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
 
             four = pynini.closure(NEMO_DIGIT, 4, 4)
             sep_token = pynini.union(HYPHEN, NEMO_SPACE)
-            sep_del = pynutil.delete(pynini.closure(sep_token, 1))  # allow mix of - or space
-
-            cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four
+            sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space
+            cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four
+            cc16_grouped = cc16_grouped + delete_space
 
             cc16_no_cue = (
                 pynutil.insert('protocol: "신용카드 " ')

diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py
@@ -15,7 +15,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
@@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
             + pynutil.insert(DOUBLE_QUOTE)
         )
 
-        integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE)
+        integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE)
 
         # Denominator and numerator
         denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
@@ -66,5 +66,20 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
 
         graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize()  # Handles 1-39  # Handles 40+
 
-        final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"')
+        # Single-character particles (가, 이, 은, 는, 로, 도 ...)
+        josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다")
+
+        # Multi-character particles (부터, 까지)
+        josa_multi = pynini.union("부터", "까지")
+
+        # Allow patterns like:
+        #   번째 + (optional single-josa) + (optional multi-josa)
+        josa = (josa_single.ques + josa_multi.ques).optimize()
+
+        # Final ordinal graph with optional particles
+        graph_ordinal_with_josa = (graph_ordinal + josa).optimize()
+
+        # Build the “integer: …” token structure
+        final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"')
+
         self.fst = self.add_tokens(final_graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py
@@ -24,11 +24,11 @@ class TelephoneFst(GraphFst):
     Finite state transducer for classifying Korean telephone numbers.
 
     Example inputs → tokens:
-        +82-10-3713-7050  -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
-        +1 (415) 555-0123 -> telephone { country_code: "플러스 일,"   number_part: "사일오, 오오오, 영일이삼" }
-        (031)371-3700     -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" }
-        010-3713-7050     -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" }
-        010.777.8888      -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" }
+        +82 010-3713-7050 -> telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" }
+        +1 (415) 555-0123 -> telephone { country_code: "일"   number_part: "사일오 오오오 영일이삼" }
+        (031)371-3700     -> telephone { number_part: "영삼일 삼칠일 삼칠영영" }
+        010-3713-7050     -> telephone { number_part: "영일영 삼칠일삼 칠영오영" }
+        010.777.8888      -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" }
 
     Args:
         deterministic (bool, optional): If True, provide a single transduction;
@@ -37,8 +37,10 @@ class TelephoneFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="telephone", kind="classify", deterministic=deterministic)
-
-        add_sep = pynutil.insert(", ")  # standard block separator ", "
+        # Delete the input separator between digit blocks ("-" or ".")
+        add_sep = pynutil.delete("-") | pynutil.delete(".")
+        # Space inserted between blocks in the output (always applied where used)
+        sep_space = insert_space
 
         # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert)
         digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
@@ -49,35 +51,39 @@ def __init__(self, deterministic: bool = True):
         four_digits = digit_ko**4
 
         # country code: "+1", "+82", "+1-"
-        country_core = (
-            pynini.cross("+", "플러스 ")
-            + pynini.closure(digit_ko + insert_space, 0, 2)
-            + digit_ko
-            + pynutil.insert(",")
+        cc_digits = pynini.closure(digit_ko, 1, 3)
+
+        country_code = (
+            pynutil.delete("+")
+            + pynutil.insert('country_code: "')
+            + cc_digits
+            + pynutil.insert('"')
+            + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1)
+            + delete_space
         )
-        country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"')
-        country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space
 
         # area part: "123-" | "123." | "(123)" [space?] or "(123)-"
         area_core = three_digits
         area_part = (
-            (area_core + (pynutil.delete("-") | pynutil.delete(".")))
+            (area_core + add_sep)
             | (
                 pynutil.delete("(")
                 + area_core
-                + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
+                + pynutil.delete(")")
+                + pynini.closure(pynutil.delete(" "), 0, 1)
+                + pynini.closure(add_sep, 0, 1)
             )
-        ) + add_sep
+        ) + sep_space
 
         # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050)
         mid = pynini.union(three_digits, four_digits)
         last4 = four_digits
 
         # consume '-' or '.' between middle and last blocks
-        number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4
+        number_part_core = area_part + mid + add_sep + sep_space + last4
         number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"')
 
         # final graph: with or without country code
-        graph = pynini.union(country_code + number_part, number_part).optimize()
+        graph = pynini.union(country_code + insert_space + number_part, number_part).optimize()
 
         self.fst = self.add_tokens(graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -17,7 +17,13 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
 from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
 from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
 from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
@@ -98,9 +104,14 @@ def __init__(
             )
 
             token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
-            tagger = pynini.closure(token, 1)
+            space = pynini.closure(NEMO_WHITE_SPACE, 1)
+            space = pynini.compose(space, delete_extra_space)
 
-            self.fst = tagger.optimize()
+            space_opt = pynini.closure(space, 0, 1)
+
+            graph = delete_space + token + pynini.closure(space_opt + token) + delete_space
+
+            self.fst = graph.optimize()
 
             if far_file:
                 generator_main(far_file, {"tokenize_and_classify": self.fst})
diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, GraphFst
+from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst
 
 
 class WordFst(GraphFst):
@@ -25,5 +26,11 @@ class WordFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="word", kind="classify", deterministic=deterministic)
-        word = pynutil.insert("name: \"") + NEMO_NOT_SPACE + pynutil.insert("\"")
+
+        word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT)
+
+        word = pynutil.insert('name: "')
+        word += pynini.closure(word_char, 1)
+        word += pynutil.insert('"')
+
         self.fst = word.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py
@@ -38,14 +38,16 @@ def __init__(self, deterministic: bool = True):
             + pynutil.delete("\"")
         )
 
+        SPACE = pynini.closure(delete_space, 0, 1) + insert_space
+
         # This graph now correctly uses the 'delete_space' variable defined above.
         graph_basic_date = (
-            pynini.closure(era_component + delete_space, 0, 1)
-            + pynini.closure(year_component + delete_space, 0, 1)
-            + pynini.closure(month_component + delete_space, 0, 1)
+            pynini.closure(era_component + SPACE, 0, 1)
+            + pynini.closure(year_component + SPACE, 0, 1)
+            + pynini.closure(month_component + SPACE, 0, 1)
             + pynini.closure(day_component, 0, 1)
-            + pynini.closure((delete_space + week_component) | (week_component), 0, 1)
-        ) | month_component + delete_space + week_component
+            + pynini.closure(SPACE + week_component, 0, 1)
+        ) | (month_component + SPACE + week_component)
 
         final_graph = graph_basic_date