From a108dce2399a93ea9565e7cf99d1219057c4e7ce Mon Sep 17 00:00:00 2001
From: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
Date: Mon, 15 Dec 2025 15:39:56 -0800
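Subject: [PATCH 1/4] Korean TN fixes: cardinal, decimal, fraction, date

Also touches the ordinal tagger (optional trailing particles), the word
tagger (tokens restricted to non-digit characters), whitespace handling
between tokens in tokenize_and_classify / verbalize_final, and adds 'ko'
to the --lang choices of run_evaluate.py.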

Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
---
 .../text_normalization/ko/taggers/cardinal.py | 49 +++++++++++++++++--
 .../text_normalization/ko/taggers/decimal.py  | 11 ++++-
 .../text_normalization/ko/taggers/fraction.py |  4 +-
 .../text_normalization/ko/taggers/ordinal.py  | 23 ++++++++-
 .../ko/taggers/tokenize_and_classify.py       | 18 +++++--
 .../text_normalization/ko/taggers/word.py     | 12 +++--
 .../text_normalization/ko/verbalizers/date.py | 12 +++--
 .../ko/verbalizers/fraction.py                |  7 ++-
 .../ko/verbalizers/verbalize_final.py         | 19 +++++--
 .../text_normalization/run_evaluate.py        |  2 +-
 10 files changed, 129 insertions(+), 28 deletions(-)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index 9454203c9..ae6af7414 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -16,13 +16,19 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
 class CardinalFst(GraphFst):
     def __init__(self, deterministic: bool = True):
         super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
+        
+        # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP)
+        SEP = pynini.union(",", "’", "'", "\u00A0", "\u2009", "\u202F")
+        # Optional small whitespace inside parentheses or after signs
+        WS = pynini.closure(pynini.accep(" "), 0, 2)
+
         # Load base .tsv files
         graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
         graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
@@ -53,7 +59,7 @@ def __init__(self, deterministic: bool = True):
         graph_thousand = thousands @ graph_thousand_component
 
         ten_thousands = NEMO_DIGIT**5
-        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
+        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
@@ -268,8 +274,41 @@ def __init__(self, deterministic: bool = True):
         ).optimize()
 
         # Sign and final formatting
-        optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
-        final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
+        # Delete group separators when they appear between digits (e.g., "1,234" -> "1234")
+        delete_sep_between_digits = pynini.cdrewrite(
+            pynutil.delete(SEP),
+            NEMO_DIGIT,
+            NEMO_DIGIT,
+            NEMO_SIGMA,
+        )
+        
+        # Let the number graph accept numbers with separators
+        graph_num_accepting_separators = delete_sep_between_digits @ graph_num
+
+        # Build the integer token (integer: "...")
+        integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"')
+
+        # Sign handling:
+        #  - minus sets negative flag
+        #  - plus is ignored (positive number)
+        minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
+        plus_prefix  = pynutil.delete("+")
+
+        # Accounting negative: "( 1,234 )" -> negative + integer:"1234"
+        paren_negative = (
+            pynutil.insert('negative: "true" ')
+            + pynutil.delete("(") + WS
+            + integer_token
+            + WS + pynutil.delete(")")
+        )
+
+        # Signed number: optional (+|-) + integer
+        signed_integer = ( (minus_prefix | plus_prefix).ques + integer_token )
+
+        # Prefer accounting-form first, then signed form
+        final_graph = paren_negative | signed_integer
+
+        # Wrap with class tokens and finalize
         final_graph = self.add_tokens(final_graph)
         self.fst = final_graph.optimize()
-        self.graph = graph_num.optimize()
+        self.graph = graph_num_accepting_separators.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py
index 787acf817..bec6ea87a 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/decimal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py
@@ -15,7 +15,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, NEMO_SIGMA
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
@@ -31,8 +31,15 @@ class DecimalFst(GraphFst):
 
     def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         super().__init__(name="decimal", kind="classify", deterministic=deterministic)
+        
+        # Use the base cardinal graph for the integer part
+        base_integer_graph = cardinal.graph
+        # Only special-case 10000 -> 만 for decimal integer part (if needed)
+        specials_input = pynini.cross("10000", "만")
 
-        cardinal_before_decimal = cardinal.graph
+        # Try the special mapping first, then fall back to normal cardinal
+        cardinal_before_decimal = (specials_input | base_integer_graph).optimize()
+        
         cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv"))
         zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
 
diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py
index ebd7ee7ef..4e30ef1c6 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py
@@ -15,7 +15,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
@@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
             + pynutil.insert(DOUBLE_QUOTE)
         )
 
-        integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE)
+        integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE)
 
         # Denominator and numerator
         denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)
diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
index 66feaa727..f7bd179c3 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
@@ -65,6 +65,27 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째")
 
         graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize()  # Handles 1-39  # Handles 40+
+        
+        # Single-character particles (가, 이, 은, 는, 로, 도 ...)
+        josa_single = pynini.union(
+            "가", "이", "은", "는", "를", "을", "로", "도", "다"
+        )
+
+        # Multi-character particles (부터, 까지)
+        josa_multi = pynini.union("부터", "까지")
+
+        # Allow patterns like:
+        #   번째 + (optional single-josa) + (optional multi-josa)
+        josa = (josa_single.ques + josa_multi.ques).optimize()
+
+        # Final ordinal graph with optional particles
+        graph_ordinal_with_josa = (graph_ordinal + josa).optimize()
+
+        # Build the “integer: …” token structure
+        final_graph = (
+            pynutil.insert('integer: "')
+            + graph_ordinal_with_josa
+            + pynutil.insert('"')
+        )
 
-        final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"')
         self.fst = self.add_tokens(final_graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index 80c15aa70..a0389d0e7 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -17,7 +17,14 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    GraphFst,
+    generator_main,
+    delete_space,
+    delete_extra_space,
+    NEMO_WHITE_SPACE,
+)
+
 from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
 from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
 from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
@@ -98,9 +105,14 @@ def __init__(
             )
 
             token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
-            tagger = pynini.closure(token, 1)
+            space = pynini.closure(NEMO_WHITE_SPACE, 1)
+            space = pynini.compose(space, delete_extra_space)
+
+            space_opt = pynini.closure(space, 0, 1)
+
+            graph = delete_space + token + pynini.closure(space_opt + token) + delete_space
 
-            self.fst = tagger.optimize()
+            self.fst = graph.optimize()
 
             if far_file:
                 generator_main(far_file, {"tokenize_and_classify": self.fst})
diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py
index 356bc04ca..adbe74d1b 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/word.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/word.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 from pynini.lib import pynutil
-
-from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, GraphFst
+import pynini
+from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, NEMO_DIGIT, GraphFst
 
 
 class WordFst(GraphFst):
@@ -25,5 +25,11 @@ class WordFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="word", kind="classify", deterministic=deterministic)
-        word = pynutil.insert("name: \"") + NEMO_NOT_SPACE + pynutil.insert("\"")
+        
+        word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT)
+
+        word = pynutil.insert('name: "')
+        word += pynini.closure(word_char, 1)
+        word += pynutil.insert('"')
+        
         self.fst = word.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py
index aa32529f3..fc4c023a2 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/date.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py
@@ -38,14 +38,16 @@ def __init__(self, deterministic: bool = True):
             + pynutil.delete("\"")
         )
 
+        SPACE = pynini.closure(delete_space, 0, 1) + insert_space
+        
         # This graph now correctly uses the 'delete_space' variable defined above.
         graph_basic_date = (
-            pynini.closure(era_component + delete_space, 0, 1)
-            + pynini.closure(year_component + delete_space, 0, 1)
-            + pynini.closure(month_component + delete_space, 0, 1)
+            pynini.closure(era_component + SPACE, 0, 1)
+            + pynini.closure(year_component + SPACE, 0, 1)
+            + pynini.closure(month_component + SPACE, 0, 1)
             + pynini.closure(day_component, 0, 1)
-            + pynini.closure((delete_space + week_component) | (week_component), 0, 1)
-        ) | month_component + delete_space + week_component
+            + pynini.closure(SPACE + week_component, 0, 1)
+        ) | (month_component + SPACE + week_component)
 
         final_graph = graph_basic_date
 
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py
index 2c56f7f39..bafbf133d 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py
@@ -32,8 +32,8 @@ def __init__(self, deterministic: bool = True):
         super().__init__(name="fraction", kind="verbalize", deterministic=deterministic)
 
         # Handles square root symbols like "√3" → "루트3"
-        denominator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE)
-        numerator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE)
+        denominator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE)
+        numerator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE)
 
         # Matches non-root numeric content
         denominator = pynini.closure(NEMO_NOT_QUOTE - "√")
@@ -56,6 +56,7 @@ def __init__(self, deterministic: bool = True):
                 1,
             )
             + pynutil.insert("분의")
+            + pynutil.insert(NEMO_SPACE)
             + numerator_component
         )
 
@@ -66,6 +67,7 @@ def __init__(self, deterministic: bool = True):
             + pynutil.delete('"')
             + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"'))
             + pynutil.delete('"')
+            + pynutil.insert(NEMO_SPACE)
         )
         graph_integer_fraction = graph_integer + delete_space + graph_fraction
 
@@ -77,6 +79,7 @@ def __init__(self, deterministic: bool = True):
             + pynini.closure(NEMO_NOT_QUOTE - '"')
             + pynutil.delete('"')
             + delete_space
+            + pynutil.insert(NEMO_SPACE)
         )
 
         # Final graph handles optional negative + (integer + fraction | fraction only)
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
index 09494ef86..9ad80a23c 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
@@ -17,7 +17,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main
+from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space, NEMO_WHITE_SPACE, NEMO_SIGMA, generator_main
 from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
 from nemo_text_processing.utils.logging import logger
 
@@ -49,11 +49,22 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
             token_graph = VerbalizeFst(deterministic=deterministic)
 
             token_verbalizer = (
-                pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }")
+                pynutil.delete("tokens {")
+                + delete_space
+                + token_graph.fst
+                + delete_space
+                + pynutil.delete(" }")
             )
+            
+            space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1)
 
-            verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space)
-
+            verbalizer = (
+                delete_space
+                + token_verbalizer
+                + pynini.closure(space_between_tokens + token_verbalizer)
+                + delete_space
+            )
+            
             self.fst = verbalizer.optimize()
 
             if far_file:
diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py
index 0438579a7..fc9b21c29 100644
--- a/nemo_text_processing/text_normalization/run_evaluate.py
+++ b/nemo_text_processing/text_normalization/run_evaluate.py
@@ -35,7 +35,7 @@ def parse_args():
     parser.add_argument(
         "--lang",
         help="language",
-        choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'],
+        choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko'],
         default="en",
         type=str,
     )

From 43315121c375e40b4ce36ad4f206024210e30685 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Dec 2025 00:07:01 +0000
Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../text_normalization/ko/taggers/cardinal.py | 19 +++++++++----------
 .../text_normalization/ko/taggers/decimal.py  |  6 +++---
 .../text_normalization/ko/taggers/ordinal.py  | 12 +++---------
 .../ko/taggers/tokenize_and_classify.py       |  7 +++----
 .../text_normalization/ko/taggers/word.py     |  9 +++++----
 .../text_normalization/ko/verbalizers/date.py |  2 +-
 .../ko/verbalizers/verbalize_final.py         | 19 +++++++++++--------
 7 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
index ae6af7414..2df876fea 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py
@@ -23,9 +23,9 @@
 class CardinalFst(GraphFst):
     def __init__(self, deterministic: bool = True):
         super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
-        
+
         # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP)
-        SEP = pynini.union(",", "’", "'", "\u00A0", "\u2009", "\u202F")
+        SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f")
         # Optional small whitespace inside parentheses or after signs
         WS = pynini.closure(pynini.accep(" "), 0, 2)
 
@@ -59,7 +59,9 @@ def __init__(self, deterministic: bool = True):
         graph_thousand = thousands @ graph_thousand_component
 
         ten_thousands = NEMO_DIGIT**5
-        graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))) + pynini.union(
+        graph_ten_thousand_component = (
+            pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))
+        ) + pynini.union(
             pynini.closure(pynutil.delete('0')),
             graph_thousand_component,
             (pynutil.delete('0') + graph_hundred_component),
@@ -281,7 +283,7 @@ def __init__(self, deterministic: bool = True):
             NEMO_DIGIT,
             NEMO_SIGMA,
         )
-        
+
         # Let the number graph accept numbers with separators
         graph_num_accepting_separators = delete_sep_between_digits @ graph_num
 
@@ -292,18 +294,15 @@ def __init__(self, deterministic: bool = True):
         #  - minus sets negative flag
         #  - plus is ignored (positive number)
         minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
-        plus_prefix  = pynutil.delete("+")
+        plus_prefix = pynutil.delete("+")
 
         # Accounting negative: "( 1,234 )" -> negative + integer:"1234"
         paren_negative = (
-            pynutil.insert('negative: "true" ')
-            + pynutil.delete("(") + WS
-            + integer_token
-            + WS + pynutil.delete(")")
+            pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")")
         )
 
         # Signed number: optional (+|-) + integer
-        signed_integer = ( (minus_prefix | plus_prefix).ques + integer_token )
+        signed_integer = (minus_prefix | plus_prefix).ques + integer_token
 
         # Prefer accounting-form first, then signed form
         final_graph = paren_negative | signed_integer
diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py
index bec6ea87a..6d2d07f66 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/decimal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py
@@ -15,7 +15,7 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, NEMO_SIGMA
+from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst
 from nemo_text_processing.text_normalization.ko.utils import get_abs_path
 
 
@@ -31,7 +31,7 @@ class DecimalFst(GraphFst):
 
     def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         super().__init__(name="decimal", kind="classify", deterministic=deterministic)
-        
+
         # Use the base cardinal graph for the integer part
         base_integer_graph = cardinal.graph
         # Only special-case 10000 -> 만 for decimal integer part (if needed)
@@ -39,7 +39,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
 
         # Try the special mapping first, then fall back to normal cardinal
         cardinal_before_decimal = (specials_input | base_integer_graph).optimize()
-        
+
         cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv"))
         zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
 
diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
index f7bd179c3..59fa30ada 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py
@@ -65,11 +65,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째")
 
         graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize()  # Handles 1-39  # Handles 40+
-        
+
         # Single-character particles (가, 이, 은, 는, 로, 도 ...)
-        josa_single = pynini.union(
-            "가", "이", "은", "는", "를", "을", "로", "도", "다"
-        )
+        josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다")
 
         # Multi-character particles (부터, 까지)
         josa_multi = pynini.union("부터", "까지")
@@ -82,10 +80,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         graph_ordinal_with_josa = (graph_ordinal + josa).optimize()
 
         # Build the “integer: …” token structure
-        final_graph = (
-            pynutil.insert('integer: "')
-            + graph_ordinal_with_josa
-            + pynutil.insert('"')
-        )
+        final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"')
 
         self.fst = self.add_tokens(final_graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
index a0389d0e7..e15129c7c 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py
@@ -18,13 +18,12 @@
 from pynini.lib import pynutil
 
 from nemo_text_processing.text_normalization.ko.graph_utils import (
+    NEMO_WHITE_SPACE,
     GraphFst,
-    generator_main,
-    delete_space,
     delete_extra_space,
-    NEMO_WHITE_SPACE,
+    delete_space,
+    generator_main,
 )
-
 from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
 from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
 from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py
index adbe74d1b..7aa3db709 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/word.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/word.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from pynini.lib import pynutil
 import pynini
-from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, NEMO_DIGIT, GraphFst
+from pynini.lib import pynutil
+
+from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst
 
 
 class WordFst(GraphFst):
@@ -25,11 +26,11 @@ class WordFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="word", kind="classify", deterministic=deterministic)
-        
+
         word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT)
 
         word = pynutil.insert('name: "')
         word += pynini.closure(word_char, 1)
         word += pynutil.insert('"')
-        
+
         self.fst = word.optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py
index fc4c023a2..bfd5e9aa1 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/date.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py
@@ -39,7 +39,7 @@ def __init__(self, deterministic: bool = True):
         )
 
         SPACE = pynini.closure(delete_space, 0, 1) + insert_space
-        
+
         # This graph now correctly uses the 'delete_space' variable defined above.
         graph_basic_date = (
             pynini.closure(era_component + SPACE, 0, 1)
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
index 9ad80a23c..3ec44eac6 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py
@@ -17,7 +17,14 @@
 import pynini
 from pynini.lib import pynutil
 
-from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space, NEMO_WHITE_SPACE, NEMO_SIGMA, generator_main
+from nemo_text_processing.text_normalization.ko.graph_utils import (
+    NEMO_SIGMA,
+    NEMO_WHITE_SPACE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
 from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst
 from nemo_text_processing.utils.logging import logger
 
@@ -49,13 +56,9 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
             token_graph = VerbalizeFst(deterministic=deterministic)
 
             token_verbalizer = (
-                pynutil.delete("tokens {")
-                + delete_space
-                + token_graph.fst
-                + delete_space
-                + pynutil.delete(" }")
+                pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }")
             )
-            
+
             space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1)
 
             verbalizer = (
@@ -64,7 +67,7 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_
                 + pynini.closure(space_between_tokens + token_verbalizer)
                 + delete_space
             )
-            
+
             self.fst = verbalizer.optimize()
 
             if far_file:

From aec257a41a339092ad1cab782b31df50d2991b05 Mon Sep 17 00:00:00 2001
From: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
Date: Wed, 17 Dec 2025 17:05:55 -0800
Subject: [PATCH 3/4] Add ko electronic extensions and improve
 electronic/telephone normalization

Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com>
---
 .../ko/data/electronic/cc_cues.tsv            |  2 +-
 .../ko/data/electronic/domain.tsv             |  5 +-
 .../ko/data/electronic/extensions.tsv         |  6 ++
 .../ko/taggers/electronic.py                  | 25 +++++--
 .../ko/taggers/telephone.py                   | 49 ++++++++------
 .../ko/verbalizers/electronic.py              | 35 +++++++++-
 .../ko/verbalizers/telephone.py               |  5 +-
 .../test_cases_electronic.txt                 | 16 +++--
 .../test_cases_telephone.txt                  | 66 +++++++++----------
 9 files changed, 137 insertions(+), 72 deletions(-)
 create mode 100644 nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv

diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv
index 2063fe8d1..f562cfbad 100644
--- a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv
+++ b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv
@@ -1,6 +1,6 @@
 카드 끝자리	카드 끝자리
 카드 마지막 네자리	카드 마지막 네자리
-카드 마지막 4자리	카드 마지막 4자리
+카드 마지막 4자리	카드 마지막 네자리
 신용카드 번호	신용카드 번호
 신용카드	신용카드
 체크카드 번호	체크카드 번호
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv
index c51ab615f..cd817d539 100644
--- a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv
+++ b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv
@@ -26,7 +26,4 @@
 .uk	닷 유케이
 .br	닷 비알
 .in	닷 아이엔
-.ru	닷 알유
-.jpg	닷 제이피지
-.png	닷 피엔지
-.pdf	닷 피디에프
\ No newline at end of file
+.ru	닷 알유
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv
new file mode 100644
index 000000000..c80d08a69
--- /dev/null
+++ b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv
@@ -0,0 +1,6 @@
+.jpg	닷 제이피지
+.png	닷 피엔지
+.pdf	닷 피디에프
+.JPG	닷 제이피지
+.PNG	닷 피엔지
+.PDF	닷 피디에프
\ No newline at end of file
diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py
index 182f29b63..d5e0d495b 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/electronic.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py
@@ -121,11 +121,28 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
         dollar_accep = pynini.accep("$")
         excluded_symbols = DOT | dollar_accep | AT
         filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols)
-        accepted_characters = ASCII_ALNUM | filtered_symbols
         # Domain core graph
         graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize()
         graph |= graph_domain
 
+        known_extensions = pynini.project(
+            pynini.string_file(get_abs_path("data/electronic/extensions.tsv")),
+            "input",
+        )
+
+        filename_stem = pynini.closure(
+            pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)),
+            1,
+        )
+
+        file_with_extension = filename_stem + known_extensions 
+
+        graph |= (
+            pynutil.insert('domain: "')
+            + file_with_extension
+            + pynutil.insert('"')
+        ).optimize()
+
         # (3) URL with protocol
         graph |= protocol + insert_space + domain_graph_with_class_tags
 
@@ -144,9 +161,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
 
             four = pynini.closure(NEMO_DIGIT, 4, 4)
             sep_token = pynini.union(HYPHEN, NEMO_SPACE)
-            sep_del = pynutil.delete(pynini.closure(sep_token, 1))  # allow mix of - or space
-
-            cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four
+            sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space
+            cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four
+            cc16_grouped = cc16_grouped + delete_space
 
             cc16_no_cue = (
                 pynutil.insert('protocol: "신용카드 " ')
diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py
index 0a9eb52dc..3f6914fdd 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/telephone.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py
@@ -24,11 +24,11 @@ class TelephoneFst(GraphFst):
     Finite state transducer for classifying Korean telephone numbers.
 
     Example inputs → tokens:
-        +82-10-3713-7050  -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
-        +1 (415) 555-0123 -> telephone { country_code: "플러스 일,"   number_part: "사일오, 오오오, 영일이삼" }
-        (031)371-3700     -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" }
-        010-3713-7050     -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" }
-        010.777.8888      -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" }
+        +82 010-3713-7050 -> telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" }
+        +1 (415) 555-0123 -> telephone { country_code: "일"   number_part: "사일오 오오오 영일이삼" }
+        (031)371-3700     -> telephone { number_part: "영삼일 삼칠일 삼칠영영" }
+        010-3713-7050     -> telephone { number_part: "영일영 삼칠일삼 칠영오영" }
+        010.777.8888      -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" }
 
     Args:
         deterministic (bool, optional): If True, provide a single transduction;
@@ -37,9 +37,11 @@ class TelephoneFst(GraphFst):
 
     def __init__(self, deterministic: bool = True):
         super().__init__(name="telephone", kind="classify", deterministic=deterministic)
-
-        add_sep = pynutil.insert(", ")  # standard block separator ", "
-
+        # Delete the "-" or "." separator between digit blocks (add_sep inserts nothing itself)
+        add_sep = pynutil.delete("-") | pynutil.delete(".")
+        # Optional space inserted between blocks
+        sep_space = insert_space
+        
         # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert)
         digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
         zero_map = pynini.cross("0", "영")
@@ -49,35 +51,40 @@ def __init__(self, deterministic: bool = True):
         four_digits = digit_ko**4
 
         # country code: "+1", "+82", "+1-"
-        country_core = (
-            pynini.cross("+", "플러스 ")
-            + pynini.closure(digit_ko + insert_space, 0, 2)
-            + digit_ko
-            + pynutil.insert(",")
+        cc_digits = pynini.closure(digit_ko, 1, 3)
+
+        country_code = (
+            pynutil.delete("+")
+            + pynutil.insert('country_code: "')
+            + cc_digits
+            + pynutil.insert('"')
+            + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1)
+            + delete_space
         )
-        country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"')
-        country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space
-
+        
         # area part: "123-" | "123." | "(123)" [space?] or "(123)-"
         area_core = three_digits
         area_part = (
-            (area_core + (pynutil.delete("-") | pynutil.delete(".")))
+            (area_core + add_sep)
             | (
                 pynutil.delete("(")
                 + area_core
-                + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
+                + pynutil.delete(")")
+                + pynini.closure(pynutil.delete(" "), 0, 1)
+                + pynini.closure(add_sep, 0, 1)
             )
-        ) + add_sep
+        ) + sep_space
+
 
         # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050)
         mid = pynini.union(three_digits, four_digits)
         last4 = four_digits
 
         # consume '-' or '.' between middle and last blocks
-        number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4
+        number_part_core = area_part + mid + add_sep + sep_space + last4
         number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"')
 
         # final graph: with or without country code
-        graph = pynini.union(country_code + number_part, number_part).optimize()
+        graph = pynini.union(country_code + insert_space + number_part, number_part).optimize()
 
         self.fst = self.add_tokens(graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
index ecbb805cb..d1b29c2f2 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
@@ -89,7 +89,10 @@ def __init__(self, deterministic: bool = True):
         )
 
         # 5) domain part (handle common endings like .com → 닷컴)
-        domain_common_pairs = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize()
+        domain_common_pairs = (
+            pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
+            | pynini.string_file(get_abs_path("data/electronic/extensions.tsv"))
+        ).optimize()
 
         # Rewrite known domains (.com → 닷컴)
         tld_rewrite = pynini.cdrewrite(
@@ -108,6 +111,10 @@ def __init__(self, deterministic: bool = True):
 
         raw_domain = pynini.closure(NEMO_NOT_QUOTE, 1)
 
+        four = pynini.closure(NEMO_DIGIT, 4, 4)
+        cc16_grouped = four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + pynutil.insert(" ") + four
+        cc_domain = (cc16_grouped @ digit_inline_rewrite).optimize()
+
         domain = (
             pynutil.delete("domain:")
             + delete_space
@@ -122,11 +129,33 @@ def __init__(self, deterministic: bool = True):
             pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space
         )
 
+        protocol_raw = (
+            pynutil.delete('protocol: "')
+            + pynini.closure(NEMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0)
+        cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space
+
+        # Credit card case: "신용카드 ..." protocol + 16-digit domain grouped as 4-4-4-4
+        cc_graph = (
+            cc_protocol
+            + delete_space
+            + pynutil.delete("domain:")
+            + delete_space
+            + pynutil.delete('"')
+            + cc_domain
+            + pynutil.delete('"')
+            + delete_space
+        ).optimize()
+
         # 7) Combine: optional protocol + optional username + domain
-        graph = (
+        default_graph = (
             pynini.closure(protocol + delete_space, 0, 1)
             + pynini.closure(user_name + delete_space + pynutil.insert(" 골뱅이 ") + delete_space, 0, 1)
             + domain
             + delete_space
-        ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)
+        ).optimize()
+
+        graph = (cc_graph | default_graph) @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA)
         self.fst = self.delete_tokens(graph).optimize()
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py
index 766da5e6f..211358141 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py
@@ -29,9 +29,9 @@ class TelephoneFst(GraphFst):
         [country_code + " "] + number_part [+ ", 내선 " + extension]
 
     Examples:
-        telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
+        telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" }
             -> 플러스 팔 이, 영일영, 삼칠일삼, 칠영오영
-        telephone { number_part: "팔영영, 오오오, 영영영영" extension: "이삼사" }
+        telephone { number_part: "팔영영 오오오 영영영영" extension: "이삼사" }
             -> 팔영영, 오오오, 영영영영, 내선 이삼사
 
     Args:
@@ -45,6 +45,7 @@ def __init__(self, deterministic: bool = True):
         country = (
             pynini.closure(delete_space, 0, 1)
             + pynutil.delete('country_code: "')
+            + pynutil.insert("국가번호 ")
             + pynini.closure(NEMO_NOT_QUOTE, 1)
             + pynutil.delete('"')
             + insert_space
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt
index 70cbab631..4e09d0db2 100644
--- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt
@@ -6,7 +6,15 @@ a@msn.it~a 골뱅이 msn 닷 아이티
 abc@nvidia.app~abc 골뱅이 nvidia 닷 앱
 user01@gmail.co.kr~user영일 골뱅이 gmail 닷 씨오 닷 케이알
 nvidia.co.kr~nvidia 닷 씨오 닷 케이알
-1234-5678-9012-3456~신용카드 일이삼사오육칠팔구영일이삼사오육
-카드 마지막 4자리 3456~카드 마지막 4자리 삼사오육
-카드 마지막 4자리 7890~카드 마지막 4자리 칠팔구영
-카드 끝자리 3456~카드 끝자리 삼사오육
\ No newline at end of file
+1234-5678-9012-3456~신용카드 일이삼사 오육칠팔 구영일이 삼사오육
+2345-2222-3333-4444~신용카드 이삼사오 이이이이 삼삼삼삼 사사사사
+9090-1234-5555-9876~신용카드 구영구영 일이삼사 오오오오 구팔칠육
+카드 마지막 네자리 3456~카드 마지막 네자리 삼사오육
+카드 마지막 4자리 7890~카드 마지막 네자리 칠팔구영
+카드 끝자리 3456~카드 끝자리 삼사오육
+사진.jpg~사진 닷 제이피지
+사진.JPG~사진 닷 제이피지
+사진.png~사진 닷 피엔지
+사진.PNG~사진 닷 피엔지
+문서.pdf~문서 닷 피디에프
+문서.PDF~문서 닷 피디에프
\ No newline at end of file
diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt
index dfcb7c3e9..b6e573aec 100644
--- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt
+++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt
@@ -1,33 +1,33 @@
-+1 123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔
-123-123-5678~일이삼, 일이삼, 오육칠팔
-+1-123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔
-+1 (123)-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔
-(123)-123-5678~일이삼, 일이삼, 오육칠팔
-555.555.5555~오오오, 오오오, 오오오오
-(123) 123-5678~일이삼, 일이삼, 오육칠팔
-010-3713-7050~영일영, 삼칠일삼, 칠영오영
-+82 123-456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영
-+82-123-4567-8901~플러스 팔 이, 일이삼, 사오육칠, 팔구영일
-+44-207-555-1234~플러스 사 사, 이영칠, 오오오, 일이삼사
-123.456-7890~일이삼, 사오육, 칠팔구영
-123-456.7890~일이삼, 사오육, 칠팔구영
-(987)-654-3210~구팔칠, 육오사, 삼이일영
-(987) 654-3210~구팔칠, 육오사, 삼이일영
-+7 000-000-0000~플러스 칠, 영영영, 영영영, 영영영영
-000.000.0000~영영영, 영영영, 영영영영
-271-828-1828~이칠일, 팔이팔, 일팔이팔
-314-159-2653~삼일사, 일오구, 이육오삼
-(010) 123-4567~영일영, 일이삼, 사오육칠
-+358-123-456-7890~플러스 삼 오 팔, 일이삼, 사오육, 칠팔구영
-+1 800-555-0000~플러스 일, 팔영영, 오오오, 영영영영
-(800) 555-0000~팔영영, 오오오, 영영영영
-+12 345-678-9012~플러스 일 이, 삼사오, 육칠팔, 구영일이
-+999 999-999-9999~플러스 구 구 구, 구구구, 구구구, 구구구구
-321.654.0987~삼이일, 육오사, 영구팔칠
-+82 010-1234-5678~플러스 팔 이, 영일영, 일이삼사, 오육칠팔
-(999)-000-0000~구구구, 영영영, 영영영영
-+1-123.456.7890~플러스 일, 일이삼, 사오육, 칠팔구영
-+82-123.456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영
-111-222-3333~일일일, 이이이, 삼삼삼삼
-909-808-7070~구영구, 팔영팔, 칠영칠영
-(555)555-5555~오오오, 오오오, 오오오오
++1 123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔
+123-123-5678~일이삼 일이삼 오육칠팔
++1-123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔
++1 (123)-123-5678~국가번호 일 일이삼 일이삼 오육칠팔
+(123)-123-5678~일이삼 일이삼 오육칠팔
+555.555.5555~오오오 오오오 오오오오
+(123) 123-5678~일이삼 일이삼 오육칠팔
+010-3713-7050~영일영 삼칠일삼 칠영오영
++82 123-456-7890~국가번호 팔이 일이삼 사오육 칠팔구영
++82-123-4567-8901~국가번호 팔이 일이삼 사오육칠 팔구영일
++44-207-555-1234~국가번호 사사 이영칠 오오오 일이삼사
+123.456-7890~일이삼 사오육 칠팔구영
+123-456.7890~일이삼 사오육 칠팔구영
+(987)-654-3210~구팔칠 육오사 삼이일영
+(987) 654-3210~구팔칠 육오사 삼이일영
++7 000-000-0000~국가번호 칠 영영영 영영영 영영영영
+000.000.0000~영영영 영영영 영영영영
+271-828-1828~이칠일 팔이팔 일팔이팔
+314-159-2653~삼일사 일오구 이육오삼
+(010) 123-4567~영일영 일이삼 사오육칠
++358-123-456-7890~국가번호 삼오팔 일이삼 사오육 칠팔구영
++1 800-555-0000~국가번호 일 팔영영 오오오 영영영영
+(800) 555-0000~팔영영 오오오 영영영영
++12 345-678-9012~국가번호 일이 삼사오 육칠팔 구영일이
++999 999-999-9999~국가번호 구구구 구구구 구구구 구구구구
+321.654.0987~삼이일 육오사 영구팔칠
++82 010-1234-5678~국가번호 팔이 영일영 일이삼사 오육칠팔
+(999)-000-0000~구구구 영영영 영영영영
++1-123.456.7890~국가번호 일 일이삼 사오육 칠팔구영
++82-123.456-7890~국가번호 팔이 일이삼 사오육 칠팔구영
+111-222-3333~일일일 이이이 삼삼삼삼
+909-808-7070~구영구 팔영팔 칠영칠영
+(555)555-5555~오오오 오오오 오오오오
\ No newline at end of file

From aebe1f2e9e3e95521ea542baf5c15a4a8d1168f8 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 18 Dec 2025 01:11:49 +0000
Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../text_normalization/ko/taggers/electronic.py           | 8 ++------
 .../text_normalization/ko/taggers/telephone.py            | 5 ++---
 .../text_normalization/ko/verbalizers/electronic.py       | 6 +-----
 3 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py
index d5e0d495b..80d014263 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/electronic.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py
@@ -135,13 +135,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
             1,
         )
 
-        file_with_extension = filename_stem + known_extensions 
+        file_with_extension = filename_stem + known_extensions
 
-        graph |= (
-            pynutil.insert('domain: "')
-            + file_with_extension
-            + pynutil.insert('"')
-        ).optimize()
+        graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize()
 
         # (3) URL with protocol
         graph |= protocol + insert_space + domain_graph_with_class_tags
diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py
index 3f6914fdd..04bbb9aa8 100644
--- a/nemo_text_processing/text_normalization/ko/taggers/telephone.py
+++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py
@@ -41,7 +41,7 @@ def __init__(self, deterministic: bool = True):
         add_sep = pynutil.delete("-") | pynutil.delete(".")
         # Optional space inserted between blocks
         sep_space = insert_space
-        
+
         # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert)
         digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
         zero_map = pynini.cross("0", "영")
@@ -61,7 +61,7 @@ def __init__(self, deterministic: bool = True):
             + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1)
             + delete_space
         )
-        
+
         # area part: "123-" | "123." | "(123)" [space?] or "(123)-"
         area_core = three_digits
         area_part = (
@@ -75,7 +75,6 @@ def __init__(self, deterministic: bool = True):
             )
         ) + sep_space
 
-
         # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050)
         mid = pynini.union(three_digits, four_digits)
         last4 = four_digits
diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
index d1b29c2f2..c880e432d 100644
--- a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
+++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py
@@ -129,11 +129,7 @@ def __init__(self, deterministic: bool = True):
             pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space
         )
 
-        protocol_raw = (
-            pynutil.delete('protocol: "')
-            + pynini.closure(NEMO_NOT_QUOTE, 1)
-            + pynutil.delete('"')
-        )
+        protocol_raw = pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"')
         cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0)
         cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space