From a108dce2399a93ea9565e7cf99d1219057c4e7ce Mon Sep 17 00:00:00 2001 From: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com> Date: Mon, 15 Dec 2025 15:39:56 -0800 Subject: [PATCH 1/4] Korean TN fixes: cardinal, decimal, fraction, date Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com> --- .../text_normalization/ko/taggers/cardinal.py | 49 +++++++++++++++++-- .../text_normalization/ko/taggers/decimal.py | 11 ++++- .../text_normalization/ko/taggers/fraction.py | 4 +- .../text_normalization/ko/taggers/ordinal.py | 23 ++++++++- .../ko/taggers/tokenize_and_classify.py | 18 +++++-- .../text_normalization/ko/taggers/word.py | 12 +++-- .../text_normalization/ko/verbalizers/date.py | 12 +++-- .../ko/verbalizers/fraction.py | 7 ++- .../ko/verbalizers/verbalize_final.py | 19 +++++-- .../text_normalization/run_evaluate.py | 2 +- 10 files changed, 129 insertions(+), 28 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py index 9454203c9..ae6af7414 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -16,13 +16,19 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst from nemo_text_processing.text_normalization.ko.utils import get_abs_path class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + + # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP) + SEP = pynini.union(",", "’", "'", "\u00A0", "\u2009", "\u202F") + # Optional small whitespace inside parentheses or after signs + WS = pynini.closure(pynini.accep(" "), 0, 2) + # Load base .tsv files graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) @@ -53,7 +59,7 @@ def __init__(self, deterministic: bool = True): graph_thousand = thousands @ graph_thousand_component ten_thousands = NEMO_DIGIT**5 - graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( + graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), @@ -268,8 +274,41 @@ def __init__(self, deterministic: bool = True): ).optimize() # Sign and final formatting - optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1) - final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"') + # Delete group separators when they appear between digits (e.g., "1,234" -> "1234") + delete_sep_between_digits = pynini.cdrewrite( + pynutil.delete(SEP), + NEMO_DIGIT, + NEMO_DIGIT, + NEMO_SIGMA, + ) + + # Let the number graph accept numbers with separators + graph_num_accepting_separators = delete_sep_between_digits @ graph_num + + # Build the integer token (integer: "...") + integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"') + + # Sign handling: + # - minus sets negative flag + # - plus is ignored (positive number) + minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-") + plus_prefix = pynutil.delete("+") + + # Accounting negative: "( 1,234 )" -> negative + integer:"1234" + paren_negative = ( + pynutil.insert('negative: "true" ') + + pynutil.delete("(") + WS + + integer_token + + WS + pynutil.delete(")") + ) + + # Signed number: optional (+|-) + integer + signed_integer = ( (minus_prefix | plus_prefix).ques + integer_token ) + + # Prefer accounting-form first, then signed form + final_graph = paren_negative | signed_integer + + # Wrap with class tokens and finalize final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() - self.graph = graph_num.optimize() + self.graph = graph_num_accepting_separators.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py index 787acf817..bec6ea87a 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, NEMO_SIGMA from nemo_text_processing.text_normalization.ko.utils import get_abs_path @@ -31,8 +31,15 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) + + # Use the base cardinal graph for the integer part + base_integer_graph = cardinal.graph + # Only special-case 10000 -> 만 for decimal integer part (if needed) + specials_input = pynini.cross("10000", "만") - cardinal_before_decimal = cardinal.graph + # Try the special mapping first, then fall back to normal cardinal + cardinal_before_decimal = (specials_input | base_integer_graph).optimize() + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index ebd7ee7ef..4e30ef1c6 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space from nemo_text_processing.text_normalization.ko.utils import get_abs_path @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert(DOUBLE_QUOTE) ) - integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE) + integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE) # Denominator and numerator denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE) diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py index 66feaa727..f7bd179c3 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py @@ -65,6 +65,27 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째") graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+ + + # Single-character particles (가, 이, 은, 는, 로, 도 ...) + josa_single = pynini.union( + "가", "이", "은", "는", "를", "을", "로", "도", "다" + ) + + # Multi-character particles (부터, 까지) + josa_multi = pynini.union("부터", "까지") + + # Allow patterns like: + # 번째 + (optional single-josa) + (optional multi-josa) + josa = (josa_single.ques + josa_multi.ques).optimize() + + # Final ordinal graph with optional particles + graph_ordinal_with_josa = (graph_ordinal + josa).optimize() + + # Build the “integer: …” token structure + final_graph = ( + pynutil.insert('integer: "') + + graph_ordinal_with_josa + + pynutil.insert('"') + ) - final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"') self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py index 80c15aa70..a0389d0e7 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -17,7 +17,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main +from nemo_text_processing.text_normalization.ko.graph_utils import ( + GraphFst, + generator_main, + delete_space, + delete_extra_space, + NEMO_WHITE_SPACE, +) + from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.ko.taggers.date import DateFst from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst @@ -98,9 +105,14 @@ def __init__( ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - tagger = pynini.closure(token, 1) + space = pynini.closure(NEMO_WHITE_SPACE, 1) + space = pynini.compose(space, delete_extra_space) + + space_opt = pynini.closure(space, 0, 1) + + graph = delete_space + token + pynini.closure(space_opt + token) + delete_space - self.fst = tagger.optimize() + self.fst = graph.optimize() if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py index 356bc04ca..adbe74d1b 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/text_normalization/ko/taggers/word.py @@ -13,8 +13,8 @@ # limitations under the License. from pynini.lib import pynutil - -from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, GraphFst +import pynini +from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, NEMO_DIGIT, GraphFst class WordFst(GraphFst): @@ -25,5 +25,11 @@ class WordFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="classify", deterministic=deterministic) - word = pynutil.insert("name: \"") + NEMO_NOT_SPACE + pynutil.insert("\"") + + word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT) + + word = pynutil.insert('name: "') + word += pynini.closure(word_char, 1) + word += pynutil.insert('"') + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py index aa32529f3..fc4c023a2 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py @@ -38,14 +38,16 @@ def __init__(self, deterministic: bool = True): + pynutil.delete("\"") ) + SPACE = pynini.closure(delete_space, 0, 1) + insert_space + # This graph now correctly uses the 'delete_space' variable defined above. graph_basic_date = ( - pynini.closure(era_component + delete_space, 0, 1) - + pynini.closure(year_component + delete_space, 0, 1) - + pynini.closure(month_component + delete_space, 0, 1) + pynini.closure(era_component + SPACE, 0, 1) + + pynini.closure(year_component + SPACE, 0, 1) + + pynini.closure(month_component + SPACE, 0, 1) + pynini.closure(day_component, 0, 1) - + pynini.closure((delete_space + week_component) | (week_component), 0, 1) - ) | month_component + delete_space + week_component + + pynini.closure(SPACE + week_component, 0, 1) + ) | (month_component + SPACE + week_component) final_graph = graph_basic_date diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index 2c56f7f39..bafbf133d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -32,8 +32,8 @@ def __init__(self, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) # Handles square root symbols like "√3" → "루트3" - denominator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE) - numerator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE) + denominator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) + numerator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) # Matches non-root numeric content denominator = pynini.closure(NEMO_NOT_QUOTE - "√") @@ -56,6 +56,7 @@ def __init__(self, deterministic: bool = True): 1, ) + pynutil.insert("분의") + + pynutil.insert(NEMO_SPACE) + numerator_component ) @@ -66,6 +67,7 @@ def __init__(self, deterministic: bool = True): + pynutil.delete('"') + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"')) + pynutil.delete('"') + + pynutil.insert(NEMO_SPACE) ) graph_integer_fraction = graph_integer + delete_space + graph_fraction @@ -77,6 +79,7 @@ def __init__(self, deterministic: bool = True): + pynini.closure(NEMO_NOT_QUOTE - '"') + pynutil.delete('"') + delete_space + + pynutil.insert(NEMO_SPACE) ) # Final graph handles optional negative + (integer + fraction | fraction only) diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py index 09494ef86..9ad80a23c 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -17,7 +17,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main +from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space, NEMO_WHITE_SPACE, NEMO_SIGMA, generator_main from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger @@ -49,11 +49,22 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ token_graph = VerbalizeFst(deterministic=deterministic) token_verbalizer = ( - pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") + pynutil.delete("tokens {") + + delete_space + + token_graph.fst + + delete_space + + pynutil.delete(" }") ) + + space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1) - verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) - + verbalizer = ( + delete_space + + token_verbalizer + + pynini.closure(space_between_tokens + token_verbalizer) + + delete_space + ) + self.fst = verbalizer.optimize() if far_file: diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..fc9b21c29 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko'], default="en", type=str, ) From 43315121c375e40b4ce36ad4f206024210e30685 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 16 Dec 2025 00:07:01 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/cardinal.py | 19 +++++++++---------- .../text_normalization/ko/taggers/decimal.py | 6 +++--- .../text_normalization/ko/taggers/ordinal.py | 12 +++--------- .../ko/taggers/tokenize_and_classify.py | 7 +++---- .../text_normalization/ko/taggers/word.py | 9 +++++---- .../text_normalization/ko/verbalizers/date.py | 2 +- .../ko/verbalizers/verbalize_final.py | 19 +++++++++++-------- 7 files changed, 35 insertions(+), 39 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py index ae6af7414..2df876fea 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -23,9 +23,9 @@ class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) - + # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP) - SEP = pynini.union(",", "’", "'", "\u00A0", "\u2009", "\u202F") + SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f") # Optional small whitespace inside parentheses or after signs WS = pynini.closure(pynini.accep(" "), 0, 2) @@ -59,7 +59,9 @@ def __init__(self, deterministic: bool = True): graph_thousand = thousands @ graph_thousand_component ten_thousands = NEMO_DIGIT**5 - graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))) + pynini.union( + graph_ten_thousand_component = ( + pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만')) + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), @@ -281,7 +283,7 @@ def __init__(self, deterministic: bool = True): NEMO_DIGIT, NEMO_SIGMA, ) - + # Let the number graph accept numbers with separators graph_num_accepting_separators = delete_sep_between_digits @ graph_num @@ -292,18 +294,15 @@ def __init__(self, deterministic: bool = True): # - minus sets negative flag # - plus is ignored (positive number) minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-") - plus_prefix = pynutil.delete("+") + plus_prefix = pynutil.delete("+") # Accounting negative: "( 1,234 )" -> negative + integer:"1234" paren_negative = ( - pynutil.insert('negative: "true" ') - + pynutil.delete("(") + WS - + integer_token - + WS + pynutil.delete(")") + pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")") ) # Signed number: optional (+|-) + integer - signed_integer = ( (minus_prefix | plus_prefix).ques + integer_token ) + signed_integer = (minus_prefix | plus_prefix).ques + integer_token # Prefer accounting-form first, then signed form final_graph = paren_negative | signed_integer diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py index bec6ea87a..6d2d07f66 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, NEMO_SIGMA +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.ko.utils import get_abs_path @@ -31,7 +31,7 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - + # Use the base cardinal graph for the integer part base_integer_graph = cardinal.graph # Only special-case 10000 -> 만 for decimal integer part (if needed) @@ -39,7 +39,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): # Try the special mapping first, then fall back to normal cardinal cardinal_before_decimal = (specials_input | base_integer_graph).optimize() - + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py index f7bd179c3..59fa30ada 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py @@ -65,11 +65,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_ordinal_from40 = graph_cardinal_from40_filtered + pynini.accep("번째") graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+ - + # Single-character particles (가, 이, 은, 는, 로, 도 ...) - josa_single = pynini.union( - "가", "이", "은", "는", "를", "을", "로", "도", "다" - ) + josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다") # Multi-character particles (부터, 까지) josa_multi = pynini.union("부터", "까지") @@ -82,10 +80,6 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_ordinal_with_josa = (graph_ordinal + josa).optimize() # Build the “integer: …” token structure - final_graph = ( - pynutil.insert('integer: "') - + graph_ordinal_with_josa - + pynutil.insert('"') - ) + final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"') self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py index a0389d0e7..e15129c7c 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -18,13 +18,12 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_WHITE_SPACE, GraphFst, - generator_main, - delete_space, delete_extra_space, - NEMO_WHITE_SPACE, + delete_space, + generator_main, ) - from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.ko.taggers.date import DateFst from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py index adbe74d1b..7aa3db709 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/text_normalization/ko/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -from pynini.lib import pynutil import pynini -from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, NEMO_DIGIT, GraphFst +from pynini.lib import pynutil + +from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst class WordFst(GraphFst): @@ -25,11 +26,11 @@ class WordFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="classify", deterministic=deterministic) - + word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT) word = pynutil.insert('name: "') word += pynini.closure(word_char, 1) word += pynutil.insert('"') - + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py index fc4c023a2..bfd5e9aa1 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py @@ -39,7 +39,7 @@ def __init__(self, deterministic: bool = True): ) SPACE = pynini.closure(delete_space, 0, 1) + insert_space - + # This graph now correctly uses the 'delete_space' variable defined above. graph_basic_date = ( pynini.closure(era_component + SPACE, 0, 1) diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py index 9ad80a23c..3ec44eac6 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -17,7 +17,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, delete_extra_space, NEMO_WHITE_SPACE, NEMO_SIGMA, generator_main +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_SIGMA, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger @@ -49,13 +56,9 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ token_graph = VerbalizeFst(deterministic=deterministic) token_verbalizer = ( - pynutil.delete("tokens {") - + delete_space - + token_graph.fst - + delete_space - + pynutil.delete(" }") + pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") ) - + space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1) verbalizer = ( @@ -64,7 +67,7 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ + pynini.closure(space_between_tokens + token_verbalizer) + delete_space ) - + self.fst = verbalizer.optimize() if far_file: From aec257a41a339092ad1cab782b31df50d2991b05 Mon Sep 17 00:00:00 2001 From: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com> Date: Wed, 17 Dec 2025 17:05:55 -0800 Subject: [PATCH 3/4] Add ko electronic extensions and improve electronic/telephone normalization Signed-off-by: Jinwoo Bae <34386414+bbae0312@users.noreply.github.com> --- .../ko/data/electronic/cc_cues.tsv | 2 +- .../ko/data/electronic/domain.tsv | 5 +- .../ko/data/electronic/extensions.tsv | 6 ++ .../ko/taggers/electronic.py | 25 +++++-- .../ko/taggers/telephone.py | 49 ++++++++------ .../ko/verbalizers/electronic.py | 35 +++++++++- .../ko/verbalizers/telephone.py | 5 +- .../test_cases_electronic.txt | 16 +++-- .../test_cases_telephone.txt | 66 +++++++++---------- 9 files changed, 137 insertions(+), 72 deletions(-) create mode 100644 nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv index 2063fe8d1..f562cfbad 100644 --- a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv +++ b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv @@ -1,6 +1,6 @@ 카드 끝자리 카드 끝자리 카드 마지막 네자리 카드 마지막 네자리 -카드 마지막 4자리 카드 마지막 4자리 +카드 마지막 4자리 카드 마지막 네자리 신용카드 번호 신용카드 번호 신용카드 신용카드 체크카드 번호 체크카드 번호 diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv index c51ab615f..cd817d539 100644 --- a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv +++ b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv @@ -26,7 +26,4 @@ .uk 닷 유케이 .br 닷 비알 .in 닷 아이엔 -.ru 닷 알유 -.jpg 닷 제이피지 -.png 닷 피엔지 -.pdf 닷 피디에프 \ No newline at end of file +.ru 닷 알유 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv new file mode 100644 index 000000000..c80d08a69 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv @@ -0,0 +1,6 @@ +.jpg 닷 제이피지 +.png 닷 피엔지 +.pdf 닷 피디에프 +.JPG 닷 제이피지 +.PNG 닷 피엔지 +.PDF 닷 피디에프 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py index 182f29b63..d5e0d495b 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py @@ -121,11 +121,28 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): dollar_accep = pynini.accep("$") excluded_symbols = DOT | dollar_accep | AT filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols) - accepted_characters = ASCII_ALNUM | filtered_symbols # Domain core graph graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize() graph |= graph_domain + known_extensions = pynini.project( + pynini.string_file(get_abs_path("data/electronic/extensions.tsv")), + "input", + ) + + filename_stem = pynini.closure( + pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)), + 1, + ) + + file_with_extension = filename_stem + known_extensions + + graph |= ( + pynutil.insert('domain: "') + + file_with_extension + + pynutil.insert('"') + ).optimize() + # (3) URL with protocol graph |= protocol + insert_space + domain_graph_with_class_tags @@ -144,9 +161,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): four = pynini.closure(NEMO_DIGIT, 4, 4) sep_token = pynini.union(HYPHEN, NEMO_SPACE) - sep_del = pynutil.delete(pynini.closure(sep_token, 1)) # allow mix of - or space - - cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four + sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space + cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four + cc16_grouped = cc16_grouped + delete_space cc16_no_cue = ( pynutil.insert('protocol: "신용카드 " ') diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py index 0a9eb52dc..3f6914fdd 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py @@ -24,11 +24,11 @@ class TelephoneFst(GraphFst): Finite state transducer for classifying Korean telephone numbers. Example inputs → tokens: - +82-10-3713-7050 -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" } - +1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" } - (031)371-3700 -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" } - 010-3713-7050 -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" } - 010.777.8888 -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" } + +82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" } + +1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" } + (031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" } + 010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" } + 010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" } Args: deterministic (bool, optional): If True, provide a single transduction; @@ -37,9 +37,11 @@ class TelephoneFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="telephone", kind="classify", deterministic=deterministic) - - add_sep = pynutil.insert(", ") # standard block separator ", " - + # Separator between digit blocks (e.g., "-" or ".") + add_sep = pynutil.delete("-") | pynutil.delete(".") + # Optional space inserted between blocks + sep_space = insert_space + # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() zero_map = pynini.cross("0", "영") @@ -49,35 +51,40 @@ def __init__(self, deterministic: bool = True): four_digits = digit_ko**4 # country code: "+1", "+82", "+1-" - country_core = ( - pynini.cross("+", "플러스 ") - + pynini.closure(digit_ko + insert_space, 0, 2) - + digit_ko - + pynutil.insert(",") + cc_digits = pynini.closure(digit_ko, 1, 3) + + country_code = ( + pynutil.delete("+") + + pynutil.insert('country_code: "') + + cc_digits + + pynutil.insert('"') + + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) + + delete_space ) - country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"') - country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space - + # area part: "123-" | "123." | "(123)" [space?] or "(123)-" area_core = three_digits area_part = ( - (area_core + (pynutil.delete("-") | pynutil.delete("."))) + (area_core + add_sep) | ( pynutil.delete("(") + area_core - + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-")) + + pynutil.delete(")") + + pynini.closure(pynutil.delete(" "), 0, 1) + + pynini.closure(add_sep, 0, 1) ) - ) + add_sep + ) + sep_space + # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) mid = pynini.union(three_digits, four_digits) last4 = four_digits # consume '-' or '.' between middle and last blocks - number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4 + number_part_core = area_part + mid + add_sep + sep_space + last4 number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"') # final graph: with or without country code - graph = pynini.union(country_code + number_part, number_part).optimize() + graph = pynini.union(country_code + insert_space + number_part, number_part).optimize() self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py index ecbb805cb..d1b29c2f2 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py @@ -89,7 +89,10 @@ def __init__(self, deterministic: bool = True): ) # 5) domain part (handle common endings like .com → 닷컴) - domain_common_pairs = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + domain_common_pairs = ( + pynini.string_file(get_abs_path("data/electronic/domain.tsv")) + | pynini.string_file(get_abs_path("data/electronic/extensions.tsv")) + ).optimize() # Rewrite known domains (.com → 닷컴) tld_rewrite = pynini.cdrewrite( @@ -108,6 +111,10 @@ def __init__(self, deterministic: bool = True): raw_domain = pynini.closure(NEMO_NOT_QUOTE, 1) + four = pynini.closure(NEMO_DIGIT, 4, 4) + cc16_grouped = four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + cc_domain = (cc16_grouped @ digit_inline_rewrite).optimize() + domain = ( pynutil.delete("domain:") + delete_space @@ -122,11 +129,33 @@ def __init__(self, deterministic: bool = True): pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space ) + protocol_raw = ( + pynutil.delete('protocol: "') + + pynini.closure(NEMO_NOT_QUOTE, 1) + + pynutil.delete('"') + ) + cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0) + cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space + + # Credit card case: "신용카드 ..." protocol + 16-digit domain grouped as 4-4-4-4 + cc_graph = ( + cc_protocol + + delete_space + + pynutil.delete("domain:") + + delete_space + + pynutil.delete('"') + + cc_domain + + pynutil.delete('"') + + delete_space + ).optimize() + # 7) Combine: optional protocol + optional username + domain - graph = ( + default_graph = ( pynini.closure(protocol + delete_space, 0, 1) + pynini.closure(user_name + delete_space + pynutil.insert(" 골뱅이 ") + delete_space, 0, 1) + domain + delete_space - ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA) + ).optimize() + + graph = (cc_graph | default_graph) @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA) self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py index 766da5e6f..211358141 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py @@ -29,9 +29,9 @@ class TelephoneFst(GraphFst): [country_code + " "] + number_part [+ ", 내선 " + extension] Examples: - telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" } + telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" } -> 플러스 팔 이, 영일영, 삼칠일삼, 칠영오영 - telephone { number_part: "팔영영, 오오오, 영영영영" extension: "이삼사" } + telephone { number_part: "팔영영 오오오 영영영영" extension: "이삼사" } -> 팔영영, 오오오, 영영영영, 내선 이삼사 Args: @@ -45,6 +45,7 @@ def __init__(self, deterministic: bool = True): country = ( pynini.closure(delete_space, 0, 1) + pynutil.delete('country_code: "') + + pynutil.insert("국가번호 ") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt index 70cbab631..4e09d0db2 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt @@ -6,7 +6,15 @@ a@msn.it~a 골뱅이 msn 닷 아이티 abc@nvidia.app~abc 골뱅이 nvidia 닷 앱 user01@gmail.co.kr~user영일 골뱅이 gmail 닷 씨오 닷 케이알 nvidia.co.kr~nvidia 닷 씨오 닷 케이알 -1234-5678-9012-3456~신용카드 일이삼사오육칠팔구영일이삼사오육 -카드 마지막 4자리 3456~카드 마지막 4자리 삼사오육 -카드 마지막 4자리 7890~카드 마지막 4자리 칠팔구영 -카드 끝자리 3456~카드 끝자리 삼사오육 \ No newline at end of file +1234-5678-9012-3456~신용카드 일이삼사 오육칠팔 구영일이 삼사오육 +2345-2222-3333-4444~신용카드 이삼사오 이이이이 삼삼삼삼 사사사사 +9090-1234-5555-9876~신용카드 구영구영 일이삼사 오오오오 구팔칠육 +카드 마지막 네자리 3456~카드 마지막 네자리 삼사오육 +카드 마지막 4자리 7890~카드 마지막 네자리 칠팔구영 +카드 끝자리 3456~카드 끝자리 삼사오육 +사진.jpg~사진 닷 제이피지 +사진.JPG~사진 닷 제이피지 +사진.png~사진 닷 피엔지 +사진.PNG~사진 닷 피엔지 +문서.pdf~문서 닷 피디에프 +문서.PDF~문서 닷 피디에프 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt index dfcb7c3e9..b6e573aec 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt @@ -1,33 +1,33 @@ -+1 123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -123-123-5678~일이삼, 일이삼, 오육칠팔 -+1-123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -+1 (123)-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -(123)-123-5678~일이삼, 일이삼, 오육칠팔 -555.555.5555~오오오, 오오오, 오오오오 -(123) 123-5678~일이삼, 일이삼, 오육칠팔 -010-3713-7050~영일영, 삼칠일삼, 칠영오영 -+82 123-456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영 -+82-123-4567-8901~플러스 팔 이, 일이삼, 사오육칠, 팔구영일 -+44-207-555-1234~플러스 사 사, 이영칠, 오오오, 일이삼사 -123.456-7890~일이삼, 사오육, 칠팔구영 -123-456.7890~일이삼, 사오육, 칠팔구영 -(987)-654-3210~구팔칠, 육오사, 삼이일영 -(987) 654-3210~구팔칠, 육오사, 삼이일영 -+7 000-000-0000~플러스 칠, 영영영, 영영영, 영영영영 -000.000.0000~영영영, 영영영, 영영영영 -271-828-1828~이칠일, 팔이팔, 일팔이팔 -314-159-2653~삼일사, 일오구, 이육오삼 -(010) 123-4567~영일영, 일이삼, 사오육칠 -+358-123-456-7890~플러스 삼 오 팔, 일이삼, 사오육, 칠팔구영 -+1 800-555-0000~플러스 일, 팔영영, 오오오, 영영영영 -(800) 555-0000~팔영영, 오오오, 영영영영 -+12 345-678-9012~플러스 일 이, 삼사오, 육칠팔, 구영일이 -+999 999-999-9999~플러스 구 구 구, 구구구, 구구구, 구구구구 -321.654.0987~삼이일, 육오사, 영구팔칠 -+82 010-1234-5678~플러스 팔 이, 영일영, 일이삼사, 오육칠팔 -(999)-000-0000~구구구, 영영영, 영영영영 -+1-123.456.7890~플러스 일, 일이삼, 사오육, 칠팔구영 -+82-123.456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영 -111-222-3333~일일일, 이이이, 삼삼삼삼 -909-808-7070~구영구, 팔영팔, 칠영칠영 -(555)555-5555~오오오, 오오오, 오오오오 ++1 123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +123-123-5678~일이삼 일이삼 오육칠팔 ++1-123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 ++1 (123)-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +(123)-123-5678~일이삼 일이삼 오육칠팔 +555.555.5555~오오오 오오오 오오오오 +(123) 123-5678~일이삼 일이삼 오육칠팔 +010-3713-7050~영일영 삼칠일삼 칠영오영 ++82 123-456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 ++82-123-4567-8901~국가번호 팔이 일이삼 사오육칠 팔구영일 ++44-207-555-1234~국가번호 사사 이영칠 오오오 일이삼사 +123.456-7890~일이삼 사오육 칠팔구영 +123-456.7890~일이삼 사오육 칠팔구영 +(987)-654-3210~구팔칠 육오사 삼이일영 +(987) 654-3210~구팔칠 육오사 삼이일영 ++7 000-000-0000~국가번호 칠 영영영 영영영 영영영영 +000.000.0000~영영영 영영영 영영영영 +271-828-1828~이칠일 팔이팔 일팔이팔 +314-159-2653~삼일사 일오구 이육오삼 +(010) 123-4567~영일영 일이삼 사오육칠 ++358-123-456-7890~국가번호 삼오팔 일이삼 사오육 칠팔구영 ++1 800-555-0000~국가번호 일 팔영영 오오오 영영영영 +(800) 555-0000~팔영영 오오오 영영영영 ++12 345-678-9012~국가번호 일이 삼사오 육칠팔 구영일이 ++999 999-999-9999~국가번호 구구구 구구구 구구구 구구구구 +321.654.0987~삼이일 육오사 영구팔칠 ++82 010-1234-5678~국가번호 팔이 영일영 일이삼사 오육칠팔 +(999)-000-0000~구구구 영영영 영영영영 ++1-123.456.7890~국가번호 일 일이삼 사오육 칠팔구영 ++82-123.456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 +111-222-3333~일일일 이이이 삼삼삼삼 +909-808-7070~구영구 팔영팔 칠영칠영 +(555)555-5555~오오오 오오오 오오오오 \ No newline at end of file From aebe1f2e9e3e95521ea542baf5c15a4a8d1168f8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 18 Dec 2025 01:11:49 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../text_normalization/ko/taggers/electronic.py | 8 ++------ .../text_normalization/ko/taggers/telephone.py | 5 ++--- .../text_normalization/ko/verbalizers/electronic.py | 6 +----- 3 files changed, 5 insertions(+), 14 deletions(-) diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py index d5e0d495b..80d014263 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py @@ -135,13 +135,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): 1, ) - file_with_extension = filename_stem + known_extensions + file_with_extension = filename_stem + known_extensions - graph |= ( - pynutil.insert('domain: "') - + file_with_extension - + pynutil.insert('"') - ).optimize() + graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize() # (3) URL with protocol graph |= protocol + insert_space + domain_graph_with_class_tags diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py index 3f6914fdd..04bbb9aa8 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py @@ -41,7 +41,7 @@ def __init__(self, deterministic: bool = True): add_sep = pynutil.delete("-") | pynutil.delete(".") # Optional space inserted between blocks sep_space = insert_space - + # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() zero_map = pynini.cross("0", "영") @@ -61,7 +61,7 @@ def __init__(self, deterministic: bool = True): + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) + delete_space ) - + # area part: "123-" | "123." | "(123)" [space?] or "(123)-" area_core = three_digits area_part = ( @@ -75,7 +75,6 @@ def __init__(self, deterministic: bool = True): ) ) + sep_space - # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) mid = pynini.union(three_digits, four_digits) last4 = four_digits diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py index d1b29c2f2..c880e432d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py @@ -129,11 +129,7 @@ def __init__(self, deterministic: bool = True): pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space ) - protocol_raw = ( - pynutil.delete('protocol: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('"') - ) + protocol_raw = pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0) cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space