diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv index 2063fe8d1..f562cfbad 100644 --- a/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv +++ b/nemo_text_processing/text_normalization/ko/data/electronic/cc_cues.tsv @@ -1,6 +1,6 @@ 카드 끝자리 카드 끝자리 카드 마지막 네자리 카드 마지막 네자리 -카드 마지막 4자리 카드 마지막 4자리 +카드 마지막 4자리 카드 마지막 네자리 신용카드 번호 신용카드 번호 신용카드 신용카드 체크카드 번호 체크카드 번호 diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv index c51ab615f..cd817d539 100644 --- a/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv +++ b/nemo_text_processing/text_normalization/ko/data/electronic/domain.tsv @@ -26,7 +26,4 @@ .uk 닷 유케이 .br 닷 비알 .in 닷 아이엔 -.ru 닷 알유 -.jpg 닷 제이피지 -.png 닷 피엔지 -.pdf 닷 피디에프 \ No newline at end of file +.ru 닷 알유 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv new file mode 100644 index 000000000..c80d08a69 --- /dev/null +++ b/nemo_text_processing/text_normalization/ko/data/electronic/extensions.tsv @@ -0,0 +1,6 @@ +.jpg 닷 제이피지 +.png 닷 피엔지 +.pdf 닷 피디에프 +.JPG 닷 제이피지 +.PNG 닷 피엔지 +.PDF 닷 피디에프 \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py index 9454203c9..2df876fea 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/cardinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/cardinal.py @@ -16,13 +16,19 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst from 
nemo_text_processing.text_normalization.ko.utils import get_abs_path class CardinalFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="cardinal", kind="classify", deterministic=deterministic) + + # Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP) + SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f") + # Optional small whitespace inside parentheses or after signs + WS = pynini.closure(pynini.accep(" "), 0, 2) + # Load base .tsv files graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv")) @@ -53,7 +59,9 @@ def __init__(self, deterministic: bool = True): graph_thousand = thousands @ graph_thousand_component ten_thousands = NEMO_DIGIT**5 - graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union( + graph_ten_thousand_component = ( + pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만')) + ) + pynini.union( pynini.closure(pynutil.delete('0')), graph_thousand_component, (pynutil.delete('0') + graph_hundred_component), @@ -268,8 +276,38 @@ def __init__(self, deterministic: bool = True): ).optimize() # Sign and final formatting - optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1) - final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"') + # Delete group separators when they appear between digits (e.g., "1,234" -> "1234") + delete_sep_between_digits = pynini.cdrewrite( + pynutil.delete(SEP), + NEMO_DIGIT, + NEMO_DIGIT, + NEMO_SIGMA, + ) + + # Let the number graph accept numbers with separators + graph_num_accepting_separators = delete_sep_between_digits @ graph_num + + # Build the integer token (integer: "...") + integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"') + + # Sign handling: + # - minus sets negative flag + # 
- plus is ignored (positive number) + minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-") + plus_prefix = pynutil.delete("+") + + # Accounting negative: "( 1,234 )" -> negative + integer:"1234" + paren_negative = ( + pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")") + ) + + # Signed number: optional (+|-) + integer + signed_integer = (minus_prefix | plus_prefix).ques + integer_token + + # Accept either the accounting form or the signed form (plain union; no ordering is implied) + final_graph = paren_negative | signed_integer + + # Wrap with class tokens and finalize final_graph = self.add_tokens(final_graph) self.fst = final_graph.optimize() - self.graph = graph_num.optimize() + self.graph = graph_num_accepting_separators.optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/decimal.py b/nemo_text_processing/text_normalization/ko/taggers/decimal.py index 787acf817..6d2d07f66 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/decimal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/decimal.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst from nemo_text_processing.text_normalization.ko.utils import get_abs_path @@ -32,7 +32,14 @@ class DecimalFst(GraphFst): def __init__(self, cardinal: GraphFst, deterministic: bool = True): super().__init__(name="decimal", kind="classify", deterministic=deterministic) - cardinal_before_decimal = cardinal.graph + # Use the base cardinal graph for the integer part + base_integer_graph = cardinal.graph + # Only special-case 10000 -> 만 for decimal integer part (if needed) + specials_input = pynini.cross("10000", "만") + + # Accept either the special mapping or the normal cardinal graph (plain union; no ordering is implied) + cardinal_before_decimal = (specials_input | 
base_integer_graph).optimize() + cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv")) zero = pynini.string_file(get_abs_path("data/number/zero.tsv")) diff --git a/nemo_text_processing/text_normalization/ko/taggers/electronic.py b/nemo_text_processing/text_normalization/ko/taggers/electronic.py index 182f29b63..80d014263 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/taggers/electronic.py @@ -121,11 +121,24 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): dollar_accep = pynini.accep("$") excluded_symbols = DOT | dollar_accep | AT filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols) - accepted_characters = ASCII_ALNUM | filtered_symbols # Domain core graph graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize() graph |= graph_domain + known_extensions = pynini.project( + pynini.string_file(get_abs_path("data/electronic/extensions.tsv")), + "input", + ) + + filename_stem = pynini.closure( + pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)), + 1, + ) + + file_with_extension = filename_stem + known_extensions + + graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize() + # (3) URL with protocol graph |= protocol + insert_space + domain_graph_with_class_tags @@ -144,9 +157,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): four = pynini.closure(NEMO_DIGIT, 4, 4) sep_token = pynini.union(HYPHEN, NEMO_SPACE) - sep_del = pynutil.delete(pynini.closure(sep_token, 1)) # allow mix of - or space - - cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four + sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space + cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four + cc16_grouped = cc16_grouped + delete_space cc16_no_cue = ( pynutil.insert('protocol: "신용카드 " 
') diff --git a/nemo_text_processing/text_normalization/ko/taggers/fraction.py b/nemo_text_processing/text_normalization/ko/taggers/fraction.py index ebd7ee7ef..4e30ef1c6 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/taggers/fraction.py @@ -15,7 +15,7 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst +from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space from nemo_text_processing.text_normalization.ko.utils import get_abs_path @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): + pynutil.insert(DOUBLE_QUOTE) ) - integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE) + integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE) # Denominator and numerator denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE) diff --git a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py index 66feaa727..59fa30ada 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/ordinal.py +++ b/nemo_text_processing/text_normalization/ko/taggers/ordinal.py @@ -66,5 +66,20 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True): graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+ - final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"') + # Single-character particles (가, 이, 은, 는, 로, 도 ...) 
+ josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다") + + # Multi-character particles (부터, 까지) + josa_multi = pynini.union("부터", "까지") + + # Allow patterns like: + # 번째 + (optional single-josa) + (optional multi-josa) + josa = (josa_single.ques + josa_multi.ques).optimize() + + # Final ordinal graph with optional particles + graph_ordinal_with_josa = (graph_ordinal + josa).optimize() + + # Build the “integer: …” token structure + final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"') + self.fst = self.add_tokens(final_graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/telephone.py b/nemo_text_processing/text_normalization/ko/taggers/telephone.py index 0a9eb52dc..04bbb9aa8 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/telephone.py +++ b/nemo_text_processing/text_normalization/ko/taggers/telephone.py @@ -24,11 +24,11 @@ class TelephoneFst(GraphFst): Finite state transducer for classifying Korean telephone numbers. 
Example inputs → tokens: - +82-10-3713-7050 -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" } - +1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" } - (031)371-3700 -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" } - 010-3713-7050 -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" } - 010.777.8888 -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" } + +82 010-3713-7050 -> telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" } + +1 (415) 555-0123 -> telephone { country_code: "일" number_part: "사일오 오오오 영일이삼" } + (031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" } + 010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" } + 010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" } Args: deterministic (bool, optional): If True, provide a single transduction; @@ -37,8 +37,10 @@ class TelephoneFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="telephone", kind="classify", deterministic=deterministic) - - add_sep = pynutil.insert(", ") # standard block separator ", " + # Separator between digit blocks (e.g., "-" or ".") + add_sep = pynutil.delete("-") | pynutil.delete(".") + # Optional space inserted between blocks + sep_space = insert_space # 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert) digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize() @@ -49,35 +51,39 @@ def __init__(self, deterministic: bool = True): four_digits = digit_ko**4 # country code: "+1", "+82", "+1-" - country_core = ( - pynini.cross("+", "플러스 ") - + pynini.closure(digit_ko + insert_space, 0, 2) - + digit_ko - + pynutil.insert(",") + cc_digits = pynini.closure(digit_ko, 1, 3) + + country_code = ( + pynutil.delete("+") + + pynutil.insert('country_code: "') + + cc_digits + + pynutil.insert('"') + + pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1) + + delete_space ) - country_code = pynutil.insert('country_code: "') + country_core + 
pynutil.insert('"') - country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space # area part: "123-" | "123." | "(123)" [space?] or "(123)-" area_core = three_digits area_part = ( - (area_core + (pynutil.delete("-") | pynutil.delete("."))) + (area_core + add_sep) | ( pynutil.delete("(") + area_core - + ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-")) + + pynutil.delete(")") + + pynini.closure(pynutil.delete(" "), 0, 1) + + pynini.closure(add_sep, 0, 1) ) - ) + add_sep + ) + sep_space # 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050) mid = pynini.union(three_digits, four_digits) last4 = four_digits # consume '-' or '.' between middle and last blocks - number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4 + number_part_core = area_part + mid + add_sep + sep_space + last4 number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"') # final graph: with or without country code - graph = pynini.union(country_code + number_part, number_part).optimize() + graph = pynini.union(country_code + insert_space + number_part, number_part).optimize() self.fst = self.add_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py index 80c15aa70..e15129c7c 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/text_normalization/ko/taggers/tokenize_and_classify.py @@ -17,7 +17,13 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) from 
nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst from nemo_text_processing.text_normalization.ko.taggers.date import DateFst from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst @@ -98,9 +104,14 @@ def __init__( ) token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }") - tagger = pynini.closure(token, 1) + space = pynini.closure(NEMO_WHITE_SPACE, 1) + space = pynini.compose(space, delete_extra_space) - self.fst = tagger.optimize() + space_opt = pynini.closure(space, 0, 1) + + graph = delete_space + token + pynini.closure(space_opt + token) + delete_space + + self.fst = graph.optimize() if far_file: generator_main(far_file, {"tokenize_and_classify": self.fst}) diff --git a/nemo_text_processing/text_normalization/ko/taggers/word.py b/nemo_text_processing/text_normalization/ko/taggers/word.py index 356bc04ca..7aa3db709 100644 --- a/nemo_text_processing/text_normalization/ko/taggers/word.py +++ b/nemo_text_processing/text_normalization/ko/taggers/word.py @@ -12,9 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, GraphFst +from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst class WordFst(GraphFst): @@ -25,5 +26,11 @@ class WordFst(GraphFst): def __init__(self, deterministic: bool = True): super().__init__(name="word", kind="classify", deterministic=deterministic) - word = pynutil.insert("name: \"") + NEMO_NOT_SPACE + pynutil.insert("\"") + + word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT) + + word = pynutil.insert('name: "') + word += pynini.closure(word_char, 1) + word += pynutil.insert('"') + self.fst = word.optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/date.py b/nemo_text_processing/text_normalization/ko/verbalizers/date.py index aa32529f3..bfd5e9aa1 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/date.py @@ -38,14 +38,16 @@ def __init__(self, deterministic: bool = True): + pynutil.delete("\"") ) + SPACE = pynini.closure(delete_space, 0, 1) + insert_space + # This graph now correctly uses the 'delete_space' variable defined above. 
graph_basic_date = ( - pynini.closure(era_component + delete_space, 0, 1) - + pynini.closure(year_component + delete_space, 0, 1) - + pynini.closure(month_component + delete_space, 0, 1) + pynini.closure(era_component + SPACE, 0, 1) + + pynini.closure(year_component + SPACE, 0, 1) + + pynini.closure(month_component + SPACE, 0, 1) + pynini.closure(day_component, 0, 1) - + pynini.closure((delete_space + week_component) | (week_component), 0, 1) - ) | month_component + delete_space + week_component + + pynini.closure(SPACE + week_component, 0, 1) + ) | (month_component + SPACE + week_component) final_graph = graph_basic_date diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py index ecbb805cb..c880e432d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/electronic.py @@ -89,7 +89,10 @@ def __init__(self, deterministic: bool = True): ) # 5) domain part (handle common endings like .com → 닷컴) - domain_common_pairs = pynini.string_file(get_abs_path("data/electronic/domain.tsv")).optimize() + domain_common_pairs = ( + pynini.string_file(get_abs_path("data/electronic/domain.tsv")) + | pynini.string_file(get_abs_path("data/electronic/extensions.tsv")) + ).optimize() # Rewrite known domains (.com → 닷컴) tld_rewrite = pynini.cdrewrite( @@ -108,6 +111,10 @@ def __init__(self, deterministic: bool = True): raw_domain = pynini.closure(NEMO_NOT_QUOTE, 1) + four = pynini.closure(NEMO_DIGIT, 4, 4) + cc16_grouped = four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + pynutil.insert(" ") + four + cc_domain = (cc16_grouped @ digit_inline_rewrite).optimize() + domain = ( pynutil.delete("domain:") + delete_space @@ -122,11 +129,29 @@ def __init__(self, deterministic: bool = True): pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space ) + 
protocol_raw = pynutil.delete('protocol: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + cc_protocol_guard = pynini.accep("신용카드") + pynini.closure(NEMO_NOT_QUOTE, 0) + cc_protocol = (protocol_raw @ cc_protocol_guard) + insert_space + + # Credit card case: "신용카드 ..." protocol + 16-digit domain grouped as 4-4-4-4 + cc_graph = ( + cc_protocol + + delete_space + + pynutil.delete("domain:") + + delete_space + + pynutil.delete('"') + + cc_domain + + pynutil.delete('"') + + delete_space + ).optimize() + # 7) Combine: optional protocol + optional username + domain - graph = ( + default_graph = ( pynini.closure(protocol + delete_space, 0, 1) + pynini.closure(user_name + delete_space + pynutil.insert(" 골뱅이 ") + delete_space, 0, 1) + domain + delete_space - ).optimize() @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA) + ).optimize() + + graph = (cc_graph | default_graph) @ pynini.cdrewrite(delete_extra_space, "", "", NEMO_SIGMA) self.fst = self.delete_tokens(graph).optimize() diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py index 2c56f7f39..bafbf133d 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/fraction.py @@ -32,8 +32,8 @@ def __init__(self, deterministic: bool = True): super().__init__(name="fraction", kind="verbalize", deterministic=deterministic) # Handles square root symbols like "√3" → "루트3" - denominator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE) - numerator_root = pynini.cross("√", "루트") + pynini.closure(NEMO_NOT_QUOTE) + denominator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) + numerator_root = pynini.cross("√", "루트") + pynutil.insert(NEMO_SPACE) + pynini.closure(NEMO_NOT_QUOTE) # Matches non-root numeric content denominator = pynini.closure(NEMO_NOT_QUOTE - "√") @@ -56,6 +56,7 @@ 
def __init__(self, deterministic: bool = True): 1, ) + pynutil.insert("분의") + + pynutil.insert(NEMO_SPACE) + numerator_component ) @@ -66,6 +67,7 @@ def __init__(self, deterministic: bool = True): + pynutil.delete('"') + pynini.closure(pynini.union("√", ".", NEMO_NOT_QUOTE - '"')) + pynutil.delete('"') + + pynutil.insert(NEMO_SPACE) ) graph_integer_fraction = graph_integer + delete_space + graph_fraction @@ -77,6 +79,7 @@ def __init__(self, deterministic: bool = True): + pynini.closure(NEMO_NOT_QUOTE - '"') + pynutil.delete('"') + delete_space + + pynutil.insert(NEMO_SPACE) ) # Final graph handles optional negative + (integer + fraction | fraction only) diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py index 766da5e6f..211358141 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py +++ b/nemo_text_processing/text_normalization/ko/verbalizers/telephone.py @@ -29,9 +29,9 @@ class TelephoneFst(GraphFst): [country_code + " "] + number_part [+ ", 내선 " + extension] Examples: - telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" } + telephone { country_code: "팔이" number_part: "영일영 삼칠일삼 칠영오영" } -> 플러스 팔 이, 영일영, 삼칠일삼, 칠영오영 - telephone { number_part: "팔영영, 오오오, 영영영영" extension: "이삼사" } + telephone { number_part: "팔영영 오오오 영영영영" extension: "이삼사" } -> 팔영영, 오오오, 영영영영, 내선 이삼사 Args: @@ -45,6 +45,7 @@ def __init__(self, deterministic: bool = True): country = ( pynini.closure(delete_space, 0, 1) + pynutil.delete('country_code: "') + + pynutil.insert("국가번호 ") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') + insert_space diff --git a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py index 09494ef86..3ec44eac6 100644 --- a/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py +++ 
b/nemo_text_processing/text_normalization/ko/verbalizers/verbalize_final.py @@ -17,7 +17,14 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, delete_space, generator_main +from nemo_text_processing.text_normalization.ko.graph_utils import ( + NEMO_SIGMA, + NEMO_WHITE_SPACE, + GraphFst, + delete_extra_space, + delete_space, + generator_main, +) from nemo_text_processing.text_normalization.ko.verbalizers.verbalize import VerbalizeFst from nemo_text_processing.utils.logging import logger @@ -52,7 +59,14 @@ def __init__(self, deterministic: bool = True, cache_dir: str = None, overwrite_ pynutil.delete("tokens {") + delete_space + token_graph.fst + delete_space + pynutil.delete(" }") ) - verbalizer = pynini.closure(delete_space + token_verbalizer + delete_space) + space_between_tokens = pynini.closure(NEMO_WHITE_SPACE, 1) + + verbalizer = ( + delete_space + + token_verbalizer + + pynini.closure(space_between_tokens + token_verbalizer) + + delete_space + ) self.fst = verbalizer.optimize() diff --git a/nemo_text_processing/text_normalization/run_evaluate.py b/nemo_text_processing/text_normalization/run_evaluate.py index 0438579a7..fc9b21c29 100644 --- a/nemo_text_processing/text_normalization/run_evaluate.py +++ b/nemo_text_processing/text_normalization/run_evaluate.py @@ -35,7 +35,7 @@ def parse_args(): parser.add_argument( "--lang", help="language", - choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi'], + choices=['ar', 'de', 'en', 'es', 'fr', 'hu', 'it', 'ru', 'sv', 'zh', 'hy', 'hi', 'ko'], default="en", type=str, ) diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt index 70cbab631..4e09d0db2 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt +++ 
b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_electronic.txt @@ -6,7 +6,15 @@ a@msn.it~a 골뱅이 msn 닷 아이티 abc@nvidia.app~abc 골뱅이 nvidia 닷 앱 user01@gmail.co.kr~user영일 골뱅이 gmail 닷 씨오 닷 케이알 nvidia.co.kr~nvidia 닷 씨오 닷 케이알 -1234-5678-9012-3456~신용카드 일이삼사오육칠팔구영일이삼사오육 -카드 마지막 4자리 3456~카드 마지막 4자리 삼사오육 -카드 마지막 4자리 7890~카드 마지막 4자리 칠팔구영 -카드 끝자리 3456~카드 끝자리 삼사오육 \ No newline at end of file +1234-5678-9012-3456~신용카드 일이삼사 오육칠팔 구영일이 삼사오육 +2345-2222-3333-4444~신용카드 이삼사오 이이이이 삼삼삼삼 사사사사 +9090-1234-5555-9876~신용카드 구영구영 일이삼사 오오오오 구팔칠육 +카드 마지막 네자리 3456~카드 마지막 네자리 삼사오육 +카드 마지막 4자리 7890~카드 마지막 네자리 칠팔구영 +카드 끝자리 3456~카드 끝자리 삼사오육 +사진.jpg~사진 닷 제이피지 +사진.JPG~사진 닷 제이피지 +사진.png~사진 닷 피엔지 +사진.PNG~사진 닷 피엔지 +문서.pdf~문서 닷 피디에프 +문서.PDF~문서 닷 피디에프 \ No newline at end of file diff --git a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt index dfcb7c3e9..b6e573aec 100644 --- a/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt +++ b/tests/nemo_text_processing/ko/data_text_normalization/test_cases_telephone.txt @@ -1,33 +1,33 @@ -+1 123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -123-123-5678~일이삼, 일이삼, 오육칠팔 -+1-123-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -+1 (123)-123-5678~플러스 일, 일이삼, 일이삼, 오육칠팔 -(123)-123-5678~일이삼, 일이삼, 오육칠팔 -555.555.5555~오오오, 오오오, 오오오오 -(123) 123-5678~일이삼, 일이삼, 오육칠팔 -010-3713-7050~영일영, 삼칠일삼, 칠영오영 -+82 123-456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영 -+82-123-4567-8901~플러스 팔 이, 일이삼, 사오육칠, 팔구영일 -+44-207-555-1234~플러스 사 사, 이영칠, 오오오, 일이삼사 -123.456-7890~일이삼, 사오육, 칠팔구영 -123-456.7890~일이삼, 사오육, 칠팔구영 -(987)-654-3210~구팔칠, 육오사, 삼이일영 -(987) 654-3210~구팔칠, 육오사, 삼이일영 -+7 000-000-0000~플러스 칠, 영영영, 영영영, 영영영영 -000.000.0000~영영영, 영영영, 영영영영 -271-828-1828~이칠일, 팔이팔, 일팔이팔 -314-159-2653~삼일사, 일오구, 이육오삼 -(010) 123-4567~영일영, 일이삼, 사오육칠 -+358-123-456-7890~플러스 삼 오 팔, 일이삼, 사오육, 칠팔구영 -+1 800-555-0000~플러스 일, 팔영영, 오오오, 영영영영 -(800) 555-0000~팔영영, 오오오, 영영영영 -+12 345-678-9012~플러스 일 이, 삼사오, 육칠팔, 구영일이 -+999 
999-999-9999~플러스 구 구 구, 구구구, 구구구, 구구구구 -321.654.0987~삼이일, 육오사, 영구팔칠 -+82 010-1234-5678~플러스 팔 이, 영일영, 일이삼사, 오육칠팔 -(999)-000-0000~구구구, 영영영, 영영영영 -+1-123.456.7890~플러스 일, 일이삼, 사오육, 칠팔구영 -+82-123.456-7890~플러스 팔 이, 일이삼, 사오육, 칠팔구영 -111-222-3333~일일일, 이이이, 삼삼삼삼 -909-808-7070~구영구, 팔영팔, 칠영칠영 -(555)555-5555~오오오, 오오오, 오오오오 ++1 123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +123-123-5678~일이삼 일이삼 오육칠팔 ++1-123-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 ++1 (123)-123-5678~국가번호 일 일이삼 일이삼 오육칠팔 +(123)-123-5678~일이삼 일이삼 오육칠팔 +555.555.5555~오오오 오오오 오오오오 +(123) 123-5678~일이삼 일이삼 오육칠팔 +010-3713-7050~영일영 삼칠일삼 칠영오영 ++82 123-456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 ++82-123-4567-8901~국가번호 팔이 일이삼 사오육칠 팔구영일 ++44-207-555-1234~국가번호 사사 이영칠 오오오 일이삼사 +123.456-7890~일이삼 사오육 칠팔구영 +123-456.7890~일이삼 사오육 칠팔구영 +(987)-654-3210~구팔칠 육오사 삼이일영 +(987) 654-3210~구팔칠 육오사 삼이일영 ++7 000-000-0000~국가번호 칠 영영영 영영영 영영영영 +000.000.0000~영영영 영영영 영영영영 +271-828-1828~이칠일 팔이팔 일팔이팔 +314-159-2653~삼일사 일오구 이육오삼 +(010) 123-4567~영일영 일이삼 사오육칠 ++358-123-456-7890~국가번호 삼오팔 일이삼 사오육 칠팔구영 ++1 800-555-0000~국가번호 일 팔영영 오오오 영영영영 +(800) 555-0000~팔영영 오오오 영영영영 ++12 345-678-9012~국가번호 일이 삼사오 육칠팔 구영일이 ++999 999-999-9999~국가번호 구구구 구구구 구구구 구구구구 +321.654.0987~삼이일 육오사 영구팔칠 ++82 010-1234-5678~국가번호 팔이 영일영 일이삼사 오육칠팔 +(999)-000-0000~구구구 영영영 영영영영 ++1-123.456.7890~국가번호 일 일이삼 사오육 칠팔구영 ++82-123.456-7890~국가번호 팔이 일이삼 사오육 칠팔구영 +111-222-3333~일일일 이이이 삼삼삼삼 +909-808-7070~구영구 팔영팔 칠영칠영 +(555)555-5555~오오오 오오오 오오오오 \ No newline at end of file