Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
카드 끝자리 카드 끝자리
카드 마지막 네자리 카드 마지막 네자리
카드 마지막 4자리 카드 마지막 4자리
카드 마지막 4자리 카드 마지막 네자리
신용카드 번호 신용카드 번호
신용카드 신용카드
체크카드 번호 체크카드 번호
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,4 @@
.uk 닷 유케이
.br 닷 비알
.in 닷 아이엔
.ru 닷 알유
.jpg 닷 제이피지
.png 닷 피엔지
.pdf 닷 피디에프
.ru 닷 알유
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
.jpg 닷 제이피지
.png 닷 피엔지
.pdf 닷 피디에프
.JPG 닷 제이피지
.PNG 닷 피엔지
.PDF 닷 피디에프
48 changes: 43 additions & 5 deletions nemo_text_processing/text_normalization/ko/taggers/cardinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,19 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, GraphFst
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_DIGIT, NEMO_SIGMA, GraphFst
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


class CardinalFst(GraphFst):
def __init__(self, deterministic: bool = True):
super().__init__(name="cardinal", kind="classify", deterministic=deterministic)

# Grouping separators to remove inside numbers (e.g., "1,234", "1’234", NBSP)
SEP = pynini.union(",", "’", "'", "\u00a0", "\u2009", "\u202f")
# Up to two optional spaces, allowed just inside the parentheses of an accounting negative
WS = pynini.closure(pynini.accep(" "), 0, 2)

# Load base .tsv files
graph_zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/number/digit.tsv"))
Expand Down Expand Up @@ -53,7 +59,9 @@ def __init__(self, deterministic: bool = True):
graph_thousand = thousands @ graph_thousand_component

ten_thousands = NEMO_DIGIT**5
graph_ten_thousand_component = (pynini.cross('1', '만') | (graph_digit + pynutil.insert('만'))) + pynini.union(
graph_ten_thousand_component = (
pynini.cross('1', '만') | (graph_digit_no_zero_one + pynutil.insert('만'))
) + pynini.union(
pynini.closure(pynutil.delete('0')),
graph_thousand_component,
(pynutil.delete('0') + graph_hundred_component),
Expand Down Expand Up @@ -268,8 +276,38 @@ def __init__(self, deterministic: bool = True):
).optimize()

# Sign and final formatting
optional_sign = pynini.closure(pynutil.insert('negative: "true" ') + pynini.cross("-", ""), 0, 1)
final_graph = optional_sign + pynutil.insert('integer: "') + graph_num + pynutil.insert('"')
# Delete group separators when they appear between digits (e.g., "1,234" -> "1234")
delete_sep_between_digits = pynini.cdrewrite(
pynutil.delete(SEP),
NEMO_DIGIT,
NEMO_DIGIT,
NEMO_SIGMA,
)

# Let the number graph accept numbers with separators
graph_num_accepting_separators = delete_sep_between_digits @ graph_num

# Build the integer token (integer: "...")
integer_token = pynutil.insert('integer: "') + graph_num_accepting_separators + pynutil.insert('"')

# Sign handling:
# - minus sets negative flag
# - plus is ignored (positive number)
minus_prefix = pynutil.insert('negative: "true" ') + pynutil.delete("-")
plus_prefix = pynutil.delete("+")

# Accounting negative: "( 1,234 )" -> negative + integer:"1234"
paren_negative = (
pynutil.insert('negative: "true" ') + pynutil.delete("(") + WS + integer_token + WS + pynutil.delete(")")
)

# Signed number: optional (+|-) + integer
signed_integer = (minus_prefix | plus_prefix).ques + integer_token

# Accept either the accounting form or the signed form (a plain union; no priority is implied)
final_graph = paren_negative | signed_integer

# Wrap with class tokens and finalize
final_graph = self.add_tokens(final_graph)
self.fst = final_graph.optimize()
self.graph = graph_num.optimize()
self.graph = graph_num_accepting_separators.optimize()
11 changes: 9 additions & 2 deletions nemo_text_processing/text_normalization/ko/taggers/decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SIGMA, NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


Expand All @@ -32,7 +32,14 @@ class DecimalFst(GraphFst):
def __init__(self, cardinal: GraphFst, deterministic: bool = True):
super().__init__(name="decimal", kind="classify", deterministic=deterministic)

cardinal_before_decimal = cardinal.graph
# Use the base cardinal graph for the integer part
base_integer_graph = cardinal.graph
# Only special-case 10000 -> 만 for decimal integer part (if needed)
specials_input = pynini.cross("10000", "만")

# Accept either the special mapping or the normal cardinal graph (a plain union; no priority is implied)
cardinal_before_decimal = (specials_input | base_integer_graph).optimize()

cardinal_after_decimal = pynini.string_file(get_abs_path("data/number/digit.tsv"))
zero = pynini.string_file(get_abs_path("data/number/zero.tsv"))

Expand Down
21 changes: 17 additions & 4 deletions nemo_text_processing/text_normalization/ko/taggers/electronic.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,11 +121,24 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
dollar_accep = pynini.accep("$")
excluded_symbols = DOT | dollar_accep | AT
filtered_symbols = pynini.difference(accepted_symbols, excluded_symbols)
accepted_characters = ASCII_ALNUM | filtered_symbols
# Domain core graph
graph_domain = (pynutil.insert('domain: "') + domain_core + pynutil.insert('"')).optimize()
graph |= graph_domain

known_extensions = pynini.project(
pynini.string_file(get_abs_path("data/electronic/extensions.tsv")),
"input",
)

filename_stem = pynini.closure(
pynini.difference(NEMO_NOT_SPACE, pynini.union(SLASH, DOT)),
1,
)

file_with_extension = filename_stem + known_extensions

graph |= (pynutil.insert('domain: "') + file_with_extension + pynutil.insert('"')).optimize()

# (3) URL with protocol
graph |= protocol + insert_space + domain_graph_with_class_tags

Expand All @@ -144,9 +157,9 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):

four = pynini.closure(NEMO_DIGIT, 4, 4)
sep_token = pynini.union(HYPHEN, NEMO_SPACE)
sep_del = pynutil.delete(pynini.closure(sep_token, 1)) # allow mix of - or space

cc16_grouped = four + sep_del + four + sep_del + four + sep_del + four
sep_to_space = pynutil.delete(pynini.closure(sep_token, 0, 1)) + insert_space
cc16_grouped = four + sep_to_space + four + sep_to_space + four + sep_to_space + four
cc16_grouped = cc16_grouped + delete_space

cc16_no_cue = (
pynutil.insert('protocol: "신용카드 " ')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst
from nemo_text_processing.text_normalization.ko.graph_utils import NEMO_SPACE, GraphFst, delete_space
from nemo_text_processing.text_normalization.ko.utils import get_abs_path


Expand Down Expand Up @@ -53,7 +53,7 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):
+ pynutil.insert(DOUBLE_QUOTE)
)

integer_component_with_space = integer_component + pynutil.insert(NEMO_SPACE)
integer_component_with_space = integer_component + delete_space + pynutil.insert(NEMO_SPACE)

# Denominator and numerator
denominator_component = pynutil.insert(f'denominator: {DOUBLE_QUOTE}') + numeral + pynutil.insert(DOUBLE_QUOTE)
Expand Down
17 changes: 16 additions & 1 deletion nemo_text_processing/text_normalization/ko/taggers/ordinal.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,5 +66,20 @@ def __init__(self, cardinal: GraphFst, deterministic: bool = True):

graph_ordinal = (graph_ordinal_1to39 | graph_ordinal_from40).optimize() # Handles 1-39 # Handles 40+

final_graph = pynutil.insert('integer: "') + graph_ordinal + pynutil.insert('"')
# Single-character particles (가, 이, 은, 는, 로, 도 ...)
josa_single = pynini.union("가", "이", "은", "는", "를", "을", "로", "도", "다")

# Multi-character particles (부터, 까지)
josa_multi = pynini.union("부터", "까지")

# Allow patterns like:
# 번째 + (optional single-josa) + (optional multi-josa)
josa = (josa_single.ques + josa_multi.ques).optimize()

# Final ordinal graph with optional particles
graph_ordinal_with_josa = (graph_ordinal + josa).optimize()

# Build the “integer: …” token structure
final_graph = pynutil.insert('integer: "') + graph_ordinal_with_josa + pynutil.insert('"')

self.fst = self.add_tokens(final_graph).optimize()
44 changes: 25 additions & 19 deletions nemo_text_processing/text_normalization/ko/taggers/telephone.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ class TelephoneFst(GraphFst):
Finite state transducer for classifying Korean telephone numbers.

Example inputs → tokens:
+82-10-3713-7050 -> telephone { country_code: "플러스 팔 이," number_part: "영일영, 삼칠일삼, 칠영오영" }
+1 (415) 555-0123 -> telephone { country_code: "플러스 일," number_part: "사일오, 오오오, 영일이삼" }
(031)371-3700 -> telephone { number_part: "영삼일, 삼칠일, 삼칠영영" }
010-3713-7050 -> telephone { number_part: "영일영, 삼칠일삼, 칠영오영" }
010.777.8888 -> telephone { number_part: "영일영, 칠칠칠, 팔팔팔팔" }
+82 010-3713-7050 -> telephone { country_code: "국가번호 팔이," number_part: "영일영 삼칠일삼 칠영오영" }
+1 (415) 555-0123 -> telephone { country_code: "국가번호 일," number_part: "사일오 오오오 영일이삼" }
(031)371-3700 -> telephone { number_part: "영삼일 삼칠일 삼칠영영" }
010-3713-7050 -> telephone { number_part: "영일영 삼칠일삼 칠영오영" }
010.777.8888 -> telephone { number_part: "영일영 칠칠칠 팔팔팔팔" }

Args:
deterministic (bool, optional): If True, provide a single transduction;
Expand All @@ -37,8 +37,10 @@ class TelephoneFst(GraphFst):

def __init__(self, deterministic: bool = True):
super().__init__(name="telephone", kind="classify", deterministic=deterministic)

add_sep = pynutil.insert(", ") # standard block separator ", "
# Separator between digit blocks (e.g., "-" or ".")
add_sep = pynutil.delete("-") | pynutil.delete(".")
# Space inserted between digit blocks
sep_space = insert_space

# 1) safe digit mapping: force 0 -> "영" (do not rely on zero.tsv invert)
digit = pynini.string_file(get_abs_path("data/number/digit.tsv")).optimize()
Expand All @@ -49,35 +51,39 @@ def __init__(self, deterministic: bool = True):
four_digits = digit_ko**4

# country code: "+1", "+82", "+1-"
country_core = (
pynini.cross("+", "플러스 ")
+ pynini.closure(digit_ko + insert_space, 0, 2)
+ digit_ko
+ pynutil.insert(",")
cc_digits = pynini.closure(digit_ko, 1, 3)

country_code = (
pynutil.delete("+")
+ pynutil.insert('country_code: "')
+ cc_digits
+ pynutil.insert('"')
+ pynini.closure(pynutil.delete("-") | pynutil.delete(" "), 0, 1)
+ delete_space
)
country_code = pynutil.insert('country_code: "') + country_core + pynutil.insert('"')
country_code = country_code + pynini.closure(pynutil.delete("-"), 0, 1) + delete_space + insert_space

# area part: "123-" | "123." | "(123)" [space?] or "(123)-"
area_core = three_digits
area_part = (
(area_core + (pynutil.delete("-") | pynutil.delete(".")))
(area_core + add_sep)
| (
pynutil.delete("(")
+ area_core
+ ((pynutil.delete(")") + pynini.closure(pynutil.delete(" "), 0, 1)) | pynutil.delete(")-"))
+ pynutil.delete(")")
+ pynini.closure(pynutil.delete(" "), 0, 1)
+ pynini.closure(add_sep, 0, 1)
)
) + add_sep
) + sep_space

# 2) allow 3 **or 4** digits in the middle block (to support 010-3713-7050)
mid = pynini.union(three_digits, four_digits)
last4 = four_digits

# consume '-' or '.' between middle and last blocks
number_part_core = area_part + mid + (pynutil.delete("-") | pynutil.delete(".")) + add_sep + last4
number_part_core = area_part + mid + add_sep + sep_space + last4
number_part = pynutil.insert('number_part: "') + number_part_core + pynutil.insert('"')

# final graph: with or without country code
graph = pynini.union(country_code + number_part, number_part).optimize()
graph = pynini.union(country_code + insert_space + number_part, number_part).optimize()

self.fst = self.add_tokens(graph).optimize()
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,13 @@
import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ko.graph_utils import GraphFst, generator_main
from nemo_text_processing.text_normalization.ko.graph_utils import (
NEMO_WHITE_SPACE,
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from nemo_text_processing.text_normalization.ko.taggers.cardinal import CardinalFst
from nemo_text_processing.text_normalization.ko.taggers.date import DateFst
from nemo_text_processing.text_normalization.ko.taggers.decimal import DecimalFst
Expand Down Expand Up @@ -98,9 +104,14 @@ def __init__(
)

token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
tagger = pynini.closure(token, 1)
space = pynini.closure(NEMO_WHITE_SPACE, 1)
space = pynini.compose(space, delete_extra_space)

self.fst = tagger.optimize()
space_opt = pynini.closure(space, 0, 1)

graph = delete_space + token + pynini.closure(space_opt + token) + delete_space

self.fst = graph.optimize()

if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
11 changes: 9 additions & 2 deletions nemo_text_processing/text_normalization/ko/taggers/word.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pynini
from pynini.lib import pynutil

from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_NOT_SPACE, GraphFst
from nemo_text_processing.text_normalization.ja.graph_utils import NEMO_DIGIT, NEMO_NOT_SPACE, GraphFst


class WordFst(GraphFst):
Expand All @@ -25,5 +26,11 @@ class WordFst(GraphFst):

def __init__(self, deterministic: bool = True):
super().__init__(name="word", kind="classify", deterministic=deterministic)
word = pynutil.insert("name: \"") + NEMO_NOT_SPACE + pynutil.insert("\"")

word_char = pynini.difference(NEMO_NOT_SPACE, NEMO_DIGIT)

word = pynutil.insert('name: "')
word += pynini.closure(word_char, 1)
word += pynutil.insert('"')

self.fst = word.optimize()
12 changes: 7 additions & 5 deletions nemo_text_processing/text_normalization/ko/verbalizers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,16 @@ def __init__(self, deterministic: bool = True):
+ pynutil.delete("\"")
)

SPACE = pynini.closure(delete_space, 0, 1) + insert_space

# Optional era/year/month/day/week components in order; SPACE separates adjacent components.
graph_basic_date = (
pynini.closure(era_component + delete_space, 0, 1)
+ pynini.closure(year_component + delete_space, 0, 1)
+ pynini.closure(month_component + delete_space, 0, 1)
pynini.closure(era_component + SPACE, 0, 1)
+ pynini.closure(year_component + SPACE, 0, 1)
+ pynini.closure(month_component + SPACE, 0, 1)
+ pynini.closure(day_component, 0, 1)
+ pynini.closure((delete_space + week_component) | (week_component), 0, 1)
) | month_component + delete_space + week_component
+ pynini.closure(SPACE + week_component, 0, 1)
) | (month_component + SPACE + week_component)

final_graph = graph_basic_date

Expand Down
Loading