From cf3d94912e0531eafbcb72ae6cd05ed3543cef50 Mon Sep 17 00:00:00 2001 From: CrazyRoka Date: Wed, 17 Dec 2025 00:14:27 +0000 Subject: [PATCH 1/4] ptx: implement -S/--sentence-regexp --- src/uu/ptx/src/ptx.rs | 50 +++++++++++++++++++++++++++++++-------- tests/by-util/test_ptx.rs | 38 +++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index d3b9d103ce1..953f2d1b4ad 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -19,7 +19,7 @@ use clap::{Arg, ArgAction, Command}; use regex::Regex; use thiserror::Error; use uucore::display::Quotable; -use uucore::error::{FromIo, UError, UResult, UUsageError}; +use uucore::error::{FromIo, UError, UResult, USimpleError, UUsageError}; use uucore::format_usage; use uucore::translate; @@ -43,6 +43,7 @@ struct Config { context_regex: String, line_width: usize, gap_size: usize, + sentence_regex: Option, } impl Default for Config { @@ -59,6 +60,7 @@ impl Default for Config { context_regex: "\\w+".to_owned(), line_width: 72, gap_size: 3, + sentence_regex: None, } } } @@ -197,9 +199,6 @@ struct WordRef { #[derive(Debug, Error)] enum PtxError { - #[error("{}", translate!("ptx-error-not-implemented", "feature" => (*.0)))] - NotImplemented(&'static str), - #[error("{0}")] ParseError(ParseIntError), } @@ -214,8 +213,18 @@ fn get_config(matches: &clap::ArgMatches) -> UResult { config.format = OutFormat::Roff; "[^ \t\n]+".clone_into(&mut config.context_regex); } - if matches.contains_id(options::SENTENCE_REGEXP) { - return Err(PtxError::NotImplemented("-S").into()); + if let Some(regex) = matches.get_one::(options::SENTENCE_REGEXP) { + config.sentence_regex = Some(regex.clone()); + + // Verify regex is valid and doesn't match empty string + if let Ok(re) = Regex::new(regex) { + if re.is_match("") { + return Err(USimpleError::new( + 1, + "A regular expression cannot match a length zero string", + )); + } + } } config.auto_ref = matches.get_flag(options::AUTO_REFERENCE); config.input_ref = matches.get_flag(options::REFERENCES); @@ -271,17 +280,38 @@ struct FileContent { type FileMap = HashMap; -fn read_input(input_files: &[OsString]) -> std::io::Result { +fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result { let mut file_map: FileMap = HashMap::new(); let mut offset: usize = 0; + + let sentence_splitter = + if let Some(re_str) = &config.sentence_regex { + Some(Regex::new(re_str).map_err(|_| { + std::io::Error::new(std::io::ErrorKind::InvalidInput, "Invalid regex") + })?) + } else { + None + }; + for filename in input_files { - let reader: BufReader> = BufReader::new(if filename == "-" { + let mut reader: BufReader> = BufReader::new(if filename == "-" { Box::new(stdin()) } else { let file = File::open(Path::new(filename))?; Box::new(file) }); - let lines: Vec = reader.lines().collect::>>()?; + + let lines = if let Some(re) = &sentence_splitter { + let mut buffer = String::new(); + reader.read_to_string(&mut buffer)?; + + re.split(&buffer) + .map(|s| s.replace("\n", " ")) // ptx behavior: newlines become spaces inside sentences + .filter(|s| !s.is_empty()) // remove empty sentences + .collect() + } else { + reader.lines().collect::>>()? + }; // Indexing UTF-8 string requires walking from the beginning, which can hurts performance badly when the line is long. // Since we will be jumping around the line a lot, we dump the content into a Vec, which can be indexed in constant time. @@ -877,7 +907,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { } let word_filter = WordFilter::new(&matches, &config)?; - let file_map = read_input(&input_files).map_err_context(String::new)?; + let file_map = read_input(&input_files, &config).map_err_context(String::new)?; let word_set = create_word_set(&config, &word_filter, &file_map); write_traditional_output(&mut config, &file_map, &word_set, &output_file) } diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs index 464dcf6aead..b8a87f2afd1 100644 --- a/tests/by-util/test_ptx.rs +++ b/tests/by-util/test_ptx.rs @@ -257,6 +257,44 @@ fn test_utf8() { .stdout_only("\\xx {}{it’s}{disabled}{}{}\n\\xx {}{}{it’s}{ disabled}{}\n"); } +#[test] +fn test_sentence_regexp_basic() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + .pipe_in("Hello. World.") + .succeeds() + .stdout_contains("Hello") + .stdout_contains("World"); +} + +#[test] +fn test_sentence_regexp_split_behavior() { + new_ucmd!() + .args(&["-G", "-w", "50", "-S", "[.!]"]) + .pipe_in("One sentence. Two sentence!") + .succeeds() + .stdout_contains("One sentence") + .stdout_contains("Two sentence"); +} + +#[test] +fn test_sentence_regexp_empty_match_failure() { + new_ucmd!() + .args(&["-G", "-S", "^"]) + .pipe_in("Input") + .fails() + .stderr_contains("A regular expression cannot match a length zero string"); +} + +#[test] +fn test_sentence_regexp_newlines_are_spaces() { + new_ucmd!() + .args(&["-G", "-S", "\\."]) + .pipe_in("Start of\nsentence.") + .succeeds() + .stdout_contains("Start of sentence"); +} + #[test] fn test_gnu_mode_dumb_format() { // Test GNU mode (dumb format) - the default mode without -G flag From 64929fb3298669440f2d04cd2f20d68627751d60 Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Wed, 24 Dec 2025 11:03:06 +0000 Subject: [PATCH 2/4] fix: resolve clippy single_char_pattern warning --- src/uu/ptx/src/ptx.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index 953f2d1b4ad..b8b47411fdc 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -306,7 +306,7 @@ fn read_input(input_files: &[OsString], config: &Config) -> std::io::Result Date: Wed, 24 Dec 2025 11:52:59 +0000 Subject: [PATCH 3/4] ptx: add TODO comment about regex compatibility --- src/uu/ptx/src/ptx.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/uu/ptx/src/ptx.rs b/src/uu/ptx/src/ptx.rs index b8b47411fdc..2a53158925c 100644 --- a/src/uu/ptx/src/ptx.rs +++ b/src/uu/ptx/src/ptx.rs @@ -216,6 +216,10 @@ fn get_config(matches: &clap::ArgMatches) -> UResult { if let Some(regex) = matches.get_one::(options::SENTENCE_REGEXP) { config.sentence_regex = Some(regex.clone()); + // TODO: The regex crate used here is not fully compatible with GNU's regex implementation. + // For example, it does not support backreferences. + // In the future, we might want to switch to the onig crate (like expr does) for better compatibility. + // Verify regex is valid and doesn't match empty string if let Ok(re) = Regex::new(regex) { if re.is_match("") { From 4e7785afef2b73c9604cfeaf4704706fecbea8a8 Mon Sep 17 00:00:00 2001 From: Rostyslav Toch Date: Wed, 24 Dec 2025 13:47:42 +0000 Subject: [PATCH 4/4] test: fix broken pipe panic in test_sentence_regexp_empty_match_failure --- tests/by-util/test_ptx.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/by-util/test_ptx.rs b/tests/by-util/test_ptx.rs index b8a87f2afd1..0b1da0d0488 100644 --- a/tests/by-util/test_ptx.rs +++ b/tests/by-util/test_ptx.rs @@ -281,7 +281,6 @@ fn test_sentence_regexp_split_behavior() { fn test_sentence_regexp_empty_match_failure() { new_ucmd!() .args(&["-G", "-S", "^"]) - .pipe_in("Input") .fails() .stderr_contains("A regular expression cannot match a length zero string"); }