From 419874f39d3517f49d3a98ca0ff06cd4a6bca989 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 21:50:25 -0500 Subject: [PATCH 1/7] grep: add -o option --- text/grep.rs | 228 +++++++++++++++++++++++++++++++---------- text/tests/grep/mod.rs | 39 ++++++- 2 files changed, 213 insertions(+), 54 deletions(-) diff --git a/text/grep.rs b/text/grep.rs index 5221bdb89..c12a30bbd 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -8,14 +8,14 @@ // use clap::Parser; -use gettextrs::{bind_textdomain_codeset, setlocale, textdomain, LocaleCategory}; -use libc::{regcomp, regex_t, regexec, regfree, REG_EXTENDED, REG_ICASE, REG_NOMATCH}; +use gettextrs::{bind_textdomain_codeset, textdomain}; +use libc::{regcomp, regex_t, regexec, regfree, regmatch_t, REG_EXTENDED, REG_ICASE, REG_NOMATCH}; +use plib::PROJECT_NAME; use std::{ ffi::CString, fs::File, - io::{self, BufRead, BufReader}, + io::{self, BufRead, BufReader, StdoutLock, Write}, path::{Path, PathBuf}, - ptr, }; /// grep - search a file for a pattern. @@ -46,7 +46,7 @@ struct Args { #[arg(short, long)] ignore_case: bool, - /// Write only the names of input_files containing selected lines to standard output. + /// Write only the names of input files containing selected lines to standard output. #[arg(short = 'l', long)] files_with_matches: bool, @@ -54,7 +54,11 @@ struct Args { #[arg(short = 'n', long)] line_number: bool, - /// Write only the names of input_files containing selected lines to standard output. + /// Only print the matching characters in each line. + #[arg(short = 'o', long = "only-matching")] + only_matching: bool, + + /// Do not print to standard output. The presence or absence of a match is communicated with the exit status. #[arg(short, long)] quiet: bool, @@ -105,7 +109,7 @@ impl Args { return Err("Options '-l' and '-q' cannot be used together".to_string()); } if self.regexp.is_empty() && self.file.is_empty() && self.single_pattern.is_none() { - return Err("Required at least one pattern list or file".to_string()); + return Err("A pattern list or at least one file is required".to_string()); } Ok(()) } @@ -197,6 +201,8 @@ impl Args { output_mode, patterns, input_files: self.input_files, + stdout_lock: io::stdout().lock(), + only_matching: self.only_matching, }) } } @@ -297,8 +303,13 @@ impl Patterns { /// # Returns /// /// Returns [bool](bool) - `true` if input matches present patterns, else `false`. - fn matches(&self, input: impl AsRef) -> bool { + fn matches( + &self, + input: impl AsRef, + collect_matching_substrings: bool, + ) -> (bool, Vec>) { let input = input.as_ref(); + match self { Patterns::Fixed(patterns, ignore_case, line_regexp) => { let input = if *ignore_case { @@ -306,19 +317,104 @@ impl Patterns { } else { input.to_string() }; - patterns.iter().any(|p| { + + let mut matching_substrings = Vec::>::new(); + + let mut any_pattern_matched = false; + + for pattern in patterns { if *line_regexp { - input == *p + if input != *pattern { + continue; + } + + if !collect_matching_substrings { + return (true, Vec::>::new()); + } + + any_pattern_matched = true; + + matching_substrings.push(pattern.as_bytes().to_vec()); } else { - input.contains(p) + for st in input.matches(pattern) { + if !collect_matching_substrings { + return (true, Vec::>::new()); + } + + any_pattern_matched = true; + + matching_substrings.push(st.as_bytes().to_vec()); + } } - }) + } + + (any_pattern_matched, matching_substrings) } Patterns::Regex(patterns) => { - let c_input = CString::new(input).unwrap(); - patterns.iter().any(|p| unsafe { - regexec(p, c_input.as_ptr(), 0, ptr::null_mut(), 0) != REG_NOMATCH - }) + let nmatch_to_use = if collect_matching_substrings { 1 } else { 0 }; + + let input_slice = input.as_bytes(); + + let mut matching_substrings = Vec::>::new(); + + let mut any_pattern_matched = false; + + 'next_pattern: for p in patterns { + let mut current_string_index = 0_usize; + + loop { + let current_string_slice = &input_slice[current_string_index..]; + + let current_string_c_string = CString::new(current_string_slice).unwrap(); + + let mut regmatch_t_vec = vec![ + regmatch_t { + rm_so: -1, + rm_eo: -1, + }; + nmatch_to_use + ]; + + let regmatch_vec_pointer = regmatch_t_vec.as_mut_ptr(); + + let regexec_return_value = unsafe { + regexec( + p, + current_string_c_string.as_ptr(), + nmatch_to_use, + regmatch_vec_pointer, + 0, + ) + }; + + if regexec_return_value != 0 { + debug_assert!(regexec_return_value == REG_NOMATCH); + + continue 'next_pattern; + } + + if !collect_matching_substrings { + return (true, Vec::>::new()); + } + + any_pattern_matched = true; + + let regmatch_t = regmatch_t_vec.first().unwrap(); + + let start = usize::try_from(regmatch_t.rm_so).unwrap(); + let end = usize::try_from(regmatch_t.rm_eo).unwrap(); + + debug_assert!(end > 0_usize); + + matching_substrings.push(current_string_slice[start..end].to_vec()); + + debug_assert!(end > current_string_index); + + current_string_index += end; + } + } + + (any_pattern_matched, matching_substrings) } } } @@ -357,6 +453,8 @@ struct GrepModel { output_mode: OutputMode, patterns: Patterns, input_files: Vec, + stdout_lock: StdoutLock<'static>, + only_matching: bool, } impl GrepModel { @@ -398,6 +496,16 @@ impl GrepModel { } } + fn print_line_prefix(&mut self, input_name: &str, line_number: u64) { + if self.multiple_inputs { + write!(self.stdout_lock, "{input_name}:").unwrap(); + } + + if self.line_number { + write!(self.stdout_lock, "{line_number}:").unwrap(); + } + } + /// Reads lines from buffer and processes them. /// /// # Arguments @@ -405,78 +513,93 @@ impl GrepModel { /// * `input_name` - [str](str) that represents content source name. /// * `reader` - [Box](Box) that contains object that implements [BufRead] and reads lines. fn process_input(&mut self, input_name: &str, mut reader: Box) { - let mut line_number: u64 = 0; + let mut line_number = 0_u64; + + let mut line = String::new(); + loop { - let mut line = String::new(); line_number += 1; + + line.clear(); + + // TODO + // Probably should work on non-UTF-8 input match reader.read_line(&mut line) { Ok(n_read) => { if n_read == 0 { break; } - let trimmed = if line.ends_with('\n') { - &line[..line.len() - 1] - } else { - &line + + let mut chars = line.chars(); + + let line_without_newline = match chars.next_back() { + Some('\n') => chars.as_str(), + _ => line.as_str(), }; - let init_matches = self.patterns.matches(trimmed); + let (line_matches_any_pattern, matching_substrings) = self.patterns.matches( + line_without_newline, + self.only_matching && matches!(self.output_mode, OutputMode::Default), + ); + let matches = if self.invert_match { - !init_matches + !line_matches_any_pattern } else { - init_matches + line_matches_any_pattern }; + if matches { self.any_matches = true; + match &mut self.output_mode { OutputMode::Count(count) => { *count += 1; } OutputMode::FilesWithMatches => { - println!("{input_name}"); + writeln!(&mut self.stdout_lock, "{input_name}").unwrap(); + break; } OutputMode::Quiet => { return; } OutputMode::Default => { - let result = format!( - "{}{}{}", - if self.multiple_inputs { - format!("{input_name}:") - } else { - String::new() - }, - if self.line_number { - format!("{line_number}:") - } else { - String::new() - }, - trimmed - ); - println!("{result}"); + if self.only_matching { + for matching_substring in matching_substrings { + self.print_line_prefix(input_name, line_number); + + self.stdout_lock + .write_all(matching_substring.as_slice()) + .unwrap(); + + self.stdout_lock.write_all(b"\n").unwrap(); + } + } else { + self.print_line_prefix(input_name, line_number); + + writeln!(self.stdout_lock, "{line_without_newline}").unwrap(); + } } } } - line.clear(); } Err(err) => { self.any_errors = true; + if !self.no_messages { - eprintln!( - "{}: Error reading line {} ({})", - input_name, line_number, err - ); + eprintln!("{input_name}: Error reading line {line_number} ({err})",); } } } } + if let OutputMode::Count(count) = &mut self.output_mode { if self.multiple_inputs { - println!("{input_name}:{count}"); + writeln!(&mut self.stdout_lock, "{input_name}:{count}").unwrap(); } else { - println!("{count}"); + writeln!(&mut self.stdout_lock, "{count}").unwrap(); } + *count = 0; } } @@ -487,10 +610,9 @@ impl GrepModel { // 1 - No lines were selected. // >1 - An error occurred. fn main() -> Result<(), Box> { - setlocale(LocaleCategory::LcAll, ""); - textdomain(env!("PROJECT_NAME"))?; - bind_textdomain_codeset(env!("PROJECT_NAME"), "UTF-8")?; - + textdomain(PROJECT_NAME)?; + bind_textdomain_codeset(PROJECT_NAME, "UTF-8")?; + // Parse command line arguments let mut args = Args::parse(); let exit_code = args @@ -501,7 +623,7 @@ fn main() -> Result<(), Box> { }) .map(|mut grep_model| grep_model.grep()) .unwrap_or_else(|err| { - eprintln!("{}", err); + eprintln!("{err}"); 2 }); diff --git a/text/tests/grep/mod.rs b/text/tests/grep/mod.rs index 49ef06f6d..9261c231b 100644 --- a/text/tests/grep/mod.rs +++ b/text/tests/grep/mod.rs @@ -81,7 +81,7 @@ fn test_absent_pattern() { &[], "", "", - "Required at least one pattern list or file\n", + "A pattern list or at least one file is required\n", 2, ); } @@ -1389,3 +1389,40 @@ fn test_long_names_files() { 0, ); } + +#[test] +fn test_dash_o() { + const INPUT: &str = "\ +Contains KEYWORD here and also here: KEYWORD +Output is not organized: one KEYWORD per output line +Regardless of which input line it was found in +"; + + grep_test( + &["-o", "KEYWORD"], + INPUT, + "\ +KEYWORD +KEYWORD +KEYWORD +", + "", + 0_i32, + ); + + grep_test(&["-o", "NOT PRESENT"], INPUT, "", "", 1_i32); + + grep_test(&["-o", "-v", "NOT PRESENT"], INPUT, "", "", 0_i32); + + grep_test( + &["-o", "KEYWORD", "--", "-", "-"], + INPUT, + "\ +(standard input):KEYWORD +(standard input):KEYWORD +(standard input):KEYWORD +", + "", + 0_i32, + ); +} From aca3bdd04a1eb98c72c60896a0edacf8a8ef1dc4 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 22:06:47 -0500 Subject: [PATCH 2/7] Fix edge case --- text/grep.rs | 15 ++++++++++----- text/tests/grep/mod.rs | 2 ++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/text/grep.rs b/text/grep.rs index c12a30bbd..8095537a5 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -359,7 +359,7 @@ impl Patterns { let mut any_pattern_matched = false; - 'next_pattern: for p in patterns { + for p in patterns { let mut current_string_index = 0_usize; loop { @@ -390,7 +390,7 @@ impl Patterns { if regexec_return_value != 0 { debug_assert!(regexec_return_value == REG_NOMATCH); - continue 'next_pattern; + break; } if !collect_matching_substrings { @@ -404,12 +404,17 @@ impl Patterns { let start = usize::try_from(regmatch_t.rm_so).unwrap(); let end = usize::try_from(regmatch_t.rm_eo).unwrap(); - debug_assert!(end > 0_usize); + // TODO + // Is this the right fix? + // The edge case is: + // + // grep -o '' + if end == 0_usize { + break; + } matching_substrings.push(current_string_slice[start..end].to_vec()); - debug_assert!(end > current_string_index); - current_string_index += end; } } diff --git a/text/tests/grep/mod.rs b/text/tests/grep/mod.rs index 9261c231b..1315dead3 100644 --- a/text/tests/grep/mod.rs +++ b/text/tests/grep/mod.rs @@ -1425,4 +1425,6 @@ KEYWORD "", 0_i32, ); + + grep_test(&["-o", ""], INPUT, "", "", 0_i32); } From 2f1d8534e3837fd4ac4a5dad1a48c02fe61cd72c Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 22:41:41 -0500 Subject: [PATCH 3/7] Improve code style --- text/grep.rs | 115 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 70 insertions(+), 45 deletions(-) diff --git a/text/grep.rs b/text/grep.rs index 8095537a5..6f713db74 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -16,6 +16,7 @@ use std::{ fs::File, io::{self, BufRead, BufReader, StdoutLock, Write}, path::{Path, PathBuf}, + ptr, }; /// grep - search a file for a pattern. @@ -303,13 +304,13 @@ impl Patterns { /// # Returns /// /// Returns [bool](bool) - `true` if input matches present patterns, else `false`. - fn matches( - &self, - input: impl AsRef, - collect_matching_substrings: bool, - ) -> (bool, Vec>) { + fn matches(&self, input: impl AsRef, collect_matching_substrings: bool) -> MatchesResult { let input = input.as_ref(); + let mut matching_substrings = Vec::>::new(); + + let mut any_pattern_matched = false; + match self { Patterns::Fixed(patterns, ignore_case, line_regexp) => { let input = if *ignore_case { @@ -318,10 +319,6 @@ impl Patterns { input.to_string() }; - let mut matching_substrings = Vec::>::new(); - - let mut any_pattern_matched = false; - for pattern in patterns { if *line_regexp { if input != *pattern { @@ -329,7 +326,7 @@ impl Patterns { } if !collect_matching_substrings { - return (true, Vec::>::new()); + return MatchesResult::fast_path_match(); } any_pattern_matched = true; @@ -338,7 +335,7 @@ impl Patterns { } else { for st in input.matches(pattern) { if !collect_matching_substrings { - return (true, Vec::>::new()); + return MatchesResult::fast_path_match(); } any_pattern_matched = true; @@ -347,45 +344,45 @@ impl Patterns { } } } - - (any_pattern_matched, matching_substrings) } Patterns::Regex(patterns) => { - let nmatch_to_use = if collect_matching_substrings { 1 } else { 0 }; + const SINGLE_ELEMENT_ARRAY_SIZE: usize = 1_usize; let input_slice = input.as_bytes(); - let mut matching_substrings = Vec::>::new(); + let mut regmatch_t_array = [const { + regmatch_t { + rm_so: -1, + rm_eo: -1, + } + }; SINGLE_ELEMENT_ARRAY_SIZE]; - let mut any_pattern_matched = false; + let (nmatch, pmatch) = if collect_matching_substrings { + (SINGLE_ELEMENT_ARRAY_SIZE, regmatch_t_array.as_mut_ptr()) + } else { + (0_usize, ptr::null_mut()) + }; - for p in patterns { + for pattern in patterns { let mut current_string_index = 0_usize; loop { + // Clear values from the last iteration + if collect_matching_substrings { + let [ref mut regmatch_t] = regmatch_t_array; + + regmatch_t.rm_so = -1; + regmatch_t.rm_eo = -1; + } + let current_string_slice = &input_slice[current_string_index..]; let current_string_c_string = CString::new(current_string_slice).unwrap(); - let mut regmatch_t_vec = vec![ - regmatch_t { - rm_so: -1, - rm_eo: -1, - }; - nmatch_to_use - ]; - - let regmatch_vec_pointer = regmatch_t_vec.as_mut_ptr(); - - let regexec_return_value = unsafe { - regexec( - p, - current_string_c_string.as_ptr(), - nmatch_to_use, - regmatch_vec_pointer, - 0, - ) - }; + let current_string_pointer = current_string_c_string.as_ptr(); + + let regexec_return_value = + unsafe { regexec(pattern, current_string_pointer, nmatch, pmatch, 0) }; if regexec_return_value != 0 { debug_assert!(regexec_return_value == REG_NOMATCH); @@ -394,15 +391,20 @@ impl Patterns { } if !collect_matching_substrings { - return (true, Vec::>::new()); + return MatchesResult::fast_path_match(); } any_pattern_matched = true; - let regmatch_t = regmatch_t_vec.first().unwrap(); + let [regmatch_t] = regmatch_t_array; - let start = usize::try_from(regmatch_t.rm_so).unwrap(); - let end = usize::try_from(regmatch_t.rm_eo).unwrap(); + let regmatch_t { rm_so, rm_eo } = regmatch_t; + + debug_assert!(rm_so != -1); + debug_assert!(rm_eo != -1); + + let start = usize::try_from(rm_so).unwrap(); + let end = usize::try_from(rm_eo).unwrap(); // TODO // Is this the right fix? @@ -418,10 +420,13 @@ impl Patterns { current_string_index += end; } } - - (any_pattern_matched, matching_substrings) } } + + MatchesResult { + any_pattern_matched, + matching_substrings, + } } } @@ -462,6 +467,21 @@ struct GrepModel { only_matching: bool, } +struct MatchesResult { + any_pattern_matched: bool, + /// Will always be empty if the -o option is not being used + matching_substrings: Vec>, +} + +impl MatchesResult { + pub fn fast_path_match() -> MatchesResult { + MatchesResult { + any_pattern_matched: true, + matching_substrings: Vec::>::new(), + } + } +} + impl GrepModel { /// Processes input files or STDIN content. /// @@ -542,15 +562,20 @@ impl GrepModel { _ => line.as_str(), }; - let (line_matches_any_pattern, matching_substrings) = self.patterns.matches( + let matches_result = self.patterns.matches( line_without_newline, self.only_matching && matches!(self.output_mode, OutputMode::Default), ); + let MatchesResult { + any_pattern_matched, + matching_substrings, + } = matches_result; + let matches = if self.invert_match { - !line_matches_any_pattern + !any_pattern_matched } else { - line_matches_any_pattern + any_pattern_matched }; if matches { From 7c7acb9495638480ffabeb7656f32ed49a2dca50 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 21 Oct 2024 23:08:21 -0500 Subject: [PATCH 4/7] Address failing test --- text/tests/grep/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/text/tests/grep/mod.rs b/text/tests/grep/mod.rs index 1315dead3..2cdc3b10a 100644 --- a/text/tests/grep/mod.rs +++ b/text/tests/grep/mod.rs @@ -1425,6 +1425,11 @@ KEYWORD "", 0_i32, ); +} - grep_test(&["-o", ""], INPUT, "", "", 0_i32); +// Disabled due to implementation replacing empty patterns with ".*" on this platform +#[test] +#[cfg(not(target_os = "macos"))] +fn test_dash_o_empty_pattern() { + grep_test(&["-o", ""], "ABC", "", "", 0_i32); } From dd7ce2e1755344e37b06fdb80ebaf45c0de471a3 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Fri, 25 Oct 2024 18:04:32 -0500 Subject: [PATCH 5/7] Manually rebase --- text/grep.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/text/grep.rs b/text/grep.rs index 6f713db74..f6d252e24 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -8,9 +8,8 @@ // use clap::Parser; -use gettextrs::{bind_textdomain_codeset, textdomain}; +use gettextrs::{bind_textdomain_codeset, setlocale, textdomain, LocaleCategory}; use libc::{regcomp, regex_t, regexec, regfree, regmatch_t, REG_EXTENDED, REG_ICASE, REG_NOMATCH}; -use plib::PROJECT_NAME; use std::{ ffi::CString, fs::File, @@ -640,9 +639,10 @@ impl GrepModel { // 1 - No lines were selected. // >1 - An error occurred. fn main() -> Result<(), Box> { - textdomain(PROJECT_NAME)?; - bind_textdomain_codeset(PROJECT_NAME, "UTF-8")?; - // Parse command line arguments + setlocale(LocaleCategory::LcAll, ""); + textdomain(env!("PROJECT_NAME"))?; + bind_textdomain_codeset(env!("PROJECT_NAME"), "UTF-8")?; + let mut args = Args::parse(); let exit_code = args From a8c9a8c573d9aab55d30489a4e676f5586b4f17b Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 28 Oct 2024 08:36:33 -0500 Subject: [PATCH 6/7] grep: do not fail on non-UTF-8 input --- Cargo.lock | 1 + text/Cargo.toml | 1 + text/grep.rs | 385 ++++++++++++++++++++++++++--------------- text/tests/grep/mod.rs | 149 +++++++++------- 4 files changed, 333 insertions(+), 203 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ab6396b24..bf9c233fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1497,6 +1497,7 @@ dependencies = [ "dirs 5.0.1", "gettext-rs", "libc", + "memchr", "notify-debouncer-full", "plib", "proptest", diff --git a/text/Cargo.toml b/text/Cargo.toml index 10554d3ad..68610d4ef 100644 --- a/text/Cargo.toml +++ b/text/Cargo.toml @@ -19,6 +19,7 @@ notify-debouncer-full = "0.3" diff = "0.1" dirs = "5.0" walkdir = "2" +memchr = "2" [dev-dependencies] proptest = "1" diff --git a/text/grep.rs b/text/grep.rs index f6d252e24..3e372c2db 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -10,10 +10,13 @@ use clap::Parser; use gettextrs::{bind_textdomain_codeset, setlocale, textdomain, LocaleCategory}; use libc::{regcomp, regex_t, regexec, regfree, regmatch_t, REG_EXTENDED, REG_ICASE, REG_NOMATCH}; +use memchr::memmem; use std::{ + error::Error, ffi::CString, fs::File, io::{self, BufRead, BufReader, StdoutLock, Write}, + mem::MaybeUninit, path::{Path, PathBuf}, ptr, }; @@ -62,7 +65,7 @@ struct Args { #[arg(short, long)] quiet: bool, - /// Suppress the error messages ordinarily written for nonexistent or unreadable input_files. + /// Suppress the error messages ordinarily written for nonexistent or unreadable input files. #[arg(short = 's', long)] no_messages: bool, @@ -84,9 +87,6 @@ struct Args { /// standard input shall be used. #[arg(name = "FILE")] input_files: Vec, - - #[arg(skip)] - any_errors: bool, } impl Args { @@ -95,34 +95,42 @@ impl Args { /// # Errors /// /// Returns an error if conflicting options are found. - fn validate_args(&self) -> Result<(), String> { + fn validate_args(&self) -> Result<(), Box> { if self.extended_regexp && self.fixed_strings { - return Err("Options '-E' and '-F' cannot be used together".to_string()); + return Err(Box::from("options '-E' and '-F' cannot be used together")); } + if self.count && self.files_with_matches { - return Err("Options '-c' and '-l' cannot be used together".to_string()); + return Err(Box::from("options '-c' and '-l' cannot be used together")); } + if self.count && self.quiet { - return Err("Options '-c' and '-q' cannot be used together".to_string()); + return Err(Box::from("options '-c' and '-q' cannot be used together")); } + if self.files_with_matches && self.quiet { - return Err("Options '-l' and '-q' cannot be used together".to_string()); + return Err(Box::from("options '-l' and '-q' cannot be used together")); } + if self.regexp.is_empty() && self.file.is_empty() && self.single_pattern.is_none() { - return Err("A pattern list or at least one file is required".to_string()); + return Err(Box::from("a pattern list or at least one file is required")); } + Ok(()) } /// Resolves input patterns and input files. Reads patterns from pattern files and merges them with specified as argument. Handles input files if empty. - fn resolve(&mut self) { + fn resolve(&mut self, grep_model_state: &mut GrepModelState) { for path_buf in &self.file { match Self::get_file_patterns(path_buf) { Ok(patterns) => self.regexp.extend(patterns), Err(err) => { - self.any_errors = true; + grep_model_state.any_errors = true; + if !self.no_messages { - eprintln!("{}: {}", path_buf.display(), err); + let path_buf_display = path_buf.display(); + + eprintln!("grep: {path_buf_display}: {err}"); } } } @@ -132,9 +140,9 @@ impl Args { None => {} Some(pattern) => { if !self.regexp.is_empty() { - self.input_files.insert(0, pattern.clone()); + self.input_files.insert(0, pattern.to_owned()); } else { - self.regexp = vec![pattern.clone()]; + self.regexp = vec![pattern.to_owned()]; } } } @@ -142,7 +150,7 @@ impl Args { self.regexp = self .regexp .iter() - .flat_map(|pattern| pattern.split('\n').map(String::from)) + .flat_map(|pattern| pattern.split('\n').map(ToOwned::to_owned)) .collect(); self.regexp.sort_by_key(|r| r.len()); self.regexp.dedup(); @@ -172,17 +180,7 @@ impl Args { /// # Returns /// /// Returns [GrepModel](GrepModel) object. - fn into_grep_model(self) -> Result { - let output_mode = if self.count { - OutputMode::Count(0) - } else if self.files_with_matches { - OutputMode::FilesWithMatches - } else if self.quiet { - OutputMode::Quiet - } else { - OutputMode::Default - }; - + fn into_grep_model(self) -> Result> { let patterns = Patterns::new( self.regexp, self.extended_regexp, @@ -191,18 +189,17 @@ impl Args { self.line_regexp, )?; + let multiple_inputs = self.input_files.len() > 1_usize; + Ok(GrepModel { - any_matches: false, - any_errors: self.any_errors, + input_files: self.input_files, + invert_match: self.invert_match, line_number: self.line_number, + multiple_inputs, no_messages: self.no_messages, - invert_match: self.invert_match, - multiple_inputs: self.input_files.len() > 1, - output_mode, - patterns, - input_files: self.input_files, - stdout_lock: io::stdout().lock(), only_matching: self.only_matching, + patterns, + quiet: self.quiet, }) } } @@ -237,7 +234,7 @@ impl Patterns { fixed_string: bool, ignore_case: bool, line_regexp: bool, - ) -> Result { + ) -> Result> { if fixed_string { Ok(Self::Fixed( patterns @@ -248,15 +245,18 @@ impl Patterns { line_regexp, )) } else { - let mut ps = vec![]; - let mut cflags = 0; + if extended_regexp { cflags |= REG_EXTENDED; } + if ignore_case { cflags |= REG_ICASE; } + + let mut regex_vec = Vec::::with_capacity(patterns.len()); + for mut pattern in patterns { // macOS version of [regcomp](regcomp) from `libc` // provides additional check for empty regex. In this case, @@ -272,25 +272,39 @@ impl Patterns { pattern }; } + pattern = if line_regexp { format!("^{pattern}$") } else { pattern }; - let c_pattern = CString::new(pattern).map_err(|err| err.to_string())?; - let mut regex = unsafe { std::mem::zeroed::() }; + let pattern_c_string = CString::new(pattern)?; - let result = unsafe { regcomp(&mut regex, c_pattern.as_ptr(), cflags) }; - if result != 0 { - return Err(format!( - "Error compiling regex '{}'", - c_pattern.to_string_lossy() - )); + let pattern_c_string_pointer = pattern_c_string.as_ptr(); + + let mut regex_maybe_uninit = MaybeUninit::::uninit(); + + let regex_maybe_uninit_pointer = regex_maybe_uninit.as_mut_ptr(); + + let regcomp_result = unsafe { + regcomp(regex_maybe_uninit_pointer, pattern_c_string_pointer, cflags) + }; + + if regcomp_result == 0 { + // Safety: just checked regcomp return value + let regex = unsafe { regex_maybe_uninit.assume_init() }; + + regex_vec.push(regex); + } else { + return Err(Box::from(format!( + "error compiling regex '{}'", + pattern_c_string.to_string_lossy() + ))); } - ps.push(regex); } - Ok(Self::Regex(ps)) + + Ok(Self::Regex(regex_vec)) } } @@ -303,7 +317,11 @@ impl Patterns { /// # Returns /// /// Returns [bool](bool) - `true` if input matches present patterns, else `false`. - fn matches(&self, input: impl AsRef, collect_matching_substrings: bool) -> MatchesResult { + fn matches( + &self, + input: impl AsRef<[u8]>, + collect_matching_substrings: bool, + ) -> Result> { let input = input.as_ref(); let mut matching_substrings = Vec::>::new(); @@ -312,34 +330,42 @@ impl Patterns { match self { Patterns::Fixed(patterns, ignore_case, line_regexp) => { + let input_ascii_lowercase: Vec; + let input = if *ignore_case { - input.to_lowercase() + input_ascii_lowercase = input.to_ascii_lowercase(); + + input_ascii_lowercase.as_slice() } else { - input.to_string() + input }; for pattern in patterns { if *line_regexp { - if input != *pattern { + if input != pattern.as_bytes() { continue; } if !collect_matching_substrings { - return MatchesResult::fast_path_match(); + return Ok(MatchesData::fast_path_match()); } any_pattern_matched = true; matching_substrings.push(pattern.as_bytes().to_vec()); } else { - for st in input.matches(pattern) { + let pattern_len = pattern.len(); + + for index in memmem::find_iter(input, pattern) { if !collect_matching_substrings { - return MatchesResult::fast_path_match(); + return Ok(MatchesData::fast_path_match()); } any_pattern_matched = true; - matching_substrings.push(st.as_bytes().to_vec()); + let match_slice = &input[index..(index + pattern_len)]; + + matching_substrings.push(match_slice.to_vec()); } } } @@ -347,8 +373,6 @@ impl Patterns { Patterns::Regex(patterns) => { const SINGLE_ELEMENT_ARRAY_SIZE: usize = 1_usize; - let input_slice = input.as_bytes(); - let mut regmatch_t_array = [const { regmatch_t { rm_so: -1, @@ -363,7 +387,7 @@ impl Patterns { }; for pattern in patterns { - let mut current_string_index = 0_usize; + let mut current_position = 0_usize; loop { // Clear values from the last iteration @@ -374,9 +398,11 @@ impl Patterns { regmatch_t.rm_eo = -1; } - let current_string_slice = &input_slice[current_string_index..]; + let forward_slice = &input[current_position..]; - let current_string_c_string = CString::new(current_string_slice).unwrap(); + // TODO + // How should slices containing null bytes be handled? + let current_string_c_string = CString::new(forward_slice)?; let current_string_pointer = current_string_c_string.as_ptr(); @@ -390,7 +416,7 @@ impl Patterns { } if !collect_matching_substrings { - return MatchesResult::fast_path_match(); + return Ok(MatchesData::fast_path_match()); } any_pattern_matched = true; @@ -402,8 +428,8 @@ impl Patterns { debug_assert!(rm_so != -1); debug_assert!(rm_eo != -1); - let start = usize::try_from(rm_so).unwrap(); - let end = usize::try_from(rm_eo).unwrap(); + let start = usize::try_from(rm_so)?; + let end = usize::try_from(rm_eo)?; // TODO // Is this the right fix? @@ -414,18 +440,18 @@ impl Patterns { break; } - matching_substrings.push(current_string_slice[start..end].to_vec()); + matching_substrings.push(forward_slice[start..end].to_vec()); - current_string_index += end; + current_position += end; } } } } - MatchesResult { + Ok(MatchesData { any_pattern_matched, matching_substrings, - } + }) } } @@ -451,30 +477,34 @@ enum OutputMode { Default, } +struct GrepModelState { + any_errors: bool, + any_matches: bool, + output_mode: OutputMode, + stdout_lock: StdoutLock<'static>, +} + /// Structure that contains all necessary information for `grep` utility processing. struct GrepModel { - any_matches: bool, - any_errors: bool, - line_number: bool, - no_messages: bool, + input_files: Vec, invert_match: bool, + line_number: bool, multiple_inputs: bool, - output_mode: OutputMode, - patterns: Patterns, - input_files: Vec, - stdout_lock: StdoutLock<'static>, + no_messages: bool, only_matching: bool, + patterns: Patterns, + quiet: bool, } -struct MatchesResult { +struct MatchesData { any_pattern_matched: bool, /// Will always be empty if the -o option is not being used matching_substrings: Vec>, } -impl MatchesResult { - pub fn fast_path_match() -> MatchesResult { - MatchesResult { +impl MatchesData { + pub fn fast_path_match() -> MatchesData { + MatchesData { any_pattern_matched: true, matching_substrings: Vec::>::new(), } @@ -487,47 +517,60 @@ impl GrepModel { /// # Returns /// /// Returns [i32](i32) that represents *exit status code*. - fn grep(&mut self) -> i32 { - for input_name in self.input_files.drain(..).collect::>() { - if input_name == "-" { + fn grep(&self, grep_model_state: &mut GrepModelState) -> Result<(), Box> { + for input_name in &self.input_files { + let input_name_str = input_name.as_str(); + + if input_name_str == "-" { let reader = Box::new(BufReader::new(io::stdin())); - self.process_input("(standard input)", reader); + + if let Err(bo) = self.process_input(grep_model_state, "(standard input)", reader) { + grep_model_state.any_errors = true; + + if !self.no_messages { + eprintln!("grep: {input_name_str}: {bo}"); + } + } } else { - match File::open(&input_name) { + match File::open(input_name_str) { Ok(file) => { let reader = Box::new(BufReader::new(file)); - self.process_input(&input_name, reader) + + self.process_input(grep_model_state, input_name_str, reader)?; } - Err(err) => { - self.any_errors = true; + Err(er) => { + grep_model_state.any_errors = true; + if !self.no_messages { - eprintln!("{}: {}", input_name, err); + eprintln!("grep: {input_name_str}: {er}"); } } } } - if self.any_matches && self.output_mode == OutputMode::Quiet { - return 0; + + if grep_model_state.any_matches && grep_model_state.output_mode == OutputMode::Quiet { + return Ok(()); } } - if self.any_errors { - 2 - } else if !self.any_matches { - 1 - } else { - 0 - } + Ok(()) } - fn print_line_prefix(&mut self, input_name: &str, line_number: u64) { + fn print_line_prefix( + &self, + grep_model_state: &mut GrepModelState, + input_name: &str, + line_number: u64, + ) -> Result<(), Box> { if self.multiple_inputs { - write!(self.stdout_lock, "{input_name}:").unwrap(); + write!(grep_model_state.stdout_lock, "{input_name}:")?; } if self.line_number { - write!(self.stdout_lock, "{line_number}:").unwrap(); + write!(grep_model_state.stdout_lock, "{line_number}:")?; } + + Ok(()) } /// Reads lines from buffer and processes them. @@ -536,40 +579,55 @@ impl GrepModel { /// /// * `input_name` - [str](str) that represents content source name. /// * `reader` - [Box](Box) that contains object that implements [BufRead] and reads lines. - fn process_input(&mut self, input_name: &str, mut reader: Box) { + fn process_input( + &self, + grep_model_state: &mut GrepModelState, + input_name: &str, + mut reader: Box, + ) -> Result<(), Box> { let mut line_number = 0_u64; - let mut line = String::new(); + let mut line = Vec::::new(); loop { - line_number += 1; - line.clear(); - // TODO - // Probably should work on non-UTF-8 input - match reader.read_line(&mut line) { + match reader.read_until(b'\n', &mut line) { Ok(n_read) => { if n_read == 0 { break; } - let mut chars = line.chars(); + line_number += 1; - let line_without_newline = match chars.next_back() { - Some('\n') => chars.as_str(), - _ => line.as_str(), + let mut iter = line.iter(); + + let line_without_newline = match iter.next_back() { + Some(b'\n') => iter.as_slice(), + _ => line.as_slice(), }; let matches_result = self.patterns.matches( line_without_newline, - self.only_matching && matches!(self.output_mode, OutputMode::Default), + self.only_matching + && matches!(grep_model_state.output_mode, OutputMode::Default), ); - let MatchesResult { + let matches_data = match matches_result { + Ok(ma) => ma, + Err(bo) => { + if !self.no_messages { + eprintln!("grep: {input_name}: error matching patterns against line {line_number} ({bo})"); + } + + return Err(bo); + } + }; + + let MatchesData { any_pattern_matched, matching_substrings, - } = matches_result; + } = matches_data; let matches = if self.invert_match { !any_pattern_matched @@ -578,59 +636,73 @@ impl GrepModel { }; if matches { - self.any_matches = true; + grep_model_state.any_matches = true; - match &mut self.output_mode { + match &mut grep_model_state.output_mode { OutputMode::Count(count) => { *count += 1; } OutputMode::FilesWithMatches => { - writeln!(&mut self.stdout_lock, "{input_name}").unwrap(); + writeln!(&mut grep_model_state.stdout_lock, "{input_name}")?; break; } OutputMode::Quiet => { - return; + return Ok(()); } OutputMode::Default => { if self.only_matching { for matching_substring in matching_substrings { - self.print_line_prefix(input_name, line_number); + self.print_line_prefix( + grep_model_state, + input_name, + line_number, + )?; - self.stdout_lock - .write_all(matching_substring.as_slice()) - .unwrap(); + grep_model_state + .stdout_lock + .write_all(matching_substring.as_slice())?; - self.stdout_lock.write_all(b"\n").unwrap(); + grep_model_state.stdout_lock.write_all(b"\n")?; } } else { - self.print_line_prefix(input_name, line_number); + self.print_line_prefix( + grep_model_state, + input_name, + line_number, + )?; - writeln!(self.stdout_lock, "{line_without_newline}").unwrap(); + grep_model_state + .stdout_lock + .write_all(line_without_newline)?; + + grep_model_state.stdout_lock.write_all(b"\n")?; } } } } } - Err(err) => { - self.any_errors = true; - + Err(er) => { if !self.no_messages { - eprintln!("{input_name}: Error reading line {line_number} ({err})",); + eprintln!("grep: {input_name}: error reading line {line_number} ({er})"); } + + return Err(Box::new(er)); } } } - if let OutputMode::Count(count) = &mut self.output_mode { + if let OutputMode::Count(count) = &mut grep_model_state.output_mode { if self.multiple_inputs { - writeln!(&mut self.stdout_lock, "{input_name}:{count}").unwrap(); + writeln!(grep_model_state.stdout_lock, "{input_name}:{count}")?; } else { - writeln!(&mut self.stdout_lock, "{count}").unwrap(); + writeln!(grep_model_state.stdout_lock, "{count}")?; } *count = 0; } + + Ok(()) } } @@ -645,17 +717,52 @@ fn main() -> Result<(), Box> { let mut args = Args::parse(); - let exit_code = args + let output_mode = if args.count { + OutputMode::Count(0) + } else if args.files_with_matches { + OutputMode::FilesWithMatches + } else if args.quiet { + OutputMode::Quiet + } else { + OutputMode::Default + }; + + let mut grep_model_state = GrepModelState { + any_errors: false, + any_matches: false, + output_mode, + stdout_lock: io::stdout().lock(), + }; + + let result = args .validate_args() .and_then(|_| { - args.resolve(); + args.resolve(&mut grep_model_state); args.into_grep_model() }) - .map(|mut grep_model| grep_model.grep()) - .unwrap_or_else(|err| { - eprintln!("{err}"); - 2 + .and_then(|gr| match gr.grep(&mut grep_model_state) { + Ok(()) => Ok(gr), + Err(bo) => Err(bo), }); + let exit_code = match result { + Ok(grep_model) => { + if grep_model.quiet && grep_model_state.any_matches { + 0 + } else if grep_model_state.any_errors { + 2 + } else if !grep_model_state.any_matches { + 1 + } else { + 0 + } + } + Err(error_message) => { + eprintln!("grep: {error_message}"); + + 2 + } + }; + std::process::exit(exit_code); } diff --git a/text/tests/grep/mod.rs b/text/tests/grep/mod.rs index 2cdc3b10a..a63c5c361 100644 --- a/text/tests/grep/mod.rs +++ b/text/tests/grep/mod.rs @@ -18,7 +18,7 @@ const BAD_INPUT: &str = "(some text)\n"; const INPUT_FILE_1: &str = "tests/grep/f_1"; const INPUT_FILE_2: &str = "tests/grep/f_2"; const INPUT_FILE_3: &str = "tests/grep/f_3"; -const BAD_INPUT_FILE: &str = "tests/grep/inexisting_file"; +const NONEXISTENT_FILE: &str = "tests/grep/nonexistent_file"; const INVALID_LINE_INPUT_FILE: &str = "tests/grep/invalid_line"; const BRE: &str = r#"line_{[0-9]\{1,\}}"#; @@ -33,7 +33,7 @@ const EMPTY_PATTERN_FILE: &str = "tests/grep/empty_pattern"; fn grep_test( args: &[&str], - test_data: &str, + pipe_to_stdin: &str, expected_output: &str, expected_err: &str, expected_exit_code: i32, @@ -43,7 +43,7 @@ fn grep_test( run_test(TestPlan { cmd: String::from("grep"), args: str_args, - stdin_data: String::from(test_data), + stdin_data: String::from(pipe_to_stdin), expected_out: String::from(expected_output), expected_err: String::from(expected_err), expected_exit_code, @@ -56,21 +56,21 @@ fn test_incompatible_options() { &["-cl"], "", "", - "Options \'-c\' and \'-l\' cannot be used together\n", + "grep: options \'-c\' and \'-l\' cannot be used together\n", 2, ); grep_test( &["-cq"], "", "", - "Options \'-c\' and \'-q\' cannot be used together\n", + "grep: options \'-c\' and \'-q\' cannot be used together\n", 2, ); grep_test( &["-lq"], "", "", - "Options \'-l\' and \'-q\' cannot be used together\n", + "grep: options \'-l\' and \'-q\' cannot be used together\n", 2, ); } @@ -81,18 +81,18 @@ fn test_absent_pattern() { &[], "", "", - "A pattern list or at least one file is required\n", + "grep: a pattern list or at least one file is required\n", 2, ); } #[test] -fn test_inexisting_file_pattern() { +fn test_nonexistent_file_pattern() { grep_test( - &["-f", BAD_INPUT_FILE], + &["-f", NONEXISTENT_FILE], "", "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 2, ); } @@ -103,7 +103,7 @@ fn test_regexp_compiling_error() { &[INVALID_BRE], "", "", - "Error compiling regex '\\{1,3\\}'\n", + "grep: error compiling regex '\\{1,3\\}'\n", 2, ); } @@ -129,9 +129,12 @@ fn test_basic_regexp_03() { grep_test( &[BRE, INVALID_LINE_INPUT_FILE], "", - "line_{1}\np_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +line_{1} +p_line_{2}_s +", + "", + 0, ); } @@ -167,16 +170,16 @@ fn test_basic_regexp_quiet_without_error_02() { #[test] fn test_basic_regexp_quiet_with_error_01() { - grep_test(&["-q", BRE, "-", BAD_INPUT_FILE], LINES_INPUT, "", "", 0); + grep_test(&["-q", BRE, "-", NONEXISTENT_FILE], LINES_INPUT, "", "", 0); } #[test] fn test_basic_regexp_quiet_with_error_02() { grep_test( - &["-q", BRE, BAD_INPUT_FILE, "-"], + &["-q", BRE, NONEXISTENT_FILE, "-"], LINES_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 0, ); } @@ -184,10 +187,10 @@ fn test_basic_regexp_quiet_with_error_02() { #[test] fn test_basic_regexp_quiet_with_error_03() { grep_test( - &["-q", BRE, "-", BAD_INPUT_FILE], + &["-q", BRE, "-", NONEXISTENT_FILE], BAD_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 2, ); } @@ -229,9 +232,12 @@ fn test_basic_regexp_line_number_03() { grep_test( &["-n", BRE, INVALID_LINE_INPUT_FILE], "", - "1:line_{1}\n3:p_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +1:line_{1} +3:p_line_{2}_s +", + "", + 0, ); } @@ -253,21 +259,21 @@ fn test_basic_regexp_no_messages_without_error_02() { #[test] fn test_basic_regexp_no_messages_with_error_01() { - grep_test(&["-s", BRE, "-", BAD_INPUT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); + grep_test(&["-s", BRE, "-", NONEXISTENT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); } #[test] fn test_basic_regexp_no_messages_with_error_02() { - grep_test(&["-s", BRE, "-", BAD_INPUT_FILE], BAD_INPUT, "", "", 2); + grep_test(&["-s", BRE, "-", NONEXISTENT_FILE], BAD_INPUT, "", "", 2); } #[test] fn test_basic_regexp_no_messages_with_error_03() { grep_test( - &["-s", INVALID_BRE, "-", BAD_INPUT_FILE], + &["-s", INVALID_BRE, "-", NONEXISTENT_FILE], LINES_INPUT, "", - "Error compiling regex '\\{1,3\\}'\n", + "grep: error compiling regex '\\{1,3\\}'\n", 2, ); } @@ -275,7 +281,7 @@ fn test_basic_regexp_no_messages_with_error_03() { #[test] fn test_basic_regexp_no_messages_with_error_04() { grep_test( - &["-q", "-s", BRE, BAD_INPUT_FILE, "-"], + &["-q", "-s", BRE, NONEXISTENT_FILE, "-"], LINES_INPUT, "", "", @@ -290,7 +296,7 @@ fn test_basic_regexp_no_messages_with_error_05() { "", "line_{1}\np_line_{2}_s\n", "", - 2, + 0, ); } @@ -352,9 +358,12 @@ fn test_extended_regexp_03() { grep_test( &["-E", ERE, INVALID_LINE_INPUT_FILE], "", - "line_{1}\np_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +line_{1} +p_line_{2}_s +", + "", + 0, ); } @@ -391,7 +400,7 @@ fn test_extended_regexp_quiet_without_error_02() { #[test] fn test_extended_regexp_quiet_with_error_01() { grep_test( - &["-E", "-q", ERE, "-", BAD_INPUT_FILE], + &["-E", "-q", ERE, "-", NONEXISTENT_FILE], LINES_INPUT, "", "", @@ -402,10 +411,10 @@ fn test_extended_regexp_quiet_with_error_01() { #[test] fn test_extended_regexp_quiet_with_error_02() { grep_test( - &["-E", "-q", ERE, BAD_INPUT_FILE, "-"], + &["-E", "-q", ERE, NONEXISTENT_FILE, "-"], LINES_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 0, ); } @@ -413,10 +422,10 @@ fn test_extended_regexp_quiet_with_error_02() { #[test] fn test_extended_regexp_quiet_with_error_03() { grep_test( - &["-E", "-q", ERE, "-", BAD_INPUT_FILE], + &["-E", "-q", ERE, "-", NONEXISTENT_FILE], BAD_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 2, ); } @@ -458,9 +467,12 @@ fn test_extended_regexp_line_number_03() { grep_test( &["-E", "-n", ERE, INVALID_LINE_INPUT_FILE], "", - "1:line_{1}\n3:p_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +1:line_{1} +3:p_line_{2}_s +", + "", + 0, ); } @@ -482,13 +494,13 @@ fn test_extended_regexp_no_messages_without_error_02() { #[test] fn test_extended_regexp_no_messages_with_error_01() { - grep_test(&["-E", "-s", ERE, "-", BAD_INPUT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); + grep_test(&["-E", "-s", ERE, "-", NONEXISTENT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); } #[test] fn test_extended_regexp_no_messages_with_error_02() { grep_test( - &["-E", "-s", ERE, "-", BAD_INPUT_FILE], + &["-E", "-s", ERE, "-", NONEXISTENT_FILE], BAD_INPUT, "", "", @@ -499,10 +511,10 @@ fn test_extended_regexp_no_messages_with_error_02() { #[test] fn test_extended_regexp_no_messages_with_error_03() { grep_test( - &["-E", "-s", INVALID_ERE, "-", BAD_INPUT_FILE], + &["-E", "-s", INVALID_ERE, "-", NONEXISTENT_FILE], LINES_INPUT, "", - "Error compiling regex '{1,3}'\n", + "grep: error compiling regex '{1,3}'\n", 2, ); } @@ -510,7 +522,7 @@ fn test_extended_regexp_no_messages_with_error_03() { #[test] fn test_extended_regexp_no_messages_with_error_04() { grep_test( - &["-E", "-q", "-s", ERE, BAD_INPUT_FILE, "-"], + &["-E", "-q", "-s", ERE, NONEXISTENT_FILE, "-"], LINES_INPUT, "", "", @@ -523,9 +535,12 @@ fn test_extended_regexp_no_messages_with_error_05() { grep_test( &["-E", "-s", ERE, INVALID_LINE_INPUT_FILE], "", - "line_{1}\np_line_{2}_s\n", + "\ +line_{1} +p_line_{2}_s +", "", - 2, + 0, ); } @@ -593,9 +608,12 @@ fn test_fixed_strings_03() { grep_test( &["-F", FIXED, INVALID_LINE_INPUT_FILE], "", - "line_{1}\np_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +line_{1} +p_line_{2}_s +", + "", + 0, ); } @@ -638,7 +656,7 @@ fn test_fixed_strings_quiet_without_error_02() { #[test] fn test_fixed_strings_quiet_with_error_01() { grep_test( - &["-F", "-q", FIXED, "-", BAD_INPUT_FILE], + &["-F", "-q", FIXED, "-", NONEXISTENT_FILE], LINES_INPUT, "", "", @@ -649,10 +667,10 @@ fn test_fixed_strings_quiet_with_error_01() { #[test] fn test_fixed_strings_quiet_with_error_02() { grep_test( - &["-F", "-q", FIXED, BAD_INPUT_FILE, "-"], + &["-F", "-q", FIXED, NONEXISTENT_FILE, "-"], LINES_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 0, ); } @@ -660,10 +678,10 @@ fn test_fixed_strings_quiet_with_error_02() { #[test] fn test_fixed_strings_quiet_with_error_03() { grep_test( - &["-F", "-q", FIXED, "-", BAD_INPUT_FILE], + &["-F", "-q", FIXED, "-", NONEXISTENT_FILE], BAD_INPUT, "", - "tests/grep/inexisting_file: No such file or directory (os error 2)\n", + "grep: tests/grep/nonexistent_file: No such file or directory (os error 2)\n", 2, ); } @@ -705,9 +723,12 @@ fn test_fixed_strings_line_number_03() { grep_test( &["-F", "-n", FIXED, INVALID_LINE_INPUT_FILE], "", - "1:line_{1}\n3:p_line_{2}_s\n", - "tests/grep/invalid_line: Error reading line 2 (stream did not contain valid UTF-8)\n", - 2, + "\ +1:line_{1} +3:p_line_{2}_s +", + "", + 0, ); } @@ -729,13 +750,13 @@ fn test_fixed_strings_no_messages_without_error_02() { #[test] fn test_fixed_strings_no_messages_with_error_01() { - grep_test(&["-F", "-s", FIXED, "-", BAD_INPUT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); + grep_test(&["-F", "-s", FIXED, "-", NONEXISTENT_FILE], LINES_INPUT, "(standard input):line_{1}\n(standard input):p_line_{2}_s\n(standard input): line_{3} \n(standard input):line_{70}\n", "", 2); } #[test] fn test_fixed_strings_no_messages_with_error_02() { grep_test( - &["-F", "-s", FIXED, "-", BAD_INPUT_FILE], + &["-F", "-s", FIXED, "-", NONEXISTENT_FILE], BAD_INPUT, "", "", @@ -746,10 +767,10 @@ fn test_fixed_strings_no_messages_with_error_02() { #[test] fn test_fixed_strings_no_messages_with_error_03() { grep_test( - &["-F", "-cl", "-s", FIXED, "-", BAD_INPUT_FILE], + &["-F", "-cl", "-s", FIXED, "-", NONEXISTENT_FILE], LINES_INPUT, "", - "Options '-c' and '-l' cannot be used together\n", + "grep: options '-c' and '-l' cannot be used together\n", 2, ); } @@ -757,7 +778,7 @@ fn test_fixed_strings_no_messages_with_error_03() { #[test] fn test_fixed_strings_no_messages_with_error_04() { grep_test( - &["-F", "-q", "-s", FIXED, BAD_INPUT_FILE, "-"], + &["-F", "-q", "-s", FIXED, NONEXISTENT_FILE, "-"], LINES_INPUT, "", "", @@ -772,7 +793,7 @@ fn test_fixed_strings_no_messages_with_error_05() { "", "line_{1}\np_line_{2}_s\n", "", - 2, + 0, ); } @@ -1005,7 +1026,7 @@ fn test_stdin_and_input_files_quiet() { #[test] fn test_stdin_and_input_files_other_options() { - grep_test(&["-insvx", BRE, "-", INPUT_FILE_1, BAD_INPUT_FILE], LINES_INPUT, "(standard input):2:p_line_{2}_s\n(standard input):3: line_{3} \n(standard input):5:p_LINE_{5}_s\n(standard input):6:l_{6}\ntests/grep/f_1:2:p_line_{2}_s\ntests/grep/f_1:3: line_{3} \ntests/grep/f_1:5:p_LINE_{5}_s\ntests/grep/f_1:6:l_{6}\n", "", 2); + grep_test(&["-insvx", BRE, "-", INPUT_FILE_1, NONEXISTENT_FILE], LINES_INPUT, "(standard input):2:p_line_{2}_s\n(standard input):3: line_{3} \n(standard input):5:p_LINE_{5}_s\n(standard input):6:l_{6}\ntests/grep/f_1:2:p_line_{2}_s\ntests/grep/f_1:3: line_{3} \ntests/grep/f_1:5:p_LINE_{5}_s\ntests/grep/f_1:6:l_{6}\n", "", 2); } #[test] From 2fa58535226fc9cf00eed402051a0f17d250ccfb Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Mon, 4 Nov 2024 04:52:42 -0600 Subject: [PATCH 7/7] Fixed fixed string bug --- text/grep.rs | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/text/grep.rs b/text/grep.rs index 3e372c2db..a06002acd 100644 --- a/text/grep.rs +++ b/text/grep.rs @@ -309,21 +309,11 @@ impl Patterns { } /// Checks if input string matches the present patterns. - /// - /// # Arguments - /// - /// * `input` - object that implements [AsRef](AsRef) for [str](str) and describes line. - /// - /// # Returns - /// - /// Returns [bool](bool) - `true` if input matches present patterns, else `false`. fn matches( &self, - input: impl AsRef<[u8]>, + input: &[u8], collect_matching_substrings: bool, ) -> Result> { - let input = input.as_ref(); - let mut matching_substrings = Vec::>::new(); let mut any_pattern_matched = false; @@ -332,7 +322,7 @@ impl Patterns { Patterns::Fixed(patterns, ignore_case, line_regexp) => { let input_ascii_lowercase: Vec; - let input = if *ignore_case { + let input_after_ignore_case = if *ignore_case { input_ascii_lowercase = input.to_ascii_lowercase(); input_ascii_lowercase.as_slice() @@ -341,8 +331,10 @@ impl Patterns { }; for pattern in patterns { + let pattern_as_bytes = pattern.as_bytes(); + if *line_regexp { - if input != pattern.as_bytes() { + if input_after_ignore_case != pattern_as_bytes { continue; } @@ -352,20 +344,24 @@ impl Patterns { any_pattern_matched = true; - matching_substrings.push(pattern.as_bytes().to_vec()); + matching_substrings.push(input.to_vec()); } else { - let pattern_len = pattern.len(); - - for index in memmem::find_iter(input, pattern) { - if !collect_matching_substrings { - return Ok(MatchesData::fast_path_match()); - } + if collect_matching_substrings { + let pattern_as_bytes_len = pattern_as_bytes.len(); - any_pattern_matched = true; + for index in + memmem::find_iter(input_after_ignore_case, pattern_as_bytes) + { + any_pattern_matched = true; - let match_slice = &input[index..(index + pattern_len)]; + let match_slice = &input[index..(index + pattern_as_bytes_len)]; - matching_substrings.push(match_slice.to_vec()); + matching_substrings.push(match_slice.to_vec()); + } + } else { + if memmem::find(input_after_ignore_case, pattern_as_bytes).is_some() { + return Ok(MatchesData::fast_path_match()); + } } } } @@ -503,6 +499,7 @@ struct MatchesData { } impl MatchesData { + #[inline] pub fn fast_path_match() -> MatchesData { MatchesData { any_pattern_matched: true,