Skip to content

Commit f957698

Browse files
authored
Merge pull request #354 from andrewliebenow/tr-mixed-binary-utf-8-processing
tr: implement support for non-UTF-8 input
2 parents 3ba8f56 + f50e7f2 commit f957698

File tree

4 files changed

+1662
-943
lines changed

4 files changed

+1662
-943
lines changed

Cargo.lock

-7
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

text/Cargo.toml

-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ libc.workspace = true
1818
notify-debouncer-full = "0.3"
1919
diff = "0.1"
2020
dirs = "5.0"
21-
deunicode = "1.6"
2221
walkdir = "2"
2322

2423
[dev-dependencies]

text/tests/tr/mod.rs

+162-10
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,37 @@
88
// SPDX-License-Identifier: MIT
99
//
1010

11-
use plib::testing::{run_test, TestPlan};
11+
use plib::testing::{run_test, run_test_u8, TestPlan, TestPlanU8};
12+
13+
fn tr_test_binary(args: &[&str], test_data: &[u8], expected_output: &[u8]) {
14+
let str_args = args
15+
.iter()
16+
.map(|st| st.to_owned().to_owned())
17+
.collect::<Vec<_>>();
18+
19+
run_test_u8(TestPlanU8 {
20+
cmd: "tr".to_owned(),
21+
args: str_args,
22+
stdin_data: test_data.to_owned(),
23+
expected_out: expected_output.to_owned(),
24+
expected_err: Vec::<u8>::new(),
25+
expected_exit_code: 0_i32,
26+
});
27+
}
1228

1329
fn tr_test(args: &[&str], test_data: &str, expected_output: &str) {
1430
let str_args = args
1531
.iter()
1632
.map(|st| st.to_owned().to_owned())
17-
.collect::<Vec<String>>();
33+
.collect::<Vec<_>>();
1834

1935
run_test(TestPlan {
2036
cmd: "tr".to_owned(),
2137
args: str_args,
2238
stdin_data: test_data.to_owned(),
2339
expected_out: expected_output.to_owned(),
2440
expected_err: String::new(),
25-
expected_exit_code: 0,
41+
expected_exit_code: 0_i32,
2642
});
2743
}
2844

@@ -38,7 +54,7 @@ fn tr_bad_arguments_failure_test(args: &[&str], expected_stderr: &str) {
3854
stdin_data: String::new(),
3955
expected_out: String::new(),
4056
expected_err: expected_stderr.to_owned(),
41-
expected_exit_code: 1,
57+
expected_exit_code: 1_i32,
4258
});
4359
}
4460

@@ -482,23 +498,23 @@ fn tr_bad_octal_range() {
482498
fn tr_bad_x_n_construct_decimal() {
483499
tr_bad_arguments_failure_test(
484500
&["-d", "[a*100000000000000000000]"],
485-
"tr: invalid repeat count 100000000000000000000 in [c*n] construct\n",
501+
"tr: invalid repeat count '100000000000000000000' in [c*n] construct\n",
486502
);
487503
}
488504

489505
#[test]
490506
fn tr_bad_x_n_construct_octal() {
491507
tr_bad_arguments_failure_test(
492508
&["-d", "[a*010000000000000000000000]"],
493-
"tr: invalid repeat count 010000000000000000000000 in [c*n] construct\n",
509+
"tr: invalid repeat count '010000000000000000000000' in [c*n] construct\n",
494510
);
495511
}
496512

497513
#[test]
498514
fn tr_bad_x_n_construct_non_decimal_non_octal() {
499515
tr_bad_arguments_failure_test(
500516
&["-d", "[a*a]"],
501-
"tr: invalid repeat count ‘a’ in [c*n] construct\n",
517+
"tr: invalid repeat count 'a' in [c*n] construct\n",
502518
);
503519
}
504520

@@ -597,7 +613,7 @@ fn tr_equivalence_class_low_priority() {
597613
fn tr_arguments_validation_error_message_format() {
598614
tr_bad_arguments_failure_test(
599615
&["a"],
600-
"tr: missing operand after ‘a’. Two strings must be given when translating.\n",
616+
"tr: missing operand after 'a'. Two strings must be given when translating.\n",
601617
);
602618
}
603619

@@ -633,11 +649,147 @@ fn tr_minimal_d_s() {
633649
fn tr_missing_equiv() {
634650
tr_bad_arguments_failure_test(
635651
&["-d", "[==]"],
636-
"tr: missing equivalence class character '[==]'\n",
652+
"tr: input '[==]' is invalid: missing equivalence class character\n",
637653
);
638654
}
639655

640656
#[test]
641657
fn tr_missing_character_class() {
642-
tr_bad_arguments_failure_test(&["-d", "[::]"], "tr: missing character class name '[::]'\n");
658+
tr_bad_arguments_failure_test(
659+
&["-d", "[::]"],
660+
"tr: input '[::]' is invalid: missing character class name\n",
661+
);
662+
}
663+
664+
#[test]
665+
fn tr_8_bit() {
666+
tr_test_binary(&[r"\377", "A"], b"\xFF", b"A");
667+
}
668+
669+
#[test]
670+
fn tr_multi_byte_utf_8() {
671+
tr_test(&["-d", "ᛆᚠ"], "ᛆᚠᛏᚢᛆᛘᚢᚦᛌᛏᚭᚿᛏᛆᚱᚢᚿᛆᛧᚦᛆᛧ", "ᛏᚢᛘᚢᚦᛌᛏᚭᚿᛏᚱᚢᚿᛧᚦᛧ");
672+
}
673+
674+
#[test]
675+
fn tr_c_d_s_squeeze_not_complemented() {
676+
tr_test(&["-c", "-d", "-s", "D", "D"], "DDD AAABBBCCC DDD", "D");
677+
}
678+
679+
#[test]
680+
fn tr_squeeze_independent_of_translation() {
681+
tr_test(&["-s", "1", "23"], "111 222 333", "2 2 3");
682+
}
683+
684+
#[test]
685+
fn tr_complemented_squeeze_independent_of_translation() {
686+
tr_test(&["-c", "-s", "1", "23"], "111 222 333", "1113");
687+
}
688+
689+
#[test]
690+
fn tr_c_s_as_many_as_needed() {
691+
tr_test(&["-c", "-s", "B", "[d*]"], "AAA BBB CCC", "dBBBd");
692+
}
693+
694+
// Only BusyBox and uutils' coreutils handle this "correctly"
695+
// bsdutils runs forever (or a very long time)
696+
// GNU Core Utilities rejects this because of the "[C*]" argument
697+
#[test]
698+
fn tr_non_standard_d_s() {
699+
tr_test(&["-d", "-s", "B", "[C*]"], "AAA BBB CCC DDD", "AAA C DDD");
700+
}
701+
702+
// Different from bsdutils, but bsdutils doesn't handle 8-bit non-UTF-8 data
703+
#[test]
704+
fn tr_multi_byte_complement_translation() {
705+
tr_test(&["-c", "ᛏ", "A"], "ᛆᚠᛏ", "AAAAAAᛏ");
706+
}
707+
708+
#[test]
709+
fn tr_multi_byte_indexing_check() {
710+
tr_test(&["-c", "ᛏ", "B"], "ᛏA", "ᛏB");
711+
}
712+
713+
// BusyBox does not parse backslash escape sequences inside [x*n] constructs
714+
// Other implementations do
715+
#[test]
716+
fn tr_slash_n_as_many_as_needed() {
717+
tr_test(
718+
&["b-c", r"[\n*]"],
719+
"The big black fox jumped over the fence",
720+
"\
721+
The
722+
ig
723+
la
724+
k fox jumped over the fen
725+
e",
726+
);
727+
}
728+
729+
#[test]
730+
fn tr_slash_n_broken_x_n_construct() {
731+
tr_test(
732+
&["b-e", r"[\nZ]"],
733+
"The big black fox jumped over the fence",
734+
"\
735+
Th] [ig [la
736+
k fox jump]Z ov]r th] f]n
737+
]",
738+
);
739+
}
740+
741+
#[test]
742+
fn tr_octal_in_as_many_as_needed() {
743+
tr_test(
744+
&["a-d", r"[\123*2]9"],
745+
"The big black fox jumped over the fence",
746+
"The Sig SlS9k fox jumpe9 over the fen9e",
747+
);
748+
}
749+
750+
#[test]
751+
fn tr_invalid_octal_in_as_many_as_needed() {
752+
tr_test(
753+
&["a-d", r"[\128*2]"],
754+
"The big black fox jumped over the fence",
755+
"\
756+
The
757+
ig
758+
l[8k fox jumpe* over the fen8e",
759+
);
760+
}
761+
762+
#[test]
763+
fn tr_invalid_multi_byte_range() {
764+
tr_bad_arguments_failure_test(
765+
&["-d", "ᛆ-ᚦ"],
766+
r"tr: range-endpoints of '\u{16c6}-\u{16a6}' are in reverse collating sequence order
767+
",
768+
);
769+
}
770+
771+
#[test]
772+
fn tr_multi_byte_range() {
773+
tr_test(
774+
&["-d", "ᚢ-ᛆ"],
775+
"A ᛆᚠᛏᚢᛆᛘᚢᚦᛌᛏᚭᚿᛏᛆᚱᚢᚿᛆᛧᚦᛆᛧ B",
776+
"\
777+
A ᚠᛏᛘᛌᛏᛏᛧᛧ B",
778+
);
779+
}
780+
781+
#[test]
782+
fn tr_multi_byte_squeeze_translate() {
783+
tr_test(&["-s", "ᚢ", "A"], "123 ᚢᚢᚢᚢᚢᚢ 456", "123 A 456");
784+
}
785+
786+
#[test]
787+
fn tr_dash_d_two_strings() {
788+
tr_bad_arguments_failure_test(
789+
&["-d", "A", "B"],
790+
"\
791+
tr: extra operand 'B'
792+
Only one string may be given when deleting without squeezing repeats.
793+
",
794+
);
643795
}

0 commit comments

Comments
 (0)