Skip to content

Commit 2cb7315

Browse files
Fix handling of non-UTF-8 data
1 parent bcdd602 commit 2cb7315

File tree

6 files changed

+160
-91
lines changed

6 files changed

+160
-91
lines changed

text/diff_util/file_diff.rs

+98-68
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,19 @@ use super::{
33
constants::COULD_NOT_UNWRAP_FILENAME,
44
diff_exit_status::DiffExitStatus,
55
file_data::{FileData, LineReader},
6-
functions::{check_existence, is_binary, system_time_to_rfc2822},
6+
functions::{check_existence, system_time_to_rfc2822},
77
hunks::Hunks,
88
};
99
use crate::diff_util::constants::NO_NEW_LINE_AT_END_OF_FILE;
1010
use std::{
1111
cmp::Reverse,
1212
collections::HashMap,
1313
fmt::Write,
14-
fs::{read_to_string, File},
14+
fs::{self, File},
1515
io::{self, BufReader, Read},
1616
os::unix::fs::MetadataExt,
17-
path::Path,
17+
path::{Display, Path},
18+
str,
1819
};
1920

2021
pub struct FileDiff<'a> {
@@ -46,72 +47,78 @@ impl<'a> FileDiff<'a> {
4647
format_options: &FormatOptions,
4748
show_if_different: Option<String>,
4849
) -> io::Result<DiffExitStatus> {
49-
if is_binary(path1)? || is_binary(path2)? {
50-
Self::binary_file_diff(path1, path2)
51-
} else {
52-
let content1 = read_to_string(path1)?.into_bytes();
53-
let linereader1 = LineReader::new(&content1);
54-
let ends_with_newline1 = linereader1.ends_with_newline();
55-
let mut lines1 = Vec::new();
56-
for line in linereader1 {
57-
if !format_options.ignore_trailing_white_spaces {
58-
lines1.push(line);
59-
} else {
60-
lines1.push(line.trim_end());
50+
let path1_vec = fs::read(path1)?;
51+
let path2_vec = fs::read(path2)?;
52+
53+
// Try normal diff first
54+
if let Ok(path1_str) = str::from_utf8(path1_vec.as_slice()) {
55+
if let Ok(path2_str) = str::from_utf8(path2_vec.as_slice()) {
56+
// Both files consist only of valid UTF-8 data
57+
let linereader1 = LineReader::new(path1_str.as_bytes());
58+
let ends_with_newline1 = linereader1.ends_with_newline();
59+
let mut lines1 = Vec::new();
60+
for line in linereader1 {
61+
if !format_options.ignore_trailing_white_spaces {
62+
lines1.push(line);
63+
} else {
64+
lines1.push(line.trim_end());
65+
}
6166
}
62-
}
6367

64-
let content2 = read_to_string(path2)?.into_bytes();
65-
let linereader2 = LineReader::new(&content2);
66-
let ends_with_newline2 = linereader2.ends_with_newline();
67-
let mut lines2 = Vec::new();
68-
for line in linereader2 {
69-
if !format_options.ignore_trailing_white_spaces {
70-
lines2.push(line);
71-
} else {
72-
lines2.push(line.trim_end());
68+
let linereader2 = LineReader::new(path2_str.as_bytes());
69+
let ends_with_newline2 = linereader2.ends_with_newline();
70+
let mut lines2 = Vec::new();
71+
for line in linereader2 {
72+
if !format_options.ignore_trailing_white_spaces {
73+
lines2.push(line);
74+
} else {
75+
lines2.push(line.trim_end());
76+
}
7377
}
74-
}
75-
let mut file1 = FileData::get_file(path1, lines1, ends_with_newline1)?;
76-
let mut file2 = FileData::get_file(path2, lines2, ends_with_newline2)?;
77-
78-
let mut diff = FileDiff::new(&mut file1, &mut file2, format_options);
79-
80-
// histogram diff
81-
let mut lcs_indices = vec![-1_i32; diff.file1.lines().len()];
82-
let num_lines1 = diff.file1.lines().len();
83-
let num_lines2 = diff.file2.lines().len();
84-
FileDiff::histogram_lcs(
85-
diff.file1,
86-
diff.file2,
87-
0,
88-
num_lines1,
89-
0,
90-
num_lines2,
91-
&mut lcs_indices,
92-
);
78+
let mut file1 = FileData::get_file(path1, lines1, ends_with_newline1)?;
79+
let mut file2 = FileData::get_file(path2, lines2, ends_with_newline2)?;
80+
81+
let mut diff = FileDiff::new(&mut file1, &mut file2, format_options);
82+
83+
// histogram diff
84+
let mut lcs_indices = vec![-1_i32; diff.file1.lines().len()];
85+
let num_lines1 = diff.file1.lines().len();
86+
let num_lines2 = diff.file2.lines().len();
87+
FileDiff::histogram_lcs(
88+
diff.file1,
89+
diff.file2,
90+
0,
91+
num_lines1,
92+
0,
93+
num_lines2,
94+
&mut lcs_indices,
95+
);
9396

94-
diff.hunks
95-
.create_hunks_from_lcs(&lcs_indices, num_lines1, num_lines2);
97+
diff.hunks
98+
.create_hunks_from_lcs(&lcs_indices, num_lines1, num_lines2);
9699

97-
if diff.hunks.hunk_count() > 0 {
98-
diff.are_different = true;
99-
}
100+
if diff.hunks.hunk_count() > 0 {
101+
diff.are_different = true;
102+
}
100103

101-
if diff.are_different {
102-
if let Some(show_if_different) = show_if_different {
103-
println!("{show_if_different}");
104+
if diff.are_different {
105+
if let Some(show_if_different) = show_if_different {
106+
println!("{show_if_different}");
107+
}
108+
} else {
109+
Self::print_identical_message_if_appropriate(
110+
format_options,
111+
path1.display(),
112+
path2.display(),
113+
);
104114
}
105-
} else if format_options.report_identical_files {
106-
println!(
107-
"Files {} and {} are identical",
108-
path1.display(),
109-
path2.display()
110-
);
111-
}
112115

113-
diff.print()
116+
return diff.print();
117+
}
114118
}
119+
120+
// Fall back to binary diff
121+
Self::binary_file_diff(format_options, path1, path2)
115122
}
116123

117124
pub fn file_dir_diff(
@@ -148,18 +155,33 @@ impl<'a> FileDiff<'a> {
148155
}
149156
}
150157

151-
fn binary_file_diff(file1_path: &Path, file2_path: &Path) -> io::Result<DiffExitStatus> {
152-
let differ_report = format!(
153-
"Binary files {} and {} differ",
154-
file1_path.display(),
155-
file2_path.display()
156-
);
158+
fn print_identical_message_if_appropriate(
159+
format_options: &FormatOptions,
160+
path_1_display: Display,
161+
path_2_display: Display,
162+
) {
163+
if format_options.report_identical_files {
164+
println!("Files {path_1_display} and {path_2_display} are identical",);
165+
}
166+
}
167+
168+
fn binary_file_diff(
169+
format_options: &FormatOptions,
170+
file1_path: &Path,
171+
file2_path: &Path,
172+
) -> io::Result<DiffExitStatus> {
173+
let file1_path_display = file1_path.display();
174+
let file2_path_display = file2_path.display();
175+
176+
let differ_report =
177+
format!("Binary files {file1_path_display} and {file2_path_display} differ",);
157178

158179
let file1 = File::open(file1_path)?;
159180
let file2 = File::open(file2_path)?;
160181

161182
if file1.metadata()?.size() != file2.metadata()?.size() {
162-
println!("{}", differ_report);
183+
println!("{differ_report}");
184+
163185
return Ok(DiffExitStatus::Different);
164186
}
165187

@@ -168,12 +190,20 @@ impl<'a> FileDiff<'a> {
168190

169191
for bytes_pair in file1.bytes().zip(file2.bytes()) {
170192
let (b1, b2) = (bytes_pair.0?, bytes_pair.1?);
193+
171194
if b1 != b2 {
172-
println!("{}", differ_report);
195+
println!("{differ_report}");
196+
173197
return Ok(DiffExitStatus::Different);
174198
}
175199
}
176200

201+
Self::print_identical_message_if_appropriate(
202+
format_options,
203+
file1_path_display,
204+
file2_path_display,
205+
);
206+
177207
Ok(DiffExitStatus::NotDifferent)
178208
}
179209

text/diff_util/functions.rs

+1-23
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,10 @@
11
use chrono::{DateTime, Local};
2-
use std::{
3-
fs::File,
4-
io::{self, Read},
5-
path::Path,
6-
time::SystemTime,
7-
};
8-
9-
use super::constants::UTF8_NOT_ALLOWED_BYTES;
2+
use std::{path::Path, time::SystemTime};
103

114
pub fn system_time_to_rfc2822(system_time: SystemTime) -> String {
125
Into::<DateTime<Local>>::into(system_time).to_rfc2822()
136
}
147

15-
pub fn is_binary(file_path: &Path) -> io::Result<bool> {
16-
let mut file = File::open(file_path)?;
17-
let mut buffer = [0; 1024];
18-
19-
if let Ok(count) = file.read(&mut buffer) {
20-
for buf_item in buffer.iter().take(count) {
21-
if UTF8_NOT_ALLOWED_BYTES.contains(buf_item) {
22-
return Ok(true);
23-
}
24-
}
25-
}
26-
27-
Ok(false)
28-
}
29-
308
pub fn check_existence(path: &Path) -> bool {
319
if !path.exists() {
3210
eprintln!("diff: {}: No such file or directory", path.display());

text/tests/diff-tests.rs

+58
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,30 @@ fn f1_txt_with_eol_spaces_path() -> String {
9393
.to_string()
9494
}
9595

96+
fn binary_a_path() -> String {
97+
diff_base_path()
98+
.join("binary-a")
99+
.to_str()
100+
.expect("Could not unwrap binary_a_path")
101+
.to_string()
102+
}
103+
104+
fn binary_b_path() -> String {
105+
diff_base_path()
106+
.join("binary-b")
107+
.to_str()
108+
.expect("Could not unwrap binary_b_path")
109+
.to_string()
110+
}
111+
112+
fn binary_c_path() -> String {
113+
diff_base_path()
114+
.join("binary-c")
115+
.to_str()
116+
.expect("Could not unwrap binary_c_path")
117+
.to_string()
118+
}
119+
96120
struct DiffTestHelper {
97121
content: String,
98122
file1_path: String,
@@ -485,3 +509,37 @@ diff: /a74c002739306869: No such file or directory
485509
2_i32,
486510
);
487511
}
512+
513+
#[test]
514+
fn test_diff_s_binary() {
515+
let binary_a = binary_a_path();
516+
let binary_b = binary_b_path();
517+
518+
let binary_a_str = binary_a.as_str();
519+
let binary_b_str = binary_b.as_str();
520+
521+
for short_or_long in ["-s", "--report-identical-files"] {
522+
diff_test(
523+
&[short_or_long, binary_a_str, binary_b_str],
524+
format!("Files {binary_a_str} and {binary_b_str} are identical\n").as_str(),
525+
EXIT_STATUS_NO_DIFFERENCE,
526+
);
527+
}
528+
}
529+
530+
#[test]
531+
fn test_diff_binary_differ() {
532+
let binary_a = binary_a_path();
533+
let binary_c = binary_c_path();
534+
535+
let binary_a_str = binary_a.as_str();
536+
let binary_c_str = binary_c.as_str();
537+
538+
for short_or_long in ["-s", "--report-identical-files"] {
539+
diff_test(
540+
&[short_or_long, binary_a_str, binary_c_str],
541+
format!("Binary files {binary_a_str} and {binary_c_str} differ\n").as_str(),
542+
EXIT_STATUS_DIFFERENCE,
543+
);
544+
}
545+
}

text/tests/diff/binary-a

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1 A � B � 2

text/tests/diff/binary-b

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1 A � B � 2

text/tests/diff/binary-c

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1 A � B � 2

0 commit comments

Comments
 (0)