Skip to content

Commit 1c756a0

Browse files
authored
Handle invalid UTF-8 encoding in strings (#295)
Log a warning and then retry with the replacement error handling approach.
1 parent 9b6db0f commit 1c756a0

File tree

2 files changed

+47
-2
lines changed

2 files changed

+47
-2
lines changed

nptdms/test/test_types.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from datetime import date, datetime
44
import io
55
import numpy as np
6+
import struct
67
import pytest
78

89
from nptdms import types
@@ -54,3 +55,33 @@ def test_timestamp_from_date():
5455
read_datetime = types.TimeStamp.read(data_file)
5556

5657
assert expected_datetime == read_datetime.as_datetime64()
58+
59+
60+
def test_invalid_utf8_string_read(caplog):
    """Read one length-prefixed string whose payload is not valid UTF-8.

    The value is still returned, with the undecodable byte shown as the
    replacement character, and a warning mentioning the escaped bytes
    is captured in the log.
    """
    length_prefix = struct.pack("<L", 3)
    payload = b'0 \xb0'
    stream = io.BytesIO(length_prefix + payload)

    decoded = types.String.read(stream)

    logged = caplog.text
    assert decoded == "0 �"
    assert "WARNING" in logged
    assert "0 \\xb0" in logged
68+
69+
70+
def test_invalid_utf8_strings_read(caplog):
    """Read several strings in one call when only the middle one is invalid.

    The input is built as a table of cumulative end offsets (one
    little-endian unsigned 32-bit value per string) followed by the
    concatenated string payloads.
    """
    payloads = [b'hello', b'0 \xb0', b'world']

    # End offset of each string = total length of all payloads up to and
    # including that string.
    end_offsets = [
        sum(map(len, payloads[:count]))
        for count in range(1, len(payloads) + 1)
    ]
    offset_table = [struct.pack("<L", end) for end in end_offsets]
    stream = io.BytesIO(b''.join(offset_table) + b''.join(payloads))

    decoded = types.String.read_values(stream, len(payloads))

    logged = caplog.text
    assert decoded == ["hello", "0 �", "world"]
    assert "WARNING" in logged
    assert "0 \\xb0" in logged

nptdms/types.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
import numpy as np
44
import struct
55
from nptdms.timestamp import TdmsTimestamp, TimestampArray
6+
from nptdms.log import log_manager
7+
8+
9+
log = log_manager.get_logger(__name__)
610

711

812
__all__ = [
@@ -205,7 +209,7 @@ def __init__(self, value):
205209
def read(file, endianness="<"):
206210
size_bytes = file.read(4)
207211
size = _struct_unpack(endianness + 'L', size_bytes)[0]
208-
return file.read(size).decode('utf-8')
212+
return String._decode(file.read(size))
209213

210214
@classmethod
211215
def read_values(cls, file, number_values, endianness="<"):
@@ -220,9 +224,19 @@ def read_values(cls, file, number_values, endianness="<"):
220224
strings = []
221225
for i in range(number_values):
222226
s = file.read(offsets[i + 1] - offsets[i])
223-
strings.append(s.decode('utf-8'))
227+
strings.append(String._decode(s))
224228
return strings
225229

230+
@staticmethod
def _decode(string_bytes):
    """Decode raw bytes as UTF-8, tolerating invalid sequences.

    A strict decode is attempted first. If that raises
    UnicodeDecodeError, a warning with the offending bytes and the
    error is logged and the bytes are decoded again using the
    'replace' error handler, so a string is returned either way.
    """
    try:
        decoded = string_bytes.decode('utf-8')
    except UnicodeDecodeError as err:
        log.warning(
            "Error decoding string from bytes %s, retrying with replace handler: %s",
            string_bytes, err)
        decoded = string_bytes.decode('utf-8', errors='replace')
    return decoded
239+
226240

227241
@tds_data_type(0x21, np.bool_)
228242
class Boolean(StructType):

0 commit comments

Comments
 (0)