9
9
% case-insensitive BERTTokenizer using the file vocabFile as
10
10
% the vocabulary.
11
11
%
12
- % tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
13
- % Constructs a BERTTokenizer which is case-sensitive or not
14
- % according to the scalar logical tf. The default is true.
12
+ % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
13
+ % specifies the optional parameter name/value pairs:
14
+ %
15
+ % 'IgnoreCase' - A logical value. If true, the BERTTokenizer
16
+ % is case-insensitive; if false, it is case-sensitive.
17
+ % The default value is true.
18
+ %
19
+ % 'FullTokenizer' - The underlying word-piece tokenizer, a
20
+ % bert.tokenizer.internal.FullTokenizer. If not specified, a default
21
+ % FullTokenizer is constructed from vocabFile and 'IgnoreCase'.
15
22
%
16
23
% BERTTokenizer properties:
17
24
% FullTokenizer - The underlying word-piece tokenizer.
34
41
% tokenizer = bert.tokenizer.BERTTokenizer();
35
42
% sequences = tokenizer.encode("Hello World!")
36
43
37
- % Copyright 2021 The MathWorks, Inc.
44
+ % Copyright 2021-2023 The MathWorks, Inc.
38
45
39
46
properties (Constant )
40
47
PaddingToken = " [PAD]"
63
70
% case-insensitive BERTTokenizer using the file vocabFile as
64
71
% the vocabulary.
65
72
%
66
- % tokenizer = BERTTokenizer(vocabFile,'IgnoreCase',tf)
67
- % Constructs a BERTTokenizer which is case-sensitive or not
68
- % according to the scalar logical tf. The default is true.
73
+ % tokenizer = BERTTokenizer(vocabFile,'PARAM1', VAL1, 'PARAM2', VAL2, ...)
74
+ % specifies the optional parameter name/value pairs:
75
+ %
76
+ % 'IgnoreCase' - A logical value. If true, the BERTTokenizer
77
+ % is case-insensitive; if false, it is case-sensitive.
78
+ % The default value is true.
79
+ %
80
+ % 'FullTokenizer' - The underlying word-piece tokenizer, a
81
+ % bert.tokenizer.internal.FullTokenizer. If not specified, a default
82
+ % FullTokenizer is constructed from vocabFile and 'IgnoreCase'.
69
83
%
70
84
% BERTTokenizer properties:
71
85
% FullTokenizer - The underlying word-piece tokenizer.
90
104
arguments
91
105
vocabFile (1 ,1 ) string {mustBeFile } = bert.internal.getSupportFilePath(" base" ," vocab.txt" )
92
106
nvp.IgnoreCase (1 ,1 ) logical = true
107
+ nvp.FullTokenizer = []
108
+ end
109
+ if isempty(nvp .FullTokenizer )
110
+ ignoreCase = nvp .IgnoreCase ;
111
+ this.FullTokenizer = bert .tokenizer .internal .FullTokenizer(vocabFile ,' IgnoreCase' ,ignoreCase );
112
+ else
113
+ mustBeA(nvp .FullTokenizer ,' bert.tokenizer.internal.FullTokenizer' );
114
+ this.FullTokenizer = nvp .FullTokenizer ;
93
115
end
94
- ignoreCase = nvp .IgnoreCase ;
95
- this.FullTokenizer = bert .tokenizer .internal .FullTokenizer(vocabFile ,' IgnoreCase' ,ignoreCase );
96
116
this.PaddingCode = this .FullTokenizer .encode(this .PaddingToken );
97
117
this.SeparatorCode = this .FullTokenizer .encode(this .SeparatorToken );
98
118
this.StartCode = this .FullTokenizer .encode(this .StartToken );
131
151
inputShape = size(text_a );
132
152
text_a = reshape(text_a ,[],1 );
133
153
text_b = reshape(text_b ,[],1 );
134
- tokenize = @(text ) this .FullTokenizer .tokenize(text );
135
- tokens = arrayfun(tokenize ,text_a ,' UniformOutput' ,false );
154
+ tokens = this .FullTokenizer .tokenize(text_a );
136
155
if ~isempty(text_b )
137
- tokens_b = arrayfun( tokenize , text_b , ' UniformOutput ' , false );
156
+ tokens_b = this . FullTokenizer . tokenize( text_b );
138
157
tokens = cellfun(@(tokens_a ,tokens_b ) [tokens_a ,this .SeparatorToken ,tokens_b ], tokens , tokens_b , ' UniformOutput' , false );
139
158
end
140
159
tokens = cellfun(@(tokens ) [this .StartToken , tokens , this .SeparatorToken ], tokens , ' UniformOutput' , false );
218
237
text = cellfun(@(x ) join(x ," " ), tokens );
219
238
end
220
239
end
221
- end
240
+ end
0 commit comments