|
2 | 2 | import re
|
3 | 3 | import pickle
|
4 | 4 | from dnlp.data_process.processor import Preprocessor
|
5 |
| -from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS |
| 5 | +from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS,UNK_VAL |
6 | 6 |
|
7 | 7 |
|
8 | 8 | class ProcessCWS(Preprocessor):
|
9 | 9 | def __init__(self, *, files: tuple = (), dict_path: str = '', base_folder: str = 'dnlp/data', name: str = '',
|
10 |
| - delimiter: tuple = ('。')): |
| 10 | + mode:str='train',delimiter: tuple = ('。')): |
| 11 | + self.mode = mode |
11 | 12 | self.SPLIT_CHAR = ' '
|
12 | 13 | if base_folder == '':
|
13 | 14 | raise Exception('base folder is empty')
|
@@ -53,17 +54,25 @@ def map_to_indices(self):
|
53 | 54 | lls = []
|
54 | 55 | for word in words:
|
55 | 56 | if len(word) == 1:
|
56 |
| - chs.append(self.dictionary[word]) |
| 57 | + if self.mode == 'train': |
| 58 | + chs.append(self.dictionary[word] if self.dictionary.get(word) is not None else UNK_VAL) |
| 59 | + else: |
| 60 | + chs.append(word) |
57 | 61 | lls.append(TAG_SINGLE)
|
58 | 62 | elif len(word) == 0:
|
59 | 63 | raise Exception('word length is zero')
|
60 | 64 | else:
|
61 |
| - chs.extend(map(lambda ch: self.dictionary[ch], word)) |
| 65 | + if self.mode == 'train': |
| 66 | + chs.extend(map(lambda ch: self.dictionary[ch] if self.dictionary.get(ch) is not None else UNK_VAL, word)) |
| 67 | + else: |
| 68 | + chs.append(word) |
62 | 69 | lls.append(TAG_BEGIN)
|
63 | 70 | lls.extend([TAG_INSIDE] * (len(word) - 2))
|
64 | 71 | lls.append(TAG_END)
|
65 | 72 | characters.append(chs)
|
66 | 73 | labels.append(lls)
|
| 74 | + if self.mode == 'test': |
| 75 | + characters = list(map(lambda words:''.join(words),characters)) |
67 | 76 | return characters, labels
|
68 | 77 |
|
69 | 78 | def save_data(self):
|
|
0 commit comments