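Modify dataset init and CWS preprocess logic: add a train/test mode to ProcessCWS (with UNK_VAL fallback for unknown characters) and copy/build the PKU test set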

supercoderhawk · supercoderhawk · commit c957bc6cc22c · 2017-11-29T21:01:42.000+08:00
diff --git a/python/dnlp/data_process/process_cws.py b/python/dnlp/data_process/process_cws.py
@@ -2,12 +2,13 @@
 import re
 import pickle
 from dnlp.data_process.processor import Preprocessor
-from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS
+from dnlp.utils.constant import TAG_BEGIN, TAG_INSIDE, TAG_END, TAG_SINGLE,CWS_TAGS,UNK_VAL
 
 
 class ProcessCWS(Preprocessor):
   def __init__(self, *, files: tuple = (), dict_path: str = '', base_folder: str = 'dnlp/data', name: str = '',
-               delimiter: tuple = ('。')):
+               mode:str='train',delimiter: tuple = ('。')):
+    self.mode = mode
     self.SPLIT_CHAR = '  '
     if base_folder == '':
       raise Exception('base folder is empty')
@@ -53,17 +54,25 @@ def map_to_indices(self):
       lls = []
       for word in words:
         if len(word) == 1:
-          chs.append(self.dictionary[word])
+          if self.mode == 'train':
+            chs.append(self.dictionary[word] if self.dictionary.get(word) is not None else UNK_VAL)
+          else:
+            chs.append(word)
           lls.append(TAG_SINGLE)
         elif len(word) == 0:
           raise Exception('word length is zero')
         else:
-          chs.extend(map(lambda ch: self.dictionary[ch], word))
+          if self.mode == 'train':
+            chs.extend(map(lambda ch: self.dictionary[ch] if self.dictionary.get(ch) is not None else UNK_VAL, word))
+          else:
+            chs.append(word)
           lls.append(TAG_BEGIN)
           lls.extend([TAG_INSIDE] * (len(word) - 2))
           lls.append(TAG_END)
       characters.append(chs)
       labels.append(lls)
+    if self.mode == 'test':
+      characters = list(map(lambda words:''.join(words),characters))
     return characters, labels
 
   def save_data(self):
diff --git a/python/scripts/init_datasets.py b/python/scripts/init_datasets.py
@@ -9,17 +9,18 @@ def copy():
   dst_base_folder = '../dnlp/data/cws/'
   if not os.path.exists(dst_base_folder):
     os.makedirs(dst_base_folder)
-  pku = 'pku_training.utf8'
-  copyfile(src_folder + pku, dst_base_folder + pku)
+  files = ['pku_training.utf8','pku_test.utf8']
+  for f in files:
+    copyfile(src_folder + f, dst_base_folder + f)
 
 
 def build_cws_datasets():
-  files = ('pku_training.utf8',)
   base_folder = '../dnlp/data/cws/'
   if not os.path.exists(base_folder):
     os.makedirs(base_folder)
-  ProcessCWS(files=files, base_folder=base_folder, name='pku_training')
-
+  ProcessCWS(files=('pku_training.utf8',), base_folder=base_folder, name='pku_training')
+  dict_path = base_folder + 'pku_training_dict.utf8'
+  ProcessCWS(files=('pku_test.utf8',), dict_path=dict_path,base_folder=base_folder, name='pku_test',mode='test')
 
 if __name__ == '__main__':
   copy()