Fix DNN-CRF prediction bug: map out-of-vocabulary characters to UNK in sentence2indices

supercoderhawk · supercoderhawk · commit 73001ee11354 · 2017-11-29T23:14:32.000+08:00
diff --git a/python/dnlp/core/dnn_crf_base.py b/python/dnlp/core/dnn_crf_base.py
@@ -2,11 +2,11 @@
 import numpy as np
 import pickle
 from dnlp.config.config import DnnCrfConfig
-from dnlp.utils.constant import BATCH_PAD, STRT_VAL, END_VAL, TAG_PAD, TAG_BEGIN, TAG_INSIDE, TAG_SINGLE
+from dnlp.utils.constant import BATCH_PAD, UNK, STRT_VAL, END_VAL, TAG_PAD, TAG_BEGIN, TAG_INSIDE, TAG_SINGLE
 
 
 class DnnCrfBase(object):
-  def __init__(self, config: DnnCrfConfig=None, data_path: str = '', mode: str = 'train', model_path: str = ''):
+  def __init__(self, config: DnnCrfConfig = None, data_path: str = '', mode: str = 'train', model_path: str = ''):
     # 加载数据
     self.data_path = data_path
     self.config_suffix = '.config.pickle'
@@ -18,7 +18,7 @@ def __init__(self, config: DnnCrfConfig=None, data_path: str = '', mode: str = '
       self.dictionary, self.tags = self.__load_config()
     self.tags_count = len(self.tags) - 1  # 忽略TAG_PAD
     self.tags_map = self.__generate_tag_map()
-    self.reversed_tags_map = dict(zip(self.tags_map.values(),self.tags_map.keys()))
+    self.reversed_tags_map = dict(zip(self.tags_map.values(), self.tags_map.keys()))
     self.dict_size = len(self.dictionary)
     # 初始化超参数
     self.skip_left = config.skip_left
@@ -82,7 +82,7 @@ def get_batch(self) -> (np.ndarray, np.ndarray, np.ndarray):
       else:
         ext_size = self.batch_length - len(chs)
         chs_batch[i] = chs + ext_size * [self.dictionary[BATCH_PAD]]
-        lls_batch[i] = list(map(lambda t: self.tags_map[t], lls)) + ext_size * [0]#[self.tags_map[TAG_PAD]]
+        lls_batch[i] = list(map(lambda t: self.tags_map[t], lls)) + ext_size * [0]  # [self.tags_map[TAG_PAD]]
 
     self.batch_start = new_start
     return self.indices2input(chs_batch), np.array(lls_batch, dtype=np.int32), np.array(len_batch, dtype=np.int32)
@@ -111,7 +111,8 @@ def viterbi(self, emission: np.ndarray, transition: np.ndarray, transition_init:
     return corr_path
 
   def sentence2indices(self, sentence: str) -> list:
-    return list(map(lambda ch: self.dictionary[ch], sentence))
+    expr = lambda ch: self.dictionary[ch] if ch in self.dictionary else self.dictionary[UNK]
+    return list(map(expr, sentence))
 
   def indices2input(self, indices: list) -> np.ndarray:
     res = []
@@ -173,10 +174,10 @@ def tags2entities(self, sentence: str, tags_seq: np.ndarray, return_start: bool
     else:
       return entities
 
-  def tag2sequences(self, tags_seq:np.ndarray):
+  def tag2sequences(self, tags_seq: np.ndarray):
     seq = []
 
     for tag in tags_seq:
       seq.append(self.reversed_tags_map[tag])
 
-    return seq
+    return seq