
Commit 9e4644c

initial commit

0 parents  commit 9e4644c

18 files changed: 1950 additions & 0 deletions

README.md

Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
## Description

This project is an implementation of Roth and Lapata's "Neural Semantic Role Labeling with Dependency Path Embeddings" [1], a semantic role labeling model with an LSTM at its core.

## Requirements

#### Software

Python >= 3.4.3, until this issue (https://github.com/tensorflow/tensorflow/issues/4588) lands in tensorflow; after that, any Python 3.x should be fine. <br />
tensorflow >= 0.10.0 <br />
Perl >= 5.8.1 for evaluation with the CoNLL 2009 scorer<br />

#### Data

Download the CoNLL 2008 [2] or 2009 [3] Shared Task data:<br />
2008: https://catalog.ldc.upenn.edu/LDC2009T12<br />
2009: https://catalog.ldc.upenn.edu/LDC2012T04<br />
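
For reference, each token in a CoNLL 2009 file is one tab-separated line, with roughly these columns: ID, FORM, LEMMA, PLEMMA, POS, PPOS, FEAT, PFEAT, HEAD, PHEAD, DEPREL, PDEPREL, FILLPRED, PRED, and one APRED column per predicate. The two-token sample below is purely illustrative and not taken from the LDC data:

    1   John    john    john    NNP  NNP  _  _  2  2  SBJ   SBJ   _  _         A0
    2   sleeps  sleep   sleep   VBZ  VBZ  _  _  0  0  ROOT  ROOT  Y  sleep.01  _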

#### Scorer

If you want to evaluate a dataset with the official CoNLL scorer, you have to download it here:<br />
2009: https://ufal.mff.cuni.cz/conll2009-st/scorer.html<br />
Put the scorer, named 'eval09.pl', into the 'score_scripts' folder. The Python script uses it internally and can even evaluate input files in the 2008 format. The scorer is executed via a 'perl' command, so make sure Perl is installed. You can switch off warnings in the Perl script for better readability.
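
If you want to run the scorer by hand on the files written by the test step (see below), the call looks roughly like this; I believe -g (gold) and -s (system) are the scorer's flags, but run 'perl eval09.pl' without arguments to confirm the usage message of your version:

    perl score_scripts/eval09.pl -g ./output/[input_file].GOLD -s ./output/[input_file].PRED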

#### Model

I trained a model on the CoNLL 2008 'train.closed' file. The parameters were set as in the paper, except that there were only 2 iterations for each argument identification model and 5 iterations for argument classification. This model scores 73.89% labeled F1 on the in-domain test file and 67.61% on the out-of-domain test file.<br />
Download model: https://www.dropbox.com/sh/au325tntluwe8us/AAC8tUeO5SH6txKUT5DC2Mu6a?dl=0 <br />
After the download, place all *.model and *.meta files into the 'model' folder. Only the main *.model file (the one without 'LSTM' in its name) needs to be passed to the run script; the others are identified by their names.

## Run

#### Training

    python3 run.py train ./data/train/train.closed conll2008 ./output/

Trains a model with the 'train.closed' file from the CoNLL 2008 Shared Task and writes the model files into 'output'. Be sure to set your desired parameters in 'config.cfg'. <br />
The provided model took around 21 hours to train on a p2.xlarge EC2 instance with one NVIDIA K80 GPU.

#### Testing

    python3 run.py test ./data/test.wsj/test.wsj.closed.GOLD conll2008 ./output/ ./model/train.closed_pi_ai_ac.model

The input file needs to have gold labels to be evaluated against. Writes the predicted labels to 'output/[input_file].PRED' and the gold labels to 'output/[input_file].GOLD'. The full evaluation is written to 'output/[input_file].RESULTS'.

#### Prediction

    python3 run.py predict ./data/test.wsj/my_own_sentences.conll conll2009 ./output/ ./model/train.closed_pi_ai_ac.model

The input file has to be in the CoNLL 2008 or 2009 format (without argument labels, but with sense-disambiguated predicates!). Predicts the argument labels and writes them to 'output/[input_file].PRED'.

## ToDos

1. Implement Predicate Prediction and Disambiguation
2. Implement Reranker

[1] Roth and Lapata, 2016, https://arxiv.org/abs/1605.07515 <br />
[2] Surdeanu et al., 2008, http://dl.acm.org/citation.cfm?id=1596411 <br />
[3] Hajic et al., 2009, http://dl.acm.org/citation.cfm?id=1596324.1596352

config.cfg

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@

>default
lstm_size = 28
hidden_layer = 128
learning_rate = 0.001
batch_size = 1000
drop_out_rate = 0.50
class_weights = True
iteration_factor = 5
features = ArgForm ArgPOS ArgDeprel PredForm PredLemma PredSense PredPOS

>ai_V
lstm_size = 25
hidden_layer = 90
learning_rate = 0.0006
drop_out_rate = 0.42
features = ArgForm ArgPOS ArgDeprel PredDeprel PredSense ArgLeftPOS ArgRightPOS POSPath POSDepPath DeprelPath Position PredChildDepSet PredParentForm PredParentPOS ArgRightSiblingForm ArgRightSiblingPOS

>ai_N
lstm_size = 16
hidden_layer = 125
learning_rate = 0.0009
drop_out_rate = 0.25
features = ArgForm ArgPOS ArgDeprel PredForm PredLemma PredSense PredPOS ArgLeftPOS ArgRightForm ArgRightPOS POSPath POSDepPath DeprelPath Position PredParentForm PredChildFormSet

>ac_V
lstm_size = 5
hidden_layer = 300
learning_rate = 0.0155
drop_out_rate = 0.50
iteration_factor = 50
features = PredSense PredLemma PredPOS ArgForm ArgPOS ArgDeprel ArgRightForm ArgRightPOS ArgLeftPOS ArgLeftSiblingPOS POSPath POSDepPath DeprelPath Position PredChildDepSet PredParentForm PredParentPOS

>ac_N
lstm_size = 88
hidden_layer = 500
learning_rate = 0.0055
drop_out_rate = 0.46
iteration_factor = 50
features = PredForm PredSense PredLemma ArgForm ArgPOS ArgRightForm ArgRightPOS ArgLeftForm ArgLeftPOS ArgLeftSiblingForm ArgLeftSiblingPOS ArgRightSiblingPOS POSPath POSDepPath Position PredChildPOSSet
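
As a minimal sketch (not the project's actual loader; the Config class name and the assumption that step sections inherit unset keys from '>default' are mine), the '>section' format above could be read like this, matching the get_value(key, transform) access pattern that feature_set.py uses:

    # hypothetical reader for the '>section' config format above
    class Config(object):
        def __init__(self, path, section):
            sections = {}
            current = None
            with open(path) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        continue
                    if line.startswith('>'):
                        current = line[1:]
                        sections[current] = {}
                    else:
                        key, _, value = line.partition('=')
                        sections[current][key.strip()] = value.strip()
            # assumed: a step section inherits every key it does not set from '>default'
            self.values = dict(sections.get('default', {}))
            self.values.update(sections.get(section, {}))

        def get_value(self, key, transform=None):
            # mirrors the call in feature_set.py:
            # config.get_value('features', lambda s: s.split(' '))
            value = self.values[key]
            return transform(value) if transform else value

    # usage: feature names for the verbal argument-identification step
    # config = Config('config.cfg', 'ai_V')
    # features = config.get_value('features', lambda s: s.split(' '))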

data/.gitignore

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@

# Ignore everything in this directory
*
# Except this file
!.gitignore

feature_set.py

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@

from features import available_features, LSTMFeature
from scipy.sparse import csr_matrix
import numpy as np
from random import randint


class FeatureSet(object):

    def __init__(self, iterator, step_name, pos_type, config, freezed, label_func=None, vocabs=None, class_names=None):
        self.config = config
        self.name = step_name + pos_type
        self.iterator = iterator
        self.step_name = step_name
        self.pos_type = pos_type
        self.freezed = freezed
        self.vocabs = vocabs
        self.label_func = label_func
        self.class_names = class_names
        feature_names = self.config.get_value('features', lambda s: s.split(' '))
        self.binary_features = [available_features[f_name]() for f_name in feature_names]
        self.lstm_feature = LSTMFeature()
        if self.vocabs:
            self._set_vocabs()

        self.binary_feature_matrix = None
        self.binary_feature_width = None
        self.lstm_feature_vectors = None
        self.lstm_feature_row_width = None
        self.label_array = None
        self.number_of_instances = None
        self.num_classes = None if self.class_names is None else len(self.class_names)
        self.class_indices = None

        self.print('get binary feature matrix')
        binary_features_vectors = list()
        for feature in self.binary_features:
            feature_vectors = feature.get_vector_batch(self.iterator, self.freezed)
            binary_features_vectors.append(feature_vectors)
        features_length = [len(f) for f in self.binary_features]
        self.print('binary features length ' + str(features_length))
        self.binary_feature_matrix = dicts_to_sparse_matrix(binary_features_vectors, features_length, add_bias=True)
        self.binary_feature_width = self.binary_feature_matrix[0].shape[1]
        self.print('finished binary feature matrix')
        self.print('get lstm feature matrix')
        self.lstm_feature_vectors = self.lstm_feature.get_vector_batch(self.iterator, self.freezed)
        self.lstm_feature_row_width = len(self.lstm_feature)
        self.print('finished lstm feature matrix')

        # get the labels
        if self.label_func is not None:
            label_array_raw = self.label_func(self.iterator)
            self.class_names = list(np.unique(label_array_raw).tolist())
            self.num_classes = len(self.class_names)
            label_array = list()
            # calculate the class weights by inverse frequency
            class_weights = [1 - (list(label_array_raw).count(c) / float(len(list(label_array_raw))))
                             for c in self.class_names]
            self.class_weights = [w / min(class_weights) for w in class_weights]
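            # Illustrative: if class A covers 90% of the labels and class B 10%,
            # class_weights starts as [0.1, 0.9]; dividing by the minimum gives
            # [1.0, 9.0], so the rarer class weighs 9 times more in the loss.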
            print('classes: ', self.class_names)

            # one-hot encode the labels
            for i, label_raw in enumerate(label_array_raw):
                label = [0 for _ in range(self.num_classes)]
                label[self.class_names.index(label_raw)] = 1
                label_array.append(label)

            self.label_array = np.asarray(label_array)

        # check that the numbers of lstm feature instances, binary feature instances and labels are identical
        if (self.binary_feature_matrix.shape[0] != len(self.lstm_feature_vectors)) or \
                (self.label_func and (self.binary_feature_matrix.shape[0] != len(self.label_array))):
            raise ValueError('No equal number of instances')
        self.number_of_instances = self.binary_feature_matrix.shape[0]

    def get_binary_feature_matrix(self):
        return self.binary_feature_matrix

    def get_lstm_features(self):
        return self.lstm_feature, self.lstm_feature_row_width

    def get_training_batch(self, batch_size, epoch):
        # take a contiguous run of instances starting at a random offset, wrapping around via modulo
        random_int = randint(0, self.number_of_instances)
        indices = [(i + random_int) % self.number_of_instances for i in range(batch_size)]
        batch_lstm_instances = list()
        batch_binary_instances = self.binary_feature_matrix[indices].toarray()
        labels = list()
        for i in indices:
            batch_lstm_instances.append(self.lstm_feature_vectors[i])
            labels.append(self.label_array[i])
        lstm_instance_time_major, sequence_lengths = self._lstm_time_major(batch_lstm_instances)
        labels = np.asarray(labels)
        batch_binary_instances = np.asarray(batch_binary_instances)
        return batch_binary_instances, lstm_instance_time_major, sequence_lengths, labels

    def get_prediction_instances(self, start, stop):
        lstm_instance_time_major, sequence_lengths = self._lstm_time_major(self.lstm_feature_vectors[start:stop])
        binary_instances = self.binary_feature_matrix[start:stop].toarray()
        return binary_instances, lstm_instance_time_major, sequence_lengths

    def get_prediction_instance(self, i):
        feature_vector = self.lstm_feature_vectors[i]
        sequence_lengths = [len(feature_vector)]
        lstm_instance_time_major = list()
        for row_index in range(len(feature_vector)):
            row = [1 if r in feature_vector[row_index] else 0 for r in range(self.lstm_feature_row_width)]
            lstm_instance_time_major.append([row])

        lstm_instance_time_major = np.asarray(lstm_instance_time_major, dtype=np.float32)
        sequence_lengths = np.asarray(sequence_lengths, dtype=np.int32)
        return lstm_instance_time_major, sequence_lengths

    def _lstm_time_major(self, lstm_feature_instances):
        # TensorFlow needs this format for sequences of different lengths:
        # shape (max_time, batch_size, row_width), zero-padded beyond each sequence's length
        sequence_lengths = [len(sequence) for sequence in lstm_feature_instances]
        max_sequence_length = max(sequence_lengths)

        instance_time_major = np.zeros(shape=(max_sequence_length, len(lstm_feature_instances),
                                              self.lstm_feature_row_width), dtype=np.float32)
        for s_id, sequence in enumerate(lstm_feature_instances):
            for r_id, row in enumerate(sequence):
                for key in row.keys():
                    instance_time_major[r_id][s_id][key] = 1

        sequence_lengths = np.asarray(sequence_lengths, dtype=np.int32)
        return instance_time_major, sequence_lengths
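
    # Illustrative shape check for _lstm_time_major: two sequences of lengths
    # 2 and 3 with row width 4 yield instance_time_major of shape (3, 2, 4);
    # the shorter sequence is all zeros at time step 2, and
    # sequence_lengths == [2, 3] marks where the padding begins.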

    def get_vocabs(self):
        # get the vocabularies from the features of this step to be saved in the model object
        vocabs = {}
        for feature in self.binary_features:
            if hasattr(feature, 'get_vocab'):
                vocabs.update(feature.get_vocab())
        return vocabs

    def _set_vocabs(self):
        # load the vocabularies into the features to be able to reproduce one-hot vectors of the trained model
        for feature in self.binary_features:
            if hasattr(feature, 'set_vocab'):
                feature.set_vocab(self.vocabs)

    def get_label_array(self):
        if self.label_func is None:
            raise ValueError('Need label function to generate labels')
        return self.label_array

    def print(self, s):
        print(self.step_name + ' - ' + self.pos_type + ': ' + s)


def dicts_to_sparse_matrix(features_vectors, features_length, add_bias=False):
    # each feature contributes one {column_index: value} dict per instance;
    # concatenate them column-wise into one csr_matrix of shape (instances, sum(features_length))
    if any([len(features_vectors[0]) != len(f) for f in features_vectors]):
        raise ValueError('Every feature needs values for every instance')

    shape = (len(features_vectors[0]), sum(features_length))
    row = list()
    col = list()
    data = list()

    for feature_idx, feature_vectors in enumerate(features_vectors):
        col_offset = sum(features_length[0:feature_idx])
        for row_idx, feature_vector in enumerate(feature_vectors):
            for col_idx, v in feature_vector.items():
                col_idx += col_offset
                row.append(row_idx)
                col.append(col_idx)
                data.append(v)
    if add_bias:
        # append a constant 1 bias column
        for i in range(shape[0]):
            row.append(i)
            col.append(shape[1])
            data.append(1)
        shape = (shape[0], shape[1] + 1)
    return csr_matrix((data, (row, col)), shape=shape)
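
As a quick, self-contained illustration of dicts_to_sparse_matrix (toy data made up for this example: two features of widths 3 and 2, two instances, plus the bias column appended by add_bias=True):

    # toy usage of dicts_to_sparse_matrix
    from feature_set import dicts_to_sparse_matrix

    # one {column_index: value} dict per instance, per feature
    feature_a = [{0: 1}, {2: 1}]   # width 3
    feature_b = [{1: 1}, {0: 1}]   # width 2, columns offset by 3 in the result
    m = dicts_to_sparse_matrix([feature_a, feature_b], [3, 2], add_bias=True)
    print(m.toarray())
    # [[1 0 0 0 1 1]
    #  [0 0 1 1 0 1]]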

0 commit comments
