import numpy as np
import math
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

# vocabulary size seen by the embedding: tokens + special tokens + one slot per
# context position (n_vocab, n_special and n_ctx are defined by the caller)
# vocab = n_vocab + n_special + n_ctx

def gelu(x):
    # tanh approximation of the GELU activation
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

def swish(x):
    return x * torch.sigmoid(x)

ACT_FNS = {
    'relu': F.relu,
    'swish': swish,
    'gelu': gelu
}

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, n_state, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # One difference with the TF version here: we add epsilon outside of sqrt
        return self.g * (x - mean) / (std + self.eps) + self.b


class Conv1D(nn.Module):
    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        self.nf = nf
        if rf == 1:  # faster 1x1 conv
            w = torch.empty(nx, nf)
            nn.init.normal_(w, std=0.02)  # random normal init (std 0.02 is a conventional choice)
            self.w = Parameter(w)
            self.b = Parameter(torch.zeros(nf))
        else:  # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        if self.rf == 1:
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x


class Attention(nn.Module):
    def __init__(self, nx, n_state, n_head, attn_pdrop, resid_pdrop, scale=False):
        super(Attention, self).__init__()
        self.c_attn = Conv1D(n_state * 3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.scale = scale
        self.n_head = n_head
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)

    @staticmethod
    def mask_attn_weights(w):
        # causal mask: each position may only attend to itself and earlier positions
        n = w.size(-1)
        b = torch.tril(torch.ones(n, n, dtype=w.dtype, device=w.device)).view(1, 1, n, n)
        return w * b + -1e9 * (1 - b)

    def _attn(self, q, k, v):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = self.mask_attn_weights(w)
        w = F.softmax(w, dim=-1)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
        return x.view(*new_x_shape)  # in Tensorflow version: merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
        x = x.view(*new_x_shape)  # in Tensorflow version: split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x):
        x = self.c_attn(x)
        query, key, value = x.split(x.size(2) // 3, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a


class MLP(nn.Module):
    def __init__(self, nx, n_state, afn, resid_pdrop):
        super(MLP, self).__init__()
        self.c_fc = Conv1D(n_state, 1, nx)
        self.c_proj = Conv1D(nx, 1, n_state)  # projects back from n_state to nx features
        self.act = ACT_FNS[afn]
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h = self.c_proj(h)
        return self.dropout(h)


class Block(nn.Module):
    def __init__(self, nx, n_head, attn_pdrop, resid_pdrop, afn, scale=False):
        super(Block, self).__init__()
        self.attn = Attention(nx, nx, n_head, attn_pdrop, resid_pdrop, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(nx, nx * 4, afn, resid_pdrop)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x):
        # residual connection + layer norm around attention, then around the MLP
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h


class Model(nn.Module):
    """ Transformer model """
    def __init__(self, vocab, n_embd, pdrop, n_layers,
                 nx, n_head, attn_pdrop, resid_pdrop, afn):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab, n_embd)
        self.drop = nn.Dropout(pdrop)
        self.blocks = clones(Block(nx, n_head, attn_pdrop,
                                   resid_pdrop, afn, scale=True), n_layers)
        self.decoder = nn.Linear(n_embd, vocab, bias=False)
        self.decoder.weight = self.embed.weight  # tied with the embedding weights

    def forward(self, x, m):
        # x: [..., n_ctx, 2] holding (token id, position id) pairs
        # m: [..., n_ctx] mask over real tokens (reshaped but not used inside this module)
        x = x.view(-1, x.size(2), x.size(3))
        m = m.view(-1, m.size(2))
        e = self.drop(self.embed(x))
        # sum the token and position embeddings for each position
        h = e.sum(dim=2)
        for block in self.blocks:
            h = block(h)
        return h
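

if __name__ == '__main__':
    # Quick shape check. The hyperparameters below are illustrative placeholders
    # (vocabulary split, sizes, dropout rates) chosen only to exercise the model;
    # the real values come from the training configuration.
    n_vocab, n_special, n_ctx = 1000, 3, 77
    vocab = n_vocab + n_special + n_ctx
    model = Model(vocab, n_embd=768, pdrop=0.1, n_layers=12,
                  nx=768, n_head=12, attn_pdrop=0.1, resid_pdrop=0.1, afn='gelu')
    batch, n_choices = 2, 1
    # each position carries a (token id, position id) pair
    x = torch.zeros(batch, n_choices, n_ctx, 2, dtype=torch.long)
    x[..., 0] = torch.randint(0, n_vocab, (batch, n_choices, n_ctx))
    x[..., 1] = n_vocab + n_special + torch.arange(n_ctx)
    m = torch.ones(batch, n_choices, n_ctx)
    h = model(x, m)
    print(h.size())  # expected: (batch * n_choices, n_ctx, n_embd)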