
Commit 8858f99

code
1 parent 83202ae commit 8858f99

9 files changed: +954 −0 lines changed

LICENSE

+21
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md

+9
@@ -0,0 +1,9 @@
# finetune-transformer-lm

Code and model for the paper "Improving Language Understanding by Generative Pre-Training"

Currently this code implements the ROCStories Cloze Test result reported in the paper by running:

`python train.py --dataset rocstories --desc rocstories --submit --analysis --data_dir [path to data here]`

Note: The code is currently non-deterministic due to various GPU ops. The median accuracy of 10 runs with this codebase (using default hyperparameters) is 85.8%, slightly lower than the reported single run of 86.5% from the paper.

The ROCStories dataset can be downloaded from the associated [website](http://cs.rochester.edu/nlp/rocstories/).

analysis.py

+18
@@ -0,0 +1,18 @@
import os
import json
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score

from datasets import _rocstories

def rocstories(data_dir, pred_path, log_path):
    preds = pd.read_csv(pred_path, delimiter='\t')['prediction'].values.tolist()
    _, _, _, labels = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
    test_accuracy = accuracy_score(labels, preds)*100.
    logs = [json.loads(line) for line in open(log_path)][1:]  # skip the first log line
    best_validation_index = np.argmax([log['va_acc'] for log in logs])
    valid_accuracy = logs[best_validation_index]['va_acc']
    print('ROCStories Valid Accuracy: %.2f'%(valid_accuracy))
    print('ROCStories Test Accuracy: %.2f'%(test_accuracy))
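
For reference, a hypothetical invocation of the function above; train.py produces the prediction file and the JSONL log, and both paths here are illustrative, not the repo's actual output locations:

rocstories('[path to data here]', pred_path='submission/rocstories.tsv', log_path='log/rocstories.jsonl')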

datasets.py

+51
@@ -0,0 +1,51 @@
import os
import csv
import numpy as np

from tqdm import tqdm

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

seed = 3535999445

def _rocstories(path):
    with open(path) as f:
        f = csv.reader(f)
        st = []
        ct1 = []
        ct2 = []
        y = []
        for i, line in enumerate(tqdm(list(f), ncols=80, leave=False)):
            if i > 0:  # skip the csv header row
                s = ' '.join(line[1:5])
                c1 = line[5]
                c2 = line[6]
                st.append(s)
                ct1.append(c1)
                ct2.append(c2)
                y.append(int(line[-1])-1)
    return st, ct1, ct2, y

def rocstories(data_dir, n_train=1497, n_valid=374):
    storys, comps1, comps2, ys = _rocstories(os.path.join(data_dir, 'cloze_test_val__spring2016 - cloze_test_ALL_val.csv'))
    teX1, teX2, teX3, _ = _rocstories(os.path.join(data_dir, 'cloze_test_test__spring2016 - cloze_test_ALL_test.csv'))
    tr_storys, va_storys, tr_comps1, va_comps1, tr_comps2, va_comps2, tr_ys, va_ys = train_test_split(storys, comps1, comps2, ys, test_size=n_valid, random_state=seed)
    trX1, trX2, trX3 = [], [], []
    trY = []
    for s, c1, c2, y in zip(tr_storys, tr_comps1, tr_comps2, tr_ys):
        trX1.append(s)
        trX2.append(c1)
        trX3.append(c2)
        trY.append(y)

    vaX1, vaX2, vaX3 = [], [], []
    vaY = []
    for s, c1, c2, y in zip(va_storys, va_comps1, va_comps2, va_ys):
        vaX1.append(s)
        vaX2.append(c1)
        vaX3.append(c2)
        vaY.append(y)
    trY = np.asarray(trY, dtype=np.int32)
    vaY = np.asarray(vaY, dtype=np.int32)
    return (trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3)
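
For reference, a minimal usage sketch of the loader above; the data directory is illustrative, and the two CSV file names must match the ROCStories downloads:

(trX1, trX2, trX3, trY), (vaX1, vaX2, vaX3, vaY), (teX1, teX2, teX3) = rocstories('[path to data here]')
print(len(trX1), len(vaX1), len(teX1))  # 1497 train and 374 validation stories with the defaults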

model.py

+163
@@ -0,0 +1,163 @@
import numpy as np
import math
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter

# The single embedding table covers BPE tokens, special tokens, and positions,
# so the effective size is computed at training time:
# vocab = n_vocab + n_special + n_ctx

def gelu(x):
    return 0.5*x*(1+torch.tanh(math.sqrt(2/math.pi)*(x+0.044715*torch.pow(x, 3))))

def swish(x):
    return x*torch.sigmoid(x)

ACT_FNS = {
    'relu': F.relu,
    'swish': swish,
    'gelu': gelu
}

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])


class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    def __init__(self, n_state, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.g = nn.Parameter(torch.ones(n_state))
        self.b = nn.Parameter(torch.zeros(n_state))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # One difference with the TF version here: we add epsilon outside of sqrt
        return self.g * (x - mean) / (std + self.eps) + self.b


class Conv1D(nn.Module):
    def __init__(self, nf, rf, nx):
        super(Conv1D, self).__init__()
        self.rf = rf
        self.nf = nf
        if rf == 1: # faster 1x1 conv
            self.w = Parameter(torch.ones(nx, nf)) # TODO change to random normal
            self.b = Parameter(torch.zeros(nf))
        else: # was used to train LM
            raise NotImplementedError

    def forward(self, x):
        if self.rf == 1:
            size_out = x.size()[:-1] + (self.nf,)
            x = torch.addmm(self.b, x.view(-1, x.size(-1)), self.w)
            x = x.view(*size_out)
        else:
            raise NotImplementedError
        return x


class Attention(nn.Module):
    def __init__(self, nx, n_state, n_head, attn_pdrop, resid_pdrop, scale=False):
        super(Attention, self).__init__()
        self.c_attn = Conv1D(n_state*3, 1, nx)
        self.c_proj = Conv1D(n_state, 1, nx)
        self.scale = scale
        self.n_head = n_head
        self.split_size = n_state
        self.attn_dropout = nn.Dropout(attn_pdrop)
        self.resid_dropout = nn.Dropout(resid_pdrop)

    @staticmethod
    def mask_attn_weights(w):
        # causal mask: position i may only attend to positions <= i
        n = w.size(-1)
        b = torch.tril(torch.ones(n, n)).view(1, 1, n, n)
        return w * b + -1e9*(1-b)

    def _attn(self, q, k, v):
        w = torch.matmul(q, k)
        if self.scale:
            w = w / math.sqrt(v.size(-1))
        w = self.mask_attn_weights(w)
        w = F.softmax(w, dim=-1)
        w = self.attn_dropout(w)
        return torch.matmul(w, v)

    def merge_heads(self, x):
        x = x.permute(0, 2, 1, 3).contiguous()
        new_x_shape = x.size()[:-2] + (x.size(-2)*x.size(-1),)
        return x.view(*new_x_shape) # in Tensorflow version: merge_states

    def split_heads(self, x, k=False):
        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1)//self.n_head)
        x = x.view(*new_x_shape) # in Tensorflow version: split_states
        if k:
            return x.permute(0, 2, 3, 1)
        else:
            return x.permute(0, 2, 1, 3)

    def forward(self, x):
        x = self.c_attn(x)
        query, key, value = x.split(self.split_size, dim=2)
        query = self.split_heads(query)
        key = self.split_heads(key, k=True)
        value = self.split_heads(value)
        a = self._attn(query, key, value)
        a = self.merge_heads(a)
        a = self.c_proj(a)
        a = self.resid_dropout(a)
        return a


class MLP(nn.Module):
    def __init__(self, nx, n_state, afn, resid_pdrop):
        super(MLP, self).__init__()
        self.c_fc = Conv1D(n_state, 1, nx)
        self.c_proj = Conv1D(nx, 1, nx)
        self.act = ACT_FNS[afn]
        self.dropout = nn.Dropout(resid_pdrop)

    def forward(self, x):
        h = self.act(self.c_fc(x))
        h = self.c_proj(h)
        return self.dropout(h)


class Block(nn.Module):
    def __init__(self, nx, n_head, attn_pdrop, resid_pdrop, afn, scale=False):
        super(Block, self).__init__()
        self.attn = Attention(nx, nx, n_head, attn_pdrop, resid_pdrop, scale)
        self.ln_1 = LayerNorm(nx)
        self.mlp = MLP(nx, nx*4, afn, resid_pdrop)
        self.ln_2 = LayerNorm(nx)

    def forward(self, x):
        # post-layernorm residual block: LayerNorm(x + sublayer(x))
        a = self.attn(x)
        n = self.ln_1(x + a)
        m = self.mlp(n)
        h = self.ln_2(n + m)
        return h


class Model(nn.Module):
    """ Transformer model """
    def __init__(self, vocab, n_embd, pdrop, n_layers,
                 nx, n_head, attn_pdrop, resid_pdrop, afn):
        super(Model, self).__init__()
        self.embed = nn.Embedding(vocab, n_embd)
        self.drop = nn.Dropout(pdrop)
        self.blocks = clones(Block(nx, n_head, attn_pdrop,
                                   resid_pdrop, afn, scale=True), n_layers)
        self.decoder = nn.Linear(n_embd, vocab, bias=False)
        self.decoder.weight = self.embed.weight # tied input/output embeddings

    def forward(self, x, m):
        x = x.view(-1, x.size(2), x.size(3))
        m = m.view(-1, m.size(2)) # mask reshaped for downstream use; not consumed here
        e = self.drop(self.embed(x))
        # sum token and learned position embeddings along the last input axis
        h = e.sum(dim=2)
        for block in self.blocks:
            h = block(h)
        return h
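
A shape-check sketch for the model above; the vocab size and all hyperparameter values here are illustrative (train.py computes the real vocab as n_vocab + n_special + n_ctx), not the repo's configuration:

import torch
from model import Model

vocab = 40990  # illustrative value only
model = Model(vocab, n_embd=768, pdrop=0.1, n_layers=12,
              nx=768, n_head=12, attn_pdrop=0.1, resid_pdrop=0.1, afn='gelu')
x = torch.randint(0, vocab, (2, 2, 77, 2))  # [batch, n_choices, n_ctx, 2]: (token id, position id) pairs
m = torch.ones(2, 2, 77)                    # per-token mask; reshaped but not consumed by the body
h = model(x, m)                             # -> [batch*n_choices, n_ctx, n_embd] == [4, 77, 768]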

opt.py

+49
@@ -0,0 +1,49 @@
import math
import numpy as np
import tensorflow as tf

def warmup_cosine(x, warmup=0.002):
    s = tf.cast(x <= warmup, tf.float32)
    return s*(x/warmup) + (1-s)*(0.5 * (1 + tf.cos(math.pi * x)))

def warmup_constant(x, warmup=0.002):
    s = tf.cast(x <= warmup, tf.float32)
    return s*(x/warmup) + (1-s)*1

def warmup_linear(x, warmup=0.002):
    s = tf.cast(x <= warmup, tf.float32)
    return (s*(x/warmup) + (1-s))*(1-x)

schedules = {
    'warmup_cosine':warmup_cosine,
    'warmup_constant':warmup_constant,
    'warmup_linear':warmup_linear,
}

def adam(params, grads, lr, schedule, t_total, b1=0.9, b2=0.999, e=1e-8, l2=0, vector_l2=False, max_grad_norm=-1, **kwargs):
    """
    adam with weight decay fix
    """
    t = tf.Variable(0, dtype=tf.float32, trainable=False)
    tt = t+1
    updates = [t.assign(tt)]
    if max_grad_norm > 0:
        grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    for p, g in zip(params, grads):
        if p is None or g is None:
            print("can't train", p.name, g)
        else:
            if isinstance(g, tf.IndexedSlices):
                g = tf.convert_to_tensor(g)
            m = tf.Variable(p*0, dtype=tf.float32, trainable=False)
            v = tf.Variable(p*0, dtype=tf.float32, trainable=False)
            # bias-corrected learning rate, annealed by the schedule
            lrt = lr*tf.sqrt(1-b2**tt)/(1-b1**tt)
            lrt *= schedule(t/t_total)
            mt = b1*m + (1-b1)*g
            vt = b2*v + (1-b2)*g*g
            # weight decay is decoupled from the adaptive moments (the "fix");
            # vectors/biases are only decayed when vector_l2 is set
            if (len(p.get_shape()) > 1 or vector_l2) and l2 > 0:
                pt = p - lrt * (mt / (tf.sqrt(vt) + e) + l2*p)
            else:
                pt = p - lrt * (mt / (tf.sqrt(vt) + e))
            updates.extend([m.assign(mt), v.assign(vt), p.assign(pt)])
    return tf.group(*updates)
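
A minimal sketch of wiring this optimizer into a TF1 graph; the names `loss` and `n_updates_total` and the hyperparameter values are illustrative, not taken from the repo's train.py:

params = tf.trainable_variables()
grads = tf.gradients(loss, params)
train_op = adam(params, grads, lr=6.25e-5, schedule=schedules['warmup_linear'],
                t_total=n_updates_total, l2=0.01, max_grad_norm=1)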
