train_rl.py
import argparse
import glob
import numpy as np
import random
import copy
from tqdm import tqdm
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import serializers, optimizers, Variable
from chainer.functions.loss.softmax_cross_entropy import softmax_cross_entropy
import network
import rl_self_play
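
# Training outline:
#   * model1 is the policy being trained; it starts from ../models/RL/model0.npz.
#   * Each "set" plays 2*N self-play games between model1 and an opponent sampled
#     from the checkpoints saved under ../models/RL/.
#   * One REINFORCE update per set minimizes the softmax cross-entropy of the chosen
#     moves weighted by the game outcome, so play from won games is reinforced.
#   * Every 500 sets the current policy is saved as a new checkpoint, growing the
#     opponent pool.
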
def main():
    # Parse the number of game sets to train on
    parser = argparse.ArgumentParser(description='IaGo:')
    parser.add_argument('--set', '-s', type=int, default=10000,
                        help='Number of game sets played to train')
    args = parser.parse_args()
    N = 32  # 2*N games are played per set

    # Model definition: the policy being trained, initialized from model0.npz
    model1 = network.SLPolicy()
    serializers.load_npz("../models/RL/model0.npz", model1)
    optimizer = optimizers.Adam(alpha=0.0005)
    optimizer.setup(model1)
    optimizer.add_hook(chainer.optimizer_hooks.WeightDecay(5e-4))
    #serializers.load_npz("./backup/rl_optimizer.npz", optimizer)

    # REINFORCE algorithm
    for set in tqdm(range(0, args.set)):
        # Randomly choose the competitor model from the pool of reinforced models
        model2 = network.SLPolicy()
        model2_path = np.random.choice(glob.glob("../models/RL/model*.npz"))
        print(model2_path)
        serializers.load_npz(model2_path, model2)
        result = 0
        state_seq, action_seq, reward_seq = [], [], []
        for i in tqdm(range(2*N)):
            game = rl_self_play.Game(model1, model2)
            if i % 2 == 1:
                # Switch head and tail (swap which side effectively moves first)
                pos = random.choice([[2, 4], [3, 5], [4, 2], [5, 3]])
                game.state[pos[0], pos[1]] = 2
            states, actions, judge = game()
            rewards = [judge]*len(states)
            state_seq += states
            action_seq += actions
            reward_seq += rewards
            if judge == 1:
                result += 1
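        # Every move of a game is labeled with that game's final outcome (judge),
        # i.e. the Monte-Carlo return used by the REINFORCE update below.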
        # Update model
        # Encode each board as two binary planes (player 1 stones, player 2 stones),
        # then reorder to (batch, channel, height, width)
        x = np.array(state_seq)
        x = np.stack([x == 1, x == 2], axis=0).astype(np.float32)
        x = chainer.Variable(x.transpose(1, 0, 2, 3))
        y = Variable(np.array(action_seq).astype(np.int32))
        r = Variable(np.array(reward_seq).astype(np.float32))
        pred = model1(x)
        c = softmax_cross_entropy(pred, y, reduce="no")
        model1.cleargrads()
        # REINFORCE loss: cross-entropy of each chosen move weighted by its game outcome
        loss = F.mean(c*r)
        loss.backward()
        optimizer.update()
        print("Set:" + str(set) + ", Result:" + str(result/(2*N)) + ", Loss:" + str(loss.data))
        with open("./log_test.txt", "a") as f:
            f.write(str(result/(2*N)) + ", \n")
        model = copy.deepcopy(model1)
        #model.to_cpu()
        #serializers.save_npz("./backup/model"+str(set)+".npz", model)
        #serializers.save_npz("./backup/optimizer"+str(set)+".npz", optimizer)
        if (set+1) % 500 == 0:
            # Save a checkpoint every 500 sets
            model = copy.deepcopy(model1)
            #model.to_cpu()
            serializers.save_npz("../models/RL/model"+str((set+1)//500)+".npz", model)
            serializers.save_npz("../models/rl_optimizer.npz", optimizer)
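
# Example usage (assumes ../models/RL/model0.npz already exists, e.g. a copy of the
# supervised-learning policy weights):
#   python train_rl.py --set 10000
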
if __name__ == '__main__':
    main()