Commit 05a7ae7

8.12 Offline environment: translate code comments into Chinese
1 parent 6eb8b0b commit 05a7ae7

8 files changed, +196 / -129 lines

agent.py

Lines changed: 30 additions & 28 deletions
@@ -6,17 +6,17 @@
 class PPOAgent(object):
 
     def __init__(self, args: dict):
-        self.batch_size = args["batch_size"]  # batch size
-        self.lr_a = args["lr_a"]  # Learning rate of actor
-        self.lr_c = args["lr_c"]  # Learning rate of critic
-        self.gamma = args["gamma"]  # Discount factor
-        self.lamda = args["lambda"]  # GAE parameter
-        self.epsilon = args["epsilon"]  # PPO clip parameter
-        self.k_epochs = args["k_epochs"]  # PPO parameter
+        self.batch_size = args["batch_size"]  # batch size
+        self.lr_a = args["lr_a"]  # 策略网络学习率
+        self.lr_c = args["lr_c"]  # 价值网络学习率
+        self.gamma = args["gamma"]  # 折扣因子
+        self.lamda = args["lambda"]  # GAE λ
+        self.epsilon = args["epsilon"]  # PPO ε
+        self.k_epochs = args["k_epochs"]  # PPO 训练轮数
         self.entropy_coef = args["entropy_coef"]
-        self.device = args["device"]  # device
+        self.device = args["device"]  # 运行设备
 
-        # networks
+        # 神经网络
         self.pai_set = {
             20: get_model("actor", "./model/non_maze.pth", 20),
             19: get_model("actor", "./model/maze.pth", 19),
@@ -46,10 +46,10 @@ def learn(self, rep, step_t):
         """
         s, a, a_log_prob, r, s_, dw, done = rep.to_tensor()
 
-        # calculate GAE advantage
+        # 利用GAE计算优势函数
        adv = []
        gae = 0
-        with torch.no_grad():  # adv and v_target have no gradient
+        with torch.no_grad():  # 不需要梯度
            vs = self.v(s)
            vs_ = self.v(s_)
            deltas = r + self.gamma * (1.0 - dw) * vs_ - vs
@@ -59,44 +59,46 @@ def learn(self, rep, step_t):
        adv = torch.tensor(adv, dtype=torch.float).view(-1, 1)
        v_target = adv + vs
 
-        # advantage normalization
+        # 优势归一化
        adv = ((adv - adv.mean()) / (adv.std() + 1e-5))
 
-        # Optimize policy for K epochs:
+        # 参数更新k轮
        for _ in range(self.k_epochs):
            for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.batch_size, False):
                dist_now = Categorical(self.pai(s[index]))
-                dist_entropy = dist_now.entropy().view(-1, 1)  # shape(batch_size X 1)
-                a_log_prob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1)  # shape(batch_size X 1)
+                dist_entropy = dist_now.entropy().view(-1, 1)  # shape(batch_size x 1)
+                a_log_prob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1)  # shape(batch_size x 1)
 
                # https://www.luogu.com.cn/paste/9vwi6ls0
-                ratios = torch.exp(a_log_prob_now - a_log_prob[index])  # shape(batch_size X 1)
+                # 计算策略梯度
+                ratios = torch.exp(a_log_prob_now - a_log_prob[index])  # shape(batch_size x 1)
                surr1 = ratios * adv[index]
                surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index]
                actor_loss = -torch.min(surr1,
-                                        surr2) - self.entropy_coef * dist_entropy  # shape(batch_size X 1)
-                # Update actor
+                                        surr2) - self.entropy_coef * dist_entropy  # shape(batch_size x 1)
+                # 更新策略网络
                self.optimizer_actor.zero_grad()
                actor_loss.mean().backward()
-                # Gradient clip
+                # 梯度裁剪
                torch.nn.utils.clip_grad_norm_(self.pai.parameters(), 0.5)
                self.optimizer_actor.step()
 
+                # 价值网络梯度
                v_s = self.v(s[index])
                critic_loss = self.mse_loss_fn(v_target[index], v_s)
-                # Update critic
+                # 更新价值网络
                self.optimizer_critic.zero_grad()
                critic_loss.backward()
-                # Gradient clip
+                # 梯度裁剪
                torch.nn.utils.clip_grad_norm_(self.v.parameters(), 0.5)
                self.optimizer_critic.step()
 
        self.lr_decay(step_t)
 
    def lr_decay(self, total_steps):
        """
-        learning rate decay
-        :param total_steps:
+        学习率衰减
+        :param total_steps: 已训练步数
        :return:
        """
        decay_rate = 0.1
@@ -110,7 +112,7 @@ def lr_decay(self, total_steps):
 
    def predict(self, observation):
        """
-        sample an action from policy network
+        从策略网络采样动作
        :param observation: s_t
        :return: 2 tensors: action, ln(p(a_t|s_t))
        """
@@ -122,7 +124,7 @@ def predict(self, observation):
 
    def change_network(self, map_size):
        """
-        change policy and value network for a new game
+        当模式更换时 更换神经网络
        :param map_size:
        :return:
        """
@@ -131,21 +133,21 @@ def change_network(self, map_size):
 
    def warm_up(self):
        """
-        warm up neural networks
+        预热 因为神经网络第一次跑会比较慢
        :return:
        """
        t = torch.zeros([1, 12, 20, 20]).to(self.device)
        self.pai(t)
        self.v(t)
 
    def save(self):
-        # save policy networks
+        # 保存策略网络
        torch.save(self.pai_set[20], "./model/non_maze.pth")
        torch.save(self.pai_set[10], "./model/non_maze1v1.pth")
        torch.save(self.pai_set[19], "./model/maze.pth")
        torch.save(self.pai_set[9], "./model/maze1v1.pth")
 
-        # save value networks
+        # 保存价值网络
        torch.save(self.v_set[20], "./model/non_maze_critic.pth")
        torch.save(self.v_set[10], "./model/non_maze1v1_critic.pth")
        torch.save(self.v_set[19], "./model/maze_critic.pth")
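
The learn() hunks above show the TD-error line (deltas = r + gamma * (1 - dw) * v(s_) - v(s)), but the backward pass that folds those deltas into GAE advantages lies outside the displayed context. For reference, a minimal standalone sketch of that recursion, assuming 1-D tensors and the standard formula A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}; the function name and tensor handling are illustrative, not this repository's code:

import torch

def compute_gae(deltas, done, gamma, lamda):
    # deltas, done: 1-D tensors of per-step TD errors and episode-end flags (assumed shapes)
    adv = []
    gae = 0.0
    for delta, d in zip(reversed(deltas.tolist()), reversed(done.tolist())):
        # A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
        gae = delta + gamma * lamda * gae * (1.0 - d)
        adv.insert(0, gae)
    return torch.tensor(adv, dtype=torch.float).view(-1, 1)

deltas = torch.tensor([0.5, -0.2, 0.1])
done = torch.tensor([0.0, 0.0, 1.0])
print(compute_gae(deltas, done, gamma=0.99, lamda=0.95))

The result is then normalized once and reused across all k_epochs update passes, as the hunk above shows.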

const.py

Lines changed: 8 additions & 5 deletions
@@ -2,11 +2,11 @@
 
 
 class BlockType(object):
-    road = 0  # null, unshown null
-    obstacle = 1  # obstacle
-    mountain = 2  # mountain
-    crown = 3  # crown
-    city = 4  # empty-city, city
+    road = 0  # null, unshown null
+    obstacle = 1  # obstacle
+    mountain = 2  # mountain
+    crown = 3  # crown
+    city = 4  # empty-city, city
 
 
 class PlayerColor(object):
@@ -24,6 +24,9 @@ class PlayerColor(object):
 explore_reward = {BlockType.road: 0.01, BlockType.mountain: 0.01, BlockType.crown: 10, BlockType.city: 0.01}
 
 
+directions = {'W': (0, -1), 'S': (0, 1), 'A': (-1, 0), 'D': (1, 0)}
+
+
 class FrontColor(object):
     black = 30
     red = 31
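
The new directions table maps the four WSAD moves onto (dx, dy) board offsets. A small usage sketch, assuming y grows downward so 'W' means one cell up; the step helper is hypothetical, not part of const.py:

directions = {'W': (0, -1), 'S': (0, 1), 'A': (-1, 0), 'D': (1, 0)}

def step(pos, key):
    # Apply one WSAD move to an (x, y) board coordinate.
    dx, dy = directions[key]
    return pos[0] + dx, pos[1] + dy

print(step((5, 5), 'W'))  # -> (5, 4)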

generate_map.py

Lines changed: 4 additions & 3 deletions
@@ -1,4 +1,5 @@
 import execjs
+from utils import *
 
 
 with open("generate_map.js", "r", encoding="utf-8") as _js_file:
@@ -8,12 +9,12 @@
 
 
 def generate_random_map(player):
-    _generator.call("generateRandomMap", player)
+    return map_to_tensor(_generator.call("generateRandomMap", player))
 
 
 def generate_maze_map(player):
-    _generator.call("generateMazeMap", player)
+    return map_to_tensor(_generator.call("generateMazeMap", player))
 
 
 def generate_empty_map(player):
-    _generator.call("generateEmptyMap", player)
+    return map_to_tensor(_generator.call("generateEmptyMap", player))
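
After this change the three wrappers return the JavaScript generator's output converted by map_to_tensor instead of discarding it. For context, a hedged sketch of the execjs pattern the module is built on: compile the JS source once, then call a named function on the compiled context. The one-line JS body below is a stand-in for generate_map.js, not the real generator:

import execjs

_generator = execjs.compile("function generateEmptyMap(player) { return [[0, 0], [0, 0]]; }")
raw = _generator.call("generateEmptyMap", 2)  # JS arrays come back as plain Python lists
print(raw)  # [[0, 0], [0, 0]]

map_to_tensor (pulled in via from utils import *) presumably converts such a nested-list map into the tensor form the environment consumes.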

main.py

Lines changed: 6 additions & 5 deletions
@@ -9,7 +9,7 @@
 def main(train=True):
     env = OffSiteEnv() if train else OnSiteEnv()
 
-    total_steps = 0  # Record the total steps during the training
+    total_steps = 0  # 记录总步数
 
     device = torch.device("cuda")
     args = {
@@ -30,16 +30,16 @@ def main(train=True):
     agent = PPOAgent(args)
     agent.warm_up()
 
-    # Build a tensorboard
+    # 绘图器
     writer = SummaryWriter("offline_train_logs" if train else "online_train_logs")
 
     while True:
         s = env.reset()
-        # arguments update per game
+        # 更新地图大小并更换神经网络
         args["state_dim"] = env.map_size
         agent.change_network(env.map_size)
 
-        # init normalizations
+        # 初始化一些用于归一化的类
         state_norm = Normalization(shape=args["state_dim"])  # Trick 2:state normalization
         reward_scaling = RewardScaling(shape=1, gamma=args["gamma"])
 
@@ -60,11 +60,12 @@ def main(train=True):
             s = s_
             total_steps += 1
 
-            # When the number of transitions in buffer reaches batch_size,then update
+            # 缓存到达batch size的时候更新参数
             if len(replay_buffer) == args["batch_size"]:
                 agent.learn(replay_buffer, total_steps)
                 replay_buffer.clear()
 
+            # 自动保存模型 batch_size和autosave step的最小公倍数尽量大 因为同时保存和更新比较耗时间
             if total_steps % args["autosave_step"] == 0:
                 agent.save()
 
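The new autosave comment advises picking autosave_step so that its least common multiple with batch_size is as large as possible, because a step that triggers both learn() and save() is slow. A quick check of that rule of thumb; the candidate numbers below are made up for illustration, not this repository's config:

import math

batch_size = 2048  # hypothetical value
for autosave_step in (2048, 2000, 3000, 2999):
    lcm = math.lcm(batch_size, autosave_step)
    # A larger LCM means updates and saves coincide on fewer steps.
    print(autosave_step, lcm, f"coincide every {lcm // batch_size} updates")

With autosave_step = 2048 every update step is also a save step, while a coprime value such as 2999 pushes the first collision out to roughly every 3000 updates.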

networks.py

Lines changed: 14 additions & 8 deletions
@@ -2,6 +2,12 @@
 
 
 def orthogonal_init(layer, gain=1.0):
+    """
+    正交初始化
+    :param layer: 要初始化的层
+    :param gain: 默认1.0 特别地 对于输出层 应为0.1
+    :return:
+    """
     nn.init.orthogonal_(layer.weight, gain=gain)
     nn.init.constant_(layer.bias, 0)
 
@@ -10,7 +16,7 @@ class Actor(nn.Module):
 
     def __init__(self, size):
         super(Actor, self).__init__()
-        # convolution layers
+        # 卷积层
         self.conv1 = nn.Conv2d(in_channels=12, out_channels=48, kernel_size=(5, 5), padding=2)
         self.batch_norm1 = nn.BatchNorm2d(48)
         self.conv2 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), padding=1)
@@ -20,17 +26,17 @@ def __init__(self, size):
         self.conv4 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), padding=1)
         self.batch_norm4 = nn.BatchNorm2d(48)
 
-        # dense & softmax
+        # 全连接层和softmax
         self.dense_in = 48 * size ** 2
         self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2
         self.dense1 = nn.Linear(in_features=self.dense_in, out_features=self.dense_out)
         self.dense2 = nn.Linear(in_features=self.dense_out, out_features=self.dense_out)
         self.softmax = nn.Softmax(dim=0)
 
-        # activation function
+        # 激活函数
         self.activ_func = nn.Tanh()
 
-        # init
+        # 正交初始化
         orthogonal_init(self.conv1)
         orthogonal_init(self.conv2)
         orthogonal_init(self.conv3)
@@ -69,7 +75,7 @@ class Critic(nn.Module):
 
     def __init__(self, size):
         super(Critic, self).__init__()
-        # convolution layers
+        # 卷积层
         self.conv1 = nn.Conv2d(in_channels=12, out_channels=48, kernel_size=(5, 5), padding=2)
         self.batch_norm1 = nn.BatchNorm2d(48)
         self.conv2 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), padding=1)
@@ -79,16 +85,16 @@ def __init__(self, size):
         self.conv4 = nn.Conv2d(in_channels=48, out_channels=48, kernel_size=(3, 3), padding=1)
         self.batch_norm4 = nn.BatchNorm2d(48)
 
-        # dense
+        # 全连接层
         self.dense_in = 48 * size ** 2
         self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2
         self.dense1 = nn.Linear(in_features=self.dense_in, out_features=self.dense_out)
         self.dense2 = nn.Linear(in_features=self.dense_out, out_features=1)
 
-        # activation function
+        # 激活函数
         self.activ_func = nn.Tanh()
 
-        # init
+        # 正交初始化
         orthogonal_init(self.conv1)
         orthogonal_init(self.conv2)
         orthogonal_init(self.conv3)
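
The new docstring on orthogonal_init keeps gain at 1.0 for hidden layers and recommends 0.1 for the output layer, a common PPO initialization trick that starts the policy head with small logits. A minimal sketch of that pattern on a toy head; the layer sizes are placeholders rather than this repository's networks:

import torch.nn as nn

def orthogonal_init(layer, gain=1.0):
    # Orthogonal weights, zero bias, mirroring networks.py.
    nn.init.orthogonal_(layer.weight, gain=gain)
    nn.init.constant_(layer.bias, 0)

hidden = nn.Linear(64, 64)
out = nn.Linear(64, 4)
orthogonal_init(hidden)         # hidden layers keep the default gain of 1.0
orthogonal_init(out, gain=0.1)  # smaller gain keeps the initial action distribution close to uniform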
