class PPOAgent(object):

    def __init__(self, args: dict):
-        self.batch_size = args["batch_size"]  # batch size
-        self.lr_a = args["lr_a"]  # Learning rate of actor
-        self.lr_c = args["lr_c"]  # Learning rate of critic
-        self.gamma = args["gamma"]  # Discount factor
-        self.lamda = args["lambda"]  # GAE parameter
-        self.epsilon = args["epsilon"]  # PPO clip parameter
-        self.k_epochs = args["k_epochs"]  # PPO parameter
+        self.batch_size = args["batch_size"]  # batch size
+        self.lr_a = args["lr_a"]  # learning rate of the actor (policy network)
+        self.lr_c = args["lr_c"]  # learning rate of the critic (value network)
+        self.gamma = args["gamma"]  # discount factor
+        self.lamda = args["lambda"]  # GAE lambda
+        self.epsilon = args["epsilon"]  # PPO clip parameter
+        self.k_epochs = args["k_epochs"]  # number of PPO update epochs

        self.entropy_coef = args["entropy_coef"]
-        self.device = args["device"]  # device
+        self.device = args["device"]  # device to run on

-        # networks
+        # neural networks
        self.pai_set = {
            20: get_model("actor", "./model/non_maze.pth", 20),
            19: get_model("actor", "./model/maze.pth", 19),
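Note: a minimal sketch of the args dictionary this constructor expects. The keys come from the code above; the concrete values and the construction call are hypothetical illustrations, not settings taken from this repository.

    # Hypothetical example values; only the keys come from __init__ above.
    args = {
        "batch_size": 2048,
        "lr_a": 3e-4,          # actor learning rate
        "lr_c": 1e-3,          # critic learning rate
        "gamma": 0.99,         # discount factor
        "lambda": 0.95,        # GAE lambda
        "epsilon": 0.2,        # PPO clip range
        "k_epochs": 10,        # update epochs per batch
        "entropy_coef": 0.01,  # entropy bonus weight
        "device": "cuda" if torch.cuda.is_available() else "cpu",
    }
    agent = PPOAgent(args)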
@@ -46,10 +46,10 @@ def learn(self, rep, step_t):
        """
        s, a, a_log_prob, r, s_, dw, done = rep.to_tensor()

-        # calculate GAE advantage
+        # compute the advantage with GAE
        adv = []
        gae = 0
-        with torch.no_grad():  # adv and v_target have no gradient
+        with torch.no_grad():  # no gradients needed here
            vs = self.v(s)
            vs_ = self.v(s_)
            deltas = r + self.gamma * (1.0 - dw) * vs_ - vs
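Note: the lines this hunk skips presumably turn the TD errors above into GAE advantages. A minimal sketch of the standard GAE recursion, continuing from the adv list and gae accumulator initialized above; this is an assumption, not the repository's exact code.

    # Hypothetical sketch of the elided GAE recursion (runs inside the no_grad block).
    for delta, d in zip(reversed(deltas.flatten().tolist()),
                        reversed(done.flatten().tolist())):
        gae = delta + self.gamma * self.lamda * gae * (1.0 - d)  # reset at episode ends
        adv.insert(0, gae)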
@@ -59,44 +59,46 @@ def learn(self, rep, step_t):
            adv = torch.tensor(adv, dtype=torch.float).view(-1, 1)
            v_target = adv + vs

-        # advantage normalization
+        # normalize the advantages
        adv = ((adv - adv.mean()) / (adv.std() + 1e-5))

-        # Optimize policy for K epochs:
+        # update the parameters for k epochs
        for _ in range(self.k_epochs):
            for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.batch_size, False):
                dist_now = Categorical(self.pai(s[index]))
-                dist_entropy = dist_now.entropy().view(-1, 1)  # shape(batch_size X 1)
-                a_log_prob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1)  # shape(batch_size X 1)
+                dist_entropy = dist_now.entropy().view(-1, 1)  # shape (batch_size x 1)
+                a_log_prob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1)  # shape (batch_size x 1)

                # https://www.luogu.com.cn/paste/9vwi6ls0
-                ratios = torch.exp(a_log_prob_now - a_log_prob[index])  # shape(batch_size X 1)
+                # probability ratio for the clipped surrogate objective
+                ratios = torch.exp(a_log_prob_now - a_log_prob[index])  # shape (batch_size x 1)
                surr1 = ratios * adv[index]
                surr2 = torch.clamp(ratios, 1 - self.epsilon, 1 + self.epsilon) * adv[index]
                actor_loss = -torch.min(surr1,
-                                        surr2) - self.entropy_coef * dist_entropy  # shape(batch_size X 1)
-                # Update actor
+                                        surr2) - self.entropy_coef * dist_entropy  # shape (batch_size x 1)
+                # update the policy network
                self.optimizer_actor.zero_grad()
                actor_loss.mean().backward()
-                # Gradient clip
+                # gradient clipping
                torch.nn.utils.clip_grad_norm_(self.pai.parameters(), 0.5)
                self.optimizer_actor.step()

+                # value network update
                v_s = self.v(s[index])
                critic_loss = self.mse_loss_fn(v_target[index], v_s)
-                # Update critic
+                # update the value network
                self.optimizer_critic.zero_grad()
                critic_loss.backward()
-                # Gradient clip
+                # gradient clipping
                torch.nn.utils.clip_grad_norm_(self.v.parameters(), 0.5)
                self.optimizer_critic.step()

        self.lr_decay(step_t)

    def lr_decay(self, total_steps):
        """
-        learning rate decay
-        :param total_steps:
+        learning rate decay
+        :param total_steps: number of training steps taken so far
        :return:
        """
        decay_rate = 0.1
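Note: the rest of lr_decay falls outside this hunk, so only decay_rate = 0.1 is visible. A hypothetical sketch of one common way such a decay is applied, assuming a self.max_train_steps attribute and a linear schedule down to decay_rate of the initial learning rates; the real implementation may differ.

    # Hypothetical sketch; self.max_train_steps and the linear schedule are assumptions.
    frac = min(total_steps / self.max_train_steps, 1.0)
    lr_a_now = self.lr_a * (1.0 - (1.0 - decay_rate) * frac)  # lr_a -> 0.1 * lr_a
    lr_c_now = self.lr_c * (1.0 - (1.0 - decay_rate) * frac)  # lr_c -> 0.1 * lr_c
    for group in self.optimizer_actor.param_groups:
        group["lr"] = lr_a_now
    for group in self.optimizer_critic.param_groups:
        group["lr"] = lr_c_now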
@@ -110,7 +112,7 @@ def lr_decay(self, total_steps):

    def predict(self, observation):
        """
-        sample an action from policy network
+        sample an action from the policy network
        :param observation: s_t
        :return: 2 tensors: action, ln(p(a_t|s_t))
        """
@@ -122,7 +124,7 @@ def predict(self, observation):

    def change_network(self, map_size):
        """
-        change policy and value network for a new game
+        switch the policy and value networks when the game mode changes
        :param map_size:
        :return:
        """
@@ -131,21 +133,21 @@ def change_network(self, map_size):

    def warm_up(self):
        """
-        warm up neural networks
+        warm up the neural networks, since their first forward pass is noticeably slow
        :return:
        """
        t = torch.zeros([1, 12, 20, 20]).to(self.device)
        self.pai(t)
        self.v(t)

    def save(self):
-        # save policy networks
+        # save the policy networks
        torch.save(self.pai_set[20], "./model/non_maze.pth")
        torch.save(self.pai_set[10], "./model/non_maze1v1.pth")
        torch.save(self.pai_set[19], "./model/maze.pth")
        torch.save(self.pai_set[9], "./model/maze1v1.pth")

-        # save value networks
+        # save the value networks
        torch.save(self.v_set[20], "./model/non_maze_critic.pth")
        torch.save(self.v_set[10], "./model/non_maze1v1_critic.pth")
        torch.save(self.v_set[19], "./model/maze_critic.pth")