
Commit e8a3d77

8.19 Changed the invalid-action penalty mechanism

1 parent 723af0f commit e8a3d77

File tree

7 files changed: +116 -77 lines changed

.gitignore (+1)

@@ -6,3 +6,4 @@ __pycache__/
 .idea/
 offline_train_logs/
 online_train_logs/
+python.exe.lnk

agent.py (+16 -17)

@@ -14,20 +14,19 @@ def __init__(self, args: dict):
         self.epsilon = args["epsilon"]  # PPO ε
         self.k_epochs = args["k_epochs"]  # number of PPO update epochs
         self.entropy_coef = args["entropy_coef"]
-        self.device = args["device"]  # compute device

         # neural networks
         self.pai_set = {
-            20: get_model("actor", "./model/non_maze.pth", 20).to(self.device),
-            19: get_model("actor", "./model/maze.pth", 19).to(self.device),
-            10: get_model("actor", "./model/non_maze1v1.pth", 10).to(self.device),
-            9: get_model("actor", "./model/maze1v1.pth", 9).to(self.device)
+            20: get_model("actor", "./model/non_maze.pth", 20).to(device),
+            19: get_model("actor", "./model/maze.pth", 19).to(device),
+            10: get_model("actor", "./model/non_maze1v1.pth", 10).to(device),
+            9: get_model("actor", "./model/maze1v1.pth", 9).to(device)
         }
         self.v_set = {
-            20: get_model("critic", "./model/non_maze_critic.pth", 20).to(self.device),
-            19: get_model("critic", "./model/maze_critic.pth", 19).to(self.device),
-            10: get_model("critic", "./model/non_maze1v1_critic.pth", 10).to(self.device),
-            9: get_model("critic", "./model/maze1v1_critic.pth", 9).to(self.device)
+            20: get_model("critic", "./model/non_maze_critic.pth", 20).to(device),
+            19: get_model("critic", "./model/maze_critic.pth", 19).to(device),
+            10: get_model("critic", "./model/non_maze1v1_critic.pth", 10).to(device),
+            9: get_model("critic", "./model/maze1v1_critic.pth", 9).to(device)
         }
         self.pai = self.pai_set[20]
         self.v = self.v_set[20]

@@ -46,12 +45,12 @@ def learn(self, rep, step_t):
         """
         s, a, a_log_prob, r, s_, done = rep.get_data()
         # push everything onto the GPU
-        s = s.to(self.device)
-        a = a.to(self.device)
-        a_log_prob = a_log_prob.to(self.device)
-        r = r.to(self.device)
-        s_ = s_.to(self.device)
-        done = done.to(self.device)
+        s = s.to(device)
+        a = a.to(device)
+        a_log_prob = a_log_prob.to(device)
+        r = r.to(device)
+        s_ = s_.to(device)
+        done = done.to(device)

         # compute advantages with GAE
         adv = []

@@ -63,7 +62,7 @@ def learn(self, rep, step_t):
         for delta, d in zip(reversed(deltas.flatten()), reversed(done.flatten())):
             gae = delta + self.gamma * self.lamda * gae * (1.0 - d)
             adv.insert(0, gae)
-        adv = torch.tensor(adv, dtype=torch.float).view(-1, 1).to(self.device)
+        adv = torch.tensor(adv, dtype=torch.float).view(-1, 1).to(device)
         v_target = adv + vs

         # advantage normalization

@@ -143,7 +142,7 @@ def warm_up(self):
        Warm up: the network's first forward pass is comparatively slow.
        :return:
        """
-        t = torch.zeros([1, 12, 20, 20]).to(self.device)
+        t = torch.zeros([1, 12, 20, 20]).to(device)
        self.pai(t)
        self.v(t)
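The backward loop in learn implements the standard GAE recursion: delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t), then A_t = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}. A minimal equivalent sketch in plain PyTorch; the (1 - done) bootstrap mask inside deltas is an assumption, since that line falls outside this hunk:

import torch

def compute_gae(r, vs, vs_, done, gamma, lamda):
    # one-step TD residuals (bootstrap masked by (1 - done), assumed)
    deltas = r + gamma * vs_ * (1.0 - done) - vs
    adv = torch.zeros_like(deltas)
    gae = 0.0
    # accumulate the recursion backwards in time
    for t in reversed(range(deltas.shape[0])):
        gae = deltas[t] + gamma * lamda * gae * (1.0 - done[t])
        adv[t] = gae
    return adv, adv + vs  # advantages and critic targets (v_target = adv + vs)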

const.py (+20 -1)

@@ -5,7 +5,7 @@ class BlockType(object):
     road = 0  # null, unshown null
     obstacle = 1  # obstacle
     mountain = 2  # mountain
-    crown = 3  # crown
+    crown = 300  # crown
     city = 4  # empty-city, city


@@ -61,6 +61,7 @@ class Style(object):

 dx = [0, -1, 0, 1]
 dy = [-1, 0, 1, 0]
+inf = 999999999


 class ActionTranslator(object):

@@ -109,3 +110,21 @@ def a_to_i(self, size: int, action: torch.Tensor) -> int:
             direction = i
         return self.__indexes[size][action[0][0]][action[0][1]][direction][action[0][4]]

+    def mask(self, obs, map_size):
+        o = obs[0]
+        mask_vec = torch.zeros([len(self.__actions[map_size])], dtype=torch.long)
+        for _a in range(len(self.__actions[map_size])):
+            act = self.__actions[map_size][_a][0]
+            act -= 1
+            act[4] += 1
+
+            if int(o[2][int(act[1]) - 1][int(act[0]) - 1]) != 0:
+                # not our own tile
+                mask_vec[_a] = -inf
+            else:
+                mask_vec[_a] = 1
+        return mask_vec.to(device)


+at = ActionTranslator()
+device = torch.device("cuda")
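A minimal usage sketch for the new helper, under two assumptions that this hunk does not confirm: channel 2 of the observation is the ownership plane (read off the o[2][...] indexing above), and the action table for map size 20 is already populated:

import torch
from const import at, device

obs = torch.zeros([1, 12, 20, 20]).to(device)  # channel 2 all zeros: every tile counts as ours
mask_vec = at.mask(obs, 20)  # 1 for legal actions, -999999999 for the rest
print(mask_vec.shape)  # one entry per action in the size-20 table

One thing worth checking: if the entries of __actions are tensors, act -= 1 and act[4] += 1 mutate the stored table in place on every call, so repeated calls would drift; cloning act first would rule that out.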

main.py (+16 -14)

@@ -5,14 +5,14 @@
 from offsite_env import OffSiteEnv
 from normalization import *
 from replay_buffer import *
+from const import *


 def main(offline_train=True):
     env = OffSiteEnv() if offline_train else OnSiteEnv()

     total_steps = 0  # total step counter

-    device = torch.device("cuda")
     args = {
         "batch_size": 50,
         "state_dim": None,

@@ -25,7 +25,6 @@ def main(offline_train=True):
         "k_epochs": 10,
         "entropy_coef": 0.01,
         "autosave_step": 107,
-        "device": device
     }

     agent = PPOAgent(args)

@@ -56,9 +55,12 @@ def save_model():
         reward_scaling.reset()

         done = False
+        total_reward = 0
+        _step = total_steps
         while not done:
             a, a_log_prob = agent.predict(s)
             s_, r, done, _ = env.step(a)
+            total_reward += r

             env.render("human")

@@ -71,14 +73,21 @@ def save_model():

             # update parameters once the buffer reaches batch size
             if len(replay_buffer) == args["batch_size"]:
-                _t = threading.Thread(target=update_model)
-                _t.start()
+                _t1 = threading.Thread(target=update_model)
+                _t1.start()
                 replay_buffer.clear()

             # autosave the model; keep the lcm of batch_size and autosave_step large, since saving and updating at the same time is costly
             if total_steps % args["autosave_step"] == 0:
-                _t = threading.Thread(target=save_model)
-                _t.start()
+                _t2 = threading.Thread(target=save_model)
+                _t2.start()
+
+            # plot the reward curve to track learning progress
+            if env.episode % 10 == 0:
+                writer.add_scalar(f"offline_train_{env.mode}", total_reward, env.episode)
+
+        game_result = "won" if env.win_check() == 2 else "lost"
+        print(f"game {env.episode}: bot " + game_result + f", total_reward={total_reward}, step={total_steps - _step}")

         if env.quit_signal():
             break

@@ -89,12 +98,5 @@ def save_model():


 """
-Traceback (most recent call last):
-  File "D:/MyFiles/LearningBot/main.py", line 88, in <module>
-    main()
-  File "D:/MyFiles/LearningBot/main.py", line 83, in main
-    if env.quit_signal():
-AttributeError: 'OffSiteEnv' object has no attribute 'quit_signal'
-
-Process finished with exit code 1
+Check whether the mask is taking effect; right now it doesn't seem to do anything
 """

networks.py (+22 -15)

@@ -1,4 +1,5 @@
 from torch import nn
+from const import *


 def orthogonal_init(layer, gain=1.0):

@@ -28,23 +29,26 @@ def __init__(self, size):

         # fully connected layers and softmax
         self.dense_in = 48 * size ** 2
-        self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2
+        self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2 + 1
         self.dense1 = nn.Linear(in_features=self.dense_in, out_features=self.dense_out)
         self.dense2 = nn.Linear(in_features=self.dense_out, out_features=self.dense_out)
-        self.softmax = nn.Softmax(dim=0)
+        self.softmax = nn.Softmax(dim=1)

         # activation function
         self.activ_func = nn.Tanh()

         # orthogonal initialization
-        orthogonal_init(self.conv1)
-        orthogonal_init(self.conv2)
-        orthogonal_init(self.conv3)
-        orthogonal_init(self.conv4)
-        orthogonal_init(self.dense1)
-        orthogonal_init(self.dense2, gain=0.01)
+        # orthogonal_init(self.conv1)
+        # orthogonal_init(self.conv2)
+        # orthogonal_init(self.conv3)
+        # orthogonal_init(self.conv4)
+        # orthogonal_init(self.dense1)
+        # orthogonal_init(self.dense2, gain=0.01)

     def forward(self, x):
+        # build the action mask table first
+        mask = at.mask(x, x.shape[2])
+
         x = self.conv1(x)
         x = self.batch_norm1(x)
         x = self.activ_func(x)

@@ -67,6 +71,9 @@ def forward(self, x):
         x = self.activ_func(x)

         x = self.dense2(x)
+
+        # apply the mask just before softmax
+        x *= mask
         x = self.softmax(x)
         return x
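A caution on x *= mask: the mask entries are 1 or -999999999, so multiplication flips any negative logit of an invalid action into a huge positive value, and softmax then concentrates probability on exactly the actions that should be suppressed. That is a plausible cause of the "mask doesn't seem to do anything" note left in main.py. The conventional pattern is an additive 0/-inf mask applied to the logits before softmax; a minimal self-contained sketch (it assumes the const.py helper would emit 0 instead of 1 for legal actions):

import torch

logits = torch.randn(1, 5)  # toy batch of action logits
# 0 keeps an action, -inf removes it
mask = torch.tensor([0.0, 0.0, float("-inf"), 0.0, float("-inf")])
probs = torch.softmax(logits + mask, dim=1)
print(probs)  # entries 2 and 4 are exactly 0; the rest renormalize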

@@ -87,20 +94,20 @@ def __init__(self, size):

         # fully connected layers
         self.dense_in = 48 * size ** 2
-        self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2
+        self.dense_out = 4 * ((size - 2) ** 2 + 3 * (size - 2) + 2) * 2 + 1
         self.dense1 = nn.Linear(in_features=self.dense_in, out_features=self.dense_out)
         self.dense2 = nn.Linear(in_features=self.dense_out, out_features=1)

         # activation function
         self.activ_func = nn.Tanh()

         # orthogonal initialization
-        orthogonal_init(self.conv1)
-        orthogonal_init(self.conv2)
-        orthogonal_init(self.conv3)
-        orthogonal_init(self.conv4)
-        orthogonal_init(self.dense1)
-        orthogonal_init(self.dense2, gain=0.01)
+        # orthogonal_init(self.conv1)
+        # orthogonal_init(self.conv2)
+        # orthogonal_init(self.conv3)
+        # orthogonal_init(self.conv4)
+        # orthogonal_init(self.dense1)
+        # orthogonal_init(self.dense2, gain=0.01)

     def forward(self, x):
         x = self.conv1(x)