Commit 8167d24

1 parent e8a3d77 commit 8167d24

6 files changed, +62 -49 lines changed


agent.py

+6-2
@@ -1,3 +1,4 @@
+from torch.nn import functional as F
 from torch.distributions import Categorical
 from torch.utils.data import BatchSampler, SubsetRandomSampler
 from utils import *

@@ -71,7 +72,8 @@ def learn(self, rep, step_t):
         # update the parameters for k epochs
         for _ in range(self.k_epochs):
             for index in BatchSampler(SubsetRandomSampler(range(self.batch_size)), self.batch_size, False):
-                dist_now = Categorical(self.pai(s[index]))
+                mask = at.mask(s[index], s[index].shape[2])
+                dist_now = Categorical(mask * self.pai.softmax(self.pai(s[index])))
                 dist_entropy = dist_now.entropy().view(-1, 1)  # shape(batch_size x 1)
                 a_log_prob_now = dist_now.log_prob(a[index].squeeze()).view(-1, 1)  # shape(batch_size x 1)

@@ -123,7 +125,9 @@ def predict(self, observation):
         :return: 2 tensors: action, ln(p(a_t|s_t))
         """
         with torch.no_grad():
-            action_p = Categorical(self.pai(observation))
+            mask = at.mask(observation, observation.shape[2])
+            act_ = self.pai(observation) * mask
+            action_p = Categorical(self.pai.softmax(act_))
             action = action_p.sample()
             a_log_prob = action_p.log_prob(action)
             return action, a_log_prob
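
Note on the masking change above: the commit multiplies the policy output by a mask vector whose invalid entries are -inf and then applies a softmax. A common alternative, sketched below under its own assumptions (logits, valid and masked_categorical are illustrative names, not code from this repository), is to fill the invalid logits with -inf and let Categorical(logits=...) perform the softmax, which drives the probability of invalid actions to zero.

import torch
from torch.distributions import Categorical

# Minimal sketch of action masking before the softmax (illustrative, not this repo's code).
# `logits` has shape [batch, n_actions]; `valid` is a bool tensor of the same shape
# marking which actions are currently legal.
def masked_categorical(logits: torch.Tensor, valid: torch.Tensor) -> Categorical:
    masked_logits = logits.masked_fill(~valid, float("-inf"))  # invalid actions -> -inf
    return Categorical(logits=masked_logits)                   # softmax happens inside

logits = torch.randn(2, 5)
valid = torch.tensor([[True, True, False, True, False],
                      [True, False, True, True, True]])
dist = masked_categorical(logits, valid)
action = dist.sample()             # only legal actions can be sampled
log_prob = dist.log_prob(action)   # log-probability under the masked distribution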

const.py

+24-9
@@ -61,7 +61,7 @@ class Style(object):
 
 dx = [0, -1, 0, 1]
 dy = [-1, 0, 1, 0]
-inf = 999999999
+inf = 999999999.0
 
 
 class ActionTranslator(object):

@@ -95,6 +95,13 @@ def _generate(self, size):
                         continue
                     index[i][j][k][0] = len(action)
                     action.append(torch.Tensor([[i, j, tgx, tgy, 0]]))
+        for i in range(1, size + 1):
+            for j in range(1, size + 1):
+                for k in range(4):
+                    tgx = i + dx[k]
+                    tgy = j + dy[k]
+                    if tgx < 1 or tgx > size or tgy < 1 or tgy > size:
+                        continue
                     index[i][j][k][1] = len(action)
                     action.append(torch.Tensor([[i, j, tgx, tgy, 1]]))
 

@@ -113,18 +120,26 @@ def a_to_i(self, size: int, action: torch.Tensor) -> int:
     def mask(self, obs, map_size):
         o = obs[0]
         mask_vec = torch.zeros([len(self.__actions[map_size])], dtype=torch.long)
-        for _a in range(len(self.__actions[map_size])):
-            act = self.__actions[map_size][_a][0]
-            act -= 1
-            act[4] += 1
-
-            if int(o[2][int(act[1]) - 1][int(act[0]) - 1]) != 0:
+        mask_vec[0] = 1.0
+        for _a in range(1, len(self.__actions[map_size])):
+            act = self.__actions[map_size][_a][0].long().tolist()
+            if int(o[10][act[1] - 1][act[0] - 1]) != 0:
                 # not one of our own tiles
                 mask_vec[_a] = -inf
             else:
-                mask_vec[_a] = 1
+                mask_vec[_a] = 1.0
         return mask_vec.to(device)
 
 
 at = ActionTranslator()
-device = torch.device("cuda")
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+else:
+    device = torch.device("cpu")
+
+
+def debug_output_mask(_mask):
+    print("[", end='')
+    for _ in _mask:
+        print(f"{_},", end=' ')
+    print("]")

main.py

+11-9
@@ -46,25 +46,28 @@ def save_model():
         agent.change_network(env.map_size)
 
         # initialize the normalization helper classes
-        state_norm = Normalization(shape=args["state_dim"])  # Trick 2: state normalization
+        # state_norm = Normalization(shape=args["state_dim"])  # Trick 2: state normalization
         reward_scaling = RewardScaling(shape=1, gamma=args["gamma"])
 
         replay_buffer = ReplayBuffer(args)
 
-        s = state_norm(s).to(device)
+        # s = state_norm(s).to(device)
+        s = s.to(device)
         reward_scaling.reset()
 
         done = False
         total_reward = 0
         _step = total_steps
+        render_mode = "machine"
        while not done:
            a, a_log_prob = agent.predict(s)
            s_, r, done, _ = env.step(a)
            total_reward += r
 
-            env.render("human")
+            env.render(render_mode)
 
-            s_ = state_norm(s_).to(device)
+            # s_ = state_norm(s_).to(device)
+            s_ = s_.to(device)
            r = reward_scaling(r)
 
            replay_buffer.store(s, a, a_log_prob, r, s_, done)

@@ -82,6 +85,10 @@ def save_model():
            _t2 = threading.Thread(target=save_model)
            _t2.start()
 
+            # manual special-case handling
+            if r > 0:
+                render_mode = "human"
+
         # plot the reward curve to show learning progress
         if env.episode % 10 == 0:
             writer.add_scalar(f"offline_train_{env.mode}", total_reward, env.episode)

@@ -95,8 +102,3 @@ def save_model():
 
 if __name__ == '__main__':
     main()
-
-
-"""
-Check whether the mask takes effect; right now it does not seem to be working.
-"""

networks.py

+5-8
@@ -1,5 +1,6 @@
+import torch
 from torch import nn
-from const import *
+from torch.nn import functional as F
 
 
 def orthogonal_init(layer, gain=1.0):

@@ -46,9 +47,6 @@ def __init__(self, size):
         # orthogonal_init(self.dense2, gain=0.01)
 
     def forward(self, x):
-        # build the mask table first
-        mask = at.mask(x, x.shape[2])
-
         x = self.conv1(x)
         x = self.batch_norm1(x)
         x = self.activ_func(x)

@@ -71,11 +69,10 @@ def forward(self, x):
         x = self.activ_func(x)
 
         x = self.dense2(x)
+        return self.my_PReLU(x)
 
-        # apply the mask right before the softmax
-        x *= mask
-        x = self.softmax(x)
-        return x
+    def my_PReLU(self, x):
+        return torch.max(x, torch.FloatTensor([0.0]).cuda()) - 0.05 * torch.min(x, torch.FloatTensor([0.0]).cuda())
 
 
 class Critic(nn.Module):
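
Note on my_PReLU above: because of the minus sign on the negative branch it computes max(x, 0) - 0.05 * min(x, 0), so negative inputs come out as 0.05 * |x| and the output is never negative, whereas a standard leaky ReLU (max(x, 0) + slope * min(x, 0)) keeps the sign. The hard-coded .cuda() also assumes a GPU, unlike the CPU fallback this same commit adds to const.py. A small illustrative comparison (not repository code):

import torch
from torch.nn import functional as F

x = torch.tensor([-2.0, -0.5, 0.0, 1.5])
zero = torch.zeros_like(x)  # device-agnostic zero instead of a hard-coded .cuda() tensor

# The commit's formula: negative inputs are reflected to 0.05 * |x|.
my_prelu = torch.max(x, zero) - 0.05 * torch.min(x, zero)  # tensor([0.1000, 0.0250, 0.0000, 1.5000])

# The built-in leaky ReLU keeps the sign of the negative branch.
leaky = F.leaky_relu(x, negative_slope=0.05)               # tensor([-0.1000, -0.0250, 0.0000, 1.5000])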

offsite_env.py

+13-16
@@ -33,10 +33,6 @@ def __init__(self, mode="non_maze"):
         self.internal_bots = {}
         self.internal_bots_num = 0
         self.internal_bots_color = []
-        for i in range(1, 9):
-            if i != self.learningbot_color:
-                self.internal_bots_color.append(i)
-                self.internal_bots[i] = ibot.Game(i, self.get_view_of, self.bot_action_upd)
         self.actions_now = [[], [], [], [], [], [], [], []]
 
     def reset(self):

@@ -56,19 +52,26 @@ def reset(self):
         self.player_num = random.randint(3, 8)
         self.episode += 1
         self.gen_map(self.player_num)
+        self.round = 0
 
         # initialize shown
         self.shown = torch.zeros([9, self.map_size, self.map_size])
 
         # initialize the internal bots
         self.internal_bots_num = self.player_num - 1
+        for i in range(1, 9):
+            if i != self.learningbot_color:
+                self.internal_bots_color.append(i)
+                self.internal_bots[i] = ibot.Game(i, self.get_view_of, self.bot_action_upd)
 
         # build the observation
+        self.obs_history.queue.clear()
         self.obs_history.put(torch.zeros([4, self.map_size, self.map_size]))
         self.obs_history.put(torch.zeros([4, self.map_size, self.map_size]))
         self.obs_history.put(copy.copy(self.get_view_of(self.learningbot_color)))
 
         # push an empty action first; action_history should only store lists, otherwise odd errors appear
+        self.action_history.queue.clear()
         self.action_history.put([-1, -1, -1, -1, -1])
         return self.gen_observation()
 

@@ -111,12 +114,9 @@ def step(self, action: torch.Tensor):
         # if the action is empty
         if last_move[0] < 0 or self.round == 1:
             return obs, reward, False, {}
-        # large penalty for an invalid move
-        if int(last_obs[2][last_move[1] - 1][last_move[0] - 1]) != self._get_colormark(self.learningbot_color):
-            reward -= 100
         # small penalty for running into a mountain
         if last_obs[1][last_move[3] - 1][last_move[2] - 1] == BlockType.mountain:
-            reward -= 2
+            reward -= 0.3
         # penalty for attacking a city
         if self.map[1][last_move[3] - 1][last_move[2] - 1] == BlockType.city:
             if self.map[2][last_move[3] - 1][last_move[2] - 1] != self.learningbot_color:

@@ -190,19 +190,16 @@ def execute_actions(self, action: torch.Tensor):
                 self.combine((cur_action[1], cur_action[0]), (cur_action[3], cur_action[2]), mov_troop)
 
         # handle the LearningBot action
-        act = at.i_to_a(self.map_size, int(action))[0].long()
-        act -= 1
-        act[4] += 1
-        # is_available = "available" if self.map[2][act[1]][act[0]] == self.learningbot_color else "unavailable"
-        # print(f"<{is_available}>: {act.tolist()}")
+        act = at.i_to_a(self.map_size, int(action))[0].long().tolist()
+        print(act)
         # check whether the action is legal; act may contain -1, which means an empty turn
-        if act[0] >= 0 and self.map[2][act[1]][act[0]] == self.learningbot_color:
-            f_amount = int(self.map[0][act[1]][act[0]])
+        if act[0] - 1 >= 0 and self.map[2][act[1] - 1][act[0] - 1] == self.learningbot_color:
+            f_amount = int(self.map[0][act[1] - 1][act[0] - 1])
             if act[4] == 1:
                 mov_troop = math.ceil((f_amount + 0.5) / 2) - 1
             else:
                 mov_troop = f_amount - 1
-            self.combine((act[1], act[0]), (act[3], act[2]), mov_troop)
+            self.combine((act[1] - 1, act[0] - 1), (act[3] - 1, act[2] - 1), mov_troop)
 
     def combine(self, b1: tuple, b2: tuple, cnt):
         """

onsite_env.py

+3-5
@@ -98,18 +98,15 @@ def step(self, action: torch.Tensor):
         # compute the reward for the previous step
         _dirx = [0, -1, 0, 1, 1, -1, 1, -1]
         _diry = [-1, 0, 1, 0, 1, -1, -1, 1]
-        last_move = self.action_history.queue[-1]
+        last_move = self.action_history.queue[-1].long().tolist()
         last_map = self.map_history.queue[-1]
         # save the action
         if self.action_history.qsize() == 3:
             self.action_history.get()
-        self.action_history.put(copy.copy(action[0].long()))
+        self.action_history.put(copy.copy(at.i_to_a(self.map_size, int(action[0].long()))[0]))
         # if the action is empty
         if last_move[0] < 0:
             return self.observation, reward, False, {}
-        # large penalty for an invalid move
-        if last_map[2][last_move[1] - 1][last_move[0] - 1] != self._get_colormark(self.self_color):
-            reward -= 100
         # small penalty for running into a mountain
         if self.map[1][last_move[3] - 1][last_move[2] - 1] == BlockType.mountain:
             reward -= 10

@@ -276,6 +273,7 @@ def win_check(self) -> int:
         """
         try:
             t = self.driver.find_element(By.ID, "swal2-content")
+            self.driver.find_element(By.CSS_SELECTOR, "div.swal2-actions > button.swal2-confirm.swal2-styled")
             if t.text.strip() == settings.bot_name + "赢了":
                 return 2
         except NoSuchElementException:
