
Commit 723af0f

committed
8.16 continue tuning
1 parent c46af30 commit 723af0f

File tree

7 files changed, +91 -38 lines changed

agent.py

+9-4
@@ -45,20 +45,25 @@ def learn(self, rep, step_t):
         :return:
         """
         s, a, a_log_prob, r, s_, done = rep.get_data()
+        # push everything onto the GPU
+        s = s.to(self.device)
+        a = a.to(self.device)
+        a_log_prob = a_log_prob.to(self.device)
+        r = r.to(self.device)
+        s_ = s_.to(self.device)
+        done = done.to(self.device)
 
         # compute the advantage function with GAE
         adv = []
         gae = 0
-        s = s.to(self.device)
-        s_ = s_.to(self.device)
         with torch.no_grad():  # no gradients needed
             vs = self.v(s)
             vs_ = self.v(s_)
             deltas = r + self.gamma * (1.0 - done) * vs_ - vs
-            for delta, d in zip(reversed(deltas.flatten().numpy()), reversed(done.flatten().numpy())):
+            for delta, d in zip(reversed(deltas.flatten()), reversed(done.flatten())):
                 gae = delta + self.gamma * self.lamda * gae * (1.0 - d)
                 adv.insert(0, gae)
-            adv = torch.tensor(adv, dtype=torch.float).view(-1, 1)
+            adv = torch.tensor(adv, dtype=torch.float).view(-1, 1).to(self.device)
             v_target = adv + vs
 
         # advantage normalization
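
Note on the hunk above: once the batch lives on the GPU, the old deltas.flatten().numpy() call would fail, since a CUDA tensor cannot be converted to a NumPy array without going through .cpu() first; iterating the tensors directly sidesteps that. Below is a minimal, self-contained sketch of the same GAE recursion, for reference only; the names gamma/lam/device mirror self.gamma, self.lamda and self.device from the diff.

import torch

def compute_gae(r, vs, vs_, done, gamma, lam, device):
    # TD residuals for the whole batch
    deltas = r + gamma * (1.0 - done) * vs_ - vs
    adv, gae = [], 0.0
    # walk the batch backwards, resetting the running sum at episode ends
    for delta, d in zip(reversed(deltas.flatten().tolist()),
                        reversed(done.flatten().tolist())):
        gae = delta + gamma * lam * gae * (1.0 - d)
        adv.insert(0, gae)
    adv = torch.tensor(adv, dtype=torch.float, device=device).view(-1, 1)
    v_target = adv + vs          # regression target for the critic
    return adv, v_target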

bot_div/game.py

+4-3
@@ -42,6 +42,7 @@ def get_map_from_env(self):
         game_map = self.get_tensor_map(self.bot_color)
         map_size = game_map.shape[1]
         game_map = game_map.long().tolist()
+        self.mp.resize(map_size)
         for i in range(1, map_size + 1):
             for j in range(1, map_size + 1):
                 if game_map[1][i - 1][j - 1] == BlockType.city:
@@ -242,7 +243,7 @@ def gather_army_to(self, x, y, method='rectangle'):  # gather armies towards (x, y)
         ans_top_right = (ans_top_left[0], ans_bottom_right[1])
         ans_bottom_left = (ans_bottom_right[0], ans_top_left[1])
         ans = [ans_top_left, ans_top_right, ans_bottom_left, ans_bottom_right]
-        print(ans, best_sum)
+        # print(ans, best_sum)
         tmp = []
         target_node = (x, y)
         for i in ans:
@@ -286,7 +287,7 @@ def gather_army_to(self, x, y, method='rectangle'):  # gather armies towards (x, y)
         min_y = min(ans_top_left[1], ans_top_right[1])
         max_y = max(ans_top_left[1], ans_top_right[1])
         while cx != end_node[0] or cy != end_node[1]:  # snake-order traversal
-            print(cx, cy)
+            # print(cx, cy)
             px = cx
             py = cy
             if cur_dir:
@@ -349,7 +350,7 @@ def flush_movements(self):  # update movements
             self.cur_y -= 1
         elif self.movements[0] == 'D':
             self.cur_y += 1
-        act = [x_old, y_old, self.cur_x, self.cur_y, is_half]
+        act = [y_old, x_old, self.cur_y, self.cur_x, is_half]
         self.send_action(self.bot_color, act)
         self.movements.pop(0)
         self.get_map_from_env()
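
The reordered act in flush_movements matches the convention noted in the environment docstrings (x,y and i,j are exactly reversed): the new line suggests send_action wants the coordinates in (y, x) order rather than the bot's internal (x, y). An illustrative helper, not part of the repository, that makes the reorder explicit:

def xy_move_to_rowcol(x1, y1, x2, y2, is_half):
    # screen-style (x, y) pairs -> the (row, col) order the consumer appears to expect
    return [y1, x1, y2, x2, is_half]

# e.g. xy_move_to_rowcol(3, 5, 4, 5, 0) -> [5, 3, 5, 4, 0]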

bot_div/map.py

+10
@@ -15,6 +15,16 @@ def __init__(self, amount=0, belong=0, type='land'):
         self.type = type  # land city general unknown mountain empty empty-city
         self.cost = 0
 
+    def __str__(self):
+        return '{' +\
+               f"""
+               'amount': {self.amount},
+               'belong': {self.belong},
+               'type': {self.type},
+               'cost': {self.cost}
+               """\
+               + '}'
+
 
 def dist_node(a, b):
     return dist(a[0], a[1], b[0], b[1])
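
A simpler alternative sketch for the new __str__ (illustrative only, not the committed version): building the text from a dict keeps the output on one line, avoids the embedded newlines of the triple-quoted f-string, and quotes the field values consistently.

def __str__(self):
    # same four fields as the diff, rendered via a plain dict
    return str({
        'amount': self.amount,
        'belong': self.belong,
        'type': self.type,
        'cost': self.cost,
    })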

main.py

+24-3
@@ -1,3 +1,4 @@
+import threading
 from torch.utils.tensorboard import SummaryWriter
 from agent import PPOAgent
 from onsite_env import OnSiteEnv
@@ -13,7 +14,7 @@ def main(offline_train=True):
 
     device = torch.device("cuda")
     args = {
-        "batch_size": 100,
+        "batch_size": 50,
         "state_dim": None,
         "action_dim": 5,
         "lr_a": 0.01,
@@ -30,6 +31,12 @@ def main(offline_train=True):
     agent = PPOAgent(args)
     agent.warm_up()
 
+    def update_model():
+        agent.learn(replay_buffer, total_steps)
+
+    def save_model():
+        agent.save()
+
     # plotter
     writer = SummaryWriter("offline_train_logs" if offline_train else "online_train_logs")
 
@@ -64,16 +71,30 @@ def main(offline_train=True):
 
         # update the parameters once the buffer reaches batch size
         if len(replay_buffer) == args["batch_size"]:
-            agent.learn(replay_buffer, total_steps)
+            _t = threading.Thread(target=update_model)
+            _t.start()
             replay_buffer.clear()
 
         # autosave the model; keep the LCM of batch_size and autosave_step as large as possible, since saving and updating at the same time is costly
         if total_steps % args["autosave_step"] == 0:
-            agent.save()
+            _t = threading.Thread(target=save_model)
+            _t.start()
 
         if env.quit_signal():
            break
 
 
 if __name__ == '__main__':
     main()
+
+
+"""
+Traceback (most recent call last):
+  File "D:/MyFiles/LearningBot/main.py", line 88, in <module>
+    main()
+  File "D:/MyFiles/LearningBot/main.py", line 83, in main
+    if env.quit_signal():
+AttributeError: 'OffSiteEnv' object has no attribute 'quit_signal'
+
+Process finished with exit code 1
+"""

offsite_env.py

+28-20
@@ -72,7 +72,7 @@ def reset(self):
     def step(self, action: torch.Tensor):
         """
         Execute one step
-        :param action: movement => tensor([[x1, y1, x2, y2, is_half]])
+        :param action: movement => tensor([[x1, y1, x2, y2, is_half]]); note that x,y and i,j are exactly reversed
         :return: observation (Tensor), reward (float), done (bool), info (dict)
         """
         # run
@@ -106,16 +106,19 @@ def step(self, action: torch.Tensor):
         if last_move[0] < 0:
             return obs, reward, False, {}
         # big penalty for an invalid move
-        if last_obs[2][last_move[0] - 1][last_move[1] - 1] != self._get_colormark(self.learningbot_color):
+        if last_obs[2][last_move[1] - 1][last_move[0] - 1] != self._get_colormark(self.learningbot_color):
             reward -= 100
+        # small penalty for bumping into a mountain
+        if last_obs[1][last_move[3] - 1][last_move[2] - 1] == BlockType.mountain:
+            reward -= 10
         # penalty for hitting a city
-        if self.map[1][last_move[2] - 1][last_move[3] - 1] == BlockType.city:
-            if self.map[2][last_move[2] - 1][last_move[3] - 1] != self.learningbot_color:
+        if self.map[1][last_move[3] - 1][last_move[2] - 1] == BlockType.city:
+            if self.map[2][last_move[3] - 1][last_move[2] - 1] != self.learningbot_color:
                 reward -= 10
         # bonus for exploring new territory (note: exploring, not capturing)
         for i in range(8):
-            t_x = last_move[2] - 1 + _dirx[i]
-            t_y = last_move[3] - 1 + _diry[i]
+            t_x = last_move[3] - 1 + _dirx[i]
+            t_y = last_move[2] - 1 + _diry[i]
             if t_x < 0 or t_x >= self.map_size or t_y < 0 or t_y >= self.map_size:
                 continue
             if self.map[3][t_x][t_y] - last_obs[3][t_x][t_y] == 1:
@@ -150,13 +153,17 @@ def render(self, mode="human"):
     def execute_actions(self, action: torch.Tensor):
         """
         Execute the actions
-        :param action: the LearningBot's action; the built-in bots' actions are stored on the class and need not be passed in
+        :param action: the LearningBot's action; the built-in bots' actions are stored on the class and need not be passed in; note that x,y and i,j are exactly reversed
         :return:
         """
         # handle the built-in bots' actions
         for i in range(self.internal_bots_num):
             cur_color = self.internal_bots_color[i]
-            self.internal_bots[cur_color].bot_move()
+            try:
+                # it sometimes throws inexplicable errors; not worth debugging, this bot is weak anyway and one lost turn doesn't matter; the real training mostly comes from playing against humans
+                self.internal_bots[cur_color].bot_move()
+            except Exception:
+                continue
             if not self.actions_now[cur_color]:
                 print(f"bot {cur_color} empty move")
                 continue
@@ -173,26 +180,26 @@ def execute_actions(self, action: torch.Tensor):
 
             print(f"internal bot color {cur_color}: {cur_action}")
             # check whether the action is legal
-            if cur_action[0] >= 0 and self.map[2][cur_action[0]][cur_action[1]] == cur_color:
-                f_amount = int(self.map[0][cur_action[0]][cur_action[1]])
+            if self.map[2][cur_action[1]][cur_action[0]] == cur_color:
+                f_amount = int(self.map[0][cur_action[1]][cur_action[0]])
                 if cur_action[4] == 1:
                     mov_troop = math.ceil((f_amount + 0.5) / 2) - 1
                 else:
                     mov_troop = f_amount - 1
-                self.combine((cur_action[0], cur_action[1]), (cur_action[2], cur_action[3]), mov_troop)
+                self.combine((cur_action[1], cur_action[0]), (cur_action[3], cur_action[2]), mov_troop)
 
         # handle the LearningBot's action
         act = self.at.i_to_a(self.map_size, int(action))[0].long()
         act -= 1
         act[4] += 1
-        # check whether the action is legal
-        if act[0] >= 0 and self.map[2][act[0]][act[1]] == self.learningbot_color:
-            f_amount = int(self.map[0][act[0]][act[1]])
+        # check whether the action is legal; act may contain -1, which means an empty turn
+        if act[0] >= 0 and self.map[2][act[1]][act[0]] == self.learningbot_color:
+            f_amount = int(self.map[0][act[1]][act[0]])
            if act[4] == 1:
                mov_troop = math.ceil((f_amount + 0.5) / 2) - 1
            else:
                mov_troop = f_amount - 1
-            self.combine((act[0], act[1]), (act[2], act[3]), mov_troop)
+            self.combine((act[1], act[0]), (act[3], act[2]), mov_troop)
 
     def combine(self, b1: tuple, b2: tuple, cnt):
         """
@@ -228,10 +235,9 @@ def combine(self, b1: tuple, b2: tuple, cnt):
             tcolor = t["color"]
             for i in range(self.map_size):
                 for j in range(self.map_size):
-                    if self.map[2][i][j] == tcolor:
+                    if int(self.map[2][i][j]) == tcolor:
                         self.map[2][i][j] = f["color"]
-                    if self.map[2][i][j] == BlockType.crown:
-                        self.map[2][i][j] = BlockType.city
+            t["type"] = BlockType.city
             t["color"] = f["color"]
             t["amount"] = -t["amount"]
 
@@ -284,7 +290,7 @@ def get_view_of(self, color):
                 # if this player cannot see this cell right now
                 if color != self.learningbot_color or int(self.shown[color][i][j]) == 0:
                     # if it is the LearningBot, keep its vision for it (●'◡'●)
-                    if self.map[1][i][j] == BlockType.city:
+                    if self.map[1][i][j] == BlockType.city or self.map[1][i][j] == BlockType.mountain:
                         map_filtered[1][i][j] = BlockType.obstacle
                 else:
                     map_filtered[0][i][j] = self.map[0][i][j]
@@ -341,10 +347,12 @@ def win_check(self):
         alive = []
         for i in range(self.map_size):
             for j in range(self.map_size):
+                if int(self.map[2][i][j]) == PlayerColor.grey:
+                    continue
                 if int(self.map[2][i][j]) not in alive:
                     alive.append(int(self.map[2][i][j]))
         if len(alive) > 1:
             return 0
         if alive[0] == self.learningbot_color:
             return 2
-        return 1
+        return 1
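
The win_check hunk now skips neutral (grey) cells when collecting surviving players, so an otherwise finished game is no longer kept "running" by unowned tiles. For reference, the same logic written compactly; this is an illustrative sketch using the names from the diff, not the repository's code:

def win_check(self):
    # collect every owner colour on the board, ignoring neutral (grey) cells
    alive = {int(c) for row in self.map[2] for c in row if int(c) != PlayerColor.grey}
    if len(alive) > 1:
        return 0                                  # more than one player left: still in progress
    return 2 if alive.pop() == self.learningbot_color else 1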

onsite_env.py

+14-7
@@ -71,7 +71,7 @@ def reset(self):
     def step(self, action: torch.Tensor):
         """
         Execute one step
-        :param action: movement => tensor([[x1, y1, x2, y2, is_half]])
+        :param action: movement => tensor([[x1, y1, x2, y2, is_half]]); note that x,y and i,j are exactly reversed
         :return: observation (Tensor), reward (float), done (bool), info (dict)
         """
         reward = 0
@@ -101,16 +101,19 @@ def step(self, action: torch.Tensor):
         last_move = self.action_history.queue[-1]
         last_map = self.map_history.queue[-1]
         # big penalty for an invalid move
-        if last_map[2][last_move[0] - 1][last_move[1] - 1] != self._get_colormark(self.self_color):
+        if last_map[2][last_move[1] - 1][last_move[0] - 1] != self._get_colormark(self.self_color):
             reward -= 100
+        # small penalty for bumping into a mountain
+        if self.map[1][last_move[3] - 1][last_move[2] - 1] == BlockType.mountain:
+            reward -= 10
         # penalty for hitting a city
-        if self.map[1][last_move[2] - 1][last_move[3] - 1] == BlockType.city:
-            if self.map[2][last_move[2] - 1][last_move[3] - 1] != self._get_colormark(self.self_color):
+        if self.map[1][last_move[3] - 1][last_move[2] - 1] == BlockType.city:
+            if self.map[2][last_move[3] - 1][last_move[2] - 1] != self._get_colormark(self.self_color):
                 reward -= 10
         # bonus for exploring new territory (note: exploring, not capturing)
         for i in range(8):
-            t_x = last_move[2] - 1 + _dirx[i]
-            t_y = last_move[3] - 1 + _diry[i]
+            t_x = last_move[3] - 1 + _dirx[i]
+            t_y = last_move[2] - 1 + _diry[i]
             if t_x < 0 or t_x >= self.map_size or t_y < 0 or t_y >= self.map_size:
                 continue
             if self.map[3][t_x][t_y] - last_map[3][t_x][t_y] == 1:
@@ -240,10 +243,14 @@ def update_map(self, _init_flag=False):
     def move(self, mov):
         """
         just as the name
-        :param mov: tensor([[x1, y1, x2, y2, is_half]])
+        :param mov: tensor([[x1, y1, x2, y2, is_half]]); note that x,y and i,j are exactly reversed
         :return:
         """
         move_info = mov[0].long()
+        # swap first: convert the x,y coordinates into i,j coordinates
+        move_info[0], move_info[1] = move_info[1], move_info[0]
+        move_info[2], move_info[3] = move_info[3], move_info[2]
+
         if self.selected[0] != move_info[0] - 1 or self.selected[1] != move_info[1] - 1:
             # if the source cell is not selected yet, click it first
             self.driver.find_element_by_id(f"td-{int((move_info[0] - 1) * self.map_size + move_info[1])}").click()
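
A caution on the new in-place swap: move_info is a tensor, and integer indexing into a tensor returns views of the same storage. The tuple assignment therefore first overwrites element 0 and then copies that already-overwritten value back into element 1, so both slots end up holding the original y value instead of being swapped. A sketch of one safe variant (illustrative only; it assumes the code below only reads scalar entries from move_info):

move_info = mov[0].long().tolist()    # plain Python ints: tuple assignment swaps correctly
# convert the (x, y) pairs into (i, j) = (row, col) order
move_info[0], move_info[1] = move_info[1], move_info[0]
move_info[2], move_info[3] = move_info[3], move_info[2]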

utils.py

+2-1
@@ -131,7 +131,8 @@ def print_tensor_map(game_map):
     size = game_map.shape[1]
     for i in range(size):
         for j in range(size):
-            if game_map[1][i][j] == BlockType.city or game_map[1][i][j] == BlockType.mountain:
+            if game_map[2][i][j] == PlayerColor.grey and \
+                    game_map[1][i][j] == BlockType.city or game_map[1][i][j] == BlockType.mountain:
                 bg = 40
             else:
                 bg = color_trans[int(game_map[2][i][j])]
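
A precedence note on the new condition: and binds tighter than or, so the expression parses as (grey and city) or mountain, which means every mountain gets the dark background regardless of its owner. That may well be intended, since mountains are never owned; if the grey check is meant to cover both block types, parentheses would be needed, as in this sketch:

if game_map[2][i][j] == PlayerColor.grey and \
        (game_map[1][i][j] == BlockType.city or game_map[1][i][j] == BlockType.mountain):
    bg = 40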
