Commit 40de767

init upload for the new repo


64 files changed: +3830 -0 lines

.gitignore

Lines changed: 118 additions & 0 deletions
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# DS Store
.DS_Store

# saved models
*.pth

*.pt

*.log

*.txt
*.csv
logs/

01_dqn_algos/README.md

Lines changed: 12 additions & 0 deletions
# Deep Q Networks (DQN)
## Instructions
1. Train the agent. You can use the `--use-dueling` and `--use-double-net` flags to try the Dueling Network architecture or Double DQN:
```bash
python train.py --env-name='<env name>' --cuda (if you have a GPU) --<other flags>
```
2. Play the demo. Please use the same algorithm flags as in training:
```bash
python demo.py --env-name='<env name>' --<algo flags>
```
## Results
![](../figures/01_dqn.png)
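As a concrete example of the commands in the README above (hypothetical run; the environment name and flags are illustrative, taken from the defaults in `arguments.py`):

```bash
# train a dueling double DQN on Pong with a GPU (flags are illustrative)
python train.py --env-name='PongNoFrameskip-v4' --cuda --use-dueling --use-double-net
# replay the trained agent with the same algorithm flags
python demo.py --env-name='PongNoFrameskip-v4' --use-dueling --use-double-net
```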

01_dqn_algos/arguments.py

Lines changed: 29 additions & 0 deletions
import argparse

def get_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL')
    parse.add_argument('--seed', type=int, default=123, help='the random seed')
    parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name')
    parse.add_argument('--batch-size', type=int, default=32, help='the batch size for updates')
    parse.add_argument('--lr', type=float, default=1e-4, help='the learning rate of the algorithm')
    parse.add_argument('--buffer-size', type=int, default=10000, help='the size of the replay buffer')
    parse.add_argument('--cuda', action='store_true', help='whether to use the GPU')
    parse.add_argument('--init-ratio', type=float, default=1, help='the initial exploration ratio')
    parse.add_argument('--exploration_fraction', type=float, default=0.1, help='the fraction of total timesteps over which exploration is annealed')
    parse.add_argument('--final-ratio', type=float, default=0.01, help='the final exploration ratio')
    parse.add_argument('--grad-norm-clipping', type=float, default=10, help='the gradient norm clipping threshold')
    parse.add_argument('--total-timesteps', type=int, default=int(1e7), help='the total timesteps to train the network')
    parse.add_argument('--learning-starts', type=int, default=10000, help='the number of frames before learning starts')
    parse.add_argument('--train-freq', type=int, default=4, help='the frequency of network updates')
    parse.add_argument('--target-network-update-freq', type=int, default=1000, help='the frequency of target network updates')
    parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models')
    parse.add_argument('--display-interval', type=int, default=10, help='the display interval')
    parse.add_argument('--env-type', type=str, default='atari', help='the environment type')
    parse.add_argument('--log-dir', type=str, default='logs/', help='dir to save log information')
    parse.add_argument('--use-double-net', action='store_true', help='use double dqn to train the agent')
    parse.add_argument('--use-dueling', action='store_true', help='use dueling networks to train the agent')

    args = parse.parse_args()

    return args
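A minimal, hypothetical sanity check of the parser: argparse turns the hyphenated flag names into underscored attributes, which is how the rest of the code reads them (e.g. `args.env_name`, `args.use_dueling`).

```python
# hypothetical usage sketch; assumes this module is importable as `arguments`
from arguments import get_args

if __name__ == '__main__':
    args = get_args()
    # hyphenated flags become underscored attributes on the parsed namespace
    print(args.env_name, args.lr, args.buffer_size, args.use_dueling)
```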

01_dqn_algos/demo.py

Lines changed: 35 additions & 0 deletions
import numpy as np
from arguments import get_args
from models import net
import torch
from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind

def get_tensors(obs):
    obs = np.transpose(obs, (2, 0, 1))
    obs = np.expand_dims(obs, 0)
    obs = torch.tensor(obs, dtype=torch.float32)
    return obs

if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = make_atari(args.env_name)
    env = wrap_deepmind(env, frame_stack=True)
    # create the network
    model = net(env.action_space.n, args.use_dueling)
    # model path
    model_path = args.save_dir + args.env_name + '/model.pt'
    # load the trained weights onto the CPU
    model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
    # run the demo
    obs = env.reset()
    for _ in range(2000):
        env.render()
        with torch.no_grad():
            obs_tensor = get_tensors(obs)
            action_value = model(obs_tensor)
        # act greedily with respect to the predicted q-values
        action = torch.argmax(action_value.squeeze()).item()
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()
    env.close()
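For reference, `get_tensors` reorders the observation from HWC to CHW and adds a batch dimension. A minimal sketch with a dummy observation, assuming the usual 84x84, 4-frame DeepMind preprocessing produced by `wrap_deepmind(..., frame_stack=True)`:

```python
import numpy as np
import torch

# dummy frame-stacked observation: height x width x channels = 84 x 84 x 4 (assumed shape)
obs = np.zeros((84, 84, 4), dtype=np.uint8)
# transpose to channels-first and add a batch dimension, as get_tensors does
obs_tensor = torch.tensor(np.expand_dims(np.transpose(obs, (2, 0, 1)), 0), dtype=torch.float32)
print(obs_tensor.shape)  # torch.Size([1, 4, 84, 84])
```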

01_dqn_algos/dqn_agent.py

Lines changed: 125 additions & 0 deletions
import numpy as np
from models import net
from utils import linear_schedule, select_actions, reward_recorder
from rl_utils.experience_replay.experience_replay import replay_buffer
import torch
from datetime import datetime
import os
import copy

# define the dqn agent
class dqn_agent:
    def __init__(self, env, args):
        # store the environment and the arguments
        self.env = env
        self.args = args
        # define the network
        self.net = net(self.env.action_space.n, self.args.use_dueling)
        # copy self.net as the target network
        self.target_net = copy.deepcopy(self.net)
        # make sure the target net has the same weights as the network
        self.target_net.load_state_dict(self.net.state_dict())
        if self.args.cuda:
            self.net.cuda()
            self.target_net.cuda()
        # define the optimizer
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr)
        # define the replay memory
        self.buffer = replay_buffer(self.args.buffer_size)
        # define the linear schedule of the exploration
        self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \
                                                    self.args.final_ratio, self.args.init_ratio)
        # create the folder to save the models
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # set the environment folder
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start the training
    def learn(self):
        # the episode reward recorder
        episode_reward = reward_recorder()
        obs = np.array(self.env.reset())
        td_loss = 0
        for timestep in range(self.args.total_timesteps):
            explore_eps = self.exploration_schedule.get_value(timestep)
            with torch.no_grad():
                obs_tensor = self._get_tensors(obs)
                action_value = self.net(obs_tensor)
            # select actions (epsilon-greedy)
            action = select_actions(action_value, explore_eps)
            # execute actions
            obs_, reward, done, _ = self.env.step(action)
            obs_ = np.array(obs_)
            # append the sample to the replay buffer
            self.buffer.add(obs, action, reward, obs_, float(done))
            obs = obs_
            # add the rewards
            episode_reward.add_rewards(reward)
            if done:
                obs = np.array(self.env.reset())
                # start a new episode to store rewards
                episode_reward.start_new_episode()
            if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
                # sample a batch from the replay buffer
                batch_samples = self.buffer.sample(self.args.batch_size)
                td_loss = self._update_network(batch_samples)
            if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
                # update the target network
                self.target_net.load_state_dict(self.net.state_dict())
            if done and episode_reward.num_episodes % self.args.display_interval == 0:
                print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, episode_reward.num_episodes, \
                    episode_reward.mean, td_loss))
                torch.save(self.net.state_dict(), self.model_path + '/model.pt')

    # update the network
    def _update_network(self, samples):
        obses, actions, rewards, obses_next, dones = samples
        # convert the data to tensors
        obses = self._get_tensors(obses)
        actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1)
        obses_next = self._get_tensors(obses_next)
        # dones holds 1 - done: a mask that zeroes out terminal next states
        dones = torch.tensor(1 - dones, dtype=torch.float32).unsqueeze(-1)
        # move to the gpu
        if self.args.cuda:
            actions = actions.cuda()
            rewards = rewards.cuda()
            dones = dones.cuda()
        # calculate the target value
        with torch.no_grad():
            # if using the double network architecture
            if self.args.use_double_net:
                q_value_ = self.net(obses_next)
                action_max_idx = torch.argmax(q_value_, dim=1, keepdim=True)
                target_action_value = self.target_net(obses_next)
                target_action_max_value = target_action_value.gather(1, action_max_idx)
            else:
                target_action_value = self.target_net(obses_next)
                target_action_max_value, _ = torch.max(target_action_value, dim=1, keepdim=True)
        # the TD target
        expected_value = rewards + self.args.gamma * target_action_max_value * dones
        # get the current q value
        action_value = self.net(obses)
        real_value = action_value.gather(1, actions)
        loss = (expected_value - real_value).pow(2).mean()
        # start to update
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    # get tensors
    def _get_tensors(self, obs):
        if obs.ndim == 3:
            obs = np.transpose(obs, (2, 0, 1))
            obs = np.expand_dims(obs, 0)
        elif obs.ndim == 4:
            obs = np.transpose(obs, (0, 3, 1, 2))
        obs = torch.tensor(obs, dtype=torch.float32)
        if self.args.cuda:
            obs = obs.cuda()
        return obs
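The target computation in `_update_network` corresponds to the standard one-step TD targets for DQN and Double DQN. A minimal sketch on dummy tensors (batch of 2, 3 actions; shapes and values are illustrative only):

```python
import torch

q_next        = torch.rand(2, 3)   # online net  Q(s', .)
q_next_target = torch.rand(2, 3)   # target net  Q_target(s', .)
rewards  = torch.rand(2, 1)
not_done = torch.ones(2, 1)        # 1 - done, as stored in `dones` above
gamma = 0.99

# vanilla DQN target: y = r + gamma * (1 - done) * max_a Q_target(s', a)
max_q, _ = torch.max(q_next_target, dim=1, keepdim=True)
y_dqn = rewards + gamma * max_q * not_done

# double DQN target: the online net picks the action, the target net evaluates it
a_star = torch.argmax(q_next, dim=1, keepdim=True)
y_double = rewards + gamma * q_next_target.gather(1, a_star) * not_done
```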

01_dqn_algos/models.py

Lines changed: 66 additions & 0 deletions
import torch
import torch.nn as nn
import torch.nn.functional as F

# the deepmind-style convolutional feature extractor
class deepmind(nn.Module):
    def __init__(self):
        super(deepmind, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        # orthogonal init for the weights...
        nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu'))
        nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu'))
        nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu'))
        # zero init for the biases...
        nn.init.constant_(self.conv1.bias.data, 0)
        nn.init.constant_(self.conv2.bias.data, 0)
        nn.init.constant_(self.conv3.bias.data, 0)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(-1, 32 * 7 * 7)

        return x

# the q-network: a deepmind-style CNN head by default, optionally a dueling head
class net(nn.Module):
    def __init__(self, num_actions, use_dueling=False):
        super(net, self).__init__()
        # whether to use the dueling network
        self.use_dueling = use_dueling
        # define the convolutional backbone
        self.cnn_layer = deepmind()
        if not self.use_dueling:
            self.fc1 = nn.Linear(32 * 7 * 7, 256)
            self.action_value = nn.Linear(256, num_actions)
        else:
            # the layers for the dueling network architecture
            self.action_fc = nn.Linear(32 * 7 * 7, 256)
            self.state_value_fc = nn.Linear(32 * 7 * 7, 256)
            self.action_value = nn.Linear(256, num_actions)
            self.state_value = nn.Linear(256, 1)

    def forward(self, inputs):
        # scale pixel values to [0, 1]
        x = self.cnn_layer(inputs / 255.0)
        if not self.use_dueling:
            x = F.relu(self.fc1(x))
            action_value_out = self.action_value(x)
        else:
            # the advantage stream
            action_fc = F.relu(self.action_fc(x))
            action_value = self.action_value(action_fc)
            # the state-value stream
            state_value_fc = F.relu(self.state_value_fc(x))
            state_value = self.state_value(state_value_fc)
            # subtract the mean advantage
            action_value_mean = torch.mean(action_value, dim=1, keepdim=True)
            action_value_center = action_value - action_value_mean
            # Q = V + (A - mean(A))
            action_value_out = state_value + action_value_center
        return action_value_out
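The dueling head combines the two streams as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), which is what the last lines of `forward` compute. A tiny numeric check (the values are made up):

```python
import torch

advantage   = torch.tensor([[1.0, 2.0, 3.0]])   # A(s, .) for 3 actions
state_value = torch.tensor([[0.5]])             # V(s)

# same aggregation as in net.forward
q = state_value + (advantage - advantage.mean(dim=1, keepdim=True))
print(q)  # tensor([[-0.5000,  0.5000,  1.5000]])
```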
