Commit 40de767

init upload for the new repo


64 files changed: +3830 -0 lines

.gitignore

Lines changed: 118 additions & 0 deletions
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# DS Store
.DS_Store

# saved models
*.pth

*.pt

*.log

*.txt
*.csv
logs/

01_dqn_algos/README.md

Lines changed: 12 additions & 0 deletions
# Deep Q Networks (DQN)
## Instructions
1. Train the agent. You can use the `--use-dueling` and `--use-double-net` flags to try the Dueling Network architecture or Double DQN:
```bash
python train.py --env-name='<env name>' --cuda (if you have a GPU) --<other flags>
```
2. Play the demo. Please use the same algorithm flags as in training:
```bash
python demo.py --env-name='<env name>' --<algo flags>
```
## Results
![](../figures/01_dqn.png)
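As a concrete example of the commands in the README above (hypothetical run; the environment name and flags are illustrative, taken from the defaults in `arguments.py`):

```bash
# train a dueling double DQN on Pong with a GPU (flags are illustrative)
python train.py --env-name='PongNoFrameskip-v4' --cuda --use-dueling --use-double-net
# replay the trained agent with the same algorithm flags
python demo.py --env-name='PongNoFrameskip-v4' --use-dueling --use-double-net
```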

01_dqn_algos/arguments.py

Lines changed: 29 additions & 0 deletions
import argparse

def get_args():
    parse = argparse.ArgumentParser()
    parse.add_argument('--gamma', type=float, default=0.99, help='the discount factor of RL')
    parse.add_argument('--seed', type=int, default=123, help='the random seed')
    parse.add_argument('--env-name', type=str, default='PongNoFrameskip-v4', help='the environment name')
    parse.add_argument('--batch-size', type=int, default=32, help='the batch size for updates')
    parse.add_argument('--lr', type=float, default=1e-4, help='the learning rate of the algorithm')
    parse.add_argument('--buffer-size', type=int, default=10000, help='the size of the replay buffer')
    parse.add_argument('--cuda', action='store_true', help='whether to use the GPU')
    parse.add_argument('--init-ratio', type=float, default=1, help='the initial exploration ratio')
    parse.add_argument('--exploration_fraction', type=float, default=0.1, help='the fraction of total timesteps over which exploration is annealed')
    parse.add_argument('--final-ratio', type=float, default=0.01, help='the final exploration ratio')
    parse.add_argument('--grad-norm-clipping', type=float, default=10, help='the gradient norm clipping threshold')
    parse.add_argument('--total-timesteps', type=int, default=int(1e7), help='the total timesteps to train the network')
    parse.add_argument('--learning-starts', type=int, default=10000, help='the number of frames before learning starts')
    parse.add_argument('--train-freq', type=int, default=4, help='the frequency of network updates')
    parse.add_argument('--target-network-update-freq', type=int, default=1000, help='the frequency of target network updates')
    parse.add_argument('--save-dir', type=str, default='saved_models/', help='the folder to save models')
    parse.add_argument('--display-interval', type=int, default=10, help='the display interval')
    parse.add_argument('--env-type', type=str, default='atari', help='the environment type')
    parse.add_argument('--log-dir', type=str, default='logs/', help='dir to save log information')
    parse.add_argument('--use-double-net', action='store_true', help='use double dqn to train the agent')
    parse.add_argument('--use-dueling', action='store_true', help='use dueling networks to train the agent')

    args = parse.parse_args()

    return args
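A minimal, hypothetical sanity check of the parser: argparse turns the hyphenated flag names into underscored attributes, which is how the rest of the code reads them (e.g. `args.env_name`, `args.use_dueling`).

```python
# hypothetical usage sketch; assumes this module is importable as `arguments`
from arguments import get_args

if __name__ == '__main__':
    args = get_args()
    # hyphenated flags become underscored attributes on the parsed namespace
    print(args.env_name, args.lr, args.buffer_size, args.use_dueling)
```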

01_dqn_algos/demo.py

Lines changed: 35 additions & 0 deletions
import numpy as np
from arguments import get_args
from models import net
import torch
from rl_utils.env_wrapper.atari_wrapper import make_atari, wrap_deepmind

def get_tensors(obs):
    obs = np.transpose(obs, (2, 0, 1))
    obs = np.expand_dims(obs, 0)
    obs = torch.tensor(obs, dtype=torch.float32)
    return obs

if __name__ == '__main__':
    args = get_args()
    # create the environment
    env = make_atari(args.env_name)
    env = wrap_deepmind(env, frame_stack=True)
    # create the network
    model = net(env.action_space.n, args.use_dueling)
    # model path
    model_path = args.save_dir + args.env_name + '/model.pt'
    # load the trained weights onto the CPU
    model.load_state_dict(torch.load(model_path, map_location=lambda storage, loc: storage))
    # run the demo
    obs = env.reset()
    for _ in range(2000):
        env.render()
        with torch.no_grad():
            obs_tensor = get_tensors(obs)
            action_value = model(obs_tensor)
        # act greedily with respect to the predicted q-values
        action = torch.argmax(action_value.squeeze()).item()
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()
    env.close()
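For reference, `get_tensors` reorders the observation from HWC to CHW and adds a batch dimension. A minimal sketch with a dummy observation, assuming the usual 84x84, 4-frame DeepMind preprocessing produced by `wrap_deepmind(..., frame_stack=True)`:

```python
import numpy as np
import torch

# dummy frame-stacked observation: height x width x channels = 84 x 84 x 4 (assumed shape)
obs = np.zeros((84, 84, 4), dtype=np.uint8)
# transpose to channels-first and add a batch dimension, as get_tensors does
obs_tensor = torch.tensor(np.expand_dims(np.transpose(obs, (2, 0, 1)), 0), dtype=torch.float32)
print(obs_tensor.shape)  # torch.Size([1, 4, 84, 84])
```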

01_dqn_algos/dqn_agent.py

Lines changed: 125 additions & 0 deletions
import numpy as np
from models import net
from utils import linear_schedule, select_actions, reward_recorder
from rl_utils.experience_replay.experience_replay import replay_buffer
import torch
from datetime import datetime
import os
import copy

# define the dqn agent
class dqn_agent:
    def __init__(self, env, args):
        # store the environment and the arguments
        self.env = env
        self.args = args
        # define the network
        self.net = net(self.env.action_space.n, self.args.use_dueling)
        # copy self.net as the target network
        self.target_net = copy.deepcopy(self.net)
        # make sure the target net has the same weights as the network
        self.target_net.load_state_dict(self.net.state_dict())
        if self.args.cuda:
            self.net.cuda()
            self.target_net.cuda()
        # define the optimizer
        self.optimizer = torch.optim.Adam(self.net.parameters(), lr=self.args.lr)
        # define the replay memory
        self.buffer = replay_buffer(self.args.buffer_size)
        # define the linear schedule of the exploration
        self.exploration_schedule = linear_schedule(int(self.args.total_timesteps * self.args.exploration_fraction), \
                                                    self.args.final_ratio, self.args.init_ratio)
        # create the folder to save the models
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        # set the environment folder
        self.model_path = os.path.join(self.args.save_dir, self.args.env_name)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start the training
    def learn(self):
        # the episode reward recorder
        episode_reward = reward_recorder()
        obs = np.array(self.env.reset())
        td_loss = 0
        for timestep in range(self.args.total_timesteps):
            explore_eps = self.exploration_schedule.get_value(timestep)
            with torch.no_grad():
                obs_tensor = self._get_tensors(obs)
                action_value = self.net(obs_tensor)
            # select actions (epsilon-greedy)
            action = select_actions(action_value, explore_eps)
            # execute actions
            obs_, reward, done, _ = self.env.step(action)
            obs_ = np.array(obs_)
            # append the sample to the replay buffer
            self.buffer.add(obs, action, reward, obs_, float(done))
            obs = obs_
            # add the rewards
            episode_reward.add_rewards(reward)
            if done:
                obs = np.array(self.env.reset())
                # start a new episode to store rewards
                episode_reward.start_new_episode()
            if timestep > self.args.learning_starts and timestep % self.args.train_freq == 0:
                # sample a batch from the replay buffer
                batch_samples = self.buffer.sample(self.args.batch_size)
                td_loss = self._update_network(batch_samples)
            if timestep > self.args.learning_starts and timestep % self.args.target_network_update_freq == 0:
                # update the target network
                self.target_net.load_state_dict(self.net.state_dict())
            if done and episode_reward.num_episodes % self.args.display_interval == 0:
                print('[{}] Frames: {}, Episode: {}, Mean: {:.3f}, Loss: {:.3f}'.format(datetime.now(), timestep, episode_reward.num_episodes, \
                    episode_reward.mean, td_loss))
                torch.save(self.net.state_dict(), self.model_path + '/model.pt')

    # update the network
    def _update_network(self, samples):
        obses, actions, rewards, obses_next, dones = samples
        # convert the data to tensors
        obses = self._get_tensors(obses)
        actions = torch.tensor(actions, dtype=torch.int64).unsqueeze(-1)
        rewards = torch.tensor(rewards, dtype=torch.float32).unsqueeze(-1)
        obses_next = self._get_tensors(obses_next)
        # dones holds 1 - done: a mask that zeroes out terminal next states
        dones = torch.tensor(1 - dones, dtype=torch.float32).unsqueeze(-1)
        # move to the gpu
        if self.args.cuda:
            actions = actions.cuda()
            rewards = rewards.cuda()
            dones = dones.cuda()
        # calculate the target value
        with torch.no_grad():
            # if using the double network architecture
            if self.args.use_double_net:
                q_value_ = self.net(obses_next)
                action_max_idx = torch.argmax(q_value_, dim=1, keepdim=True)
                target_action_value = self.target_net(obses_next)
                target_action_max_value = target_action_value.gather(1, action_max_idx)
            else:
                target_action_value = self.target_net(obses_next)
                target_action_max_value, _ = torch.max(target_action_value, dim=1, keepdim=True)
        # the TD target
        expected_value = rewards + self.args.gamma * target_action_max_value * dones
        # get the current q value
        action_value = self.net(obses)
        real_value = action_value.gather(1, actions)
        loss = (expected_value - real_value).pow(2).mean()
        # start to update
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    # get tensors
    def _get_tensors(self, obs):
        if obs.ndim == 3:
            obs = np.transpose(obs, (2, 0, 1))
            obs = np.expand_dims(obs, 0)
        elif obs.ndim == 4:
            obs = np.transpose(obs, (0, 3, 1, 2))
        obs = torch.tensor(obs, dtype=torch.float32)
        if self.args.cuda:
            obs = obs.cuda()
        return obs
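The target computation in `_update_network` corresponds to the standard one-step TD targets for DQN and Double DQN. A minimal sketch on dummy tensors (batch of 2, 3 actions; shapes and values are illustrative only):

```python
import torch

q_next        = torch.rand(2, 3)   # online net  Q(s', .)
q_next_target = torch.rand(2, 3)   # target net  Q_target(s', .)
rewards  = torch.rand(2, 1)
not_done = torch.ones(2, 1)        # 1 - done, as stored in `dones` above
gamma = 0.99

# vanilla DQN target: y = r + gamma * (1 - done) * max_a Q_target(s', a)
max_q, _ = torch.max(q_next_target, dim=1, keepdim=True)
y_dqn = rewards + gamma * max_q * not_done

# double DQN target: the online net picks the action, the target net evaluates it
a_star = torch.argmax(q_next, dim=1, keepdim=True)
y_double = rewards + gamma * q_next_target.gather(1, a_star) * not_done
```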

01_dqn_algos/models.py

Lines changed: 66 additions & 0 deletions
import torch
import torch.nn as nn
import torch.nn.functional as F

# the deepmind-style convolutional feature extractor
class deepmind(nn.Module):
    def __init__(self):
        super(deepmind, self).__init__()
        self.conv1 = nn.Conv2d(4, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        # orthogonal init for the weights...
        nn.init.orthogonal_(self.conv1.weight.data, gain=nn.init.calculate_gain('relu'))
        nn.init.orthogonal_(self.conv2.weight.data, gain=nn.init.calculate_gain('relu'))
        nn.init.orthogonal_(self.conv3.weight.data, gain=nn.init.calculate_gain('relu'))
        # zero init for the biases...
        nn.init.constant_(self.conv1.bias.data, 0)
        nn.init.constant_(self.conv2.bias.data, 0)
        nn.init.constant_(self.conv3.bias.data, 0)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(-1, 32 * 7 * 7)

        return x

# the q-network: a deepmind-style CNN head by default, optionally a dueling head
class net(nn.Module):
    def __init__(self, num_actions, use_dueling=False):
        super(net, self).__init__()
        # whether to use the dueling network
        self.use_dueling = use_dueling
        # define the convolutional backbone
        self.cnn_layer = deepmind()
        if not self.use_dueling:
            self.fc1 = nn.Linear(32 * 7 * 7, 256)
            self.action_value = nn.Linear(256, num_actions)
        else:
            # the layers for the dueling network architecture
            self.action_fc = nn.Linear(32 * 7 * 7, 256)
            self.state_value_fc = nn.Linear(32 * 7 * 7, 256)
            self.action_value = nn.Linear(256, num_actions)
            self.state_value = nn.Linear(256, 1)

    def forward(self, inputs):
        # scale pixel values to [0, 1]
        x = self.cnn_layer(inputs / 255.0)
        if not self.use_dueling:
            x = F.relu(self.fc1(x))
            action_value_out = self.action_value(x)
        else:
            # the advantage stream
            action_fc = F.relu(self.action_fc(x))
            action_value = self.action_value(action_fc)
            # the state-value stream
            state_value_fc = F.relu(self.state_value_fc(x))
            state_value = self.state_value(state_value_fc)
            # subtract the mean advantage
            action_value_mean = torch.mean(action_value, dim=1, keepdim=True)
            action_value_center = action_value - action_value_mean
            # Q = V + (A - mean(A))
            action_value_out = state_value + action_value_center
        return action_value_out
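The dueling head combines the two streams as Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a)), which is what the last lines of `forward` compute. A tiny numeric check (the values are made up):

```python
import torch

advantage   = torch.tensor([[1.0, 2.0, 3.0]])   # A(s, .) for 3 actions
state_value = torch.tensor([[0.5]])             # V(s)

# same aggregation as in net.forward
q = state_value + (advantage - advantage.mean(dim=1, keepdim=True))
print(q)  # tensor([[-0.5000,  0.5000,  1.5000]])
```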
