#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" Hyperparameter search meta module - hyperparams.py - `DeepCV`__
.. moduleauthor:: Paul-Emmanuel Sotir
# To-Do List
- TODO: For hyperparameter embedding: read about graph embedding techniques like: https://github.com/google-research/google-research/tree/master/graph_embedding/ddgk and https://github.com/google-research/google-research/tree/master/graph_embedding/watch_your_step
"""
import logging
from typing import Sequence, Iterable, Callable, Dict, Tuple, Any, Union, Optional, List, Type

import mlflow
import networkx
import numpy as np
import scipy.optimize
import torch
import torch.nn
from torch.utils.data import DataLoader, Dataset

import deepcv.utils
from .nn import get_model_capacity
from .data import training_metadata
from .types_aliases import *

__all__ = ['Hyperparameters', 'HyperparameterSpace', 'HyperparamsEmbedding', 'GeneralizationAcrossScalesPredictor', 'to_hyperparameters', 'merge_hyperparameters']
__author__ = 'Paul-Emmanuel Sotir'
Hyperparameters = training_metadata.Hyperparameters
HyperparameterSpace = training_metadata.HyperparameterSpace


class HyperparamsEmbedding(torch.nn.Module):
    """ Hyperparameter dict embedding module.
    Given a hyperparameter space (`hp_space`, usually the space sampled during a hyperopt hyperparameter search), converts an input hyperparameter dict into a vectorized representation.
    Applied to a valid hp dict, it returns a fixed-size vector embedding which can be interpreted as a vector in a Euclidean space (the final neural-net layers are constrained through the loss to output embeddings in a Euclidean-like space).
    """

    def __init__(self, embedding_size: int, intermediate_embedding_size: Optional[int] = 128, hp_space: HyperparameterSpace = None):
        super().__init__()
        self.embedding_size = embedding_size
        self._intermediate_embedding_size = max(embedding_size + 32, intermediate_embedding_size or 0)
        self._hp_space = hp_space
        # Define a simple shallow neural net used to obtain the final hyperparameter embedding in an appropriate space (a Euclidean space is easier to interpret than the space of `_from_hp_space(hp)` vectors)
        mean_embedding_size = (self._intermediate_embedding_size + self.embedding_size) // 2
        linear1 = torch.nn.Linear(in_features=self._intermediate_embedding_size + 1, out_features=mean_embedding_size)  # + 1 for hp dict hash input
        linear2 = torch.nn.Linear(in_features=mean_embedding_size, out_features=min(mean_embedding_size, self.embedding_size * 2))
        linear3 = torch.nn.Linear(in_features=min(mean_embedding_size, self.embedding_size * 2), out_features=self.embedding_size)
        self._net = torch.nn.Sequential(linear1, torch.nn.ReLU(), linear2, torch.nn.ReLU(), linear3, torch.nn.Softmax(dim=-1))
    def fit(self, hp_dicts):
        # Unsupervised training to learn a 3-layer fully-connected NN for hp embedding in a Euclidean-like space
        raise NotImplementedError

    def _from_hp_space(self, hp: Dict[str, Any], _hp_repr: Optional[np.ndarray] = None):
        # TODO: parse hp space and append as many binary representations of each relative position in hp_space ranges/choices as possible (so that the whole repr fits in 'embedding_size')
        # TODO: refactor this code (use NNI's hp_space generated YAML file instead of hyperopt)
        # for node in hyperopt.vectorize.uniq(hyperopt.vectorize.toposort(self._hp_space)):
        #     if isinstance(node, hyperopt.tpe.pyll.rec_eval):
        #         pass
        if _hp_repr is None:
            # First call of the recursion over the hp_space dict
            _hp_repr = np.array([])
        for n, v in hp.items():
            if isinstance(v, dict):
                _hp_repr = np.append(_hp_repr, self._from_hp_space(v, _hp_repr))
            elif True or isinstance(v, ...):
                raise NotImplementedError
                self._hp_space[n]
                _hp_repr = np.append(_hp_repr, ...)
        return torch.from_numpy(_hp_repr)
    def forward(self, hp: Hyperparameters) -> torch.Tensor:
        # Parse hp_space to deduce the hyperparameter dict's position in hp_space: returns an array of relative positions (or None value(s) if hp_space defines choice(s)/range(s) which are not present in the input hp dict)
        if self._hp_space is not None:
            hp_repr = self._from_hp_space(hp)
            # Concat binary representations of all relative positions of hp_repr into a vector of size 'embedding_size'
            bits_per_position = 32 * self._intermediate_embedding_size // len(hp_repr)  # TODO: replace 32 with sizeof(int)
            intermediate_embedding = np.ndarray((self._intermediate_embedding_size,), dtype=np.float32)
            for i, pos in enumerate(hp_repr):
                # TODO: refactor this
                # highest_bit_idx = np.floor(np.log(pos, 2))
                # bit_pos_repr = pos & (sum(np.power(2, k) for k in range(bits_per_position)) << (
                #     highest_bit_idx - bits_per_position))  # TODO: fix it by making sure that pos is a positive integer
                j = np.mod(i * bits_per_position, 32)
                # TODO: handle bit-level offset and consequences
                intermediate_embedding[j]
            # Concat the obtained embedding vector with the hp dict hash and feed it to the FC NN (input size is `_intermediate_embedding_size + 1`)
            # TODO: append an embedding of hp graph topology to the intermediate embedding using networkx spectral embedding of hp_space nodes
            net_input = torch.cat((torch.from_numpy(intermediate_embedding), torch.tensor([float(hash(hp))])))
            return self._net(net_input)
    def _topologic_hp_embedding(self, hp: Dict[str, Any], topo_embedding_size=32):
        # TODO: refactor this code (use NNI's hp_space generated YAML file instead of hyperopt)
        G = networkx.DiGraph()
        # nodes = hyperopt.vectorize.dfs(expr)
        nodes_dict = networkx.spectral_layout(G, center=(0, 0), dim=2)
        topo_embedding = np.array([], dtype=np.float32)
        for n, v in nodes_dict.items():
            raise NotImplementedError
            # topo_embedding.append()
        return topo_embedding
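

# The sketch below illustrates the encoding idea described in `HyperparamsEmbedding._from_hp_space`'s TODOs:
# mapping a hyperparameter value to its relative position within its search-space range/choices and packing that
# position into a fixed number of bits. It is a hypothetical, standalone helper (not part of DeepCV's API), given
# here only to clarify the intended representation; the function name and bit width are assumptions.
def _example_relative_position_bits(value: float, low: float, high: float, bits: int = 8) -> np.ndarray:
    """ Returns a `bits`-sized float32 vector of {0., 1.} encoding `value`'s relative position within `[low, high]`. """
    relative_pos = 0. if high <= low else (value - low) / (high - low)  # Relative position in [0, 1]
    quantized = int(round(relative_pos * (2 ** bits - 1)))  # Quantize the relative position over `bits` bits
    return np.array([float((quantized >> k) & 1) for k in reversed(range(bits))], dtype=np.float32)

# E.g., `_example_relative_position_bits(3e-4, low=1e-5, high=1e-2, bits=8)` yields an 8-bit binary vector which
# could be concatenated with other hyperparameters' encodings to build the intermediate embedding fed to `_net`.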


class GeneralizationAcrossScalesPredictor(torch.nn.Module):
    """ GeneralizationAcrossScalesPredictor
    Improved implementation of [a constructive prediction of the generalization error across scales](https://arxiv.org/pdf/1909.12673.pdf), which can optionally combine the paper's model with a two-layer fully-connected neural net to better predict the validation loss.
    By default, the validation error is predicted by a least-squares regression of the `GeneralizationAcrossScalesPredictor.error_landscape_estimation` envelope function, which depends on a few parameters (at most 6 parameters to fit).
    This lightweight model allows predicting a model's best-validation-loss landscape from very few training examples. This can be useful, for example, to speed up hyperparameter search by training a model a few times on small trainset subsets and estimating how the hyperparameters would perform when training on the full trainset.
    The model also takes into account the influence of simple model-capacity changes on the validation error landscape if the regression is done on varying (model capacity, dataset size, best validation error) triplets.
    We modified the validation-error-landscape model to reduce its parameter count when model capacity or dataset size doesn't change across the given training results, so that fewer training results are needed to accurately regress the validation error landscape, depending on how you use this model.
    Moreover, the optional fully-connected neural net can improve these validation-error predictions across more diverse setups if given enough training data. This additional model can eventually take various dataset statistics, hyperparameter embeddings and training loss curves (not only best valid/train losses) as input.
    But in order to make efficient use of the fully-connected model, you will need to fit it on many more training results than the basic/lightweight model.
    # TODO: scale neural net model based on how much training data is available
    # TODO: predict best losses with their respective uncertainty in FC NN
    """
    def __init__(self, trainings_count: int, fit_using_hps: HyperparamsEmbedding = None, fit_using_dataset_stats: bool = False, fit_using_loss_curves: bool = False):
        """
        Args:
            - trainings_count: Number of trainings which will be performed to predict the generalization capability of a model (trainings differ from each other in scale, either by having different trainset sizes (across subsets) or different model capacities)
            - fit_using_hps: Optional `HyperparamsEmbedding` used to feed a hyperparameter embedding to the additional NN meta-model
            - fit_using_dataset_stats: Whether to feed dataset statistics to the additional NN meta-model
            - fit_using_loss_curves: Whether to feed training/validation loss curves to the additional NN meta-model
        """
        super().__init__()
        self._fit_using_hps = fit_using_hps
        self._fit_using_dataset_stats = fit_using_dataset_stats
        self._fit_using_loss_curves = fit_using_loss_curves
        # We initialize the (α, eps0, c∞, η, β, b) parameters to regression results over the CIFAR10 dataset with a ResNet architecture (values resulting from https://arxiv.org/pdf/1909.12673.pdf experiments)
        self._leastsquares_params = np.array([0.66, 0.9, 7.14e-14, 19.77, 0.53, 5.87e-2])
        self._trainings_count = trainings_count
        self._use_additional_nn_model = any((fit_using_hps, fit_using_dataset_stats, fit_using_loss_curves))
        if self._use_additional_nn_model:
            # Define an additional meta-model which, combined with the previous lightweight regression model, predicts the generalization capability of a model over the full dataset
            # Default NN input data: best train loss, best valid loss, trainset size and model size for each trainset subset (4 * trainings_count) + the least-squares-fitted model's parameter vector (self._leastsquares_params) and its prediction (+1) + full dataset size and model parameter count (+2)
            self._input_size = 4 * self._trainings_count + len(self._leastsquares_params) + 3  # TODO: change NN input to be constant sized (not depending on subset count)?
            if self._fit_using_hps is not None:
                self._input_size += self._fit_using_hps.embedding_size  # + input hyperparameter dict embedding/distance-like hash fed to the NN
            if self._fit_using_dataset_stats:
                self._input_size += ...  # + input dataset stats like trainset/validset ratio, data shape, batch_size, mean/variance/quartiles/... of trainset targets, data-type  # TODO: see https://arxiv.org/pdf/1810.06305.pdf for interesting dataset embedding/stats
            if self._fit_using_loss_curves:
                self._input_size += ...  # + validation and training losses evaluated during training iterations
            linear1 = torch.nn.Linear(in_features=self._input_size, out_features=64)
            linear2 = torch.nn.Linear(in_features=64, out_features=2)  # Outputs valid and train losses
            for linear in (linear1, linear2):
                torch.nn.init.xavier_uniform_(linear.weight.data, gain=torch.nn.init.calculate_gain('tanh'))
                linear.bias.data.fill_(0.)
            self._nn_metamodel = torch.nn.Sequential(linear1, torch.nn.Tanh(), linear2, torch.nn.Tanh())
    @staticmethod
    def error_landscape_estimation(metaparms: np.ndarray, m: int = None, n: int = None) -> float:
        """ Envelope function modeling how the best validation loss varies according to model size (m) and trainset size (n).
        This simple model's parameters (metaparms) can be fitted on a few training results over trainset subsets (~5-6 subsets) in order to predict the model's generalization capability over the full trainset without having to train on the whole dataset.
        Thus, fitting this model during hyperparameter search can save time by estimating how promising a hyperparameter setup is.
        NOTE: If 'm' is None, model capacity is considered constant and the 'b'/'beta' terms are zeroed/ignored; if 'n' is None, then trainset size is considered constant and the 'a'/'alpha' terms are zeroed/ignored (this simplifies the model, as constant terms in 'emn' are redundant with the 'cinf' parameter).
        TODO: make a Bayesian estimate of the least-squares regression uncertainty of this model by choosing appropriate priors (+ combine this estimation with the NN's uncertainty)
        """
        eps0, cinf, eta = metaparms[0:3] if n is None else metaparms[1:4]
        emn = cinf
        if n is not None:
            a, alpha = 1., metaparms[0]  # 'a=1' because it is a redundant parameter: dividing 'emn' by 'a' is equivalent to replacing 'eta' with 'a * eta'
            emn += a * np.power(float(n), -alpha)
        if m is not None:
            beta, b = metaparms[-2:]
            emn += b * np.power(float(m), -beta)
        return eps0 * np.absolute(emn / (emn - eta * 1j))  # Complex absolute value, i.e. 2D L2 norm
    def fit_generalization(self, trainsets: Sequence[DataLoader], models: Sequence[torch.nn.Module], best_valid_losses: Sequence[FLOAT_OR_FLOAT_TENSOR_T], best_train_losses: Sequence[FLOAT_OR_FLOAT_TENSOR_T] = None):
        model_capacities = [get_model_capacity(m) for m in models]
        cst_modelsize = deepcv.utils.is_roughtly_constant(model_capacities)
        params = self._leastsquares_params
        if cst_modelsize:
            # If model capacity doesn't change, we can simplify the regression by removing the 'b' and 'beta' parameters (constant term which can be modeled by the 'cinf' parameter)
            params = params[:-2]
        trainset_sizes = [len(dl.dataset) for dl in trainsets]
        cst_datasize = deepcv.utils.is_roughtly_constant(trainset_sizes)
        if cst_datasize:
            # If trainset size doesn't change across results, we can simplify the regression by removing the 'alpha' parameter (constant term which can be modeled by the 'cinf' parameter)
            params = params[1:]

        # Fit the basic `error_landscape_estimation` model over subset training results using a least-squares regression of the error estimates' divergence
        def _error_landscape_divergence(metaparms: np.ndarray) -> List[float]:
            preds = [GeneralizationAcrossScalesPredictor.error_landscape_estimation(metaparms, None if cst_modelsize else m, None if cst_datasize else n)
                     for m, n in zip(model_capacities, trainset_sizes)]
            return [(pred - real) / real for pred, real in zip(preds, best_valid_losses)]

        rslt = scipy.optimize.least_squares(_error_landscape_divergence, x0=params, jac='3-point', bounds=(0., 200), method='dogbox', loss='soft_l1')
        self._leastsquares_params[1 if cst_datasize else None: -2 if cst_modelsize else None] = rslt.x  # TODO: smooth averaging window instead of pure update?

        # Additional online training of the fully-connected model for better validation-error-landscape prediction
        if self._use_additional_nn_model:
            raise NotImplementedError
            hhp = {'': ...}
            x = (trainset_sizes, model_capacities, best_valid_losses, best_train_losses)
            optimizer = torch.optim.RMSprop(self._nn_metamodel.parameters(), lr=hhp['lr'], weight_decay=hhp['weight_decay'], momentum=hhp['momentum'])
            # TODO: basic training procedure
            # TODO: online training considerations
            # TODO: create and train on a 'meta' dataset from MLflow?
    def forward(self, model: Union[torch.nn.Module, int], trainset: Union[DataLoader, int]) -> float:
        model_capacity = get_model_capacity(model) if isinstance(model, torch.nn.Module) else model
        trainset_size = trainset if isinstance(trainset, int) else len(trainset.dataset)
        estimation = GeneralizationAcrossScalesPredictor.error_landscape_estimation(self._leastsquares_params, model_capacity, trainset_size)

        if self._use_additional_nn_model:
            raise NotImplementedError
            # By default, our model takes the best train loss, best valid loss, model capacity and dataset size for each dataset subset, along with the full dataset size, model capacity and the 'error_landscape_estimation' model's parameter vector previously fitted by least-squares regression
            x = ...  # torch.Tensor([estimation, *self._leastsquares_params, model_capacity, trainset_size, subsets_results])
            if self._fit_using_hps:
                # TODO: process hyperparameters into a distance-like hash/embedding
                x = torch.cat((x, ...))
            if self._fit_using_dataset_stats:
                # TODO: process trainset stats and feed them to our model
                x = torch.cat((x, ...))
            if self._fit_using_loss_curves:
                # TODO: append loss curves resulting from trainset and validset evaluations to the meta-model input
                x = torch.cat((x, ...))
            # We apply the fully-connected NN with a residual link from the fitted 'error_landscape_estimation' model's validation-error estimate to the NN's output
            return self._nn_metamodel(x) + torch.from_numpy(np.array([estimation, 0.]))
        return estimation
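

# Hedged usage sketch for `GeneralizationAcrossScalesPredictor`: fit the lightweight envelope on a few trainings
# performed on growing trainset subsets (constant model capacity here), then extrapolate the best validation loss
# to the full trainset size. The subset sizes, losses and dummy model below are made up for illustration and assume
# the project utilities imported above (`get_model_capacity`, `deepcv.utils.is_roughtly_constant`) behave as they
# are used elsewhere in this module.
def _example_generalization_prediction() -> float:
    from torch.utils.data import TensorDataset

    subset_sizes = [500, 1000, 2000, 4000, 8000]
    best_valid_losses = [1.9, 1.5, 1.2, 1.0, 0.85]  # Hypothetical best validation losses for each subset training
    model = torch.nn.Sequential(torch.nn.Linear(32, 64), torch.nn.ReLU(), torch.nn.Linear(64, 10))
    trainsets = [DataLoader(TensorDataset(torch.randn(n, 32))) for n in subset_sizes]

    predictor = GeneralizationAcrossScalesPredictor(trainings_count=len(subset_sizes))
    predictor.fit_generalization(trainsets, [model] * len(subset_sizes), best_valid_losses)
    return predictor(model, 50000)  # Predicted best validation loss for a hypothetical full 50000-sample trainset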


def to_hyperparameters(hp: HYPERPARAMS_T, defaults: HYPERPARAMS_T = None, raise_if_missing: bool = True, drop_keys_not_in_defaults: bool = False) -> Union[Hyperparameters, Tuple[Hyperparameters, List[str]]]:
    """ Converts the given parameter dict to a `deepcv.meta.hyperparams.Hyperparameters` object if needed. Also allows you to check required and default hyperparameters through the `defaults` argument (see `deepcv.meta.hyperparams.Hyperparameters.with_defaults` for more details).
    Args:
        - hp: Parameter dict or `deepcv.meta.hyperparams.Hyperparameters` object
        - defaults: Optional argument specifying required and default (hyper)parameter(s) (see `deepcv.meta.hyperparams.Hyperparameters.with_defaults` for more details)
        - raise_if_missing: Boolean indicating whether this function should raise an exception if `defaults` specifies mandatory (hyper)parameters which are missing from `hp`
        - drop_keys_not_in_defaults: Boolean indicating whether to drop/remove entries of `hp` which are not specified in `defaults` (removes unknown params)
    Returns the resulting `deepcv.meta.hyperparams.Hyperparameters` object with the provided defaults and, if `defaults` is provided, also returns the list of hyperparameters which are missing according to `defaults`.
    """
    if not isinstance(hp, Hyperparameters):
        hp = Hyperparameters(**hp)
    if defaults is not None:
        hp, missing = hp.with_defaults(defaults, drop_keys_not_in_defaults=drop_keys_not_in_defaults)
        if len(missing) > 0:
            msg = f'Error: Missing mandatory (hyper)parameter(s) (missing="{missing}").'
            logging.error(msg)
            if raise_if_missing:
                raise ValueError(msg)
        return hp, missing
    return hp
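
# Hedged example (hypothetical parameter names; the exact convention used by `defaults` to mark required entries is
# defined by `deepcv.meta.data.training_metadata.Hyperparameters.with_defaults`):
#   hp, missing = to_hyperparameters({'lr': 1e-3}, defaults={'lr': 1e-3, 'batch_size': 32}, raise_if_missing=False)
#   # `hp` now contains 'lr' plus the defaulted 'batch_size'; `missing` lists any mandatory keys absent from the input.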


def merge_hyperparameters(*dicts: Iterable[Dict[str, Any]]) -> Hyperparameters:
    """ Utility function which merges the given dictionaries into a `hyperparams.Hyperparameters` instance. """
    merged = deepcv.utils.merge_dicts(*dicts)
    return Hyperparameters(**merged)
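
# E.g., `merge_hyperparameters({'optimizer': {'lr': 1e-3}}, {'batch_size': 32})` would return a single
# `Hyperparameters` instance built from the merged dict (hypothetical keys, shown for illustration only).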


#_______________________________________________ HYPERPARAMS UNIT TESTS _______________________________________________#


if __name__ == '__main__':
    cli = deepcv.utils.import_tests().test_module_cli(__file__)
    cli()