# train.py
# 'Regression of 3D Sky Map to Cosmological Parameters (CosmoFlow)'
# Copyright (c) 2018, The Regents of the University of California,
# through Lawrence Berkeley National Laboratory (subject to receipt of any
# required approvals from the U.S. Dept. of Energy). All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# If you have questions about your rights to use or distribute this software,
# please contact Berkeley Lab's Innovation & Partnerships Office at IPO@lbl.gov.
#
# NOTICE. This Software was developed under funding from the U.S. Department of
# Energy and the U.S. Government consequently retains certain rights. As such,
# the U.S. Government has been granted for itself and others acting on its
# behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software
# to reproduce, distribute copies to the public, prepare derivative works, and
# perform publicly and display publicly, and to permit other to do so.
"""
Main training script for the CosmoFlow Keras benchmark
"""
# System imports
import os
import argparse
import logging
import pickle
from types import SimpleNamespace
# External imports
import yaml
import pandas as pd
import tensorflow as tf
# Suppress TF warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(logging.ERROR)
import horovod.tensorflow.keras as hvd
import wandb
# MLPerf logging
try:
from mlperf_logging import mllog
have_mlperf_logging = True
except ImportError:
have_mlperf_logging = False
# Local imports
from data import get_datasets
from models import get_model
# Fix for loading Lambda layer checkpoints
from models.layers import *
from utils.optimizers import get_optimizer, get_lr_schedule
from utils.callbacks import (TimingCallback, MLPerfLoggingCallback,
StopAtTargetCallback)
from utils.device import configure_session
from utils.argparse import ReadYaml
from utils.checkpoints import reload_last_checkpoint
from utils.mlperf_logging import configure_mllogger, log_submission_info
def parse_args():
    """Parse command line arguments.

    The only positional argument is the path of the YAML configuration file
    (default ``configs/cosmo.yaml``). The "override" options have no default,
    so they are ``None`` unless given on the command line.

    Returns:
        argparse.Namespace: the parsed options, read from ``sys.argv``.
    """
    parser = argparse.ArgumentParser('train.py')
    add_arg = parser.add_argument
    add_arg('config', nargs='?', default='configs/cosmo.yaml')
    add_arg('--output-dir', help='Override output directory')
    add_arg('--run-tag', help='Unique run tag for logging')

    # Override data settings
    add_arg('--data-dir', help='Override the path to input files')
    add_arg('--n-train', type=int, help='Override number of training samples')
    add_arg('--n-valid', type=int, help='Override number of validation samples')
    add_arg('--batch-size', type=int, help='Override the batch size')
    add_arg('--n-epochs', type=int, help='Override number of epochs')
    add_arg('--apply-log', type=int, choices=[0, 1], help='Apply log transform to data')
    add_arg('--stage-dir', help='Local directory to stage data to before training')
    add_arg('--n-parallel-reads', type=int, help='Override num parallel read calls')
    add_arg('--prefetch', type=int, help='Override data prefetch number')

    # Hyperparameter settings
    add_arg('--conv-size', type=int, help='CNN size parameter')
    add_arg('--fc1-size', type=int, help='Fully-connected size parameter 1')
    add_arg('--fc2-size', type=int, help='Fully-connected size parameter 2')
    add_arg('--hidden-activation', help='Override hidden activation function')
    add_arg('--dropout', type=float, help='Override dropout')
    add_arg('--optimizer', help='Override optimizer type')
    add_arg('--lr', type=float, help='Override learning rate')

    # Runtime / device settings
    add_arg('-d', '--distributed', action='store_true')
    add_arg('--gpu', type=int, help='Specify a specific GPU number to use')
    add_arg('--rank-gpu', action='store_true',
            help='Use GPU based on local rank')
    add_arg('--resume', action='store_true',
            help='Resume from last checkpoint')
    add_arg('--intra-threads', type=int, default=32,
            help='TF intra-parallel threads')
    add_arg('--inter-threads', type=int, default=2,
            help='TF inter-parallel threads')
    add_arg('--kmp-blocktime', help='Set KMP_BLOCKTIME')
    add_arg('--kmp-affinity', help='Set KMP_AFFINITY')
    add_arg('--omp-num-threads', help='Set OMP_NUM_THREADS')
    add_arg('--amp', action='store_true', help='Enable automatic mixed precision')

    # Other settings
    add_arg('--seed', type=int, default=0, help='Specify the random seed')
    # argparse %-formats help strings, so the literal percent sign has to be
    # written as '%%'; a bare '%' breaks the rendering of --help.
    add_arg('--deterministic-ops', action='store_true',
            help='Enable TF deterministic ops (may not be 100%% deterministic)')
    add_arg('--mlperf', action='store_true', help='Enable MLPerf logging')
    add_arg('--wandb', action='store_true', help='Enable W&B logging')
    add_arg('--tensorboard', action='store_true', help='Enable TB logger')
    add_arg('--print-fom', action='store_true', help='Print parsable figure of merit')
    add_arg('-v', '--verbose', action='store_true')
    return parser.parse_args()
def init_workers(distributed=False):
    """Describe this process's position within the training job.

    Args:
        distributed: when true, Horovod is initialized and queried;
            otherwise a single-worker layout is returned without touching
            Horovod.

    Returns:
        SimpleNamespace with ``rank``, ``size``, ``local_rank`` and
        ``local_size`` attributes.
    """
    if not distributed:
        # Non-distributed run: exactly one worker, globally and locally
        return SimpleNamespace(rank=0, size=1, local_rank=0, local_size=1)

    hvd.init()
    layout = {
        'rank': hvd.rank(),
        'size': hvd.size(),
        'local_rank': hvd.local_rank(),
        'local_size': hvd.local_size(),
    }
    return SimpleNamespace(**layout)
def config_logging(verbose):
    """Configure the root logger through ``logging.basicConfig``.

    Args:
        verbose: truthy selects the DEBUG level, anything else INFO.
    """
    threshold = logging.INFO
    if verbose:
        threshold = logging.DEBUG
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=threshold)
def load_config(args):
    """Read the YAML config file and apply the command-line overrides.

    Args:
        args: parsed command-line namespace. ``args.config`` is the path of
            the YAML file; every other attribute read here replaces one
            configuration entry when it is not None.

    Returns:
        The configuration dictionary loaded from the file, with
        ``output_dir`` passed through ``os.path.expandvars`` and all given
        overrides written into their sections.
    """
    with open(args.config) as stream:
        config = yaml.load(stream, Loader=yaml.FullLoader)

    # Expand paths; a command-line output directory wins over the file's
    chosen_dir = args.output_dir
    if chosen_dir is None:
        chosen_dir = config['output_dir']
    config['output_dir'] = os.path.expandvars(chosen_dir)

    # (argument attribute, config section, config key), in application order
    overrides = (
        # Data settings
        ('data_dir', 'data', 'data_dir'),
        ('n_train', 'data', 'n_train'),
        ('n_valid', 'data', 'n_valid'),
        ('batch_size', 'data', 'batch_size'),
        ('n_epochs', 'data', 'n_epochs'),
        ('apply_log', 'data', 'apply_log'),
        ('stage_dir', 'data', 'stage_dir'),
        ('n_parallel_reads', 'data', 'n_parallel_reads'),
        ('prefetch', 'data', 'prefetch'),
        # Hyperparameters
        ('conv_size', 'model', 'conv_size'),
        ('fc1_size', 'model', 'fc1_size'),
        ('fc2_size', 'model', 'fc2_size'),
        ('hidden_activation', 'model', 'hidden_activation'),
        ('dropout', 'model', 'dropout'),
        ('optimizer', 'optimizer', 'name'),
        ('lr', 'optimizer', 'lr'),
    )
    for attr, section, key in overrides:
        value = getattr(args, attr)
        if value is None:
            # Option not given: keep whatever the file specifies
            continue
        if attr == 'apply_log':
            # This flag is stored in the config as a boolean
            value = bool(value)
        config[section][key] = value
    return config
def save_config(config):
    """Pickle the configuration into its own output directory.

    Args:
        config: configuration dictionary; ``config['output_dir']`` names the
            directory that receives the ``config.pkl`` file.
    """
    pickle_path = os.path.join(config['output_dir'], 'config.pkl')
    logging.info('Writing config via pickle to %s', pickle_path)
    with open(pickle_path, 'wb') as stream:
        pickle.dump(config, stream)
def load_history(output_dir):
    """Load ``history.csv`` from ``output_dir`` as a pandas DataFrame."""
    history_path = os.path.join(output_dir, 'history.csv')
    return pd.read_csv(history_path)
def print_training_summary(output_dir, print_fom):
    """Log a summary of the training history stored in ``output_dir``.

    When the history has a ``val_loss`` column, every column's value at the
    row with the smallest ``val_loss`` is logged. The total and mean of the
    ``time`` column are always logged.

    Args:
        output_dir: directory holding the history file read by
            ``load_history``.
        print_fom: when truthy (and ``val_loss`` is present), also print the
            best ``val_loss`` to stdout as a ``FoM:`` line.
    """
    history = load_history(output_dir)
    columns = history.keys()

    if 'val_loss' in columns:
        best_row = history.val_loss.idxmin()
        logging.info('Best result:')
        for column in columns:
            logging.info(' %s: %g', column, history[column].loc[best_row])
        # Figure of merit printing for HPO parsing
        if print_fom:
            print('FoM:', history['val_loss'].loc[best_row])

    epoch_times = history.time
    logging.info('Total epoch time: %.3f', epoch_times.sum())
    logging.info('Mean epoch time: %.3f', epoch_times.mean())
def main():
    """Run one end-to-end training job.

    In order: parse the command line, initialize the workers, load the YAML
    configuration (with command-line overrides), configure logging, seeding,
    the optional MLPerf / W&B logging and the device session, build the
    datasets, build or reload the model, assemble the Keras callbacks, call
    ``model.fit`` and finally, on rank 0, print a summary read back from the
    history file in the output directory.
    """
    # Initialization
    args = parse_args()
    dist = init_workers(args.distributed)
    config = load_config(args)
    # Executed by every rank; exist_ok makes the repeated creation harmless
    os.makedirs(config['output_dir'], exist_ok=True)
    config_logging(verbose=args.verbose)
    logging.info('Initialized rank %i size %i local_rank %i local_size %i',
                 dist.rank, dist.size, dist.local_rank, dist.local_size)
    if dist.rank == 0:
        logging.info('Configuration: %s', config)

    # Random seeding
    tf.keras.utils.set_random_seed(args.seed)

    # Enable deterministic ops - should ensure single-gpu determinism but
    # doesn't seem to guarantee determinism with Horovod distributed training
    if args.deterministic_ops:
        tf.config.experimental.enable_op_determinism()

    # Setup MLPerf logging
    # The logger is created on every rank, but only rank 0 emits events below.
    # NOTE(review): `mllog` only exists when the guarded mlperf_logging import
    # at the top of the file succeeded - confirm --mlperf is never used
    # without that package installed.
    if args.mlperf:
        mllogger = configure_mllogger(config['output_dir'])
    if dist.rank == 0 and args.mlperf:
        mllogger.event(key=mllog.constants.CACHE_CLEAR)
        mllogger.start(key=mllog.constants.INIT_START)
        # NOTE(review): SEED is logged with start() while the other scalar
        # values below use event() - confirm this is intended.
        mllogger.start(key=mllog.constants.SEED, value=args.seed)
        # Scale logging for mlperf hpc metrics
        mllogger.event(key='number_of_ranks', value=dist.size)
        mllogger.event(key='number_of_nodes', value=(dist.size//dist.local_size))
        mllogger.event(key='accelerators_per_node', value=dist.local_size)

    # Initialize Weights & Biases logging
    if args.wandb and dist.rank == 0:
        # This function-level import makes `wandb` a local name of main(); it
        # is bound only when this branch runs. The WandbCallback further down
        # is guarded by the same two conditions.
        import wandb
        # NOTE(review): the run tag is passed as name, id and resume - confirm
        # that `resume` accepts this value with the installed wandb version.
        wandb.init(project='cosmoflow', name=args.run_tag, id=args.run_tag,
                   config=config, resume=args.run_tag)

    # Device and session configuration
    # --rank-gpu takes precedence over an explicit --gpu number; gpu stays
    # None when neither option is given.
    gpu = dist.local_rank if args.rank_gpu else args.gpu
    if gpu is not None:
        logging.info('Taking gpu %i', gpu)
    configure_session(gpu=gpu,
                      intra_threads=args.intra_threads,
                      inter_threads=args.inter_threads,
                      kmp_blocktime=args.kmp_blocktime,
                      kmp_affinity=args.kmp_affinity,
                      omp_num_threads=args.omp_num_threads)

    # Mixed precision
    if args.amp:
        logging.info('Enabling mixed float16 precision')
        tf.keras.mixed_precision.set_global_policy('mixed_float16')

    # Start MLPerf logging
    # Closes the INIT interval opened above and opens the RUN interval
    if dist.rank == 0 and args.mlperf:
        log_submission_info(**config.get('mlperf', {}))
        mllogger.end(key=mllog.constants.INIT_STOP)
        mllogger.start(key=mllog.constants.RUN_START)

    # Load the data
    data_config = config['data']
    if dist.rank == 0:
        logging.info('Loading data')
    datasets = get_datasets(dist=dist, **data_config)
    logging.debug('Datasets: %s', datasets)

    # Construct or reload the model
    if dist.rank == 0:
        logging.info('Building the model')
    train_config = config['train']
    initial_epoch = 0
    checkpoint_format = os.path.join(config['output_dir'], 'checkpoint-{epoch:03d}.h5')
    # Resuming requires both --resume and an existing epoch-1 checkpoint;
    # otherwise a fresh model is built and compiled.
    if args.resume and os.path.exists(checkpoint_format.format(epoch=1)):
        # Reload model from last checkpoint
        initial_epoch, model = reload_last_checkpoint(
            checkpoint_format, data_config['n_epochs'],
            distributed=args.distributed)
    else:
        # Build a new model
        model = get_model(**config['model'])
        # Configure the optimizer
        opt = get_optimizer(distributed=args.distributed,
                            **config['optimizer'])
        # Compile the model
        model.compile(optimizer=opt, loss=train_config['loss'],
                      metrics=train_config['metrics'])

    if dist.rank == 0:
        model.summary()

    # Save configuration to output directory
    if dist.rank == 0:
        # The number of ranks is recorded in the saved configuration
        config['n_ranks'] = dist.size
        save_config(config)

    # Prepare the callbacks
    if dist.rank == 0:
        logging.info('Preparing callbacks')
    callbacks = []
    if args.distributed:

        # Broadcast initial variable states from rank 0 to all processes.
        callbacks.append(hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        # Average metrics across workers
        callbacks.append(hvd.callbacks.MetricAverageCallback())

    # Learning rate decay schedule
    if 'lr_schedule' in config:
        # The schedule is given the global batch size: per-rank batch size
        # times the number of ranks
        global_batch_size = data_config['batch_size'] * dist.size
        callbacks.append(tf.keras.callbacks.LearningRateScheduler(
            get_lr_schedule(global_batch_size=global_batch_size,
                            **config['lr_schedule'])))

    # Timing
    timing_callback = TimingCallback()
    callbacks.append(timing_callback)

    # Checkpointing and logging from rank 0 only
    if dist.rank == 0:
        callbacks.append(tf.keras.callbacks.ModelCheckpoint(checkpoint_format))
        # When resuming, the existing history file is appended to
        callbacks.append(tf.keras.callbacks.CSVLogger(
            os.path.join(config['output_dir'], 'history.csv'), append=args.resume))
        if args.tensorboard:
            callbacks.append(tf.keras.callbacks.TensorBoard(
                os.path.join(config['output_dir'], 'tensorboard')))
        if args.mlperf:
            callbacks.append(MLPerfLoggingCallback())
        if args.wandb:
            callbacks.append(wandb.keras.WandbCallback())

    # Early stopping
    patience = train_config.get('early_stopping_patience', None)
    if patience is not None:
        callbacks.append(tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', min_delta=1e-5, patience=patience, verbose=1))

    # Stopping at specified target
    # NOTE(review): the callback is appended even when target_mae is None -
    # presumably it treats None as "no target"; verify in utils.callbacks.
    target_mae = train_config.get('target_mae', None)
    callbacks.append(StopAtTargetCallback(target_max=target_mae))

    if dist.rank == 0:
        logging.debug('Callbacks: %s', callbacks)

    # Train the model
    if dist.rank == 0:
        logging.info('Beginning training')
    # Keras verbosity 1 only for a verbose rank 0; every other case uses 2
    fit_verbose = 1 if (args.verbose and dist.rank==0) else 2
    model.fit(datasets['train_dataset'],
              steps_per_epoch=datasets['n_train_steps'],
              epochs=data_config['n_epochs'],
              validation_data=datasets['valid_dataset'],
              validation_steps=datasets['n_valid_steps'],
              callbacks=callbacks,
              initial_epoch=initial_epoch,
              verbose=fit_verbose)

    # Stop MLPerf timer
    if dist.rank == 0 and args.mlperf:
        mllogger.end(key=mllog.constants.RUN_STOP, metadata={'status': 'success'})

    # Print training summary
    if dist.rank == 0:
        print_training_summary(config['output_dir'], args.print_fom)

    # Print GPU memory
    #if gpu is not None:
    #    gpu_mem_info = tf.config.experimental.get_memory_info(f'GPU:{gpu}')
    #    logging.info('Peak GPU memory: %.2f GB', gpu_mem_info['peak'] / 1024 / 1024 / 1024)

    # Finalize
    if dist.rank == 0:
        logging.info('All done!')
# Run the training only when this file is executed as a script
if __name__ == '__main__':
    main()