Baidu AI Cloud Full-Featured AI Development Platform BML, Custom Job Modeling - Training Job Code Examples (TensorFlow 2.3.0)
Overview:
TensorFlow
Example code for an MNIST image classification task based on the TensorFlow framework. Click here to download the training dataset.
Single-node training (number of compute nodes is 1); example code follows:
"""
tf train demo
"""
import tensorflow as tf
import os
mnist = tf.keras.datasets.mnist
work_path = os.getcwd()
(x_train, y_train), (x_test, y_test) = mnist.load_data('%s/train_data/mnist.npz' % work_path)
TensorFlow
基于tensorflow框架的MNIST图像分类任务示例代码,训练数据集点击这里下载
单机训练(计算节点数为1),示例代码如下:
""" tf train demo """ import tensorflow as tf import os mnist = tf.keras.datasets.mnist work_path = os.getcwd() (x_train, y_train), (x_test, y_test) = mnist.load_data
('%s/train_data/mnist.npz' % work_path) x_train, x_test = x_train / 255.0,
x_test / 255.0 model = tf.keras.models.Sequential([ tf.keras.layers.Flatten(input_shape=(
28, 28)), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.2),
tf.keras.layers.Dense(10, activation='softmax') ]) model.compile(optimizer='adam',
loss='sparse_categorical_crossentropy', metrics=['accuracy']) model.fit(x_train,
y_train, epochs=5) model.evaluate(x_test, y_test, verbose=2) model.save('./output/')
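To check the result of a finished run, the following is a minimal verification sketch (not part of the BML demo) that reloads the SavedModel written by model.save above; it assumes the same working directory layout and the demo's './output/' path:

"""
verification sketch (assumption: run from the same working directory as training)
"""
import tensorflow as tf
import os

work_path = os.getcwd()
(_, _), (x_test, y_test) = tf.keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)
x_test = x_test / 255.0

# load_model restores the compiled state saved by model.save, so evaluate works directly.
model = tf.keras.models.load_model('./output/')
model.evaluate(x_test, y_test, verbose=2)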
Distributed training (number of compute nodes greater than 1); example code follows:
Note: this distributed demo does not shard the data across workers and is for reference only; a sharding sketch is given after the code.
""" tf horovod train demo """ import tensorflow as tf import horovod.tensorflow as hvd import os
# Horovod: initialize Horovod. hvd.init() # Horovod: pin GPU to be used
to process local rank (one GPU per process) gpus = tf.config.experimental.
list_physical_devices('GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu,
True) if gpus: tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
work_path = os.getcwd() (mnist_images, mnist_labels), _ = \ tf.keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path) dataset
= tf.data.Dataset.from_tensor_slices( (tf.cast(mnist_images[..., tf.newaxis] / 255.0,
tf.float32), tf.cast(mnist_labels, tf.int64)) ) dataset = dataset.repeat().shuffle
(10000).batch(128) mnist_model = tf.keras.Sequential([ tf.keras.layers.Conv2D(32,
[3, 3], activation='relu'), tf.keras.layers.Conv2D(64, [3, 3], activation='relu'),
tf.keras.layers.MaxPooling2D(pool_size=(2, 2)), tf.keras.layers.Dropout(0.25),
tf.keras.layers.Flatten(), tf.keras.layers.Dense(128, activation='relu'), tf.keras.layers.Dropout(0.5),
tf.keras.layers.Dense(10, activation='softmax') ]) loss = tf.losses.SparseCategoricalCrossentropy()
# Horovod: adjust learning rate based on number of GPUs. opt = tf.optimizers.Adam
(0.001 * hvd.size()) @tf.function def training_step(images, labels, first_batch): """ :param images: :param labels: :param first_batch: :return: """ with tf.GradientTape() as tape: probs = mnist_model(images, training=True)
loss_value = loss(labels, probs) # Horovod: add Horovod Distributed GradientTape.
tape = hvd.DistributedGradientTape(tape) grads = tape.gradient(loss_value, mnist_model.
trainable_variables) opt.apply_gradients(zip(grads, mnist_model.trainable_variables))
# Horovod: broadcast initial variable states from rank 0 to all other processes.
# This is necessary to ensure consistent initialization of all workers when
# training is started with random weights or restored from a checkpoint. #
# Note: broadcast should be done after the first gradient step to ensure optimizer #
initialization. if first_batch: hvd.broadcast_variables(mnist_model.variables, root_rank=0)
hvd.broadcast_variables(opt.variables(), root_rank=0) return loss_value # Horovod: adjust number
of steps based on number of GPUs. for batch, (images, labels) in enumerate(dataset.take
(10000 // hvd.size())): loss_value = training_step(images, labels, batch == 0) if batch %
10 == 0 and hvd.local_rank() == 0: print('Step #%d\tLoss: %.6f' % (batch, loss_value))
# Horovod: save model only on worker 0 to prevent other workers from # corrupting it.
if hvd.rank() == 0: mnist_model.save('./output/')
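As noted above, every worker in the demo iterates over the full dataset. Below is a minimal sketch of per-worker sharding using the standard tf.data Dataset.shard call (an addition for illustration, not part of the original demo) that could replace the dataset pipeline above:

# Keep only this worker's 1/hvd.size() slice so workers train on disjoint data.
dataset = tf.data.Dataset.from_tensor_slices(
    (tf.cast(mnist_images[..., tf.newaxis] / 255.0, tf.float32),
     tf.cast(mnist_labels, tf.int64))
)
# Shard before shuffle/batch so each worker shuffles only its own slice.
dataset = dataset.shard(hvd.size(), hvd.rank())
dataset = dataset.repeat().shuffle(10000).batch(128)

Sharding by hvd.rank() guarantees the per-worker shards are disjoint, and shuffling after sharding keeps each worker's shuffle buffer small.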