百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（TensorFlow 1.13.2）

飞桨BML 全功能AI开发平台

简介/价格/文档

百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（TensorFlow 1.13.2）

文档简介：

TensorFlow 基于tensorflow框架的MNIST图像分类任务示例代码，训练数据集点击这里下载单机训练（计算节点数为1），示例代码如下： import os import tensorflow as tf import numpy as np from tensorflow import keras layers = tf.layers tf.logging.set_verbosity(tf.logging.INFO) def conv_model(feature, target, mode):

*此产品及展示信息均由百度智能云官方提供。免费试用咨询热线：400-826-7010，为您提供专业的售前咨询，让您快速了解云产品，助您轻松上云！微信咨询

免费试用、价格特惠

文档详情

TensorFlow

基于 TensorFlow 框架的 MNIST 图像分类任务示例代码，训练数据集请点击这里下载。

单机训练（计算节点数为1），示例代码如下：

import os
import shutil

import numpy as np
import tensorflow as tf
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model.

    Args:
        feature: Batch of flattened images; reshaped here to
            ``[-1, 28, 28, 1]``.
        target: Batch of class indices; cast to int32 and one-hot encoded
            with depth 10.
        mode: A ``tf.estimator.ModeKeys`` value. Dropout is only applied
            when it equals ``tf.estimator.ModeKeys.TRAIN``.

    Returns:
        A ``(predictions, loss)`` tuple: the argmax of the logits along
        axis 1 and the softmax cross-entropy loss tensor.
    """
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height, final dimension being the number of color
    # channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled ``(features, labels)`` mini-batches forever.

    The data is re-permuted at the start of every pass. Samples left over
    at the end of a pass that do not fill a whole batch are skipped for
    that pass.

    Args:
        x_train: Array of training features, indexable by a permutation.
        y_train: Array of labels with the same length as ``x_train``.
        batch_size: Number of samples per yielded batch.

    Yields:
        Tuples ``(x_batch, y_batch)``, each of length ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not in ``1..len(x_train)``.
            Without this check the generator would loop forever without
            ever yielding a batch.
    """
    assert len(x_train) == len(y_train)
    if not 0 < batch_size <= len(x_train):
        raise ValueError(
            'batch_size must be between 1 and len(x_train) (%d), got %r'
            % (len(x_train), batch_size))
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
            index += batch_size


def main(_):
    """Train the MNIST model on a single node and export a SavedModel.

    Reads ``train_data/mnist.npz`` under the current working directory,
    trains until global step 20000 with checkpoints in ``./checkpoints``,
    then restores the latest checkpoint and writes a SavedModel to
    ``./output``.
    """
    work_path = os.getcwd()

    # Load the MNIST dataset from the archive under the working directory.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape
    # it into (-1, 784) to feed into our network. Also, need to normalize
    # the features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    # NOTE(review): the graph is built in TRAIN mode and the same graph is
    # exported below, so dropout appears to stay active in the SavedModel.
    # Confirm whether that is intended for serving.
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    opt = tf.train.RMSPropOptimizer(0.001)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        tf.train.StopAtStepHook(last_step=20000),
        tf.train.LoggingTensorHook(
            tensors={'step': global_step, 'loss': loss}, every_n_iter=10),
    ]

    # Single-node run: let GPU memory grow on demand and expose only
    # device '0' to this process.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = '0'

    checkpoint_dir = './checkpoints'
    training_batch_generator = train_input_generator(
        x_train, y_train, batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Export: restore the latest checkpoint into a fresh session and save
    # it as a SavedModel with a classification signature.
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    saver = tf.train.Saver()
    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = tf.saved_model.signature_def_utils.build_signature_def(
        inputs={
            tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                inputs_classes,
        },
        outputs={
            tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                outputs_classes,
        },
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)

    # Remove any previous export so SavedModelBuilder starts from a clean
    # directory; a missing directory is not an error.
    shutil.rmtree('./output', ignore_errors=True)
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(
            tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()

分布式训练（计算节点数大于1），示例代码如下：

说明：该分布式示例程序没有对训练数据做分片操作（每个进程都会加载完整的训练集），仅供参考。

import os
import shutil

import horovod.tensorflow as hvd
import numpy as np
import tensorflow as tf
from tensorflow import keras

layers = tf.layers

tf.logging.set_verbosity(tf.logging.INFO)


def conv_model(feature, target, mode):
    """2-layer convolution model.

    Args:
        feature: Batch of flattened images; reshaped here to
            ``[-1, 28, 28, 1]``.
        target: Batch of class indices; cast to int32 and one-hot encoded
            with depth 10.
        mode: A ``tf.estimator.ModeKeys`` value. Dropout is only applied
            when it equals ``tf.estimator.ModeKeys.TRAIN``.

    Returns:
        A ``(predictions, loss)`` tuple: the argmax of the logits along
        axis 1 and the softmax cross-entropy loss tensor.
    """
    # Convert the target to a one-hot tensor of shape (batch_size, 10) and
    # with an on-value of 1 for each one-hot vector of length 10.
    target = tf.one_hot(tf.cast(target, tf.int32), 10, 1, 0)

    # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
    # image width and height, final dimension being the number of color
    # channels.
    feature = tf.reshape(feature, [-1, 28, 28, 1])

    # First conv layer will compute 32 features for each 5x5 patch.
    with tf.variable_scope('conv_layer1'):
        h_conv1 = layers.conv2d(feature, 32, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool1 = tf.nn.max_pool(
            h_conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Second conv layer will compute 64 features for each 5x5 patch.
    with tf.variable_scope('conv_layer2'):
        h_conv2 = layers.conv2d(h_pool1, 64, kernel_size=[5, 5],
                                activation=tf.nn.relu, padding="SAME")
        h_pool2 = tf.nn.max_pool(
            h_conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
        # Reshape tensor into a batch of vectors.
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])

    # Densely connected layer with 1024 neurons.
    h_fc1 = layers.dropout(
        layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu),
        rate=0.5, training=mode == tf.estimator.ModeKeys.TRAIN)

    # Compute logits (1 per class) and compute loss.
    logits = layers.dense(h_fc1, 10, activation=None)
    loss = tf.losses.softmax_cross_entropy(target, logits)

    return tf.argmax(logits, 1), loss


def train_input_generator(x_train, y_train, batch_size=64):
    """Yield shuffled ``(features, labels)`` mini-batches forever.

    The data is re-permuted at the start of every pass. Samples left over
    at the end of a pass that do not fill a whole batch are skipped for
    that pass.

    Args:
        x_train: Array of training features, indexable by a permutation.
        y_train: Array of labels with the same length as ``x_train``.
        batch_size: Number of samples per yielded batch.

    Yields:
        Tuples ``(x_batch, y_batch)``, each of length ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not in ``1..len(x_train)``.
            Without this check the generator would loop forever without
            ever yielding a batch.
    """
    assert len(x_train) == len(y_train)
    if not 0 < batch_size <= len(x_train):
        raise ValueError(
            'batch_size must be between 1 and len(x_train) (%d), got %r'
            % (len(x_train), batch_size))
    while True:
        p = np.random.permutation(len(x_train))
        x_train, y_train = x_train[p], y_train[p]
        index = 0
        while index <= len(x_train) - batch_size:
            yield (x_train[index:index + batch_size],
                   y_train[index:index + batch_size])
            index += batch_size


def main(_):
    """Train the MNIST model with Horovod and export a SavedModel on rank 0.

    Every process reads ``train_data/mnist.npz`` under its current working
    directory and trains until global step ``10000 // hvd.size()``. Only
    rank 0 writes checkpoints to ``./checkpoints`` and, after training,
    exports a SavedModel to ``./output``; all other ranks return once
    training ends.
    """
    # Horovod: initialize Horovod.
    hvd.init()

    work_path = os.getcwd()

    # Load the MNIST dataset from the archive under the working directory.
    (x_train, y_train), (x_test, y_test) = \
        keras.datasets.mnist.load_data('%s/train_data/mnist.npz' % work_path)

    # The shape of downloaded data is (-1, 28, 28), hence we need to reshape
    # it into (-1, 784) to feed into our network. Also, need to normalize
    # the features between 0 and 1.
    x_train = np.reshape(x_train, (-1, 784)) / 255.0
    x_test = np.reshape(x_test, (-1, 784)) / 255.0

    # Build model...
    with tf.name_scope('input'):
        image = tf.placeholder(tf.float32, [None, 784], name='image')
        label = tf.placeholder(tf.float32, [None], name='label')
    # NOTE(review): the graph is built in TRAIN mode and this graph is what
    # gets exported below, so dropout appears to stay active in the
    # SavedModel. Confirm whether that is intended for serving.
    predict, loss = conv_model(image, label, tf.estimator.ModeKeys.TRAIN)

    # Snapshot the graph before any optimizer ops are added; it is
    # re-imported after training to build the serving export.
    serve_graph_file = "./serve_graph.meta"
    tf.train.export_meta_graph(serve_graph_file, as_text=True)

    # Horovod: adjust learning rate based on number of GPUs.
    opt = tf.train.RMSPropOptimizer(0.001 * hvd.size())

    # Horovod: add Horovod Distributed Optimizer.
    opt = hvd.DistributedOptimizer(opt)

    global_step = tf.train.get_or_create_global_step()
    train_op = opt.minimize(loss, global_step=global_step)

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable
        # states from rank 0 to all other processes. This is necessary to
        # ensure consistent initialization of all workers when training is
        # started with random weights or restored from a checkpoint.
        hvd.BroadcastGlobalVariablesHook(0),

        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=10000 // hvd.size()),

        tf.train.LoggingTensorHook(
            tensors={'step': global_step, 'loss': loss}, every_n_iter=10),
    ]

    # Horovod: pin GPU to be used to process local rank (one GPU per
    # process).
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(hvd.local_rank())

    # Horovod: save checkpoints only on worker 0 to prevent other workers
    # from corrupting them.
    checkpoint_dir = './checkpoints' if hvd.rank() == 0 else None
    training_batch_generator = train_input_generator(
        x_train, y_train, batch_size=100)

    # The MonitoredTrainingSession takes care of session initialization,
    # restoring from a checkpoint, saving to a checkpoint, and closing when
    # done or an error occurs.
    with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                           hooks=hooks,
                                           config=config) as mon_sess:
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            image_, label_ = next(training_batch_generator)
            mon_sess.run(train_op, feed_dict={image: image_, label: label_})

    # Only rank 0 has checkpoints, so only rank 0 exports the model.
    if hvd.rank() != 0:
        return

    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)

    # Rebuild the graph from the snapshot taken before the optimizer was
    # added, and get a saver for it.
    tf.reset_default_graph()
    saver = tf.train.import_meta_graph(serve_graph_file)

    inputs_classes = tf.saved_model.utils.build_tensor_info(image)
    outputs_classes = tf.saved_model.utils.build_tensor_info(predict)
    signature = tf.saved_model.signature_def_utils.build_signature_def(
        inputs={
            tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                inputs_classes,
        },
        outputs={
            tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                outputs_classes,
        },
        method_name=tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME)

    # Remove any previous export so SavedModelBuilder starts from a clean
    # directory; a missing directory is not an error.
    shutil.rmtree('./output', ignore_errors=True)
    with tf.Session() as sess:
        sess.run([tf.local_variables_initializer(), tf.tables_initializer()])
        saver.restore(sess, checkpoint_file)
        builder = tf.saved_model.builder.SavedModelBuilder('./output')
        legacy_init_op = tf.group(
            tf.tables_initializer(), name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess,
            [tf.saved_model.tag_constants.SERVING],
            signature_def_map={'predict_images': signature},
            legacy_init_op=legacy_init_op)
        builder.save()


if __name__ == "__main__":
    tf.app.run()

相似文档

百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（TensorFlow 2.3.0）
TensorFlow 基于tensorflow框架的MNIST图像分类任务示例代码，训练数据集点击这里下载单机训练（计算节点数为1），示例代码如下： """ tf train demo """ import tensorflow as tf import os mnist = tf.keras.datasets.mnist work_path = os.getcwd() (x_train, y_train), (x_test, y_test) = mnist.load_data('%s/train_data/mnist.npz' % work_path)
百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（Blackhole 1.0.0）
Blackhole是百度自研的高性能数据科学引擎，CodeLab中内嵌了该引擎。通过异构加速计算、超大数据处理、高效数据存储等技术，单机Blackhole在数据分析和机器学习等场景相比开源Pandas/Sklearn性能可提升7倍以上、拥有TB级的单机超大数据处理能力，同时提供和Pandas、Sklearn基本一致的易用接口。
百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（Pytorch 1.7.1）
训练代码基于Pytorch框架的MNIST图像分类示例代码，数据集请点击这里下载。单机训练时（计算节点等于1），示例代码如下： import argparse import torch import torch.nn as nn import torch.nn.functional as F import torch.optim as optim import torch.utils.data as data from torchvision import transforms import codecs import errno import gzip import numpy as np import os from PIL import Image
百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（Sklearn 0.23.2）
sklearn框架下，自定义作业支持发布保存模型为pickle和joblib格式，并且在发布至模型仓库时需要选择相应的模型文件。使用下面代码进行模型训练时，训练程序可以自行加载数据，训练数据选择空文件夹即可。
百度智能云全功能AI开发平台BML自定义作业建模 - 训练作业代码示例（XGBoost 1.3.1）
XGBoost框架下，自定义作业支持发布保存模型为pickle和joblib格式，并且在发布至模型仓库时需要选择相应的模型文件。使用下面代码进行模型训练时，训练程序可以自行加载数据，训练数据选择空文件夹即可。

文档中心

全民上云·上云补贴申领

免费试用（限企业）

TensorFlow