Here is a TensorFlow multi-GPU demo. It uses the MNIST dataset and can be run as-is. (tf1.8)
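Assuming the script is saved as mnist_multi_gpu.py (the filename is only for illustration), the GPU ids are passed as a comma-separated list via the -gpu flag, and the number of ids determines how many towers are built, e.g.:

python mnist_multi_gpu.py -gpu 0,1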
Note: operations that cross GPUs in TensorFlow seem to misbehave. For example, in the small snippet at the end of the code below, c != a + b (a cross-check sketch follows it). Answers are welcome!
import tensorflow as tf
import numpy as np
import os
import argparse


def arg_config():
    parser = argparse.ArgumentParser()
    parser.add_argument('-gpu', type=str, required=False, default='1,2')
    args = parser.parse_args()
    # config
    log_device_placement = True   # whether to log device placement
    allow_soft_placement = True   # if a requested device does not exist, let TF pick one automatically
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.90, allow_growth=True)
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu  # comma-separated GPU ids to use
    config = tf.ConfigProto(log_device_placement=log_device_placement,
                            allow_soft_placement=allow_soft_placement,
                            gpu_options=gpu_options)
    return args, config


def data_loader():
    mnist = tf.contrib.learn.datasets.load_dataset("mnist")
    train_data = mnist.train.images  # returns np.array
    train_data = np.reshape(train_data, newshape=(-1, 28, 28, 1))
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    # print(train_data.shape)
    # print(train_labels.shape)
    # print(train_labels.dtype)
    ds = tf.data.Dataset.from_tensor_slices((train_data.astype(np.float32), train_labels))
    ds = ds.repeat().batch(2000)
    it = ds.make_one_shot_iterator()
    dt, lb = it.get_next()
    return dt, lb


# multi-gpu try
class Model(object):
    def __init__(self):
        pass

    @staticmethod
    def var_on_cpu(name, shape, initializer=tf.truncated_normal_initializer(stddev=0.1)):
        # Keep the shared parameters on the CPU so every GPU tower reads the same variables.
        with tf.device('/cpu:0'):
            var = tf.get_variable(name, shape, tf.float32, initializer)
        return var

    def conv2d(self, inputs, filters, kernel_size, strides=(1, 1), name=None):
        kernel = self.var_on_cpu(name + '/kernel', shape=kernel_size + (inputs.shape[3].value, filters))
        bias = self.var_on_cpu(name + '/bias', shape=(filters,))
        conv = tf.nn.conv2d(input=inputs, filter=kernel, strides=(1,) + strides + (1,),
                            padding='SAME', name=name)
        bias_add = tf.nn.bias_add(conv, bias)
        return tf.nn.relu(bias_add)

    def dense(self, inputs, units, name, activation=None):
        var = self.var_on_cpu(name + '/kernel', shape=(inputs.shape[1].value, units))
        bias = self.var_on_cpu(name + '/bias', shape=(units,))
        ds = tf.matmul(inputs, var) + bias
        if activation is not None:
            return activation(ds)
        else:
            return ds

    def model(self, data, training=True, scope_name='haha'):
        # AUTO_REUSE lets every tower share the same set of variables.
        with tf.variable_scope(name_or_scope=scope_name, reuse=tf.AUTO_REUSE):
            conv1 = self.conv2d(inputs=data, filters=32, kernel_size=(5, 5), name='conv1')
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
            conv2 = self.conv2d(inputs=pool1, filters=64, kernel_size=(5, 5), name='conv2')
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
            pool2_flat = tf.reshape(pool2, [-1, 7 * 7 * 64])
            dense = self.dense(inputs=pool2_flat, units=1024, name='dense1', activation=tf.nn.relu)
            dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=training)
            # Logits Layer
            logits = self.dense(inputs=dropout, units=10, name='dense2')
        return logits

    def get_loss(self, data, labels, training=True):
        outs = self.model(data, training=training)
        ls = tf.losses.sparse_softmax_cross_entropy(labels, outs)
        return tf.reduce_mean(ls)


def average_grads(tower):
    # Average the gradient of each variable across all towers.
    averaged_grads = []
    for grads_and_vars in zip(*tower):
        # print(grads_and_vars)
        grads = []
        for g, _ in grads_and_vars:
            expanded_grad = tf.expand_dims(g, 0, 'expand_grads')
            grads.append(expanded_grad)
        grad = tf.concat(values=grads, axis=0)
        grad = tf.reduce_mean(input_tensor=grad, axis=0, keepdims=False)
        g_and_v = (grad, grads_and_vars[0][1])  # the variable is shared, so take it from the first tower
        averaged_grads.append(g_and_v)
    return averaged_grads


def train_multi_gpu():
    with tf.device('/cpu:0'):
        args, config = arg_config()
        gpu_num = len(args.gpu.split(','))
        global_step = tf.Variable(0, dtype=tf.int64, trainable=False)
        model = Model()
        data_all, label_all = data_loader()
        # Split one large batch evenly across the GPUs.
        data = tf.split(data_all, gpu_num)
        label = tf.split(label_all, gpu_num)
        optimizer = tf.train.MomentumOptimizer(0.01, 0.9)
        tower = []
        with tf.variable_scope('gpu_vars'):
            for i in range(gpu_num):
                with tf.device("/gpu:{}".format(i)), tf.name_scope('tower_{}'.format(i)):
                    loss_op = model.get_loss(data[i], label[i])
                    # print(loss_op)
                    tf.add_to_collection(name='total_loss', value=loss_op)
                    grads_and_vars = optimizer.compute_gradients(loss_op, tf.trainable_variables())
                    tower.append(grads_and_vars)
        mean_grads_and_vars = average_grads(tower)
        total_loss_op = tf.get_collection('total_loss', 'gpu_vars')  # list of per-tower losses
        with tf.control_dependencies([g for g, _ in mean_grads_and_vars]):
            train_op = optimizer.apply_gradients(mean_grads_and_vars, global_step=global_step,
                                                 name='optimizer')
        # print(tf.trainable_variables())
        # exit()
        print('running...')
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            step = 0
            while step < 1000:
                _, loss = sess.run([train_op, total_loss_op])
                print(step, loss)
                step += 1


if __name__ == "__main__":
    train_multi_gpu()


The snippet below is the c != a + b case mentioned at the top; it is best run on its own:

# c != a+b
with tf.device('/gpu:1'), tf.variable_scope('haha', reuse=tf.AUTO_REUSE):
    a = tf.get_variable(name='a', shape=[3], dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer())
    # a = tf.Variable([2, 3, 4], name='a', dtype=tf.float32, )
with tf.device('/gpu:2'):
    b = tf.constant(value=[1, 2, 3], dtype=tf.float32, shape=[3], name='b')
with tf.device('/gpu:1'):
    c = tf.add(a, b)
with tf.Session(config=tf.ConfigProto(log_device_placement=True)) as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    print('c: ', sess.run([c]))
    print('b: ', sess.run([b]))
    print('a: ', sess.run([a]))
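To narrow down the c != a + b question, one option is a small cross-check: compute a reference sum on the CPU in the same graph and fetch everything in a single sess.run, so both adds read the same value of a. This is only a debugging sketch, not an explanation of the behaviour; allow_soft_placement is turned on here so the snippet still runs when fewer GPUs are visible, and c_ref is a name introduced just for this check.

import tensorflow as tf

with tf.device('/gpu:1'), tf.variable_scope('haha', reuse=tf.AUTO_REUSE):
    a = tf.get_variable(name='a', shape=[3], dtype=tf.float32,
                        initializer=tf.truncated_normal_initializer())
with tf.device('/gpu:2'):
    b = tf.constant([1, 2, 3], dtype=tf.float32, name='b')
with tf.device('/gpu:1'):
    c = tf.add(a, b)        # the cross-GPU add in question
with tf.device('/cpu:0'):
    c_ref = tf.add(a, b)    # reference sum computed on the CPU

config = tf.ConfigProto(log_device_placement=True, allow_soft_placement=True)
with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    # One run, so every fetched tensor sees the same value of a.
    a_val, b_val, c_val, c_ref_val = sess.run([a, b, c, c_ref])
    print('a:     ', a_val)
    print('b:     ', b_val)
    print('c:     ', c_val)
    print('c_ref: ', c_ref_val)

If c_val and c_ref_val differ, the problem is in the cross-device computation rather than in reading a across separate session runs.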