Question
I'm training CaffeNet with multilabel data, but the loss is not decreasing during the training phase. I'm now trying to check whether backward() is working properly. I have this code to check if there is a gradient.
import os
import sys
import numpy as np
import os.path as osp
import matplotlib.pyplot as plt
from pprint import pprint
from copy import copy
%matplotlib inline
plt.rcParams['figure.figsize'] = (6, 6)
caffe_root = '../'  # this file is expected to be in {caffe_root}/examples
sys.path.append(caffe_root + 'python')
import caffe  # If you get "No module named _caffe", either you have not built pycaffe or you have the wrong path.
from caffe import layers as L, params as P  # Shortcuts to define the net prototxt.
sys.path.append("pycaffe/layers")  # the data layers we will use are in this directory
sys.path.append("pycaffe")  # the tools file is in this folder
import tools  # this contains some tools that we need
# set data root directory, e.g:
peta_root = osp.join('/root/data/PETA/')
# these are the PASCAL classes, we'll need them later.
#classes = np.asarray(['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'])
# make sure we have the caffenet weight downloaded.
if not os.path.isfile(caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel'):
    print("Downloading pre-trained CaffeNet model...")
    !../scripts/download_model_binary.py ../models/bvlc_reference_caffenet
# initialize caffe for gpu mode
caffe.set_mode_gpu()
caffe.set_device(1)
# helper function for common structures
def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1):
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group)  # ,weight_filler=dict(type='xavier'))
    return conv, L.ReLU(conv, in_place=True)
# another helper function
def fc_relu(bottom, nout):
    fc = L.InnerProduct(bottom, num_output=nout)
    return fc, L.ReLU(fc, in_place=True)
# yet another helper function
def max_pool(bottom, ks, stride=1):
    return L.Pooling(bottom, pool=P.Pooling.MAX, kernel_size=ks, stride=stride)
# main netspec wrapper
def caffenet_multilabel(data_layer_params, datalayer):
    # setup the python data layer
    n = caffe.NetSpec()
    n.data, n.label = L.Python(module='peta_multilabel_datalayers', layer=datalayer,
                               ntop=2, param_str=str(data_layer_params))
    # the net itself
    n.conv1, n.relu1 = conv_relu(n.data, 11, 96, stride=4)
    n.pool1 = max_pool(n.relu1, 3, stride=2)
    n.norm1 = L.LRN(n.pool1, local_size=5, alpha=1e-4, beta=0.75)
    n.conv2, n.relu2 = conv_relu(n.norm1, 5, 256, pad=2, group=2)
    n.pool2 = max_pool(n.relu2, 3, stride=2)
    n.norm2 = L.LRN(n.pool2, local_size=5, alpha=1e-4, beta=0.75)
    n.conv3, n.relu3 = conv_relu(n.norm2, 3, 384, pad=1)
    n.conv4, n.relu4 = conv_relu(n.relu3, 3, 384, pad=1, group=2)
    n.conv5, n.relu5 = conv_relu(n.relu4, 3, 256, pad=1, group=2)
    n.pool5 = max_pool(n.relu5, 3, stride=2)
    n.fc6, n.relu6 = fc_relu(n.pool5, 4096)
    n.drop6 = L.Dropout(n.relu6, in_place=True)
    n.fc7, n.relu7 = fc_relu(n.drop6, 4096)
    n.drop7 = L.Dropout(n.relu7, in_place=True)
    n.score = L.InnerProduct(n.drop7, num_output=2)
    n.loss = L.SigmoidCrossEntropyLoss(n.score, n.label)
    return str(n.to_proto())
workdir = './peta_multilabel_with_datalayer'
if not os.path.isdir(workdir):
    os.makedirs(workdir)
solverprototxt = tools.CaffeSolver(trainnet_prototxt_path = osp.join(workdir, "trainnet.prototxt"), testnet_prototxt_path = osp.join(workdir, "valnet.prototxt"))
solverprototxt.sp['display'] = "1"
solverprototxt.sp['base_lr'] = "0.0001"
solverprototxt.write(osp.join(workdir, 'solver.prototxt'))
# write train net.
with open(osp.join(workdir, 'trainnet.prototxt'), 'w') as f:
    # provide parameters to the data layer as a python dictionary. Easy as pie!
    data_layer_params = dict(batch_size=128, im_shape=[227, 227], split='train', peta_root=peta_root)
    f.write(caffenet_multilabel(data_layer_params, 'PetaMultilabelDataLayerSync'))
# write validation net.
with open(osp.join(workdir, 'valnet.prototxt'), 'w') as f:
    data_layer_params = dict(batch_size=128, im_shape=[227, 227], split='val', peta_root=peta_root)
    f.write(caffenet_multilabel(data_layer_params, 'PetaMultilabelDataLayerSync'))
solver = caffe.SGDSolver(osp.join(workdir, 'solver.prototxt'))
#solver.net.copy_from(caffe_root + 'models/bvlc_reference_caffenet/bvlc_reference_caffenet.caffemodel')
solver.test_nets[0].share_with(solver.net)
#solver.step(1)
solver.net.top_names
solver.net.backward()
solver.step(1)
print(solver.net.params['fc6'][0].data[...])
print(solver.net.blobs['fc6'].data[...])
print(solver.net.blobs['fc6'].diff[...])
However, the gradient output is all zeros and the weights are not updated at all.
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
...,
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
...,
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
[[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
...,
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]
[ 0. 0. 0. ..., 0. 0. 0.]]
Does anyone know what happened?
Answer 1:
Looking at the values of your params: they are all zeros. You did not define fillers for your layers, thus all you get is zeros. Define random initializers for the weights and run again.
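For example, the conv_relu and fc_relu helpers from the question could pass explicit fillers (a minimal sketch; the gaussian std and constant bias values here are illustrative assumptions, not tuned settings):

def conv_relu(bottom, ks, nout, stride=1, pad=0, group=1):
    # assumed filler values for illustration; choose what suits your data
    conv = L.Convolution(bottom, kernel_size=ks, stride=stride,
                         num_output=nout, pad=pad, group=group,
                         weight_filler=dict(type='gaussian', std=0.01),
                         bias_filler=dict(type='constant', value=0.1))
    return conv, L.ReLU(conv, in_place=True)

def fc_relu(bottom, nout):
    fc = L.InnerProduct(bottom, num_output=nout,
                        weight_filler=dict(type='gaussian', std=0.005),
                        bias_filler=dict(type='constant', value=0.1))
    return fc, L.ReLU(fc, in_place=True)

Note that the final n.score InnerProduct layer needs a weight_filler as well, since it does not go through these helpers.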
Running only backward() is meaningless: the loss is computed during the forward() pass, so without it there is no loss information propagated through the net for the backward pass to use. Call forward() before backward() to make one complete forward-backward pass.
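Applied to the debugging snippet from the question, the check would look like this (a minimal sketch):

solver.net.forward()                      # computes the loss and fills the top blobs
solver.net.backward()                     # gradients now propagate back from the loss
print(solver.net.blobs['fc6'].diff[...])  # should be non-zero once fillers are defined

Also note that solver.step(1) already performs a full forward-backward pass plus a weight update, so stepping the solver and then inspecting the diffs is another way to run this check.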
Source: https://stackoverflow.com/questions/47278518/caffe-network-produce-zero-gradient-and-not-learning