Question
My neural network is failing on the training set. Every gradient descent iteration decreases the error, but the hypothesis vector values also keep shrinking each iteration. When I run the test set on the learned parameters, the network just outputs 1 as its guess for every image. So far I've tried removing the sigmoid activation from the output layer, verified the gradients with gradient checking, and tried scaling the dataset into the range 0-1.
Here's an example of the hypothesis vector during gradient descent:
Iter 1: [0.5123, 0.5492, 0.5329, 0.5281, ...]
Iter 2: [0.3294, 0.3239, 0.3985, 0.3283, ...]
Iter 3: [0.1313, 0.1123, 0.1233, 0.1231, ...]
Iter 4: [0.0112, 0.0123, 0.0123, 0.0123, ...]
I would be so grateful if someone could help me find what's wrong with my network and help it classify the images.
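For context, the prediction step isn't shown in the code below; it is roughly the following (a simplified sketch of what I do, taking the class as the index of the largest output):

preds = np.argmax(hyp, axis=1)       # hyp is the (m, 10) matrix of output activations
accuracy = np.mean(preds == testY)   # testY holds the integer labels 0-9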
Here's the code:
Image processing:
def processImg(self):
    # assumes module-level imports: numpy as np, and pickle imported as pc
    with open("binaryMNIST.pkl", "br") as fh:
        data = pc.load(fh)
    img_dim = 28
    features = 784
    m = 60000
    test_m = 10000
    # training and test inputs, with a bias column of ones prepended
    fullX = np.asfarray(data[0])
    bias = np.ones((60000, 1))
    fullX = np.hstack((bias, fullX))
    fullY = np.asfarray(data[1])
    testX = np.asfarray(data[2])
    bias2 = np.ones((10000, 1))
    testX = np.hstack((bias2, testX))
    testY = np.asfarray(data[3])
    fullY = fullY.astype(int)
    testY = testY.astype(int)
    iden = np.identity(10, dtype=np.int)
    oneHot = np.zeros((m, 10), dtype=np.int)
    oneHot_T = np.zeros((test_m, 10), dtype=np.int)
    # creates m one-hot (ones/zeros) vectors indicating the class
    for i in range(test_m):
        oneHot_T[i] = iden[testY[i], :]
    for i in range(m):
        oneHot[i] = iden[fullY[i], :]
    # first 40000 examples for training, the rest for validation
    trainX = fullX[:40000, :]
    trainY = oneHot[:40000, :]
    valX = np.asfarray(fullX[40000:, :])
    valY = np.asfarray(oneHot[40000:, :])
    self.trainX = trainX
    self.trainY = trainY
    self.valX = valX
    self.valY = valY
    self.testX = testX
    self.oneHot_T = oneHot_T
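For reference, the 0-1 scaling I mentioned trying above was just a division right after loading (this assumes the pickle stores raw 0-255 pixel values):

fullX = np.asfarray(data[0]) / 255.0   # rescale training pixels into 0-1
testX = np.asfarray(data[2]) / 255.0   # rescale test pixels into 0-1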
Setting theta:
def setThetas(self):
    # 784 features
    # 5 nodes per hidden layer (not including bias)
    # shapes are (nodes in previous layer + bias, nodes in next layer):
    # theta1 (785, 5), theta2 (6, 5), theta3 (6, 10)
    # after finishing, do big 3d matrix of theta and vectorize backprop
    params = np.random.rand(4015)
    self.params = params
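The 4015 comes from the three matrix shapes in the comments above; a quick sanity check of the count:

n_params = 785 * 5 + 6 * 5 + 6 * 10   # theta1 + theta2 + theta3
print(n_params)                       # 3925 + 30 + 60 = 4015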
Forward and backprop:
def fbProp(self, theta1, theta2, theta3):
    # after calculating an activation with sig(), add the bias column
    m = np.shape(self.trainY)[0]
    z1 = np.array(np.dot(self.trainX, theta1), dtype=np.float64)
    a1 = self.sig(z1)
    bias = np.ones((40000, 1))
    a1 = np.hstack((bias, a1))
    z2 = np.dot(a1, theta2)
    a2 = self.sig(z2)
    a2 = np.hstack((bias, a2))
    z3 = np.dot(a2, theta3)
    hyp = self.sig(z3)
    g3 = 0
    g2 = 0
    g1 = 0
    for i in range(m):
        dOut = hyp[i, :] - self.trainY[i, :]
        # switch
        d2 = np.dot(np.transpose(dOut), np.transpose(theta3))
        d2 = d2[1:] * self.sigG(z2[i, :])
        d1 = np.dot(d2, np.transpose(theta2))
        d1 = d1[1:] * self.sigG(z1[i, :])
        g3 = g3 + np.dot(np.transpose(np.array(a2[i, :], ndmin=2)), np.array(dOut, ndmin=2))
        g2 = g2 + np.dot(np.transpose(np.array(a1[i, :], ndmin=2)), np.array(d1, ndmin=2))
        g1 = g1 + np.dot(np.transpose(np.array(self.trainX[i, :], ndmin=2)), np.array(d1, ndmin=2))
    self.theta1G = (1/m) * g1
    self.theta2G = (1/m) * g2
    self.theta3G = (1/m) * g3
Gradient descent:
def gradDescent(self):
    params = np.array(self.params)
    # unroll the flat parameter vector into the three weight matrices
    theta1 = params[0:3925]
    theta1 = np.resize(theta1, (785, 5))
    theta2 = params[3925:3955]
    theta2 = np.resize(theta2, (6, 5))
    theta3 = params[3955:4015]
    theta3 = np.resize(theta3, (6, 10))
    for i in range(self.steps):
        J = self.error(theta1, theta2, theta3, self.trainX, self.trainY)
        print("Iteration: ", i+1, " | error: ", J)
        self.fbProp(theta1, theta2, theta3)
        theta1 = theta1 - (self.alpha * self.theta1G)
        theta2 = theta2 - (self.alpha * self.theta2G)
        theta3 = theta3 - (self.alpha * self.theta3G)
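For completeness, alpha and steps are set outside the methods shown here; I drive the class roughly like this (the class name is just a placeholder, and the alpha/steps values are only what I've been experimenting with):

net = Network()      # placeholder name for the class holding the methods above
net.alpha = 1.0      # learning rate (illustrative value)
net.steps = 100      # number of gradient descent iterations (illustrative value)
net.processImg()
net.setThetas()
net.gradDescent()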
Error function:
def error(self, theta1, theta2, theta3, X, y):
    bias = np.ones((np.shape(y)[0], 1))
    a1 = self.sig(np.dot(X, theta1))
    a1 = np.hstack((bias, a1))
    a2 = self.sig(np.dot(a1, theta2))
    a2 = np.hstack((bias, a2))
    hyp = self.sig(np.dot(a2, theta3))
    print(hyp[0, :])
    # cross-entropy cost summed over the 10 classes
    pt1 = ((np.log(hyp) * y) + (np.log(1 - hyp) * (1 - y))).sum()
    J = -(1 / 40000) * pt1.sum()
    return J
Sigmoid gradient and sigmoid:
def sigG(self, z):
    return self.sig(z) * (1 - self.sig(z))

def sig(self, z):
    return 1 / (1 + np.exp(-z))
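And since I mentioned gradient checking above, this is roughly the numerical check I used, as a simplified standalone sketch; errorFlat here is an assumed helper that reshapes a flat parameter vector into theta1/theta2/theta3 and returns the cost:

def numericalGrad(errorFlat, params, eps=1e-4):
    # central-difference approximation of dJ/d(param) for each parameter
    grad = np.zeros_like(params)
    for i in range(len(params)):
        plus = params.copy()
        minus = params.copy()
        plus[i] += eps
        minus[i] -= eps
        grad[i] = (errorFlat(plus) - errorFlat(minus)) / (2 * eps)
    return grad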
Source: https://stackoverflow.com/questions/59830193/mnist-data-set-neural-network-why-is-the-network-failing-to-classify-images