Question
My neural network is failing on the training set. Every gradient descent iteration decreases the error, but the hypothesis vector values also keep shrinking each iteration. When I run the test set on the learned parameters, the network just outputs 1 as its guess for every image. So far I've tried removing the sigmoid activation from the output layer, verified the gradients with gradient checking, and tried scaling the dataset into the range 0-1.
Here's an example of the hypothesis vector during gradient descent:
Iter 1: [0.5123, 0.5492, 0.5329, 0.5281, ...]
Iter 2: [0.3294, 0.3239, 0.3985, 0.3283, ...]
Iter 3: [0.1313, 0.1123, 0.1233, 0.1231, ...]
Iter 4: [0.0112, 0.0123, 0.0123, 0.0123, ...]
I would be so grateful if someone could help me find what's wrong with my network and help it classify the images.
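For context, the prediction step isn't shown in the code below; it is roughly the following (a simplified sketch of what I do, taking the class as the index of the largest output):

preds = np.argmax(hyp, axis=1)       # hyp is the (m, 10) matrix of output activations
accuracy = np.mean(preds == testY)   # testY holds the integer labels 0-9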
Here's the code:
Image processing:
def processImg(self):
    # assumes module-level imports: numpy as np, and pickle imported as pc
    with open("binaryMNIST.pkl", "br") as fh:
        data = pc.load(fh)
    img_dim = 28
    features = 784
    m = 60000
    test_m = 10000
    # training and test inputs, with a bias column of ones prepended
    fullX = np.asfarray(data[0])
    bias = np.ones((60000, 1))
    fullX = np.hstack((bias, fullX))
    fullY = np.asfarray(data[1])
    testX = np.asfarray(data[2])
    bias2 = np.ones((10000, 1))
    testX = np.hstack((bias2, testX))
    testY = np.asfarray(data[3])
    fullY = fullY.astype(int)
    testY = testY.astype(int)
    iden = np.identity(10, dtype=np.int)
    oneHot = np.zeros((m, 10), dtype=np.int)
    oneHot_T = np.zeros((test_m, 10), dtype=np.int)
    # creates m one-hot (ones/zeros) vectors indicating the class
    for i in range(test_m):
        oneHot_T[i] = iden[testY[i], :]
    for i in range(m):
        oneHot[i] = iden[fullY[i], :]
    # first 40000 examples for training, the rest for validation
    trainX = fullX[:40000, :]
    trainY = oneHot[:40000, :]
    valX = np.asfarray(fullX[40000:, :])
    valY = np.asfarray(oneHot[40000:, :])
    self.trainX = trainX
    self.trainY = trainY
    self.valX = valX
    self.valY = valY
    self.testX = testX
    self.oneHot_T = oneHot_T
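For reference, the 0-1 scaling I mentioned trying above was just a division right after loading (this assumes the pickle stores raw 0-255 pixel values):

fullX = np.asfarray(data[0]) / 255.0   # rescale training pixels into 0-1
testX = np.asfarray(data[2]) / 255.0   # rescale test pixels into 0-1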
Setting theta:
def setThetas(self):
    # 784 features
    # 5 nodes per hidden layer (not including bias)
    # shapes are (nodes in previous layer + bias, nodes in next layer):
    # theta1 (785, 5), theta2 (6, 5), theta3 (6, 10)
    # after finishing, do big 3d matrix of theta and vectorize backprop
    params = np.random.rand(4015)
    self.params = params
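The 4015 comes from the three matrix shapes in the comments above; a quick sanity check of the count:

n_params = 785 * 5 + 6 * 5 + 6 * 10   # theta1 + theta2 + theta3
print(n_params)                       # 3925 + 30 + 60 = 4015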
Forward and backprop:
def fbProp(self, theta1, theta2, theta3):
    # after calculating an activation with sig(), add the bias column
    m = np.shape(self.trainY)[0]
    z1 = np.array(np.dot(self.trainX, theta1), dtype=np.float64)
    a1 = self.sig(z1)
    bias = np.ones((40000, 1))
    a1 = np.hstack((bias, a1))
    z2 = np.dot(a1, theta2)
    a2 = self.sig(z2)
    a2 = np.hstack((bias, a2))
    z3 = np.dot(a2, theta3)
    hyp = self.sig(z3)
    g3 = 0
    g2 = 0
    g1 = 0
    for i in range(m):
        dOut = hyp[i, :] - self.trainY[i, :]
        # switch
        d2 = np.dot(np.transpose(dOut), np.transpose(theta3))
        d2 = d2[1:] * self.sigG(z2[i, :])
        d1 = np.dot(d2, np.transpose(theta2))
        d1 = d1[1:] * self.sigG(z1[i, :])
        g3 = g3 + np.dot(np.transpose(np.array(a2[i, :], ndmin=2)), np.array(dOut, ndmin=2))
        g2 = g2 + np.dot(np.transpose(np.array(a1[i, :], ndmin=2)), np.array(d1, ndmin=2))
        g1 = g1 + np.dot(np.transpose(np.array(self.trainX[i, :], ndmin=2)), np.array(d1, ndmin=2))
    self.theta1G = (1/m) * g1
    self.theta2G = (1/m) * g2
    self.theta3G = (1/m) * g3
Gradient descent:
def gradDescent(self):
    params = np.array(self.params)
    # unroll the flat parameter vector into the three weight matrices
    theta1 = params[0:3925]
    theta1 = np.resize(theta1, (785, 5))
    theta2 = params[3925:3955]
    theta2 = np.resize(theta2, (6, 5))
    theta3 = params[3955:4015]
    theta3 = np.resize(theta3, (6, 10))
    for i in range(self.steps):
        J = self.error(theta1, theta2, theta3, self.trainX, self.trainY)
        print("Iteration: ", i+1, " | error: ", J)
        self.fbProp(theta1, theta2, theta3)
        theta1 = theta1 - (self.alpha * self.theta1G)
        theta2 = theta2 - (self.alpha * self.theta2G)
        theta3 = theta3 - (self.alpha * self.theta3G)
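For completeness, alpha and steps are set outside the methods shown here; I drive the class roughly like this (the class name is just a placeholder, and the alpha/steps values are only what I've been experimenting with):

net = Network()      # placeholder name for the class holding the methods above
net.alpha = 1.0      # learning rate (illustrative value)
net.steps = 100      # number of gradient descent iterations (illustrative value)
net.processImg()
net.setThetas()
net.gradDescent()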
Error function:
def error(self, theta1, theta2, theta3, X, y):
    bias = np.ones((np.shape(y)[0], 1))
    a1 = self.sig(np.dot(X, theta1))
    a1 = np.hstack((bias, a1))
    a2 = self.sig(np.dot(a1, theta2))
    a2 = np.hstack((bias, a2))
    hyp = self.sig(np.dot(a2, theta3))
    print(hyp[0, :])
    # cross-entropy cost summed over the 10 classes
    pt1 = ((np.log(hyp) * y) + (np.log(1 - hyp) * (1 - y))).sum()
    J = -(1 / 40000) * pt1.sum()
    return J
Sigmoid gradient and sigmoid:
def sigG(self, z):
    return self.sig(z) * (1 - self.sig(z))

def sig(self, z):
    return 1 / (1 + np.exp(-z))
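And since I mentioned gradient checking above, this is roughly the numerical check I used, as a simplified standalone sketch; errorFlat here is an assumed helper that reshapes a flat parameter vector into theta1/theta2/theta3 and returns the cost:

def numericalGrad(errorFlat, params, eps=1e-4):
    # central-difference approximation of dJ/d(param) for each parameter
    grad = np.zeros_like(params)
    for i in range(len(params)):
        plus = params.copy()
        minus = params.copy()
        plus[i] += eps
        minus[i] -= eps
        grad[i] = (errorFlat(plus) - errorFlat(minus)) / (2 * eps)
    return grad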
Source: https://stackoverflow.com/questions/59830193/mnist-data-set-neural-network-why-is-the-network-failing-to-classify-images