For a neural networks library I implemented some activation functions and loss functions and their derivatives. They can be combined arbitrarily and the derivative at the output
Just in case you are processing in batches, here is an implementation in NumPy (tested vs TensorFlow). However, I will suggest avoiding the associated tensor operations, by mixing the jacobian with the cross-entropy, which leads to a very simple and efficient expression.
def softmax(z):
exps = np.exp(z - np.max(z))
return exps / np.sum(exps, axis=1, keepdims=True)
def softmax_jacob(s):
return np.einsum('ij,jk->ijk', s, np.eye(s.shape[-1])) \
- np.einsum('ij,ik->ijk', s, s)
def np_softmax_test(z):
return softmax_jacob(softmax(z))
def tf_softmax_test(z):
z = tf.constant(z, dtype=tf.float32)
with tf.GradientTape() as g:
g.watch(z)
a = tf.nn.softmax(z)
jacob = g.batch_jacobian(a, z)
return jacob.numpy()
z = np.random.randn(3, 5)
np.all(np.isclose(np_softmax_test(z), tf_softmax_test(z)))