We'll use NumPy to code a simple neural network and test it on the MNIST dataset. The architecture is the same as in TensorFlow's "Get started" tutorial, except that we use mini-batch gradient descent as our optimizer instead of Adam.
import _pickle as cPickle
import gzip
def load_data(path='./data/mnist.pkl.gz'):
    """Load the pickled MNIST splits from a gzip-compressed file.

    Args:
        path: Location of the gzip-compressed pickle. Defaults to the
            path this function previously hard-coded.

    Returns:
        The ``(training_data, validation_data, test_data)`` triple stored
        in the pickle.
    """
    # NOTE: unpickling can execute arbitrary code; only load archives that
    # come from a trusted source.
    # The context manager closes the file even if unpickling raises.
    with gzip.open(path, 'rb') as f:
        training_data, validation_data, test_data = cPickle.load(f, encoding='bytes')
    return training_data, validation_data, test_data
You may download the dataset from here
class Model:
    """
    Two-layer fully connected classifier.

    Architecture:
    Flatten -> Dense -> ReLU -> Dropout -> Dense -> SoftMax
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_p):
        """Create the weights and biases and remember the dropout rate.

        Args:
            input_size: Number of input features (rows of ``W1``).
            hidden_size: Width of the hidden layer.
            output_size: Number of output units (columns of ``W2``).
            dropout_p: Stored unchanged on ``self.dropout_p``.
        """
        # Weights are standard-normal draws divided by sqrt(fan-in).
        # W1 is drawn before W2 so the random stream is consumed in a
        # fixed order.
        first_weights = np.random.randn(input_size, hidden_size) / np.sqrt(input_size)
        second_weights = np.random.randn(hidden_size, output_size) / np.sqrt(hidden_size)

        # Biases start at zero and are kept as row vectors (1, width).
        first_bias = np.zeros((1, hidden_size))
        second_bias = np.zeros((1, output_size))

        self.params = dict(
            W1=first_weights,
            b1=first_bias,
            W2=second_weights,
            b2=second_bias,
        )
        self.dropout_p = dropout_p
`dropout_p` defines the fraction of the input units to drop.
def train(self, X, y, X_val, y_val, nb_epoch, batch_size, eta):
    """Fit the model with mini-batch gradient descent.

    Every epoch reshuffles the training set, walks it in consecutive
    slices of ``batch_size`` samples (the last slice may be shorter) and
    applies ``param -= eta * grad`` after each slice. One summary line
    is printed per epoch.

    Args:
        X: Training inputs, indexed by sample along the first axis.
        y: Training labels aligned with ``X``.
        X_val: Validation inputs; only passed to ``evaluate``.
        y_val: Validation labels; only passed to ``evaluate``.
        nb_epoch: Number of passes over the training set.
        batch_size: Number of samples per parameter update.
        eta: Learning rate multiplying every gradient.
    """
    n = len(X)
    for i in range(nb_epoch):
        epoch_loss = 0
        X, y = shuffle(X, y)
        for j in range(0, n, batch_size):
            X_batch = X[j:j + batch_size]
            y_batch = y[j:j + batch_size]
            loss, grads = self.loss(X_batch, y_batch)
            # self.loss returns the batch *mean*; weight it by the batch
            # length so that epoch_loss / n below is the per-sample mean
            # even when the last batch is shorter.
            epoch_loss += loss * len(X_batch)
            # update parameters
            for param_name in ('W1', 'b1', 'W2', 'b2'):
                self.params[param_name] -= eta * grads[param_name]
        train_acc = self.evaluate(X, y)
        val_acc = self.evaluate(X_val, y_val)
        print("epoch %d / %d: loss %f, train_acc: %f, val_acc: %f" %
              (i + 1, nb_epoch, epoch_loss / n, train_acc, val_acc))
def ReLU(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified
def dropout(x, dropout_p):
    """Apply inverted dropout to ``x``.

    Each element is zeroed independently with probability ``dropout_p``;
    the survivors are divided by ``1 - dropout_p`` so the expected value
    of the output equals the input.

    Args:
        x: Array of activations (must expose ``shape``).
        dropout_p: Fraction of units to drop, in ``[0, 1]``.

    Returns:
        An array with the shape of ``x``.

    Raises:
        ValueError: If ``dropout_p`` lies outside ``[0, 1]``.
    """
    if not 0 <= dropout_p <= 1:
        raise ValueError("dropout_p must be in [0, 1], got %r" % (dropout_p,))
    keep_prob = 1 - dropout_p
    if keep_prob == 0:
        # Every unit is dropped. Return zeros directly: the general
        # formula would evaluate 0 / 0 and yield NaN.
        return np.zeros(x.shape)
    # One Bernoulli(keep_prob) draw per element: 1 keeps, 0 drops.
    mask = np.random.binomial(1, keep_prob, size=x.shape)
    return x * mask / keep_prob
def softmax(x):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating so that
    large scores do not overflow; this does not change the result.
    """
    row_max = np.max(x, axis=1, keepdims=True)
    unnormalized = np.exp(x - row_max)
    normalizer = np.sum(unnormalized, axis=1, keepdims=True)
    return unnormalized / normalizer
def loss(self, X, y):
    """Run a training-mode forward/backward pass on one batch.

    The forward pass is Dense -> ReLU -> Dropout -> Dense -> SoftMax,
    with dropout always applied.

    Args:
        X: Batch of inputs; ``X.shape[0]`` is the number of samples.
        y: Integer class index for each sample, used to index the
            columns of the softmax output.

    Returns:
        A ``(loss, grads)`` pair. ``loss`` is the cross-entropy averaged
        over the batch. ``grads`` maps ``'W1'``, ``'b1'``, ``'W2'`` and
        ``'b2'`` to gradients of the cross-entropy *summed* over the
        batch (no ``1 / n`` factor), i.e. ``n`` times the gradient of
        the returned mean loss.
    """
    W1, b1 = self.params['W1'], self.params['b1']
    W2, b2 = self.params['W2'], self.params['b2']
    n = X.shape[0]
    keep_prob = 1 - self.dropout_p
    rows = np.arange(n)

    # feed forward pass
    h1 = ReLU(np.dot(X, W1) + b1)
    h1 = dropout(h1, dropout_p=self.dropout_p)
    out = np.dot(h1, W2) + b2
    probs = softmax(out)

    # loss
    log_probs = -np.log(probs[rows, y])
    loss = np.sum(log_probs) / n

    # backward pass
    # d(summed loss)/d(out) = probs - one_hot(y). probs is not needed
    # afterwards, so it is modified in place.
    dout = probs
    dout[rows, y] -= 1
    dW2 = np.dot(h1.T, dout)
    db2 = np.sum(dout, axis=0, keepdims=True)

    dh1 = np.dot(dout, W2.T)
    # h1 is positive only where the unit was both active in the ReLU and
    # kept by dropout. There the forward pass multiplied the activation
    # by 1 / keep_prob, so the gradient must carry the same factor;
    # everywhere else no gradient flows.
    dh1 = np.where(h1 > 0, dh1 / keep_prob, 0.0)
    dW1 = np.dot(X.T, dh1)
    db1 = np.sum(dh1, axis=0, keepdims=True)

    grads = {
        'W1': dW1,
        'b1': db1,
        'W2': dW2,
        'b2': db2
    }
    return loss, grads
def evaluate(self, X, y):
    """Return the fraction of samples whose predicted class equals ``y``.

    Runs the forward pass without dropout and predicts, for each row of
    ``X``, the class with the highest softmax probability.

    Args:
        X: Inputs; ``X.shape[0]`` is the number of samples.
        y: Expected class index for each sample.

    Returns:
        Number of correct predictions divided by ``X.shape[0]``.
    """
    h1 = ReLU(np.dot(X, self.params['W1']) + self.params['b1'])
    out = np.dot(h1, self.params['W2']) + self.params['b2']
    probs = softmax(out)
    pred = np.argmax(probs, axis=1)
    # Count the matches inside NumPy; the builtin sum() would iterate
    # over the boolean array one element at a time in Python.
    return np.count_nonzero(pred == y) / X.shape[0]
epoch 1 / 5: loss 0.024931, train_acc: 0.960260, val_acc: 0.957000
epoch 2 / 5: loss 0.011182, train_acc: 0.979260, val_acc: 0.973300
epoch 3 / 5: loss 0.007939, train_acc: 0.985060, val_acc: 0.976700
epoch 4 / 5: loss 0.006107, train_acc: 0.989840, val_acc: 0.978800
epoch 5 / 5: loss 0.004749, train_acc: 0.991940, val_acc: 0.978300
Test set accuracy 0.9794
See here for the complete code gist.