ML Batching from Scratch
May 9, 2023
In this article, we implement batching from scratch. Previously, we created a 2-layer CNN, added activation functions and the Adam optimizer, and initialized the random weights with variance-aware scaling. Now we can put it all together into a usable model. In the prior articles, we updated the weights after every training sample. Batching instead updates the weights once per batch, using the derivative of the loss with respect to each weight averaged over the entire batch. This is beneficial because a single bad training sample cannot have an outsized impact on the weights: averaging steers the weights in the direction indicated by the whole batch, not just one sample.
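To make the idea concrete before the full model, here is a minimal sketch on a made-up one-weight example (the toy data, learning rate, and variable names are purely illustrative and are not part of the CNN code below). Each per-sample derivative is computed independently, and only their average is applied to the weight:

import numpy as np

# Toy model: fit y = w*x with squared-error loss L = (w*x - y)**2, batch size 4.
# The per-sample gradients dL/dw = 2*(w*x - y)*x are computed independently of
# each other, then averaged into a single weight update per batch.
xs = np.array([1., 2., 3., 4.])   # hypothetical batch of inputs
ys = np.array([2., 4., 6., 8.])   # targets generated with the true weight w = 2
w, lr = 0., 0.05

for _ in range(100):
    grads = 2 * (w * xs - ys) * xs   # one derivative per sample
    w -= lr * grads.mean()           # one update for the whole batch
print(w)                             # converges toward 2.0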
An additional benefit is that the model can keep training over the same dataset: each epoch reshuffles the data into different combinations of batches. Batching is also well suited to parallel computing, since the derivatives for each element of a batch can be computed independently of the others. See the commented code below:
import numpy as np
mnistlocation = "INSERT MNIST FILE LOCATION" #you can download the file here
# For example: "/Users/enrichmentcap/Downloads/mndata.npz"
trains, tests = 60000, 10000 #depends on computer processing speed; if your computer isn't fast, reduce the size of the training and test datasets
mnist = np.load(mnistlocation)
trainimages, trainlabels = mnist['trainimages'][:trains] / 255, mnist['trainlabels'][:trains]
testimages, testlabels = mnist['testimages'][:tests] / 255, mnist['testlabels'][:tests]
##from tensorflow.keras.datasets import mnist #uncomment if using tensorflow library to retrieve dataset
##(trainimages, trainlabels), (testimages, testlabels) = mnist.load_data()
##trainimages, trainlabels, testimages, testlabels = trainimages[:trains]/255, trainlabels[:trains], testimages[:tests]/255, testlabels[:tests]
np.random.seed(0)
classes = len(np.unique(trainlabels))
imw = trainimages.shape[2]
imh = trainimages.shape[1]
lr = .001
fsw = 8
fsh = 8
fsw2 = 4
fsh2 = 4
filts = 16
filts2 = 8
step = 1
step2 = 2
rsw = (imw - fsw) // step + 1
rsh = (imh - fsh) // step + 1
rsw2 = (rsw - fsw2) // step2 + 1
rsh2 = (rsh - fsh2) // step2 + 1
kern = np.random.randn(filts, fsh, fsw) * np.sqrt(2 / (fsh*fsw)) # (2 / fan_in)**.5 = He normal initializer
kb = np.zeros(filts)
kern2 = np.random.randn(filts2, filts, fsh2, fsw2) * np.sqrt(2 / (filts*fsh2*fsw2)) # He normal initializer for the second conv layer
kb2 = np.zeros(filts2)
w = np.random.randn(classes, filts2, rsh2, rsw2) * np.sqrt(6 / (filts2*rsh2*rsw2 + classes)) # (6 / (fan_in + fan_out))**.5 = Glorot initializer
b = np.zeros(classes)
#adam optimizer params
b1 = .9
b2 = .999
eps = 1e-7
mw = 0
vw = 0
mb = 0
vb = 0
mk = 0
vk = 0
mk2 = 0
vk2 = 0
mkb = 0
vkb = 0
mkb2 = 0
vkb2 = 0
epochs = 10
bs = 100 #batch size
nbatch = trains // bs #number of batches per epoch
for e in range(epochs): #epochs go through the same dataset, but in different batches; this allows one to train over the same dataset while continuing to learn
    print('epoch: %d'%e)
    np.random.seed(e)
    ibatch = np.random.permutation(trainimages[:trains]) #training set of images to be batched
    np.random.seed(e) #reverting back to the random seed ensures that train images and labels are shuffled exactly the same, despite being different sizes
    lbatch = np.random.permutation(trainlabels[:trains]) #training set of labels to be batched
    for nb in range(nbatch):
        #these derivatives are backpropagated to trainable weights; need to be reset after every batch
        #trainable weights are only updated once per batch; each batch takes the average of all these values, then backpropagates to the trainable weights
        bdLdw = np.zeros((bs, classes, filts2, rsh2, rsw2))
        bdLdb = np.zeros((bs, classes))
        bdLdkern2 = np.zeros((bs, filts2, filts, fsh2, fsw2))
        bdLdkb2 = np.zeros((bs, filts2))
        bdLdkern = np.zeros((bs, filts, fsh, fsw))
        bdLdkb = np.zeros((bs, filts))
        for i in range(bs):
            xx = ibatch[nb*bs + i] #input image for this sample; its integer class label is lbatch[nb*bs + i], which is indexed directly below
            k = np.zeros((filts, rsh, rsw))
            for j in range(rsh):
                for jj in range(rsw):
                    k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb
            kr = np.maximum(0, k) #relu activation
            kk = np.zeros((filts2, rsh2, rsw2))
            for j0 in range(rsh2):
                for jj0 in range(rsw2):
                    kk[:,j0,jj0] = (kern2 * kr[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsh2, rsw2)
            kkr = np.maximum(0, kk) #relu activation
            x = (w * kkr.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b
            y = np.exp(x) / np.sum(np.exp(x)) #softmax probabilities
            dydx = -np.exp(x)[lbatch[nb*bs + i]] * np.exp(x) / np.sum(np.exp(x))**2
            dydx[lbatch[nb*bs + i]] = np.exp(x)[lbatch[nb*bs + i]] * (np.sum(np.exp(x)) - np.exp(x)[lbatch[nb*bs + i]]) / np.sum(np.exp(x))**2
            dLdy = 1 / y[lbatch[nb*bs + i]] #derivative of log(y[correct]) w.r.t. y[correct]; the updates below add this direction, raising the probability of the correct class
            dLdx = dLdy * dydx
            dxdw, dxdb = kkr, 1
            dLdw = dLdx.reshape(classes,1,1,1) * dxdw.reshape(1, filts2, rsh2, rsw2)
            dLdb = dLdx * dxdb
            bdLdw[i] = dLdw
            bdLdb[i] = dLdb
            dxdkkr = w #shape (classes, filts2, rsh2, rsw2)
            dLdkkr = (dLdx.reshape(classes, 1, 1, 1) * dxdkkr).sum(axis=0) #dLdkkr integrates the loss from every class into the appropriate positions of the kk output, i.e. which positions in kk led to more loss or error
            dkkrdkk = np.array(kk > 0, dtype = float) #relu activation derivative
            dLdkk = dLdkkr * dkkrdkk
            dkkdkern2 = kr
            dLdkern2 = np.zeros((filts2,filts,fsh2,fsw2))
            for f in range(filts2):
                for j000 in range(fsh2):
                    for jj000 in range(fsw2):
                        #the second conv layer has stride step2, so the contributing kr positions are step2 apart
                        dLdkern2[f, :, j000, jj000] = (dLdkk[f].reshape(1,rsh2,rsw2) * dkkdkern2[:, j000:j000+step2*rsh2:step2, jj000:jj000+step2*rsw2:step2]).sum(axis=(1,2))
            bdLdkern2[i] = dLdkern2
            dkkdkb2 = 1
            dLdkb2 = dLdkk.sum(axis=(1,2)) * dkkdkb2
            bdLdkb2[i] = dLdkb2
            dkkdkr = kern2 #shape (filts2, filts, fsh2, fsw2)
            dLdkr = np.zeros((filts, rsh, rsw))
            for ooo in range(filts):
                for o in range(rsh2):
                    for oo in range(rsw2):
                        dLdkr[ooo,o*step2:o*step2+fsh2,oo*step2:oo*step2+fsw2] += (dLdkk[:,o,oo].reshape(filts2,1,1) * dkkdkr[:,ooo,:,:]).sum(axis=0) #(filts2,1,1) broadcast against (filts2,fsh2,fsw2), then summed over filts2
            dkrdk = np.array(k > 0, dtype = float) #relu activation derivative
            dLdk = dLdkr * dkrdk
            dkdkern = xx
            dLdkern = np.zeros((filts,fsh,fsw))
            for j00 in range(fsh):
                for jj00 in range(fsw):
                    dLdkern[:,j00,jj00] = (dkdkern[j00:j00+step*rsh:step, jj00:jj00+step*rsw:step].reshape(1,rsh,rsw) * dLdk).sum(axis=(1,2)) #image rows indexed by rsh (height), columns by rsw (width)
            bdLdkern[i] = dLdkern
            dkdkb = 1
            dLdkb = dLdk.sum(axis=(1,2)) * dkdkb
            bdLdkb[i] = dLdkb
        #adam optimizer; only updates weights after entire batch has been iterated through
        lrt = lr * (1 - b2**(e*nbatch + nb + 1))**.5 / (1 - b1**(e*nbatch + nb + 1)) #bias-corrected learning rate; the Adam step count is e*nbatch + nb + 1
        mk = b1 * mk + (1 - b1) * bdLdkern.mean(axis=0) #mean(axis=0) averages the per-sample derivatives over the batch
        vk = b2 * vk + (1 - b2) * bdLdkern.mean(axis=0)**2
        kern = kern + lrt * mk / (vk**.5 + eps)
        mkb = b1 * mkb + (1 - b1) * bdLdkb.mean(axis=0)
        vkb = b2 * vkb + (1 - b2) * bdLdkb.mean(axis=0)**2
        kb = kb + lrt * mkb / (vkb**.5 + eps)
        mk2 = b1 * mk2 + (1 - b1) * bdLdkern2.mean(axis=0)
        vk2 = b2 * vk2 + (1 - b2) * bdLdkern2.mean(axis=0)**2
        kern2 = kern2 + lrt * mk2 / (vk2**.5 + eps)
        mkb2 = b1 * mkb2 + (1 - b1) * bdLdkb2.mean(axis=0)
        vkb2 = b2 * vkb2 + (1 - b2) * bdLdkb2.mean(axis=0)**2
        kb2 = kb2 + lrt * mkb2 / (vkb2**.5 + eps)
        mw = b1 * mw + (1 - b1) * bdLdw.mean(axis=0)
        vw = b2 * vw + (1 - b2) * bdLdw.mean(axis=0)**2
        w = w + lrt * mw / (vw**.5 + eps)
        mb = b1 * mb + (1 - b1) * bdLdb.mean(axis=0)
        vb = b2 * vb + (1 - b2) * bdLdb.mean(axis=0)**2
        b = b + lrt * mb / (vb**.5 + eps)
#evaluate on the test set after training
checke = np.zeros(tests)
for i in range(tests):
    xx = testimages[i]
    k = np.zeros((filts, rsh, rsw))
    for j in range(rsh):
        for jj in range(rsw):
            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb
    kr = np.maximum(0, k)
    kk = np.zeros((filts2, rsh2, rsw2))
    for j0 in range(rsh2):
        for jj0 in range(rsw2):
            kk[:,j0,jj0] = (kern2 * kr[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsh2, rsw2)
    kkr = np.maximum(0, kk)
    x = (w * kkr.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b
    if testlabels[i] == np.argmax(x):
        checke[i] = 1
print(len(np.flatnonzero(checke==1))/tests) #test accuracy