ML Batching from Scratch

May 9, 2023


For this article, we are going to implement batching from scratch. Previously, we created a 2-layer CNN, added activation functions and the Adam optimizer, and initialized the random weights with variance in mind. Now we can put it all together into a usable model. In the prior articles, we updated the weights after every single sample of training data. With batching, we instead update the weights once per batch, using the derivative of the loss with respect to each weight averaged over every sample in the batch. This is beneficial because it prevents a single bad training sample from having an outsized impact on the weights: averaging steers the weights in the direction favored by the whole batch, not just one example.
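To make the averaging idea concrete, here is a minimal standalone sketch; the names weights, samples, and grad_loss are placeholders for illustration, not part of the model built in this article.

import numpy as np

def train_per_sample(weights, samples, grad_loss, lr=0.001):
    #update the weights immediately after every individual sample
    for s in samples:
        weights = weights - lr * grad_loss(weights, s)
    return weights

def train_batched(weights, samples, grad_loss, lr=0.001, bs=100):
    #average the per-sample gradients over a batch, then update the weights once per batch
    for start in range(0, len(samples), bs):
        batch = samples[start:start + bs]
        avg_grad = np.mean([grad_loss(weights, s) for s in batch], axis=0)
        weights = weights - lr * avg_grad
    return weights

In the batched version, one noisy sample only contributes 1/bs of the update, so it cannot yank the weights off course on its own.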

An additional benefit is that batching lets the model keep training over the same data: each epoch iterates over a different shuffling of the dataset into batches. Batching also suits parallel computing well, since the per-sample derivatives within a batch can be computed independently of one another. A small sketch of the reshuffling idea follows, and after that the full commented code.
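This toy sketch uses made-up array names and sizes; the real code below gets the same effect by reseeding NumPy and permuting the images and labels separately.

import numpy as np

images = np.random.rand(6, 28, 28)  #placeholder images
labels = np.arange(6)               #placeholder labels
bs = 2                              #batch size

for e in range(3):                  #epochs
    order = np.random.permutation(len(images))  #fresh shuffle every epoch
    for nb in range(len(images) // bs):
        idx = order[nb*bs:(nb + 1)*bs]
        xb, yb = images[idx], labels[idx]  #one batch of images and matching labels
        #each sample's gradients in xb can be computed independently of the others
        #(parallelizable), then averaged into a single weight update for the batch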

import numpy as np

mnistlocation = "INSERT MNIST FILE LOCATION" #you can download the file here

# For example: "/Users/enrichmentcap/Downloads/mndata.npz"

trains, tests = 60000, 10000 #size of the training and test sets; reduce these if your computer is slow

mndata = np.load(mnistlocation) #load the .npz file once and reuse it
trainimages, trainlabels, testimages, testlabels = mndata['trainimages'][:trains] / 255, mndata['trainlabels'][:trains], mndata['testimages'][:tests] / 255, mndata['testlabels'][:tests]


##from tensorflow.keras.datasets import mnist #uncomment if using tensorflow library to retrieve dataset

##(trainimages, trainlabels), (testimages, testlabels) = mnist.load_data()

##trainimages, trainlabels, testimages, testlabels = trainimages[:trains]/255, trainlabels[:trains], testimages[:tests]/255, testlabels[:tests]


np.random.seed(0)

classes = len(np.unique(trainlabels))

imw = trainimages.shape[2]

imh = trainimages.shape[1]

lr = .001


fsw = 8

fsh = 8

fsw2 = 4

fsh2 = 4

filts =  16

filts2 = 8

step = 1

step2 = 2

rsw = (imw - fsw) // step + 1

rsh = (imh - fsh) // step + 1

rsw2 = (rsw - fsw2) // step2 + 1

rsh2 = (rsh - fsh2) // step2 + 1

kern = np.random.randn(filts, fsh, fsw) * np.sqrt(2 / (fsh*fsw)) #He normal initializer: std = sqrt(2 / fan_in)

kb = np.zeros(filts)

kern2 = np.random.randn(filts2, filts, fsh2, fsw2) * np.sqrt(2 / (filts*fsh2*fsw2)) #He normal initializer: std = sqrt(2 / fan_in)

kb2 = np.zeros(filts2)

w = np.random.randn(classes, filts2, rsh2, rsw2) * np.sqrt(6 / (filts2*rsh2*rsw2 + classes)) #Glorot/Xavier-style scaling: sqrt(6 / (fan_in + fan_out))

b = np.zeros(classes)


#adam optimizer params

b1 = .9

b2 = .999

eps = 1e-7

mw = 0

vw = 0

mb = 0

vb = 0

mk = 0

vk = 0

mk2 = 0

vk2 = 0

mkb = 0

vkb = 0

mkb2 = 0

vkb2 = 0


epochs = 10

bs = 100 #batch size

nbatch = trains // bs #number of batches per epoch


for e in range(epochs): #each epoch passes over the same dataset, reshuffled into different batches, so the model keeps learning from the same data

    print('epoch: %d'%e)

    np.random.seed(e) 

    ibatch = np.random.permutation(trainimages[:trains]) #training set of images to be batched

    np.random.seed(e) #reverting back to the random seed ensures that train images and labels are shuffled exactly the same, despite being different sizes

    lbatch = np.random.permutation(trainlabels[:trains]) #training set of labels to be batched

    for nb in range(nbatch):

        #these derivatives are backpropagated to trainable weights; need to be reset after every batch

        #trainable weights are only updated once per batch; each batch takes the average of all these values, then backpropagates to the trainable weights

        bdLdw = np.zeros((bs, classes, filts2, rsh2, rsw2)) 

        bdLdb = np.zeros((bs, classes)) 

        bdLdkern2 = np.zeros((bs, filts2, filts, fsh2, fsw2))

        bdLdkb2 = np.zeros((bs, filts2))

        bdLdkern = np.zeros((bs, filts, fsh, fsw))

        bdLdkb = np.zeros((bs, filts))

        for i in range(bs):

            xx = ibatch[nb*bs + i] #current image in the batch
            label = np.zeros(classes); label[lbatch[nb*bs + i]] = 1 #one-hot label for the current sample
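            #first convolution layer: slide each fsh x fsw kernel across the image with stride step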

            k = np.zeros((filts, rsh, rsw))

            for j in range(rsh):

                for jj in range(rsw):

                    k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb

            kr = np.maximum(0, k) #relu activation
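            #second convolution layer: slide each kernel over the relu'd feature maps from layer 1 with stride step2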


            kk = np.zeros((filts2, rsh2, rsw2))

            for j0 in range(rsh2):

                for jj0 in range(rsw2):

                    kk[:,j0,jj0] = (kern2 * kr[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsh2, rsw2)

            kkr = np.maximum(0, kk) #relu activation
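            #fully connected layer: weight every kkr activation per class, then softmax for class probabilities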

            

            x = (w * kkr.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b

            y = np.exp(x) / np.sum(np.exp(x))
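            #derivative of the softmax probability of the true class with respect to every logit in x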

            dydx = -np.exp(x)[lbatch[nb*bs + i]] * np.exp(x) / np.sum(np.exp(x))**2

            dydx[lbatch[nb*bs + i]] = np.exp(x)[lbatch[nb*bs + i]] * (np.sum(np.exp(x)) - np.exp(x)[lbatch[nb*bs + i]]) / np.sum(np.exp(x))**2

            dLdy = 1 / y[lbatch[nb*bs + i]]

            

            dLdx = dLdy * dydx

            dxdw, dxdb = kkr, 1


            dLdw = dLdx.reshape(classes,1,1,1) * dxdw.reshape(1, filts2, rsh2, rsw2)

            dLdb = dLdx * dxdb

            bdLdw[i] = dLdw

            bdLdb[i] = dLdb


            dxdkkr = w #shape (classes, filts2, rsh2, rsw2)

            dLdkkr = (dLdx.reshape(classes, 1, 1, 1) * dxdkkr).sum(axis=0) #dLdkkr collects the loss gradient from every class at each position of the kkr output, i.e. which activations contributed most to the loss

            dkkrdkk = np.array(kk > 0, dtype = float) #relu activation derivative

            dLdkk = dLdkkr * dkkrdkk
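            #backpropagate through the second convolution to its kernel (kern2) and bias (kb2)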


            dkkdkern2 = kr

            dLdkern2 =  np.zeros((filts2,filts,fsh2,fsw2))

            for f in range(filts2):

                for j000 in range(fsh2):

                    for jj000 in range(fsw2):

                        dLdkern2[f, :, j000, jj000] = (dLdkk[f].reshape(1,rsh2,rsw2) * dkkdkern2[:, j000:j000+step2*rsh2:step2, jj000:jj000+step2*rsw2:step2]).sum(axis=(1,2)) #slice with stride step2 so each kk position lines up with the input patch it actually saw

            bdLdkern2[i] = dLdkern2

            

            dkkdkb2 = 1

            dLdkb2 = dLdkk.sum(axis=(1,2)) * dkkdkb2

            bdLdkb2[i] = dLdkb2
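            #backpropagate through the strided second convolution back to the first layer's relu'd output kr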


            dkkdkr = kern2 #shape (filts2, filts, fsh2, fsw2)

            dLdkr = np.zeros((filts, rsh, rsw))

            for ooo in range(filts):

                for o in range(rsh2):

                    for oo in range(rsw2):

                        dLdkr[ooo,o*step2:o*step2+fsh2,oo*step2:oo*step2+fsw2] += (dLdkk[:,o,oo].reshape(filts2,1,1) * dkkdkr[:,ooo,:,:]).sum(axis=0) #(filts2,) broadcast against (filts2, fsh2, fsw2)

            dkrdk = np.array(k > 0, dtype = float) #relu activation derivative

            dLdk = dLdkr * dkrdk
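            #backpropagate through the first convolution to its kernel (kern) and bias (kb)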

            

            dkdkern = xx

            dLdkern = np.zeros((filts,fsh,fsw))

            for j00 in range(fsh):

                for jj00 in range(fsw):

                    dLdkern[:,j00,jj00] = (dkdkern[j00:j00+rsh, jj00:jj00+rsw].reshape(1,rsh,rsw) * dLdk).sum(axis=(1,2)) #height slice uses rsh, width slice uses rsw

            bdLdkern[i] = dLdkern

            

            dkdkb = 1

            dLdkb = dLdk.sum(axis=(1,2)) * dkdkb

            bdLdkb[i] = dLdkb

            

        #adam optimizer; only updates weights after entire batch has been iterated through
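        #lrt below folds Adam's bias-correction terms, sqrt(1 - b2**t) / (1 - b1**t) with t = e*nbatch + nb + 1, into the learning rate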

        lrt = lr * (1 - b2**(e*nbatch + nb + 1))**.5 / (1 - b1**(e*nbatch + nb + 1))        

        

        mk = b1 * mk + (1 - b1) * bdLdkern.mean(axis=0)

        vk = b2 * vk + (1 - b2) * bdLdkern.mean(axis=0)**2

        kern = kern + lrt * mk / (vk**.5 + eps)


        mkb = b1 * mkb + (1 - b1) * bdLdkb.mean(axis=0)

        vkb = b2 * vkb + (1 - b2) * bdLdkb.mean(axis=0)**2

        kb = kb + lrt * mkb / (vkb**.5 + eps)


        mk2 = b1 * mk2 + (1 - b1) * bdLdkern2.mean(axis=0)

        vk2 = b2 * vk2 + (1 - b2) * bdLdkern2.mean(axis=0)**2

        kern2 = kern2 + lrt * mk2 / (vk2**.5 + eps)


        mkb2 = b1 * mkb2 + (1 - b1) * bdLdkb2.mean(axis=0)

        vkb2 = b2 * vkb2 + (1 - b2) * bdLdkb2.mean(axis=0)**2

        kb2 = kb2 + lrt * mkb2 / (vkb2**.5 + eps)


        mw = b1 * mw + (1 - b1) * bdLdw.mean(axis=0)                                    

        vw = b2 * vw + (1 - b2) * bdLdw.mean(axis=0)**2

        w = w + lrt * mw / (vw**.5 + eps)    


        mb = b1 * mb + (1 - b1) * bdLdb.mean(axis=0)

        vb = b2 * vb + (1 - b2) * bdLdb.mean(axis=0)**2

        b = b + lrt * mb / (vb**.5 + eps)
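#evaluate the trained model: a single forward pass per test image, no weight updates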


checke = np.zeros(tests)               

for i in range(tests):

    xx = testimages[i]

    k = np.zeros((filts, rsh, rsw))

    for j in range(rsh):

        for jj in range(rsw):

            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb

    kr = np.maximum(0, k)

    kk = np.zeros((filts2, rsh2, rsw2))

    for j0 in range(rsh2):

        for jj0 in range(rsw2):

            kk[:,j0,jj0] = (kern2 * kr[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsh2, rsw2)

    kkr = np.maximum(0, kk)

    x = (w * kkr.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b

    if testlabels[i] == np.argmax(x):

        checke[i] = 1

print(len(np.flatnonzero(checke==1))/tests) #fraction of test images classified correctly