Adam Optimizer with CNNs

May 8, 2023


Each ML optimizer approaches a model's optimum differently, so different models may call for different optimizers. Instead of the intuitive stochastic gradient descent (SGD), this article uses the popular adaptive moment estimation (Adam) optimizer. The implementation here is inspired by Keras and builds on the concept of exponentially weighted averages. Refer to the commented code below to understand the mechanics.
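Before walking through the full network, the sketch below shows a single Adam update in isolation. The function and argument names are only illustrative; note that the walkthrough that follows adds its update steps instead of subtracting them, because its gradients point uphill on the log-likelihood of the correct class.

def adam_step(param, grad, m, v, t, lr=.001, b1=.9, b2=.99, eps=1e-7):
    m = b1 * m + (1 - b1) * grad #first moment: exponentially weighted average of the gradient
    v = b2 * v + (1 - b2) * grad**2 #second moment: exponentially weighted average of the squared gradient
    lrt = lr * (1 - b2**t)**.5 / (1 - b1**t) #bias correction folded into the step size; t starts at 1
    return param - lrt * m / (v**.5 + eps), m, v #standard Adam descends the loss surface
    #illustrative usage: param, m, v = adam_step(param, dLdparam, m, v, i + 1)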

# this code is a walkthrough of a 2-layer CNN's forward and backward propagation from scratch, using the Adam optimizer

import numpy as np

mnistlocation = "INSERT MNIST FILE LOCATION" #you can download the file here

# For example: "/Users/enrichmentcap/Downloads/mndata.npz"

trains, tests = 1000, 1000 #size of training and test sets; if your computer is slow, reduce these

mnistdata = np.load(mnistlocation)
trainimages, trainlabels = mnistdata['trainimages'][:trains] / 255, mnistdata['trainlabels'][:trains]
testimages, testlabels = mnistdata['testimages'][:tests] / 255, mnistdata['testlabels'][:tests]


##from tensorflow.keras.datasets import mnist #uncomment if using tensorflow library to retrieve dataset

##(trainimages, trainlabels), (testimages, testlabels) = mnist.load_data()

##trainimages, testimages = trainimages[:trains]/255, testimages[:tests]/255
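## to recreate the local npz file used above from the tensorflow download, one option is the sketch below, using the same key names the loader expects:

##np.savez(mnistlocation, trainimages=trainimages, trainlabels=trainlabels, testimages=testimages, testlabels=testlabels) #save the raw arrays from mnist.load_data() (before the /255 line above) so the loader's own scaling applies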


np.random.seed(0)

classes = len(np.unique(trainlabels))

imw = trainimages.shape[2] 

imh = trainimages.shape[1]


lr = .001 #Adam generally works best with a smaller learning rate than plain SGD


fsw = 8 #kernel1 width

fsh = 8 #kernel1 height

fsw2 = 4 #kernel2 width

fsh2 = 4 #kernel2 height

filts =  16 #kernel1 number of filters

filts2 = 8 #kernel2 number of filters

kern = np.random.rand(filts, fsh, fsw) - .5

kb = np.zeros(filts)

kern2 = np.random.rand(filts2, filts, fsh2, fsw2) - .5

kb2 = np.zeros(filts2)

step = 1 #step sizes of kernel convolutions

step2 = 2

rsw = (imw - fsw) // step + 1 #resulting shape width of kernel1 convolution

rsh = (imh - fsh) // step + 1 #resulting shape height of kernel1 convolution

rsw2 = (rsw - fsw2) // step2 + 1 #resulting shape width of kernel2 convolution

rsh2 = (rsh - fsh2) // step2 + 1 #resulting shape height of kernel2 convolution

w = np.random.rand(classes, filts2, rsh2, rsw2) - .5

b = np.random.rand(classes) - .5


b1 = .9 #adam optimizer parameters

b2 = .99 #note: the Keras default is .999

eps = 1e-7

mw = 0

vw = 0

mb = 0

vb = 0

mk = 0

vk = 0

mk2 = 0

vk2 = 0
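#the moment accumulators above start as scalars; numpy broadcasting expands them to each parameter's shape on the first update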


for i in range(trains):

##    print(i)

    xx = trainimages[i]
    label = np.zeros(classes)
    label[trainlabels[i]] = 1 #one-hot label (the gradient code below indexes trainlabels[i] directly)

    k = np.zeros((filts, rsh, rsw))

    kk = np.zeros((filts2, rsh2, rsw2))

    for j in range(rsh):

        for jj in range(rsw):

            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb #k.shape = [filts,rsize,rsize] = [16,21,21]

    for j0 in range(rsh2):

        for jj0 in range(rsw2):

            kk[:,j0,jj0] = (kern2 * k[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsize2, rsize2)

    x = (w * kk.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b


    y = np.exp(x) / np.sum(np.exp(x))

    dydx = -np.exp(x)[trainlabels[i]] * np.exp(x) / np.sum(np.exp(x))**2

    dydx[trainlabels[i]] = np.exp(x)[trainlabels[i]] * (np.sum(np.exp(x)) - np.exp(x)[trainlabels[i]]) / np.sum(np.exp(x))**2

    dLdy = 1 / y[trainlabels[i]]
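    #the quantity being optimized is the log-likelihood of the correct class, L = log(y[true]), hence dL/dy[true] = 1/y[true]
    #dydx above is the softmax Jacobian row for the true class, so dLdx below is the gradient of log(y[true]) with respect to the logits x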

    

    dLdx = dLdy * dydx

    dxdw, dxdb = kk, 1

    dLdw = dLdx.reshape(classes,1,1,1) * dxdw.reshape(1, filts2, rsh2, rsw2)

    dLdb = dLdx * dxdb


## adam optimizer code block

    lrt = lr * (1 - b2**(i+1))**.5 / (1 - b1**(i+1)) #bias-corrected step size: with these b1 and b2 the correction factor starts near 1, drops quickly, then gradually climbs back toward 1 as training proceeds

    mw = b1 * mw + (1 - b1) * dLdw #first moment: keeps 90% of the previous estimate and blends in 10% of the current gradient

    vw = b2 * vw + (1 - b2) * dLdw**2 #second moment: keeps 99% of the previous estimate and blends in 1% of the current squared gradient

    w = w + lrt * mw / (vw**.5 + eps) #gradient-ascent step on the log-likelihood, scaled per weight by the running moments; preliminary results were better using abs(dLdw) in vw and dropping the square root here, which (ignoring the running history) gives a tanh-like response to the slope

    mb = b1 * mb + (1 - b1) * dLdb

    vb = b2 * vb + (1 - b2) * dLdb**2

    b = b + lrt * mb / (vb**.5 + eps)   


    dxdkk = w #(classes, filts2, rsh2, rsw2)

    dLdkk = (dLdx.reshape(classes, 1, 1, 1) * dxdkk).sum(axis=0) #dLdkk accumulates every class's loss gradient into the corresponding positions of the kk output, i.e. which positions in kk contributed most to the loss

    dLdkern2 =  np.zeros((filts2,filts,fsh2,fsw2))

    dkkdkern2 = k

    dkkdkb2 = 1


    for f in range(filts2):

        for j000 in range(fsh2):

            for jj000 in range(fsw2):

                dLdkern2[f, :, j000, jj000] = (dLdkk[f].reshape(1,rsh2,rsw2) * dkkdkern2[:,j000:j000+rsh2,jj000:jj000+rsw2]).sum(axis=(1,2))

    dLdkb2 = dLdkk.sum(axis=(1,2)) * dkkdkb2

    

    kb2 = kb2 + dLdkb2 * lr #kernel2 bias updated with a plain gradient step (no Adam moments)


## adam optimizer code block

    mk2 = b1 * mk2 + (1 - b1) * dLdkern2

    vk2 = b2 * vk2 + (1 - b2) * dLdkern2**2

    kern2 = kern2 + lrt * mk2 / (vk2**.5 + eps)


    dkkdk = kern2 #(filts2, filts, fsh2, fsw2)

    dLdk = np.zeros((filts, rsh, rsw))


    for ooo in range(filts):

        for o in range(rsh2):

            for oo in range(rsw2):

                dLdk[ooo,o*step2:o*step2+fsh2,oo*step2:oo*step2+fsw2] += (dLdkk[:,o,oo].reshape(filts2,1,1) * dkkdk[:,ooo,:,:]).sum(axis=0) #[filts2,] @ [filts2,fsize2,fsize2]

    dkdkern = xx

    dkdkb = 1

    dLdkern = np.zeros((filts,fsh,fsw))

    for j00 in range(fsh):

        for jj00 in range(fsw):

            dLdkern[:,j00,jj00] = (dkdkern[j00:j00+rsh, jj00:jj00+rsw].reshape(1,rsh,rsw) * dLdk).sum(axis=(1,2)) #rows of the image patch span rsh, columns span rsw, matching dLdk's shape

    dLdkb = dLdk.sum(axis=(1,2)) * dkdkb

    

    kb = kb + dLdkb * lr #kernel1 bias updated with a plain gradient step (no Adam moments)


## adam optimizer code block

    mk = b1 * mk + (1 - b1) * dLdkern

    vk = b2 * vk + (1 - b2) * dLdkern**2

    kern = kern + lrt * mk / (vk**.5 + eps)

        

##loss = np.array([loss[i:i+100].mean() for i in range(trains - 100)]) #optional: 100-step moving average of a per-step loss array, if one is recorded during training


checke = np.zeros(tests)               

for i in range(tests):

    xx = testimages[i]

    k = np.zeros((filts, rsh, rsw))

    kk = np.zeros((filts2, rsh2, rsw2))

    for j in range(rsh):

        for jj in range(rsw):

            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw]).sum(axis=(1,2)) + kb #k.shape = [filts,rsize,rsize] = [16,21,21]

    for j0 in range(rsh2):

        for jj0 in range(rsw2):

            kk[:,j0,jj0] = (kern2 * k[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 #kk.shape = (filts2, rsize2, rsize2)   

    x = (w * kk).sum(axis=(1,2,3)) + b 

    if testlabels[i] == np.argmax(x):

        checke[i] = 1

print(len(np.flatnonzero(checke==1))/tests) #test accuracy: fraction of test images classified correctly
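For comparison, the hyperparameters above map directly onto Keras' built-in Adam optimizer, which inspired this implementation. A minimal sketch (assuming TensorFlow is installed; note that Keras' default beta_2 is .999, whereas this walkthrough uses .99):

from tensorflow.keras.optimizers import Adam

opt = Adam(learning_rate=.001, beta_1=.9, beta_2=.99, epsilon=1e-7) #same lr, b1, b2, and eps as the from-scratch version above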