Adam Optimizer with CNNs
May 8, 2023
Each optimizer approaches the optimum of an ML model differently, so different models may call for different optimizers. Instead of the more intuitive stochastic gradient descent (SGD), this article uses the popular adaptive moment estimation (Adam) optimizer. The implementation here is inspired by Keras and is built around exponentially weighted averages of the gradient and of its square. Refer to the commented code below to understand the mechanics.
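Before the full walkthrough, here is a minimal sketch of one Adam update for a single parameter array, written in the textbook minimization form with Keras-style defaults; the function and argument names are illustrative and are not the ones used in the walkthrough. The code below folds the two bias-correction terms into a single adjusted learning rate lrt, which is equivalent up to where eps enters, and it adds its update rather than subtracting it, because its dLdy is the derivative of the log of the correct class's probability (gradient ascent).

def adam_step(param, grad, m, v, t, lr=.001, beta1=.9, beta2=.999, eps=1e-7):
    # param, grad, m, v can be scalars or NumPy arrays; t is the 1-indexed step count
    m = beta1 * m + (1 - beta1) * grad      # exponentially weighted average of the gradient (first moment)
    v = beta2 * v + (1 - beta2) * grad**2   # exponentially weighted average of the squared gradient (second moment)
    mhat = m / (1 - beta1**t)               # bias correction: early averages are pulled toward their zero initialization
    vhat = v / (1 - beta2**t)
    param = param - lr * mhat / (vhat**.5 + eps)  # scale the step by the running gradient magnitude
    return param, m, v

# example: one update of a scalar parameter at step t = 1
p, m, v = adam_step(5.0, 2.0, 0.0, 0.0, 1)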
# this code is a walk-through of a 2-layer CNN's forward and backward propagation from scratch, using the Adam optimizer
import numpy as np
mnistlocation = "INSERT MNIST FILE LOCATION" # you can download the file here
# For example: "/Users/enrichmentcap/Downloads/mndata.npz"
trains, tests = 1000, 1000 # depends on processing speed; if your computer is slow, reduce the size of the training and test sets
trainimages, trainlabels, testimages, testlabels = np.load(mnistlocation)['trainimages'][:trains] / 255, np.load(mnistlocation)['trainlabels'][:trains], np.load(mnistlocation)['testimages'][:tests] / 255, np.load(mnistlocation)['testlabels'][:tests]
##from tensorflow.keras.datasets import mnist #uncomment if using tensorflow library to retrieve dataset
##(trainimages, trainlabels), (testimages, testlabels) = mnist.load_data()
##trainimages, testimages = trainimages[:trains]/255, testimages[:tests]/255
np.random.seed(0)
classes = len(np.unique(trainlabels))
imw = trainimages.shape[2]
imh = trainimages.shape[1]
lr = .001 # the Adam optimizer calls for a different (smaller) learning rate than plain SGD
fsw = 8 #kernel1 width
fsh = 8 #kernel1 height
fsw2 = 4 #kernel2 width
fsh2 = 4 #kernel2 height
filts = 16 #kernel1 number of filters
filts2 = 8 #kernel2 number of filters
kern = np.random.rand(filts, fsh, fsw) - .5
kb = np.zeros(filts)
kern2 = np.random.rand(filts2, filts, fsh2, fsw2) - .5
kb2 = np.zeros(filts2)
step = 1 #step sizes of kernel convolutions
step2 = 2
rsw = (imw - fsw) // step + 1 #resulting shape width of kernel1 convolution
rsh = (imh - fsh) // step + 1 #resulting shape height of kernel1 convolution
rsw2 = (rsw - fsw2) // step2 + 1 #resulting shape width of kernel2 convolution
rsh2 = (rsh - fsh2) // step2 + 1 #resulting shape height of kernel2 convolution
w = np.random.rand(classes, filts2, rsh2, rsw2) - .5
b = np.random.rand(classes) - .5
b1 = .9 #adam optimizer parameters
b2 = .99
eps = 1e-7
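# first- and second-moment running averages for each parameter group updated with Adam
# (mw/vw for w, mb/vb for b, mk/vk for kern, mk2/vk2 for kern2), all initialized to zero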
mw = 0
vw = 0
mb = 0
vb = 0
mk = 0
vk = 0
mk2 = 0
vk2 = 0
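# training loop: one image per iteration (effectively batch size 1); forward pass through both convolutions
# and the dense layer, then backpropagation with Adam updates for the weights and kernels
# (the convolution biases kb and kb2 are updated with a plain gradient step)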
for i in range(trains):
    ## print(i)
    xx, label, label[trainlabels[i]] = trainimages[i], np.zeros(classes), 1
    k = np.zeros((filts, rsh, rsw))
    kk = np.zeros((filts2, rsh2, rsw2))
    for j in range(rsh):
        for jj in range(rsw):
            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw].reshape(1,fsh,fsw)).sum(axis=(1,2)) + kb # k.shape = (filts, rsh, rsw) = (16, 21, 21)
    for j0 in range(rsh2):
        for jj0 in range(rsw2):
            kk[:,j0,jj0] = (kern2 * k[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 # kk.shape = (filts2, rsh2, rsw2) = (8, 9, 9)
    x = (w * kk.reshape(1,filts2,rsh2,rsw2)).sum(axis=(1,2,3)) + b
    y = np.exp(x) / np.sum(np.exp(x))
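    # backprop through softmax: dydx holds the derivative of the true class's predicted probability with
    # respect to each logit; chaining with dLdy = 1/y[true] (the derivative of log-probability) gives an
    # ascent direction that increases the log-likelihood of the correct class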
    dydx = -np.exp(x)[trainlabels[i]] * np.exp(x) / np.sum(np.exp(x))**2
    dydx[trainlabels[i]] = np.exp(x)[trainlabels[i]] * (np.sum(np.exp(x)) - np.exp(x)[trainlabels[i]]) / np.sum(np.exp(x))**2
    dLdy = 1 / y[trainlabels[i]]
    dLdx = dLdy * dydx
    dxdw, dxdb = kk, 1
    dLdw = dLdx.reshape(classes,1,1,1) * dxdw.reshape(1, filts2, rsh2, rsw2)
    dLdb = dLdx * dxdb
    ## adam optimizer code block
    lrt = lr * (1 - b2**(i+1))**.5 / (1 - b1**(i+1)) # bias-corrected learning rate: the correction factor equals 1 at the first step, dips quickly, then climbs back toward 1 as training proceeds
    mw = b1 * mw + (1 - b1) * dLdw # exponentially weighted average of the gradient: a large share of the previous average plus a small share of the current slope
    vw = b2 * vw + (1 - b2) * dLdw**2 # exponentially weighted average of the squared gradient, with an even longer memory
    w = w + lrt * mw / (vw**.5 + eps) # better preliminary results were observed with abs(dLdw) in vw and no square root on vw in the update; independent of the prior mw and vw, that variant results in a tanh-like shape, which is more intuitive for how the step should respond as the slope increases or decreases
    mb = b1 * mb + (1 - b1) * dLdb
    vb = b2 * vb + (1 - b2) * dLdb**2
    b = b + lrt * mb / (vb**.5 + eps)
    dxdkk = w # shape (classes, filts2, rsh2, rsw2)
    dLdkk = (dLdx.reshape(classes, 1, 1, 1) * dxdkk).sum(axis=0) # dLdkk integrates the loss from all classes into the corresponding positions of the kk output, i.e. which positions in kk contributed most to the error
    dLdkern2 = np.zeros((filts2,filts,fsh2,fsw2))
    dkkdkern2 = k
    dkkdkb2 = 1
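    # kernel2 gradient: for each offset (j000, jj000) inside the kernel, correlate dLdkk with the layer-1
    # feature maps k at the positions that offset touched during the strided forward convolution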
    for f in range(filts2):
        for j000 in range(fsh2):
            for jj000 in range(fsw2):
                dLdkern2[f, :, j000, jj000] = (dLdkk[f].reshape(1,rsh2,rsw2) * dkkdkern2[:, j000:j000+step2*(rsh2-1)+1:step2, jj000:jj000+step2*(rsw2-1)+1:step2]).sum(axis=(1,2)) # stride the slice by step2 so it lines up with the positions used in the forward pass
    dLdkb2 = dLdkk.sum(axis=(1,2)) * dkkdkb2
    kb2 = kb2 + dLdkb2 * lr
    ## adam optimizer code block
    mk2 = b1 * mk2 + (1 - b1) * dLdkern2
    vk2 = b2 * vk2 + (1 - b2) * dLdkern2**2
    kern2 = kern2 + lrt * mk2 / (vk2**.5 + eps)
    dkkdk = kern2 # shape (filts2, filts, fsh2, fsw2)
    dLdk = np.zeros((filts, rsh, rsw))
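    # route the gradient back to the layer-1 feature maps: each kk position scatters its gradient,
    # weighted by kern2, onto the step2-strided window of k it was computed from (in effect a transposed convolution)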
    for ooo in range(filts):
        for o in range(rsh2):
            for oo in range(rsw2):
                dLdk[ooo,o*step2:o*step2+fsh2,oo*step2:oo*step2+fsw2] += (dLdkk[:,o,oo].reshape(filts2,1,1) * dkkdk[:,ooo,:,:]).sum(axis=0) # (filts2,1,1) * (filts2,fsh2,fsw2), summed over the filts2 axis
    dkdkern = xx
    dkdkb = 1
    dLdkern = np.zeros((filts,fsh,fsw))
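    # kernel1 gradient: for each offset (j00, jj00) inside the kernel, correlate the input image with dLdk
    # over the rsh x rsw output positions (step = 1 here, so no striding is needed)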
    for j00 in range(fsh):
        for jj00 in range(fsw):
            dLdkern[:,j00,jj00] = (dkdkern[j00:j00+rsh, jj00:jj00+rsw].reshape(1,rsh,rsw) * dLdk).sum(axis=(1,2)) # the height slice spans rsh rows and the width slice spans rsw columns
    dLdkb = dLdk.sum(axis=(1,2)) * dkdkb
    kb = kb + dLdkb * lr
    ## adam optimizer code block
    mk = b1 * mk + (1 - b1) * dLdkern
    vk = b2 * vk + (1 - b2) * dLdkern**2
    kern = kern + lrt * mk / (vk**.5 + eps)
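# evaluation: run only the forward pass on the test set with the trained kernels and weights,
# then count how often the largest logit matches the true label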
checke = np.zeros(tests)
for i in range(tests):
    xx = testimages[i]
    k = np.zeros((filts, rsh, rsw))
    kk = np.zeros((filts2, rsh2, rsw2))
    for j in range(rsh):
        for jj in range(rsw):
            k[:,j,jj] = (kern * xx[step*j:step*j+fsh, step*jj:step*jj+fsw]).sum(axis=(1,2)) + kb # k.shape = (filts, rsh, rsw) = (16, 21, 21)
    for j0 in range(rsh2):
        for jj0 in range(rsw2):
            kk[:,j0,jj0] = (kern2 * k[:,step2*j0:step2*j0+fsh2, step2*jj0:step2*jj0+fsw2].reshape(1,filts,fsh2,fsw2)).sum(axis=(1,2,3)) + kb2 # kk.shape = (filts2, rsh2, rsw2) = (8, 9, 9)
    x = (w * kk).sum(axis=(1,2,3)) + b
    if testlabels[i] == np.argmax(x):
        checke[i] = 1
print(len(np.flatnonzero(checke==1))/tests)