In [7]:
def backward_pass(parameters, cache, X, Y):
    
    # unpack parameters and cache to get the values needed for the backpropagation calculations
    W1 = parameters['W1']
    W2 = parameters['W2']
    
    Z1 = cache['Z1']
    A1 = cache['A1']
    Z2 = cache['Z2']
    A2 = cache['A2']
    
    m = X.shape[1]  # number of training examples (examples are stacked as columns of X)
    
    dZ2 = A2 - Y  # gradient w.r.t. Z2; A2 - Y assumes a sigmoid output with cross-entropy cost
    dW2 = (1 / m) * np.dot(dZ2, A1.T)
    db2 = (1 / m) * np.sum(dZ2, axis=1, keepdims=True)  # keepdims=True keeps db2 a column vector (n, 1) instead of a rank-1 array (n,)
    
    dZ1 = np.multiply(np.dot(W2.T, dZ2), 1 - np.power(A1, 2))  # tanh hidden layer: tanh'(Z1) = 1 - A1**2 since A1 = tanh(Z1)
    dW1 = (1 / m) * np.dot(dZ1, X.T)  
    db1 = (1 / m) * np.sum(dZ1, axis=1, keepdims=True)
    
    grads = {"dW1": dW1,
             "db1": db1,
             "dW2": dW2,
             "db2": db2}
    
    return grads
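
As a quick sanity check, here is a minimal sketch of calling backward_pass on random data. The layer sizes, the hand-built parameters, and the forward computation below are illustrative assumptions (the notebook's own initialization and forward-pass cells would normally supply them); the tanh hidden layer and sigmoid output match the gradient formulas above.

In [ ]:
import numpy as np

# Hypothetical smoke test: these shapes and the hand-built cache are
# illustrative assumptions, not outputs of the notebook's other cells.
np.random.seed(0)
n_x, n_h, n_y, m = 2, 4, 1, 5  # input size, hidden size, output size, number of examples

X = np.random.randn(n_x, m)
Y = (np.random.rand(n_y, m) > 0.5).astype(float)

parameters = {'W1': np.random.randn(n_h, n_x) * 0.01,
              'b1': np.zeros((n_h, 1)),
              'W2': np.random.randn(n_y, n_h) * 0.01,
              'b2': np.zeros((n_y, 1))}

# Recompute the cache the way a forward pass would (tanh hidden layer,
# sigmoid output), so it is consistent with the gradients above
Z1 = np.dot(parameters['W1'], X) + parameters['b1']
A1 = np.tanh(Z1)
Z2 = np.dot(parameters['W2'], A1) + parameters['b2']
A2 = 1 / (1 + np.exp(-Z2))
cache = {'Z1': Z1, 'A1': A1, 'Z2': Z2, 'A2': A2}

grads = backward_pass(parameters, cache, X, Y)
print(grads['dW1'].shape, grads['db1'].shape)  # (4, 2) (4, 1)
print(grads['dW2'].shape, grads['db2'].shape)  # (1, 4) (1, 1)

The printed shapes confirm that each gradient matches the shape of the parameter it updates, which is the property gradient descent relies on in the update step.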