testing COMPs-fastPcum
In this notebook, we study how homeostasis (cooperation) may be an essential ingredient for the proper functioning of this algorithm, which operates on a winner-take-all basis (competition). This extension has been published as Perrinet, Neural Computation (2010) (see https://laurentperrinet.github.io/publication/perrinet-10-shl ). Compared to the previous post, we optimize the code to make it faster.
See also the other posts on unsupervised learning.
This is joint work with Victor Boutin.
Summary: using the fast Pcum functions gives an approximately 80-fold speed-up; the non-linear modulation functions still need to be learned.
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import numpy as np
np.set_printoptions(formatter = dict( float = lambda x: "%.3g" % x ), precision=3, suppress=True, threshold=np.inf)
from shl_scripts.shl_experiments import SHL
import time
%load_ext autoreload
%autoreload 2
DEBUG = True
DEBUG = False
if not DEBUG:
    matname = '2017-05-31_Testing_COMPs'
    DEBUG_DOWNSCALE = 1
else:
    matname = '2017-05-31_Testing_COMPs-DEBUG'
    DEBUG_DOWNSCALE = 10
seed = 42
nb_quant = 512
C = 5.
do_sym = False
from shl_scripts.shl_experiments import SHL
shl = SHL(DEBUG_DOWNSCALE=DEBUG_DOWNSCALE,
          datapath='/tmp/database', seed=seed,
          eta=0.05, verbose=2, record_each=50, n_iter=1000, eta_homeo=0., alpha_homeo=1.,
          do_sym=do_sym, nb_quant=nb_quant, C=C)
data = shl.get_data(matname=matname)
test_size = data.shape[0]//2
data_training = data[:test_size, :]
data_test = data[test_size:,:]
#DEBUG
test_size = data.shape[0]//20
data_training = data[:(data.shape[0]-test_size),:].copy()
data_test = data[:test_size, :].copy()
dico_partial_learning = shl.learn_dico(data=data_training, matname=matname)
We start off with a short learning run with no homeostasis, such that we end up with an unbalanced dictionary:
fig, ax = shl.show_dico(dico_partial_learning, data=data, title=matname)
fig.show()
fig, ax = shl.time_plot(dico_partial_learning, variable='prob_active');
fig.show()
Classical MP¶
n_samples, n_pixels = data_test.shape
n_dictionary, n_pixels = dico_partial_learning.dictionary.shape
norm_each_filter = np.sqrt(np.sum(dico_partial_learning.dictionary**2, axis=1))
dico_partial_learning.dictionary /= norm_each_filter[:,np.newaxis]
sparse_code_mp = shl.code(data_test, dico_partial_learning, matname=matname)
def plot_proba_histogram(coding, verbose=False):
    n_dictionary = coding.shape[1]
    p = np.count_nonzero(coding, axis=0)/coding.shape[1]
    p /= p.sum()
    rel_ent = np.sum( -p * np.log(p)) / np.log(n_dictionary)
    if verbose: print('Entropy / Entropy_max=', rel_ent )
    fig = plt.figure(figsize=(16, 4))
    ax = fig.add_subplot(111)
    ax.bar(np.arange(n_dictionary), p*n_dictionary)
    ax.set_title('distribution of the selection probability - entropy= ' + str(rel_ent) )
    ax.set_ylabel('pdf')
    ax.axis('tight')
    ax.set_xlim(0, n_dictionary)
    return fig, ax
fig, ax = plot_proba_histogram(sparse_code_mp)
COMP : learning modulations¶
computing histograms is rather fast:
np.linspace(0., 1, nb_quant, endpoint=True)
def get_P_cum(code, C, nb_quant=100, do_sym=True, verbose=False):
    from shl_scripts.shl_encode import rescaling
    p_c = rescaling(code=code, C=C, do_sym=do_sym, verbose=verbose)
    n_samples, nb_filter = code.shape
    code_bins = np.linspace(0., 1., nb_quant, endpoint=True)
    P_cum = np.zeros((nb_filter, nb_quant))
    for i in range(nb_filter):
        p, bins = np.histogram(p_c[:, i], bins=code_bins, density=True)
        p /= p.sum()
        P_cum[i, :] = np.hstack((0, np.cumsum(p)))
    return P_cum
%%time
P_cum = get_P_cum(sparse_code_mp, C=C, nb_quant=nb_quant)
P_cum = get_P_cum(sparse_code_mp, nb_quant=shl.nb_quant, C=C, do_sym=do_sym)
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum, verbose=False, alpha=.05)
ax.set_ylim(0.85, 1.01);
from shl_scripts.shl_encode import rescaling
corr = (data_test @ dico_partial_learning.dictionary.T)[0, :]
print('correlation=', corr)
print('transformed correlation=', rescaling(corr, C=C, do_sym=do_sym))
%%time
code_bins = np.linspace(0., 1, shl.nb_quant, endpoint=True)
def quantile(P_cum, c):
    q_res = np.zeros_like(c)
    for i in range(P_cum.shape[0]):
        q_res[i] = np.interp(c[i], code_bins, P_cum[i, :])
    return q_res
q_vanilla = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
print('quantiles=', q_vanilla)
%%timeit
quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
optimizing the quantile function: fast interpolation¶
Let's try a quick lookup in the linearly spaced bins, either with or without linear interpolation:
def quantile(P_cum, p_c, do_linear=True):
    q_res = np.zeros_like(p_c)
    for i in range(P_cum.shape[0]):
        if not do_linear:
            # nearest-neighbor lookup: take the value at the floor index
            q_res[i] = P_cum[i, int(p_c[i]*shl.nb_quant)]
        else:
            if p_c[i]>0:
                # linear interpolation between the floor and ceiling bins
                p = p_c[i]*shl.nb_quant - int(p_c[i]*shl.nb_quant)
                floor = P_cum[i, int(p_c[i]*shl.nb_quant)]
                ceil = P_cum[i, int(p_c[i]*shl.nb_quant)+1 -(p_c[i]==1) ]
                q_res[i] = (1-p) * floor + p * ceil
    return q_res
%%time
q_ind = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym), do_linear=False)
print('relative error=', np.sum((q_ind-q_vanilla)**2) / np.sum(q_vanilla**2))
%%time
q_ind = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym), do_linear=True)
print('relative error=', np.sum((q_ind-q_vanilla)**2)/np.sum((q_vanilla)**2))
%%timeit
quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
print('quantiles=', q_ind)
print('max difference=', (q_ind-q_vanilla).max())
This small difference comes from the fact that we switched from a linear to a "nearest neighbor" interpolation. This code is approximately 8 times faster...
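To make this concrete, here is a small self-contained toy comparison (the monotone function F and the query points x below are made up for illustration, independent of the notebook's P_cum) of a floor-index lookup in linearly spaced bins against np.interp; the two agree up to a small discretization error:
import numpy as np                         # already imported above; repeated so the snippet stands alone
nb_bins = 512
bins = np.linspace(0., 1., nb_bins, endpoint=True)
F = bins**2                                # a toy monotone "cumulative" function
x = np.random.rand(100)                    # toy query points in [0, 1)
q_linear = np.interp(x, bins, F)           # the "vanilla" linear interpolation
q_nearest = F[(x * nb_bins).astype(int)]   # floor-index ("nearest neighbor") lookup
print('max abs difference =', np.abs(q_linear - q_nearest).max())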
optimizing the quantile function: vectorization¶
Let's proceed to vectorize this function further:
(rescaling(corr, C=C, do_sym=do_sym)*shl.nb_quant).astype(np.int)
print(P_cum[0, :], P_cum.ravel()[:shl.nb_quant])
Adding an offset to scan the raveled P_cum matrix; we denote it a "stick", marking the starting index of each line of P_cum:
stick = np.arange(P_cum.shape[0])*shl.nb_quant
stick
such that the indices in the raveled matrix are:
(rescaling(corr, C=C, do_sym=do_sym)*shl.nb_quant).astype(np.int) + stick
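To make the trick explicit, here is a tiny standalone check (on a toy array, not P_cum) that indexing the raveled matrix with a per-line offset picks one entry per line:
A = np.arange(12).reshape(3, 4)                  # toy matrix: 3 lines, 4 bins
cols = np.array([1, 3, 0])                       # one (column) index per line
stick_toy = np.arange(A.shape[0]) * A.shape[1]   # offset of each line in the raveled array: 0, 4, 8
print(A.ravel()[cols + stick_toy])               # [1 7 8], i.e. A[0, 1], A[1, 3], A[2, 0]
print(A[np.arange(A.shape[0]), cols])            # same result with standard fancy indexing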
We should be careful with border cases such as $p_c=1$ :
p_c = np.ones_like(corr)
print('Index before correction', (p_c*shl.nb_quant ).astype(np.int) + stick)
print('Index after correction', (p_c*shl.nb_quant - (p_c==1)).astype(np.int) + stick)
p_c = rescaling(corr, C=C, do_sym=do_sym)
%%time
def quantile(P_cum, p_c):
    return P_cum.ravel()[(p_c*shl.nb_quant - (p_c==1)).astype(np.int) + np.arange(P_cum.shape[0])*shl.nb_quant]
q_vec = quantile(P_cum, p_c)
%%timeit
quantile(P_cum, p_c)
Going even a bit further, using a precomputed "stick" for the different indices yields:
stick = np.arange(P_cum.shape[0])*shl.nb_quant
print("shape of stick is ", stick.shape)
print("shape of vector ", (p_c*P_cum.shape[1]).astype(np.int).shape)
%%time
def quantile(P_cum, p_c, stick):
    return P_cum.ravel()[(p_c*shl.nb_quant - (p_c==1)).astype(np.int) + stick]
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative difference=', (q_vec-q_vanilla).std()/(q_vanilla).std())
print('relative difference=', (q_vec-q_ind).std()/(q_ind).std())
Finally, vectorization allows us to go roughly $10\times 8$ times faster overall.
Note: we check that the values are still correct at the borders:
#from shl_scripts.shl_encode import quantile, rescaling
stick = np.arange(shl.n_dictionary)*shl.nb_quant
print('Value for ones = ', quantile(P_cum, rescaling(np.inf*np.ones(shl.n_dictionary), C=C), stick))
print('Value for zeros = ', quantile(P_cum, rescaling(np.zeros(shl.n_dictionary), C=C), stick))
optimizing the quantile function: vectorization with linear interpolation¶
To improve the precision (and thus decrease the number of quantization steps) we can do a linear interpolation:
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int) # (floor) index of each p_c in the respective line of P_cum
    p = p_c*shl.nb_quant - indices # ratio between floor and ceil
    floor = P_cum.ravel()[indices - (p_c==1) + stick] # floor, accounting for extremes, and moved on the raveled P_cum matrix
    ceil = P_cum.ravel()[indices + 1 - (p_c==0) - (p_c==1) - (indices>=P_cum.shape[1]-1) + stick] # ceiling, accounting for both extremes, and moved similarly
    return (1-p) * floor + p * ceil
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative error=', np.sum((q_vec-q_vanilla)**2)/np.sum((q_vanilla)**2))
print(np.prod(P_cum.shape))
Note that when interpolating, we should be careful with border cases such as $p_c=1$, or more generally the cases where the index is the last of its line:
p_c = np.ones_like(corr)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
corr_test = np.zeros_like(corr)
p_c = rescaling(corr_test, C=C, do_sym=do_sym)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
p_c = 0.9999*np.ones_like(corr)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
Another check:
print('Value for ones = ', quantile(P_cum, rescaling(np.inf*np.ones(shl.n_dictionary), C=C), stick))
print('Value for 0.999 = ', quantile(P_cum, rescaling(0.999*np.ones(shl.n_dictionary), C=C), stick))
print('Value for zeros = ', quantile(P_cum, rescaling(np.zeros(shl.n_dictionary), C=C), stick))
Do we get faster by accessing all indices at once?
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int)
    p = p_c*shl.nb_quant - indices
    q = P_cum.ravel()[list(indices - (p_c==1) + stick) + list(indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)]
    return (1-p) * q[:len(indices)] + p * q[len(indices):]
q_vec = quantile(P_cum, p_c, stick)
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int)
    p = p_c*shl.nb_quant - indices
    q = P_cum.ravel()[np.hstack(((indices - (p_c==1) + stick), (indices + 1 - (p_c==0) - 2*(p_c==1) + stick)))]
    return (1-p) * q[:len(indices)] + p * q[len(indices):]
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative error=', np.sum((q_vec-q_vanilla)**2)/np.sum((q_vanilla)**2))
Not really, let's keep the above solution...
COMP : using modulations¶
Let's use this new quantile function:
from shl_scripts.shl_encode import quantile, rescaling
l0_sparseness = shl.l0_sparseness
def comp(data, dico, P_cum, C=C, do_sym=do_sym, l0_sparseness=l0_sparseness, verbose=0):
    if verbose!=0: t0 = time.time()
    n_samples, n_dictionary = data.shape[0], dico.shape[0]
    sparse_code = np.zeros((n_samples, n_dictionary))
    if P_cum is not None:
        nb_quant = P_cum.shape[1]
        stick = np.arange(n_dictionary)*nb_quant
    corr = (data @ dico.T)
    Xcorr = (dico @ dico.T)
    for i_sample in range(n_samples):
        c = corr[i_sample, :].copy()
        if verbose!=0: ind_list=list()
        for i_l0 in range(int(l0_sparseness)):
            # modulate the correlations through the cumulative functions (COMP), or use them directly (plain MP)
            if P_cum is None:
                q_i = rescaling(c, C=C, do_sym=do_sym)
            else:
                q_i = quantile(P_cum, rescaling(c, C=C, do_sym=do_sym), stick, do_fast=True)
            ind = np.argmax(q_i)
            if verbose!=0: ind_list.append(ind)
            # update the coefficient of the winning atom and deflate the residual correlations
            c_ind = c[ind] / Xcorr[ind, ind]
            sparse_code[i_sample, ind] += c_ind
            c -= c_ind * Xcorr[ind, :]
        if verbose!=0 and i_sample in range(2):
            print(ind_list, [q_i[i] for i in ind_list], np.median(q_i), q_i.max(), [c[i] for i in ind_list], c.min(), c.max())
    if verbose!=0:
        duration = time.time()-t0
        print('coding duration : {0}s'.format(duration))
    return sparse_code
#sparse_code = comp(data_test, dico_partial_learning.dictionary, code_bins, P_cum, verbose=1)
%%timeit
sparse_code = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=0)
def plot_scatter_MpVsComp(sparse_vector, my_sparse_code):
    fig = plt.figure(figsize=(16, 16))
    ax = fig.add_subplot(111)
    a_min = np.min((sparse_vector.min(), my_sparse_code.min()))
    a_max = np.max((sparse_vector.max(), my_sparse_code.max()))
    ax.plot(np.array([a_min, a_max]), np.array([a_min, a_max]), 'k--', lw=2)
    print(sparse_vector.shape, my_sparse_code.shape)
    ax.scatter(sparse_vector.ravel(), my_sparse_code.ravel(), alpha=0.01)
    ax.set_title('MP')
    ax.set_ylabel('COMP')
    #ax.set_xlim(0)
    #ax.set_ylim(0)
    ax.axis('equal')
    return fig, ax
#fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code)
#fig.show()
testing that COMP with fixed Pcum is equivalent to MP¶
When homeostasis is not enabled, we set the P_cum matrix to None:
print(dico_partial_learning.P_cum)
l0_sparseness = 10 # shl.l0_sparseness
#sparse_code_mp = shl.code(data_test, dico_partial_learning, matname=matname, l0_sparseness=l0_sparseness)
sparse_code_mp = comp(data_test, dico_partial_learning.dictionary, P_cum=None, C=C, do_sym=do_sym, verbose=1, l0_sparseness=l0_sparseness)
n_samples, nb_filter = sparse_code_mp.shape
P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
sparse_code_comp = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=1, l0_sparseness=l0_sparseness)
print('Relative difference = ', np.sum((sparse_code_mp - sparse_code_comp)**2)/np.sum((sparse_code_mp)**2))
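This near-zero difference is expected: a linear P_cum is the identity mapping, so it preserves the ordering of the rescaled correlations and the argmax picks the same atom as plain MP. A quick toy check (using np.interp on made-up values rather than the library's quantile function):
P_lin = np.linspace(0, 1, nb_quant, endpoint=True)            # one linear line of P_cum
p_c_toy = np.array([0.1, 0.7, 0.3])                           # made-up rescaled correlations
q_toy = np.interp(p_c_toy, np.linspace(0, 1, nb_quant, endpoint=True), P_lin)
print(q_toy)                                                  # identity: ~[0.1, 0.7, 0.3]
print(np.argmax(q_toy) == np.argmax(p_c_toy))                 # True: the same atom is selected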
fig, ax = plot_proba_histogram(sparse_code_mp)
fig.show()
fig, ax = plot_proba_histogram(sparse_code_comp)
fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code_comp)
gradient descent¶
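This restates the update performed in the loop below: at each step, the empirical cumulative histogram $\hat{P}_{cum}$ of the current sparse code is computed and the modulation functions are moved toward it with learning rate $\eta_{homeo}$:
$$P_{cum} \leftarrow (1-\eta_{homeo}) \, P_{cum} + \eta_{homeo} \, \hat{P}_{cum}$$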
P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
print('Shape of modulation function', P_cum.shape)
eta_homeo = .01
for i in range(1000//DEBUG_DOWNSCALE):
    sparse_code = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=0)
    P_cum_ = get_P_cum(sparse_code, C=C, nb_quant=nb_quant)
    P_cum = (1-eta_homeo) * P_cum + eta_homeo * P_cum_
    if i % (100//DEBUG_DOWNSCALE) == 0:
        print('Learning step', i)
        fig, ax = plot_proba_histogram(sparse_code)
        plt.show()