testing COMPs-fastPcum
In this notebook, we will study how homeostasis (cooperation) may be an essential ingredient for this algorithm, which otherwise works on a winner-take-all basis (competition). This extension has been published as Perrinet, Neural Computation (2010) (see https://laurentperrinet.github.io/publication/perrinet-10-shl ). Compared to the previous post, we optimize the code to make it faster.
See also the other posts on unsupervised learning.
This is joint work with Victor Boutin.
Summary: using fast Pcum functions yields an approximately 80-fold speed-up, but the non-linear modulation functions still need to be learned.
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import numpy as np
np.set_printoptions(formatter = dict( float = lambda x: "%.3g" % x ), precision=3, suppress=True, threshold=np.inf)
from shl_scripts.shl_experiments import SHL
import time
%load_ext autoreload
%autoreload 2
DEBUG = True
DEBUG = False
if not DEBUG:
    matname = '2017-05-31_Testing_COMPs'
    DEBUG_DOWNSCALE = 1
else:
    matname = '2017-05-31_Testing_COMPs-DEBUG'
    DEBUG_DOWNSCALE = 10
seed = 42
nb_quant = 512
C = 5.
do_sym = False
from shl_scripts.shl_experiments import SHL
shl = SHL(DEBUG_DOWNSCALE=DEBUG_DOWNSCALE,
          datapath='/tmp/database', seed=seed,
          eta=0.05, verbose=2, record_each=50, n_iter=1000, eta_homeo=0., alpha_homeo=1.,
          do_sym=do_sym, nb_quant=nb_quant, C=C)
data = shl.get_data(matname=matname)
test_size = data.shape[0]//2
data_training = data[:test_size, :]
data_test = data[test_size:,:]
#DEBUG
test_size = data.shape[0]//20
data_training = data[:(data.shape[0]-test_size),:].copy()
data_test = data[:test_size, :].copy()
dico_partial_learning = shl.learn_dico(data=data_training, matname=matname)
We start off with a short learning phase with no homeostasis, such that we end up with an unbalanced dictionary:
fig, ax = shl.show_dico(dico_partial_learning, data=data, title=matname)
fig.show()
fig, ax = shl.time_plot(dico_partial_learning, variable='prob_active');
fig.show()
Classical MP¶
n_samples, n_pixels = data_test.shape
n_dictionary, n_pixels = dico_partial_learning.dictionary.shape
norm_each_filter = np.sqrt(np.sum(dico_partial_learning.dictionary**2, axis=1))
dico_partial_learning.dictionary /= norm_each_filter[:,np.newaxis]
sparse_code_mp = shl.code(data_test, dico_partial_learning, matname=matname)
def plot_proba_histogram(coding, verbose=False):
    n_dictionary = coding.shape[1]
    p = np.count_nonzero(coding, axis=0)/coding.shape[1]
    p /= p.sum()
    rel_ent = np.sum(-p * np.log(p)) / np.log(n_dictionary)
    if verbose: print('Entropy / Entropy_max=', rel_ent)
    fig = plt.figure(figsize=(16, 4))
    ax = fig.add_subplot(111)
    ax.bar(np.arange(n_dictionary), p*n_dictionary)
    ax.set_title('distribution of the selection probability - entropy= ' + str(rel_ent))
    ax.set_ylabel('pdf')
    ax.axis('tight')
    ax.set_xlim(0, n_dictionary)
    return fig, ax
fig, ax = plot_proba_histogram(sparse_code_mp)
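As a quick sanity check of the entropy measure used in this plot, here is a minimal standalone sketch (independent of shl_scripts) showing that the relative entropy equals 1 for a uniform selection probability and drops when a few atoms dominate:
import numpy as np

def relative_entropy(p):
    p = p / p.sum()                              # normalize to a probability distribution
    return np.sum(-p * np.log(p)) / np.log(len(p))

n_dictionary = 100                               # arbitrary toy size
p_uniform = np.ones(n_dictionary)                # perfectly balanced selection
p_peaked = np.ones(n_dictionary)
p_peaked[:5] = 100.                              # a handful of atoms hog the selections
print(relative_entropy(p_uniform))               # -> 1.0
print(relative_entropy(p_peaked))                # -> well below 1.0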
COMP : learning modulations¶
Computing histograms is rather fast:
np.linspace(0., 1, nb_quant, endpoint=True)
def get_P_cum(code, C, nb_quant=100, do_sym=True, verbose=False):
    from shl_scripts.shl_encode import rescaling
    p_c = rescaling(code=code, C=C, do_sym=do_sym, verbose=verbose)
    n_samples, nb_filter = code.shape
    code_bins = np.linspace(0., 1., nb_quant, endpoint=True)
    P_cum = np.zeros((nb_filter, nb_quant))
    for i in range(nb_filter):
        p, bins = np.histogram(p_c[:, i], bins=code_bins, density=True)
        p /= p.sum()
        P_cum[i, :] = np.hstack((0, np.cumsum(p)))
    return P_cum
%%time
P_cum = get_P_cum(sparse_code_mp, C=C, nb_quant=nb_quant)
P_cum = get_P_cum(sparse_code_mp, nb_quant=shl.nb_quant, C=C, do_sym=do_sym)
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum, verbose=False, alpha=.05)
ax.set_ylim(0.85, 1.01);
from shl_scripts.shl_encode import rescaling
corr = (data_test @ dico_partial_learning.dictionary.T)[0, :]
print('correlation=', corr)
print('transformed correlation=', rescaling(corr, C=C, do_sym=do_sym))
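For intuition, rescaling squashes the raw correlations into the $[0, 1]$ interval on which the modulation functions are sampled. The exact definition lives in shl_scripts.shl_encode; a hypothetical squashing non-linearity of the same flavor (an assumption for illustration only, not the package's actual code) could look like:
import numpy as np

def toy_rescaling(code, C=5., do_sym=False):
    # hypothetical saturating map into [0, 1); not shl_scripts.shl_encode.rescaling
    if do_sym:
        code = np.abs(code)                      # treat negative correlations symmetrically
    return np.where(code > 0, 1. - np.exp(-code / C), 0.)

c = np.array([-2., 0., 1., 5., 50.])
print(toy_rescaling(c, C=5.))                    # negatives map to 0, large values saturate near 1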
%%time
code_bins = np.linspace(0., 1, shl.nb_quant, endpoint=True)
def quantile(P_cum, c):
    q_res = np.zeros_like(c)
    for i in range(P_cum.shape[0]):
        q_res[i] = np.interp(c[i], code_bins, P_cum[i, :])
    return q_res
q_vanilla = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
print('quantiles=', q_vanilla)
%%timeit
quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
optimizing the quantile function: fast interpolation¶
Let's try a quick interpolation in the linearly spaced bins either with or without linear interpolation:
def quantile(P_cum, p_c, do_linear=True):
    q_res = np.zeros_like(p_c)
    for i in range(P_cum.shape[0]):
        if not do_linear:
            q_res[i] = P_cum[i, int(p_c[i]*shl.nb_quant)]
        else:
            if p_c[i] > 0:
                p = p_c[i]*shl.nb_quant - int(p_c[i]*shl.nb_quant)
                floor = P_cum[i, int(p_c[i]*shl.nb_quant)]
                ceil = P_cum[i, int(p_c[i]*shl.nb_quant)+1 - (p_c[i]==1)]
                q_res[i] = (1-p) * floor + p * ceil
    return q_res
%%time
q_ind = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym), do_linear=False)
print('relative error=', np.sum((q_ind-q_vanilla)**2) / np.sum(q_vanilla**2))
%%time
q_ind = quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym), do_linear=True)
print('relative error=', np.sum((q_ind-q_vanilla)**2)/np.sum((q_vanilla)**2))
%%timeit
quantile(P_cum, rescaling(corr, C=C, do_sym=do_sym))
print('quantiles=', q_ind)
print('quantiles=', (q_ind-q_vanilla).max())
This small difference comes from the fact that we switched from a linear to a "nearest neighbor" interpolation. This code is approximately 8 times faster...
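To make the trade-off concrete, here is a small standalone comparison (on a toy cumulative function, not the P_cum learned above) of the floor-bin lookup against np.interp:
import numpy as np

nb_quant = 32                                    # deliberately coarse so the gap is visible
code_bins = np.linspace(0., 1., nb_quant, endpoint=True)
P_row = np.sqrt(code_bins)                       # a toy monotonic cumulative function
p_c = np.random.rand(5)                          # toy rescaled coefficients in [0, 1)

q_linear = np.interp(p_c, code_bins, P_row)      # reference: linear interpolation
q_nearest = P_row[(p_c * (nb_quant - 1)).astype(int)]  # floor-bin ("nearest neighbor" style) lookup

print('max absolute difference =', np.abs(q_linear - q_nearest).max())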
optimizing the quantile function: vectorization¶
Let's proceed to vectorize this function further:
(rescaling(corr, C=C, do_sym=do_sym)*shl.nb_quant).astype(np.int)
print(P_cum[0, :], P_cum.ravel()[:shl.nb_quant])
Adding an offset to scan the raveled P_cum matrix; we denote it as a "stick" marking the starting index of each line of P_cum:
stick = np.arange(P_cum.shape[0])*shl.nb_quant
stick
such that the indices in the raveled matrix are:
(rescaling(corr, C=C, do_sym=do_sym)*shl.nb_quant).astype(np.int) + stick
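As a sanity check, this offset trick simply exploits the identity P_cum[i, j] == P_cum.ravel()[i*nb_quant + j]; a minimal standalone illustration:
import numpy as np

nb_quant, nb_filter = 8, 3
P = np.arange(nb_filter * nb_quant).reshape(nb_filter, nb_quant)  # toy stand-in for P_cum
stick = np.arange(nb_filter) * nb_quant                            # start index of each line in the raveled array
j = np.array([2, 5, 7])                                            # one column index per line
print(P[np.arange(nb_filter), j])                                  # fancy 2D indexing
print(P.ravel()[j + stick])                                        # same values read from the raveled array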
We should be careful with border cases such as $p_c=1$:
p_c = np.ones_like(corr)
print('Index before correction', (p_c*shl.nb_quant ).astype(np.int) + stick)
print('Index after correction', (p_c*shl.nb_quant - (p_c==1)).astype(np.int) + stick)
p_c = rescaling(corr, C=C, do_sym=do_sym)
%%time
def quantile(P_cum, p_c):
    return P_cum.ravel()[(p_c*shl.nb_quant - (p_c==1)).astype(np.int) + np.arange(P_cum.shape[0])*shl.nb_quant]
q_vec = quantile(P_cum, p_c)
%%timeit
quantile(P_cum, p_c)
Going a bit further, using a precomputed "stick" for the different indices yields:
stick = np.arange(P_cum.shape[0])*shl.nb_quant
print("shape of stick is ", stick.shape)
print("shape of vector ", (p_c*P_cum.shape[1]).astype(np.int).shape)
%%time
def quantile(P_cum, p_c, stick):
    return P_cum.ravel()[(p_c*shl.nb_quant - (p_c==1)).astype(np.int) + stick]
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative difference=', (q_vec-q_vanilla).std()/(q_vanilla).std())
print('relative difference=', (q_vec-q_ind).std()/(q_ind).std())
Finally, vectorization allows us to go roughly $10\times 8$ times faster overall.
Note: we check that values are still correct at the borders:
#from shl_scripts.shl_encode import quantile, rescaling
stick = np.arange(shl.n_dictionary)*shl.nb_quant
print('Value for ones = ', quantile(P_cum, rescaling(np.inf*np.ones(shl.n_dictionary), C=C), stick))
print('Value for zeros = ', quantile(P_cum, rescaling(np.zeros(shl.n_dictionary), C=C), stick))
optimizing the quantile function: vectorization with linear interpolation¶
To improve the precision (and thus decrease the number of quantization steps) we can use a linear interpolation:
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int) # (floor) index of each p_c in the respective line of P_cum
    p = p_c*shl.nb_quant - indices # ratio between floor and ceil
    floor = P_cum.ravel()[indices - (p_c==1) + stick] # floor, accounting for extremes, and moved on the raveled P_cum matrix
    ceil = P_cum.ravel()[indices + 1 - (p_c==0) - (p_c==1) - (indices>=P_cum.shape[1]-1) + stick] # ceiling, accounting for both extremes, and moved similarly
    return (1-p) * floor + p * ceil
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative error=', np.sum((q_vec-q_vanilla)**2)/np.sum((q_vanilla)**2))
print(np.prod(P_cum.shape))
Note that when interpolating, we should be careful with border cases such as $p_c=1$ or, more generally, cases where the index is the last one of the line:
p_c = np.ones_like(corr)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
corr_test = np.zeros_like(corr)
p_c = rescaling(corr_test, C=C, do_sym=do_sym)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
p_c = 0.9999*np.ones_like(corr)
indices = (p_c*shl.nb_quant ).astype(np.int)
print('Index before correction', indices + stick)
print('Index after correction', indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)
Another check:
print('Value for ones = ', quantile(P_cum, rescaling(np.inf*np.ones(shl.n_dictionary), C=C), stick))
print('Value for 0.999 = ', quantile(P_cum, rescaling(0.999*np.ones(shl.n_dictionary), C=C), stick))
print('Value for zeros = ', quantile(P_cum, rescaling(np.zeros(shl.n_dictionary), C=C), stick))
Do we get faster by accessing all indices at once?
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int)
    p = p_c*shl.nb_quant - indices
    q = P_cum.ravel()[list(indices - (p_c==1) + stick) + list(indices + 1 - (p_c==0) - (p_c==1) - (indices>=stick+P_cum.shape[1]-1) + stick)]
    return (1-p) * q[:len(indices)] + p * q[len(indices):]
q_vec = quantile(P_cum, p_c, stick)
%%time
def quantile(P_cum, p_c, stick):
    indices = (p_c*shl.nb_quant).astype(np.int)
    p = p_c*shl.nb_quant - indices
    q = P_cum.ravel()[np.hstack(((indices - (p_c==1) + stick), (indices + 1 - (p_c==0) - 2*(p_c==1) + stick)))]
    return (1-p) * q[:len(indices)] + p * q[len(indices):]
q_vec = quantile(P_cum, p_c, stick)
%%timeit
quantile(P_cum, p_c, stick)
print('relative error=', np.sum((q_vec-q_vanilla)**2)/np.sum((q_vanilla)**2))
Not really, let's keep the above solution...
COMP : using modulations¶
Let's use this new quantile function:
from shl_scripts.shl_encode import quantile, rescaling
l0_sparseness = shl.l0_sparseness
def comp(data, dico, P_cum, C=C, do_sym=do_sym, l0_sparseness=l0_sparseness, verbose=0):
    if verbose!=0: t0 = time.time()
    n_samples, n_dictionary = data.shape[0], dico.shape[0]
    sparse_code = np.zeros((n_samples, n_dictionary))
    if not P_cum is None:
        nb_quant = P_cum.shape[1]
        stick = np.arange(n_dictionary)*nb_quant
    corr = (data @ dico.T)      # correlation of each sample with each atom
    Xcorr = (dico @ dico.T)     # Gram matrix of the dictionary
    for i_sample in range(n_samples):
        c = corr[i_sample, :].copy()
        if verbose!=0: ind_list=list()
        for i_l0 in range(int(l0_sparseness)):
            # select the winner on the (possibly modulated) rescaled correlations
            if P_cum is None:
                q_i = rescaling(c, C=C, do_sym=do_sym)
            else:
                q_i = quantile(P_cum, rescaling(c, C=C, do_sym=do_sym), stick, do_fast=True)
            ind = np.argmax(q_i)
            if verbose!=0: ind_list.append(ind)
            # update the winning coefficient and subtract its contribution from the correlations
            c_ind = c[ind] / Xcorr[ind, ind]
            sparse_code[i_sample, ind] += c_ind
            c -= c_ind * Xcorr[ind, :]
        if verbose!=0 and i_sample in range(2):
            print(ind_list, [q_i[i] for i in ind_list], np.median(q_i), q_i.max(), [c[i] for i in ind_list], c.min(), c.max())
    if verbose!=0:
        duration = time.time()-t0
        print('coding duration : {0}s'.format(duration))
    return sparse_code
#sparse_code = comp(data_test, dico_partial_learning.dictionary, code_bins, P_cum, verbose=1)
%%timeit
sparse_code = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=0)
def plot_scatter_MpVsComp(sparse_vector, my_sparse_code):
    fig = plt.figure(figsize=(16, 16))
    ax = fig.add_subplot(111)
    a_min = np.min((sparse_vector.min(), my_sparse_code.min()))
    a_max = np.max((sparse_vector.max(), my_sparse_code.max()))
    ax.plot(np.array([a_min, a_max]), np.array([a_min, a_max]), 'k--', lw=2)
    print(sparse_vector.shape, my_sparse_code.shape)
    ax.scatter(sparse_vector.ravel(), my_sparse_code.ravel(), alpha=0.01)
    ax.set_title('MP')
    ax.set_ylabel('COMP')
    #ax.set_xlim(0)
    #ax.set_ylim(0)
    ax.axis('equal')
    return fig, ax
#fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code)
#fig.show()
testing that COMP with fixed Pcum is equivalent to MP¶
When homeostasis is not enabled, we set the P_cum matrix to None:
print(dico_partial_learning.P_cum)
l0_sparseness = 10 # shl.l0_sparseness
#sparse_code_mp = shl.code(data_test, dico_partial_learning, matname=matname, l0_sparseness=l0_sparseness)
sparse_code_mp = comp(data_test, dico_partial_learning.dictionary, P_cum=None, C=C, do_sym=do_sym, verbose=1, l0_sparseness=l0_sparseness)
n_samples, nb_filter = sparse_code_mp.shape
P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
sparse_code_comp = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=1, l0_sparseness=l0_sparseness)
print('Relative difference = ', np.sum((sparse_code_mp - sparse_code_comp)**2)/np.sum((sparse_code_mp)**2))
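This equivalence is expected: with a flat (linear) modulation function, the quantile mapping is, up to quantization, the identity, so it does not change which atom wins the argmax. A minimal standalone check of this identity property:
import numpy as np

nb_quant = 512
code_bins = np.linspace(0., 1., nb_quant, endpoint=True)
P_flat = np.linspace(0., 1., nb_quant, endpoint=True)   # a flat cumulative function, i.e. the identity on [0, 1]

p_c = np.random.rand(10)                                # toy rescaled coefficients
q = np.interp(p_c, code_bins, P_flat)                   # quantile through the flat modulation

print(np.allclose(q, p_c))                              # True: the modulation leaves the values unchanged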
fig, ax = plot_proba_histogram(sparse_code_mp)
fig.show()
fig, ax = plot_proba_histogram(sparse_code_comp)
fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code_comp)
gradient descent¶
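The loop below learns the modulation functions as a simple moving average of the observed cumulative histograms; written out, the update implemented in the code is $P_{\text{cum}} \leftarrow (1-\eta_h)\, P_{\text{cum}} + \eta_h\, \hat{P}_{\text{cum}}$, where $\eta_h$ is eta_homeo and $\hat{P}_{\text{cum}}$ is the cumulative histogram computed by get_P_cum on the current sparse code.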
P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
print('Shape of modulation function', P_cum.shape)
eta_homeo = .01
for i in range(1000//DEBUG_DOWNSCALE):
    sparse_code = comp(data_test, dico_partial_learning.dictionary, P_cum, C=C, do_sym=do_sym, verbose=0)
    P_cum_ = get_P_cum(sparse_code, C=C, nb_quant=nb_quant)
    P_cum = (1-eta_homeo) * P_cum + eta_homeo * P_cum_
    if i % (100//DEBUG_DOWNSCALE) == 0:
        print('Learning step', i)
        fig, ax = plot_proba_histogram(sparse_code)
        plt.show()
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum, verbose=False);
fig, ax = plot_P_cum(P_cum, verbose=False)
ax.set_ylim(0.92, 1.01);
Compared to the previous post, one observes that the distribution gets progressively more uniform, which was our goal. This implementation makes the Matching Pursuit algorithm run faster, such that we can integrate it into the SHL package. This will be done in the next post.