testing COMPs-Pcum
In this notebook, we will study how homeostasis (cooperation) may be an essential ingredient for this algorithm, which selects filters on a winner-take-all basis (competition). This extension has been published as Perrinet, Neural Computation (2010) (see https://laurentperrinet.github.io/publication/perrinet-10-shl ). In particular, we will show how one can build non-linear functions, based on the activity of each filter, that implement homeostasis.
See also the other posts on unsupervised learning.
This is joint work with Victor Boutin.
Conclusion: using Pcum functions seems to work, but one needs to learn the non-linear functions in a smooth manner. This will be done in a future post.
%load_ext autoreload
%autoreload 2
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
#%config InlineBackend.figure_format='svg'
import numpy as np
np.set_printoptions(formatter = dict( float = lambda x: "%.3g" % x ), precision=3, suppress=True, threshold=np.inf)
import time
starting with a short learning phase without homeostasis¶
DEBUG = True
DEBUG = False
if not DEBUG:
matname = '2017-05-31_Testing_COMPs'
DEBUG_DOWNSCALE = 1
else:
matname = '2017-05-31_Testing_COMPs-DEBUG'
DEBUG_DOWNSCALE = 10
seed = 42
nb_quant = 512
C = 5.
do_sym = False
from shl_scripts.shl_experiments import SHL
shl = SHL(DEBUG_DOWNSCALE=DEBUG_DOWNSCALE, seed=seed,
eta=0.05, verbose=2, record_each=50, n_iter=1000, eta_homeo=0., alpha_homeo=1.,
do_sym=do_sym, nb_quant=nb_quant, C=C)
data = shl.get_data(matname=matname)
test_size = data.shape[0]//2
data_training = data[:test_size, :]
data_test = data[test_size:, :]
#DEBUG: use a smaller test set to speed things up
# (note that this test set is taken from the start of the array and thus overlaps the training set)
test_size = data.shape[0]//20
data_training = data[:(data.shape[0]-test_size), :].copy()
data_test = data[:test_size, :].copy()
We start off with a short learning phase with no homeostasis, such that we end up with an unbalanced dictionary:
dico_partial_learning = shl.learn_dico(data=data_training, matname=matname)
dico_partial_learning.P_cum
fig, ax = shl.show_dico(dico_partial_learning, data=data, title=matname)
fig.show()
fig, ax = shl.time_plot(dico_partial_learning, variable='prob_active');
fig.show()
classical Matching Pursuit¶
#n_samples, n_pixels = data_test.shape
#n_dictionary, n_pixels = dico_partial_learning.dictionary.shape
# normalize each filter of the dictionary to unit L2 norm
norm_each_filter = np.sqrt(np.sum(dico_partial_learning.dictionary**2, axis=1))
print('L2 norm of each filter=', norm_each_filter)
dico_partial_learning.dictionary /= norm_each_filter[:, np.newaxis]
sparse_code_mp = shl.code(data_test, dico_partial_learning, matname=matname)
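As a quick look at the resulting code (a small check added here, not in the original notebook), the number of non-zero coefficients per sample should be of the order of the l0_sparseness parameter of the coder:
# inspect the sparseness of the MP code (uses the l0_sparseness parameter of the shl object)
print('shape of the sparse code =', sparse_code_mp.shape)
print('average number of non-zeros per sample =', np.count_nonzero(sparse_code_mp, axis=1).mean())
print('l0_sparseness =', shl.l0_sparseness)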
def plot_proba_histogram(coding):
    n_dictionary = coding.shape[1]
    # probability of each filter being selected, normalized over the dictionary
    p = np.count_nonzero(coding, axis=0) / coding.shape[0]
    p /= p.sum()
    # relative entropy of the selection probabilities (1 = uniform distribution);
    # never-selected filters are ignored to avoid log(0)
    rel_ent = np.sum(-p[p>0] * np.log(p[p>0])) / np.log(n_dictionary)
    #print('Entropy / Entropy_max=', rel_ent )
    fig = plt.figure(figsize=(16, 4))
    ax = fig.add_subplot(111)
    ax.bar(np.arange(n_dictionary), p*n_dictionary)
    ax.set_title('distribution of the selection probability - entropy= ' + str(rel_ent))
    ax.set_ylabel('pdf')
    ax.axis('tight')
    ax.set_xlim(0, n_dictionary)
    return fig, ax
fig, ax = plot_proba_histogram(sparse_code_mp)
p = np.count_nonzero(sparse_code_mp, axis=0)
print('relative proba of being selected', p/p.mean())
print('most selected filter : {}'.format(np.argmax(p)))
print('least selected filter : {}'.format(np.argmin(p)))
COMP : learning modulations¶
defining a rescaling function for coefficients to avoid overflows:
def rescaling(code, C=5., do_sym=True):
if do_sym:
return 1.-np.exp(-np.abs(code)/C)
else:
return (1.-np.exp(-code/C))*(code>0)
fig=plt.figure(figsize=(13, 8))
ax = plt.subplot(111)
x=np.linspace(0, 30, 100)
ax.plot(x, np.ones_like(x), '--')
ax.plot(x, rescaling(x, C=C, do_sym=do_sym))
ax.set_ylabel('rescaled coefficient')
ax.set_xlabel('raw coefficient')
ax.set_xscale('log');
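To make the saturation explicit, here are a few values (a quick numerical check added here): a coefficient equal to C maps to 1 - 1/e ≈ 0.63, and larger coefficients saturate towards 1.
# a few sample values of the rescaling non-linearity (uses the C and do_sym set above)
a = np.array([0., C, 2*C, 10*C])
print('raw coefficients      =', a)
print('rescaled coefficients =', rescaling(a, C=C, do_sym=do_sym))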
def get_P_cum(code, nb_quant=100):
    # for each filter, estimate the cumulative distribution of its rescaled coefficients
    n_samples, nb_filter = code.shape
    code_bins = np.linspace(0., 1, nb_quant, endpoint=True)
    P_cum = np.zeros((nb_filter, nb_quant))
    for i in range(nb_filter):
        p, bins = np.histogram(rescaling(code[:, i], C=C, do_sym=do_sym), bins=code_bins, density=True)
        p /= p.sum()
        P_cum[i, :] = np.hstack((0, np.cumsum(p)))
    return P_cum
P_cum_mp = get_P_cum(sparse_code_mp, nb_quant=nb_quant)
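As a quick sanity check (added here, not in the original notebook), each row of P_cum_mp is a cumulative histogram of rescaled coefficients, so it should start at 0, end at 1 and be non-decreasing:
# sanity checks on the cumulative modulation functions computed above
print('rows start at 0        :', np.allclose(P_cum_mp[:, 0], 0.))
print('rows end at 1          :', np.allclose(P_cum_mp[:, -1], 1.))
print('rows are non-decreasing:', np.all(np.diff(P_cum_mp, axis=1) >= 0))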
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum_mp, verbose=False, alpha=.15);
ax.set_ylim(0.85, 1.01)
#ax.set_xmargin(0.);
COMP : using modulations¶
Computing a quantile using np.interp:
code_bins = np.linspace(0., 1, nb_quant+1, endpoint=True)
def quantile(code_bins, Pcum, c):
    # map each rescaled coefficient c[i] to its quantile through the cumulative function of filter i
    q_i = np.zeros_like(c)
    for i in range(Pcum.shape[0]):
        q_i[i] = np.interp(c[i], code_bins[:-1], Pcum[i, :])
    return q_i
corr = (data_test @ dico_partial_learning.dictionary.T)[0, :]
P_cum = P_cum_mp.copy()
print('correlation=', corr)
print('rescaled correlation=', rescaling(corr, C=C, do_sym=do_sym))
print('quantile=', quantile(code_bins, P_cum, rescaling(corr, C=C, do_sym=do_sym)))
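As a small consistency check (added here, anticipating the equivalence test below), a flat cumulative function leaves the rescaled coefficients approximately unchanged, so the winner-take-all choice is preserved:
# with a flat (uniform) modulation function, the quantile transform is approximately the identity
# on the rescaled coefficients, so the argmax (the selected filter) should not change
n_dictionary = dico_partial_learning.dictionary.shape[0]
P_cum_flat = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((n_dictionary, 1))
z = rescaling(corr, C=C, do_sym=do_sym)
q_flat = quantile(code_bins, P_cum_flat, z)
print('max |quantile - rescaled| =', np.max(np.abs(q_flat - z)))
print('argmax on quantiles =', np.argmax(q_flat), '- argmax on rescaled =', np.argmax(z))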
We can thus redefine Matching Pursuit to include this non-linear function:
l0_sparseness = shl.l0_sparseness
def comp(data, dico, code_bins, P_cum, l0_sparseness=l0_sparseness, verbose=0):
    if verbose!=0:
        t0 = time.time()
    n_samples, n_dictionary = data.shape[0], dico.shape[0]
    sparse_code = np.zeros((n_samples, n_dictionary))
    corr = (data @ dico.T)
    Xcorr = (dico @ dico.T)
    for i_sample in range(n_samples):
        c = corr[i_sample, :].copy()
        if verbose!=0: ind_list=list()
        for i_l0 in range(int(l0_sparseness)):
            # COMP: select the winner on the quantiles of the rescaled correlations
            # (when P_cum is None, fall back to the rescaled correlations themselves)
            if P_cum is None:
                q_i = rescaling(c, C=C, do_sym=do_sym)
            else:
                q_i = quantile(code_bins, P_cum, rescaling(c, C=C, do_sym=do_sym))
            ind = np.argmax(q_i)
            if verbose!=0:
                ind_list.append(ind)
            # update the coefficient and the residual correlations as in classical Matching Pursuit
            c_ind = c[ind] / Xcorr[ind, ind]
            sparse_code[i_sample, ind] += c_ind
            c -= c_ind * Xcorr[ind, :]
        if verbose!=0:
            if i_sample in range(2):
                q_i = quantile(code_bins, P_cum, rescaling(c, C=C, do_sym=do_sym))
                print(ind_list, [q_i[i] for i in ind_list], np.median(q_i), q_i.max(), [c[i] for i in ind_list], c.min(), c.max())
    if verbose!=0:
        duration = time.time()-t0
        print('coding duration : {0}s'.format(duration))
    return sparse_code
sparse_code = comp(data_test, dico_partial_learning.dictionary, code_bins, P_cum, verbose=1)
sparse_code.shape
np.mean(np.count_nonzero(sparse_code, axis=0)/sparse_code.shape[1])
np.count_nonzero(sparse_code, axis=0)
def plot_scatter_MpVsComp(sparse_vector, my_sparse_code):
fig = plt.figure(figsize=(16, 16))
ax = fig.add_subplot(111)
a_min = np.min((sparse_vector.min(), my_sparse_code.min()))
a_max = np.max((sparse_vector.max(), my_sparse_code.max()))
ax.plot(np.array([a_min, a_max]), np.array([a_min, a_max]), 'k--', lw=2)
ax.scatter(sparse_vector.ravel(), my_sparse_code.ravel(), alpha=0.01)
ax.set_xlabel('MP')
ax.set_ylabel('COMP')
#ax.set_xlim(0)
#ax.set_ylim(0)
ax.axis('equal')
return fig, ax
#fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code)
#fig.show()
testing that COMP with flat Pcum is equivalent to MP¶
n_samples, nb_filter = sparse_code.shape
P_cum = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
sparse_code_comp = comp(data_test, dico_partial_learning.dictionary, code_bins, P_cum, verbose=1)
print('Relative difference = ', np.sum((sparse_code_mp - sparse_code_comp)**2)/np.sum((sparse_code_mp)**2))
fig, ax = plot_proba_histogram(sparse_code_mp)
fig.show()
fig, ax = plot_proba_histogram(sparse_code_comp)
fig, ax = plot_scatter_MpVsComp(sparse_code_mp, sparse_code_comp)
gradient descent¶
P_cum_new = np.linspace(0, 1, nb_quant, endpoint=True)[np.newaxis, :] * np.ones((nb_filter, 1))
print('Shape of modulation function', P_cum_new.shape)
eta_homeo = .005
for i in range(1000//DEBUG_DOWNSCALE):
    sparse_code_comp = comp(data_test, dico_partial_learning.dictionary, code_bins, P_cum_new, verbose=0)
    P_cum_ = get_P_cum(sparse_code_comp, nb_quant=nb_quant)
    # smooth (moving-average) update of the modulation functions at rate eta_homeo
    P_cum_new = (1-eta_homeo) * P_cum_new + eta_homeo * P_cum_
    if i % (100//DEBUG_DOWNSCALE) == 0:
        print('Learning step', i)
        fig, ax = plot_proba_histogram(sparse_code_comp)
        plt.show()
One observes that the distribution gets progressively more uniform, which was our goal. However, this implementation makes the Matching Pursuit algorithm very slow, so we will need to speed things up. This will be done in the next post.
Note the shape of the initial non-linear functions:
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum_mp, verbose=False);
ax.set_ylim(0.85, 1.01);
Compare with their shape at convergence:
from shl_scripts.shl_tools import plot_P_cum
fig, ax = plot_P_cum(P_cum_new, verbose=False);
ax.set_ylim(0.85, 1.01);
Indeed, as the unsupervised learning is forced to be homeostatic, the learned filters ultimately converge to features of equal "salience", such that the non-linear functions become approximately equal at convergence: in an egalitarian network at convergence, there is no need for these rules anymore (unless you want to learn again).
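One way to quantify this on the present run (a rough check using the P_cum_new learned above) is to measure the spread of the modulation functions across filters:
# spread of the learned modulation functions across filters:
# the smaller the standard deviation, the more similar the non-linearities
print('mean std across filters =', P_cum_new.std(axis=0).mean())
print('max std across filters  =', P_cum_new.std(axis=0).max())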
Version used¶
%load_ext version_information
%version_information numpy, shl_scripts