Time series clustering
import pprint
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
import scipy.spatial.distance as ssd
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import v_measure_score
from reservoir_computing.modules import RC_model
from reservoir_computing.datasets import ClfLoader
# Seed NumPy's global random number generator so that repeated runs are reproducible.
np.random.seed(0)
Configure the RC model
# Hyperparameters of the RC model, grouped by the stage they control.
config = {
    # --- Reservoir ---
    'n_internal_units': 450,      # size of the reservoir
    'spectral_radius': 0.9,       # largest eigenvalue of the reservoir
    'leak': None,                 # leakage in the state update (None or 1.0 --> no leakage)
    'connectivity': 0.25,         # percentage of nonzero connections in the reservoir
    'input_scaling': 0.1,         # scaling of the input weights
    'noise_level': 0.0,           # noise in the reservoir state update
    'n_drop': 5,                  # transient states to be dropped
    'bidir': True,                # if True, use bidirectional reservoir
    'circle': False,              # use reservoir with circle topology

    # --- Dimensionality reduction ---
    'dimred_method': 'tenpca',    # one of {None (no reduction), 'pca', 'tenpca'}
    'n_dim': 75,                  # number of dimensions kept after the reduction

    # --- MTS representation ---
    'mts_rep': 'reservoir',       # one of {'last', 'mean', 'output', 'reservoir'}
    'w_ridge_embedding': 5.0,     # regularization parameter of the ridge regression

    # --- Readout ---
    'readout_type': None,         # None: the input representations will be stored
}

# Show the full configuration (pprint lists the keys in sorted order).
pprint.pprint(config)
{'bidir': True,
'circle': False,
'connectivity': 0.25,
'dimred_method': 'tenpca',
'input_scaling': 0.1,
'leak': None,
'mts_rep': 'reservoir',
'n_dim': 75,
'n_drop': 5,
'n_internal_units': 450,
'noise_level': 0.0,
'readout_type': None,
'spectral_radius': 0.9,
'w_ridge_embedding': 5.0}
Prepare the data
# Load the 'Japanese_Vowels' dataset through ClfLoader; the call returns four
# arrays that are unpacked as training inputs/labels and test inputs/labels.
Xtr, Ytr, Xte, Yte = ClfLoader().get_data('Japanese_Vowels')
Loaded Japanese_Vowels dataset.
Number of classes: 9
Data shapes:
Xtr: (270, 29, 12)
Ytr: (270, 1)
Xte: (370, 29, 12)
Yte: (370, 1)
# Clustering does not use a train/test split, so both portions are pooled
# into a single set by joining them along the first axis.
X, Y = (
    np.concatenate(parts, axis=0)
    for parts in ((Xtr, Xte), (Ytr, Yte))
)
Initialize and fit the RC model
# Build the RC model, passing every entry of `config` as a keyword argument.
rcm = RC_model(**config)
# Generate representations of the input MTS.
# Only X is passed to fit(); the labels are not used at this stage.
rcm.fit(X)
# NOTE(review): `input_repr` is presumably populated by fit() because
# config['readout_type'] is None — confirm against the RC_model documentation.
mts_representations = rcm.input_repr
Training completed in 0.02 min
Compute the clustering partition
# Compute the dissimilarity matrix: pairwise cosine distances between the
# MTS representations (square matrix, one row/column per representation).
Dist = cosine_distances(mts_representations)
# linkage() takes the condensed (1-D) form of the distance matrix, so convert
# the square matrix once here.
distArray = ssd.squareform(Dist)
# Hierarchical clustering
# NOTE(review): SciPy documents the 'ward' method as correctly defined only for
# Euclidean distances; the cosine distances are kept as-is so that the results
# of this example do not change.
Z = linkage(distArray, 'ward')
# Cut the dendrogram into flat clusters using the distance threshold t.
clust = fcluster(Z, t=4.0, criterion="distance")
print(f"Found {len(np.unique(clust))} clusters")
Found 9 clusters
# Score how well the cluster assignment matches the class labels.
# The first column of Y is taken as the 1-D vector of true labels.
nmi = v_measure_score(labels_true=Y[:, 0], labels_pred=clust)
print(f"Normalized Mutual Information (v-score): {nmi:.3f}")
Normalized Mutual Information (v-score): 0.899