Time series clustering

import pprint
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster
import scipy.spatial.distance as ssd
from sklearn.metrics.pairwise import cosine_distances
from sklearn.metrics import v_measure_score

from reservoir_computing.modules import RC_model
from reservoir_computing.datasets import ClfLoader

# Seed NumPy's global random number generator so that repeated runs of this
# script draw the same sequence of random numbers (reproducibility).
# NOTE(review): presumably the RC model draws its random weights from this
# global generator -- confirm against the library.
np.random.seed(0)

Configure the RC model

# Hyperparameters handed to the RC model, gathered in a single dict literal.
config = {
    # --- Reservoir ---
    'n_internal_units': 450,      # size of the reservoir
    'spectral_radius': 0.9,       # largest eigenvalue of the reservoir
    'leak': None,                 # leakage in the state update (None or 1.0 --> no leakage)
    'connectivity': 0.25,         # fraction of nonzero connections in the reservoir
    'input_scaling': 0.1,         # scaling of the input weights
    'noise_level': 0.0,           # noise in the reservoir state update
    'n_drop': 5,                  # transient states to be dropped
    'bidir': True,                # if True, use bidirectional reservoir
    'circle': False,              # use reservoir with circle topology

    # --- Dimensionality reduction ---
    'dimred_method': 'tenpca',    # one of {None (no reduction), 'pca', 'tenpca'}
    'n_dim': 75,                  # number of dimensions kept after the reduction

    # --- MTS representation ---
    'mts_rep': 'reservoir',       # one of {'last', 'mean', 'output', 'reservoir'}
    'w_ridge_embedding': 5.0,     # regularization parameter of the ridge regression

    # --- Readout ---
    'readout_type': None,         # None: the input representations will be stored
}

# Show the full configuration (pprint lists the keys in sorted order)
pprint.pprint(config)

{'bidir': True,
 'circle': False,
 'connectivity': 0.25,
 'dimred_method': 'tenpca',
 'input_scaling': 0.1,
 'leak': None,
 'mts_rep': 'reservoir',
 'n_dim': 75,
 'n_drop': 5,
 'n_internal_units': 450,
 'noise_level': 0.0,
 'readout_type': None,
 'spectral_radius': 0.9,
 'w_ridge_embedding': 5.0}

Prepare the data

# Load the Japanese_Vowels dataset, which comes already split into
# training (Xtr, Ytr) and test (Xte, Yte) parts; the loader prints the
# number of classes and the array shapes.
Xtr, Ytr, Xte, Yte = ClfLoader().get_data('Japanese_Vowels')

Loaded Japanese_Vowels dataset.
Number of classes: 9
Data shapes:
  Xtr: (270, 29, 12)
  Ytr: (270, 1)
  Xte: (370, 29, 12)
  Yte: (370, 1)

# Clustering is unsupervised, so the train/test split is not needed:
# stack both parts along the first (sample) axis into a single dataset.
X = np.concatenate([Xtr, Xte])
Y = np.concatenate([Ytr, Yte])

Initialize and fit the RC model

# Build the RC model from the hyperparameters collected in `config`
rcm =  RC_model(**config)

# Generate representations of the input MTS.
# fit() is called without labels: readout_type is None in the config, so the
# model stores the input representations, which are then read from `input_repr`.
rcm.fit(X)
mts_representations = rcm.input_repr

Training completed in 0.02 min

Compute the clustering partition

# Compute the dissimilarity matrix: pairwise cosine distances between the
# MTS representations (square matrix, one row/column per sample)
Dist = cosine_distances(mts_representations)

# linkage() takes the distances in condensed (1-D) form, so convert the
# square matrix once here.
distArray = ssd.squareform(Dist)

# Hierarchical clustering
# NOTE(review): SciPy documents the 'ward' method as correctly defined only
# for Euclidean distances; it is applied to cosine distances here -- confirm
# that this is intended.
Z = linkage(distArray, 'ward')

# Cut the hierarchy at distance threshold t=4.0 to obtain flat cluster labels
clust = fcluster(Z, t=4.0, criterion="distance")
print(f"Found {len(np.unique(clust))} clusters")

Found 9 clusters

# Evaluate the agreement between class and cluster labels.
# Y[:,0] selects the first column of Y as the vector of class labels
# (the loader output reports the label arrays with a single column).
nmi = v_measure_score(Y[:,0], clust)
print(f"Normalized Mutual Information (v-score): {nmi:.3f}")

Normalized Mutual Information (v-score): 0.899