To receive notifications about scheduled maintenance, please subscribe to the mailing-list gitlab-operations@sympa.ethz.ch. You can subscribe to the mailing-list at https://sympa.ethz.ch

Commit b6d6ebef authored by Ami
Browse files

slet-ce-2

parent 4220cf46
import sklearn as skl
from sklearn.utils.validation import check_is_fitted
import pandas as pd
import numpy as np
from treelib import Tree
import matplotlib.pyplot as plt
def read_data_csv(sheet, y_names=None):
    """Parse a column data store into X, y arrays.

    Args:
        sheet (str): Path to csv data sheet (or any file-like object
            accepted by ``pd.read_csv``).
        y_names (list of str): List of column names used as labels.
            If None, every column is treated as a feature and ``y`` is
            an empty dict.

    Returns:
        X (np.ndarray): Array with feature values from columns that are not
            contained in y_names (n_samples, n_features)
        y (dict of np.ndarray): Dictionary with keys y_names, each key
            contains an array (n_samples, 1) with the label data from the
            corresponding column in sheet.
    """
    # Guard the documented default: with y_names=None the original code
    # raised TypeError on the `c not in y_names` membership test.
    if y_names is None:
        y_names = []
    data = pd.read_csv(sheet)
    feature_columns = [c for c in data.columns if c not in y_names]
    X = data[feature_columns].values
    # Index with a one-element list so each label keeps shape (n_samples, 1).
    y = {y_name: data[[y_name]].values for y_name in y_names}
    return X, y
class DeterministicAnnealingClustering(skl.base.BaseEstimator,
                                       skl.base.TransformerMixin):
    """Template class for DAC

    Skeleton for deterministic annealing clustering (DAC): cluster centers
    are refined while a temperature parameter is lowered, and clusters split
    at critical temperatures. `fit`, `get_distance` and
    `_calculate_cluster_probs` are TODO stubs to be implemented by the
    student; the two plot methods are pseudo-code built on state that the
    implemented `fit` is expected to populate.

    Attributes:
        cluster_centers (np.ndarray): Cluster centroids y_i
            (n_clusters, n_features)
        cluster_probs (np.ndarray): Assignment probability vectors
            p(y_i | x) for each sample (n_samples, n_clusters)
        bifurcation_tree (treelib.Tree): Tree object that contains information
            about cluster evolution during annealing.

    Parameters:
        n_clusters (int): Maximum number of clusters returned by DAC.
        random_state (int): Random seed.
    """

    def __init__(self, n_clusters=8, random_state=42, metric="euclidian"):
        # NOTE(review): "euclidian" (sic) is the exact string compared
        # against in fit()/get_distance(); callers must use this spelling.
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.metric = metric
        # Annealing temperature state: current temperature and the final
        # (minimum) temperature used at predict time.
        self.T = None
        self.T_min = None
        # Fitted model state — populated by fit().
        self.cluster_centers = None
        self.cluster_probs = None
        # Per-annealing-step history consumed by the plotting helpers.
        self.n_eff_clusters = list()
        self.temperatures = list()
        self.distortions = list()
        self.bifurcation_tree = Tree()

        # Not necessary, depends on your implementation
        self.bifurcation_tree_cut_idx = None

        # Add more parameters, if necessary. You can also modify any other
        # attributes defined above

    def fit(self, samples):
        """Compute DAC for input vectors X

        Preferred implementation of DAC as described in reference [1].

        Args:
            samples (np.ndarray): Input array with shape (samples, n_features)
        """
        # TODO: implement the annealing loop; branch on the distance metric.
        if self.metric == "euclidian":
            pass
        elif self.metric == "ratioscale":
            pass

    def _calculate_cluster_probs(self, dist_mat, temperature):
        """Predict assignment probability vectors for each sample in X given
        the pairwise distances

        Args:
            dist_mat (np.ndarray): Distances (n_samples, n_centroids)
            temperature (float): Temperature at which probabilities are
                calculated

        Returns:
            probs (np.ndarray): Assignment probability vectors
                (new_samples, n_clusters)
        """
        # TODO: stub — replace the placeholder return with the Gibbs
        # assignment probabilities at the given temperature.
        return [None]

    def get_distance(self, samples, clusters):
        """Calculate the distance matrix between samples and codevectors
        based on the given metric

        Args:
            samples (np.ndarray): Samples array (n_samples, n_features)
            clusters (np.ndarray): Codebook (n_centroids, n_features)

        Returns:
            D (np.ndarray): Distances (n_samples, n_centroids)
        """
        # TODO: stub — compute the (n_samples, n_centroids) distance matrix
        # for the configured metric.
        if self.metric == "euclidian":
            pass
        elif self.metric == "ratioscale":
            pass
        return []

    def predict(self, samples):
        """Predict assignment probability vectors for each sample in X.

        Args:
            samples (np.ndarray): Input array with shape (new_samples, n_features)

        Returns:
            probs (np.ndarray): Assignment probability vectors
                (new_samples, n_clusters)
        """
        # Assignments at the final temperature T_min: soft probabilities,
        # not hard labels (callers take argmax themselves).
        distance_mat = self.get_distance(samples, self.cluster_centers)
        probs = self._calculate_cluster_probs(distance_mat, self.T_min)
        return probs

    def transform(self, samples):
        """Transform X to a cluster-distance space.

        In the new space, each dimension is the distance to the cluster centers

        Args:
            samples (np.ndarray): Input array with shape
                (new_samples, n_features)

        Returns:
            Y (np.ndarray): Cluster-distance vectors (new_samples, n_clusters)
        """
        # Raises sklearn's NotFittedError if fit() has not populated
        # cluster_centers yet.
        check_is_fitted(self, ["cluster_centers"])
        # Your code goes here
        distance_mat = self.get_distance(samples, self.cluster_centers)
        return distance_mat

    def plot_bifurcation(self):
        """Show the evolution of cluster splitting

        This is a pseudo-code showing how you may be using the tree
        information to make a bifurcation plot. Your implementation may be
        entirely different or based on this code.
        """
        check_is_fitted(self, ["bifurcation_tree"])

        # One distance trace per effective cluster count observed during
        # annealing. NOTE(review): pseudo-code — assumes each tree node's
        # .data carries 'cluster_id' and 'distance' keys; confirm against
        # how fit() builds the tree.
        clusters = [[] for _ in range(len(np.unique(self.n_eff_clusters)))]
        for node in self.bifurcation_tree.all_nodes_itr():
            c_id = node.data['cluster_id']
            my_dist = node.data['distance']

            # A new cluster inherits (copies) its parent's trace up to the
            # split point, then continues with its own distances.
            if c_id > 0 and len(clusters[c_id]) == 0:
                clusters[c_id] = list(np.copy(clusters[c_id-1]))
            clusters[c_id].append(my_dist)

        # Cut the last iterations, usually it takes too long
        cut_idx = self.bifurcation_tree_cut_idx + 20

        # Inverse temperature on the y-axis, distance to parent on the x-axis.
        beta = [1 / t for t in self.temperatures]
        plt.figure(figsize=(10, 5))
        for c_id, s in enumerate(clusters):
            plt.plot(s[:cut_idx], beta[:cut_idx], '-k',
                     alpha=1, c='C%d' % int(c_id),
                     label='Cluster %d' % int(c_id))
        plt.legend()
        plt.xlabel("distance to parent")
        plt.ylabel(r'$1 / T$')
        plt.title('Bifurcation Plot')
        plt.show()

    def plot_phase_diagram(self):
        """Plot the phase diagram

        This is an example of how to make phase diagram plot. The exact
        implementation may vary entirely based on your self.fit()
        implementation. Feel free to make any modifications.
        """
        # Log-log style axes: x = log(T_max) - log(T), y = log(D) - log(D_min),
        # as in Figure 2 of reference [1].
        t_max = np.log(max(self.temperatures))
        d_min = np.log(min(self.distortions))
        y_axis = [np.log(i) - d_min for i in self.distortions]
        x_axis = [t_max - np.log(i) for i in self.temperatures]

        plt.figure(figsize=(12, 9))
        plt.plot(x_axis, y_axis)

        # For each effective-cluster count, record the x-range over which it
        # was active. NOTE(review): only the last contiguous run per count
        # survives if a count recurs — acceptable for monotone annealing.
        region = {}
        for i, c in list(enumerate(self.n_eff_clusters)):
            if c not in region:
                region[c] = {}
                region[c]['min'] = x_axis[i]
            region[c]['max'] = x_axis[i]
        for c in region:
            if c == 0:
                continue
            plt.text((region[c]['min'] + region[c]['max']) / 2, 0.2,
                     f'K={c}', rotation=90)
            plt.axvspan(region[c]['min'], region[c]['max'], color='C' + str(c),
                        alpha=0.2)
        plt.title('Phases diagram (log)')
        plt.xlabel('Temperature')
        plt.ylabel('Distortion')
        plt.show()
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SLT-CE-2: Deterministic Annealing"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### References"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<ol>\n",
"<li> Sections II.A.1 (principled derivation of deterministic annealing) and II.A.3 (Mass-constrained clustering) of 'Deterministic annealing for clustering, compression, classification, regression, and related optimization problems', Kenneth Rose, 1998, http://ieeexplore.ieee.org/document/726788/ \n",
"</li>\n",
"\n",
"<li>\n",
"The wine data set, http://www3.dsi.uminho.pt/pcortez/wine5.pdf\n",
"</li>\n",
" \n",
"</ol>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Setup "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sklearn as skl\n",
"from sklearn.utils.validation import check_is_fitted\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.datasets import make_blobs\n",
"import sklearn.svm as svm\n",
"from sklearn import cluster\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"from treelib import Tree\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib import cm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2 style=\"background-color:#f0b375;\">\n",
"Section 4.0\n",
"<span style=font-size:50%> Complete all problems in this and previous sections to get a grade of 4.0 </span>\n",
"</h2>\n",
"\n",
"<p style=\"background-color:#adebad;\">\n",
" For this exercise, it is of utmost importance to read reference [1] about deterministic annealing clustering (DAC). Our implementation will be based on this reference. Please shortly summarize what they refer to as the <i>preferred implementation</i> of the DAC algorithm.\n",
"</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Put your markdown text here"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"background-color:#adebad;\">\n",
"    In order to avoid headaches with numerical instabilities, we first try our algorithm on simple artificially generated data as below. Run the block below to have a look at the data. Later, when we have everything implemented, we will examine some real-world data. \n",
"</p>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_clusters = 4\n",
"ran_s = 42\n",
"\n",
"# Generate artificial dataset\n",
"X, y_true = make_blobs(n_samples=7000, centers=4,\n",
" cluster_std=0.3, random_state=ran_s,\n",
" center_box=(-8.0, 8.0),\n",
" shuffle=False)\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y_true, train_size=6000, random_state=42)\n",
"\n",
"plt.figure()\n",
"plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=40, cmap='viridis')\n",
"plt.title(\"Training data\")\n",
"\n",
"plt.figure()\n",
"plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=40, cmap='viridis')\n",
"plt.title(\"Test data\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"background-color:#adebad;\">\n",
" Implement the <b>fit method</b> for the template class DeterministicAnnealing, according to the contract outlined in its docstring. (The template class DeterministicAnnealing is in file <b>DA.py</b> which you can open in your favourite IDE) For the implementation, it may help to take a look at both <b>get_distance method</b> and <b>fit _calculate_cluster_probs method</b> and implement them as well. Of course you are free to change all these methods or/and write additional methods for your purpose.\n",
" You can add more class methods as necessary.\n",
" See http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html for complementary information.\n",
"</p>\n",
"<p style=\"background-color:#adebad;\">\n",
"    While implementing, you can run the block below to test your implementation.\n",
"</p>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from DA import DeterministicAnnealingClustering\n",
"\n",
"DAC = DeterministicAnnealingClustering(\n",
" n_clusters=n_clusters, random_state=ran_s)\n",
"DAC.fit(X_train)\n",
"y_DAC = DAC.predict(X_test)\n",
"y_DAC_hard = np.argmax(y_DAC, axis=1)\n",
"plt.figure()\n",
"plt.scatter(X_test[:, 0], X_test[:, 1], c=y_DAC_hard, s=40, cmap='viridis')\n",
"plt.title(\"DA clustering\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2 style=\"background-color:#f0b375;\">\n",
"Section 4.5\n",
"<span style=font-size:50%> Complete all problems in this section to get an additional (+0.5) point to the previous points. Note that you can have a maximum of 6 points at the end.</span>\n",
"</h2>\n",
"\n",
"<p style=\"background-color:#adebad;\">\n",
" In this section we implement a plot which will help us better understand the DA method, and could also be a help for better debugging of your implementation.\n",
" \n",
" <ul style=\"background-color:#adebad;\">\n",
" <li> \n",
" Modify your implementation of <b>fit</b> function such that <b>plot_phase_diagram</b> method will produce a plot similar to the phase diagram plot shown in Figure 2 of the reference paper.\n",
" </li>\n",
"</ul> \n",
"</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<ul style=\"background-color:#adebad;\">\n",
" Produce a phase diagram plot of the expected distortion D, as shown in figure 2 of reference [1]. For this, extend DAC.fit to save the expected distortion during annealing as an additional attribute self.distortion.\n",
" You might also want to save the number of effective clusters and the temperature along the way.\n",
" </ul>\n",
"</p>\n",
"\n",
"#### extend DAC.fit(self, X):\n",
" # ...\n",
" # Save information for each (n-th) annealing step:\n",
" # self.distortion = [d0, d1, d2, ...]\n",
" # self.n_eff_clusters = [e0, e1, e2, ...]\n",
" # self.temp = [t0, t1, t2, ...]\n",
" # ..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DAC.plot_phase_diagram()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2 style=\"background-color:#f0b375;\">\n",
"Section 5.0\n",
"<span style=font-size:50%> Complete all problems in this section to get an additional (+0.5) point to the previous points.</span>\n",
"</h2>\n",
"<ul style=\"background-color:#adebad;\">\n",
"Here we implement another plot which helps to better understand the dynamics of the algorithm.\n",
" <li>\n",
" Implement DAC.plot_bifurcation, which should create a bifurcation plot.<br>\n",
" Modify DAC.fit to keep track of the distances, using the tree object DAC.bifurcation_tree. When a cluster splits, it creates two child nodes. Each node should store its centroid vector, and the distance to the parent centroid vector. After splitting, the parent node is not updated anymore.<br>\n",
" In the bifurcation plot, the horizontal distance of a child node to its parent node should be exactly the distance to the parent centroid vector. The two child nodes should move in opposite directions, i.e. one to the left of the parent and one to the right.\n",
" </li>\n",
"</ul>\n",
"\n",
"This section could be a bit annoying; you can also jump to the next sections and come back here later. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"DAC.plot_bifurcation()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2 style=\"background-color:#f0b375;\">\n",
"Section 5.5\n",
"<span style=font-size:50%> Complete all problems in this section to get an additional (+0.5) point to the previous points.</span>\n",
"</h2>\n",
"\n",
"<p style=\"background-color:#adebad;\">\n",
"Now we are ready to use some real-world data. This might need some tweaking and handling of numerical instabilities. Please make sure you understand the data.\n",
"</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"background-color:#adebad;\">\n",
"Read the wine data [3], which contains 11 physiochemical attributes, and two labels (quality and color).\n",
"</p>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p style=\"background-color:#adebad;\">\n",
" Create an instance of your DAC class with n_clusters = 2 and <b>fit the first 6000 samples</b> of the wine data set. Record the execution time. Furthermore, create an instance of the sklearn k-means class, and fit it with the same parameters. Again record the execution time. Make sure that the hyper parameters (initial temperature, min temperature, convergence criteria, noise, etc.) make sense and lead to a reasonable clustering\n",
"</p>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from DA import read_data_csv\n",
"X, y = read_data_csv(\"wine-data.csv\", y_names=[\"quality\", \"color\"])\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y[\"color\"], train_size=6000, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"DAC = DeterministicAnnealingClustering(n_clusters=2, random_state=42)\n",
"DAC.fit(X_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"kmeans = cluster.KMeans(n_clusters=2,random_state=42)\n",
"kmeans.fit(X_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%time\n",
"y_kmeans = kmeans.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"%%time\n",
"y_DAC = DAC.predict(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2 style=\"background-color:#f0b375;\">\n",
"Section 6.0\n",
"<span style=font-size:50%> Complete all problems in this section to get an additional (+0.5) point to the previous points.</span>\n",
"</h2>\n",
"<ul style=\"background-color:#adebad;\">\n",
"<li> Before we can compute the confusion matrix, we need to perform some post-processing on the DAC cluster assignments.\n",
" Explain what the function postprocess (defined below) does, and why we need it. To do so, complete the docstring of the function postprocess.\n",
" </li>\n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def postprocess(y_DAC, y_kmeans):\n",
" \"\"\"TODO: Add explanation\"\"\"\n",
" \n",
" y_DAC_hard = np.argmax(y_DAC, axis=1)\n",
" \n",
" n_clusters = len(np.unique(y_DAC_hard))\n",
" dac2kmeans = []\n",
" for cluster in range(n_clusters):\n",
" argmax = np.argmax(y_DAC[:, cluster])\n",
" dac2kmeans.append(y_kmeans[argmax])\n",
" \n",
" y_DAC_new = []\n",
" for dac_label in y_DAC_hard:\n",
" y_DAC_new.append(dac2kmeans[dac_label])\n",
" \n",
" return np.array(y_DAC_new)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"skl.metrics.confusion_matrix(y_kmeans, postprocess(y_DAC, y_kmeans))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"</h2>\n",
"\n",
"<ul style=\"background-color:#adebad;\">\n",
" <li> Read the docstring of <b>transform method</b> and understand what it does.\n",
" </li>\n",
" <li>\n",
" Use DAC.transform and kmeans.transform to transform both, X_train and X_test. \n",
" </li>\n",
" \n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"X_train_DAC = DAC.transform(X_train)\n",
"X_test_DAC = DAC.transform(X_test)\n",
"\n",
"X_train_kmeans = kmeans.transform(X_train)\n",
"X_test_kmeans = kmeans.transform(X_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<ul style=\"background-color:#adebad;\">\n",
" <li>\n",
" Fit an SVM classifier with default parameters to the untransformed data, and to the transformed data.\n",
" Compare the performance of predicting whether the color of a wine is red or white.\n",
" </li>\n",
" </ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"svm = svm.SVC(random_state=42)\n",
"svm.fit(X_train, y_train)\n",
"svm.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"svm_DAC = svm.SVC(random_state=42)\n",
"svm_DAC.fit(X_train_DAC, y_train)\n",
"svm_DAC.score(X_test_DAC, y_test)"