# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs, load_wine, load_iris # For synthetic and real datasets
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, homogeneity_score, completeness_score, adjusted_rand_score
from sklearn.decomposition import PCA # For visualization of high-dimensional data
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
Part 1: Understanding K-Means on a Simple Synthetic Dataset
We'll start with a simple, synthetically generated 2D dataset to clearly visualize how K-means works.
Tasks:
- Generate a dataset with a known number of distinct clusters using `make_blobs`.
- Apply the `KMeans` algorithm with the correct number of clusters.
- Visualize the data points, assigned clusters, and final centroids.
- Calculate and display the `inertia_` (Sum of Squared Distances to the closest cluster center).
1. Generate a synthetic dataset with clear clusters
n_samples = 300
n_features = 2
n_clusters_true = 3 # We know the true number of clusters for this synthetic data
random_state = 42
X_synth, y_true_synth = make_blobs(n_samples=n_samples, n_features=n_features,
                                   centers=n_clusters_true, cluster_std=0.8,
                                   random_state=random_state)
print(f"Synthetic dataset shape: {X_synth.shape}")
Synthetic dataset shape: (300, 2)
2. Apply KMeans algorithm
# n_init='auto' is the default in newer versions, specifying how many times to run with different centroid seeds.
kmeans_synth = KMeans(n_clusters=n_clusters_true, random_state=random_state, n_init='auto')
kmeans_synth.fit(X_synth)
# Get cluster labels and centroids
labels_synth = kmeans_synth.labels_
centroids_synth = kmeans_synth.cluster_centers_
inertia_synth = kmeans_synth.inertia_
print(f"\nK-means (K={n_clusters_true}) completed.")
print(f"Inertia (Sum of Squared Distances): {inertia_synth:.2f}")
print(f"Final Centroids:\n{centroids_synth}")
# 3. Visualize the clustered data and centroids
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_synth[:, 0], y=X_synth[:, 1], hue=labels_synth, palette='viridis', legend='full', s=80, alpha=0.7)
plt.scatter(centroids_synth[:, 0], centroids_synth[:, 1], marker='X', s=200, color='red', label='Centroids', edgecolor='black')
plt.title(f'K-means Clustering with K={n_clusters_true} on Synthetic Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
K-means (K=3) completed.
Inertia (Sum of Squared Distances): 362.79
Final Centroids:
[[-2.60842567  9.03771305]
 [-6.88302287 -6.96320924]
 [ 4.72565847  2.00310936]]
Discussion Point:
- What does the `inertia_` value represent, and why might a lower inertia generally be desirable (with caveats)?
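As a quick check on what `inertia_` represents, here is a minimal sketch that recomputes it by hand from the fitted labels and centroids. It reuses `X_synth`, `labels_synth`, `centroids_synth`, and `inertia_synth` from the cells above; `manual_inertia` and `cluster_id` are just illustrative names.
# Recompute inertia by hand: the sum of squared Euclidean distances from each
# point to the centroid of the cluster it was assigned to.
manual_inertia = 0.0
for cluster_id in range(n_clusters_true):
    cluster_points = X_synth[labels_synth == cluster_id]
    manual_inertia += np.sum((cluster_points - centroids_synth[cluster_id]) ** 2)

print(f"Manual inertia:  {manual_inertia:.2f}")
print(f"KMeans inertia_: {inertia_synth:.2f}")  # should agree up to floating-point error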
Part 2: Determining Optimal K - Elbow Method & Silhouette Score
A crucial step in K-means is determining the "optimal" number of clusters, K, when it's not known beforehand. We'll explore two popular methods: the Elbow Method and the Silhouette Score.
Tasks:
- Apply K-means for a range of `k` values (e.g., 2 to 10) on the synthetic dataset.
- For each `k`, record the `inertia_` and calculate the `silhouette_score`.
- Plot the `inertia_` values against `k` (Elbow Method). Identify the "elbow point."
- Plot the `silhouette_score` values against `k`. Identify the `k` with the highest score.
- Discuss the interpretations and potential limitations of each method.
# Define a range of K values to test
k_range = range(2, 11)
inertias = []
silhouette_scores = []
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
    kmeans.fit(X_synth)
    inertias.append(kmeans.inertia_)
    score = silhouette_score(X_synth, kmeans.labels_)
    silhouette_scores.append(score)
    print(f" K={k}: Inertia={kmeans.inertia_:.2f}, Silhouette Score={score:.4f}")
# Plotting the Elbow Method
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range, inertias, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal K (Inertia)')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.xticks(k_range)
plt.grid(True, linestyle='--', alpha=0.6)
# Plotting the Silhouette Score
plt.subplot(1, 2, 2)
plt.plot(k_range, silhouette_scores, marker='o', linestyle='-', color='orange')
plt.title('Silhouette Score for Optimal K')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.xticks(k_range)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
K=2: Inertia=5526.51, Silhouette Score=0.7206
K=3: Inertia=362.79, Silhouette Score=0.8781
K=4: Inertia=320.34, Silhouette Score=0.6929
K=5: Inertia=276.63, Silhouette Score=0.4982
K=6: Inertia=233.60, Silhouette Score=0.3268
K=7: Inertia=206.92, Silhouette Score=0.3327
K=8: Inertia=181.80, Silhouette Score=0.3403
K=9: Inertia=164.57, Silhouette Score=0.3266
K=10: Inertia=153.53, Silhouette Score=0.3268
Discussion Points:
- Where would you identify the "elbow" in the inertia plot? Does it align with the true number of clusters (K=3) for our synthetic data?
- Which K value yielded the highest Silhouette Score? How does this compare to the Elbow Method and the true K?
- What are the strengths and weaknesses of each method (Elbow vs. Silhouette)? When might one be preferred over the other?
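Regarding the second discussion point, the silhouette-preferred K can be read off programmatically from the values computed above. This is a minimal sketch reusing `k_range` and `silhouette_scores`; `best_k` is just an illustrative name.
# Pick the K with the highest silhouette score; for this synthetic dataset it
# coincides with the true number of clusters (K=3).
best_k = list(k_range)[int(np.argmax(silhouette_scores))]
print(f"K with the highest silhouette score: {best_k}")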
Part 3: K-Means on a Real-World Dataset with Preprocessing
Real-world datasets often have features with different scales, which can heavily influence distance-based algorithms like K-means. We'll use the Wine dataset to demonstrate the importance of scaling and then evaluate clustering results. The Wine dataset has known classes, allowing us to use extrinsic evaluation metrics.
Tasks:
- Load the Wine dataset.
- Crucially: Perform feature scaling using `StandardScaler`. Explain why this is necessary.
- Apply K-means to the scaled data.
- Evaluate the clustering performance using:
  - `silhouette_score` (unsupervised)
  - `homogeneity_score` (supervised - measures if each cluster contains only members of a single class)
  - `completeness_score` (supervised - measures if all members of a given class are assigned to the same cluster)
  - `adjusted_rand_score` (supervised - measures similarity between two clusterings, correcting for chance)
- Visualize the clusters using PCA for dimensionality reduction, as the Wine dataset has 13 features.
# 1. Load the Wine dataset
wine = load_wine()
X_wine = pd.DataFrame(wine.data, columns=wine.feature_names)
y_true_wine = wine.target # True labels (for evaluation)
print(f"Wine dataset shape: {X_wine.shape}")
print(f"True number of classes in Wine dataset: {len(np.unique(y_true_wine))}")
print("\nOriginal Wine Data (first 5 rows):")
print(X_wine.head())
# 2. Feature Scaling
# K-means is a distance-based algorithm. Features with larger scales will
# dominate the distance calculations. Scaling ensures all features contribute equally.
scaler_wine = StandardScaler()
X_wine_scaled = scaler_wine.fit_transform(X_wine)
X_wine_scaled_df = pd.DataFrame(X_wine_scaled, columns=wine.feature_names)
print("\nScaled Wine Data (first 5 rows):")
print(X_wine_scaled_df.head())
# 3. Determine Optimal K for Wine dataset (using Elbow and Silhouette)
# True classes are 3, so let's aim around that.
k_range_wine = range(2, 7) # Test K from 2 to 6
inertias_wine = []
silhouette_scores_wine = []
for k in k_range_wine:
    kmeans_wine_opt = KMeans(n_clusters=k, random_state=random_state, n_init='auto')
    kmeans_wine_opt.fit(X_wine_scaled)
    inertias_wine.append(kmeans_wine_opt.inertia_)
    score_wine = silhouette_score(X_wine_scaled, kmeans_wine_opt.labels_)
    silhouette_scores_wine.append(score_wine)
    print(f" Wine Data K={k}: Inertia={kmeans_wine_opt.inertia_:.2f}, Silhouette Score={score_wine:.4f}")
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range_wine, inertias_wine, marker='o', linestyle='-')
plt.title('Elbow Method for Optimal K (Wine Data)')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Inertia')
plt.xticks(k_range_wine)
plt.grid(True, linestyle='--', alpha=0.6)
plt.subplot(1, 2, 2)
plt.plot(k_range_wine, silhouette_scores_wine, marker='o', linestyle='-', color='orange')
plt.title('Silhouette Score for Optimal K (Wine Data)')
plt.xlabel('Number of Clusters (K)')
plt.ylabel('Silhouette Score')
plt.xticks(k_range_wine)
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()
# Based on elbow/silhouette (and knowing true K is 3), let's proceed with K=3
optimal_k_wine = 3
kmeans_wine = KMeans(n_clusters=optimal_k_wine, random_state=random_state, n_init='auto')
kmeans_wine.fit(X_wine_scaled)
labels_wine = kmeans_wine.labels_
# 4. Evaluate the clustering performance
print(f"\n--- K-means Evaluation for Wine Dataset (K={optimal_k_wine}) ---")
print(f"Silhouette Score (unsupervised): {silhouette_score(X_wine_scaled, labels_wine):.4f}")
print(f"Homogeneity Score (supervised): {homogeneity_score(y_true_wine, labels_wine):.4f}")
print(f"Completeness Score (supervised): {completeness_score(y_true_wine, labels_wine):.4f}")
print(f"Adjusted Rand Score (supervised): {adjusted_rand_score(y_true_wine, labels_wine):.4f}")
# 5. Visualize clusters using PCA
# PCA reduces the dimensionality for visualization while retaining as much variance as possible.
pca = PCA(n_components=2)
X_wine_pca = pca.fit_transform(X_wine_scaled)
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_wine_pca[:, 0], y=X_wine_pca[:, 1], hue=labels_wine, palette='viridis', legend='full', s=80, alpha=0.7)
centroids_wine_pca = pca.transform(kmeans_wine.cluster_centers_)  # project the 13-D centroids into PCA space before plotting
plt.scatter(centroids_wine_pca[:, 0], centroids_wine_pca[:, 1], marker='X', s=200, color='red', label='Centroids', edgecolor='black')
plt.title(f'K-means Clusters (K={optimal_k_wine}) on Wine Data (PCA-reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
# Optional: Visualize true labels for comparison
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_wine_pca[:, 0], y=X_wine_pca[:, 1], hue=y_true_wine, palette='viridis', legend='full', s=80, alpha=0.7)
plt.title('True Classes of Wine Data (PCA-reduced)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
Wine dataset shape: (178, 13)
True number of classes in Wine dataset: 3

Original Wine Data (first 5 rows):
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80
1    13.20        1.78  2.14               11.2      100.0           2.65
2    13.16        2.36  2.67               18.6      101.0           2.80
3    14.37        1.95  2.50               16.8      113.0           3.85
4    13.24        2.59  2.87               21.0      118.0           2.80

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04
1        2.76                  0.26             1.28             4.38  1.05
2        3.24                  0.30             2.81             5.68  1.03
3        3.49                  0.24             2.18             7.80  0.86
4        2.69                  0.39             1.82             4.32  1.04

   od280/od315_of_diluted_wines  proline
0                          3.92   1065.0
1                          3.40   1050.0
2                          3.17   1185.0
3                          3.45   1480.0
4                          2.93    735.0

Scaled Wine Data (first 5 rows):
    alcohol  malic_acid       ash  alcalinity_of_ash  magnesium  \
0  1.518613   -0.562250  0.232053          -1.169593   1.913905
1  0.246290   -0.499413 -0.827996          -2.490847   0.018145
2  0.196879    0.021231  1.109334          -0.268738   0.088358
3  1.691550   -0.346811  0.487926          -0.809251   0.930918
4  0.295700    0.227694  1.840403           0.451946   1.281985

   total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
0       0.808997    1.034819             -0.659563         1.224884
1       0.568648    0.733629             -0.820719        -0.544721
2       0.808997    1.215533             -0.498407         2.135968
3       2.491446    1.466525             -0.981875         1.032155
4       0.808997    0.663351              0.226796         0.401404

   color_intensity       hue  od280/od315_of_diluted_wines   proline
0         0.251717  0.362177                      1.847920  1.013009
1        -0.293321  0.406051                      1.113449  0.965242
2         0.269020  0.318304                      0.788587  1.395148
3         1.186068 -0.427544                      1.184071  2.334574
4        -0.319276  0.362177                      0.449601 -0.037874

 Wine Data K=2: Inertia=1661.68, Silhouette Score=0.2650
 Wine Data K=3: Inertia=1277.93, Silhouette Score=0.2849
 Wine Data K=4: Inertia=1211.75, Silhouette Score=0.2542
 Wine Data K=5: Inertia=1123.16, Silhouette Score=0.1836
 Wine Data K=6: Inertia=1079.54, Silhouette Score=0.1690
--- K-means Evaluation for Wine Dataset (K=3) ---
Silhouette Score (unsupervised): 0.2849
Homogeneity Score (supervised): 0.8788
Completeness Score (supervised): 0.8730
Adjusted Rand Score (supervised): 0.8975
Discussion Points:
- Why was scaling particularly important for the Wine dataset before applying K-means?
- Compare the Silhouette Score to the supervised metrics (Homogeneity, Completeness, Adjusted Rand Score). How do these metrics complement each other?
- How well did K-means recover the true underlying structure of the Wine dataset based on your evaluation and visualizations? What challenges might arise in evaluating clustering without ground truth?
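One way to probe the last two discussion points is to cross-tabulate the K-means labels against the true wine classes, reusing `y_true_wine` and `labels_wine` from the cells above. Cluster numbers are arbitrary, so a good result shows one dominant class per cluster rather than a diagonal; `contingency` is just an illustrative name.
# Contingency table: rows = true classes, columns = K-means cluster labels.
contingency = pd.crosstab(pd.Series(y_true_wine, name='True class'),
                          pd.Series(labels_wine, name='K-means cluster'))
print(contingency)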
Part 4: Initialization Strategies and Limitations
The initial placement of centroids can significantly impact the final clustering result, especially for complex datasets. K-means also has inherent limitations.
Tasks:
- Generate another synthetic dataset that might highlight K-means' limitations (e.g., non-spherical clusters, varying densities, or outliers).
- Compare `k-means++` (default) vs. `random` initialization. Run K-means multiple times with `n_init=1` for 'random' to see variability.
- Visualize the results of different initialization strategies.
- Discuss how K-means handles (or struggles with) non-spherical clusters, clusters of varying densities, and outliers.
# 1. Generate a synthetic dataset to highlight limitations (e.g., non-spherical)
# Make two crescent shapes for non-spherical clusters (using make_moons)
from sklearn.datasets import make_moons
X_moons, y_true_moons = make_moons(n_samples=200, noise=0.05, random_state=random_state)
# Add some "outliers" manually
outliers = np.array([[2.0, 1.0], [-1.5, 1.5], [0.5, -1.0]])
X_moons = np.vstack([X_moons, outliers])
y_true_moons = np.hstack([y_true_moons, [2, 2, 2]]) # Assign a dummy label for outliers
print(f"Moons dataset shape (with outliers): {X_moons.shape}")
# 2. Compare 'k-means++' (default) vs. 'random' initialization
n_clusters_moons = 2 # We aim for 2 main clusters ignoring outliers for a moment
print("\n--- Comparing Initialization Methods for Moons Data (K=2) ---")
# k-means++ initialization
print("\nK-means with 'k-means++' initialization:")
kmeans_plusplus = KMeans(n_clusters=n_clusters_moons, init='k-means++', n_init=10, random_state=random_state)
kmeans_plusplus.fit(X_moons)
labels_plusplus = kmeans_plusplus.labels_
print(f" Inertia (k-means++): {kmeans_plusplus.inertia_:.2f}")
print(f" Silhouette Score (k-means++): {silhouette_score(X_moons, labels_plusplus):.4f}")
# Random initialization: run K-means several times with n_init=1 to expose run-to-run variability.
# For reproducibility we vary the seed (random_state + i) across runs instead of leaving it unset.
print("\nK-means with 'random' initialization (multiple runs):")
random_inits_labels = []
random_inits_inertia = []
random_inits_silhouette = []
# Perform multiple runs with random initialization (setting n_init=1 explicitly overrides auto for demonstration)
for i in range(3): # Run 3 times
    kmeans_random = KMeans(n_clusters=n_clusters_moons, init='random', n_init=1, random_state=random_state + i) # Varying seed
    kmeans_random.fit(X_moons)
    random_inits_labels.append(kmeans_random.labels_)
    random_inits_inertia.append(kmeans_random.inertia_)
    random_inits_silhouette.append(silhouette_score(X_moons, kmeans_random.labels_))
    print(f" Run {i+1} (random): Inertia={random_inits_inertia[-1]:.2f}, Silhouette={random_inits_silhouette[-1]:.4f}")
# Visualize results of k-means++ (primary result)
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_moons[:, 0], y=X_moons[:, 1], hue=labels_plusplus, palette='viridis', legend='full', s=80, alpha=0.7)
plt.scatter(kmeans_plusplus.cluster_centers_[:, 0], kmeans_plusplus.cluster_centers_[:, 1], marker='X', s=200, color='red', label='Centroids', edgecolor='black')
plt.title(f'K-means with k-means++ Init on Moons Data (K={n_clusters_moons})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
# Visualize one of the random initialization runs (e.g., the first one)
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_moons[:, 0], y=X_moons[:, 1], hue=random_inits_labels[0], palette='viridis', legend='full', s=80, alpha=0.7)
plt.title(f'K-means with Random Init (Run 1) on Moons Data (K={n_clusters_moons})')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
Moons dataset shape (with outliers): (203, 2)

--- Comparing Initialization Methods for Moons Data (K=2) ---

K-means with 'k-means++' initialization:
 Inertia (k-means++): 86.98
 Silhouette Score (k-means++): 0.4813

K-means with 'random' initialization (multiple runs):
 Run 1 (random): Inertia=86.98, Silhouette=0.4813
 Run 2 (random): Inertia=86.98, Silhouette=0.4813
 Run 3 (random): Inertia=86.98, Silhouette=0.4813
Discussion Points:
- How do the results from `k-means++` compare to `random` initialization? What is the purpose of `n_init` in `KMeans`?
- Based on the visualization of the "moons" dataset, how well did K-means perform? Why did it struggle to correctly identify the crescent-shaped clusters?
- How might outliers affect K-means clustering? What characteristic of K-means makes it sensitive to outliers?
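To illustrate the outlier question, the standalone toy sketch below (not part of the lab's datasets) shows why K-means is sensitive to outliers: each centroid is the arithmetic mean of its assigned points, and one extreme value can drag that mean away from the bulk of the cluster, whereas a median barely moves.
# A tight 1-D "cluster" plus one extreme outlier.
cluster_values = np.array([1.0, 1.1, 0.9, 1.05, 0.95])
with_outlier = np.append(cluster_values, 10.0)

print(f"Mean without outlier: {cluster_values.mean():.2f}")    # 1.00
print(f"Mean with outlier:    {with_outlier.mean():.2f}")      # pulled toward the outlier (2.50)
print(f"Median with outlier:  {np.median(with_outlier):.2f}")  # barely moves (1.03)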
Part 5: Advanced Topics & Discussion
This section is for broader discussion and conceptual understanding, extending beyond direct coding.
Discussion Topics:
- Comparison with other Clustering Algorithms: Briefly compare K-means to Hierarchical Clustering (e.g., Agglomerative Clustering) and Density-Based Spatial Clustering of Applications with Noise (DBSCAN). When would you choose one over K-means?
- Interpreting Clusters: Once clusters are formed, how would you go about interpreting their meaning? What techniques or analyses would you use to characterize each cluster?
- K-means in Practice: What are some common challenges encountered when applying K-means to real-world data, and how might these be addressed?
- Beyond Euclidean Distance: Discuss situations where Euclidean distance might not be the most appropriate similarity metric for K-means. What other distance metrics could be used? (e.g., Manhattan, Cosine, Mahalanobis).
1. Comparison with other Clustering Algorithms:
Hierarchical Clustering (e.g., Agglomerative):
- Pros: Does not require pre-specifying K; produces a dendrogram (tree structure) revealing hierarchical relationships; can identify clusters of varying sizes and shapes (depending on linkage).
- Cons: Computationally more expensive (especially for large datasets, $O(N^3)$ or $O(N^2)$); difficult to decide where to 'cut' the dendrogram; can be sensitive to noise.
- When to choose over K-means: When the number of clusters is unknown or when a hierarchical structure is expected/desired; for smaller datasets where computational cost is less of an issue.
DBSCAN (Density-Based Spatial Clustering of Applications with Noise):
- Pros: Can find arbitrarily shaped clusters; can identify outliers (noise points); does not require pre-specifying K (though it requires `eps` and `min_samples`).
- Cons: Struggles with clusters of varying densities; sensitive to parameter choice (`eps`, `min_samples`); not ideal for high-dimensional data without dimensionality reduction; struggles with global vs. local density variations.
- When to choose over K-means: When clusters are expected to be non-spherical or when identifying outliers is important; when density variations are distinct (see the code sketch after this comparison).
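To make this comparison concrete, here is a minimal sketch that runs Agglomerative Clustering and DBSCAN on the crescent-shaped data from Part 4. It assumes `X_moons` is still in memory; the `eps` and `min_samples` values are illustrative guesses and may need tuning, and single linkage can itself be thrown off by the added outliers.
from sklearn.cluster import AgglomerativeClustering, DBSCAN

# Agglomerative clustering with single linkage follows chain-like,
# non-spherical structures better than centroid-based K-means can,
# though it is sensitive to noise and bridging points.
agg = AgglomerativeClustering(n_clusters=2, linkage='single')
labels_agg = agg.fit_predict(X_moons)

# DBSCAN groups dense regions and labels low-density points (such as the
# manually added outliers) as noise (-1). Parameters are illustrative.
db = DBSCAN(eps=0.2, min_samples=5)
labels_db = db.fit_predict(X_moons)

print("Agglomerative cluster sizes:", np.bincount(labels_agg))
print("DBSCAN labels found (-1 = noise):", np.unique(labels_db))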
2. Interpreting Clusters:
- Once clusters are formed, how do you derive meaning from them?
- Techniques:
- Descriptive Statistics: Calculate mean/median/mode of features for each cluster. Compare these statistics across clusters to see what differentiates them (e.g., 'Cluster 1 has high income, low age'); see the sketch after this list.
- Feature Importance: Analyze which features contributed most to the clustering (e.g., by looking at centroid values or feature ranges within clusters).
- Visualization: Create visualizations (box plots, bar charts, scatter plots) for key features broken down by cluster.
- Domain Expertise: Bring in domain experts to validate and name the clusters based on the characteristics.
- Qualitative Analysis: If data points have associated text or images, sample and review them.
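As a concrete example of the descriptive-statistics approach referenced above, here is a minimal sketch that attaches the Part 3 cluster labels to the unscaled Wine features and compares per-cluster means. It assumes `X_wine` and `labels_wine` are still in memory; `wine_profiled` is just an illustrative name.
# Profile each cluster by the mean of every (unscaled) feature; comparing rows
# shows what differentiates the clusters (e.g., alcohol, flavanoid, proline levels).
wine_profiled = X_wine.copy()
wine_profiled['cluster'] = labels_wine
print(wine_profiled.groupby('cluster').mean().round(2))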
3. K-means in Practice: Common Challenges and Solutions:
- Challenges:
- Sensitivity to Initial Centroids: Can converge to suboptimal local minima.
- Requires Pre-defined K: Often difficult to know the optimal number of clusters.
- Sensitivity to Outliers: Outliers can disproportionately influence centroid positions.
- Assumes Spherical Clusters: Struggles with non-globular or irregularly shaped clusters.
- Assumes Equal Cluster Variance/Density: Performs poorly if clusters have vastly different sizes or densities.
- Scaling: Features on different scales can bias results.
- Solutions:
- Initialization: Use `k-means++` (default in scikit-learn) or run K-means multiple times with different random initializations (the `n_init` parameter) and pick the best result (lowest inertia).
- Optimal K: Use the Elbow Method, Silhouette Score, Gap Statistic, or domain knowledge.
- Outliers: Pre-process by detecting and removing/transforming outliers; consider more robust clustering algorithms like DBSCAN.
- Cluster Shape/Density: Use other algorithms like DBSCAN, Hierarchical Clustering with different linkage methods, or Gaussian Mixture Models.
- Scaling: Always scale features (e.g., `StandardScaler`, `MinMaxScaler`) before applying K-means (see the pipeline sketch below).
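As a small illustration of the scaling advice above, scaling and clustering can be chained so the scaler is never skipped by accident. This is a minimal sketch using scikit-learn's `make_pipeline`, reusing `X_wine` and `y_true_wine` from Part 3; `kmeans_pipeline` is an illustrative name.
from sklearn.pipeline import make_pipeline

# Chain scaling and clustering so new data is scaled with the same
# statistics before being assigned to a cluster.
kmeans_pipeline = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=3, random_state=random_state, n_init='auto'),
)
pipeline_labels = kmeans_pipeline.fit_predict(X_wine)
print(f"Pipeline ARI vs. true wine classes: {adjusted_rand_score(y_true_wine, pipeline_labels):.4f}")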
4. Beyond Euclidean Distance:
- When might Euclidean distance not be appropriate for K-means? What alternatives exist?
- When Euclidean is Not Appropriate:
- Categorical Data: Euclidean distance is meaningless for purely categorical features.
- High-Dimensional Data (Curse of Dimensionality): Distances become less meaningful in very high dimensions.
- Text Data/Sparse Data: Where feature vectors are very sparse or represent word counts/frequencies.
- Rotated/Scaled Clusters: If clusters are elongated or rotated, Euclidean distance might perform poorly unless features are transformed (e.g., PCA).
- Alternative Distance Metrics (and when to use them):
- Manhattan Distance (L1 norm): Sum of absolute differences. More robust to outliers than Euclidean; useful in grid-like movements.
- Cosine Distance (defined as 1 - cosine similarity): Measures the angle between two vectors rather than their magnitude. Commonly used for text analysis, recommendation systems, or high-dimensional data where direction (pattern) is more important than magnitude (length); see the sketch after this list for one way to approximate this with K-means.
- Mahalanobis Distance: Accounts for correlations between variables and differences in scale. More complex, requires covariance matrix.
- Hamming Distance: For binary or categorical data, counting positions where symbols differ.
- Jaccard Distance: For sets or binary data, useful for presence/absence information.
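As a sketch of one common workaround when direction matters more than magnitude (scikit-learn's `KMeans` itself only supports Euclidean distance): L2-normalize the feature vectors first, so that squared Euclidean distance between unit vectors becomes proportional to cosine distance. Shown here on the scaled Wine data purely as an illustration; `X_wine_unit` and `kmeans_cosine` are illustrative names.
from sklearn.preprocessing import normalize

# After L2 normalization, squared Euclidean distance between two unit vectors
# equals 2 * (1 - cosine similarity), so K-means on the normalized data groups
# points by direction (pattern) rather than magnitude.
X_wine_unit = normalize(X_wine_scaled, norm='l2')
kmeans_cosine = KMeans(n_clusters=3, random_state=random_state, n_init='auto')
labels_cosine = kmeans_cosine.fit_predict(X_wine_unit)
print(f"ARI (cosine-style K-means vs. true classes): {adjusted_rand_score(y_true_wine, labels_cosine):.4f}")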
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd