Understanding Data Structures with Hierarchical Clustering¶

Topics: Unsupervised Learning, Clustering, Hierarchical Clustering (Agglomerative vs. Divisive), Dendrograms, Linkage Criteria (Single, Complete, Average, Ward's), Proximity Matrix.

Objectives:¶

  • Students will understand the principles of unsupervised learning and the goal of clustering to find inherent groupings in data.
  • Students will differentiate between partitional clustering (like K-Means) and the nested cluster structure produced by hierarchical clustering.
  • Students will grasp the intuition behind agglomerative ("bottom-up") hierarchical clustering.
  • Students will understand and compare the four main linkage criteria (Single, Complete, Average, Ward's) and their impact on cluster shape.
  • Students will learn to generate and interpret dendrograms to visualize the cluster hierarchy and determine an appropriate number of clusters.
  • Students will implement hierarchical clustering using both scipy (for dendrograms) and scikit-learn (for cluster assignment).
  • Students will apply the technique to a real-world dataset, visualize the results, and derive meaningful profiles for each cluster.

Setup: Install and Import Libraries¶

We will use scikit-learn for the clustering algorithm and scipy for its powerful dendrogram plotting capabilities.

In [1]:
# No special installs are needed as these are standard in Colab
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

# Scikit-learn for clustering and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering

# Scipy for dendrograms
import scipy.cluster.hierarchy as sch

# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)


# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print(f"Pandas Version: {pd.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2
Scikit-learn Version: 1.6.1

Part 1: Dataset & The Goal of Clustering¶

We will use the "Mall Customers" dataset, which contains information about customers, including their annual income and a "spending score" (from 1-100). This dataset is ideal for clustering because it has clear, intuitive customer segments that we can discover.

Our goal is to use hierarchical clustering to identify these distinct customer groups based on their income and spending habits without any prior labels.

Important: As hierarchical clustering is a distance-based algorithm, we must scale our features to ensure that one feature (like income) doesn't dominate the other (spending score).
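To see the effect concretely, here is a minimal, hypothetical sketch (the values are invented, and income is deliberately expressed in dollars rather than k$ to exaggerate the scale difference): without scaling, the Euclidean distance between two customers is driven almost entirely by the feature with the larger numeric range, while after standardization both features contribute comparably.

In [ ]:
# Hypothetical illustration of why scaling matters for distance-based clustering.
# Values are invented; income is in dollars here (not k$) to exaggerate the effect.
import numpy as np
from sklearn.preprocessing import StandardScaler

pts = np.array([[120000.0, 40.0],   # customer A: income ($), spending score
                [ 40000.0, 90.0]])  # customer B: income ($), spending score

raw_dist = np.linalg.norm(pts[0] - pts[1])
print(f"Raw distance: {raw_dist:,.1f}  (dominated by the income difference of 80,000)")

pts_scaled = StandardScaler().fit_transform(pts)
scaled_dist = np.linalg.norm(pts_scaled[0] - pts_scaled[1])
print(f"Scaled distance: {scaled_dist:.2f}  (both features now contribute equally)")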

In [2]:
print("--- Part 1: Dataset Loading and Preparation ---")

# 1. Load the dataset from a public URL
url = 'https://raw.githubusercontent.com/SteffiPeTaffy/machineLearningAZ/master/Machine%20Learning%20A-Z%20Template%20Folder/Part%204%20-%20Clustering/Section%2025%20-%20Hierarchical%20Clustering/Mall_Customers.csv'
df = pd.read_csv(url)

# 2. Select the relevant features for clustering
X = df[['Annual Income (k$)', 'Spending Score (1-100)']]

# 3. Visualize the raw data to see the inherent clusters
plt.figure(figsize=(10, 7))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=X)
plt.title('Raw Mall Customer Data')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

# 4. Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("\nData has been standardized.")
--- Part 1: Dataset Loading and Preparation ---
[Figure: scatter plot of the raw Mall Customer data, Annual Income (k$) vs. Spending Score (1-100)]
Data has been standardized.

Part 2: The Dendrogram - Visualizing the Hierarchy¶

The primary output of hierarchical clustering is the dendrogram. This tree-like diagram visually represents the entire hierarchy of cluster merges.

  • The Y-axis represents the dissimilarity (the merge distance under the chosen linkage criterion). The height of the horizontal line connecting two clusters indicates the distance at which they were merged.
  • The X-axis represents the individual data points (or clusters).

How to Choose the Number of Clusters (k): A common heuristic is to find the longest vertical line that is not crossed by any horizontal line and draw a horizontal threshold through it. The number of vertical lines the threshold crosses is the suggested number of clusters.

In [3]:
print("\n--- Part 2: Generating and Interpreting the Dendrogram ---")

# 1. Generate the linkage matrix using Ward's method
# Ward's method merges the pair of clusters that gives the smallest increase in total within-cluster variance. It's often a good default.
linked = sch.linkage(X_scaled, method='ward')

# 2. Plot the dendrogram
plt.figure(figsize=(14, 8))
dendrogram = sch.dendrogram(linked)

plt.title('Hierarchical Clustering Dendrogram (Ward\'s Linkage)')
plt.xlabel('Customers (Data Points)')
plt.ylabel('Euclidean Distance')

# Add a horizontal line to suggest a cut
plt.axhline(y=12.5, color='r', linestyle='--', label='Optimal Cut')
plt.legend()
plt.show()

# INSIGHT: By drawing a horizontal line at a distance of ~12.5, we intersect 5 vertical lines.
# This suggests that the optimal number of clusters for this dataset is 5.
--- Part 2: Generating and Interpreting the Dendrogram ---
[Figure: dendrogram (Ward's linkage) with a red dashed line marking the suggested cut at a distance of ~12.5]
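The same cut can also be made programmatically. As a quick sketch (reusing the `linked` matrix computed above), scipy's `fcluster` cuts the hierarchy at a chosen distance threshold and returns flat cluster labels, so we can check that a threshold of 12.5 does produce the five clusters we read off the dendrogram.

In [ ]:
from scipy.cluster.hierarchy import fcluster

# Cut the hierarchy at the same distance as the red dashed line above.
labels_at_cut = fcluster(linked, t=12.5, criterion='distance')
print("Clusters at a cut of 12.5:", len(np.unique(labels_at_cut)))  # expected: 5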

Part 3: Implementing Agglomerative Clustering in Scikit-Learn¶

Now that the dendrogram has informed our choice of k=5, we can use scikit-learn's AgglomerativeClustering to perform the clustering and assign each data point to one of the 5 clusters.

The algorithm works "bottom-up":

  1. Start with each data point in its own cluster.
  2. Iteratively find the two closest clusters and merge them.
  3. Repeat until the desired number of clusters (n_clusters) is reached.
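This merge sequence is exactly what the linkage matrix from Part 2 records: each row describes one merge as [index_i, index_j, merge_distance, size_of_new_cluster], with indices >= n_samples referring to clusters created by earlier merges. As a quick sketch (reusing `linked` and `X_scaled`), we can peek at the first and last merges to see the distances grow as the hierarchy is built.

In [ ]:
# Each row of the linkage matrix is one merge: [idx_i, idx_j, distance, new_cluster_size].
n_samples = X_scaled.shape[0]
print("First 3 merges (closest pairs of individual customers):")
print(linked[:3])
print("\nLast 3 merges (large, distant clusters joined at the top of the dendrogram):")
print(linked[-3:])
print(f"\nTotal merges: {linked.shape[0]} (always n_samples - 1 = {n_samples - 1})")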
In [5]:
print("\n--- Part 3: Applying Agglomerative Clustering ---")

# 1. Initialize and fit the model
# We use n_clusters=5 as determined from the dendrogram, and the same 'ward' linkage.
agg_cluster = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = agg_cluster.fit_predict(X_scaled)

# 2. Add the cluster labels to the original DataFrame
df['Cluster'] = y_pred

# 3. Visualize the resulting clusters
plt.figure(figsize=(12, 8))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)', data=df,
                hue='Cluster', palette='viridis', s=100)
plt.title('Customer Segments by Hierarchical Clustering (k=5)')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend(title='Cluster')
plt.show()

# INSIGHT: The algorithm successfully identified the 5 distinct customer segments that we could visually see in the raw data plot.
--- Part 3: Applying Agglomerative Clustering ---
[Figure: scatter plot of the five customer segments, coloured by cluster label]

Part 4: Comparing Different Linkage Methods¶

The linkage criterion—the rule used to calculate the distance between clusters—is the most important hyperparameter. The choice of linkage can dramatically change the clustering results. Let's visualize this.

  • Ward: Merges the pair of clusters that minimizes the increase in within-cluster variance. Tends to produce compact, similarly sized clusters. (Our default)
  • Complete: Uses the maximum distance between points in the two clusters. Less susceptible to noise.
  • Average: Uses the average of all pairwise distances between points in the two clusters. A good middle ground.
  • Single: Uses the minimum distance between points. Can find non-globular, "chain-like" clusters but is sensitive to noise.
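To make the first three definitions concrete, here is a minimal sketch on two toy 2-D clusters (invented points, not the mall data) that computes the single, complete, and average inter-cluster distances directly from the pairwise distance matrix. Ward's criterion is different in kind: it scores a candidate merge by the increase in within-cluster variance rather than by a single pairwise distance.

In [ ]:
from scipy.spatial.distance import cdist

# Two small, hypothetical clusters in 2-D (illustrative values only)
cluster_a = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0]])
cluster_b = np.array([[4.0, 4.0], [5.0, 4.0], [4.0, 5.0]])

pairwise = cdist(cluster_a, cluster_b)  # all point-to-point distances between A and B

print(f"Single linkage   (min of pairwise distances):  {pairwise.min():.3f}")
print(f"Complete linkage (max of pairwise distances):  {pairwise.max():.3f}")
print(f"Average linkage  (mean of pairwise distances): {pairwise.mean():.3f}")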
In [7]:
print("\n--- Part 4: Comparing Linkage Methods ---")

# Define the linkage methods to compare
linkage_methods = ['ward', 'complete', 'average', 'single']

fig, axes = plt.subplots(2, 2, figsize=(18, 14))
axes = axes.flatten()

for i, method in enumerate(linkage_methods):
    model = AgglomerativeClustering(n_clusters=5, linkage=method)
    y_pred_method = model.fit_predict(X_scaled)

    sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=y_pred_method,
                    palette='viridis', ax=axes[i], legend=False)
    axes[i].set_title(f'Linkage Method: "{method}"')
    axes[i].set_xlabel('Annual Income (Scaled)')
    axes[i].set_ylabel('Spending Score (Scaled)')

plt.tight_layout()
plt.show()

# INSIGHT: For this dataset, 'ward', 'complete', and 'average' produce very similar, logical clusters.
# 'single' linkage, however, performs poorly. It creates one large cluster and chains off the outliers,
# failing to capture the true underlying structure. This highlights the importance of choosing an appropriate linkage.
--- Part 4: Comparing Linkage Methods ---
[Figure: 2x2 grid of scatter plots comparing 'ward', 'complete', 'average', and 'single' linkage]
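As a rough quantitative complement to the visual comparison, we can score each linkage's labelling with the silhouette coefficient (range -1 to 1, higher is better). This is only a sketch: the silhouette score favours compact, well-separated clusters, so it should be read alongside the plots rather than instead of them.

In [ ]:
from sklearn.metrics import silhouette_score

for method in linkage_methods:
    labels = AgglomerativeClustering(n_clusters=5, linkage=method).fit_predict(X_scaled)
    print(f"{method:>8} linkage: silhouette = {silhouette_score(X_scaled, labels):.3f}")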

Part 5: Lab Tasks & Exercises¶

Now, apply what you've learned. The following tasks will help you solidify your understanding of dendrogram interpretation and cluster profiling.

In [ ]:
# --- TASK 1: Dendrogram Interpretation ---
# Look at the dendrogram you generated in Part 2.
# 1. If you were to draw a horizontal "cut" line at a distance of 20 on the Y-axis,
#    how many clusters would you get?
# 2. To get exactly two clusters, at roughly what distance would you need to make the cut?

# print("--- Task 1: Dendrogram Interpretation ---")
# print("Answer 1: A cut at a distance of 20 would intersect 3 vertical lines, resulting in 3 clusters.")
# print("Answer 2: To get two clusters, you would need to make the cut above the green merge, at a distance of approximately 25.")


# --- TASK 2: Applying to a New Dataset ---
# The `make_blobs` dataset is designed to have clear, spherical clusters.
# 1. Generate a `make_blobs` dataset with 300 samples and 4 centers.
# 2. Scale the data and generate a dendrogram using 'ward' linkage.
# 3. Based on the dendrogram, is the optimal number of clusters clear? Why?

# YOUR CODE HERE
# from sklearn.datasets import make_blobs
# X_blobs, y_blobs = make_blobs(n_samples=300, centers=4, cluster_std=0.8, random_state=42)
# X_blobs_scaled = StandardScaler().fit_transform(X_blobs)
#
# plt.figure(figsize=(14, 8))
# linked_blobs = sch.linkage(X_blobs_scaled, method='ward')
# sch.dendrogram(linked_blobs)
# plt.title('Dendrogram for Blobs Dataset')
# plt.axhline(y=7, color='r', linestyle='--')
# plt.show()
# print("\n--- Task 2: Blobs Dataset ---")
# print("Yes, the optimal number of clusters is very clear. The longest vertical lines are in the final 4 merges,")
# print("and a cut at a distance of ~7 clearly intersects 4 lines, matching the 4 centers we specified.")


# --- TASK 3: Profiling the Mall Customer Clusters ---
# Using the `df` DataFrame from Part 3 which contains the cluster labels (from 'ward' linkage),
# calculate the mean 'Annual Income (k$)' and 'Spending Score (1-100)' for each of the 5 clusters.
# Based on these means, create a descriptive persona for each cluster.

# YOUR CODE HERE
# cluster_profiles = df.groupby('Cluster')[['Annual Income (k$)', 'Spending Score (1-100)']].mean()
# print("\n--- Task 3: Customer Cluster Profiles ---")
# print(cluster_profiles)
# print("\n--- Personas ---")
# print("Cluster 0: High Income, Low Spending (Careful/Rich)")
# print("Cluster 1: Average Income, Average Spending (Standard/Target)")
# print("Cluster 2: Low Income, Low Spending (Sensible/Students)")
# print("Cluster 3: Low Income, High Spending (Careless/Young)")
# print("Cluster 4: High Income, High Spending (Prime Target/VIP)")

Part 6: Advanced Topics & Discussion¶

  • Agglomerative vs. Divisive Clustering:

    • Agglomerative (what we did) is a "bottom-up" approach. It is far more common in practice and computationally more efficient.
    • Divisive is a "top-down" approach. It starts with all data in one cluster and recursively splits it. It is computationally very expensive (O(2^n)) and rarely used in practice.
  • Computational Complexity: Standard hierarchical clustering is computationally intensive. It requires creating and storing an n x n proximity matrix (where n is the number of data points), leading to a memory complexity of O(n²). Its time complexity is typically O(n³). This makes it unsuitable for very large datasets (e.g., > 10,000 data points).

  • Hierarchical vs. K-Means:

    • K-Means: Faster and more scalable (near-linear complexity). Requires the number of clusters k to be specified beforehand. Sensitive to random initialization and tends to find spherical clusters of similar size.
    • Hierarchical: Does not require pre-specifying k. Produces a rich dendrogram that can be very insightful. It's deterministic (no random initialization). However, it's much slower and less scalable. The initial merge/split decisions are final and cannot be corrected later.
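As a quick sketch of that last comparison (assuming `X_scaled` and the Part 3 labels `y_pred` are still in memory), we can fit K-Means with the same k and measure how closely the two partitions agree using the adjusted Rand index, which ignores label ordering and equals 1.0 for identical partitions. On a dataset with clusters this well separated, the two methods typically agree closely.

In [ ]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

# Fit K-Means with the same number of clusters as the hierarchical model
kmeans_labels = KMeans(n_clusters=5, n_init=10, random_state=42).fit_predict(X_scaled)

# Compare the two partitions (cluster IDs differ, but ARI is invariant to relabelling)
ari = adjusted_rand_score(y_pred, kmeans_labels)
print(f"Adjusted Rand index (hierarchical vs. K-Means): {ari:.3f}")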

Prepared By

Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd