Density-Based Clustering with DBSCAN¶
Unsupervised Learning, Density-Based Clustering, Core Points, Border Points, Noise/Outliers, Epsilon (ε), MinPts (Minimum Points), Arbitrarily Shaped Clusters.
Objectives:¶
- Students will understand the limitations of centroid-based clustering algorithms like K-Means when dealing with non-spherical clusters or outliers.
- Students will grasp the core, density-based concepts of DBSCAN: core points, border points, and noise points.
- Students will understand the two critical hyperparameters, eps (Epsilon) and min_samples (MinPts), and their influence on the clustering outcome.
- Students will implement the DBSCAN algorithm using scikit-learn.
- Students will learn and apply a practical heuristic (the k-distance graph) for choosing an optimal eps value.
- Students will visualize and interpret DBSCAN results, including its ability to identify arbitrarily shaped clusters and correctly label noise points.
Setup: Install and Import Libraries¶
# No special installs are needed
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# Scikit-learn for clustering, datasets, and preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.datasets import make_moons, make_circles
from sklearn.neighbors import NearestNeighbors
# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"Pandas Version: {pd.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2 Scikit-learn Version: 1.6.1
Part 1: The Weakness of K-Means & The Need for DBSCAN¶
Centroid-based algorithms like K-Means work by partitioning data into a pre-specified number of spherical "blobs." This works well for data that is naturally grouped this way, but it fails on more complex structures. K-Means will attempt to force-fit its spherical clusters onto any data shape and is sensitive to outliers, as every point must be assigned to a cluster.
Let's demonstrate this failure on a dataset where the true clusters are non-spherical.
print("--- Part 1: Demonstrating the Failure of K-Means ---")
# 1. Generate a non-spherical dataset (moons) with some noise
X, y = make_moons(n_samples=500, noise=0.1, random_state=42)
X_scaled = StandardScaler().fit_transform(X)
# 2. Apply K-Means with k=2
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X_scaled)
# 3. Visualize the K-Means result
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=kmeans_labels, palette='viridis')
plt.title('K-Means Fails on Non-Spherical Data')
plt.xlabel('Feature 1 (Scaled)')
plt.ylabel('Feature 2 (Scaled)')
plt.legend(title='K-Means Cluster')
plt.show()
# INSIGHT: K-Means fails completely. It incorrectly splits each "moon" in half, unable to capture the true crescent-shaped clusters.
# This motivates the need for a different approach that doesn't assume a cluster shape.
--- Part 1: Demonstrating the Failure of K-Means ---
Part 2: Core Concepts of DBSCAN¶
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) defines clusters as continuous regions of high density. It is governed by two simple concepts:
- eps (Epsilon, ε): The radius of a neighborhood. If you draw a circle with radius eps around a data point, all points inside that circle are its neighbors.
- min_samples (MinPts): The minimum number of points required inside a point's eps-neighborhood for that point to be considered a core point.
Based on these parameters, every point in the dataset is classified as one of three types:
- Core Point: A point that has at least min_samples neighbors (including itself) within its eps radius. These points are the "hearts" of a cluster.
- Border Point: A point that is within the eps radius of a core point but does not have enough neighbors to be a core point itself. These are the "edges" of a cluster.
- Noise Point (Outlier): Any point that is neither a core point nor a border point. These points are isolated in low-density regions.
The algorithm works by picking a point, checking if it's a core point, and if so, expanding a cluster from it and all its density-reachable neighbors.
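To make these definitions concrete, here is a minimal sketch (illustrative, not part of the lab code; the toy coordinates and parameter values are assumptions) that fits DBSCAN on a tiny hand-made dataset and reads each point's type back from the fitted model's core_sample_indices_ and labels_ attributes.
import numpy as np
from sklearn.cluster import DBSCAN
# A tight blob of five points, one point hanging off its edge, and one isolated point
toy = np.array([[0.0, 0.0], [0.0, 0.2], [0.2, 0.0], [0.2, 0.2], [0.1, 0.1],
                [0.5, 0.1],
                [3.0, 3.0]])
db_toy = DBSCAN(eps=0.4, min_samples=5).fit(toy)
is_core = np.zeros(len(toy), dtype=bool)
is_core[db_toy.core_sample_indices_] = True  # indices of the core points found by DBSCAN
for i, (label, core) in enumerate(zip(db_toy.labels_, is_core)):
    kind = "core" if core else ("noise" if label == -1 else "border")
    print(f"Point {i}: cluster {label}, {kind}")
# Expected: the five blob points are core points of cluster 0, the point at (0.5, 0.1)
# is a border point of that cluster, and the point at (3, 3) is labeled noise (-1).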
Part 3: Implementing DBSCAN¶
Now, let's apply DBSCAN to the same moons dataset where K-Means failed. We'll start with some reasonable parameter choices and see how it performs.
print("\n--- Part 3: Implementing DBSCAN ---")
# 1. Initialize and fit the DBSCAN model
# We'll choose eps=0.3 and min_samples=5 as a starting point.
dbscan = DBSCAN(eps=0.3, min_samples=5)
dbscan_labels = dbscan.fit_predict(X_scaled)
# The labels are integers for each cluster. Noise points are labeled -1.
n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
n_noise = list(dbscan_labels).count(-1)
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
# 2. Visualize the DBSCAN result
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_scaled[:, 0], y=X_scaled[:, 1], hue=dbscan_labels, palette='viridis')
plt.title('DBSCAN Successfully Identifies Non-Spherical Clusters')
plt.xlabel('Feature 1 (Scaled)')
plt.ylabel('Feature 2 (Scaled)')
plt.legend(title='DBSCAN Cluster')
plt.show()
# INSIGHT: Success! DBSCAN identified the two moon-shaped clusters (labeled 0 and 1)
# and flagged only a single isolated, low-density point as noise (-1) instead of forcing it into a cluster.
--- Part 3: Implementing DBSCAN --- Estimated number of clusters: 2 Estimated number of noise points: 1
Part 4: A Practical Method for Choosing eps¶
The performance of DBSCAN is highly sensitive to the choice of eps. A value that is too small will result in most data being classified as noise. A value that is too large will merge all clusters into one.
A common and effective heuristic for choosing eps is the k-distance graph:
- Set k = min_samples.
- For every point in the dataset, find the distance to its k-th nearest neighbor.
- Plot these distances in sorted order.
- The "elbow" or "knee" of this plot is a good candidate for the optimal eps value. It represents the point where the distances start increasing sharply, indicating a transition from dense to sparse regions.
print("\n--- Part 4: Choosing an Optimal `eps` ---")
# We will use k = min_samples = 5
k = 5
# 1. Calculate the distance to the k-th nearest neighbor for each point
neighbors = NearestNeighbors(n_neighbors=k)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
# 2. Sort the distances
k_distances = np.sort(distances[:, k-1], axis=0)
# 3. Plot the k-distance graph
plt.figure(figsize=(10, 6))
plt.plot(k_distances)
plt.title('K-Distance Graph')
plt.xlabel('Points (sorted by distance)')
plt.ylabel(f'{k}-th Nearest Neighbor Distance')
plt.axhline(y=0.3, color='r', linestyle='--', label='Elbow at eps=0.3')
plt.legend()
plt.show()
# INSIGHT: The plot shows a distinct "elbow" around a distance of 0.3. This confirms our initial
# choice of eps=0.3 was a good one and provides a data-driven way to select this hyperparameter.
--- Part 4: Choosing an Optimal `eps` ---
Lab Tasks & Exercises¶
Now, apply what you've learned. The following tasks will help you explore the impact of hyperparameters and apply DBSCAN to new data shapes.
# --- TASK 1: The Effect of Hyperparameters ---
# Re-run DBSCAN on the moons dataset twice and visualize the results:
# 1. With a very small `eps` (e.g., 0.1). What happens to the data?
# 2. With a very large `eps` (e.g., 1.5). What happens to the clusters?
# YOUR CODE HERE for small eps
# dbscan_small_eps = DBSCAN(eps=0.1, min_samples=5).fit(X_scaled)
# plt.figure(figsize=(8,6))
# sns.scatterplot(x=X_scaled[:,0], y=X_scaled[:,1], hue=dbscan_small_eps.labels_, palette='viridis').set_title('Small eps=0.1')
# plt.show()
# print("With a small eps, most points are classified as noise (-1) because their neighborhoods are too small.")
# YOUR CODE HERE for large eps
# dbscan_large_eps = DBSCAN(eps=1.5, min_samples=5).fit(X_scaled)
# plt.figure(figsize=(8,6))
# sns.scatterplot(x=X_scaled[:,0], y=X_scaled[:,1], hue=dbscan_large_eps.labels_, palette='viridis').set_title('Large eps=1.5')
# plt.show()
# print("With a large eps, all points are considered part of the same dense region, so everything is merged into a single cluster (0).")
# --- TASK 2: Applying DBSCAN to Concentric Circles ---
# The `make_circles` dataset is another classic challenge for K-Means.
# 1. Generate a `make_circles` dataset with noise.
# 2. Use the k-distance graph method to estimate a good `eps` value for it.
# 3. Run DBSCAN with your estimated `eps` and `min_samples=5`. Does it work?
# YOUR CODE HERE
# # Generate data
# X_circles, y_circles = make_circles(n_samples=500, noise=0.05, factor=0.5, random_state=42)
# X_circles_scaled = StandardScaler().fit_transform(X_circles)
#
# # Find eps
# neighbors_c = NearestNeighbors(n_neighbors=5).fit(X_circles_scaled)
# distances_c, _ = neighbors_c.kneighbors(X_circles_scaled)
# k_distances_c = np.sort(distances_c[:, 4], axis=0)
# plt.plot(k_distances_c)
# plt.title('K-Distance Graph for Circles')
# plt.show() # Elbow looks to be around 0.2
#
# # Run DBSCAN and visualize
# dbscan_circles = DBSCAN(eps=0.2, min_samples=5).fit(X_circles_scaled)
# plt.figure(figsize=(8,6))
# sns.scatterplot(x=X_circles_scaled[:,0], y=X_circles_scaled[:,1], hue=dbscan_circles.labels_, palette='viridis').set_title('DBSCAN on Circles')
# plt.show()
# --- TASK 3: Applying DBSCAN to Real-World Data ---
# Apply DBSCAN to the Mall Customers data from the previous lab.
# Use the k-distance plot to find a good `eps`. Compare the clusters found by DBSCAN to the 5 clusters
# found by Hierarchical clustering. What is different about the result?
# YOUR CODE HERE
# url_mall = 'https://raw.githubusercontent.com/SteffiPeTaffy/machineLearningAZ/master/Machine%20Learning%20A-Z%20Template%20Folder/Part%204%20-%20Clustering/Section%2025%20-%20Hierarchical%20Clustering/Mall_Customers.csv'
# df_mall = pd.read_csv(url_mall)
# X_mall = df_mall[['Annual Income (k$)', 'Spending Score (1-100)']]
# X_mall_scaled = StandardScaler().fit_transform(X_mall)
#
# # Find eps (let's use min_samples=5)
# neighbors_m = NearestNeighbors(n_neighbors=5).fit(X_mall_scaled)
# distances_m, _ = neighbors_m.kneighbors(X_mall_scaled)
# k_distances_m = np.sort(distances_m[:, 4], axis=0)
# plt.plot(k_distances_m)
# plt.title('K-Distance Graph for Mall Customers') # Elbow at ~0.5
# plt.show()
#
# dbscan_mall = DBSCAN(eps=0.5, min_samples=5).fit(X_mall_scaled)
# plt.figure(figsize=(10,7))
# sns.scatterplot(x=X_mall_scaled[:,0], y=X_mall_scaled[:,1], hue=dbscan_mall.labels_, palette='viridis').set_title('DBSCAN on Mall Customer Data')
# plt.show()
# print("DBSCAN finds the same 5 clusters but also identifies several points as noise (-1), which Hierarchical clustering did not.")
Part 5: Advanced Topics & Discussion¶
- DBSCAN vs. Other Clustering Methods:
| Feature | K-Means | Hierarchical | DBSCAN |
|---|---|---|---|
| Cluster Shape | Assumes spherical | No assumption | Finds arbitrary shapes |
| Outlier Handling | No (assigns to nearest cluster) | No (assigns to some cluster) | Yes (explicitly labels noise) |
| # of Clusters (k) | Must be pre-specified | Not required (chosen from dendrogram) | Not required (determined by density) |
| Scalability | High (near-linear) | Low (quadratic/cubic) | Medium (N log N), can be slow |
| Sensitivity | Sensitive to initial centroids | Sensitive to linkage method | Sensitive to eps and min_samples |
Pros and Cons of DBSCAN:
- Pros:
  - Does not require you to specify the number of clusters.
  - Can find arbitrarily shaped clusters.
  - Is robust to outliers and has a built-in notion of noise.
- Cons:
  - Does not work well with clusters of varying density; a single eps value may not be appropriate for the whole dataset (see the sketch after this list).
  - Struggles with high-dimensional data (the "curse of dimensionality" makes the eps distance threshold less meaningful).
  - Can be difficult to determine the correct eps and min_samples parameters without domain knowledge or experimentation.
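The varying-density limitation is easy to see in a quick experiment. The sketch below is an illustrative assumption rather than part of the lab: the blob centers, spreads, and eps value are made up, but it shows how one global eps cannot serve a tight and a diffuse cluster at the same time.
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
# One tight blob and one diffuse blob, clustered with a single global eps
X_var, _ = make_blobs(n_samples=[200, 200], centers=[[0, 0], [6, 0]],
                      cluster_std=[0.3, 1.8], random_state=42)
labels_var = DBSCAN(eps=0.4, min_samples=5).fit_predict(X_var)
# The tight blob is typically recovered as one cluster, while much of the diffuse blob
# is labeled noise (-1) or broken into small fragments, because no single eps suits both densities.
print(np.unique(labels_var, return_counts=True))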
Variations of DBSCAN: To address DBSCAN's limitations, more advanced density-based algorithms have been developed:
- OPTICS (Ordering Points To Identify the Clustering Structure): Can be seen as a generalization of DBSCAN that handles clusters of varying density much better.
- HDBSCAN (Hierarchical DBSCAN): An improvement on OPTICS that is more robust and often requires fewer parameter choices. It can discover clusters of varying densities and is generally a powerful, modern alternative.
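As a quick illustration, both can be run as near drop-in replacements for DBSCAN. This is a minimal sketch, assuming scikit-learn >= 1.3 (which provides both estimators in sklearn.cluster) and reusing X_scaled from Part 1; the parameter values are only starting points.
from sklearn.cluster import OPTICS, HDBSCAN
# OPTICS: builds a reachability ordering instead of relying on one global eps
optics_labels = OPTICS(min_samples=5).fit_predict(X_scaled)
# HDBSCAN: hierarchical and density-based; the main knob is the minimum cluster size
hdbscan_labels = HDBSCAN(min_cluster_size=5).fit_predict(X_scaled)
for name, labels in [("OPTICS", optics_labels), ("HDBSCAN", hdbscan_labels)]:
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    print(f"{name}: {n_clusters} clusters, {n_noise} noise points")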
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd