Dimensionality Reduction with PCA and Visualization with t-SNE

Curse of Dimensionality, Principal Component Analysis (PCA), Variance Maximization, Eigenvectors/Eigenvalues, Explained Variance, t-Distributed Stochastic Neighbor Embedding (t-SNE), Manifold Learning, Perplexity, KL Divergence.

Objectives:

  • Students will understand the "Curse of Dimensionality" and the critical need for dimensionality reduction techniques in machine learning.
  • Students will grasp the theoretical basis of PCA as a linear technique that finds orthogonal components of maximum variance.
  • Students will implement PCA using scikit-learn to reduce the number of features and interpret the results using explained variance.
  • Students will understand the intuition behind t-SNE as a non-linear technique for visualizing the local structure and neighborhoods of high-dimensional data.
  • Students will implement t-SNE to create meaningful 2D embeddings and interpret the resulting visualizations.
  • Students will critically compare PCA and t-SNE, highlighting their different goals (preserving global variance vs. preserving local similarities) and appropriate use cases.
  • Students will apply these techniques to a real dataset and complete tasks to solidify their understanding of key hyperparameters.

Setup: Install and Import Libraries

In [1]:
# Install necessary libraries if not already present in Colab environment
!pip install pandas numpy scikit-learn matplotlib seaborn -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import time # To compare computation times

# Scikit-learn for PCA, t-SNE, datasets, and models
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)


# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print(f"Pandas Version: {pd.__version__}")
print(f"Seaborn Version: {sns.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2
Seaborn Version: 0.13.2
Scikit-learn Version: 1.6.1

Part 1: The Curse of Dimensionality & Our Dataset

As the number of features (dimensions) in a dataset grows, the data becomes increasingly sparse. This phenomenon, known as the "Curse of Dimensionality," makes it harder for machine learning algorithms to find patterns, as the distance between any two points becomes less meaningful. Dimensionality reduction aims to solve this by projecting the data onto a lower-dimensional subspace while retaining as much relevant information as possible.
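
To make the sparsity argument concrete, here is a small illustrative sketch (our addition, using synthetic uniform data rather than the lab dataset): as the number of dimensions grows, the gap between the nearest and farthest point shrinks relative to the average distance, so distance-based reasoning loses power.

# Illustrative sketch (toy data, not the digits dataset): distances concentrate
# as dimensionality grows, so "near" and "far" become harder to distinguish.
import numpy as np

rng = np.random.default_rng(0)
for d in [2, 10, 100, 1000]:
    points = rng.uniform(size=(500, d))                      # 500 random points in [0, 1]^d
    dists = np.linalg.norm(points[1:] - points[0], axis=1)   # distances from the first point
    contrast = (dists.max() - dists.min()) / dists.mean()
    print(f"d = {d:4d} | relative spread of distances: {contrast:.3f}")

As d increases, the printed spread shrinks, which is the quantitative face of the Curse of Dimensionality.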

Dataset: Handwritten Digits

We will use the scikit-learn Digits dataset. Each data point is an 8x8 pixel image of a handwritten digit, flattened into a 64-dimensional vector. Our goal is to see if we can simplify these 64 features and visualize the natural groupings of the 10 different digits (0-9).

In [2]:
print("--- Part 1: Dataset Loading and Inspection ---")

# 1. Load the Digits dataset
digits = load_digits()
X = digits.data
y = digits.target

print(f"Data shape (X): {X.shape}")
print(f"Target shape (y): {y.shape}")
print(f"Number of features (dimensions): {X.shape[1]}")
print(f"Number of unique classes (digits): {len(np.unique(y))}")

# 2. Visualize a few sample digits to understand the data
fig, axes = plt.subplots(2, 5, figsize=(12, 5), subplot_kw={'xticks':[], 'yticks':[]})
for i, ax in enumerate(axes.flat):
    ax.imshow(X[i].reshape(8, 8), cmap='binary')
    ax.set_title(f"Label: {y[i]}")
plt.suptitle("Sample Handwritten Digits")
plt.show()
--- Part 1: Dataset Loading and Inspection ---
Data shape (X): (1797, 64)
Target shape (y): (1797,)
Number of features (dimensions): 64
Number of unique classes (digits): 10
[Figure: Sample Handwritten Digits, a 2x5 grid of 8x8 grayscale digit images with their labels]

Part 2: Principal Component Analysis (PCA) for Dimensionality Reduction

PCA is a linear technique that transforms the data into a new coordinate system. The new axes, called principal components, are orthogonal and are ordered by the amount of variance they capture from the original data. The first principal component is the direction of highest variance, the second is the next highest (orthogonal to the first), and so on.

Key Idea: We can often describe the vast majority of the data's structure (variance) using just a few principal components, allowing us to discard the rest with minimal information loss.

Important: PCA is highly sensitive to the scale of the features, so we must standardize the data first.

Principal Component Analysis (PCA) is a linear transformation technique that reduces dimensionality while preserving as much variance (information) as possible.
It identifies principal components (PCs), new orthogonal axes that capture the directions of maximum variance in the data.

1. Data Preprocessing (Standardization)

Since PCA is sensitive to scale, we standardize each feature:

$$ x'_{ij} = \frac{x_{ij} - \mu_j}{\sigma_j} $$

  • $x_{ij}$: value of feature $j$ for observation $i$
  • $\mu_j$: mean of feature $j$
  • $\sigma_j$: standard deviation of feature $j$

Each feature then has zero mean and unit variance.
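
As a quick check (our addition, reusing the digits matrix X from Part 1 and the imports from the setup cell), the formula above reproduces StandardScaler's output. Constant features, whose standard deviation is zero, are left unscaled, which matches StandardScaler's own behavior.

# Sketch: manual standardization versus StandardScaler (assumes X from Part 1).
mu = X.mean(axis=0)
sigma = X.std(axis=0)
sigma_safe = np.where(sigma == 0, 1.0, sigma)   # constant pixels: avoid division by zero
X_manual = (X - mu) / sigma_safe

X_check = StandardScaler().fit_transform(X)
print("Manual standardization matches StandardScaler:", np.allclose(X_manual, X_check))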

2. Covariance Matrix

For standardized data matrix $X$ ($n \times d$, with $n$ samples and $d$ features):

$$ \Sigma = \frac{1}{n-1} X^\top X $$

Each entry:

$$ \Sigma_{jk} = \frac{1}{n-1} \sum_{i=1}^n (x'_{ij})(x'_{ik}) $$
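
A brief numerical check (our addition, again assuming X from Part 1): the formula above agrees with np.cov on the standardized data.

# Sketch: covariance matrix from the formula versus np.cov (columns = features).
Xs = StandardScaler().fit_transform(X)          # standardized copy of the digits data
n_samples = Xs.shape[0]

cov_manual = (Xs.T @ Xs) / (n_samples - 1)      # Sigma = (1 / (n - 1)) X^T X
cov_numpy = np.cov(Xs, rowvar=False)            # np.cov also divides by n - 1

print("Covariance matrices match:", np.allclose(cov_manual, cov_numpy))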

3. Eigen Decomposition

We solve the eigenvalue problem:

$$ \Sigma v = \lambda v $$

  • $v$: eigenvector (direction of a PC)
  • $\lambda$: eigenvalue (variance explained by that PC)

Sort eigenvalues:

$$ \lambda_1 \geq \lambda_2 \geq \cdots \geq \lambda_d $$
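
In code, this step might look like the sketch below (our addition; scikit-learn's PCA actually uses an SVD internally rather than an explicit eigen decomposition). np.linalg.eigh is appropriate because the covariance matrix is symmetric; it returns eigenvalues in ascending order, so we re-sort them in descending order.

# Sketch: eigen decomposition of the covariance matrix, sorted by descending eigenvalue.
Xs = StandardScaler().fit_transform(X)
Sigma = np.cov(Xs, rowvar=False)

eigenvalues, eigenvectors = np.linalg.eigh(Sigma)   # eigh: for symmetric matrices, ascending order
order = np.argsort(eigenvalues)[::-1]               # indices for descending order
eigenvalues = eigenvalues[order]
eigenvectors = eigenvectors[:, order]               # column k holds the eigenvector v_k

print("Five largest eigenvalues (variance along each PC):", np.round(eigenvalues[:5], 2))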

4. Principal Components

  • First PC:

$$ \text{PC}_1 = X v_1 $$

  • k-th PC:

$$ \text{PC}_k = X v_k $$

  • Projection matrix with top $k$ eigenvectors:

$$ W_k = [ v_1, v_2, \dots, v_k ] $$

  • Reduced representation:

$$ Z = X W_k $$

5. Explained Variance Ratio

How much variance each PC captures:

$$ \text{Explained Variance Ratio for PC}_j = \frac{\lambda_j}{\sum_{k=1}^d \lambda_k} $$

Cumulative explained variance:

$$ \text{Cumulative Explained Variance}(k) = \frac{\sum_{j=1}^k \lambda_j}{\sum_{j=1}^d \lambda_j} $$

6. PCA Algorithm Steps

  1. Standardize the dataset.
  2. Compute covariance matrix $\Sigma$.
  3. Find eigenvalues and eigenvectors of $\Sigma$.
  4. Sort eigenvectors by descending eigenvalues.
  5. Select the top $k$ eigenvectors to form the projection matrix $W_k$.
  6. Transform data:

$$ Z = X W_k $$
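
Putting the six steps together, the following sketch (our addition) implements PCA with NumPy on the digits data and checks that the 2-component projection matches scikit-learn's PCA up to the sign of each component, which is arbitrary.

# Sketch: from-scratch PCA (steps 1-6) compared against sklearn.decomposition.PCA.
k = 2

# Steps 1-2: standardize and compute the covariance matrix
Xs = StandardScaler().fit_transform(X)
Sigma = np.cov(Xs, rowvar=False)

# Steps 3-4: eigen decomposition, sorted by descending eigenvalue
eigvals, eigvecs = np.linalg.eigh(Sigma)
order = np.argsort(eigvals)[::-1]
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# Steps 5-6: projection matrix W_k from the top-k eigenvectors, then Z = X W_k
W_k = eigvecs[:, :k]
Z_manual = Xs @ W_k

# Reference projection from scikit-learn on the same standardized data
Z_sklearn = PCA(n_components=k).fit_transform(Xs)

print("Projections agree up to sign:", np.allclose(np.abs(Z_manual), np.abs(Z_sklearn)))
print("Explained variance ratio of the top 2 PCs:", np.round(eigvals[:k] / eigvals.sum(), 3))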

In [3]:
print("\n--- Part 2: Principal Component Analysis (PCA) ---")

# 1. Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print("Data has been standardized.")

# 2. Fit PCA to find the explained variance of all components
pca_full = PCA()
pca_full.fit(X_scaled)

# 3. Plot the cumulative explained variance
plt.figure(figsize=(10, 6))
plt.plot(np.cumsum(pca_full.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Explained Variance by Number of Principal Components')
plt.grid(True)
plt.axhline(y=0.95, color='r', linestyle='--', label='95% Explained Variance')
plt.legend()
plt.show()

# INSIGHT: Roughly 40 components are enough to capture 95% of the total variance in the original 64-feature dataset.

# 4. Perform PCA again, but this time to reduce dimensionality
# We'll choose n_components to capture 95% of the variance
pca_95 = PCA(n_components=0.95)
X_pca = pca_95.fit_transform(X_scaled)

print(f"\nOriginal number of features: {X_scaled.shape[1]}")
print(f"Reduced number of features (for 95% variance): {X_pca.shape[1]}")
--- Part 2: Principal Component Analysis (PCA) ---
Data has been standardized.
[Figure: cumulative explained variance versus number of principal components, reaching the 95% threshold at about 40 components]
Original number of features: 64
Reduced number of features (for 95% variance): 40

Part 3: t-SNE for High-Dimensional Visualization

t-Distributed Stochastic Neighbor Embedding (t-SNE) is a non-linear technique designed specifically for visualizing high-dimensional data in 2 or 3 dimensions.

Key Idea: Unlike PCA, which focuses on preserving global variance, t-SNE's goal is to preserve local structure. It models the similarity between high-dimensional points as a probability distribution and then optimizes a similar distribution for the corresponding low-dimensional points. In simple terms, it tries to keep points that are close in high dimensions close in the low-dimensional map.

The most important hyperparameter is perplexity, which can be thought of as a soft measure of the number of nearest neighbors each point considers.
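
To build intuition for what the perplexity number means, here is a small illustrative sketch (our addition, on toy data; it is not how scikit-learn implements t-SNE internally). For a single point we form a Gaussian similarity distribution over the other points and compute its perplexity as 2 raised to the distribution's entropy; t-SNE tunes each point's Gaussian bandwidth so that this quantity equals the perplexity you pass in, which is why it behaves like an effective neighbor count.

# Sketch: perplexity of one point's Gaussian neighbor distribution (toy data).
rng = np.random.default_rng(42)
X_demo = rng.normal(size=(200, 10))              # 200 toy points in 10 dimensions
i = 0                                            # reference point
sq_dists = np.sum((X_demo - X_demo[i]) ** 2, axis=1)
sq_dists[i] = np.inf                             # a point is not its own neighbor

sigma = 1.5                                      # bandwidth; t-SNE searches this per point
p = np.exp(-sq_dists / (2 * sigma ** 2))
p /= p.sum()                                     # conditional distribution p_{j|i}

entropy_bits = -np.sum(p[p > 0] * np.log2(p[p > 0]))
print(f"Perplexity of point {i}: {2 ** entropy_bits:.1f} (an effective neighbor count)")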

In [4]:
print("\n--- Part 3: t-SNE for Visualization ---")

# 1. Initialize and run t-SNE
# Perplexity is typically between 5 and 50. max_iter (named n_iter before scikit-learn 1.5) controls the number of optimization iterations.
tsne = TSNE(n_components=2, perplexity=30, max_iter=1000, random_state=42)
start_time = time.time()
X_tsne = tsne.fit_transform(X_scaled) # Apply t-SNE to the original scaled data
end_time = time.time()

print(f"t-SNE completed in {end_time - start_time:.2f} seconds.")
print(f"Shape of t-SNE embedding: {X_tsne.shape}")

# 2. Visualize the t-SNE embedding
plt.figure(figsize=(12, 10))
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y, palette=sns.color_palette("hsv", 10), legend="full")
plt.title('t-SNE Visualization of Digits Dataset')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend(title='Digit Label')
plt.show()

# INSIGHT: t-SNE produces clear, well-separated clusters for each of the 10 digits,
# revealing structure in the data that would be impossible to see directly in 64 dimensions.
--- Part 3: t-SNE for Visualization ---
t-SNE completed in 21.33 seconds.
Shape of t-SNE embedding: (1797, 2)
[Figure: 2D t-SNE embedding of the digits data, colored by digit label, showing 10 well-separated clusters]

Part 4: Comparing PCA and t-SNE Visualizations

It's crucial to understand that PCA and t-SNE are designed for different tasks. PCA's 2D plot is a projection onto the two axes of greatest variance. t-SNE's plot is an embedding optimized to preserve local neighborhoods.

Let's visualize the data using only the first two principal components and compare it directly to the t-SNE plot.

In [5]:
print("\n--- Part 4: Comparing PCA and t-SNE Visualizations ---")

# 1. Get the 2D projection from PCA
pca_2d = PCA(n_components=2)
X_pca_2d = pca_2d.fit_transform(X_scaled)

# 2. Create side-by-side plots for comparison
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 9))

# PCA Plot
sns.scatterplot(x=X_pca_2d[:, 0], y=X_pca_2d[:, 1], hue=y, palette=sns.color_palette("hsv", 10), legend="full", ax=ax1)
ax1.set_title('PCA 2D Visualization')
ax1.set_xlabel('Principal Component 1')
ax1.set_ylabel('Principal Component 2')

# t-SNE Plot
sns.scatterplot(x=X_tsne[:, 0], y=X_tsne[:, 1], hue=y, palette=sns.color_palette("hsv", 10), legend="full", ax=ax2)
ax2.set_title('t-SNE 2D Visualization')
ax2.set_xlabel('t-SNE Component 1')
ax2.set_ylabel('t-SNE Component 2')

plt.show()

# INSIGHT: The difference is clear. PCA shows some separation, but the clusters overlap heavily, because PCA
# preserves global variance rather than local neighborhood structure. t-SNE gives a much clearer view of the
# distinct groupings, which is exactly the task it is designed for.
--- Part 4: Comparing PCA and t-SNE Visualizations ---
[Figure: side-by-side comparison of the PCA 2D projection (left, overlapping clusters) and the t-SNE embedding (right, well-separated clusters)]

Lab Tasks & Exercises

Now, apply what you've learned. The following code cell contains three tasks to help you explore the concepts of PCA and t-SNE further.

In [7]:
# # --- TASK 1: PCA Variance Threshold ---
# # Re-run the PCA analysis to find the number of components needed to capture 80% of the variance.
# # How many components are needed? How does this compare to the 95% threshold?

# # YOUR CODE HERE
# pca_80 = PCA(n_components=0.80)
# X_pca_80 = pca_80.fit_transform(X_scaled)
# print(f"Components for 80% variance: {X_pca_80.shape[1]}")


# # --- TASK 2: t-SNE Perplexity ---
# # Re-run the t-SNE visualization twice, first with a very low perplexity (e.g., 5) and then with a
# # very high one (e.g., 50 or 100). Plot the results.
# # How does the visualization change? What does this tell you about the role of perplexity?

# # YOUR CODE HERE for perplexity=5
# tsne_5 = TSNE(n_components=2, perplexity=5, random_state=42)
# X_tsne_5 = tsne_5.fit_transform(X_scaled)
# plt.figure(figsize=(8,6))
# sns.scatterplot(x=X_tsne_5[:,0], y=X_tsne_5[:,1], hue=y, palette=sns.color_palette("hsv", 10)).set_title('Perplexity = 5')
# plt.show()

# # YOUR CODE HERE for perplexity=50
# tsne_50 = TSNE(n_components=2, perplexity=50, random_state=42)
# X_tsne_50 = tsne_50.fit_transform(X_scaled)
# plt.figure(figsize=(8,6))
# sns.scatterplot(x=X_tsne_50[:,0], y=X_tsne_50[:,1], hue=y, palette=sns.color_palette("hsv", 10)).set_title('Perplexity = 50')
# plt.show()


# # --- TASK 3: PCA for Model Performance ---
# # PCA is not just for visualization; it's a powerful preprocessing step.
# # 1. Train a Logistic Regression classifier on the ORIGINAL scaled data (X_scaled, y).
# # 2. Train another Logistic Regression classifier on the PCA-REDUCED data (X_pca, y).
# # 3. Compare their accuracy and training time. What do you observe?

# # Split the data
# X_train_orig, X_test_orig, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# X_train_pca, X_test_pca, _, _ = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# # YOUR CODE HERE for the original data
# log_reg_orig = LogisticRegression(max_iter=1000, random_state=42)
# start = time.time()
# log_reg_orig.fit(X_train_orig, y_train)
# end = time.time()
# y_pred_orig = log_reg_orig.predict(X_test_orig)
# acc_orig = accuracy_score(y_test, y_pred_orig)
# print(f"Original Data -> Accuracy: {acc_orig:.4f}, Time: {end-start:.4f}s")

# # YOUR CODE HERE for the PCA-reduced data
# log_reg_pca = LogisticRegression(max_iter=1000, random_state=42)
# start = time.time()
# log_reg_pca.fit(X_train_pca, y_train)
# end = time.time()
# y_pred_pca = log_reg_pca.predict(X_test_pca)
# acc_pca = accuracy_score(y_test, y_pred_pca)
# print(f"PCA-Reduced Data -> Accuracy: {acc_pca:.4f}, Time: {end-start:.4f}s")
Components for 80% variance: 21
[Figure: t-SNE embedding with perplexity = 5]
[Figure: t-SNE embedding with perplexity = 50]
Original Data -> Accuracy: 0.9722, Time: 0.7514s
PCA-Reduced Data -> Accuracy: 0.9611, Time: 0.5276s

Part 5: Advanced Topics & Discussion

  • Linear vs. Non-linear Methods: PCA is a linear method, meaning it can only capture linear relationships in the data. t-SNE is a non-linear (or manifold learning) method, allowing it to uncover complex, curved structures. For highly non-linear data, powerful alternatives include UMAP (Uniform Manifold Approximation and Projection), which is often faster than t-SNE and better at preserving global structure, and Kernel PCA.

  • PCA for Preprocessing vs. t-SNE for Visualization: This is the most critical takeaway. PCA is an excellent preprocessing step to reduce features before feeding them into a supervised model, as it can reduce noise, prevent overfitting, and speed up training (as seen in Task 3). t-SNE should not be used for this; it is almost exclusively a tool for exploratory data analysis and visualization.

  • Interpreting PCA Components: While PCA components are mathematically abstract, you can inspect the components_ attribute of the fitted PCA object. Each component is a vector where the values correspond to the original features. A large absolute value means that the original feature contributes strongly to that component, allowing for some level of interpretation (see the short sketch after this list).

  • Limitations of t-SNE: While powerful for visualization, t-SNE results must be interpreted with caution:

    • Cluster sizes are not meaningful: A cluster that looks larger on a t-SNE plot does not necessarily contain more points or have more variance.
    • Distances between clusters are not meaningful: You cannot conclude that two clusters that are far apart are "more different" than two clusters that are close together. t-SNE preserves local neighborhoods, not global distances.
    • It's a visualization tool, not a clustering algorithm: While it reveals clusters, you should use formal algorithms like K-Means or DBSCAN to assign points to clusters.
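
As a pointer for the "Interpreting PCA Components" bullet above, here is a small sketch (our addition, reusing the 2-component pca_2d object fitted in Part 4): each row of components_ holds one loading per original pixel, so reshaping it to 8x8 shows which pixels drive each principal component.

# Sketch: visualize the loadings of the first two principal components as 8x8 images.
fig, axes = plt.subplots(1, 2, figsize=(8, 4), subplot_kw={'xticks': [], 'yticks': []})
for idx, ax in enumerate(axes):
    ax.imshow(pca_2d.components_[idx].reshape(8, 8), cmap='coolwarm')
    ax.set_title(f"Loadings of PC{idx + 1}")
plt.suptitle("Pixel contributions to the first two principal components")
plt.show()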

Prepared By

Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd