Classification with Decision Trees and SVMs
Supervised Learning, Classification, Decision Trees (Entropy, Gini Impurity, Information Gain), Pruning, Overfitting, Support Vector Machines (SVM), Hyperplanes, Margins (Hard/Soft), Kernel Trick (Linear, RBF, Polynomial), C and Gamma Hyperparameters, Decision Boundaries.
Objectives:
- Students will understand the core principles of classification in supervised learning.
- Students will grasp the intuition behind Decision Trees as a series of hierarchical rules and understand the concepts of splitting criteria (Gini/Entropy) and the need for pruning to prevent overfitting.
- Students will understand the goal of Support Vector Machines (SVMs) to find an optimal separating hyperplane that maximizes the margin between classes.
- Students will learn the role of the kernel trick (specifically the RBF kernel) in enabling SVMs to solve non-linearly separable problems.
- Students will implement, visualize, and interpret Decision Tree and SVM classifiers using scikit-learn.
- Students will understand the critical role of hyperparameters, such as max_depth for Decision Trees and C and gamma for SVMs, in controlling model complexity.
- Students will critically compare the decision boundaries and performance of both models on the same dataset.
Setup: Install and Import Libraries
For this lab, we will use mlxtend to visualize the decision boundaries of our classifiers, which is essential for building intuition.
# Install mlxtend for plotting decision boundaries
!pip install mlxtend -q
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import mlxtend
# Scikit-learn for models, datasets, and preprocessing
from sklearn.datasets import make_moons, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
# MLxtend for visualizing decision regions
from mlxtend.plotting import plot_decision_regions
# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"MLxtend Version: {mlxtend.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
MLxtend Version: 0.23.4
Scikit-learn Version: 1.6.1
Part 1: The Dataset - Non-Linearly Separable Data
We will start with a synthetic dataset that is impossible to separate with a straight line. The make_moons dataset is perfect for this, as it immediately highlights the need for more sophisticated models than simple linear classifiers. Our goal is to train models that can learn the curved "moon" shapes.
Tasks:
- Generate and visualize the make_moons dataset.
- Perform a standard train-test split and scale the features.
print("--- Part 1: Dataset Generation and Setup ---")
# 1. Generate the make_moons dataset
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)
# 2. Visualize the dataset
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='viridis')
plt.title('The Non-Linearly Separable Moons Dataset')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()
# 3. Split data and scale features
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print(f"Training data shape: {X_train_scaled.shape}")
print(f"Test data shape: {X_test_scaled.shape}")
--- Part 1: Dataset Generation and Setup ---
Training data shape: (350, 2)
Test data shape: (150, 2)
Part 2: The Decision Tree Classifier
A Decision Tree is an intuitive, flowchart-like model. It learns a series of if/else questions based on the feature values, splitting the data at each step to create groups that are as "pure" (homogenous in class) as possible.
A key danger with Decision Trees is overfitting. If left unconstrained, a tree will grow until it perfectly memorizes every single point in the training data, creating a complex, jagged decision boundary that fails to generalize to new data. We can prevent this by pruning the tree, most commonly by setting a max_depth.
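Before training, it helps to see the splitting criteria in action. The sketch below is illustrative only: the node class proportions are made up rather than taken from the moons data, and it simply computes Gini impurity, entropy, and information gain for a hypothetical binary split by hand.
# Hand-computed splitting criteria for a hypothetical binary split (illustrative proportions)
def gini(p):
    # Gini impurity of a binary node with positive-class proportion p
    return 1 - (p**2 + (1 - p)**2)

def entropy(p):
    # Entropy (in bits) of a binary node with positive-class proportion p
    if p in (0.0, 1.0):
        return 0.0
    return -(p * np.log2(p) + (1 - p) * np.log2(1 - p))

# Parent node: 50/50 class mix (maximally impure)
print(f"Parent -> Gini: {gini(0.5):.3f}, Entropy: {entropy(0.5):.3f}")

# Children after a candidate split: 60 samples at a 90/10 mix, 40 samples at a 20/80 mix
w_left, w_right = 0.6, 0.4
gini_after = w_left * gini(0.9) + w_right * gini(0.2)
info_gain = entropy(0.5) - (w_left * entropy(0.9) + w_right * entropy(0.2))
print(f"Weighted Gini after split: {gini_after:.3f} (lower means purer children)")
print(f"Information gain: {info_gain:.3f} (higher is better)")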
print("\n--- Part 2: Decision Trees - Overfitting vs. Pruning ---")
# 1. Train a fully grown (overfit) Decision Tree
# By not setting max_depth, the tree can grow as deep as it needs to.
dt_overfit = DecisionTreeClassifier(random_state=42)
dt_overfit.fit(X_train_scaled, y_train)
# 2. Train a pruned (regularized) Decision Tree
dt_pruned = DecisionTreeClassifier(max_depth=3, random_state=42)
dt_pruned.fit(X_train_scaled, y_train)
# 3. Evaluate both models
print(f"Overfit Tree - Train Accuracy: {accuracy_score(y_train, dt_overfit.predict(X_train_scaled)):.4f}")
print(f"Overfit Tree - Test Accuracy: {accuracy_score(y_test, dt_overfit.predict(X_test_scaled)):.4f}")
print(f"Pruned Tree (max_depth=3) - Train Accuracy: {accuracy_score(y_train, dt_pruned.predict(X_train_scaled)):.4f}")
print(f"Pruned Tree (max_depth=3) - Test Accuracy: {accuracy_score(y_test, dt_pruned.predict(X_test_scaled)):.4f}")
# 4. Visualize the decision boundaries
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 9))
plot_decision_regions(X_train_scaled, y_train, clf=dt_overfit, ax=ax1)
ax1.set_title('Overfit Decision Tree (Complex Boundary)')
ax1.set_xlabel('Feature 1 (Scaled)')
ax1.set_ylabel('Feature 2 (Scaled)')
plot_decision_regions(X_train_scaled, y_train, clf=dt_pruned, ax=ax2)
ax2.set_title('Pruned Decision Tree (max_depth=3)')
ax2.set_xlabel('Feature 1 (Scaled)')
ax2.set_ylabel('Feature 2 (Scaled)')
plt.show()
# INSIGHT: The overfit tree has a near-perfect training accuracy but lower test accuracy. Its boundary is highly complex.
# The pruned tree has a slightly lower training accuracy but generalizes better to the test set. Its boundary is much simpler.
--- Part 2: Decision Trees - Overfitting vs. Pruning ---
Overfit Tree - Train Accuracy: 1.0000
Overfit Tree - Test Accuracy: 0.8667
Pruned Tree (max_depth=3) - Train Accuracy: 0.9057
Pruned Tree (max_depth=3) - Test Accuracy: 0.8800
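Setting max_depth is a form of pre-pruning. scikit-learn's DecisionTreeClassifier also supports cost-complexity (post-)pruning via its ccp_alpha parameter; the sketch below reuses the scaled moons split from above and tries a few illustrative alpha values without tuning them.
# Optional: cost-complexity pruning via ccp_alpha (larger alpha prunes more aggressively)
# The alpha values here are illustrative, not tuned.
for alpha in [0.0, 0.005, 0.02]:
    dt_ccp = DecisionTreeClassifier(ccp_alpha=alpha, random_state=42).fit(X_train_scaled, y_train)
    train_acc = accuracy_score(y_train, dt_ccp.predict(X_train_scaled))
    test_acc = accuracy_score(y_test, dt_ccp.predict(X_test_scaled))
    print(f"ccp_alpha={alpha:<6} leaves={dt_ccp.get_n_leaves():>3}  train={train_acc:.4f}  test={test_acc:.4f}")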
# 5. Visualize the pruned Decision Tree structure
plt.figure(figsize=(12, 8))
plot_tree(dt_pruned, filled=True, feature_names=['Feature 1 (Scaled)', 'Feature 2 (Scaled)'], class_names=['0', '1'], rounded=True)
plt.title('Structure of the Pruned Decision Tree (max_depth=3)')
plt.show()
# INSIGHT: This plot shows the sequence of splits the pruned tree makes based on the scaled features.
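As a complement to the plot, the same rules can be printed as plain text with sklearn.tree's export_text; a minimal sketch using the pruned tree from above:
# Print the pruned tree's if/else rules as text (complements the plot above)
from sklearn.tree import export_text
print(export_text(dt_pruned, feature_names=['Feature 1 (Scaled)', 'Feature 2 (Scaled)']))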
Part 3: The Support Vector Machine (SVM) Classifier
The Support Vector Machine (SVM) takes a different approach. For a classification task, it seeks to find the hyperplane (a line in 2D, a plane in 3D, etc.) that best separates the classes by maximizing the margin, or the distance between the hyperplane and the nearest data points of any class.
For non-linear data like our moons, SVMs use the kernel trick. The Radial Basis Function (RBF) kernel is a popular choice that can create complex, non-linear decision boundaries by effectively mapping the data to a higher-dimensional space.
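The RBF kernel itself is just a similarity measure, K(x, z) = exp(-gamma * ||x - z||^2): nearby points score close to 1, distant points close to 0. The sketch below evaluates it by hand for two training points and checks the result against sklearn's rbf_kernel (the gamma value is arbitrary, chosen only for illustration).
# The RBF kernel as a similarity measure between two (arbitrary) training points
from sklearn.metrics.pairwise import rbf_kernel

x1, x2 = X_train_scaled[0], X_train_scaled[1]
gamma = 1.0  # arbitrary value, for illustration only

manual = np.exp(-gamma * np.sum((x1 - x2) ** 2))
library = rbf_kernel(x1.reshape(1, -1), x2.reshape(1, -1), gamma=gamma)[0, 0]
print(f"Manual RBF kernel value:  {manual:.6f}")
print(f"sklearn rbf_kernel value: {library:.6f}")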
Key Hyperparameters for RBF SVM:
- C (regularization parameter): Controls the trade-off between maximizing the margin and minimizing classification errors. A low C creates a wider margin but may misclassify some training points (more tolerant, smoother boundary). A high C creates a narrower margin and tries to classify every training point correctly, which can lead to overfitting.
- gamma: Defines the influence of a single training example. A low gamma means each point has a far-reaching influence, leading to a smoother, more general boundary. A high gamma means each point has a very localized influence, leading to a more complex, wiggly boundary that can overfit.
print("\n--- Part 3: Support Vector Machines with RBF Kernel ---")
# 1. Train an SVM with default parameters (C=1.0, gamma='scale')
svm_rbf = SVC(kernel='rbf', random_state=42)
svm_rbf.fit(X_train_scaled, y_train)
print(f"RBF SVM - Test Accuracy: {accuracy_score(y_test, svm_rbf.predict(X_test_scaled)):.4f}")
# 2. Visualize the decision boundary of the SVM
plt.figure(figsize=(10, 8))
plot_decision_regions(X_train_scaled, y_train, clf=svm_rbf)
plt.title('SVM with RBF Kernel Decision Boundary')
plt.xlabel('Feature 1 (Scaled)')
plt.ylabel('Feature 2 (Scaled)')
plt.show()
# INSIGHT: The SVM with the RBF kernel creates a smooth, non-linear boundary that effectively separates the two moons.
# 3. Visualize the effect of C and gamma
fig, axes = plt.subplots(2, 2, figsize=(18, 14))
# Low C, Low gamma -> Smooth, general boundary
svm_1 = SVC(kernel='rbf', C=0.1, gamma=0.1, random_state=42).fit(X_train_scaled, y_train)
plot_decision_regions(X_train_scaled, y_train, clf=svm_1, ax=axes[0, 0])
axes[0, 0].set_title('Low C (0.1), Low gamma (0.1)')
# High C, Low gamma -> Tries harder to fit, but still smooth
svm_2 = SVC(kernel='rbf', C=100, gamma=0.1, random_state=42).fit(X_train_scaled, y_train)
plot_decision_regions(X_train_scaled, y_train, clf=svm_2, ax=axes[0, 1])
axes[0, 1].set_title('High C (100), Low gamma (0.1)')
# Low C, High gamma -> Localized influence, wiggly boundary
svm_3 = SVC(kernel='rbf', C=0.1, gamma=10, random_state=42).fit(X_train_scaled, y_train)
plot_decision_regions(X_train_scaled, y_train, clf=svm_3, ax=axes[1, 0])
axes[1, 0].set_title('Low C (0.1), High gamma (10)')
# High C, High gamma -> Highly complex, overfit boundary
svm_4 = SVC(kernel='rbf', C=100, gamma=10, random_state=42).fit(X_train_scaled, y_train)
plot_decision_regions(X_train_scaled, y_train, clf=svm_4, ax=axes[1, 1])
axes[1, 1].set_title('High C (100), High gamma (10)')
plt.tight_layout()
plt.show()
--- Part 3: Support Vector Machines with RBF Kernel ---
RBF SVM - Test Accuracy: 0.9133
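Rather than eyeballing the four panels above, C and gamma can also be tuned systematically. A minimal sketch with GridSearchCV (the grid values are illustrative, not exhaustive):
# Systematic search over C and gamma with 5-fold cross-validation on the training set
from sklearn.model_selection import GridSearchCV

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10]}
grid = GridSearchCV(SVC(kernel='rbf', random_state=42), param_grid, cv=5)
grid.fit(X_train_scaled, y_train)
print(f"Best parameters: {grid.best_params_}")
print(f"Best cross-validation accuracy: {grid.best_score_:.4f}")
print(f"Test accuracy of best model: {grid.best_estimator_.score(X_test_scaled, y_test):.4f}")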
Part 4: Model Comparison & Discussion
| Feature | Decision Tree | Support Vector Machine (SVM) |
|---|---|---|
| Model Type | Rule-based ("White Box") | Margin-based ("Black Box") |
| Interpretability | High. The decision path is easy to understand and visualize. | Low. The decision function, especially with kernels, is not intuitive. |
| Decision Boundary | Axis-parallel (rectilinear) splits, can be jagged. | Can be linear or smooth and non-linear (with kernels). |
| Key Strengths | Very fast, handles categorical and numerical data natively, insensitive to feature scaling. | Effective in high dimensions, memory efficient, models complex boundaries. |
| Key Weaknesses | Highly prone to overfitting, can be unstable (small data changes can drastically change the tree). | Can be slow on very large datasets, less interpretable, requires careful feature scaling and hyperparameter tuning. |
| When to Use | When interpretability is paramount; as a strong baseline; in ensembles like Random Forest. | For complex but not massive datasets; when high accuracy is the primary goal and interpretability is secondary. |
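To ground the table in numbers, a quick sketch comparing the pruned tree and the RBF SVM trained earlier on the same moons test set:
# Side-by-side test accuracy of the two models trained earlier on the moons data
for name, model in [('Pruned Decision Tree (max_depth=3)', dt_pruned),
                    ('RBF SVM (default C and gamma)', svm_rbf)]:
    acc = accuracy_score(y_test, model.predict(X_test_scaled))
    print(f"{name:<36} Test Accuracy: {acc:.4f}")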
Lab Tasks & Exercises
Now, apply these models to a real-world, high-dimensional dataset. The Breast Cancer Wisconsin dataset is a classic binary classification problem.
# --- TASK 1: Apply Models to a Real-World Dataset ---
# 1. Load the Breast Cancer dataset from scikit-learn.
# 2. Perform a train-test split and scale the data.
# 3. Train both a pruned Decision Tree (e.g., max_depth=4) and an RBF SVM (default parameters) on the data.
# 4. Compare their test set accuracies. Which one performs better out-of-the-box?
# YOUR CODE HERE
# # Load data
# cancer = load_breast_cancer()
# X_cancer, y_cancer = cancer.data, cancer.target
#
# # Split and scale
# X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X_cancer, y_cancer, test_size=0.3, random_state=42, stratify=y_cancer)
# scaler_c = StandardScaler()
# X_train_c_scaled = scaler_c.fit_transform(X_train_c)
# X_test_c_scaled = scaler_c.transform(X_test_c)
#
# # Train and evaluate Decision Tree
# dt_cancer = DecisionTreeClassifier(max_depth=4, random_state=42).fit(X_train_c_scaled, y_train_c)
# acc_dt_cancer = accuracy_score(y_test_c, dt_cancer.predict(X_test_c_scaled))
#
# # Train and evaluate SVM
# svm_cancer = SVC(kernel='rbf', random_state=42).fit(X_train_c_scaled, y_train_c)
# acc_svm_cancer = accuracy_score(y_test_c, svm_cancer.predict(X_test_c_scaled))
#
# print("--- Task 1: Breast Cancer Dataset Results ---")
# print(f"Pruned Decision Tree Test Accuracy: {acc_dt_cancer:.4f}")
# print(f"RBF SVM Test Accuracy: {acc_svm_cancer:.4f}")
# --- TASK 2: Hyperparameter Tuning for SVM ---
# Using the Breast Cancer data, try to improve the SVM's accuracy from Task 1 by experimenting with C and gamma.
# Try at least two different combinations. For example:
# - A high C and default gamma (e.g., C=100)
# - A default C and a low gamma (e.g., gamma=0.01)
# Does tuning help?
# YOUR CODE HERE
# svm_c1 = SVC(C=100, random_state=42).fit(X_train_c_scaled, y_train_c)
# acc_svm_c1 = accuracy_score(y_test_c, svm_c1.predict(X_test_c_scaled))
# print(f"\n--- Task 2: SVM Tuning ---")
# print(f"SVM (C=100) Test Accuracy: {acc_svm_c1:.4f}")
#
# svm_g1 = SVC(gamma=0.01, random_state=42).fit(X_train_c_scaled, y_train_c)
# acc_svm_g1 = accuracy_score(y_test_c, svm_g1.predict(X_test_c_scaled))
# print(f"SVM (gamma=0.01) Test Accuracy: {acc_svm_g1:.4f}")
# --- TASK 3: Changing the SVM Kernel ---
# Go back to the original `make_moons` dataset.
# Train an SVM but this time use a polynomial kernel: `SVC(kernel='poly', degree=3)`.
# Visualize its decision boundary using `plot_decision_regions`.
# How does the polynomial kernel's boundary look compared to the RBF kernel's boundary?
# YOUR CODE HERE
# svm_poly = SVC(kernel='poly', degree=3, random_state=42).fit(X_train_scaled, y_train)
# print(f"\n--- Task 3: Polynomial Kernel on Moons Dataset ---")
# print(f"Polynomial SVM Test Accuracy: {accuracy_score(y_test, svm_poly.predict(X_test_scaled)):.4f}")
# plt.figure(figsize=(10, 8))
# plot_decision_regions(X_train_scaled, y_train, clf=svm_poly)
# plt.title('SVM with Polynomial Kernel (degree=3)')
# plt.show()
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd