Ensemble Learning: Boosting Model Performance with Bagging and Stacking¶
Keywords: Ensemble Learning, Wisdom of the Crowd, Bias-Variance Tradeoff, Bagging (Bootstrap Aggregating), Random Forest, Out-of-Bag (OOB) Error, Feature Importance, Stacking (Stacked Generalization), Base Learners, Meta-Learner.
Objectives:¶
- Students will understand the core principle of ensemble learning—that combining multiple models can yield superior performance over a single model.
- Students will grasp the intuition behind Bagging (Bootstrap Aggregating) as a technique to reduce model variance by training models on different random subsets of data.
- Students will implement, evaluate, and interpret a Random Forest, the most prominent and powerful example of a Bagging ensemble.
- Students will understand the hierarchical architecture of Stacking (Stacked Generalization), where a "meta-learner" is trained on the predictions of several diverse "base learners."
- Students will implement a Stacking classifier using scikit-learn.
- Students will empirically compare the performance of individual base models against the Bagging and Stacking ensembles on a real-world dataset to see the benefits of ensembling in action.
Setup: Install and Import Libraries¶
All necessary libraries are standard and available in Google Colab. We will use mlxtend for some optional visualizations if needed.
# Install mlxtend for optional visualizations
!pip install mlxtend -q
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# Scikit-learn for models, datasets, and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Import the models we will use
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"Scikit-learn Version: {sklearn.__version__}")
Scikit-learn Version: 1.6.1
Part 1: Dataset & Baseline Model¶
We will use the Heart Disease dataset from the UCI Machine Learning Repository. This is a real-world binary classification problem where the goal is to predict whether a patient has heart disease based on a number of medical attributes.
To appreciate the power of ensembles, we must first establish a baseline. We will train a single Decision Tree classifier, a model known for being powerful but prone to overfitting (high variance). Our goal is to see if our ensemble methods can improve upon this baseline.
print("--- Part 1: Dataset Loading and Baseline Model ---")
# 1. Load the dataset from a public URL
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
df = pd.read_csv(url, header=None, names=column_names, na_values='?')
# 2. Preprocess the data
df.dropna(inplace=True) # Drop rows with missing values for simplicity
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0) # Convert to binary target (1=disease, 0=no disease)
# 3. Separate features and target, then split data
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# 4. Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# 5. Train and evaluate the baseline Decision Tree model
baseline_dt = DecisionTreeClassifier(random_state=42)
baseline_dt.fit(X_train_scaled, y_train)
y_pred_base = baseline_dt.predict(X_test_scaled)
baseline_accuracy = accuracy_score(y_test, y_pred_base)
print(f"Baseline Single Decision Tree Accuracy: {baseline_accuracy:.4f}")
--- Part 1: Dataset Loading and Baseline Model ---
Baseline Single Decision Tree Accuracy: 0.6833
Part 2: Bagging with Random Forest¶
Bagging (Bootstrap Aggregating) is an ensemble technique designed to reduce the variance of a model. It works by:
- Bootstrap: Creating many random samples of the training data with replacement. Each sample is the same size as the original training set.
- Aggregate: Training a separate model (e.g., a Decision Tree) on each bootstrap sample. The final prediction is made by taking a majority vote of all the individual models.
The Random Forest is a powerful extension of Bagging. In addition to training each tree on a different sample of the data, it also uses only a random subset of features to decide each split within each tree. This "de-correlates" the trees, making the ensemble more robust and often more accurate.
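To make the two-step recipe concrete, here is a minimal "by hand" sketch of Bagging. It is for illustration only and assumes the X_train_scaled, y_train, X_test_scaled, and y_test objects from Part 1 are in scope; scikit-learn's BaggingClassifier and RandomForestClassifier implement the same idea far more efficiently.
# Minimal hand-rolled Bagging sketch (illustration only; assumes Part 1 data objects are in scope)
rng = np.random.RandomState(42)
n_trees = 25
n_samples = len(X_train_scaled)
all_preds = []
for _ in range(n_trees):
    # Bootstrap: draw row indices with replacement, same size as the training set
    idx = rng.choice(n_samples, size=n_samples, replace=True)
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train_scaled[idx], y_train.iloc[idx])
    all_preds.append(tree.predict(X_test_scaled))
# Aggregate: majority vote across the trees (labels are 0/1, so the vote is mean >= 0.5)
votes = np.array(all_preds)
majority_vote = (votes.mean(axis=0) >= 0.5).astype(int)
print(f"Hand-rolled Bagging Accuracy: {accuracy_score(y_test, majority_vote):.4f}")
Because every tree sees a different bootstrap sample, their individual errors partially cancel when the votes are aggregated, which is exactly the variance-reduction effect Bagging is designed for.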
print("\n--- Part 2: Bagging with Random Forest ---")
# 1. Initialize and train a RandomForestClassifier
# n_estimators is the number of trees in the forest.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, oob_score=True)
rf_clf.fit(X_train_scaled, y_train)
# 2. Evaluate the model
y_pred_rf = rf_clf.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Test Accuracy: {rf_accuracy:.4f}")
print(f"Random Forest OOB Score: {rf_clf.oob_score_:.4f}")
# INSIGHT: The Random Forest provides a significant accuracy boost over the single Decision Tree,
# demonstrating the power of bagging for reducing variance and improving generalization. The OOB score
# gives us a good estimate of the test accuracy without using the test set.
# 3. Visualize Feature Importances
importances = rf_clf.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances}).sort_values(by='importance', ascending=False)
plt.figure(figsize=(12, 8))
sns.barplot(x='importance', y='feature', data=feature_importance_df.head(10), palette='rocket')
plt.title('Top 10 Feature Importances from Random Forest')
plt.show()
--- Part 2: Bagging with Random Forest ---
Random Forest Test Accuracy: 0.8667
Random Forest OOB Score: 0.8059
Part 3: Stacking (Stacked Generalization)¶
Stacking is a more advanced ensemble technique that combines heterogeneous models. It learns how to best combine the predictions of multiple models by using their outputs as inputs for a final "meta-learner."
The architecture is typically in two levels:
- Level-0 (Base Learners): A set of diverse models (e.g., Logistic Regression, KNN, SVM) are trained on the original training data.
- Level-1 (Meta-Learner): Another model (often a simple one like Logistic Regression) is trained, but its features are the predictions made by the base learners from Level-0.
The goal is to leverage the unique strengths of each base model, allowing the meta-learner to make a more sophisticated final prediction.
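To see what StackingClassifier does under the hood, here is a minimal manual sketch of the two-level architecture. It is an illustration only: it reuses the Part 1 data objects, builds the meta-features with cross_val_predict, and feeds hard class predictions to the meta-learner, whereas the real StackingClassifier typically uses predicted probabilities when a base learner provides them.
# Manual stacking sketch (illustration only; assumes Part 1 data objects are in scope)
from sklearn.model_selection import cross_val_predict

level0 = [KNeighborsClassifier(n_neighbors=5),
          SVC(kernel='rbf', random_state=42),
          DecisionTreeClassifier(max_depth=5, random_state=42)]
# Level-0: out-of-fold predictions on the training set become the meta-features
meta_train = np.column_stack([cross_val_predict(m, X_train_scaled, y_train, cv=5)
                              for m in level0])
# Refit each base learner on the full training set to produce test-set meta-features
meta_test = np.column_stack([m.fit(X_train_scaled, y_train).predict(X_test_scaled)
                             for m in level0])
# Level-1: the meta-learner is trained on the base learners' predictions
meta = LogisticRegression(random_state=42).fit(meta_train, y_train)
print(f"Manual Stacking Accuracy: {accuracy_score(y_test, meta.predict(meta_test)):.4f}")
The out-of-fold predictions are essential: if the meta-learner were trained on predictions the base models made for data they had already seen, it would learn from overly optimistic inputs and overfit.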
print("\n--- Part 3: Stacking Classifier ---")
# 1. Define the base learners (Level-0 models)
base_learners = [
('knn', KNeighborsClassifier(n_neighbors=5)),
('svm', SVC(kernel='rbf', probability=True, random_state=42)),
('dt', DecisionTreeClassifier(max_depth=5, random_state=42))
]
# 2. Define the meta-learner (Level-1 model)
meta_learner = LogisticRegression(random_state=42)
# 3. Initialize and train the StackingClassifier
stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)
stacking_clf.fit(X_train_scaled, y_train)
# 4. Evaluate the model
y_pred_stack = stacking_clf.predict(X_test_scaled)
stacking_accuracy = accuracy_score(y_test, y_pred_stack)
print(f"Stacking Classifier Test Accuracy: {stacking_accuracy:.4f}")
--- Part 3: Stacking Classifier ---
Stacking Classifier Test Accuracy: 0.8333
Part 4: Performance Summary & Discussion¶
Let's compare the performance of all the models we've trained.
| Model | Technique | Test Accuracy |
|---|---|---|
| Single Decision Tree | Baseline | 0.6833 |
| Random Forest | Bagging | 0.8667 |
| Stacking Classifier | Stacking | 0.8333 |
Discussion: Both ensemble methods significantly outperformed the single Decision Tree baseline.
- Random Forest (Bagging) achieved the highest accuracy because it averaged out the errors of many high-variance decision trees, creating a more stable and generalizable model.
- Stacking worked well because it intelligently combined the predictions of three different types of models (distance-based, margin-based, and rule-based), leveraging the unique perspective of each.
This confirms that combining diverse learners can yield results markedly better than any single model achieves on its own.
Lab Tasks & Exercises¶
Now, apply what you've learned to explore these ensemble methods further.
# --- TASK 1: Random Forest Hyperparameters ---
# The number of trees (`n_estimators`) is a key parameter. Re-train the Random Forest with a much
# smaller number of trees (e.g., n_estimators=10) and a much larger number (e.g., n_estimators=500).
# How does the accuracy change? What do you notice about the trend?
# YOUR CODE HERE
# rf_10 = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train_scaled, y_train)
# acc_10 = accuracy_score(y_test, rf_10.predict(X_test_scaled))
#
# rf_500 = RandomForestClassifier(n_estimators=500, random_state=42).fit(X_train_scaled, y_train)
# acc_500 = accuracy_score(y_test, rf_500.predict(X_test_scaled))
#
# print("--- Task 1: Random Forest n_estimators ---")
# print(f"Accuracy with 10 trees: {acc_10:.4f}")
# print(f"Accuracy with 100 trees (original): {rf_accuracy:.4f}")
# print(f"Accuracy with 500 trees: {acc_500:.4f}")
# print("\nObservation: Performance generally increases with more trees, but with diminishing returns.")
# --- TASK 2: Customizing the Stacking Ensemble ---
# Modify the `StackingClassifier` from Part 3. Replace one of the base learners (e.g., KNN) with a
# different model, like `GaussianNB`.
# Does this change the ensemble's performance? This highlights the "mix-and-match" nature of Stacking.
# YOUR CODE HERE
# new_base_learners = [
# ('gnb', GaussianNB()),
# ('svm', SVC(kernel='rbf', probability=True, random_state=42)),
# ('dt', DecisionTreeClassifier(max_depth=5, random_state=42))
# ]
#
# new_stacking_clf = StackingClassifier(estimators=new_base_learners, final_estimator=meta_learner, cv=5)
# new_stacking_clf.fit(X_train_scaled, y_train)
# new_stack_acc = accuracy_score(y_test, new_stacking_clf.predict(X_test_scaled))
#
# print("\n--- Task 2: Custom Stacking Ensemble ---")
# print(f"Original Stacking Accuracy: {stacking_accuracy:.4f}")
# print(f"New Stacking Accuracy (with GNB): {new_stack_acc:.4f}")
# --- TASK 3: A Preview of Boosting ---
# Boosting is another major type of ensembling where models are trained sequentially, with each new
# model focusing on the errors of the previous one. `GradientBoostingClassifier` is a popular example.
# Train a Gradient Boosting model on the same data and compare its accuracy to the others.
# YOUR CODE HERE
# from sklearn.ensemble import GradientBoostingClassifier
#
# gb_clf = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)
# acc_gb = accuracy_score(y_test, gb_clf.predict(X_test_scaled))
# print(f"\n--- Task 3: Gradient Boosting ---")
# print(f"Gradient Boosting Test Accuracy: {acc_gb:.4f}")
Part 5: Advanced Topics & Discussion¶
The Bias-Variance Tradeoff: This is a fundamental concept in machine learning. Ensembles are a powerful tool for managing this tradeoff.
- Bagging: Primarily a variance-reduction technique. It takes a high-variance model (like a deep decision tree) and makes it more stable by averaging, without significantly changing its bias (a short empirical check follows this list).
- Boosting (from Task 3): Primarily a bias-reduction technique. It combines many simple, high-bias models ("weak learners") sequentially to create a single, powerful, low-bias model.
- Stacking: Can reduce both bias and variance by learning the optimal way to combine models that may have different bias-variance profiles.
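The variance-reduction claim for Bagging can be checked empirically. The sketch below is an illustration that assumes the Part 1 data objects are in scope: it refits a single tree and a Random Forest on several bootstrap resamples of the training data and compares the spread of their test accuracies; the forest's scores should vary noticeably less.
# Illustrative check of variance reduction (assumes Part 1 data objects are in scope)
rng = np.random.RandomState(0)
n_samples = len(X_train_scaled)
tree_scores, forest_scores = [], []
for _ in range(10):
    # Perturb the training set with a bootstrap resample, then refit both models
    idx = rng.choice(n_samples, size=n_samples, replace=True)
    Xb, yb = X_train_scaled[idx], y_train.iloc[idx]
    tree = DecisionTreeClassifier(random_state=0).fit(Xb, yb)
    forest = RandomForestClassifier(n_estimators=100, random_state=0).fit(Xb, yb)
    tree_scores.append(accuracy_score(y_test, tree.predict(X_test_scaled)))
    forest_scores.append(accuracy_score(y_test, forest.predict(X_test_scaled)))
print(f"Single tree   : mean={np.mean(tree_scores):.3f}, std={np.std(tree_scores):.3f}")
print(f"Random forest : mean={np.mean(forest_scores):.3f}, std={np.std(forest_scores):.3f}")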
Out-of-Bag (OOB) Error: In Bagging, since each tree is trained on a bootstrap sample, roughly one-third of the original data is "out of the bag" (OOB) for that tree. We can use these OOB samples to get an unbiased performance estimate for the whole forest without needing a separate validation set. This is enabled by setting oob_score=True in RandomForestClassifier, as we did in Part 2. It is, in effect, a computationally "free" validation estimate (a quick numerical check appears below).
Ensemble Diversity: Ensembles work best when the base models are diverse, meaning they make different kinds of errors. Combining five models that all fail on the same data points won't help. This is why Stacking heterogeneous models (like KNN, SVM, and DT) is often so effective. Random Forest achieves diversity by training each tree on a different data sample and a different feature subset.
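As a quick numerical check of the "roughly one-third out-of-bag" claim above, the snippet below (an illustration assuming the Part 1 arrays are in scope) draws one bootstrap sample and counts how many training rows never appear in it; the fraction should be close to 1 - 1/e ≈ 0.368.
# Empirical check of the OOB fraction for a single bootstrap sample
rng = np.random.RandomState(0)
n_samples = len(X_train_scaled)
boot_idx = rng.choice(n_samples, size=n_samples, replace=True)
oob_fraction = 1 - len(np.unique(boot_idx)) / n_samples
print(f"Fraction of training rows left out of one bootstrap sample: {oob_fraction:.3f}")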
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd