# Install necessary libraries (if not already present in Colab environment)
!pip install numpy pandas matplotlib seaborn scikit-learn
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-learn for datasets, preprocessing, KNN models, and metrics
from sklearn.datasets import make_classification, make_regression, load_iris, load_wine
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
mean_absolute_error, mean_squared_error, r2_score
)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Set a consistent plotting style
sns.set_theme(style="whitegrid")
Part 1: Understanding K-Nearest Neighbors (KNN) - The Basics
The K-Nearest Neighbors (KNN) algorithm is a simple, non-parametric, lazy learning method used for both classification and regression tasks.
1.1 Core Principle: KNN works by finding the 'K' closest (most similar) data points in the training set to a new, unseen data point. The prediction for the new point is then based on these 'K' neighbors.
- Non-parametric: It makes no explicit assumptions about the underlying data distribution.
- Lazy learning: It does not learn a model during the training phase. All "learning" occurs during the prediction phase when it calculates distances to all training points.
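To make the "lazy" behaviour concrete, here is a small optional timing sketch (illustrative only; exact numbers depend on the runtime). The names X_lazy, y_lazy and knn_lazy are used purely for this demo and are not part of the main lab flow.
# --- Optional sketch: fit() is cheap, predict() does the work ---
import time
X_lazy, y_lazy = make_classification(n_samples=20000, n_features=20, random_state=0)
knn_lazy = KNeighborsClassifier(n_neighbors=5)
t0 = time.perf_counter()
knn_lazy.fit(X_lazy, y_lazy)       # "training" mostly just stores the data (plus an index structure)
t1 = time.perf_counter()
knn_lazy.predict(X_lazy[:1000])    # the distance search against all stored points happens here
t2 = time.perf_counter()
print(f"fit():     {t1 - t0:.4f} s")
print(f"predict(): {t2 - t1:.4f} s for 1000 query points")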
1.2 Distance Metrics: To determine "closeness," KNN uses distance metrics. The most common ones are:
- Euclidean Distance: The straight-line distance between two points in Euclidean space. For two points $A(x_1, y_1)$ and $B(x_2, y_2)$, it's $\sqrt{(x_2-x_1)^2 + (y_2-y_1)^2}$. This is the default for scikit-learn's KNN.
- Manhattan Distance (City Block Distance): The sum of the absolute differences of their Cartesian coordinates. For two points $A(x_1, y_1)$ and $B(x_2, y_2)$, it's $|x_2-x_1| + |y_2-y_1|$.
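As a quick numeric check of the two metrics (a minimal NumPy sketch; the points a and b are arbitrary examples):
# --- Euclidean vs. Manhattan distance between two 2D points ---
a = np.array([1.0, 2.0])
b = np.array([4.0, 6.0])
euclidean = np.sqrt(np.sum((a - b) ** 2))   # sqrt((4-1)^2 + (6-2)^2) = 5.0
manhattan = np.sum(np.abs(a - b))           # |4-1| + |6-2| = 7.0
print(f"Euclidean distance: {euclidean:.2f}")
print(f"Manhattan distance: {manhattan:.2f}")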
1.3 The 'K' Parameter: The 'K' in KNN refers to the number of neighbors considered.
- Small K (e.g., K=1): The model is highly flexible and sensitive to noise/outliers in the training data, potentially leading to overfitting. Decision boundaries are very irregular.
- Large K: The model is more robust to noise but can lose fine-grained details in the data, potentially leading to underfitting. Decision boundaries are smoother.
- Choosing an optimal 'K' is crucial and often involves cross-validation.
1.4 KNN for Classification: For a new data point, KNN performs the following steps:
- Calculate the distance from the new point to all points in the training data.
- Identify the 'K' training points that are closest to the new point.
- Assign the new point to the class that is most frequent among its 'K' nearest neighbors (majority vote).
1.5 KNN for Regression: For a new data point, KNN performs steps 1 and 2 above (calculate distances and find the 'K' nearest neighbors). The predicted value for the new point is then the average (or median) of the target values of its 'K' nearest neighbors.
# --- Simple Visualization of KNN Classification ---
# Create a very simple 2D dataset
X_concept = np.array([[1, 2], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])
y_concept = np.array([0, 0, 1, 1, 0, 1]) # 0 for blue, 1 for red
# New data point to classify
new_point = np.array([[3, 4]])
plt.figure(figsize=(9, 7))
sns.scatterplot(x=X_concept[:, 0], y=X_concept[:, 1], hue=y_concept, palette=['blue', 'red'], s=150, legend='full', marker='o')
plt.scatter(new_point[:, 0], new_point[:, 1], color='green', marker='X', s=300, label='New Point', zorder=5)
# Calculate Euclidean distances to show the concept
distances = np.sqrt(np.sum((X_concept - new_point)**2, axis=1))
sorted_indices = np.argsort(distances)
print("\nDistances from New Point to each training point:")
for i, dist in enumerate(distances):
print(f" Point {X_concept[i]} (Class {y_concept[i]}): Distance = {dist:.2f}")
# Example for K=3
k_val = 3
nearest_k_indices = sorted_indices[:k_val]
nearest_k_points = X_concept[nearest_k_indices]
nearest_k_classes = y_concept[nearest_k_indices]
print(f"\nFor K = {k_val}:")
print(f" Nearest {k_val} points: {nearest_k_points.tolist()}")
print(f" Their classes: {nearest_k_classes.tolist()}")
from collections import Counter
class_counts = Counter(nearest_k_classes)
predicted_class = class_counts.most_common(1)[0][0]
print(f" Majority vote (predicted class): {predicted_class}")
Distances from New Point to each training point:
  Point [1. 2.] (Class 0): Distance = 2.83
  Point [1.5 1.8] (Class 0): Distance = 2.66
  Point [5. 8.] (Class 1): Distance = 4.47
  Point [8. 8.] (Class 1): Distance = 6.40
  Point [1.  0.6] (Class 0): Distance = 3.94
  Point [ 9. 11.] (Class 1): Distance = 9.22

For K = 3:
  Nearest 3 points: [[1.5, 1.8], [1.0, 2.0], [1.0, 0.6]]
  Their classes: [0, 0, 0]
  Majority vote (predicted class): 0
# Draw lines to nearest K points
for i in nearest_k_indices:
plt.plot([new_point[0, 0], X_concept[i, 0]], [new_point[0, 1], X_concept[i, 1]],
color='gray', linestyle='--', alpha=0.6, linewidth=1)
plt.title(f'KNN Conceptual Example (K={k_val}) - Predict Class {predicted_class}')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()
Discussion Point:
- Why is KNN called a "lazy learning" algorithm? How does this differ from, say, Linear Regression or Logistic Regression?
- Imagine a new data point perfectly in the middle of two classes, with K=2. What might be a problem? How could you solve it?
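One common way to soften tie situations like the one in the second question is to weight neighbors by the inverse of their distance, so closer neighbors count more; scikit-learn exposes this via weights='distance'. A minimal sketch on the toy data above (here both models agree because the two nearest neighbors share a class, but with a genuine tie the distance weighting breaks it):
# --- Sketch: distance-weighted voting as a tie-breaking strategy ---
knn_uniform = KNeighborsClassifier(n_neighbors=2, weights='uniform')    # plain majority vote
knn_weighted = KNeighborsClassifier(n_neighbors=2, weights='distance')  # closer neighbors get larger weight
knn_uniform.fit(X_concept, y_concept)
knn_weighted.fit(X_concept, y_concept)
print("Uniform vote prediction:      ", knn_uniform.predict(new_point)[0])
print("Distance-weighted prediction: ", knn_weighted.predict(new_point)[0])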
Part 2: KNN for Classification with Scikit-learn (Synthetic Data)
Let's use scikit-learn to implement KNeighborsClassifier on a synthetic dataset and visualize its decision boundaries.
Tasks:
- Create a 2D synthetic dataset with multiple classes.
- Visualize the data points.
- Train a KNeighborsClassifier model.
- Plot the decision boundaries learned by the model for different 'K' values.
- Make predictions.
# --- 2.1 Create a 2D synthetic dataset ---
X_clf, y_clf = make_classification(n_samples=200, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1,
random_state=42, n_classes=3) # 3 classes for better visualization
# Convert to DataFrame for easier plotting
df_clf = pd.DataFrame(X_clf, columns=['Feature_1', 'Feature_2'])
df_clf['Target'] = y_clf
# --- 2.2 Visualize the data points ---
plt.figure(figsize=(10, 8))
sns.scatterplot(data=df_clf, x='Feature_1', y='Feature_2', hue='Target', palette='viridis', s=80, alpha=0.8)
plt.title('Synthetic 2D Multi-Class Classification Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Class')
plt.show()
# --- 2.3 Train a KNeighborsClassifier model and plot decision boundaries ---
def plot_decision_boundary(X, y, model, title, ax):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
ax.contourf(xx, yy, Z, alpha=0.4, cmap=plt.cm.viridis)
sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette='viridis', s=50, alpha=0.8, ax=ax)
ax.set_title(title)
ax.set_xlabel('Feature 1')
ax.set_ylabel('Feature 2')
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
# K = 1
knn_k1 = KNeighborsClassifier(n_neighbors=1)
knn_k1.fit(X_clf, y_clf)
plot_decision_boundary(X_clf, y_clf, knn_k1, 'KNN Classification (K=1)', axes[0])
# K = 5
knn_k5 = KNeighborsClassifier(n_neighbors=5)
knn_k5.fit(X_clf, y_clf)
plot_decision_boundary(X_clf, y_clf, knn_k5, 'KNN Classification (K=5)', axes[1])
# K = 20
knn_k20 = KNeighborsClassifier(n_neighbors=20)
knn_k20.fit(X_clf, y_clf)
plot_decision_boundary(X_clf, y_clf, knn_k20, 'KNN Classification (K=20)', axes[2])
plt.tight_layout()
plt.show()
# --- 2.4 Make predictions ---
sample_new_point = np.array([[0.5, 0.5]])
predicted_class = knn_k5.predict(sample_new_point)[0]
print(f"\nNew sample point {sample_new_point[0]} is predicted to be in Class: {predicted_class} (with K=5)")
print(f"Probabilities (K=5): {knn_k5.predict_proba(sample_new_point)[0]}")
New sample point [0.5 0.5] is predicted to be in Class: 0 (with K=5)
Probabilities (K=5): [0.6 0.4 0. ]
Discussion Point:
- Compare the decision boundaries for K=1, K=5, and K=20. How does changing K affect the smoothness and complexity of the boundaries?
- What are the potential risks of using a very small K (e.g., K=1) in a noisy dataset?
Part 3: KNN for Classification (Real-world Data) & Feature Scaling
For distance-based algorithms like KNN, feature scaling is critically important. If features have vastly different scales (e.g., one feature ranges from 0-1000 and another from 0-1), the feature with the larger range will disproportionately influence the distance calculation, making the "closeness" metric biased.
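Before turning to the Wine data, here is a tiny illustrative calculation (the income/age numbers are made up) showing how an unscaled large-range feature dominates the Euclidean distance:
# --- Sketch: a large-range feature dominates the unscaled distance ---
# Two hypothetical samples: (income in dollars, age in years)
p = np.array([50000.0, 25.0])
q = np.array([51000.0, 60.0])
dist_raw = np.sqrt(np.sum((p - q) ** 2))
print(f"Unscaled Euclidean distance: {dist_raw:.2f}")  # ~1000.6, driven almost entirely by income
print(f"Income difference: {abs(p[0] - q[0]):.0f}, Age difference: {abs(p[1] - q[1]):.0f}")
# After standardization, both features contribute on a comparable scale --
# exactly what StandardScaler will do for the Wine features below.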
We'll use the Wine dataset, which describes wines by various chemical properties and classifies them into three types.
Tasks:
- Load the Wine dataset.
- Perform initial data exploration.
- Split data.
- Demonstrate KNN performance without scaling.
- Demonstrate KNN performance with StandardScaler.
- Compare evaluation metrics (accuracy, classification report) with and without scaling.
- Explain why feature scaling is critical for distance-based algorithms.
# --- 3.1 Load the Wine dataset ---
wine = load_wine()
X_wine = pd.DataFrame(wine.data, columns=wine.feature_names)
y_wine = pd.Series(wine.target)
print(f"\nDataset loaded. Number of features: {X_wine.shape[1]}")
print(f"Target classes: {wine.target_names}")
# --- 3.2 Initial data exploration ---
print("\n--- X_wine Head ---")
print(X_wine.head())
print("\n--- X_wine Info ---")
X_wine.info()
print("\n--- X_wine Describe ---")
print(X_wine.describe()) # Notice the wide range of scales!
print("\n--- Target (y_wine) Value Counts ---")
print(y_wine.value_counts())
# --- 3.3 Split data into training and testing sets ---
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_wine, y_wine, test_size=0.3, random_state=42, stratify=y_wine)
print(f"\nTraining set X shape: {X_train_raw.shape}, y shape: {y_train.shape}")
print(f"Testing set X shape: {X_test_raw.shape}, y shape: {y_test.shape}")
# --- 3.4 Demonstrate KNN performance *without* scaling ---
print("\n### KNN Classification WITHOUT Feature Scaling ###")
knn_no_scale = KNeighborsClassifier(n_neighbors=5) # Using K=5
knn_no_scale.fit(X_train_raw, y_train)
y_pred_no_scale = knn_no_scale.predict(X_test_raw)
print(f"Accuracy (No Scaling): {accuracy_score(y_test, y_pred_no_scale):.4f}")
print("\nClassification Report (No Scaling):")
print(classification_report(y_test, y_pred_no_scale, target_names=wine.target_names))
# --- 3.5 Demonstrate KNN performance *with* StandardScaler ---
print("\n### KNN Classification WITH Feature Scaling ###")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)
knn_scaled = KNeighborsClassifier(n_neighbors=5) # Using K=5
knn_scaled.fit(X_train_scaled, y_train)
y_pred_scaled = knn_scaled.predict(X_test_scaled)
print(f"Accuracy (With Scaling): {accuracy_score(y_test, y_pred_scaled):.4f}")
print("\nClassification Report (With Scaling):")
print(classification_report(y_test, y_pred_scaled, target_names=wine.target_names))
# --- 3.6 Compare evaluation metrics ---
print("\n--- Comparison ---")
print(f"Accuracy improved from {accuracy_score(y_test, y_pred_no_scale):.4f} to {accuracy_score(y_test, y_pred_scaled):.4f} after scaling.")
print("This clearly shows the positive impact of feature scaling on KNN's performance.")
Dataset loaded. Number of features: 13
Target classes: ['class_0' 'class_1' 'class_2']

--- X_wine Head ---
   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80
1    13.20        1.78  2.14               11.2      100.0           2.65
2    13.16        2.36  2.67               18.6      101.0           2.80
3    14.37        1.95  2.50               16.8      113.0           3.85
4    13.24        2.59  2.87               21.0      118.0           2.80

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04
1        2.76                  0.26             1.28             4.38  1.05
2        3.24                  0.30             2.81             5.68  1.03
3        3.49                  0.24             2.18             7.80  0.86
4        2.69                  0.39             1.82             4.32  1.04

   od280/od315_of_diluted_wines  proline
0                          3.92   1065.0
1                          3.40   1050.0
2                          3.17   1185.0
3                          3.45   1480.0
4                          2.93    735.0

--- X_wine Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype
---  ------                        --------------  -----
 0   alcohol                       178 non-null    float64
 1   malic_acid                    178 non-null    float64
 2   ash                           178 non-null    float64
 3   alcalinity_of_ash             178 non-null    float64
 4   magnesium                     178 non-null    float64
 5   total_phenols                 178 non-null    float64
 6   flavanoids                    178 non-null    float64
 7   nonflavanoid_phenols          178 non-null    float64
 8   proanthocyanins               178 non-null    float64
 9   color_intensity               178 non-null    float64
 10  hue                           178 non-null    float64
 11  od280/od315_of_diluted_wines  178 non-null    float64
 12  proline                       178 non-null    float64
dtypes: float64(13)
memory usage: 18.2 KB

--- X_wine Describe ---
          alcohol  malic_acid         ash  alcalinity_of_ash   magnesium  \
count  178.000000  178.000000  178.000000         178.000000  178.000000
mean    13.000618    2.336348    2.366517          19.494944   99.741573
std      0.811827    1.117146    0.274344           3.339564   14.282484
min     11.030000    0.740000    1.360000          10.600000   70.000000
25%     12.362500    1.602500    2.210000          17.200000   88.000000
50%     13.050000    1.865000    2.360000          19.500000   98.000000
75%     13.677500    3.082500    2.557500          21.500000  107.000000
max     14.830000    5.800000    3.230000          30.000000  162.000000

       total_phenols  flavanoids  nonflavanoid_phenols  proanthocyanins  \
count     178.000000  178.000000            178.000000       178.000000
mean        2.295112    2.029270              0.361854         1.590899
std         0.625851    0.998859              0.124453         0.572359
min         0.980000    0.340000              0.130000         0.410000
25%         1.742500    1.205000              0.270000         1.250000
50%         2.355000    2.135000              0.340000         1.555000
75%         2.800000    2.875000              0.437500         1.950000
max         3.880000    5.080000              0.660000         3.580000

       color_intensity         hue  od280/od315_of_diluted_wines      proline
count       178.000000  178.000000                    178.000000   178.000000
mean          5.058090    0.957449                      2.611685   746.893258
std           2.318286    0.228572                      0.709990   314.907474
min           1.280000    0.480000                      1.270000   278.000000
25%           3.220000    0.782500                      1.937500   500.500000
50%           4.690000    0.965000                      2.780000   673.500000
75%           6.200000    1.120000                      3.170000   985.000000
max          13.000000    1.710000                      4.000000  1680.000000

--- Target (y_wine) Value Counts ---
1    71
0    59
2    48
Name: count, dtype: int64

Training set X shape: (124, 13), y shape: (124,)
Testing set X shape: (54, 13), y shape: (54,)

### KNN Classification WITHOUT Feature Scaling ###
Accuracy (No Scaling): 0.7222

Classification Report (No Scaling):
              precision    recall  f1-score   support

     class_0       0.89      0.89      0.89        18
     class_1       0.78      0.67      0.72        21
     class_2       0.50      0.60      0.55        15

    accuracy                           0.72        54
   macro avg       0.72      0.72      0.72        54
weighted avg       0.74      0.72      0.73        54

### KNN Classification WITH Feature Scaling ###
Accuracy (With Scaling): 0.9444

Classification Report (With Scaling):
              precision    recall  f1-score   support

     class_0       1.00      1.00      1.00        18
     class_1       1.00      0.86      0.92        21
     class_2       0.83      1.00      0.91        15

    accuracy                           0.94        54
   macro avg       0.94      0.95      0.94        54
weighted avg       0.95      0.94      0.94        54

--- Comparison ---
Accuracy improved from 0.7222 to 0.9444 after scaling.
This clearly shows the positive impact of feature scaling on KNN's performance.
Discussion Point:
- Based on the X_wine.describe() output, identify features with vastly different scales. How might this have affected the KNN model without scaling?
- Explain in your own words why StandardScaler (or MinMaxScaler) is so important for KNN.
Part 4: Optimizing 'K' - Choosing the Best Number of Neighbors
Choosing an appropriate value for 'K' is crucial for KNN's performance. A common approach is to test a range of 'K' values and select the one that yields the best performance, typically using cross-validation to get a more robust estimate.
Tasks:
- Explain the trade-off: small K (noisy, overfitting) vs. large K (smoothed, underfitting).
- Demonstrate a simple method to find optimal 'K' using a loop and cross-validation on accuracy.
- Plot accuracy/error rate vs. K.
# --- Demonstrate finding optimal 'K' using cross-validation ---
k_values = list(range(1, 20, 2)) # Test odd K values from 1 to 19
cv_scores = []
for k in k_values:
knn = KNeighborsClassifier(n_neighbors=k)
# Perform 5-fold cross-validation and store the mean accuracy
scores = cross_val_score(knn, X_train_scaled, y_train, cv=5, scoring='accuracy')
cv_scores.append(scores.mean())
print(f"K={k}: Mean CV Accuracy = {scores.mean():.4f}")
# Find the optimal K
optimal_k_index = np.argmax(cv_scores)
optimal_k = k_values[optimal_k_index]
print(f"\nOptimal K found: {optimal_k} with mean CV accuracy of {cv_scores[optimal_k_index]:.4f}")
# --- Plot accuracy vs. K ---
plt.figure(figsize=(10, 6))
plt.plot(k_values, cv_scores, marker='o', linestyle='--', color='blue')
plt.title('KNN Cross-Validation Accuracy vs. K Value')
plt.xlabel('Number of Neighbors (K)')
plt.ylabel('Mean Cross-Validation Accuracy')
plt.xticks(k_values)
plt.grid(True, linestyle='--', alpha=0.6)
plt.axvline(x=optimal_k, color='red', linestyle=':', label=f'Optimal K = {optimal_k}')
plt.legend()
plt.show()
# --- Explain trade-offs ---
print("\nTrade-offs in choosing 'K':")
print("- Small K (e.g., K=1): Captures fine details, sensitive to noise/outliers (high variance, potential overfitting).")
print("- Large K: Smoother decision boundary, less sensitive to noise, may over-generalize (high bias, potential underfitting).")
print("- Generally, an odd K is preferred for binary classification to avoid ties.")
K=1: Mean CV Accuracy = 0.9513
K=3: Mean CV Accuracy = 0.9593
K=5: Mean CV Accuracy = 0.9673
K=7: Mean CV Accuracy = 0.9673
K=9: Mean CV Accuracy = 0.9513
K=11: Mean CV Accuracy = 0.9757
K=13: Mean CV Accuracy = 0.9673
K=15: Mean CV Accuracy = 0.9673
K=17: Mean CV Accuracy = 0.9673
K=19: Mean CV Accuracy = 0.9593

Optimal K found: 11 with mean CV accuracy of 0.9757

Trade-offs in choosing 'K':
- Small K (e.g., K=1): Captures fine details, sensitive to noise/outliers (high variance, potential overfitting).
- Large K: Smoother decision boundary, less sensitive to noise, may over-generalize (high bias, potential underfitting).
- Generally, an odd K is preferred for binary classification to avoid ties.
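The manual loop above can also be written with GridSearchCV, which runs the same cross-validated search over n_neighbors and keeps the best estimator. A minimal sketch reusing X_train_scaled and y_train from Part 3 (results should agree with the loop):
# --- Alternative: searching for K with GridSearchCV ---
from sklearn.model_selection import GridSearchCV
param_grid = {'n_neighbors': list(range(1, 20, 2))}
grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='accuracy')
grid.fit(X_train_scaled, y_train)
print(f"Best K: {grid.best_params_['n_neighbors']}")
print(f"Best mean CV accuracy: {grid.best_score_:.4f}")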
Discussion Point:
- Why is cross-validation (e.g., cross_val_score) used instead of just training on X_train_scaled and testing on X_test_scaled to find the optimal K?
- What does the plot of accuracy vs. K tell you about the model's behavior as K changes?
Part 5: KNN for Regression with Scikit-learn
KNN can also be used for regression tasks, where the output is a continuous numerical value. Instead of majority voting for classes, it takes the average (or median) of the target values of its K nearest neighbors.
Tasks:
- Create a simple regression dataset.
- Split data and scale.
- Train a KNeighborsRegressor model.
- Evaluate using regression metrics (MAE, MSE, RMSE, R-squared).
- Discuss the difference in prediction logic for regression (averaging).
# --- 5.1 Create a simple regression dataset ---
# Make a more complex, non-linear relationship to show KNN's flexibility
X_reg, y_reg = make_regression(n_samples=200, n_features=1, noise=20, random_state=42)
# Introduce a slight non-linearity
y_reg = y_reg + (X_reg[:,0]**2) * 5
# Ensure X_reg has shape (n_samples, 1) for scikit-learn estimators (it already does; the reshape is a safe no-op)
X_reg = X_reg.reshape(-1, 1)
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_reg[:, 0], y=y_reg, alpha=0.7)
plt.title('Synthetic Regression Data')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.show()
# --- 5.2 Split data and scale ---
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)
scaler_reg = StandardScaler()
X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_reg_scaled = scaler_reg.transform(X_test_reg)
# --- 5.3 Train a KNeighborsRegressor model ---
knn_reg = KNeighborsRegressor(n_neighbors=5) # Choose a K value
knn_reg.fit(X_train_reg_scaled, y_train_reg)
print("\nKNeighborsRegressor model trained successfully!")
# --- 5.4 Make predictions ---
y_pred_reg = knn_reg.predict(X_test_reg_scaled)
KNeighborsRegressor model trained successfully!
# --- 5.5 Evaluate using regression metrics ---
mae_reg = mean_absolute_error(y_test_reg, y_pred_reg)
mse_reg = mean_squared_error(y_test_reg, y_pred_reg)
rmse_reg = np.sqrt(mse_reg)
r2_reg = r2_score(y_test_reg, y_pred_reg)
print(f"\nMean Absolute Error (MAE): {mae_reg:.4f}")
print(f"Mean Squared Error (MSE): {mse_reg:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse_reg:.4f}")
print(f"R-squared (R2): {r2_reg:.4f}")
Mean Absolute Error (MAE): 21.2281
Mean Squared Error (MSE): 800.2847
Root Mean Squared Error (RMSE): 28.2893
R-squared (R2): 0.9135
# --- 5.6 Plot actual vs. predicted values ---
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_test_reg, y=y_pred_reg, alpha=0.6)
plt.plot([y_test_reg.min(), y_test_reg.max()], [y_test_reg.min(), y_test_reg.max()], 'r--', lw=2, label='Perfect Prediction Line')
plt.xlabel('Actual Target')
plt.ylabel('Predicted Target')
plt.title('KNN Regression: Actual vs. Predicted Values')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()
# Plot the KNN regression fit over the original (unscaled) feature for this single-feature model.
# Sort the test points by feature value so the prediction curve draws left to right.
# Predictions are made on the scaled feature, but the curve is plotted against the
# unscaled feature so it shares an axis with the scatter of actual data.
sorted_indices = np.argsort(X_test_reg.flatten())
X_test_reg_sorted = X_test_reg[sorted_indices]
y_pred_reg_sorted = knn_reg.predict(scaler_reg.transform(X_test_reg_sorted))
plt.figure(figsize=(10, 7))
sns.scatterplot(x=X_test_reg.flatten(), y=y_test_reg, alpha=0.7, label='Actual Data')
plt.plot(X_test_reg_sorted.flatten(), y_pred_reg_sorted, color='red', linestyle='-', linewidth=2, label='KNN Regression Line (K=5)')
plt.title('KNN Regression Line Fit')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()
Discussion Point:
- How does KNN make a prediction for a new data point in a regression problem?
- How might the choice of K affect the smoothness of the regression line?
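To answer the first question concretely, we can check that the model's prediction for a single test point equals the mean of its K neighbors' training targets (a quick sketch reusing knn_reg and the scaled data from above):
# --- Check: a KNN regression prediction is the average of the neighbors' targets ---
query = X_test_reg_scaled[:1]                            # one test point, shape (1, 1)
distances, neighbor_idx = knn_reg.kneighbors(query)      # indices of the 5 nearest training points
manual_prediction = y_train_reg[neighbor_idx[0]].mean()  # average their target values
model_prediction = knn_reg.predict(query)[0]
print(f"Mean of neighbor targets: {manual_prediction:.4f}")
print(f"knn_reg.predict():        {model_prediction:.4f}")  # identical with the default uniform weights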
Part 6: Advantages, Disadvantages, and Use Cases
KNN is a straightforward and intuitive algorithm, but it has specific characteristics that make it suitable for some tasks and less ideal for others.
Advantages:
- Simplicity and Interpretability: The algorithm is easy to understand and implement. Its decisions are relatively transparent: a data point is classified based on its closest neighbors.
- No Strong Assumptions: KNN is a non-parametric algorithm, meaning it does not make strong assumptions about the underlying data distribution (unlike linear or logistic regression, which assume linearity). This makes it flexible for various data types.
- Handles Non-linear Relationships: Because it's based on local neighborhoods, KNN can model complex, non-linear decision boundaries effectively.
- Can be Used for Both Classification and Regression: Its versatility makes it a good general-purpose tool.
- Effective for Low-Dimensional Data: Performs well when the number of features is small.
Disadvantages:
- Computationally Expensive (at prediction time): This is its biggest drawback. For every new prediction, KNN needs to calculate the distance to all training data points. This makes it very slow for large datasets, as the prediction time increases linearly with the number of training samples.
- Memory Intensive: It needs to store the entire training dataset in memory for prediction, which can be an issue for very large datasets.
- Curse of Dimensionality: As the number of features (dimensions) increases, the concept of "closeness" becomes less meaningful. Data points become sparse in high-dimensional space, and distances between points tend to become more uniform, making it harder to find truly "nearest" neighbors. This degrades performance significantly (a small numerical sketch follows this list).
- Sensitive to Irrelevant Features: If the dataset contains many irrelevant features, these features can disproportionately influence the distance calculation, even if they have no real predictive power. Feature selection or dimensionality reduction is often necessary.
- Sensitive to Feature Scale: As demonstrated in Part 3, features with larger ranges can dominate the distance calculation, making scaling absolutely essential.
- Performance with Imbalanced Data: If one class heavily outnumbers others, KNN might be biased towards predicting the majority class, even if the minority class is the one of interest. Techniques like weighted KNN (giving more weight to closer neighbors) or resampling can mitigate this.
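The curse-of-dimensionality point can be illustrated numerically: for random points, the gap between the nearest and farthest neighbor shrinks relative to the distances themselves as dimensionality grows. A small illustrative sketch with random data:
# --- Sketch: distance concentration as dimensionality grows ---
rng = np.random.default_rng(42)
for dim in [2, 10, 100, 1000]:
    points = rng.random((500, dim))   # 500 random points in the unit hypercube
    query = rng.random((1, dim))
    dists = np.sqrt(np.sum((points - query) ** 2, axis=1))
    relative_gap = (dists.max() - dists.min()) / dists.min()
    print(f"dim={dim:5d}: (max - min) / min distance = {relative_gap:.3f}")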
Common Use Cases:
- Recommendation Systems: Finding similar users or items (e.g., "users who liked this also liked..."); see the small sketch after this list.
- Image Recognition: Early applications in classifying images based on pixel similarity.
- Handwritten Digit Recognition: Classifying handwritten numbers.
- Anomaly Detection: Identifying unusual data points that are far from any cluster of known data.
- Medical Diagnosis: Classifying diseases based on patient symptoms and test results (with careful consideration of data balance).
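As a toy version of the recommendation use case (the ratings matrix below is made up purely for illustration), sklearn.neighbors.NearestNeighbors can return the most similar rows directly:
# --- Sketch: "users most similar to user 0" with NearestNeighbors ---
from sklearn.neighbors import NearestNeighbors
# Hypothetical user-item ratings (rows = users, columns = items)
ratings = np.array([
    [5, 4, 0, 1],
    [4, 5, 1, 0],
    [1, 0, 5, 4],
    [0, 1, 4, 5],
])
nn = NearestNeighbors(n_neighbors=2, metric='cosine')
nn.fit(ratings)
distances, indices = nn.kneighbors(ratings[:1])  # neighbors of user 0 (user 0 itself comes first)
print(f"Users most similar to user 0: {indices[0].tolist()}")
print(f"Cosine distances:             {np.round(distances[0], 3).tolist()}")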
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd