In [ ]:
# Install necessary libraries if not already present in Colab environment
!pip install pandas numpy scikit-learn matplotlib seaborn nltk
Requirement already satisfied: pandas in /usr/local/lib/python3.11/dist-packages (2.2.2)
Requirement already satisfied: numpy in /usr/local/lib/python3.11/dist-packages (2.0.2)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-packages (1.6.1)
Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-packages (3.10.0)
Requirement already satisfied: seaborn in /usr/local/lib/python3.11/dist-packages (0.13.2)
Requirement already satisfied: nltk in /usr/local/lib/python3.11/dist-packages (3.9.1)
Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.11/dist-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-packages (from pandas) (2025.2)
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.15.3)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (1.5.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn) (3.6.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.3.2)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (4.58.5)
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (1.4.8)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (24.2)
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (11.2.1)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dist-packages (from matplotlib) (3.2.3)
Requirement already satisfied: click in /usr/local/lib/python3.11/dist-packages (from nltk) (8.2.1)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.11/dist-packages (from nltk) (2024.11.6)
Requirement already satisfied: tqdm in /usr/local/lib/python3.11/dist-packages (from nltk) (4.67.1)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
In [ ]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn  # imported at the top level to report its version below

# Scikit-learn for Naive Bayes models, data splitting, preprocessing, and metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.datasets import load_iris, fetch_20newsgroups # For different types of data

# NLTK for text processing (if needed for more advanced text tasks)
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
nltk.download('stopwords') # Download stopwords for text processing

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print(f"Pandas Version: {pd.__version__}")
print(f"NumPy Version: {np.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2
NumPy Version: 2.0.2
Scikit-learn Version: 1.6.1
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

Part 1: Gaussian Naive Bayes on a Numerical Dataset¶

Gaussian Naive Bayes is suitable for continuous data, as it assumes that the continuous values associated with each class are distributed according to a Gaussian (normal) distribution.
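
Concretely, for each class $y$ and feature $x_i$, GaussianNB estimates a mean $\mu_{y,i}$ and variance $\sigma_{y,i}^2$ from the training data and evaluates the likelihood with the normal density:

$$ P(x_i \mid y) = \frac{1}{\sqrt{2 \pi \sigma_{y,i}^{2}}} \exp\!\left( -\frac{(x_i - \mu_{y,i})^{2}}{2 \sigma_{y,i}^{2}} \right) $$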

Tasks:

  • Load the Iris dataset (which has numerical features).
  • Split the data into training and testing sets.
  • Apply StandardScaler (optional, but good practice for many models, though GNB isn't strictly sensitive to scale).
  • Initialize and train a GaussianNB classifier.
  • Make predictions and evaluate its performance using accuracy, classification report, and confusion matrix.
In [ ]:
# 1. Load the Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
X_iris = pd.DataFrame(iris.data, columns=iris.feature_names)
y_iris = pd.Series(iris.target)
iris_target_names = iris.target_names

print(f"Iris dataset shape: {X_iris.shape}")
print(f"Iris features (first 5 rows):\n{X_iris.head()}")
print(f"Iris target classes: {iris_target_names}")

# 2. Split data into training and testing sets
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, test_size=0.3, random_state=42, stratify=y_iris
)

print(f"\nTraining data shape: {X_train_iris.shape}")
print(f"Testing data shape: {X_test_iris.shape}")
Iris dataset shape: (150, 4)
Iris features (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
Iris target classes: ['setosa' 'versicolor' 'virginica']

Training data shape: (105, 4)
Testing data shape: (45, 4)
In [ ]:
# 3. (Optional) Scale features
# Gaussian Naive Bayes is not sensitive to feature scaling: it estimates a separate mean and
# variance for each feature within each class, so linearly rescaling a feature does not change
# the predicted class. Scaling is applied here only for consistency with other models.
scaler_iris = StandardScaler()
X_train_iris_scaled = scaler_iris.fit_transform(X_train_iris)
X_test_iris_scaled = scaler_iris.transform(X_test_iris)

print("\nFeatures scaled.")

# 4. Initialize and train a Gaussian Naive Bayes classifier
gnb_model = GaussianNB()
gnb_model.fit(X_train_iris_scaled, y_train_iris)

print("\nGaussian Naive Bayes model trained.")
Features scaled.

Gaussian Naive Bayes model trained.
In [ ]:
# 5. Make predictions and evaluate
y_pred_gnb = gnb_model.predict(X_test_iris_scaled)
y_proba_gnb = gnb_model.predict_proba(X_test_iris_scaled)

print(f"\nAccuracy Score (GaussianNB): {accuracy_score(y_test_iris, y_pred_gnb):.4f}")
print("\nClassification Report (GaussianNB):")
print(classification_report(y_test_iris, y_pred_gnb, target_names=iris_target_names))

# Confusion Matrix
cm_gnb = confusion_matrix(y_test_iris, y_pred_gnb)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_gnb, annot=True, fmt='d', cmap='Blues',
            xticklabels=iris_target_names, yticklabels=iris_target_names)
plt.title('Confusion Matrix (Gaussian Naive Bayes)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
Accuracy Score (GaussianNB): 0.9111

Classification Report (GaussianNB):
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.82      0.93      0.88        15
   virginica       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45

[Output: confusion matrix heatmap for Gaussian Naive Bayes]
In [ ]:
# Examine learned parameters (mean and variance per class per feature)
print("\n--- Learned Parameters of GaussianNB ---")
print("Class Priors (P(y)):", gnb_model.class_prior_)
print("Class Means (Mean of features per class):\n", gnb_model.theta_)
print("Class Variances (Variance of features per class):\n", gnb_model.var_)
--- Learned Parameters of GaussianNB ---
Class Priors (P(y)): [0.33333333 0.33333333 0.33333333]
Class Means (Mean of features per class):
 [[-1.03020449  0.81419604 -1.29484676 -1.24795487]
 [ 0.08760619 -0.71163664  0.2547855   0.13291827]
 [ 0.94259829 -0.1025594   1.04006125  1.1150366 ]]
Class Variances (Variance of features per class):
 [[0.14005086 0.83996741 0.00727631 0.01545886]
 [0.32645331 0.41335985 0.06708186 0.05951797]
 [0.57600815 0.5668124  0.10237066 0.10665794]]

Discussion Points:

  • Gaussian Naive Bayes assumes features are normally distributed. How might violations of this assumption impact its performance?
  • Examine the class_prior_, theta_, and var_ attributes of the trained gnb_model. What information do these attributes store, and how are they used for prediction according to Bayes' Theorem?
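
For the second point, the cell below is a minimal sketch that recomputes the model's decision for one test sample directly from class_prior_, theta_, and var_ (reusing the scaled Iris split from above): the per-feature Gaussian log-densities are summed and added to the log prior, and the class with the largest score wins.

In [ ]:
# Reproduce GaussianNB's prediction for one test sample from its learned parameters.
x = X_test_iris_scaled[0]                                   # one scaled test sample

# log P(y) + sum_i log N(x_i; theta_[y, i], var_[y, i]) for every class y
log_prior = np.log(gnb_model.class_prior_)
log_likelihood = -0.5 * np.sum(
    np.log(2 * np.pi * gnb_model.var_) + (x - gnb_model.theta_) ** 2 / gnb_model.var_,
    axis=1,
)
class_scores = log_prior + log_likelihood

print("Manual class scores:", class_scores)
print("Manual prediction  :", iris_target_names[np.argmax(class_scores)])
print("Model prediction   :", iris_target_names[gnb_model.predict(x.reshape(1, -1))[0]])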

Part 2: Multinomial Naive Bayes on Text Data¶

Multinomial Naive Bayes is particularly well-suited for discrete data, especially in text classification where features represent counts (e.g., word frequencies). We'll use a subset of the 20 Newsgroups dataset.

Tasks:

  • Load a subset of the 20 Newsgroups dataset.
  • Preprocess the text data (e.g., convert to lowercase, remove stopwords, potentially stemming/lemmatization).
  • Convert text documents into numerical feature vectors using CountVectorizer.
  • Train a MultinomialNB classifier.
  • Evaluate its performance.
  • Discuss the concept of Laplace smoothing in MultinomialNB.
In [ ]:
# 1. Load a subset of the 20 Newsgroups dataset
# We'll choose a few categories to keep it manageable
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
newsgroups_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

X_text = newsgroups_data.data
y_text = newsgroups_data.target
text_target_names = newsgroups_data.target_names

print(f"Newsgroups dataset size: {len(X_text)} documents")
print(f"Newsgroups target classes: {text_target_names}")
print(f"\nExample document (first 200 chars):\n{X_text[0][:200]}...")
print(f"Corresponding class: {text_target_names[y_text[0]]}")
Newsgroups dataset size: 3759 documents
Newsgroups target classes: ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

Example document (first 200 chars):
From: geb@cs.pitt.edu (Gordon Banks)
Subject: Re: "CAN'T BREATHE"
Article-I.D.: pitt.19440
Reply-To: geb@cs.pitt.edu (Gordon Banks)
Organization: Univ. of Pittsburgh Computer Science
Lines: 23

In art...
Corresponding class: sci.med
In [ ]:
# 2. Preprocess text data
# A simple preprocessing function
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower() # Lowercase
    # Remove punctuation, numbers (optional for simple example)
    text = ''.join([char for char in text if char.isalpha() or char.isspace()])
    words = text.split()
    words = [word for word in words if word not in stop_words] # Remove stopwords
    words = [stemmer.stem(word) for word in words] # Stemming (optional)
    return ' '.join(words)

# Apply preprocessing to all documents
X_text_preprocessed = [preprocess_text(doc) for doc in X_text]
print(f"\nExample preprocessed document (first 200 chars):\n{X_text_preprocessed[0][:200]}...")

# 3. Convert text documents into numerical feature vectors using CountVectorizer
# CountVectorizer converts a collection of text documents to a matrix of token counts.
# This is ideal for Multinomial Naive Bayes.
vectorizer = CountVectorizer()
X_text_features = vectorizer.fit_transform(X_text_preprocessed)

print(f"\nText features matrix shape: {X_text_features.shape} (documents x vocabulary size)")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
Example preprocessed document (first 200 chars):
gebcspittedu gordon bank subject cant breath articleid pitt replyto gebcspittedu gordon bank organ univ pittsburgh comput scienc line articl martijcuucp pjstijcuucp paul schmidt write think import ver...

Text features matrix shape: (3759, 35583) (documents x vocabulary size)
Vocabulary size: 35583
In [ ]:
# 4. Split data
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X_text_features, y_text, test_size=0.3, random_state=42, stratify=y_text
)

print(f"\nTraining text features shape: {X_train_text.shape}")
print(f"Testing text features shape: {X_test_text.shape}")
Training text features shape: (2631, 35583)
Testing text features shape: (1128, 35583)
In [ ]:
# 5. Train a Multinomial Naive Bayes classifier
# alpha is the Laplace/Lidstone smoothing parameter (default=1.0 for Laplace smoothing)
mnb_model = MultinomialNB(alpha=1.0) # Default alpha=1.0 applies Laplace smoothing
mnb_model.fit(X_train_text, y_train_text)

print("\nMultinomial Naive Bayes model trained.")
Multinomial Naive Bayes model trained.
In [ ]:
# 6. Make predictions and evaluate
y_pred_mnb = mnb_model.predict(X_test_text)
y_proba_mnb = mnb_model.predict_proba(X_test_text)

print(f"\nAccuracy Score (MultinomialNB): {accuracy_score(y_test_text, y_pred_mnb):.4f}")
print("\nClassification Report (MultinomialNB):")
print(classification_report(y_test_text, y_pred_mnb, target_names=text_target_names))

# Confusion Matrix
cm_mnb = confusion_matrix(y_test_text, y_pred_mnb)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_mnb, annot=True, fmt='d', cmap='Blues',
            xticklabels=text_target_names, yticklabels=text_target_names)
plt.title('Confusion Matrix (Multinomial Naive Bayes)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
Accuracy Score (MultinomialNB): 0.9619

Classification Report (MultinomialNB):
                        precision    recall  f1-score   support

           alt.atheism       0.96      0.94      0.95       240
         comp.graphics       0.99      0.96      0.98       292
               sci.med       0.95      0.99      0.97       297
soc.religion.christian       0.95      0.95      0.95       299

              accuracy                           0.96      1128
             macro avg       0.96      0.96      0.96      1128
          weighted avg       0.96      0.96      0.96      1128

[Output: confusion matrix heatmap for Multinomial Naive Bayes]
In [ ]:
# Examine learned parameters
print("\n--- Learned Parameters of MultinomialNB ---")
print("Class Priors (P(y)):", np.exp(mnb_model.class_log_prior_)) # Corrected attribute name and converted from log probability
# log_prob_features is log(P(xi|y))
print("Log Probabilities of features given class (first 5 features):\n", mnb_model.feature_log_prob_[:, :5])
--- Learned Parameters of MultinomialNB ---
Class Priors (P(y)): [0.21246674 0.25883694 0.26339795 0.26529837]
Log Probabilities of features given class (first 5 features):
 [[ -8.50599223 -11.76408877 -11.07094159 -11.76408877 -11.76408877]
 [-10.22964246 -10.74046809 -11.83908037 -10.74046809 -11.83908037]
 [-11.91299605 -11.91299605 -11.91299605 -10.81438376 -11.21984887]
 [ -9.72637441 -12.0289595  -12.0289595  -12.0289595  -12.0289595 ]]

Discussion Points:

  • What is the "zero-frequency problem" in Naive Bayes, and how does Laplace smoothing (alpha parameter in MultinomialNB) address it?
  • When would you choose CountVectorizer over TfidfVectorizer for text classification with Multinomial Naive Bayes? (A quick empirical comparison is sketched after this list.)
  • How does the performance of Multinomial Naive Bayes on text data compare to Gaussian Naive Bayes on numerical data, considering the nature of the features?
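
As a starting point for the second question, the cell below is a rough, illustrative comparison: the same MultinomialNB trained on TF-IDF features versus the raw counts used above (reusing X_text_preprocessed, y_text, and the count-based results; the exact numbers depend on the split and preprocessing).

In [ ]:
# Quick comparison: MultinomialNB on TF-IDF features vs. raw term counts.
X_tfidf = TfidfVectorizer().fit_transform(X_text_preprocessed)
X_tr_tfidf, X_te_tfidf, y_tr_tfidf, y_te_tfidf = train_test_split(
    X_tfidf, y_text, test_size=0.3, random_state=42, stratify=y_text
)
mnb_tfidf = MultinomialNB(alpha=1.0).fit(X_tr_tfidf, y_tr_tfidf)

print(f"Accuracy with TF-IDF features: {accuracy_score(y_te_tfidf, mnb_tfidf.predict(X_te_tfidf)):.4f}")
print(f"Accuracy with raw counts:      {accuracy_score(y_test_text, y_pred_mnb):.4f}")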

Part 3: Bernoulli Naive Bayes¶

Bernoulli Naive Bayes is designed for binary/boolean features. It assumes features are independent binary variables (presence or absence of a word/feature), unlike Multinomial Naive Bayes which handles counts.

Tasks:

  • Reuse the preprocessed text data (X_text_preprocessed).
  • Convert text documents into binary feature vectors using CountVectorizer with binary=True (BernoulliNB can also binarize count features itself via its binarize parameter).
  • Train a BernoulliNB classifier.
  • Evaluate its performance and compare with MultinomialNB.
In [ ]:
# 1. Convert text documents into binary feature vectors
# Use CountVectorizer with binary=True to represent presence/absence of words
vectorizer_binary = CountVectorizer(binary=True)
X_text_binary_features = vectorizer_binary.fit_transform(X_text_preprocessed)

print(f"Binary text features matrix shape: {X_text_binary_features.shape}")
Binary text features matrix shape: (3759, 35583)
In [ ]:
# 2. Split data (using the same split strategy as before for consistency)
X_train_text_bin, X_test_text_bin, y_train_text_bin, y_test_text_bin = train_test_split(
    X_text_binary_features, y_text, test_size=0.3, random_state=42, stratify=y_text
)

print(f"\nTraining binary text features shape: {X_train_text_bin.shape}")
print(f"Testing binary text features shape: {X_test_text_bin.shape}")
Training binary text features shape: (2631, 35583)
Testing binary text features shape: (1128, 35583)
In [ ]:
# 3. Train a Bernoulli Naive Bayes classifier
bnb_model = BernoulliNB(alpha=1.0) # Default alpha=1.0 for Laplace smoothing
bnb_model.fit(X_train_text_bin, y_train_text_bin)

print("\nBernoulli Naive Bayes model trained.")
Bernoulli Naive Bayes model trained.
In [ ]:
# 4. Make predictions and evaluate
y_pred_bnb = bnb_model.predict(X_test_text_bin)
y_proba_bnb = bnb_model.predict_proba(X_test_text_bin)

print(f"\nAccuracy Score (BernoulliNB): {accuracy_score(y_test_text_bin, y_pred_bnb):.4f}")
print("\nClassification Report (BernoulliNB):")
print(classification_report(y_test_text_bin, y_pred_bnb, target_names=text_target_names))

# Confusion Matrix
cm_bnb = confusion_matrix(y_test_text_bin, y_pred_bnb)
plt.figure(figsize=(10, 8))
sns.heatmap(cm_bnb, annot=True, fmt='d', cmap='Blues',
            xticklabels=text_target_names, yticklabels=text_target_names)
plt.title('Confusion Matrix (Bernoulli Naive Bayes)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
Accuracy Score (BernoulliNB): 0.9122

Classification Report (BernoulliNB):
                        precision    recall  f1-score   support

           alt.atheism       1.00      0.88      0.93       240
         comp.graphics       0.79      0.98      0.88       292
               sci.med       0.98      0.88      0.93       297
soc.religion.christian       0.94      0.90      0.92       299

              accuracy                           0.91      1128
             macro avg       0.93      0.91      0.91      1128
          weighted avg       0.92      0.91      0.91      1128

[Output: confusion matrix heatmap for Bernoulli Naive Bayes]

Discussion Points:

  • What is the key difference in how MultinomialNB and BernoulliNB handle text features? When would you prefer one over the other?
  • Compare the performance metrics (accuracy, precision, recall, f1-score) of MultinomialNB and BernoulliNB on this text dataset. Which one performed better and why do you think that might be the case?
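
To make the comparison easier to discuss, the cell below gathers the headline numbers from Parts 2 and 3 into one place, reusing the predictions already computed above.

In [ ]:
# Side-by-side summary of the two text classifiers trained above.
for name, y_true, y_pred in [("MultinomialNB", y_test_text, y_pred_mnb),
                             ("BernoulliNB", y_test_text_bin, y_pred_bnb)]:
    print(f"{name:>14}: accuracy = {accuracy_score(y_true, y_pred):.4f}, "
          f"macro F1 = {f1_score(y_true, y_pred, average='macro'):.4f}")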

Part 4: Advanced Topics & Discussion:¶

Discussion Topics:

  • The "Naive" Assumption in Practice: Discuss why the strong independence assumption, despite often being violated, allows Naive Bayes to perform well.
  • Laplace Smoothing (Detailed): Provide a more in-depth explanation of Laplace smoothing and its variants.
  • Handling Continuous and Categorical Features Together: How can Naive Bayes be extended to datasets with a mix of continuous and categorical features?
  • Comparison with other Classifiers: Briefly compare Naive Bayes to Logistic Regression and Decision Trees in terms of assumptions, interpretability, and typical performance.
  • Strengths and Weaknesses Summarized: Summarize the main advantages and disadvantages of Naive Bayes.

The "Naive" Assumption in Practice:¶

The "naive" assumption of conditional independence between features, given the class, is almost always violated in real-world data. For instance, in an email, the words "free" and "money" are highly correlated if the email is spam. However, Naive Bayes often performs surprisingly well despite this violation. Reasons include:

  • Focus on Classification, Not Probability Estimation:
    While the probability estimates $ P(y \mid x_1, \ldots, x_n) $ might be inaccurate, the ranking of probabilities for different classes often remains correct. The classifier is interested in:

    $$ \arg\max_y P(y) \prod_{i=1}^{n} P(x_i \mid y) $$

    Even if the individual $P(x_i \mid y)$ terms are slightly off, their product might still lead to the correct class prediction (a quick illustration follows this list).

  • Computational Efficiency:
    The independence assumption makes Naive Bayes extremely fast to train and predict, especially for high-dimensional data (like text).

  • Robustness to Irrelevant Features:
    If a feature is irrelevant, then $P(x_i \mid y)$ will be roughly the same across all classes. This minimizes its influence in classification.

  • Effective for Linearly Separable Problems:
    Naive Bayes can perform well when the data is (or becomes) linearly separable.
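
One way to see the first point with the models trained above: Naive Bayes' predicted probabilities tend to be extreme (a symptom of the violated independence assumption), yet the argmax, i.e. the predicted class, is still usually correct. A small check, reusing y_proba_mnb and y_pred_mnb from Part 2:

In [ ]:
# Probability estimates are typically overconfident, but the class ranking still works.
top_proba = y_proba_mnb.max(axis=1)        # highest class probability per test document
print(f"Median top-class probability         : {np.median(top_proba):.4f}")
print(f"Documents with top probability > 0.99: {(top_proba > 0.99).mean():.2%}")
print(f"Test accuracy                        : {accuracy_score(y_test_text, y_pred_mnb):.4f}")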

Laplace Smoothing (Detailed):¶

The Zero-Frequency Problem occurs when a feature value (e.g., a word) doesn’t appear in the training data for a class. This makes:

$$ P(x_i \mid y) = 0 $$

Which causes the entire posterior:

$$ P(y) \prod P(x_i \mid y) = 0 $$

even if other features are strong indicators.

Laplace Smoothing (Additive Smoothing):¶

This adds a small constant $\alpha$ (typically 1) to every count.

Formula (Multinomial NB):

$$ P(x_i \mid y) = \frac{\text{count}(x_i, y) + \alpha}{\text{count}(y) + \alpha \cdot \text{vocabulary\_size}} $$

Where:

  • $\text{count}(x_i, y)$: count of feature $x_i$ in class $y$
  • $\text{count}(y)$: total count of all feature occurrences (tokens) in class $y$
  • $\alpha$: smoothing constant ($\alpha = 1$: Laplace, $\alpha < 1$: Lidstone)
  • $\text{vocabulary\_size}$: number of unique features

Effect: Ensures no zero probabilities and assigns a small likelihood to unseen combinations.
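
The formula can be checked numerically against the MultinomialNB fitted in Part 2; the cell below is a minimal verification, assuming alpha = 1.0 and the X_train_text / y_train_text split defined earlier.

In [ ]:
# Verify the Laplace-smoothed likelihoods against mnb_model.feature_log_prob_.
alpha = 1.0
class_idx = 0                                                  # class 0 = 'alt.atheism'
rows = np.where(y_train_text == class_idx)[0]
counts = np.asarray(X_train_text[rows].sum(axis=0)).ravel()    # count(x_i, y) for every word
total_count = counts.sum()                                     # count(y): all tokens in the class
vocab_size = X_train_text.shape[1]

manual_log_prob = np.log((counts + alpha) / (total_count + alpha * vocab_size))
print("Matches feature_log_prob_:",
      np.allclose(manual_log_prob, mnb_model.feature_log_prob_[class_idx]))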

Handling Continuous and Categorical Features Together:¶

Naive Bayes models:

  • GaussianNB: for continuous features
  • MultinomialNB: for count features (e.g., word frequencies)
  • BernoulliNB: for binary features

Mixed Naive Bayes:¶

To handle mixed types:

  • Use GaussianNB for continuous features.
  • Use MultinomialNB or BernoulliNB for categorical features.

Implementation:

  • Train each model separately on its respective feature subset.
  • During prediction:
    • Compute log-likelihoods from each sub-model.
    • Sum log-probabilities.
    • Apply softmax (or exponentiate and normalize) to get final class probabilities.

Note: scikit-learn doesn't provide a mixed Naive Bayes directly; you must combine the sub-models yourself, as in the sketch below.
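
The cell below is a minimal sketch of this idea, assuming a dataset split into continuous columns (X_cont) and binary columns (X_bin); the MixedNB class and the toy data are purely illustrative, not part of the lab dataset.

In [ ]:
# A minimal "mixed" Naive Bayes: one GaussianNB for continuous columns,
# one BernoulliNB for binary columns, combined by summing per-class log-probabilities.
from scipy.special import logsumexp

class MixedNB:
    def fit(self, X_cont, X_bin, y):
        self.gnb = GaussianNB().fit(X_cont, y)      # continuous features
        self.bnb = BernoulliNB().fit(X_bin, y)      # binary features
        return self

    def predict_proba(self, X_cont, X_bin):
        # log P(y | x_cont) + log P(y | x_bin) - log P(y) equals, up to a per-sample
        # constant, log P(y) + log P(x_cont | y) + log P(x_bin | y); that constant
        # cancels when we renormalise with logsumexp.
        log_scores = (self.gnb.predict_log_proba(X_cont)
                      + self.bnb.predict_log_proba(X_bin)
                      - np.log(self.gnb.class_prior_))
        return np.exp(log_scores - logsumexp(log_scores, axis=1, keepdims=True))

    def predict(self, X_cont, X_bin):
        return self.gnb.classes_[np.argmax(self.predict_proba(X_cont, X_bin), axis=1)]

# Toy usage on synthetic data (illustrative only)
rng = np.random.default_rng(42)
y_toy = rng.integers(0, 2, size=200)
X_cont_toy = rng.normal(loc=y_toy[:, None], scale=1.0, size=(200, 3))          # continuous
X_bin_toy = rng.binomial(1, 0.3 + 0.4 * y_toy[:, None], size=(200, 5))         # binary

mixed_model = MixedNB().fit(X_cont_toy, X_bin_toy, y_toy)
print("Predictions for first 5 samples:", mixed_model.predict(X_cont_toy[:5], X_bin_toy[:5]))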

Comparison with Other Classifiers:¶

vs. Logistic Regression:¶

  • Assumptions:
    • NB: feature independence
    • LR: linear relationship between features and log-odds
  • Type:
    • NB: Generative, models $ P(X \mid y) $
    • LR: Discriminative, models $ P(y \mid X) $
  • Interpretability: Both interpretable; LR's coefficients are more intuitive.
  • Performance: LR often performs better with enough data, especially when features are correlated; NB is competitive on small or sparse datasets.

vs. Decision Trees:¶

  • Assumptions:
    • NB: strong independence assumption
    • Trees: non-parametric, no distributional assumptions
  • Interpretability: Trees are highly visual and interpretable.
  • Performance: Trees can overfit; NB is more robust but less flexible.
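
To make these trade-offs concrete, the cell below runs a quick, non-rigorous check on the Iris split from Part 1 with default-ish hyperparameters (LogisticRegression and DecisionTreeClassifier are illustrative choices; the exact numbers depend on the split).

In [ ]:
# Quick comparison of Naive Bayes with two other standard classifiers on the Iris split.
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

for name, clf in [("GaussianNB", GaussianNB()),
                  ("LogisticRegression", LogisticRegression(max_iter=1000)),
                  ("DecisionTree", DecisionTreeClassifier(random_state=42))]:
    clf.fit(X_train_iris_scaled, y_train_iris)
    acc = accuracy_score(y_test_iris, clf.predict(X_test_iris_scaled))
    print(f"{name:>18}: test accuracy = {acc:.4f}")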

Strengths and Weaknesses Summarized:¶

Strengths:¶

  • Simple and fast
  • Scalable to large/high-dimensional data
  • Good baseline model
  • Works well with limited data
  • Robust to irrelevant features

Weaknesses:¶

  • Strong independence assumption
  • Zero-frequency problem (needs smoothing)
  • Requires feature-type-specific variants
  • Poorly calibrated probabilities (overconfident)

Prepared By

Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd