# Install necessary libraries (if not already present in Colab environment)
!pip install numpy pandas matplotlib seaborn scikit-learn
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-learn for datasets, preprocessing, models, and metrics
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
accuracy_score, classification_report, confusion_matrix,
roc_curve, roc_auc_score, precision_score, recall_score, f1_score
)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
# Set a consistent plotting style
sns.set_theme(style="whitegrid")
Part 1: Understanding Logistic Regression - The Basics¶
Logistic Regression is a statistical model used for binary classification, meaning it predicts one of two possible outcomes (e.g., Yes/No, True/False, 0/1). Unlike Linear Regression, which predicts a continuous value, Logistic Regression models the probability that a given input belongs to a certain class.
1.1 The Problem It Solves:
- Predicting a categorical outcome, specifically when there are only two categories.
- Examples: predicting if an email is spam (1) or not spam (0), if a customer will churn (1) or not (0), if a patient has a disease (1) or not (0).
1.2 Differentiating from Linear Regression:
- Linear Regression output: A continuous number (e.g., house price, temperature).
- Logistic Regression output: A probability (a value between 0 and 1). This probability is then converted into a class prediction based on a threshold (commonly 0.5).
1.3 The Sigmoid (Logistic) Function: The core of Logistic Regression is the Sigmoid function (also called the Logistic function). It takes any real-valued number and maps it into a value between 0 and 1.
The formula for the Sigmoid function is: $$\sigma(z) = \frac{1}{1 + e^{-z}}$$ Where $z$ is the output of a linear equation (similar to what we saw in Linear Regression).
# Define the sigmoid function
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
# Generate a range of z values
z_values = np.linspace(-10, 10, 100)
# Calculate corresponding sigmoid values
sigmoid_values = sigmoid(z_values)
plt.figure(figsize=(9, 6))
plt.plot(z_values, sigmoid_values, color='blue', linewidth=2)
plt.axvline(0, color='gray', linestyle='--', label='z = 0')
plt.axhline(0.5, color='red', linestyle='--', label='P = 0.5')
plt.title('Sigmoid (Logistic) Function')
plt.xlabel('z (Linear Output)')
plt.ylabel('P(Y=1 | z) (Probability)')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()
1.4 The Linear Combination:¶
Before applying the sigmoid, Logistic Regression calculates a linear combination of its input features, just like Linear Regression:
$$ z = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \ldots + \beta_k x_k $$
Here, z can be any real number.
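To make these two steps concrete, here is a minimal sketch (using the sigmoid function defined above; the coefficient and feature values are made up for illustration) that turns a linear combination into a probability:
# --- Sketch: linear combination -> sigmoid -> probability ---
# Illustrative (not fitted) coefficients and one sample with two features.
beta_0 = -1.0                      # intercept
beta = np.array([2.0, 0.5])        # hypothetical feature weights
x = np.array([0.8, -0.3])          # hypothetical feature values

z = beta_0 + np.dot(beta, x)       # linear combination (any real number)
p = sigmoid(z)                     # mapped into (0, 1)
print(f"z = {z:.3f}, P(Y=1 | x) = {p:.3f}")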
1.5 The Decision Boundary:¶
Once the sigmoid function outputs a probability $P$, we need to convert it into a class prediction (0 or 1). This is done using a threshold.
The most common threshold is 0.5:
- If $P(Y=1|z) \geq 0.5$, predict Class 1 (Positive).
- If $P(Y=1|z) < 0.5$, predict Class 0 (Negative).
The point where $P(Y=1|z) = 0.5$ (which corresponds to $z=0$) is the decision boundary.
For a Logistic Regression model, this boundary is a straight line (or hyperplane in higher dimensions).
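As a small illustration (the probabilities below are made-up values, not model output), the sketch converts probabilities into class labels with the default 0.5 threshold and then with a stricter 0.7 threshold:
# --- Sketch: turning probabilities into class predictions with a threshold ---
probs = np.array([0.10, 0.45, 0.55, 0.90])   # illustrative predicted probabilities

preds_default = (probs >= 0.5).astype(int)   # default threshold
preds_strict = (probs >= 0.7).astype(int)    # stricter threshold for predicting Class 1
print(f"Threshold 0.5 predictions: {preds_default}")   # [0 0 1 1]
print(f"Threshold 0.7 predictions: {preds_strict}")    # [0 0 0 1]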
1.6 The Cost Function (Binary Cross-Entropy / Log Loss):¶
Linear Regression uses Mean Squared Error (MSE) as its cost function. However, MSE is not suitable for Logistic Regression because the sigmoid function makes the cost function non-convex, which can cause gradient descent to get stuck in local minima.
Instead, Logistic Regression uses Binary Cross-Entropy (also known as Log Loss) as its cost function. This function penalizes confident wrong predictions heavily and encourages the model to output probabilities closer to the true labels.
$$ \text{Cost}(h_{\theta}(x), y) = -y \log(h_{\theta}(x)) - (1 - y) \log(1 - h_{\theta}(x)) $$
Where:
- $h_{\theta}(x)$ is the predicted probability,
- $y$ is the actual label (0 or 1).
The goal during training is to find the coefficients ($\beta$ values) that minimize this cost function, typically using an optimization algorithm like Gradient Descent.
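A short numerical sketch of this cost (with illustrative probabilities, not fitted values) shows how a confident wrong prediction is penalized far more heavily than a confident correct one:
# --- Sketch: Binary Cross-Entropy (Log Loss) for a single prediction ---
def log_loss_single(p, y):
    # p: predicted probability of class 1, y: true label (0 or 1)
    return -y * np.log(p) - (1 - y) * np.log(1 - p)

print(f"Confident and correct (p=0.9, y=1): {log_loss_single(0.9, 1):.3f}")   # ~0.105
print(f"Unsure but correct   (p=0.6, y=1): {log_loss_single(0.6, 1):.3f}")   # ~0.511
print(f"Confident and wrong  (p=0.1, y=1): {log_loss_single(0.1, 1):.3f}")   # ~2.303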
Discussion Point:
- How does the Sigmoid function help transform the output of a linear model into a probability? What is the range of values it outputs?
- Why is a threshold (e.g., 0.5) necessary in Logistic Regression? What would happen if we used a different threshold, like 0.7?
Part 2: Simple Logistic Regression with Scikit-learn (Synthetic Data)¶
Let's apply Logistic Regression to a simple 2D synthetic dataset to visualize the decision boundary.
Tasks:
- Create a 2D synthetic dataset with two separable classes.
- Visualize the data points.
- Train a LogisticRegression model.
- Plot the decision boundary learned by the model.
- Make predictions and observe probabilities.
# --- 2.1 Create a 2D synthetic dataset ---
# make_classification generates a random n-class classification problem.
# n_samples: total data points
# n_features: number of features (X dimensions)
# n_redundant: number of redundant features
# n_informative: number of informative features
# n_clusters_per_class: how many clusters per class
# random_state: for reproducibility
X_simple, y_simple = make_classification(n_samples=100, n_features=2, n_redundant=0,
n_informative=2, n_clusters_per_class=1,
random_state=42, flip_y=0.05) # Add a small amount of noise
# Convert to DataFrame for easier plotting
df_simple = pd.DataFrame(X_simple, columns=['Feature_1', 'Feature_2'])
df_simple['Target'] = y_simple
# --- 2.2 Visualize the data points ---
plt.figure(figsize=(9, 7))
sns.scatterplot(data=df_simple, x='Feature_1', y='Feature_2', hue='Target', palette='coolwarm', s=80, alpha=0.8)
plt.title('Synthetic 2D Classification Data')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Class')
plt.show()
# --- 2.3 Train a LogisticRegression model ---
model_simple_lr = LogisticRegression(random_state=42)
model_simple_lr.fit(X_simple, y_simple)
print("\nSimple Logistic Regression model trained successfully!")
# --- 2.4 Plot the decision boundary ---
# Create a meshgrid to plot the decision boundary
x_min, x_max = X_simple[:, 0].min() - 1, X_simple[:, 0].max() + 1
y_min, y_max = X_simple[:, 1].min() - 1, X_simple[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
np.linspace(y_min, y_max, 100))
# Predict probabilities over the meshgrid
Z = model_simple_lr.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1] # Probability of Class 1
Z = Z.reshape(xx.shape)
plt.figure(figsize=(9, 7))
# Plot the contour lines of probabilities
plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], cmap='coolwarm', alpha=0.3)
# Plot the decision boundary (where probability is 0.5)
plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, colors='black', linestyles='--')
# Plot original data points
sns.scatterplot(data=df_simple, x='Feature_1', y='Feature_2', hue='Target', palette='coolwarm', s=80, alpha=0.8)
plt.title('Logistic Regression Decision Boundary')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.legend(title='Class')
plt.show()
Simple Logistic Regression model trained successfully!
# --- 2.5 Make predictions and observe probabilities ---
sample_point_1 = np.array([0, 0]).reshape(1, -1) # A point near the center
sample_point_2 = np.array([1, 1]).reshape(1, -1) # A point in Class 1 region
sample_point_3 = np.array([-1, -1]).reshape(1, -1) # A point near the decision boundary (toward the Class 0 region)
print(f"\nPrediction for {sample_point_1[0]}: Class {model_simple_lr.predict(sample_point_1)[0]}, Probabilities: {model_simple_lr.predict_proba(sample_point_1)[0]}")
print(f"Prediction for {sample_point_2[0]}: Class {model_simple_lr.predict(sample_point_2)[0]}, Probabilities: {model_simple_lr.predict_proba(sample_point_2)[0]}")
print(f"Prediction for {sample_point_3[0]}: Class {model_simple_lr.predict(sample_point_3)[0]}, Probabilities: {model_simple_lr.predict_proba(sample_point_3)[0]}")
Prediction for [0 0]: Class 1, Probabilities: [0.15725068 0.84274932]
Prediction for [1 1]: Class 1, Probabilities: [0.03624701 0.96375299]
Prediction for [-1 -1]: Class 1, Probabilities: [0.48071494 0.51928506]
Discussion Point:
- Observe the predict_proba output. What do the two numbers represent? How do they relate to the predict output?
- Why is the decision boundary a straight line in this 2D example? What would it look like with more features?
Part 3: Multiple Logistic Regression (Real-world Data)¶
Now, let's apply Logistic Regression to a real-world dataset with multiple features. We'll use the Breast Cancer Wisconsin (Diagnostic) dataset, which aims to classify tumors as malignant (cancerous) or benign (non-cancerous) based on various cell nucleus measurements.
Tasks:
- Load the Breast Cancer dataset.
- Perform initial data exploration.
- Define features (X) and target (y).
- Split data into training and testing sets.
- Apply feature scaling.
- Train a LogisticRegression model.
- Make predictions (class labels) and prediction probabilities on the test set.
# --- 3.1 Load the Breast Cancer dataset ---
cancer = load_breast_cancer()
X_cancer = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y_cancer = pd.Series(cancer.target) # 0: malignant, 1: benign (sklearn convention)
print(f"\nDataset loaded. Number of features: {X_cancer.shape[1]}")
print(f"Target classes: {cancer.target_names} (0 for malignant, 1 for benign)")
# --- 3.2 Initial data exploration ---
print("\n--- X_cancer Head ---")
print(X_cancer.head())
print("\n--- X_cancer Info ---")
X_cancer.info()
print("\n--- X_cancer Description ---")
print(X_cancer.describe())
print("\n--- Target (y_cancer) Value Counts ---")
print(y_cancer.value_counts())
print(f"Maliganant (0): {y_cancer.value_counts()[0]} samples")
print(f"Benign (1): {y_cancer.value_counts()[1]} samples")
# --- 3.3 Define features (X) and target (y) (already done above) ---
# y_cancer = pd.Series(cancer.target)
Dataset loaded. Number of features: 30
Target classes: ['malignant' 'benign'] (0 for malignant, 1 for benign)

--- X_cancer Head ---
(first 5 rows of all 30 feature columns, from 'mean radius' through 'worst fractal dimension')

--- X_cancer Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns): all float64, 569 non-null values each
memory usage: 133.5 KB

--- X_cancer Description ---
(summary statistics for all 30 features; e.g. mean radius: mean 14.13, std 3.52, min 6.98, max 28.11)

--- Target (y_cancer) Value Counts ---
1    357
0    212
Name: count, dtype: int64
Malignant (0): 212 samples
Benign (1): 357 samples
# --- 3.4 Split data into training and testing sets ---
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, test_size=0.25, random_state=42, stratify=y_cancer)
# stratify=y_cancer ensures that the proportions of target classes are preserved in train and test sets.
print(f"\nTraining set X shape: {X_train.shape}, y shape: {y_train.shape}")
print(f"Testing set X shape: {X_test.shape}, y shape: {y_test.shape}")
Training set X shape: (426, 30), y shape: (426,)
Testing set X shape: (143, 30), y shape: (143,)
# --- 3.5 Apply feature scaling ---
# StandardScaler is crucial for Logistic Regression as it uses optimization algorithms
# that are sensitive to the scale of features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nFeatures scaled successfully.")
print(f"Example of scaled data (first row of X_train_scaled):\n{X_train_scaled[0]}")
Features scaled successfully. Example of scaled data (first row of X_train_scaled): [ 1.65909581e+00 2.17205449e-01 1.61061990e+00 1.63333853e+00 5.76311605e-01 5.23544521e-01 6.45326310e-01 1.19874480e+00 -9.35149071e-05 -1.24425088e-01 4.14312264e-01 -5.97142095e-01 4.25167885e-01 4.97329621e-01 -6.44904409e-01 -1.38083245e-01 -6.02764307e-02 1.97953779e-02 -1.15354241e+00 -1.48976771e-01 1.56731877e+00 -7.58786987e-02 1.60722303e+00 1.38496870e+00 4.12628434e-01 4.61628950e-01 6.42584428e-01 7.01834827e-01 -5.56084149e-01 3.88780742e-01]
# --- 3.6 Train a LogisticRegression model ---
model_multi_lr = LogisticRegression(random_state=42, max_iter=1000) # Increased max_iter for convergence
model_multi_lr.fit(X_train_scaled, y_train)
print("\nMultiple Logistic Regression model trained successfully!")
Multiple Logistic Regression model trained successfully!
# --- 3.7 Make predictions and prediction probabilities on the test set ---
y_pred = model_multi_lr.predict(X_test_scaled)
y_pred_proba = model_multi_lr.predict_proba(X_test_scaled) # Probabilities for each class
print(f"\nFirst 5 true labels (test set): {y_test.tolist()[:5]}")
print(f"First 5 predicted labels: {y_pred.tolist()[:5]}")
print(f"First 5 predicted probabilities (Class 0, Class 1):\n{y_pred_proba[:5]}")
First 5 true labels (test set): [1, 0, 1, 1, 0]
First 5 predicted labels: [1, 0, 1, 1, 0]
First 5 predicted probabilities (Class 0, Class 1):
[[3.10179769e-02 9.68982023e-01]
 [9.99647982e-01 3.52018216e-04]
 [4.40588468e-01 5.59411532e-01]
 [6.10542466e-02 9.38945753e-01]
 [8.24508132e-01 1.75491868e-01]]
Discussion Point:
- Why is it particularly important to use stratify=y_cancer when splitting this dataset?
- Why is feature scaling (StandardScaler) crucial for Logistic Regression?
Part 4: Model Evaluation for Classification¶
Evaluating classification models goes beyond simple accuracy, especially when classes are imbalanced or when the costs of different types of errors are unequal (e.g., misclassifying a malignant tumor as benign is more serious than the reverse).
Accuracy¶
- Formula: $ \frac{\text{Number of Correct Predictions}}{\text{Total Number of Predictions}} $
- Interpretation: Overall proportion of correctly classified instances. Can be misleading for imbalanced datasets.
Confusion Matrix¶
A table summarizing the performance of a classification algorithm.
|  | Predicted Negative (0) | Predicted Positive (1) |
|---|---|---|
| Actual Negative (0) | True Negative (TN) | False Positive (FP) |
| Actual Positive (1) | False Negative (FN) | True Positive (TP) |
- TP (True Positive): Actual 1, Predicted 1
- TN (True Negative): Actual 0, Predicted 0
- FP (False Positive): Actual 0, Predicted 1 (Type I error)
- FN (False Negative): Actual 1, Predicted 0 (Type II error)
Precision¶
- Formula: $ \frac{TP}{TP + FP} $
- Interpretation: Of all instances predicted as positive, what fraction were truly positive? Relevant when the cost of a False Positive is high (e.g., wrongly flagging a healthy person as having cancer).
Recall (Sensitivity or True Positive Rate)¶
- Formula: $ \frac{TP}{TP + FN} $
- Interpretation: Of all actual positive instances, what fraction were predicted correctly? Relevant when the cost of a False Negative is high (e.g., missing a cancer diagnosis).
F1-Score¶
- Formula: $ 2 \times \frac{\text{Precision} \times \text{Recall}}{\text{Precision} + \text{Recall}} $
- Interpretation: Harmonic mean of precision and recall. Provides a single score that balances both precision and recall. Useful for imbalanced classes (a short worked sketch follows below).
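To tie the three formulas together, here is a small worked sketch with made-up confusion-matrix counts (80 TP, 10 FP, 20 FN are hypothetical, not from our model):
# --- Sketch: Precision, Recall, and F1 from hypothetical confusion-matrix counts ---
tp, fp, fn = 80, 10, 20                                      # illustrative counts

precision = tp / (tp + fp)                                   # 80 / 90  ~ 0.889
recall = tp / (tp + fn)                                      # 80 / 100 = 0.800
f1 = 2 * precision * recall / (precision + recall)           # ~ 0.842
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}")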
ROC Curve & AUC (Area Under the Curve)¶
- ROC Curve: Plots the True Positive Rate (Recall) against the False Positive Rate ($ \frac{FP}{FP + TN} $) at various classification probability thresholds. It illustrates the trade-off between sensitivity and specificity (see the threshold sketch below).
- AUC (Area Under the Curve): Measures the overall performance of a binary classifier, regardless of the classification threshold.
  - AUC = 1.0: Perfect classifier
  - AUC = 0.5: Random classifier (no better than flipping a coin)
  - AUC < 0.5: Worse than random (the model is learning the wrong patterns)
- Interpretation: A higher AUC indicates a better model. It represents the probability that the model will rank a randomly chosen positive instance higher than a randomly chosen negative instance.
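The threshold dependence is easy to see directly. The sketch below (a minimal example, assuming the Part 3 cells above have been run so that y_test and y_pred_proba exist) recomputes the TPR and FPR at a few thresholds:
# --- Sketch: TPR/FPR trade-off at different thresholds ---
# Assumes y_test and y_pred_proba from Part 3 are available.
for threshold in [0.3, 0.5, 0.7]:
    preds = (y_pred_proba[:, 1] >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
    tpr = tp / (tp + fn)   # True Positive Rate (Recall)
    fpr = fp / (fp + tn)   # False Positive Rate
    print(f"Threshold {threshold}: TPR = {tpr:.3f}, FPR = {fpr:.3f}")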
Tasks:¶
- Calculate Accuracy, Confusion Matrix, Precision, Recall, and F1-Score
- Generate and interpret the ROC Curve and AUC score
from sklearn.metrics import (
accuracy_score, confusion_matrix, classification_report,
roc_curve, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns
# --- 4.1 Accuracy Score ---
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f}")
# --- 4.2 Confusion Matrix ---
cm = confusion_matrix(y_test, y_pred)
print("\n--- Confusion Matrix ---")
print(cm)
plt.figure(figsize=(7, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=cancer.target_names, yticklabels=cancer.target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
# Extracting values from confusion matrix for clarity
tn, fp, fn, tp = cm.ravel()
print(f"True Positives (TP): {tp} (Correctly predicted benign)")
print(f"True Negatives (TN): {tn} (Correctly predicted malignant)")
print(f"False Positives (FP): {fp} (Actual malignant, Predicted benign - Type I error)")
print(f"False Negatives (FN): {fn} (Actual benign, Predicted malignant - Type II error)")
# --- 4.3 Classification Report (Precision, Recall, F1-score) ---
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=cancer.target_names))
# --- 4.4 ROC Curve and AUC ---
# y_pred_proba[:, 1] gives probabilities for class 1 ('benign')
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:, 1])
auc = roc_auc_score(y_test, y_pred_proba[:, 1])
plt.figure(figsize=(8, 7))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
Accuracy: 0.9860

--- Confusion Matrix ---
[[52  1]
 [ 1 89]]

True Positives (TP): 89 (Correctly predicted benign)
True Negatives (TN): 52 (Correctly predicted malignant)
False Positives (FP): 1 (Actual malignant, Predicted benign - Type I error)
False Negatives (FN): 1 (Actual benign, Predicted malignant - Type II error)

--- Classification Report ---
              precision    recall  f1-score   support

   malignant       0.98      0.98      0.98        53
      benign       0.99      0.99      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143
Discussion Point:
- In the context of breast cancer diagnosis, which metric (Precision or Recall) would you prioritize for the 'malignant' class (Class 0)? Why?
- What does an AUC score of 0.98 mean for our model's ability to distinguish between malignant and benign tumors?
Part 5: Interpretation of Coefficients & Decision Boundary¶
Interpretation of Coefficients (Odds Ratios)¶
Unlike Linear Regression where coefficients directly represent the change in the target, in Logistic Regression, coefficients ($\beta_i$) represent the change in the log-odds of the dependent variable for a one-unit increase in the corresponding feature.
The odds of an event are defined as the ratio of the probability of the event occurring to the probability of it not occurring:
$$Odds = \frac{P(Y=1)}{1 - P(Y=1)}$$
The log-odds are simply $\log(\text{Odds})$.
To make coefficients more interpretable, we often convert them to Odds Ratios by exponentiating them:
$$Odds \text{ Ratio} = e^{\beta_i}$$
Interpretation of Odds Ratio:
- An odds ratio of 1.0 means the odds of the positive class do not change with a one-unit increase in the feature.
- An odds ratio greater than 1.0 means that for a one-unit increase in the feature, the odds of the positive class increase by (Odds Ratio - 1) * 100%, holding all other features constant.
- An odds ratio less than 1.0 means that for a one-unit increase in the feature, the odds of the positive class decrease by (1 - Odds Ratio) * 100%, holding all other features constant (a short numeric sketch follows below).
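A quick numeric sketch of these rules (the coefficient values are made up for illustration, not taken from our model):
# --- Sketch: converting illustrative coefficients into odds ratios ---
for beta in [0.7, -0.7]:
    odds_ratio = np.exp(beta)
    pct_change = (odds_ratio - 1) * 100
    print(f"beta = {beta:+.1f} -> odds ratio = {odds_ratio:.2f} "
          f"({pct_change:+.1f}% change in the odds per one-unit increase)")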
Visualizing the Decision Boundary (for simple 2D case)¶
As seen in Part 2, the decision boundary of a Logistic Regression model is a straight line (or hyperplane in higher dimensions). This is because the core of the model is a linear combination of features ($z = \beta_0 + \beta_1 x_1 + \ldots + \beta_k x_k$), and the decision rule is when $z=0$ (which corresponds to probability = 0.5).
Impact of the C Parameter (Regularization)¶
scikit-learn's LogisticRegression class includes a regularization parameter C.
- C is the inverse of regularization strength. Smaller values of C specify stronger regularization.
- Regularization helps prevent overfitting by penalizing large coefficients, effectively shrinking them towards zero. This makes the model simpler and less prone to capturing noise in the training data.
- A very large C implies very little regularization (similar to no regularization), which can lead to overfitting.
- A very small C implies strong regularization, which can lead to underfitting (see the comparison sketch below).
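To see this effect in practice, the short comparison below (a sketch, assuming X_train_scaled, y_train, X_test_scaled, and y_test from Part 3 are available) retrains the model with different C values and reports the average coefficient magnitude and test accuracy:
# --- Sketch: effect of the regularization parameter C ---
# Assumes X_train_scaled, y_train, X_test_scaled, y_test from Part 3 exist.
for C in [0.01, 1.0, 100.0]:
    lr = LogisticRegression(C=C, random_state=42, max_iter=1000)
    lr.fit(X_train_scaled, y_train)
    mean_abs_coef = np.abs(lr.coef_).mean()        # smaller C -> smaller coefficients
    test_acc = lr.score(X_test_scaled, y_test)
    print(f"C = {C:>6}: mean |coefficient| = {mean_abs_coef:.3f}, test accuracy = {test_acc:.4f}")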
# --- 5.1 Get coefficients and intercept from the trained model ---
coefficients = model_multi_lr.coef_[0]
intercept = model_multi_lr.intercept_[0]
# Display intercept
print(f"\nModel Intercept (log-odds when all features are 0): {intercept:.4f}")
# Display coefficients and odds ratios
print("\nModel Coefficients and Odds Ratios:")
print("{:<25} {:<15} {:<15}".format("Feature", "Coefficient", "Odds Ratio"))
print("-" * 55)
for i, feature_name in enumerate(cancer.feature_names):
    coef = coefficients[i]
    odds_ratio = np.exp(coef)
    print("{:<25} {:<15.4f} {:<15.4f}".format(feature_name, coef, odds_ratio))
Model Intercept (log-odds when all features are 0): 0.2806

Model Coefficients and Odds Ratios:
Feature                   Coefficient     Odds Ratio
-------------------------------------------------------
mean radius               -0.5231         0.5927
mean texture              -0.5187         0.5953
mean perimeter            -0.4837         0.6165
mean area                 -0.5573         0.5727
mean smoothness           -0.3010         0.7401
mean compactness          0.6942          2.0022
mean concavity            -0.5654         0.5681
mean concave points       -0.6769         0.5082
mean symmetry             -0.1262         0.8814
mean fractal dimension    0.0837          1.0873
radius error              -1.0701         0.3430
texture error             0.2607          1.2979
perimeter error           -0.4911         0.6120
area error                -0.9411         0.3902
smoothness error          -0.1456         0.8645
compactness error         0.6164          1.8523
concavity error           0.1541          1.1666
concave points error      -0.3440         0.7089
symmetry error            0.4251          1.5297
fractal dimension error   0.3749          1.4548
worst radius              -0.9175         0.3995
worst texture             -1.2501         0.2865
worst perimeter           -0.7212         0.4862
worst area                -0.9258         0.3962
worst smoothness          -0.6696         0.5119
worst compactness         0.0466          1.0477
worst concavity           -0.7968         0.4508
worst concave points      -0.9419         0.3899
worst symmetry            -0.9570         0.3840
worst fractal dimension   -0.2096         0.8109
5.2 Interpreting Odds Ratios (Example using Mean Radius)¶
From the output above, 'mean radius' has an odds ratio of about 0.5927. This means that for every one-unit increase in mean radius (after scaling), the odds of the tumor being BENIGN (Class 1) are reduced by roughly:
$$ (1 - 0.5927) \times 100\% \approx 40.7\% $$
holding all other features constant.
This implies that a larger 'mean radius' is associated with the MALIGNANT class (Class 0).
5.3 Notes on Interpretation & Decision Boundary¶
- The interpretation above applies to the scaled features. For an interpretation in the original units, you would need to reverse the scaling transformation or reason in the units of the original features, which can be more complex.
- Visualizing the Decision Boundary (Revisiting the Part 2 Concept): Recall from Part 2 that for a 2D dataset, Logistic Regression draws a straight line as its decision boundary. This is because the decision rule is based on a linear combination of features where:
$$ z = \beta_0 + \beta_1 x_1 + \ldots + \beta_k x_k = 0 $$
In higher dimensions, this boundary becomes a hyperplane separating the classes.
Part 6: Advantages, Disadvantages, and Use Cases¶
Logistic Regression is a foundational algorithm in machine learning and statistics. Understanding its strengths and weaknesses helps in deciding when to use it.
Advantages:
- Outputs Probabilities: Unlike some other classifiers that give only class labels, Logistic Regression provides probabilities (e.g., 85% chance of being benign). This is highly valuable for risk assessment, ranking, or when you need more nuance than a simple "yes/no" answer.
- Interpretable Coefficients: As demonstrated with odds ratios, the coefficients can be interpreted to understand the direction and magnitude of the influence of each feature on the odds of the positive outcome. This provides insights into the underlying relationships in the data.
- Computationally Efficient: It is relatively fast to train, especially compared to more complex models like Neural Networks or Ensemble methods. It scales well to large datasets.
- Well-Understood and Robust: It has a strong statistical foundation and is less prone to overfitting than highly complex models on small to medium-sized datasets, especially when regularization is used.
- Good Baseline Model: Often serves as an excellent starting point and a benchmark. If a more complex model doesn't significantly outperform Logistic Regression, it might not be worth the added complexity.
Disadvantages:
- Assumes Linearity (in the log-odds): Logistic Regression assumes a linear relationship between the independent variables and the log-odds of the dependent variable. If the true relationship is highly non-linear, it may not perform well unless appropriate feature engineering (e.g., polynomial features, interaction terms) is applied.
- Sensitive to Outliers: While less sensitive than OLS Linear Regression, extreme outliers can still disproportionately influence the model's coefficients and predictions.
- Does Not Handle High-Dimensional Data Well Natively: For datasets with a very large number of features, especially if many are irrelevant, it can still suffer from overfitting without strong regularization. Feature selection or dimensionality reduction might be needed.
- Assumes Independence of Observations: Like Linear Regression, it assumes that the observations are independent of each other. This can be problematic in time-series data or clustered data.
- Multicollinearity: While it doesn't destabilize coefficients as severely as in OLS Linear Regression, high multicollinearity can still make the interpretation of individual feature coefficients less reliable (though overall predictive performance might not be heavily impacted).
Common Use Cases:
- Spam Detection: Classifying emails as "spam" or "not spam."
- Medical Diagnosis: Predicting the presence or absence of a disease (e.g., benign/malignant tumor, diabetic/non-diabetic).
- Credit Scoring: Assessing the likelihood of a loan applicant defaulting on a loan (e.g., "high risk" or "low risk").
- Customer Churn Prediction: Predicting whether a customer will stop using a service or product.
- Marketing Response Prediction: Identifying which customers are most likely to respond to a marketing campaign.
- Election Prediction: Predicting whether a candidate will win or lose an election based on demographics and polling data.
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd