# Install necessary libraries (if not already present in Colab environment)
# These are usually pre-installed in Colab, so this command will typically just confirm they are already present.
!pip install numpy pandas matplotlib seaborn scikit-learn
Requirement already satisfied: numpy (2.0.2), pandas (2.2.2), matplotlib (3.10.0), seaborn (0.13.2), scikit-learn (1.6.1), plus their dependencies (python-dateutil, pytz, tzdata, contourpy, cycler, fonttools, kiwisolver, packaging, pillow, pyparsing, scipy, joblib, threadpoolctl, six).
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Scikit-learn for ML models and preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"NumPy Version: {np.__version__}")
print(f"Pandas Version: {pd.__version__}")
NumPy Version: 2.0.2
Pandas Version: 2.2.2
Part 1: Navigating Google Colab & Python Basics
Google Colab is a cloud-based Jupyter notebook environment. It allows you to write and execute Python code directly in your browser, with access to free GPUs/TPUs.
Key Features:
- Code Cells: Where you write and execute Python code.
- Text Cells: Where you write explanations, notes, and documentation using Markdown.
- Runtime: Connects your notebook to a virtual machine (VM) in the cloud to execute code.
- Saving: Notebooks are typically saved to your Google Drive.
Tasks:
- Demonstrate Code cells vs. Text cells.
- Demonstrate running code, connecting to runtime, saving.
- Basic Python operations: variables, data types, simple loops, conditional statements.
- Introduce NumPy: arrays, basic operations.
--- 1.1 Code Cells vs. Text Cells ---
- You are currently reading this in a Text cell (Markdown).
- The cell below is a Code cell.
- You can add new cells using the "+ Code" or "+ Text" buttons above.
# --- 1.2 Running Code & Runtime ---
# To run this cell:
# 1. Click the "Play" button (triangle icon) on the left of the cell.
# 2. Or, press Shift + Enter.
# The first time you run a cell, Colab will connect to a runtime.
print("Hello from a Code Cell in Google Colab!")
Hello from a Code Cell in Google Colab!
# --- 1.3 Basic Python Operations ---
# Variables and Data Types
name = "Alice"
age = 30
height = 1.75 # meters
is_student = False
print(f"\nName: {name}, Type: {type(name)}")
print(f"Age: {age}, Type: {type(age)}")
print(f"Height: {height}, Type: {type(height)}")
print(f"Is Student: {is_student}, Type: {type(is_student)}")
Name: Alice, Type: <class 'str'>
Age: 30, Type: <class 'int'>
Height: 1.75, Type: <class 'float'>
Is Student: False, Type: <class 'bool'>
# Lists (ordered, mutable collection)
fruits = ["apple", "banana", "cherry"]
print(f"\nFruits list: {fruits}")
print(f"First fruit: {fruits[0]}")
fruits.append("date")
print(f"Fruits after append: {fruits}")
Fruits list: ['apple', 'banana', 'cherry']
First fruit: apple
Fruits after append: ['apple', 'banana', 'cherry', 'date']
# Dictionaries (mutable key-value pairs; insertion order is preserved in Python 3.7+)
person = {"name": "Bob", "age": 25, "city": "New York"}
print(f"\nPerson dict: {person}")
print(f"Bob's age: {person['age']}")
person['city'] = "London"
print(f"Person after city update: {person}")
Person dict: {'name': 'Bob', 'age': 25, 'city': 'New York'}
Bob's age: 25
Person after city update: {'name': 'Bob', 'age': 25, 'city': 'London'}
# Conditional Statements
num = 10
if num > 0:
    print(f"\n{num} is positive.")
elif num < 0:
    print(f"{num} is negative.")
else:
    print(f"{num} is zero.")
10 is positive.
# Loops
print("\nLooping through fruits:")
for fruit in fruits:
    print(fruit)
print("\nLooping with range:")
for i in range(3):  # 0, 1, 2
    print(f"Count: {i}")
Looping through fruits:
apple
banana
cherry
date

Looping with range:
Count: 0
Count: 1
Count: 2
# --- 1.4 Introduction to NumPy ---
# NumPy (Numerical Python) is fundamental for numerical computing in Python.
# It provides powerful N-dimensional array objects.
# Create a NumPy array
arr = np.array([1, 2, 3, 4, 5])
print(f"\nNumPy array: {arr}")
print(f"Array shape: {arr.shape}")
print(f"Array type: {type(arr)}")
# Basic operations
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])
print(f"\n2D array:\n{arr_2d}")
print(f"2D array shape: {arr_2d.shape}")
# Element-wise operations
print(f"\nArray + 5: {arr + 5}")
print(f"Array * 2: {arr * 2}")
# Mathematical functions
print(f"Sum of array elements: {np.sum(arr)}")
print(f"Mean of 2D array: {np.mean(arr_2d)}")
# Reshaping arrays
arr_reshaped = arr.reshape(5, 1)
print(f"\nReshaped array (5x1):\n{arr_reshaped}")
NumPy array: [1 2 3 4 5]
Array shape: (5,)
Array type: <class 'numpy.ndarray'>

2D array:
[[1 2 3]
 [4 5 6]]
2D array shape: (2, 3)

Array + 5: [ 6  7  8  9 10]
Array * 2: [ 2  4  6  8 10]
Sum of array elements: 15
Mean of 2D array: 3.5

Reshaped array (5x1):
[[1]
 [2]
 [3]
 [4]
 [5]]
Discussion Point:
- What is the main advantage of using NumPy arrays over standard Python lists for numerical operations in Machine Learning?
- How can you tell if a cell is a Code cell or a Text cell in Google Colab?
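As a hint for the first discussion question, here is a minimal timing sketch (illustrative only; the exact numbers depend on the machine and the array size) contrasting an element-by-element Python loop with a single vectorized NumPy operation:
# --- Sketch: Python list loop vs. vectorized NumPy operation ---
import time
import numpy as np

n = 1_000_000
py_list = list(range(n))
np_arr = np.arange(n)

start = time.perf_counter()
squared_list = [x * x for x in py_list]   # element-by-element Python loop
loop_time = time.perf_counter() - start

start = time.perf_counter()
squared_arr = np_arr * np_arr             # single vectorized operation (runs in compiled C code)
vec_time = time.perf_counter() - start

print(f"List comprehension: {loop_time:.4f} s")
print(f"NumPy vectorized:   {vec_time:.4f} s")
The vectorized version is usually dramatically faster (often by one to two orders of magnitude) and also uses memory more compactly, which is the main practical advantage for ML workloads.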
Part 2: Data Loading and Initial Exploration with Pandas
Pandas is a powerful library for data manipulation and analysis. Its primary data structures are the Series (1D, array-like) and the DataFrame (2D, table-like); a tiny hand-built example of each appears after the task list below. We'll use a publicly available dataset, the Palmer Penguins dataset, which is widely used as a modern alternative to the classic Iris dataset for classification.
Tasks:
- Demonstrate loading a CSV file from a URL.
- Use Pandas functions: pd.read_csv(), df.head(), df.info(), df.describe(), df.shape, df.columns, df['column'].value_counts().
- Basic data selection and filtering.
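Before loading the real dataset, here is the tiny hand-built example promised above (the values are purely illustrative and not part of the penguin data):
# --- Sketch: constructing a Series and a DataFrame by hand ---
import pandas as pd

s = pd.Series([181, 186, 195], name="flipper_length_mm")   # 1D labeled array
small_df = pd.DataFrame({                                   # 2D labeled table
    "species": ["Adelie", "Adelie", "Gentoo"],
    "flipper_length_mm": [181, 186, 217],
})
print(s)
print(small_df)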
# --- 2.1 Load a CSV file ---
# We'll use the Palmer Penguins dataset, hosted on a common GitHub repository.
# This dataset contains measurements for three species of penguins.
data_url = "https://raw.githubusercontent.com/allisonhorst/palmerpenguins/main/inst/extdata/penguins.csv"
try:
    df = pd.read_csv(data_url)
    print(f"\nSuccessfully loaded data from: {data_url}")
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please check the URL or your internet connection.")
    # Fallback: you might need to upload a local CSV if this fails persistently.
    # For now, create an empty DataFrame so later cells fail gracefully.
    df = pd.DataFrame()  # Empty DataFrame to prevent errors later

if not df.empty:
    print("DataFrame is not empty. You can proceed with data exploration!")
else:
    print("DataFrame is empty. Cannot proceed with data exploration.")
Successfully loaded data from: https://raw.githubusercontent.com/allisonhorst/palmerpenguins/main/inst/extdata/penguins.csv
DataFrame is not empty. You can proceed with data exploration!
# --- 2.2 Initial Data Inspection ---
print("\n--- df.head() (First 5 rows) ---")
print(df.head())
print("\n--- df.info() (Summary of DataFrame, Non-Null counts, dtypes) ---")
df.info()
print("\n--- df.describe() (Statistical summary of numerical columns) ---")
print(df.describe())
print("\n--- df.shape (Number of rows, columns) ---")
print(f"DataFrame shape: {df.shape} (rows, columns)")
print("\n--- df.columns (List of column names) ---")
print(f"Column names: {df.columns.tolist()}")
# --- 2.3 Basic Data Selection and Filtering ---
# Select a single column (Series)
print("\n--- Selecting a single column ('species') ---")
print(df['species'].head())
# Select multiple columns (DataFrame)
print("\n--- Selecting multiple columns ('species', 'bill_length_mm') ---")
print(df[['species', 'bill_length_mm']].head())
# Filtering rows based on a condition
print("\n--- Filtering rows where species is 'Adelie' ---")
adelie_penguins = df[df['species'] == 'Adelie']
print(adelie_penguins.head())
# Count unique values in a categorical column
print("\n--- Value counts for 'species' ---")
print(df['species'].value_counts())
print("\n--- Value counts for 'island' ---")
print(df['island'].value_counts())
--- df.head() (First 5 rows) ---
  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g     sex  year
0  Adelie  Torgersen            39.1           18.7              181.0       3750.0    male  2007
1  Adelie  Torgersen            39.5           17.4              186.0       3800.0  female  2007
2  Adelie  Torgersen            40.3           18.0              195.0       3250.0  female  2007
3  Adelie  Torgersen             NaN            NaN                NaN          NaN     NaN  2007
4  Adelie  Torgersen            36.7           19.3              193.0       3450.0  female  2007

--- df.info() (Summary of DataFrame, Non-Null counts, dtypes) ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   species            344 non-null    object
 1   island             344 non-null    object
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object
 7   year               344 non-null    int64
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB

--- df.describe() (Statistical summary of numerical columns) ---
       bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g         year
count      342.000000     342.000000         342.000000   342.000000   344.000000
mean        43.921930      17.151170         200.915205  4201.754386  2008.029070
std          5.459584       1.974793          14.061714   801.954536     0.818356
min         32.100000      13.100000         172.000000  2700.000000  2007.000000
25%         39.225000      15.600000         190.000000  3550.000000  2007.000000
50%         44.450000      17.300000         197.000000  4050.000000  2008.000000
75%         48.500000      18.700000         213.000000  4750.000000  2009.000000
max         59.600000      21.500000         231.000000  6300.000000  2009.000000

--- df.shape (Number of rows, columns) ---
DataFrame shape: (344, 8) (rows, columns)

--- df.columns (List of column names) ---
Column names: ['species', 'island', 'bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex', 'year']

--- Selecting a single column ('species') ---
0    Adelie
1    Adelie
2    Adelie
3    Adelie
4    Adelie
Name: species, dtype: object

--- Selecting multiple columns ('species', 'bill_length_mm') ---
  species  bill_length_mm
0  Adelie            39.1
1  Adelie            39.5
2  Adelie            40.3
3  Adelie             NaN
4  Adelie            36.7

--- Filtering rows where species is 'Adelie' ---
  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g     sex  year
0  Adelie  Torgersen            39.1           18.7              181.0       3750.0    male  2007
1  Adelie  Torgersen            39.5           17.4              186.0       3800.0  female  2007
2  Adelie  Torgersen            40.3           18.0              195.0       3250.0  female  2007
3  Adelie  Torgersen             NaN            NaN                NaN          NaN     NaN  2007
4  Adelie  Torgersen            36.7           19.3              193.0       3450.0  female  2007

--- Value counts for 'species' ---
species
Adelie       152
Gentoo       124
Chinstrap     68
Name: count, dtype: int64

--- Value counts for 'island' ---
island
Biscoe       168
Dream        124
Torgersen     52
Name: count, dtype: int64
Discussion Point:
- What insights can you gather about the dataset just by looking at df.info() and df.describe()?
- How does df.head() differ from df.tail()? When would you use each? (A small sketch follows below.)
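As a hint for the second question, a quick sketch (assuming the df loaded above is still in memory): df.head(n) returns the first n rows (default 5) and is handy right after loading, while df.tail(n) returns the last n rows and helps check the end of time-ordered or freshly appended data.
# --- Sketch: head() vs. tail() ---
print(df.head(3))   # first 3 rows
print(df.tail(3))   # last 3 rows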
Part 3: Data Preprocessing and Visualization
Real-world datasets are rarely clean. Data preprocessing involves handling missing values, encoding categorical features, and scaling numerical features. Visualization helps us understand data distributions and relationships.
Tasks:
- Identify and handle missing values (df.isnull().sum(), df.dropna(), df.fillna()).
- Handle categorical features (Label Encoding or One-Hot Encoding).
- Use Matplotlib and Seaborn for data visualization.
# --- 3.1 Identify and Handle Missing Values ---
print("\n--- Missing values before handling ---")
print(df.isnull().sum())
# Strategy: Drop rows with any missing values for simplicity in this intro lab.
# For more complex scenarios, imputation (e.g., mean, median, mode) might be preferred.
df_cleaned = df.dropna().copy() # .copy() to avoid SettingWithCopyWarning
print("\n--- Missing values after dropping rows ---")
print(df_cleaned.isnull().sum())
print(f"Original shape: {df.shape}, Cleaned shape: {df_cleaned.shape}")
--- Missing values before handling ---
species              0
island               0
bill_length_mm       2
bill_depth_mm        2
flipper_length_mm    2
body_mass_g          2
sex                 11
year                 0
dtype: int64

--- Missing values after dropping rows ---
species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

Original shape: (344, 8), Cleaned shape: (333, 8)
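As an alternative to dropping rows (mentioned in the comments above), here is a minimal imputation sketch, assuming the same df from Part 2; it is not used in the rest of this lab and the column choices are only illustrative:
# --- Sketch: imputing missing values instead of dropping rows ---
df_imputed = df.copy()
numeric_cols = df_imputed.select_dtypes(include=np.number).columns
for col in numeric_cols:
    # Fill numeric gaps with the column median (robust to outliers)
    df_imputed[col] = df_imputed[col].fillna(df_imputed[col].median())
# Fill the categorical 'sex' column with its most frequent value (mode)
df_imputed['sex'] = df_imputed['sex'].fillna(df_imputed['sex'].mode()[0])
print(df_imputed.isnull().sum())   # should now report zero missing values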
# --- 3.2 Handle Categorical Features ---
# The 'species' column is our target variable, which is categorical.
# The 'island' and 'sex' columns are also categorical features.
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# For the target variable ('species'), we'll use Label Encoding
# It converts categories into numerical labels (0, 1, 2...)
# This is suitable for target variables in classification where order doesn't imply meaning.
le = LabelEncoder()
df_cleaned['species_encoded'] = le.fit_transform(df_cleaned['species'])
print(f"\nOriginal species: {df_cleaned['species'].unique()}")
print(f"Encoded species: {df_cleaned['species_encoded'].unique()}")
print(f"Species mapping: {list(le.classes_)}")
# For other categorical features ('island', 'sex'), we'll use One-Hot Encoding
# This creates new binary columns for each category. It's preferred for features
# to avoid implying an arbitrary order that doesn't exist.
df_processed = pd.get_dummies(df_cleaned, columns=['island', 'sex'], drop_first=True) # drop_first avoids multicollinearity
print("\n--- DataFrame after One-Hot Encoding for 'island' and 'sex' ---")
print(df_processed.head())
print(f"Processed DataFrame shape: {df_processed.shape}")
Original species: ['Adelie' 'Gentoo' 'Chinstrap']
Encoded species: [0 2 1]
Species mapping: ['Adelie', 'Chinstrap', 'Gentoo']

--- DataFrame after One-Hot Encoding for 'island' and 'sex' ---
  species  bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  year  species_encoded  island_Dream  island_Torgersen  sex_male
0  Adelie            39.1           18.7              181.0       3750.0  2007                0         False              True      True
1  Adelie            39.5           17.4              186.0       3800.0  2007                0         False              True     False
2  Adelie            40.3           18.0              195.0       3250.0  2007                0         False              True     False
4  Adelie            36.7           19.3              193.0       3450.0  2007                0         False              True     False
5  Adelie            39.3           20.6              190.0       3650.0  2007                0         False              True      True
Processed DataFrame shape: (333, 10)
# --- 3.3 Data Visualization with Matplotlib and Seaborn ---
print("\n--- Visualizing Data ---")
# Histogram of bill_length_mm
plt.figure(figsize=(8, 5))
sns.histplot(df_processed['bill_length_mm'], kde=True, bins=20)
plt.title('Distribution of Bill Length (mm)')
plt.xlabel('Bill Length (mm)')
plt.ylabel('Count')
plt.show()
# Scatter plot of bill_length vs. bill_depth, colored by species
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_processed, x='bill_length_mm', y='bill_depth_mm', hue='species', style='species', s=100)
plt.title('Bill Length vs. Bill Depth by Species')
plt.xlabel('Bill Length (mm)')
plt.ylabel('Bill Depth (mm)')
plt.legend(title='Species')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
# Box plot of flipper_length_mm by species
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_processed, x='species', y='flipper_length_mm', palette='viridis')
plt.title('Flipper Length by Species')
plt.xlabel('Species')
plt.ylabel('Flipper Length (mm)')
plt.show()
# Correlation matrix (numerical features only)
# select_dtypes keeps only numeric columns (dropping 'species' and other strings);
# we also drop the encoded target so the heatmap shows feature-to-feature correlations only.
numerical_df = df_processed.select_dtypes(include=np.number).drop(columns=['species_encoded'])
plt.figure(figsize=(10, 8))
sns.heatmap(numerical_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features')
plt.show()
--- Visualizing Data ---
Discussion Point:
- Why is it important to handle missing values before training a machine learning model? What are some alternatives to simply dropping rows?
- Explain the difference between Label Encoding and One-Hot Encoding. When would you prefer one over the other?
Part 4: Introduction to Machine Learning (Supervised Learning)
Machine learning involves training algorithms to learn patterns from data, enabling them to make predictions or decisions without being explicitly programmed. Supervised learning is a common type where the model learns from labeled data (input-output pairs).
Tasks:
- Briefly explain Supervised Learning (Classification vs. Regression).
- Select features (X) and target (y) columns.
- Split data into training and testing sets (train_test_split).
- Apply feature scaling (StandardScaler).
- Choose and train a simple classification model (Logistic Regression).
- Make predictions on the test set.
- Evaluate the model using basic metrics (Accuracy, Classification Report, Confusion Matrix).
# --- 4.1 Supervised Learning Overview ---
# Supervised Learning: Learning from a dataset of input-output pairs.
# - Classification: Predicting a categorical (discrete) output (e.g., penguin species, spam/not spam).
# - Regression: Predicting a continuous numerical output (e.g., house price, temperature).
# Our task: Predict 'species_encoded' (0, 1, 2) based on other features, so it's a Classification problem.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# --- 4.2 Select Features (X) and Target (y) ---
X = df_processed.drop(columns=['species', 'species_encoded'])
y = df_processed['species_encoded']
print(f"\nFeatures (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"Features columns: {X.columns.tolist()}")
Features (X) shape: (333, 8)
Target (y) shape: (333,)
Features columns: ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'year', 'island_Dream', 'island_Torgersen', 'sex_male']
# --- 4.3 Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTraining set X shape: {X_train.shape}, y shape: {y_train.shape}")
print(f"Testing set X shape: {X_test.shape}, y shape: {y_test.shape}")
Training set X shape: (266, 8), y shape: (266,)
Testing set X shape: (67, 8), y shape: (67,)
# --- 4.4 Apply Feature Scaling ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nFeatures scaled successfully.")
print(f"Example of scaled data (first 5 rows of X_train_scaled):\n{X_train_scaled[:5]}")
Features scaled successfully.
Example of scaled data (first 5 rows of X_train_scaled):
[[ 0.16732502 -1.99231135  0.83478089  1.08239437 -0.01828334 -0.76994439 -0.41449508 -1.0461779 ]
 [ 1.22705014  1.25552054  0.1212471  -0.21564233  1.19755866  1.29879509 -0.41449508  0.95586038]
 [ 0.87989881 -0.52063753  1.4769613   2.19499725  1.19755866 -0.76994439 -0.41449508  0.95586038]
 [-0.47216428  0.64655206  0.04989372 -0.43198178  1.19755866 -0.76994439  2.4125739   0.95586038]
 [-1.16646695  1.05253105 -1.44852725 -1.1428114  -1.23412534  1.29879509 -0.41449508 -1.0461779 ]]
# --- 4.5 Choose and Train a Simple Classification Model (Logistic Regression) ---
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train_scaled, y_train)
print("\nLogistic Regression model trained successfully!")
Logistic Regression model trained successfully!
# --- 4.6 Make Predictions on the Test Set ---
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)
print(f"\nFirst 5 true labels (test set): {y_test.tolist()[:5]}")
print(f"First 5 predicted labels: {y_pred.tolist()[:5]}")
print(f"First 5 predicted probabilities:\n{y_pred_proba[:5]}")
First 5 true labels (test set): [2, 1, 0, 2, 2]
First 5 predicted labels: [2, 1, 0, 2, 2]
First 5 predicted probabilities:
[[2.30976158e-04 1.70105546e-02 9.82758469e-01]
 [2.80995084e-02 9.62237583e-01 9.66290876e-03]
 [9.89169208e-01 9.41251074e-03 1.41828077e-03]
 [5.67323150e-04 5.85620937e-03 9.93576467e-01]
 [8.92716738e-04 9.42269124e-03 9.89684592e-01]]
# --- 4.7 Evaluate the Model ---
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")
target_names = le.classes_
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=target_names))
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()
Model Accuracy on Test Set: 0.9851

--- Classification Report ---
              precision    recall  f1-score   support

      Adelie       1.00      0.97      0.98        29
   Chinstrap       0.93      1.00      0.97        14
      Gentoo       1.00      1.00      1.00        24

    accuracy                           0.99        67
   macro avg       0.98      0.99      0.98        67
weighted avg       0.99      0.99      0.99        67
Discussion Point:
- Why is it crucial to split data into training and testing sets? What problem does it help to address?
- What is the purpose of StandardScaler? Why might it be important for a model like Logistic Regression? (A small sketch follows below.)
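As a hint for the second question, a small sketch (assuming X_train and X_train_scaled from Part 4 are still in memory): after StandardScaler, each feature should have roughly zero mean and unit standard deviation, so no single feature dominates the model purely because of its measurement scale.
# --- Sketch: feature means/stds before and after StandardScaler ---
print("Before scaling (first 3 features):")
print(X_train.iloc[:, :3].describe().loc[['mean', 'std']])
print("\nAfter scaling (first 3 features):")
scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)
print(scaled_df.iloc[:, :3].describe().loc[['mean', 'std']])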
Part 5: Further Exploration & Ethical Considerations (Text Cell)
This section aims to provide a broader context for what we've learned and to introduce some important ethical considerations in Machine Learning.
Discussion Topics:
- Types of Machine Learning: Elaborate on Supervised, Unsupervised, and Reinforcement Learning.
- Overfitting & Underfitting: Explain these concepts and their impact.
- Model Selection: Briefly touch upon the importance of choosing the right model.
- Bias and Fairness in ML: Introduce the concept of bias in data and models, and its potential societal impact.
- Interpretability vs. Performance: Briefly discuss the trade-off between complex models and understanding their decisions.
- Limitations of Simple Models and Data: What are the inherent limitations of the basic models and preprocessing steps explored?
Types of Machine Learning:
- Supervised Learning: As seen in this lab, the model learns from a dataset of labeled examples, where each input has a corresponding correct output. The goal is to learn a mapping from inputs to outputs to make predictions on new, unseen data.
  - Classification: Predicts a discrete category (e.g., "spam" or "not spam", "cat" or "dog", penguin species).
  - Regression: Predicts a continuous numerical value (e.g., house price, temperature, stock value).
- Unsupervised Learning: The model learns from unlabeled data, aiming to discover hidden patterns, structures, or relationships within the data. There is no "correct" output to predict.
  - Clustering: Groups similar data points together (e.g., customer segmentation, grouping similar documents).
  - Dimensionality Reduction: Reduces the number of features while retaining important information (e.g., PCA for visualizing high-dimensional data).
- Reinforcement Learning: An agent learns to make decisions by interacting with an environment. It receives rewards or penalties for its actions, and its goal is to learn a policy that maximizes the cumulative reward over time. Often used in robotics, game playing (e.g., AlphaGo), and autonomous driving.
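To make the unsupervised idea concrete, here is a minimal clustering sketch (assuming df_cleaned from Part 3 is still in memory; k=3 is chosen only because we happen to know there are three species):
# --- Sketch: k-means clustering without using the species labels ---
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

features = df_cleaned[['bill_length_mm', 'bill_depth_mm',
                       'flipper_length_mm', 'body_mass_g']]
scaled_features = StandardScaler().fit_transform(features)
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(scaled_features)
# Compare the discovered clusters with the (never seen) species labels
print(pd.crosstab(df_cleaned['species'], clusters))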
Overfitting & Underfitting:
- Overfitting: Occurs when a model learns the training data too well, including the noise and random fluctuations. It performs excellently on the training data but poorly on new, unseen data. It's like memorizing answers for a test without understanding the concepts.
  - Signs: High training accuracy, significantly lower validation/test accuracy.
  - Causes: Model is too complex for the amount of data, noisy data, insufficient training data.
  - Mitigation: More data, simpler models, regularization (e.g., L1/L2 regularization in Logistic Regression), cross-validation, dropout (in neural networks).
- Underfitting: Occurs when a model is too simple to capture the underlying patterns in the data. It performs poorly on both training and test data. It's like not studying enough for a test.
  - Signs: Low training accuracy, similarly low validation/test accuracy.
  - Causes: Model is too simple, insufficient features, too much regularization.
  - Mitigation: More complex models, more relevant features, reducing regularization.
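A small sketch to make this concrete (assuming the scaled splits from Part 4 are still in memory): sweep the regularization strength C of Logistic Regression and compare training vs. test accuracy. A large gap between the two suggests overfitting; low accuracy on both suggests underfitting. The specific C values are arbitrary.
# --- Sketch: spotting over/underfitting via train vs. test accuracy ---
for C in [0.001, 0.1, 1.0, 100.0]:
    clf = LogisticRegression(C=C, max_iter=1000, random_state=42)
    clf.fit(X_train_scaled, y_train)
    train_acc = clf.score(X_train_scaled, y_train)
    test_acc = clf.score(X_test_scaled, y_test)
    print(f"C={C:>7}: train accuracy={train_acc:.3f}, test accuracy={test_acc:.3f}")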
Model Selection: Choosing the "right" machine learning model is crucial and often involves experimentation. There's no single best model for all problems. Factors influencing model selection include:
- Nature of the problem: Classification, regression, clustering, etc.
- Size and complexity of the dataset: Small datasets might prefer simpler models, while large datasets can benefit from more complex ones.
- Interpretability requirements: Some applications demand transparent models (e.g., medical diagnosis), while others prioritize performance (e.g., image recognition).
- Computational resources: Training very complex models can be time and resource intensive.
- Assumptions of the model: Different models make different assumptions about the data.
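One common way to compare candidate models in practice is cross-validation. A minimal sketch (assuming X_train_scaled and y_train from Part 4 are available; the Decision Tree is just one illustrative alternative, not a recommendation):
# --- Sketch: comparing two candidate models with 5-fold cross-validation ---
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

candidates = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
}
for name, clf in candidates.items():
    scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    print(f"{name}: mean accuracy = {scores.mean():.3f} (std = {scores.std():.3f})")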
Bias and Fairness in ML: Machine learning models are trained on data, and if that data reflects existing societal biases (e.g., historical discrimination, underrepresentation of certain groups), the model will learn and perpetuate those biases.
- Data Bias: This can occur in sampling (e.g., data collected from only one demographic), historical data (reflecting past inequalities), or measurement (e.g., sensor bias).
- Algorithmic Bias: Can arise from the design of the algorithm itself, or how it's optimized (e.g., optimizing for overall accuracy might disproportionately impact minority groups).
- Impact: Biased models can lead to unfair or discriminatory outcomes in critical applications like loan approvals, hiring, criminal justice, or healthcare.
- Mitigation: Careful data collection, bias detection and mitigation techniques, diverse development teams, fairness-aware algorithms, and continuous monitoring are essential.
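One very rough first check is to compare a model's performance across subgroups. A sketch (assuming the fitted model, X_test, X_test_scaled, and y_test from Part 4; the penguins' sex column merely stands in for a sensitive attribute, purely for illustration):
# --- Sketch: per-group accuracy as a crude fairness check ---
y_pred_test = model.predict(X_test_scaled)
for group_value, label in [(True, "male"), (False, "female")]:
    mask = (X_test['sex_male'] == group_value).to_numpy()
    if mask.sum() == 0:
        continue
    acc = accuracy_score(y_test.to_numpy()[mask], y_pred_test[mask])
    print(f"Accuracy for sex={label}: {acc:.3f} (n={mask.sum()})")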
Interpretability vs. Performance: There's often a trade-off between a model's performance (e.g., high accuracy) and its interpretability (how easily humans can understand why it made a certain prediction).
- Highly Interpretable Models: Linear Regression, Logistic Regression, Decision Trees. You can often see which features contribute most and how they influence the prediction.
- High-Performance (Often "Black Box") Models: Deep Neural Networks, Ensemble methods (Random Forests, Gradient Boosting). These models can achieve very high accuracy but their internal workings can be opaque, making it hard to explain individual predictions.
- Context Matters: For high-stakes applications (e.g., medical diagnosis, legal decisions), interpretability might be prioritized over marginal gains in performance. For other applications (e.g., image recognition, recommendation systems), high performance might be the primary goal.
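To see what "interpretable" looks like in practice, a small sketch (assuming the fitted Logistic Regression model, the feature matrix X, and the label encoder le from Parts 3-4): inspect the learned coefficients per class. On scaled features, larger absolute values indicate features that push predictions more strongly toward (positive) or away from (negative) that class.
# --- Sketch: inspecting Logistic Regression coefficients ---
coef_df = pd.DataFrame(model.coef_,          # one row of coefficients per species
                       columns=X.columns,
                       index=le.classes_)
print(coef_df.round(2))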
Limitations of Simple Models and Data:
- Linearity Assumption: Models like Logistic Regression assume a linear relationship between features and the target. If the underlying data has complex non-linear patterns, a linear model will underfit.
- Limited Feature Interaction: Simple models might struggle to capture complex interactions between multiple features.
- Data Quality: Even with powerful models, "garbage in, garbage out" applies. Poor quality data (noisy, incomplete, biased) will lead to poor model performance.
- Need for Domain Expertise: Simple models, while interpretable, might not capture nuanced domain-specific knowledge unless features are carefully engineered with that expertise.
- Scalability: While efficient for small datasets, very large datasets might still pose memory or computational challenges for even simple models if not handled appropriately.
- Feature Engineering Importance: For simple models, the quality and relevance of features are paramount. If crucial features are missing or poorly represented, the model's performance will be limited.
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd