Part 1: Data Loading & Initial Exploration¶
Task: Load the dirty_data.csv into a pandas DataFrame.
Task: Perform initial exploratory data analysis (EDA):
df.head(), df.info(), df.describe()
Check df.isnull().sum() to identify missing values.
Use df.dtypes to inspect column types.
Identify categorical vs. numerical features.
Use value_counts() for categorical columns.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer # for fancy imputation
from sklearn.impute import IterativeImputer
from sklearn.ensemble import IsolationForest # for outlier detection
# --- Part 1: Data loading & initial EDA ---
# Load the dataset; if the CSV is missing, fall back to a small in-memory
# dummy dataset so the rest of the notebook still runs end-to-end.
try:
    df = pd.read_csv('dirty_data.csv')
except FileNotFoundError:
    print("Error: 'dirty_data.csv' not found. Please ensure the dataset is in the correct directory.")
    # Students would typically download or be provided the path.
    # The dummy data deliberately contains NaNs and inconsistent labels
    # (e.g. 'blue' vs 'Blue' vs 'RED') for the cleaning exercises below.
    df = pd.DataFrame({
        'ID': range(1, 11),
        'Age': [25, 30, np.nan, 22, 45, 30, 28, 50, np.nan, 35],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male'],
        'City': ['New York', 'London', 'Paris', 'New York', 'London', 'Paris', 'New York', 'London', 'Paris', 'New York'],
        'Income': [50000, 60000, 75000, 45000, 90000, 62000, 53000, 150000, 70000, np.nan],
        'Experience_Years': [3, 7, 2, 1, 10, 5, 4, 15, 6, 8],
        'Has_Degree': ['Yes', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes'],
        'Favorite_Color': ['Blue', 'Green', 'Red', 'blue', 'Green', 'RED', 'Blue', 'Green', 'Red', np.nan],
    })
    print("Using a dummy DataFrame for demonstration.")

# Quick structural overview, missing-value counts and summary statistics.
print("Initial DataFrame Info:")
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
print("\nDescriptive Statistics:")
print(df.describe())
Part 2: Handling Missing Values¶
Task: Discuss and implement different strategies for missing data.
Numerical:
Mean/Median imputation using SimpleImputer.
More advanced imputation (e.g., IterativeImputer for multiple imputation, conceptual introduction to KNNImputer).
Categorical:
Mode imputation (SimpleImputer with strategy='most_frequent').
Filling with a constant (e.g., 'Unknown').
Focus: You should justify your choice of imputation strategy for different columns.
# --- Part 2: Missing-value handling ---
# Split features by dtype, since numeric and string columns need different
# imputers. 'ID' is an identifier, not a feature, so it is excluded.
numerical_cols = [c for c in df.select_dtypes(include=np.number).columns if c != 'ID']
categorical_cols = df.select_dtypes(include='object').columns.tolist()

print(f"Numerical columns for imputation: {numerical_cols}")
print(f"Categorical columns for imputation: {categorical_cols}")

# --- Strategy 1: Simple Imputation (Mean/Median for Numerical, Mode for Categorical) ---
print("\n--- Simple Imputation ---")
df_imputed_simple = df.copy()

# Mean imputation for numeric columns; median is the robust alternative
# when the distribution is skewed (pick based on df.describe()).
mean_imputer = SimpleImputer(strategy='mean')
df_imputed_simple[numerical_cols] = mean_imputer.fit_transform(df_imputed_simple[numerical_cols])

# Mode imputation for categorical columns; an alternative is
# strategy='constant', fill_value='Unknown' to keep "missing" visible.
mode_imputer = SimpleImputer(strategy='most_frequent')
df_imputed_simple[categorical_cols] = mode_imputer.fit_transform(df_imputed_simple[categorical_cols])

print("Missing values after simple imputation:")
print(df_imputed_simple.isnull().sum())

# --- Strategy 2 (Optional/Advanced): Iterative Imputation ---
print("\n--- Iterative Imputation (Advanced) ---")
# MICE-style imputation: each numeric column is modelled as a function of
# the others. Demonstrated on the numeric subset only.
numeric_subset = df[numerical_cols].copy()
mice_imputer = IterativeImputer(max_iter=10, random_state=0)
df_iterative_imputed = pd.DataFrame(
    mice_imputer.fit_transform(numeric_subset),
    columns=numerical_cols,
    index=numeric_subset.index,
)
print("Missing values after iterative numerical imputation (for numerical columns only):")
print(df_iterative_imputed.isnull().sum())
Part 3: Categorical Encoding¶
Task: Implement different encoding schemes.
Nominal (no order): One-Hot Encoding (OneHotEncoder). Discuss the "dummy variable trap" and drop_first=True.
Ordinal (with order): Label Encoding (LabelEncoder - warning about incorrect order for trees, or OrdinalEncoder with specified categories).
Consideration: Target Encoding (brief conceptual intro for advanced students, though implementation is more complex).
Focus: When to use which encoding, and the implications for downstream models.
print("\n--- Categorical Encoding ---")
# --- Part 3: Categorical encoding ---
# Assumes df_imputed_simple (from Part 2) has no missing values.
df_encoded = df_imputed_simple.copy()

# Nominal features: no inherent order -> one-hot encode.
# 'Has_Degree' is treated as ordinal below ('No' < 'Yes').
nominal_features = ['Gender', 'City']

# Clean the inconsistent labels in 'Favorite_Color' before any encoding.
# BUG FIX: the previous .replace({'rred': 'red'}) was a no-op — after
# lowercasing, the value 'rred' can never occur. Stripping whitespace and
# lowercasing collapses the real inconsistencies ('blue'/'Blue', 'RED'/'Red').
df_encoded['Favorite_Color'] = df_encoded['Favorite_Color'].str.strip().str.lower()

# One-Hot Encoding for nominal features.
# handle_unknown='ignore' keeps transform() from failing on unseen labels;
# sparse_output=False returns a dense array we can wrap in a DataFrame.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
encoded_features = ohe.fit_transform(df_encoded[nominal_features])
encoded_df = pd.DataFrame(
    encoded_features,
    columns=ohe.get_feature_names_out(nominal_features),
    index=df_encoded.index,
)
# Drop the raw nominal columns and append their one-hot counterparts.
df_encoded = pd.concat([df_encoded.drop(columns=nominal_features), encoded_df], axis=1)
print("\nDataFrame after One-Hot Encoding (sample columns):")
print(df_encoded.head())
print(f"New shape: {df_encoded.shape}")

# Ordinal Encoding for 'Has_Degree': explicit order maps 'No' -> 0.0, 'Yes' -> 1.0.
ordinal_feature = 'Has_Degree'
categories_order = [['No', 'Yes']]  # list of lists: one inner list per encoded column
oe = OrdinalEncoder(categories=categories_order)
df_encoded[ordinal_feature] = oe.fit_transform(df_encoded[[ordinal_feature]])
print(f"\nDataFrame after Ordinal Encoding '{ordinal_feature}':")
print(df_encoded[[ordinal_feature]].head())
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd