Data Preprocessing for Data Mining¶
Data Cleaning, Advanced Imputation (KNN Imputation), Outlier Detection and Treatment (IQR Method), Feature Engineering (Transformations, Binning), Feature Scaling, Encoding, Automated Workflows (scikit-learn Pipelines, ColumnTransformer), Data Leakage, Advanced Feature Selection (Recursive Feature Elimination).
Objectives:¶
- Students will understand the critical impact of preprocessing on model performance and reliability.
- Students will implement and evaluate advanced imputation techniques like K-Nearest Neighbors (KNN) Imputation.
- Students will detect and treat outliers using statistical methods like the Interquartile Range (IQR).
- Students will engineer new features from existing data and apply transformations to handle skewed distributions.
- Students will construct and deploy end-to-end, automated preprocessing workflows using ColumnTransformer and Pipeline to prevent data leakage.
- Students will apply advanced feature selection techniques, such as Recursive Feature Elimination (RFE), to reduce model complexity.
- Students will critically discuss the theoretical underpinnings of preprocessing choices and their practical consequences.
Setup: Install and Import Libraries¶
Google Colab comes with the necessary libraries pre-installed.
# Install necessary libraries if not already present in Colab environment
!pip install pandas numpy scikit-learn matplotlib seaborn -q
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
# Scikit-learn for preprocessing, pipelines, and models
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import Ridge
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"Pandas Version: {pd.__version__}")
print(f"NumPy Version: {np.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2 NumPy Version: 2.0.2 Scikit-learn Version: 1.6.1
Part 1: Dataset Loading and Initial Exploration¶
We will work with a regression dataset downloaded from OpenML. This lab was designed around the Ames Housing data, a classic regression challenge with 79 explanatory variables describing residential homes, whose mix of numerical and categorical features and prevalent missing data make it an ideal candidate for advanced preprocessing. Note, however, that the CSV loaded below exposes 47 anonymized numerical features (att1–att47) and a numeric 'class' target with no missing values, so some of the steps that follow (imputation, categorical encoding) demonstrate technique rather than necessity for this particular file; every technique transfers directly to richer datasets such as Ames.
Crucial First Step: Train-Test Split
We split the data before any preprocessing. This is the single most important step to prevent data leakage, where information from the test set inadvertently influences the training process, leading to overly optimistic performance estimates.
print("--- Part 1: Dataset Loading and Initial Exploration ---")
# Load the dataset from a public URL
url = 'https://www.openml.org/data/get_csv/22/dataset_22_housing.arff'
df = pd.read_csv(url)
# Note: this CSV has no 'SalePrice' column; the target variable is in the 'class' column
# 1. Separate target variable from features
X = df.drop('class', axis=1)
y = df['class']
# 2. Split data into training and testing sets BEFORE any analysis
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")
# 3. Identify columns with missing values in the training set
missing_vals = X_train.isnull().sum()
missing_vals = missing_vals[missing_vals > 0].sort_values(ascending=False)
print("\nFeatures with Missing Values in Training Data (Top 10):\n", missing_vals.head(10))
# 4. Visualize the target variable's distribution
sns.histplot(y_train, kde=True)
plt.title('Distribution of Target Variable (class)')
plt.show()
--- Part 1: Dataset Loading and Initial Exploration --- Training data shape: (1600, 47) Testing data shape: (400, 47) Features with Missing Values in Training Data (Top 10): Series([], dtype: int64)
Part 2: Advanced Imputation: Beyond the Mean¶
Simple imputation (mean, median, mode) is fast but can distort the underlying data distribution and correlations. KNN Imputation is a more sophisticated method that imputes a missing value based on the values of its "nearest neighbors" in the feature space, thus preserving local data structure.
Tasks:
- Identify numerical features.
- Apply KNNImputer to fill missing values in the numerical features. Note that KNN works best with scaled data, a step we'll later integrate into a full pipeline (a minimal sketch follows this part's code).
print("\n--- Part 2: Advanced Imputation with KNNImputer ---")
# 1. Identify numerical features
numerical_features = X_train.select_dtypes(include=np.number).columns
# 2. Initialize KNN Imputer and Scaler
# n_neighbors is a hyperparameter; 5 is a common default.
scaler = StandardScaler()
imputer = KNNImputer(n_neighbors=5)
# 3. Create a temporary copy to avoid modifying the original training data
X_train_temp_imputed = X_train.copy()
# 4. Scale, then impute on the numerical features
X_train_scaled = scaler.fit_transform(X_train_temp_imputed[numerical_features])
X_train_imputed_scaled = imputer.fit_transform(X_train_scaled)
# 5. Inverse transform to get the data back to its original scale
X_train_temp_imputed[numerical_features] = scaler.inverse_transform(X_train_imputed_scaled)
print("\nKNN Imputation demonstrated successfully.")
--- Part 2: Advanced Imputation with KNNImputer --- KNN Imputation demonstrated successfully.
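The manual scale → impute → inverse-transform sequence above can be collapsed into a single object. Below is a minimal sketch (not part of the original lab code) that chains the two steps in a Pipeline, using the classes imported in the setup cell and the X_train and numerical_features defined earlier; the output stays in standardized units, which is exactly what a downstream model would consume.
# Minimal sketch: scaling + KNN imputation chained in one Pipeline, fit on the
# training split only (assumes X_train and numerical_features from above).
knn_numeric_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),           # KNN distances are only meaningful on scaled data
    ('imputer', KNNImputer(n_neighbors=5))
])
X_train_num_imputed = knn_numeric_pipeline.fit_transform(X_train[numerical_features])
print(f"Imputed numerical block shape: {X_train_num_imputed.shape}")
# On the test split we would call only knn_numeric_pipeline.transform(...), never fit.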
Part 3: Outlier Detection and Treatment¶
Outliers can disproportionately influence model parameters, especially in linear models. The IQR method defines outliers as data points that fall below Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. A common treatment is winsorizing (capping), where outliers are replaced with the boundary value.
Tasks:
- Visualize a feature to spot outliers.
- Calculate IQR boundaries.
- Cap the outliers using np.clip.
print("\n--- Part 3: Outlier Detection and Treatment ---")
# We will use a copy of the training data for this demonstration
X_train_temp_outlier = X_train.copy()
# For this example, fill NaNs with the median so we can focus on the outlier logic
# Let's pick a numerical column to demonstrate outlier treatment, e.g., 'att47'
feature_to_analyze = 'att47'
X_train_temp_outlier[feature_to_analyze] = X_train_temp_outlier[feature_to_analyze].fillna(X_train_temp_outlier[feature_to_analyze].median())
# 1. Visualize a feature known for outliers, like 'att47'
sns.boxplot(x=X_train_temp_outlier[feature_to_analyze])
plt.title(f"Boxplot of '{feature_to_analyze}' Before Outlier Treatment")
plt.show()
# 2. Calculate IQR and boundaries for 'att47'
Q1 = X_train_temp_outlier[feature_to_analyze].quantile(0.25)
Q3 = X_train_temp_outlier[feature_to_analyze].quantile(0.75)
IQR = Q3 - Q1
upper_bound = Q3 + 1.5 * IQR
lower_bound = Q1 - 1.5 * IQR
print(f"Lower bound for '{feature_to_analyze}' outliers: {lower_bound:.2f}")
print(f"Upper bound for '{feature_to_analyze}' outliers: {upper_bound:.2f}")
# 3. Cap the outliers
original_max = X_train_temp_outlier[feature_to_analyze].max()
original_min = X_train_temp_outlier[feature_to_analyze].min()
X_train_temp_outlier[feature_to_analyze] = np.clip(X_train_temp_outlier[feature_to_analyze], a_min=lower_bound, a_max=upper_bound)
capped_max = X_train_temp_outlier[feature_to_analyze].max()
capped_min = X_train_temp_outlier[feature_to_analyze].min()
print(f"Original min value: {original_min:.2f}, Original max value: {original_max:.2f}")
print(f"Min value after capping: {capped_min:.2f}, Max value after capping: {capped_max:.2f}")
# Visualize after capping
sns.boxplot(x=X_train_temp_outlier[feature_to_analyze])
plt.title(f"Boxplot of '{feature_to_analyze}' After Outlier Treatment")
plt.show()
--- Part 3: Outlier Detection and Treatment ---
Lower bound for 'att47' outliers: 301.07 Upper bound for 'att47' outliers: 711.43 Original min value: 322.23, Original max value: 740.95 Min value after capping: 322.23, Max value after capping: 711.43
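To fold this capping step into the automated workflow built in the next part, the IQR logic can be wrapped in a small custom transformer. The class below is a minimal sketch (IQRCapper is our own illustrative helper, not a scikit-learn class): fit learns the bounds from the training data only, and transform clips whatever data it is later given.
# Minimal sketch: IQR capping as a reusable, pipeline-compatible transformer
# (IQRCapper is a hypothetical helper written for this lab, not part of scikit-learn).
from sklearn.base import BaseEstimator, TransformerMixin

class IQRCapper(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor

    def fit(self, X, y=None):
        X = np.asarray(X, dtype=float)
        q1 = np.nanpercentile(X, 25, axis=0)   # bounds are learned per column...
        q3 = np.nanpercentile(X, 75, axis=0)   # ...from the training data only
        iqr = q3 - q1
        self.lower_ = q1 - self.factor * iqr
        self.upper_ = q3 + self.factor * iqr
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        return np.clip(X, self.lower_, self.upper_)  # winsorize using the stored bounds

# Usage on the same feature analyzed above
capper = IQRCapper().fit(X_train[[feature_to_analyze]])
print(f"Capped max: {capper.transform(X_train[[feature_to_analyze]]).max():.2f}")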
Part 4: Building an Automated Preprocessing Pipeline¶
Manually applying steps is tedious and prone to data leakage. The scikit-learn Pipeline and ColumnTransformer are the industry-standard tools to build robust, reproducible, and leak-proof workflows. We will build a single object that handles all preprocessing steps correctly for both numerical and categorical data.
Tasks:
- Define separate pipelines for numerical and categorical features.
- Combine these pipelines using ColumnTransformer.
- Create a full pipeline that includes the preprocessor and a final regression model.
- Train and evaluate the entire pipeline in one step.
print("\n--- Part 4: Building an Automated Preprocessing Pipeline ---")
# 1. Identify numerical and categorical feature names from the original DataFrame
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()
# 2. Create the preprocessing pipeline for numerical data
# This pipeline will first impute missing values with the median, then scale the data.
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
# 3. Create the preprocessing pipeline for categorical data
# This pipeline will impute missing values with the most frequent category, then one-hot encode.
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
# 4. Use ColumnTransformer to apply different transformers to different columns
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numerical_features),
('cat', categorical_transformer, categorical_features)
])
# 5. Create the full pipeline including the preprocessor and a model
model_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('regressor', Ridge(alpha=10))
])
# 6. Train and evaluate the pipeline (with log-transformed target for better performance)
y_train_log = np.log1p(y_train)
y_test_log = np.log1p(y_test)
model_pipeline.fit(X_train, y_train_log)
score = model_pipeline.score(X_test, y_test_log)
print(f"\nPipeline trained successfully.")
print(f"Model R^2 score on test data: {score:.4f}")
--- Part 4: Building an Automated Preprocessing Pipeline --- Pipeline trained successfully. Model R^2 score on test data: 0.5781
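Because the target was transformed with np.log1p before fitting, the pipeline's predictions come out on the log scale. A minimal sketch of converting them back to the original units with np.expm1, using the model_pipeline and X_test already defined:
# Predictions are on the log1p scale; invert with expm1 to recover original units.
y_pred_log = model_pipeline.predict(X_test)
y_pred = np.expm1(y_pred_log)
print(f"First five predictions (original units): {np.round(y_pred[:5], 2)}")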
Part 5: Advanced Feature Selection with RFE¶
With many features (especially after one-hot encoding), feature selection is vital to reduce model complexity and prevent overfitting. Recursive Feature Elimination (RFE) is a powerful wrapper method that recursively fits a model and removes the weakest feature(s) until the desired number of features is reached.
Tasks:
- Integrate an RFE step into our full pipeline.
- Train the new pipeline and compare its performance.
print("\n--- Part 5: Advanced Feature Selection with RFE ---")
# 1. Create a new pipeline with an RFE step
# RFE needs an estimator to judge feature importance. A simple linear model is a good choice.
rfe_pipeline = Pipeline(steps=[
('preprocessor', preprocessor),
('selector', RFE(estimator=Ridge(), n_features_to_select=50, step=0.1)), # keep the 50 strongest features; with this dataset only 47 remain after preprocessing, so RFE retains them all (hence the identical scores below)
('regressor', Ridge(alpha=10))
])
# 2. Train and evaluate the RFE pipeline
rfe_pipeline.fit(X_train, y_train_log)
rfe_score = rfe_pipeline.score(X_test, y_test_log)
print(f"RFE Pipeline trained successfully.")
print(f"Original Model R^2 score: {score:.4f}")
print(f"RFE Model R^2 score: {rfe_score:.4f}")
# 3. Inspect which features were selected by RFE
selected_mask = rfe_pipeline.named_steps['selector'].support_
# Get the feature names *after* preprocessing from the fitted pipeline
preprocessed_feature_names = rfe_pipeline.named_steps['preprocessor'].get_feature_names_out()
# Apply the RFE selection mask to the preprocessed feature names
selected_feature_names = preprocessed_feature_names[selected_mask]
print("\nA few features selected by RFE:")
print(selected_feature_names[:10])
--- Part 5: Advanced Feature Selection with RFE --- RFE Pipeline trained successfully. Original Model R^2 score: 0.5781 RFE Model R^2 score: 0.5781 A few features selected by RFE: ['num__att1' 'num__att2' 'num__att3' 'num__att4' 'num__att5' 'num__att6' 'num__att7' 'num__att8' 'num__att9' 'num__att10']
Part 6: Advanced Topics & Discussion¶
This section is for broader discussion and conceptual understanding.
Data Leakage (Recap): Data leakage is the cardinal sin of machine learning. It occurs when your training data contains information about the target that would not be available at prediction time.
- How it Happens: Fitting any preprocessing step (like a scaler or imputer) on the entire dataset before splitting allows the training set to "learn" from the test set's distribution (e.g., its mean or max value).
- The Solution: Always split your data first. Then, use scikit-learn Pipelines, which correctly fit_transform on the training data and only transform the test data, perfectly mimicking a real-world scenario (see the sketch below).
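As a concrete illustration, the following minimal sketch (on synthetic data via make_regression, not the housing data) contrasts the leaky pattern with the leak-free pipeline pattern:
# Minimal sketch: leaky vs. leak-free scaling on synthetic data.
from sklearn.datasets import make_regression

X_demo, y_demo = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=0)

# LEAKY: the scaler is fit on ALL rows, so training statistics absorb test-set information
X_scaled_all = StandardScaler().fit_transform(X_demo)
X_tr_leak, X_te_leak, y_tr_leak, y_te_leak = train_test_split(X_scaled_all, y_demo, random_state=0)

# LEAK-FREE: split first, then let the Pipeline fit the scaler on the training fold only
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
leak_free = Pipeline(steps=[('scaler', StandardScaler()), ('model', Ridge())])
leak_free.fit(X_tr, y_tr)                                        # fit_transform on X_tr only
print(f"Leak-free test R^2: {leak_free.score(X_te, y_te):.4f}")  # X_te is only transformed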
The Curse of Dimensionality: As the number of features (dimensions) increases, the volume of the feature space grows exponentially. This makes the data incredibly sparse, increasing the risk of overfitting because it's easier to find spurious correlations in high-dimensional space.
- Mitigation: Techniques like Feature Selection (e.g., RFE) and Dimensionality Reduction (e.g., Principal Component Analysis - PCA) are essential for combating this curse by reducing the number of features while retaining important information.
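For instance, PCA slots into the same pipeline pattern built in Part 4. The sketch below is a minimal illustration, not part of the original lab: n_components=20 is an arbitrary choice, and since all features in this dataset are numerical the preprocessed matrix is dense, which plain PCA requires (with sparse one-hot output, TruncatedSVD is the usual alternative).
# Minimal sketch: compressing the preprocessed features with PCA before the regressor
# (reuses 'preprocessor' from Part 4; n_components=20 is an illustrative value).
from sklearn.decomposition import PCA

pca_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=20)),
    ('regressor', Ridge(alpha=10))
])
pca_pipeline.fit(X_train, y_train_log)
print(f"PCA Pipeline R^2 score: {pca_pipeline.score(X_test, y_test_log):.4f}")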
Feature Engineering vs. Feature Selection: These are two sides of the same coin: improving model performance by optimizing features.
- Feature Engineering: The creative process of creating new features from existing ones (e.g., combining two variables, extracting a date component, binning; see the sketch after this list). Its goal is to provide the model with more relevant information.
- Feature Selection: The process of removing irrelevant or redundant features. Its goal is to simplify the model, reduce overfitting, and improve training speed.
- The Workflow: Often, you will first engineer many potentially useful features and then use feature selection to prune them down to the most impactful subset.
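A minimal sketch of the engineering operations described above, on a small made-up DataFrame (column names such as first_floor_sqft are illustrative, not from the dataset loaded in Part 1):
# Minimal sketch of feature engineering on a toy DataFrame
# (the columns are invented for illustration only).
toy = pd.DataFrame({
    'first_floor_sqft': [850, 1200, 640, 1500],
    'second_floor_sqft': [0, 600, 0, 700],
    'sale_date': pd.to_datetime(['2009-05-01', '2010-07-15', '2008-11-30', '2010-03-20'])
})
toy['total_sqft'] = toy['first_floor_sqft'] + toy['second_floor_sqft']   # combine two variables
toy['sale_year'] = toy['sale_date'].dt.year                              # extract a date component
toy['size_band'] = pd.cut(toy['total_sqft'],                             # bin a continuous feature
                          bins=[0, 1000, 1800, np.inf],
                          labels=['small', 'medium', 'large'])
print(toy[['total_sqft', 'sale_year', 'size_band']])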
Summary of Best Practices:
- ✅ Split First: Always perform your train-test split before any other preprocessing step.
- 🤖 Automate with Pipelines: Use sklearn.pipeline.Pipeline and ColumnTransformer to create a single, robust object for your entire preprocessing and modeling workflow.
- 🧐 Diagnose Your Data: Before choosing techniques, explore your data. Check for missing values, identify outliers with boxplots, and view distributions with histograms.
- ⚖️ Scale Numerical Data: Always apply a scaler (like StandardScaler) to numerical features, especially for distance-based algorithms (like KNN) or regularized models (like Ridge/Lasso).
- ✂️ Select, Don't Hoard: Don't assume more features are better. Use feature selection to build simpler, more generalizable models.
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd