In [ ]:
# Install necessary libraries (if not already present in Colab environment)
!pip install numpy pandas matplotlib seaborn scikit-learn
In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn for Linear Regression models, preprocessing, and metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer # For combining transformers
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

# Set a consistent plotting style
sns.set_theme(style="whitegrid")

Part 1: Understanding Linear Regression - The Basics

Linear Regression is a fundamental statistical model and a workhorse in Machine Learning for predicting a continuous target variable. The core idea is to model the relationship between a dependent variable (the target) and one or more independent variables (features) by fitting a linear equation to the observed data.

1.1 The Goal: To find the "best-fit" straight line (or hyperplane in higher dimensions) that minimizes the overall discrepancy between the observed data points and the line, measured as the sum of squared vertical distances.

1.2 Simple Linear Regression Equation: When you have one independent variable ($x$) and one dependent variable ($y$), the relationship is modeled by a straight line: $$y = \beta_0 + \beta_1 x + \epsilon$$ Where:

  • $y$: The dependent variable (what we want to predict).
  • $x$: The independent variable (feature).
  • $\beta_0$: The y-intercept (the value of $y$ when $x = 0$).
  • $\beta_1$: The slope of the line (how much $y$ changes for a one-unit change in $x$).
  • $\epsilon$: The error term or residual (the difference between the actual value and the predicted value, representing noise or unmodeled factors).

1.3 The Least Squares Method: How do we find the "best-fit" line? Linear Regression typically uses the Ordinary Least Squares (OLS) method. OLS aims to minimize the Sum of Squared Residuals (SSR) or Mean Squared Error (MSE). A residual is the vertical distance between an actual data point and the regression line (the predicted value). By minimizing the sum of these squared distances, we find the line that best approximates the overall trend of the data.

Intuition: Imagine plotting data points on a graph. The goal of linear regression is to draw a line through these points such that the total "vertical distance" from each point to the line is as small as possible, where "distance" is measured as squared errors to penalize larger errors more heavily.
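
To make the least-squares idea concrete, here is a minimal, self-contained sketch (the tiny dataset and the two candidate lines below are purely illustrative): it computes the Sum of Squared Residuals for two candidate lines, and OLS simply prefers whichever line has the smaller SSR.

In [ ]:
import numpy as np

# Tiny illustrative dataset (values chosen by hand for this sketch)
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([2.9, 5.1, 6.8, 9.2, 10.9])

def ssr(intercept, slope):
    """Sum of squared residuals for the candidate line y = intercept + slope * x."""
    residuals = y - (intercept + slope * x)
    return np.sum(residuals ** 2)

# OLS picks the line with the smallest SSR over all possible intercepts and slopes
print(f"SSR for y = 1.0 + 2.0x: {ssr(1.0, 2.0):.3f}")
print(f"SSR for y = 0.0 + 2.5x: {ssr(0.0, 2.5):.3f}")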

In [3]:
# Let's create a simple conceptual dataset to visualize
np.random.seed(42) # for reproducibility
X_concept = np.random.rand(50) * 10 # 50 random values between 0 and 10
# True relationship: y = 2 * x + 5 + some noise
y_concept = 2 * X_concept + 5 + np.random.randn(50) * 2

plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_concept, y=y_concept, color='blue', alpha=0.7)
plt.title('Conceptual Data for Simple Linear Regression')
plt.xlabel('Independent Variable (X)')
plt.ylabel('Dependent Variable (Y)')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
print("\nVisualizing the conceptual goal: finding a line that best fits these points.")
[Figure: 'Conceptual Data for Simple Linear Regression' (scatter plot)]
Visualizing the conceptual goal: finding a line that best fits these points.

Discussion Point:

  • In the simple linear regression equation, what does a positive $\beta_1$ (slope) indicate about the relationship between $x$ and $y$? What about a negative $\beta_1$?
  • Why do we minimize the squared errors instead of just the absolute errors? (Hint: consider positive and negative errors, and penalizing large errors.)

Part 2: Simple Linear Regression - Manual Calculation (Conceptual)

To truly understand how linear regression works, let's manually calculate the slope and intercept for a very small dataset. This demonstrates the core idea behind the Least Squares Method.

The formulas for the OLS coefficients ($\beta_0$ and $\beta_1$) are:

$$\beta_1 = \frac{\sum_{i=1}^{n} (x_i - \bar{x})(y_i - \bar{y})}{\sum_{i=1}^{n} (x_i - \bar{x})^2}$$

$$\beta_0 = \bar{y} - \beta_1 \bar{x}$$

Where:

  • $x_i, y_i$: Individual data points.
  • $\bar{x}, \bar{y}$: Means of $x$ and $y$ respectively.
  • $n$: Number of data points.

Tasks:

  • Create a small, synthetic dataset.
  • Manually calculate $\bar{x}$ and $\bar{y}$.
  • Manually calculate $\beta_1$ and $\beta_0$.
  • Plot the original data points and the manually calculated regression line.
  • Calculate and visualize residuals.
In [4]:
# Small synthetic dataset
x_manual = np.array([1, 2, 3, 4, 5])
y_manual = np.array([3, 5, 4, 7, 6]) # Slightly noisy linear relationship

print(f"X data: {x_manual}")
print(f"Y data: {y_manual}")

# 2.1 Calculate means
x_bar = np.mean(x_manual)
y_bar = np.mean(y_manual)
print(f"\nMean of X (x_bar): {x_bar}")
print(f"Mean of Y (y_bar): {y_bar}")

# 2.2 Calculate Beta_1 (slope)
numerator = np.sum((x_manual - x_bar) * (y_manual - y_bar))
denominator = np.sum((x_manual - x_bar)**2)
beta_1_manual = numerator / denominator
print(f"\nCalculated Slope (Beta_1): {beta_1_manual:.4f}")

# 2.3 Calculate Beta_0 (intercept)
beta_0_manual = y_bar - beta_1_manual * x_bar
print(f"Calculated Intercept (Beta_0): {beta_0_manual:.4f}")

print(f"\nManually calculated Regression Line: y = {beta_0_manual:.4f} + {beta_1_manual:.4f} * x")

# 2.4 Plot original data and the manually calculated regression line
plt.figure(figsize=(8, 6))
sns.scatterplot(x=x_manual, y=y_manual, color='blue', s=100, label='Actual Data Points')

# Generate points on the regression line for plotting
x_line = np.array([x_manual.min(), x_manual.max()])
y_line_manual = beta_0_manual + beta_1_manual * x_line
plt.plot(x_line, y_line_manual, color='red', linestyle='-', linewidth=2, label='Manual Regression Line')

plt.title('Simple Linear Regression: Manual Calculation')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# 2.5 Calculate and visualize residuals
y_pred_manual = beta_0_manual + beta_1_manual * x_manual
residuals_manual = y_manual - y_pred_manual

print(f"\nPredicted Y values (manual): {y_pred_manual}")
print(f"Residuals (manual): {residuals_manual}")
print(f"Sum of Squared Residuals (manual): {np.sum(residuals_manual**2):.4f}")

plt.figure(figsize=(8, 6))
sns.scatterplot(x=x_manual, y=y_manual, color='blue', s=100, label='Actual Data Points')
plt.plot(x_line, y_line_manual, color='red', linestyle='-', linewidth=2, label='Regression Line')
# Plot residuals as vertical dashed lines
for i in range(len(x_manual)):
    plt.plot([x_manual[i], x_manual[i]], [y_manual[i], y_pred_manual[i]], 'g--', alpha=0.7)
plt.title('Simple Linear Regression: Residuals')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
X data: [1 2 3 4 5]
Y data: [3 5 4 7 6]

Mean of X (x_bar): 3.0
Mean of Y (y_bar): 5.0

Calculated Slope (Beta_1): 0.8000
Calculated Intercept (Beta_0): 2.6000

Manually calculated Regression Line: y = 2.6000 + 0.8000 * x
[Figure: 'Simple Linear Regression: Manual Calculation' (data points with fitted line)]
Predicted Y values (manual): [3.4 4.2 5.  5.8 6.6]
Residuals (manual): [-0.4  0.8 -1.   1.2 -0.6]
Sum of Squared Residuals (manual): 3.6000
[Figure: 'Simple Linear Regression: Residuals' (residuals drawn as vertical dashed lines)]

Discussion Point:

  • Why are the residuals important? What does it mean if a residual is positive or negative?
  • Could you apply the manual calculation method to a dataset with 1000 data points? Why or why not? What does this imply about the need for libraries?

Part 3: Simple Linear Regression with Scikit-learn

While manual calculation is good for understanding, in practice, we use libraries like scikit-learn which provide optimized and robust implementations.

Tasks:

  • Re-use the X_concept, y_concept dataset from Part 1.
  • Use LinearRegression from sklearn.linear_model.
  • Train the model (.fit()).
  • Access the coef_ (slope) and intercept_ (intercept) attributes.
  • Make predictions (.predict()).
  • Plot the data and the scikit-learn regression line.
  • Compare results with the manual calculation's intuition.
In [5]:
# Re-use the conceptual data
# X needs to be 2D for scikit-learn (e.g., (n_samples, n_features))
X_concept_reshaped = X_concept.reshape(-1, 1) # Convert 1D array to 2D column vector

# 3.1 Create and train the Linear Regression model
model_sklearn_simple = LinearRegression()
model_sklearn_simple.fit(X_concept_reshaped, y_concept)

print("Scikit-learn Simple Linear Regression model trained.")

# 3.2 Access coefficients and intercept
beta_1_sklearn = model_sklearn_simple.coef_[0] # Coefficient for the single feature
beta_0_sklearn = model_sklearn_simple.intercept_
print(f"\nScikit-learn Slope (Beta_1): {beta_1_sklearn:.4f}")
print(f"Scikit-learn Intercept (Beta_0): {beta_0_sklearn:.4f}")

print(f"\nScikit-learn Regression Line: y = {beta_0_sklearn:.4f} + {beta_1_sklearn:.4f} * x")

# 3.3 Make predictions
y_pred_sklearn_simple = model_sklearn_simple.predict(X_concept_reshaped)

# 3.4 Plot data and the scikit-learn regression line
plt.figure(figsize=(8, 6))
sns.scatterplot(x=X_concept, y=y_concept, color='blue', alpha=0.7, label='Actual Data Points')
plt.plot(X_concept, y_pred_sklearn_simple, color='green', linestyle='-', linewidth=2, label='Scikit-learn Regression Line')
plt.title('Simple Linear Regression with Scikit-learn')
plt.xlabel('Independent Variable (X)')
plt.ylabel('Dependent Variable (Y)')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

print("\nNotice how the scikit-learn line also closely fits the data, similar to the manual intuition.")
Scikit-learn Simple Linear Regression model trained.

Scikit-learn Slope (Beta_1): 1.9553
Scikit-learn Intercept (Beta_0): 5.1934

Scikit-learn Regression Line: y = 5.1934 + 1.9553 * x
[Figure: 'Simple Linear Regression with Scikit-learn' (data points with fitted line)]
Notice how the scikit-learn line also closely fits the data, similar to the manual intuition.

Discussion Point:

  • Compare the slope and intercept from your manual calculation (Part 2) with the scikit-learn results. Are they similar? What factors might cause slight differences (if any)?
  • What is the advantage of using scikit-learn for linear regression compared to manual calculation, especially for larger datasets?

Part 4: Multiple Linear Regression

Most real-world problems involve more than one independent variable. Multiple Linear Regression extends the concept to handle multiple features.

The equation becomes: $$y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \ldots + \beta_k x_k + \epsilon$$ Where:

  • $x_1, x_2, \ldots, x_k$: The multiple independent variables (features).
  • $\beta_1, \beta_2, \ldots, \beta_k$: The coefficients for each feature, indicating the change in $y$ for a one-unit change in that feature, holding all other features constant.

Tasks:

  • Load a more complex, real-world regression dataset (e.g., Abalone dataset for predicting age from physical measurements).
  • Perform initial data exploration and preprocessing (missing values, categorical features).
  • Define features (X) and target (y).
  • Split data into training and testing sets.
  • Apply feature scaling to numerical features.
  • Train a LinearRegression model with multiple features.
  • Access coefficients for each feature and the intercept. Interpret the meaning of these coefficients.
  • Make predictions on the test set.
In [42]:
# --- 4.1 Load a more complex, real-world regression dataset ---
# We'll use the Abalone dataset, predicting age from physical measurements.
# It's a common dataset for regression tasks.
# Data source: https://archive.ics.uci.edu/ml/datasets/Abalone
# For simplicity, we'll use a version readily available from a GitHub raw URL.

abalone_url = "https://raw.githubusercontent.com/TheBabu/Abalone-Machine-Learning/master/abalone.csv"

column_names = ['Sex', 'Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight',
                'Viscera_weight', 'Shell_weight', 'Rings']

try:
    # header=0 consumes the file's own header row; names= then assigns our column names
    df_abalone = pd.read_csv(abalone_url, header=0, names=column_names)
    print(f"\nSuccessfully loaded Abalone data from: {abalone_url}")
except Exception as e:
    print(f"Error loading Abalone data: {e}")
    df_abalone = pd.DataFrame() # Empty DataFrame to prevent errors later
Successfully loaded Abalone data from: https://raw.githubusercontent.com/TheBabu/Abalone-Machine-Learning/master/abalone.csv
In [44]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# --- 4.1 Initial inspection ---
print("\n--- Abalone DataFrame Head ---")
print(df_abalone.head())

print("\n--- Abalone DataFrame Info ---")
df_abalone.info()

print("\n--- Abalone DataFrame Description ---")
print(df_abalone.describe())
--- Abalone DataFrame Head ---
  Sex  Length  Diameter  Height  Whole_weight  Shucked_weight  Viscera_weight  \
0   M   0.455     0.365   0.095        0.5140          0.2245          0.1010   
1   M   0.350     0.265   0.090        0.2255          0.0995          0.0485   
2   F   0.530     0.420   0.135        0.6770          0.2565          0.1415   
3   M   0.440     0.365   0.125        0.5160          0.2155          0.1140   
4   I   0.330     0.255   0.080        0.2050          0.0895          0.0395   

   Shell_weight   Age  
0         0.150  16.5  
1         0.070   8.5  
2         0.210  10.5  
3         0.155  11.5  
4         0.055   8.5  

--- Abalone DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole_weight    4177 non-null   float64
 5   Shucked_weight  4177 non-null   float64
 6   Viscera_weight  4177 non-null   float64
 7   Shell_weight    4177 non-null   float64
 8   Age             4177 non-null   float64
dtypes: float64(8), object(1)
memory usage: 293.8+ KB

--- Abalone DataFrame Description ---
            Length     Diameter       Height  Whole_weight  Shucked_weight  \
count  4177.000000  4177.000000  4177.000000   4177.000000     4177.000000   
mean      0.523992     0.407881     0.139516      0.828742        0.359367   
std       0.120093     0.099240     0.041827      0.490389        0.221963   
min       0.075000     0.055000     0.000000      0.002000        0.001000   
25%       0.450000     0.350000     0.115000      0.441500        0.186000   
50%       0.545000     0.425000     0.140000      0.799500        0.336000   
75%       0.615000     0.480000     0.165000      1.153000        0.502000   
max       0.815000     0.650000     1.130000      2.825500        1.488000   

       Viscera_weight  Shell_weight          Age  
count     4177.000000   4177.000000  4177.000000  
mean         0.180594      0.238831    11.433684  
std          0.109614      0.139203     3.224169  
min          0.000500      0.001500     2.500000  
25%          0.093500      0.130000     9.500000  
50%          0.171000      0.234000    10.500000  
75%          0.253000      0.329000    12.500000  
max          0.760000      1.005000    30.500000  
In [43]:
# --- 4.2 Data Preprocessing for Abalone Dataset ---
# Convert numerical columns to numeric, coercing errors
for col in ['Length', 'Diameter', 'Height', 'Whole_weight',
            'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Rings']:
    # Columns should already be numeric (header=0 consumed the header row); coerce guards against stray non-numeric values
    df_abalone[col] = pd.to_numeric(df_abalone[col], errors='coerce')

# Drop rows with NaN values that resulted from coercion (e.g., any true non-numeric)
df_abalone.dropna(inplace=True)

# Now 'Rings' should be numeric, so we can calculate 'Age'
# Age is Rings + 1.5 according to dataset documentation
df_abalone['Age'] = df_abalone['Rings'] + 1.5
df_abalone = df_abalone.drop('Rings', axis=1)

# Check for missing values again after dropping NaNs
print("\nMissing values in Abalone dataset after cleaning:\n", df_abalone.isnull().sum())

# Define features
numerical_features = ['Length', 'Diameter', 'Height', 'Whole_weight',
                      'Shucked_weight', 'Viscera_weight', 'Shell_weight']
categorical_features = ['Sex']

# Define the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

Missing values in Abalone dataset after cleaning:
 Sex               0
Length            0
Diameter          0
Height            0
Whole_weight      0
Shucked_weight    0
Viscera_weight    0
Shell_weight      0
Age               0
dtype: int64

In [45]:
# --- 4.3 Define Features (X) and Target (y) ---
X = df_abalone.drop('Age', axis=1)
y = df_abalone['Age']

# --- 4.4 Split Data into Training and Testing Sets ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set X shape: {X_train.shape}, y shape: {y_train.shape}")
print(f"Testing set X shape: {X_test.shape}, y shape: {y_test.shape}")
Training set X shape: (3341, 8), y shape: (3341,)
Testing set X shape: (836, 8), y shape: (836,)
In [46]:
# --- 4.5 Apply Feature Scaling and One-Hot Encoding ---
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Get feature names
num_feat_names = numerical_features
# Ensure we get the correct feature names from the OneHotEncoder
cat_feat_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
all_feature_names = num_feat_names + list(cat_feat_names)


print(f"\nProcessed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")
print(f"All feature names after preprocessing: {all_feature_names}")
Processed X_train shape: (3341, 10)
Processed X_test shape: (836, 10)
All feature names after preprocessing: ['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight', 'Sex_F', 'Sex_I', 'Sex_M']
In [47]:
# --- 4.6 Train a Linear Regression Model ---
model_sklearn_multiple = LinearRegression()
model_sklearn_multiple.fit(X_train_processed, y_train)

print("\nMultiple Linear Regression model trained successfully!")

# --- 4.7 Access coefficients and intercept ---
print(f"\nModel Intercept (Beta_0): {model_sklearn_multiple.intercept_:.4f}")
print("\nModel Coefficients (Beta_i for each feature):")
for i, coef in enumerate(model_sklearn_multiple.coef_):
    print(f"  {all_feature_names[i]}: {coef:.4f}")
Multiple Linear Regression model trained successfully!

Model Intercept (Beta_0): 11.4330

Model Coefficients (Beta_i for each feature):
  Length: -0.0240
  Diameter: 1.0976
  Height: 0.4440
  Whole_weight: 4.3903
  Shucked_weight: -4.5169
  Viscera_weight: -1.0460
  Shell_weight: 1.2302
  Sex_F: 0.2052
  Sex_I: -0.5137
  Sex_M: 0.3085
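
A note on interpretation: because StandardScaler was applied, each numerical coefficient above is the estimated change in Age for a one-standard-deviation change in that feature (not one original unit), while the Sex_* coefficients are offsets for the corresponding category. The optional sketch below (not part of the original tasks) reuses model_sklearn_multiple and all_feature_names from the cell above and collects the coefficients into a Series sorted by magnitude for easier scanning.

In [ ]:
import pandas as pd

# Pair each coefficient with its feature name
coef_series = pd.Series(model_sklearn_multiple.coef_, index=all_feature_names)

# Sort by absolute value so the most influential (standardized) features come first
print(coef_series.reindex(coef_series.abs().sort_values(ascending=False).index))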
In [48]:
# --- 4.8 Make predictions on the test set ---
y_pred_sklearn_multiple = model_sklearn_multiple.predict(X_test_processed)

print(f"\nFirst 5 true Age values (test set): {y_test.tolist()[:5]}")
print(f"First 5 predicted Age values: {[f'{val:.2f}' for val in y_pred_sklearn_multiple.tolist()[:5]]}")
First 5 true Age values (test set): [10.5, 9.5, 17.5, 10.5, 15.5]
First 5 predicted Age values: ['13.26', '11.74', '15.50', '13.50', '12.66']
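
As an optional aside (not part of the original tasks), the preprocessing and modelling steps can be bundled into a single scikit-learn Pipeline, which keeps the fit/transform logic tied together and avoids accidentally fitting the scaler on test data. A minimal sketch, assuming preprocessor, X_train, X_test, y_train, and y_test from the earlier cells are still in memory:

In [ ]:
from sklearn.pipeline import Pipeline

# One object that scales/encodes and then fits the linear model
pipe = Pipeline(steps=[('preprocess', preprocessor), ('model', LinearRegression())])
pipe.fit(X_train, y_train)  # refits the preprocessor on X_train internally

# For regressors, .score() returns R^2 on the given data
print(f"Pipeline R^2 on the test set: {pipe.score(X_test, y_test):.4f}")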

Discussion Point:

  • In multiple linear regression, what does it mean to say that a coefficient indicates the change in $y$ for a one-unit change in a feature, holding all other features constant?
  • Why is it important to apply StandardScaler to numerical features before training a LinearRegression model when multiple features are involved (especially if they have different scales)?

Part 5: Model Evaluation for Regression

Evaluating regression models requires specific metrics that measure the difference between predicted and actual continuous values.

Common Regression Metrics:

  • Mean Absolute Error (MAE):

    • $MAE = \frac{1}{n} \sum_{i=1}^{n} |y_i - \hat{y}_i|$
    • The average of the absolute differences between predictions and actual values.
    • Interpretation: Easy to understand, directly measures the average magnitude of the errors in the same units as the target variable. Less sensitive to outliers than MSE.
  • Mean Squared Error (MSE):

    • $MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$
    • The average of the squared differences between predictions and actual values.
    • Interpretation: Penalizes larger errors more heavily than MAE due to squaring. Units are squared.
  • Root Mean Squared Error (RMSE):

    • $RMSE = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2} = \sqrt{MSE}$
    • The square root of MSE.
    • Interpretation: One of the most commonly reported regression metrics. It is in the same units as the target variable, making it more interpretable than MSE, and it still penalizes larger errors.
  • R-squared ($R^2$) - Coefficient of Determination:

    • $R^2 = 1 - \frac{\sum (y_i - \hat{y}_i)^2}{\sum (y_i - \bar{y})^2} = 1 - \frac{MSE(model)}{Variance(actual\_y)}$
    • Interpretation: Represents the proportion of the variance in the dependent variable that can be predicted from the independent variables. Values range from 0 to 1 (or sometimes negative if the model is worse than predicting the mean).
      • $R^2 = 1$: Perfect fit.
      • $R^2 = 0$: The model explains none of the variance in the target variable (it's as good as just predicting the mean of the target).
      • $R^2 < 0$: The model is worse than simply predicting the mean of the target.
    • Caution: $R^2$ always increases or stays the same when you add more features, even if they are irrelevant. Adjusted R-squared (sketched briefly below) addresses this by penalizing the addition of unnecessary features.
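
As flagged in the caution above, here is a minimal, self-contained sketch of Adjusted R-squared. The formula is standard; the numbers passed in below are purely illustrative and are not results from this notebook.

In [ ]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - k - 1),
# where n = number of observations and k = number of features.
def adjusted_r2(r2, n, k):
    return 1 - (1 - r2) * (n - 1) / (n - k - 1)

# Illustrative values only: adding 10 extra features barely raises R^2,
# so Adjusted R^2 goes down instead of up.
print(f"Adjusted R^2 with  5 features: {adjusted_r2(r2=0.550, n=1000, k=5):.4f}")
print(f"Adjusted R^2 with 15 features: {adjusted_r2(r2=0.551, n=1000, k=15):.4f}")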

Tasks:

  • Calculate MAE, MSE, RMSE, and R-squared for the Multiple Linear Regression model using sklearn.metrics.
  • Discuss the interpretation of each metric in the context of the Abalone age prediction.
In [49]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- 5 Calculate Evaluation Metrics ---
mae = mean_absolute_error(y_test, y_pred_sklearn_multiple)
mse = mean_squared_error(y_test, y_pred_sklearn_multiple)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred_sklearn_multiple)

print(f"\nMean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R-squared (R2): {r2:.4f}")

# --- Plot: Actual vs. Predicted Values ---
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_test, y=y_pred_sklearn_multiple, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Perfect Prediction Line')
plt.xlabel('Actual Age')
plt.ylabel('Predicted Age')
plt.title('Actual vs. Predicted Age (Multiple Linear Regression)')
plt.grid(True, linestyle='--', alpha=0.6)
plt.legend()
plt.show()

# --- Plot: Distribution of Residuals ---
residuals = y_test - y_pred_sklearn_multiple
plt.figure(figsize=(10, 7))
sns.histplot(residuals, kde=True, bins=30)
plt.title('Distribution of Residuals')
plt.xlabel('Residuals (Actual - Predicted)')
plt.ylabel('Frequency')
plt.show()

# --- Plot: Residuals vs. Predicted Values ---
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_pred_sklearn_multiple, y=residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--')
plt.title('Residuals vs. Predicted Values')
plt.xlabel('Predicted Age')
plt.ylabel('Residuals')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()
Mean Absolute Error (MAE): 1.5931
Mean Squared Error (MSE): 4.8912
Root Mean Squared Error (RMSE): 2.2116
R-squared (R2): 0.5482
[Figures: 'Actual vs. Predicted Age (Multiple Linear Regression)', 'Distribution of Residuals', 'Residuals vs. Predicted Values']

Discussion Point:

  • In the context of the Abalone dataset, what does an MAE of, say, 1.5 mean? And an RMSE of 2.0?
  • If a model has an R-squared of 0.85, what does that tell you about its performance? What if it's 0.10?

Part 6: Assumptions of Linear Regression & Diagnostics

Linear Regression relies on several key assumptions about the data and the error term for its statistical inferences (like p-values, confidence intervals) to be valid, and for the model to perform optimally. While scikit-learn's LinearRegression will run even if assumptions are violated, interpreting the coefficients and evaluating performance requires checking these.

Key Assumptions (LINE):

  1. Linearity: The relationship between each independent variable and the dependent variable is linear.

    • Check: Scatter plots of features vs. target. Look for non-linear patterns (e.g., curves).
    • Implication if violated: Model will poorly capture the true relationship, leading to higher error.
    • Remedy: Transform variables (e.g., log, square root), add polynomial features (e.g., $x^2$; a short sketch follows this list), or use non-linear models.
  2. Independence of Errors/Residuals: The residuals (errors) are independent of each other. This is especially important in time series data, where errors might be correlated over time.

    • Check: Plot residuals against time (if applicable) or against previous residuals. Look for patterns.
    • Implication if violated: Inaccurate standard errors and p-values, leading to incorrect conclusions about feature significance.
    • Remedy: Use time-series specific models (e.g., ARIMA), consider lagged variables.
  3. Normality of Residuals: The residuals are approximately normally distributed.

    • Check: Histogram of residuals (should be bell-shaped), Q-Q plot (points should lie along a straight line).
    • Implication if violated: Affects confidence intervals and p-values. Not as critical for large sample sizes due to the Central Limit Theorem.
    • Remedy: Data transformations, using robust regression methods.
  4. Equal Variance of Errors (Homoscedasticity): The variance of residuals is constant across all levels of the independent variables (or predicted values). This means the spread of residuals should be roughly the same across the range of predictions.

    • Check: Scatter plot of residuals vs. predicted values. Look for a "fan" or "cone" shape (heteroscedasticity) vs. a consistent, random band (homoscedasticity).
    • Implication if violated: Leads to inefficient parameter estimates and incorrect standard errors.
    • Remedy: Data transformations (e.g., log transform on target), Weighted Least Squares.
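
As mentioned in the linearity remedy above, a standard fix is to add polynomial terms while keeping the model linear in its coefficients. A minimal, self-contained sketch using scikit-learn's PolynomialFeatures on synthetic data (illustrative only; it is not applied to the Abalone pipeline here):

In [ ]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Synthetic data with a genuinely quadratic relationship
rng = np.random.default_rng(0)
x = rng.uniform(0, 10, size=(200, 1))
y = 1.0 + 0.5 * x[:, 0] + 0.3 * x[:, 0] ** 2 + rng.normal(0, 0.5, size=200)

# Expand x into [x, x^2]; the regression is still linear in the coefficients
poly = PolynomialFeatures(degree=2, include_bias=False)
x_poly = poly.fit_transform(x)

model = LinearRegression().fit(x_poly, y)
print("Estimated coefficients for [x, x^2]:", model.coef_)   # true values: 0.5 and 0.3
print("Estimated intercept:", model.intercept_)              # true value: 1.0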

Additional Important Assumption:

  • No Multicollinearity: Independent variables should not be highly correlated with each other. This applies to Multiple Linear Regression.
    • Check: Correlation matrix/heatmap of features. Variance Inflation Factor (VIF) scores (VIF > 5-10 indicates problematic multicollinearity).
    • Implication if violated: Coefficients become unstable and difficult to interpret (e.g., large changes in coefficients with small changes in data, opposite signs than expected). Does not necessarily affect the predictive power of the model.
    • Remedy: Remove one of the highly correlated variables, combine them into a new feature, use dimensionality reduction (e.g., PCA), or use regularization techniques (Ridge, Lasso regression).
In [50]:
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# --- Residuals ---
residuals = y_test - y_pred_sklearn_multiple

# --- 6.1 Check Linearity ---
plt.figure(figsize=(8, 6))
sns.scatterplot(x=df_abalone['Length'], y=df_abalone['Age'], alpha=0.6)
plt.title('Linearity Check: Length vs. Age')
plt.xlabel('Length')
plt.ylabel('Age')
plt.show()

# --- 6.2 Check Homoscedasticity ---
plt.figure(figsize=(10, 7))
sns.scatterplot(x=y_pred_sklearn_multiple, y=residuals, alpha=0.6)
plt.axhline(y=0, color='r', linestyle='--', linewidth=2)
plt.title('Homoscedasticity Check: Residuals vs. Predicted Values')
plt.xlabel('Predicted Age')
plt.ylabel('Residuals')
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

# --- 6.3 Check Normality of Residuals ---
plt.figure(figsize=(8, 6))
sns.histplot(residuals, kde=True, bins=30)
plt.title('Normality Check: Histogram of Residuals')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.show()

# Q-Q Plot for Normality
plt.figure(figsize=(8, 6))
stats.probplot(residuals, dist="norm", plot=plt)
plt.title('Normality Check: Q-Q Plot of Residuals')
plt.show()

# --- 6.4 Check Multicollinearity ---
# Numerical features for abalone (excluding categorical 'Sex')
numerical_abalone_features = df_abalone[numerical_features]
plt.figure(figsize=(10, 8))
sns.heatmap(numerical_abalone_features.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Multicollinearity Check: Correlation Matrix of Numerical Features')
plt.show()

print("\nNote on Multicollinearity: Check the heatmap above. Strong correlations (e.g., between Length, Diameter, Whole_weight) suggest multicollinearity, which can make coefficients less reliable.")
[Figures: 'Linearity Check: Length vs. Age', 'Homoscedasticity Check: Residuals vs. Predicted Values', 'Normality Check: Histogram of Residuals', 'Normality Check: Q-Q Plot of Residuals', 'Multicollinearity Check: Correlation Matrix of Numerical Features']
Note on Multicollinearity: Check the heatmap above. Strong correlations (e.g., between Length, Diameter, Whole_weight) suggest multicollinearity, which can make coefficients less reliable.
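
The heatmap above only shows pairwise correlations; the Variance Inflation Factor (VIF) mentioned in the assumption list measures how well each feature is explained by all the others together. A minimal sketch, assuming the statsmodels package is available in the environment and that df_abalone and numerical_features from the earlier cells are still in memory:

In [ ]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Design matrix with an explicit intercept column, as VIF expects
X_vif = sm.add_constant(df_abalone[numerical_features])

vif_scores = pd.Series(
    [variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
    index=X_vif.columns,
)

# Drop the intercept row; values above roughly 5-10 signal problematic multicollinearity
print(vif_scores.drop('const').sort_values(ascending=False))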

Discussion Point:

  • Describe what "heteroscedasticity" means in the context of residuals. What does it look like on a residuals vs. predicted values plot?
  • If two of your features are highly correlated (multicollinearity), how might that affect the interpretation of their individual coefficients in a multiple linear regression model?

Part 7: Advantages and Disadvantages of Linear Regression

Linear Regression is a powerful and widely used algorithm, but like any model, it has its strengths and weaknesses.

Advantages:

  1. Simplicity and Interpretability:
    • The model is straightforward to understand and explain. The coefficients directly show the estimated change in the dependent variable for a one-unit change in each independent variable (holding others constant). This makes it easy to communicate insights.
  2. Computational Efficiency:
    • It has a closed-form solution (the OLS normal equations; see the sketch after this list) and is computationally inexpensive to train, even for large datasets.
  3. Strong Theoretical Foundation:
    • Based on well-established statistical principles, which allows for statistical inference (e.g., hypothesis testing on coefficients, confidence intervals).
  4. Good Baseline Model:
    • Often serves as a simple, quick-to-implement baseline model against which more complex models can be compared. If a complex model doesn't significantly outperform linear regression, it might not be worth the added complexity.
  5. Handles Linearity Well:
    • If the true underlying relationship between variables is linear, it performs very well and is often the best choice.
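
To illustrate the closed-form solution mentioned in Advantage 2, here is a minimal sketch that solves the OLS normal equations, $\hat{\beta} = (X^T X)^{-1} X^T y$, directly with NumPy. It reuses X_concept and y_concept from Part 1 (assuming they are still in memory); the coefficients should agree with the scikit-learn fit from Part 3 up to numerical precision.

In [ ]:
import numpy as np

# Design matrix with an explicit intercept column of ones
X_design = np.column_stack([np.ones_like(X_concept), X_concept])

# Solve (X^T X) beta = X^T y rather than forming the matrix inverse explicitly
beta_hat = np.linalg.solve(X_design.T @ X_design, X_design.T @ y_concept)

print(f"Closed-form intercept: {beta_hat[0]:.4f}")  # compare with Part 3's intercept
print(f"Closed-form slope:     {beta_hat[1]:.4f}")  # compare with Part 3's slope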

Disadvantages:

  1. Assumes Linearity:
    • Its biggest limitation. It assumes a linear relationship between features and the target. If the relationship is genuinely non-linear, a linear model will provide a poor fit and inaccurate predictions unless appropriate feature transformations (e.g., polynomial features, log transforms) are applied.
  2. Sensitive to Outliers:
    • The "least squares" method involves squaring the errors. Large errors (from outliers) are heavily penalized, which can significantly pull the regression line towards them, leading to a distorted model.
  3. Assumes Independence of Errors:
    • Violated in time series data or hierarchical data where observations are not independent, leading to biased standard errors and invalid statistical tests.
  4. Assumes Homoscedasticity:
    • If the variance of errors is not constant across all levels of predictors (heteroscedasticity), the model's coefficients are still unbiased but their standard errors are inaccurate, affecting confidence intervals and p-values.
  5. Assumes Normality of Residuals:
    • Primarily affects the validity of statistical inference (p-values, confidence intervals), especially with small sample sizes. For prediction accuracy, this assumption is less critical for larger datasets (due to the Central Limit Theorem).
  6. Multicollinearity Issues:
    • When independent variables are highly correlated with each other, it can make the individual coefficients unstable and difficult to interpret. This doesn't necessarily reduce the model's predictive accuracy but makes it hard to understand the individual impact of correlated features.
  7. Does Not Automatically Handle Feature Scaling or Categorical Features:
    • Requires manual preprocessing steps such as feature scaling (not strictly required for plain OLS, but good practice for numerical stability and essential once you move to regularized variants like Ridge or Lasso) and one-hot encoding for categorical variables.

Prepared By

Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd