# Install necessary libraries if not already present in Colab environment
!pip install pandas numpy scikit-learn matplotlib seaborn scipy -q

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import sklearn

# Plot styling: apply seaborn's "whitegrid" theme to the figures drawn below.
sns.set_theme(style="whitegrid")

# Silence warnings so the cell output stays uncluttered.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden too.
import warnings
warnings.filterwarnings('ignore')

# Report the versions of the main libraries this analysis ran against.
version_report = [
    f"Pandas Version: {pd.__version__}",
    f"Seaborn Version: {sns.__version__}",
    f"Scikit-learn Version: {sklearn.__version__}",
]
for report_line in version_report:
    print(report_line)

Pandas Version: 2.2.2
Seaborn Version: 0.13.2
Scikit-learn Version: 1.6.1

print("--- Part 1: Dataset Loading and Initial Inspection ---")

# Load seaborn's "tips" example dataset; every later part reads from `tips`.
tips = sns.load_dataset('tips')

# Structural overview (columns, non-null counts, dtypes). info() prints its
# own report, so it is called directly rather than wrapped in print().
print("--- Dataset Info ---")
tips.info()

# The remaining inspections all have the same shape (a heading followed by a
# table), so they are listed as pairs and printed in order.
inspection_sections = [
    ("\n--- First 5 Rows ---", tips.head()),
    ("\n--- Summary Statistics for Numerical Features ---", tips.describe()),
    # NOTE(review): only 'day' is tabulated here despite the plural heading.
    ("\n--- Value Counts for Categorical Features ---", tips['day'].value_counts()),
]
for heading, table in inspection_sections:
    print(heading)
    print(table)

--- Part 1: Dataset Loading and Initial Inspection ---
--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB

--- First 5 Rows ---
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

--- Summary Statistics for Numerical Features ---
       total_bill         tip        size
count  244.000000  244.000000  244.000000
mean    19.785943    2.998279    2.569672
std      8.902412    1.383638    0.951100
min      3.070000    1.000000    1.000000
25%     13.347500    2.000000    2.000000
50%     17.795000    2.900000    2.000000
75%     24.127500    3.562500    3.000000
max     50.810000   10.000000    6.000000

--- Value Counts for Categorical Features ---
day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

print("\n--- Part 2: Univariate Analysis ---")

# 1. Numerical feature: 'total_bill', drawn as two side-by-side panels.
bill = tips['total_bill']
bill_fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: 20-bin histogram with a kernel density estimate overlaid.
sns.histplot(bill, kde=True, bins=20, ax=hist_ax)
hist_ax.set_title('Distribution of Total Bill')
hist_ax.set_xlabel('Total Bill ($)')

# Right panel: horizontal box plot of the same column (quartiles and outliers).
sns.boxplot(x=bill, ax=box_ax)
box_ax.set_title('Box Plot of Total Bill')
box_ax.set_xlabel('Total Bill ($)')

bill_fig.tight_layout()
plt.show()
# INSIGHT: 'total_bill' is right-skewed, with several outliers at the high end.

# 2. Categorical feature: 'day', as a count of rows per day.
# The explicit order puts the bars in calendar order.
day_order = ['Thur', 'Fri', 'Sat', 'Sun']
count_fig, count_ax = plt.subplots(figsize=(8, 6))
sns.countplot(data=tips, x='day', order=day_order, ax=count_ax)
count_ax.set_title('Count of Visits per Day')
count_ax.set_xlabel('Day of the Week')
count_ax.set_ylabel('Number of Visits')
plt.show()
# INSIGHT: Saturday and Sunday are the restaurant's busiest days.

--- Part 2: Univariate Analysis ---

print("\n--- Part 3: Bivariate Analysis ---")

# 1. Numeric vs. numeric: 'total_bill' against 'tip'.
# kind='reg' draws the scatter together with a fitted regression line.
sns.jointplot(data=tips, x='total_bill', y='tip', kind='reg')
# The joint plot is the current figure, so the title lands on it; y=1.02
# places the title slightly above the top of the figure.
plt.suptitle('Total Bill vs. Tip Amount', y=1.02)
plt.show()
# INSIGHT: The relationship is positive and roughly linear: larger bills tend
# to come with larger tips.

# 2. Numeric vs. categorical: tip distribution for each day.
# Box plots (or violin plots) compare distributions across categories.
day_order = ['Thur', 'Fri', 'Sat', 'Sun']
tip_fig, tip_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=tips, x='day', y='tip', order=day_order, ax=tip_ax)
tip_ax.set_title('Tip Distribution by Day of the Week')
tip_ax.set_xlabel('Day of the Week')
tip_ax.set_ylabel('Tip Amount ($)')
plt.show()
# INSIGHT: Median tips look slightly higher on weekends, and the spread is
# larger as well.

# 3. Correlation heatmap over the numerical columns.
numeric_columns = ['total_bill', 'tip', 'size']
correlation_matrix = tips[numeric_columns].corr()

heat_fig, heat_ax = plt.subplots(figsize=(8, 6))
# annot/fmt write each coefficient into its cell with two decimals.
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=heat_ax)
heat_ax.set_title('Correlation Matrix of Numerical Features')
plt.show()
# INSIGHT: Confirms the positive correlation between 'total_bill' and 'tip'
# (0.68). NOTE(review): the original notes also call party 'size' strongly
# correlated; check that wording against the coefficients in the heatmap.

--- Part 3: Bivariate Analysis ---

print("\n--- Part 4: Multivariate Analysis ---")

# 1. Pair plot of all pairwise relationships between the numerical columns.
# 'hue' adds a third, categorical dimension by colouring points by smoker status.
sns.pairplot(tips, hue='smoker', palette='viridis')
# y=1.02 places the title slightly above the top of the grid.
plt.suptitle('Pairwise Relationships by Smoker Status', y=1.02)
plt.show()
# INSIGHT: This gives a rapid, high-level overview. The distributions for
# smokers and non-smokers largely overlap, but there might be subtle
# differences in the 'total_bill' vs 'tip' relationship.

# 2. Facet grid to analyze interactions.
# 'total_bill' vs 'tip' is drawn once per combination of 'day' (columns) and
# 'smoker' (rows); margin_titles=True puts the row labels on the right margin.
g = sns.FacetGrid(tips, col="day", row="smoker", margin_titles=True)
g.map(sns.scatterplot, "total_bill", "tip", alpha=.7)
# No legend is added: the grid has no 'hue' variable, so there are no legend
# entries to show. The facet titles already identify every panel.
plt.show()
# INSIGHT: For example, the relationship on Sunday for non-smokers seems
# tighter than for smokers. This detailed view allows for more nuanced
# hypothesis generation.

--- Part 4: Multivariate Analysis ---

print("\n--- Part 5: From Visualization to Hypothesis Testing ---")

# 1. Pull out the tip amounts for the two days being compared.
tips_sat = tips.loc[tips['day'] == 'Sat', 'tip']
tips_thur = tips.loc[tips['day'] == 'Thur', 'tip']

# 2. Independent two-sample t-test on Saturday vs. Thursday tips.
# NOTE(review): ttest_ind is called with its defaults, which is the
# pooled-variance (equal-variance) form; consider equal_var=False (Welch)
# if the two groups' spreads differ.
ttest_result = stats.ttest_ind(tips_sat, tips_thur)

print(f"T-test statistic: {ttest_result.statistic:.4f}")
print(f"P-value: {ttest_result.pvalue:.4f}")

# 3. Compare the p-value with the 5% significance level and report the verdict.
alpha = 0.05
is_significant = ttest_result.pvalue < alpha
if is_significant:
    conclusion_lines = (
        "\nConclusion: We reject the null hypothesis.",
        "There is a statistically significant difference in tip amounts between Saturday and Thursday.",
    )
else:
    conclusion_lines = (
        "\nConclusion: We fail to reject the null hypothesis.",
        "There is not enough evidence to claim a significant difference in tip amounts.",
    )
for conclusion_line in conclusion_lines:
    print(conclusion_line)

--- Part 5: From Visualization to Hypothesis Testing ---
T-test statistic: 0.9002
P-value: 0.3695

Conclusion: We fail to reject the null hypothesis.
There is not enough evidence to claim a significant difference in tip amounts.

Exploratory Data Analysis (EDA) & Statistical Visualization

Objectives:

Setup: Install and Import Libraries

Part 1: Dataset Loading and Initial Inspection

Part 2: Univariate Analysis (Understanding Individual Features)

Part 3: Bivariate Analysis (Exploring Relationships Between Two Features)

Part 4: Multivariate Analysis (Visualizing Complex Interactions)

Part 5: From Visualization to Hypothesis Testing

Part 6: Advanced Topics & Discussion