Efficient Frequent Itemset Mining with the FP-Growth Algorithm¶
Key Topics: Frequent Pattern Mining, FP-Tree (Frequent Pattern Tree), Conditional Pattern Base, Conditional FP-Tree, "Divide and Conquer" Strategy, Performance Comparison with Apriori.
Objectives:¶
- Students will understand the primary limitations of the Apriori algorithm, specifically its costly candidate generation process.
- Students will grasp the theoretical basis of FP-Growth as a faster alternative that avoids candidate generation.
- Students will understand the two-step "divide and conquer" strategy of FP-Growth:
- Building a compact FP-Tree data structure.
- Recursively mining the tree to find frequent itemsets.
- Students will implement the FP-Growth algorithm using the mlxtend library.
- Students will conduct a direct performance comparison to empirically verify the speed advantage of FP-Growth over Apriori on the same real-world dataset.
- Students will derive and interpret association rules from the results generated by FP-Growth.
Setup: Install and Import Libraries¶
We will use mlxtend for both the FP-Growth and Apriori implementations to ensure a fair comparison.
# Install the mlxtend library
!pip install mlxtend -q
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import mlxtend
import time # To measure execution time
# MLxtend for FP-Growth, Apriori, and association rules
from mlxtend.frequent_patterns import fpgrowth, apriori, association_rules
# Set plot style for better aesthetics
sns.set_theme(style="whitegrid")
sns.set_context("notebook", font_scale=1.2)
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')
print(f"Pandas Version: {pd.__version__}")
print(f"MLxtend Version: {mlxtend.__version__}")
print(f"Scikit-learn Version: {sklearn.__version__}")
Pandas Version: 2.2.2
MLxtend Version: 0.23.4
Scikit-learn Version: 1.6.1
Part 1: The Problem with Apriori & The FP-Growth Solution¶
In the previous lab, we explored the Apriori algorithm. Its main weakness is the candidate generation step. To find frequent itemsets of size k, it must first generate all possible candidate itemsets of size k and then scan the entire dataset to count their support. For large datasets with many unique items, this process can be incredibly slow and memory-intensive.
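To get a feel for the scale, here is a quick back-of-the-envelope calculation; the item count (1,562) is the number of unique items in this lab's France basket (see Part 2):

# Rough scale of Apriori's candidate space for pairs alone.
from math import comb

n_items = 1562  # unique items in the one-hot basket built in Part 2
print(f"Possible 2-itemset candidates: {comb(n_items, 2):,}")
# -> 1,219,141 pairs, each needing a support count over every transaction,
#    and each surviving level of the search spawns a new candidate set.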
The FP-Growth Solution

The FP-Growth algorithm was designed to overcome this bottleneck. It's a "divide and conquer" algorithm that is often significantly faster than Apriori. It works in two main steps and avoids explicit candidate generation entirely:

1. Build the FP-Tree: FP-Growth scans the dataset once to find all frequent 1-itemsets. It then scans the data a second time to build a highly compact tree structure called the FP-Tree. This tree stores the entire transactional database in a compressed form, with more frequent items placed closer to the root.
2. Mine the FP-Tree: The algorithm then recursively mines this tree to find frequent itemsets. It starts with the least frequent items and explores their "conditional pattern bases" (the prefix paths in which they appear) to build small, conditional FP-Trees, from which it extracts frequent itemsets. This recursive process is far more efficient than generating and counting an exponential number of candidate itemsets.
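To see the API in action before the real dataset, here is a minimal, self-contained sketch on a hypothetical five-transaction toy basket (the transactions and the 0.4 support threshold are made up for illustration):

# A minimal toy run of fpgrowth on five hypothetical transactions.
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import fpgrowth

toy_transactions = [
    ['milk', 'bread', 'butter'],
    ['milk', 'bread'],
    ['milk', 'eggs'],
    ['bread', 'butter'],
    ['milk', 'bread', 'butter', 'eggs'],
]

# One-hot encode into the boolean DataFrame format that fpgrowth expects.
te = TransactionEncoder()
toy_df = pd.DataFrame(te.fit(toy_transactions).transform(toy_transactions),
                      columns=te.columns_)

# fpgrowth scans the data twice (item counts, then tree construction) and
# mines the FP-Tree recursively -- there is no candidate-generation step.
print(fpgrowth(toy_df, min_support=0.4, use_colnames=True))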
Part 2: Dataset & Preprocessing¶
To directly compare FP-Growth with Apriori, we will use the exact same real-world Online Retail dataset and apply the identical preprocessing steps from the previous lab. This ensures our performance comparison is fair and accurate.
Recap of Preparation Steps:
- Load the data from the UCI repository.
- Clean the data by removing nulls and credit/return transactions.
- Transform the sales log for a single country (France) into a one-hot encoded transactional format, where each row is an invoice and each column is an item.
print("--- Part 2: Dataset & Preprocessing ---")
# 1. Load the dataset
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx'
df = pd.read_excel(url)
# 2. Clean the data (same steps as the Apriori lab)
df.dropna(axis=0, subset=['InvoiceNo', 'Description'], inplace=True)
df['Description'] = df['Description'].str.strip()
df['InvoiceNo'] = df['InvoiceNo'].astype('str')
df = df[~df['InvoiceNo'].str.contains('C')]
# 3. Transform the data into a one-hot encoded transactional format for France
basket_sets = (df[df['Country'] == "France"]
.groupby(['InvoiceNo', 'Description'])['Quantity']
.sum().unstack().reset_index().fillna(0)
.set_index('InvoiceNo'))
def encode_units(x):
    # Any positive quantity counts as a purchase; zero or negative does not.
    return 1 if x >= 1 else 0

# Note: in newer pandas (>= 2.1), DataFrame.map is the preferred equivalent of applymap.
basket_sets = basket_sets.applymap(encode_units)
if 'POSTAGE' in basket_sets.columns:
basket_sets.drop('POSTAGE', inplace=True, axis=1)
print("Preprocessing complete. The data is in the same format as the Apriori lab.")
print("Final data shape for analysis:", basket_sets.shape)
print(basket_sets.head())
--- Part 2: Dataset & Preprocessing ---
Preprocessing complete. The data is in the same format as the Apriori lab.
Final data shape for analysis: (392, 1562)
[basket_sets.head(): a 5 x 1562 one-hot preview indexed by InvoiceNo, one column per item description (from '10 COLOUR SPACEBOY PEN' through 'ZINC T-LIGHT HOLDER STARS SMALL'); all displayed entries are 0]
Part 3: Implementation of FP-Growth¶
Now we will apply the fpgrowth function from mlxtend to our prepared data. The function takes the same primary arguments as apriori, making it easy to use and compare. We will record the execution time.
print("\n--- Part 3: Running the FP-Growth Algorithm ---")
# Define the support threshold
min_support_threshold = 0.07
# 1. Run the FP-Growth algorithm and measure the time
start_time_fp = time.time()
frequent_itemsets_fp = fpgrowth(basket_sets, min_support=min_support_threshold, use_colnames=True)
end_time_fp = time.time()
time_fp = end_time_fp - start_time_fp
print(f"FP-Growth found {len(frequent_itemsets_fp)} frequent itemsets.")
print(f"Execution time: {time_fp:.4f} seconds.")
# Display the top frequent itemsets
print("\nTop 10 Frequent Itemsets found by FP-Growth:")
print(frequent_itemsets_fp.sort_values(by='support', ascending=False).head(10))
--- Part 3: Running the FP-Growth Algorithm ---
FP-Growth found 51 frequent itemsets.
Execution time: 0.0486 seconds.
Top 10 Frequent Itemsets found by FP-Growth:
support itemsets
39 0.188776 (RABBIT NIGHT LIGHT)
0 0.181122 (RED TOADSTOOL LED NIGHT LIGHT)
12 0.170918 (PLASTERS IN TIN WOODLAND ANIMALS)
23 0.168367 (PLASTERS IN TIN CIRCUS PARADE)
1 0.158163 (ROUND SNACK BOXES SET OF4 WOODLAND)
7 0.153061 (LUNCH BAG RED RETROSPOT)
8 0.142857 (LUNCH BOX WITH CUTLERY RETROSPOT)
13 0.137755 (PLASTERS IN TIN SPACEBOY)
20 0.137755 (SET/6 RED SPOTTY PAPER CUPS)
9 0.137755 (RED RETROSPOT MINI CASES)
Part 4: Performance Comparison: FP-Growth vs. Apriori¶
This is the key part of the lab. We will now run the Apriori algorithm on the exact same data with the exact same support threshold to directly compare computational performance. Keep in mind that FP-Growth's advantage is most pronounced on large, dense datasets and at low support thresholds; on a small basket like ours (392 transactions), the fixed cost of building the FP-Tree can actually make it slower than Apriori, as the recorded run below shows. Task 1 explores how the gap shifts when the support threshold is lowered.
print("\n--- Part 4: Performance Comparison vs. Apriori ---")
# 1. Run the Apriori algorithm and measure the time
start_time_ap = time.time()
frequent_itemsets_ap = apriori(basket_sets, min_support=min_support_threshold, use_colnames=True)
end_time_ap = time.time()
time_ap = end_time_ap - start_time_ap
print(f"Apriori found {len(frequent_itemsets_ap)} frequent itemsets.")
print(f"Execution time: {time_ap:.4f} seconds.")
# 2. Compare the results
print("\n--- Performance Summary ---")
print(f"FP-Growth Execution Time: {time_fp:.4f} seconds")
print(f"Apriori Execution Time: {time_ap:.4f} seconds")
if time_fp > 0 and time_ap > 0:
    ratio = time_ap / time_fp
    if ratio >= 1:
        print(f"\nFP-Growth was {ratio:.2f} times faster than Apriori.")
    else:
        print(f"\nApriori was {1 / ratio:.2f} times faster than FP-Growth on this run.")
# Sanity check: both algorithms must discover the exact same frequent itemsets.
# Sorting by support alone is not reliable (tied supports can order rows
# differently), so we compare the (itemset, support) pairs as sets instead.
pairs_fp = set(zip(frequent_itemsets_fp['itemsets'], frequent_itemsets_fp['support'].round(10)))
pairs_ap = set(zip(frequent_itemsets_ap['itemsets'], frequent_itemsets_ap['support'].round(10)))
are_equal = pairs_fp == pairs_ap
print(f"Do both algorithms produce the same result? {are_equal}")
--- Part 4: Performance Comparison vs. Apriori ---
Apriori found 51 frequent itemsets.
Execution time: 0.0172 seconds.

--- Performance Summary ---
FP-Growth Execution Time: 0.0486 seconds
Apriori Execution Time: 0.0172 seconds

Apriori was 2.83 times faster than FP-Growth on this run.
Do both algorithms produce the same result? True
Part 5: Generating and Interpreting Association Rules¶
Once the frequent itemsets have been identified, the process for generating association rules is identical regardless of whether you used FP-Growth or Apriori. We simply feed the resulting DataFrame into the association_rules function.
print("\n--- Part 5: Generating Association Rules from FP-Growth Results ---")
# 1. Generate association rules from the FP-Growth frequent itemsets
# The result should be identical to the rules from the Apriori lab
rules = association_rules(frequent_itemsets_fp, metric="lift", min_threshold=1)
# 2. Filter for strong and interesting rules
strong_rules = rules[(rules['lift'] >= 4) & (rules['confidence'] >= 0.8)].sort_values(by='lift', ascending=False)
print("\nTop Strong Association Rules found:")
print(strong_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
--- Part 5: Generating Association Rules from FP-Growth Results ---
Top Strong Association Rules found:
    antecedents                                          consequents                            support  confidence      lift
2   (ALARM CLOCK BAKELIKE GREEN)                         (ALARM CLOCK BAKELIKE RED)            0.079082    0.815789  8.642959
3   (ALARM CLOCK BAKELIKE RED)                           (ALARM CLOCK BAKELIKE GREEN)          0.079082    0.837838  8.642959
14  (SET/6 RED SPOTTY PAPER CUPS, SET/20 RED RETRO...)   (SET/6 RED SPOTTY PAPER PLATES)       0.099490    0.975000  7.644000
16  (SET/20 RED RETROSPOT PAPER NAPKINS, SET/6 RED...)   (SET/6 RED SPOTTY PAPER CUPS)         0.099490    0.975000  7.077778
11  (SET/6 RED SPOTTY PAPER PLATES)                      (SET/6 RED SPOTTY PAPER CUPS)         0.122449    0.960000  6.968889
10  (SET/6 RED SPOTTY PAPER CUPS)                        (SET/6 RED SPOTTY PAPER PLATES)       0.122449    0.888889  6.968889
15  (SET/6 RED SPOTTY PAPER CUPS, SET/6 RED SPOTTY...)   (SET/20 RED RETROSPOT PAPER NAPKINS)  0.099490    0.812500  6.125000
13  (SET/6 RED SPOTTY PAPER PLATES)                      (SET/20 RED RETROSPOT PAPER NAPKINS)  0.102041    0.800000  6.030769
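The metrics are easy to verify by hand: confidence(A -> B) = support(A and B) / support(A), and lift(A -> B) = confidence(A -> B) / support(B). The snippet below re-derives the lift of the top rule purely from the numbers in the table above:

# Hand-check the metrics for (ALARM CLOCK BAKELIKE GREEN) -> (ALARM CLOCK BAKELIKE RED).
support_pair = 0.079082   # support of the itemset {GREEN, RED}
conf_g_to_r  = 0.815789   # confidence of GREEN -> RED (row 2)
conf_r_to_g  = 0.837838   # confidence of RED -> GREEN (row 3)

support_red = support_pair / conf_r_to_g   # support(RED), ~0.0944
lift = conf_g_to_r / support_red           # ~8.6430, matching the table (up to rounding)
print(f"support(RED) = {support_red:.6f}, lift = {lift:.6f}")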
Part 6: Lab Tasks & Exercises¶
Now, apply what you've learned. The following tasks will help you explore the performance characteristics and practical application of FP-Growth.
# --- TASK 1: The Impact of Support on Performance ---
# Lowering the support threshold creates an exponentially larger number of frequent itemsets, which is where
# Apriori's candidate generation really struggles.
# Run both FP-Growth and Apriori with a LOWER `min_support` of 0.04.
# How much wider does the performance gap become?
# YOUR CODE HERE
# low_support = 0.04
#
# # Time FP-Growth
# start_fp_low = time.time()
# fpgrowth(basket_sets, min_support=low_support, use_colnames=True)
# end_fp_low = time.time()
# time_fp_low = end_fp_low - start_fp_low
#
# # Time Apriori
# start_ap_low = time.time()
# apriori(basket_sets, min_support=low_support, use_colnames=True)
# end_ap_low = time.time()
# time_ap_low = end_ap_low - start_ap_low
#
# print("--- Task 1: Performance with min_support=0.04 ---")
# print(f"FP-Growth Time (low support): {time_fp_low:.4f}s")
# print(f"Apriori Time (low support): {time_ap_low:.4f}s")
# print(f"New Speedup: {time_ap_low / time_fp_low:.2f}x")
# --- TASK 2: Find Rules with a Specific Consequent ---
# The marketing team wants to know: what items lead customers to buy an 'ALARM CLOCK BAKELIKE GREEN'?
# Filter the original `rules` DataFrame to find all rules where the *consequent* is {'ALARM CLOCK BAKELIKE GREEN'}.
# Sort them by confidence to see the strongest predictors.
# YOUR CODE HERE
# target_consequent = frozenset({'ALARM CLOCK BAKELIKE GREEN'})
# consequent_rules = rules[rules['consequents'] == target_consequent].sort_values(by='confidence', ascending=False)
# print("\n--- Task 2: Rules that lead to buying a GREEN ALARM CLOCK ---")
# print(consequent_rules)
# --- TASK 3: Business Interpretation ---
# From the results of Task 2, choose the rule with the highest confidence.
# Write a short, actionable recommendation for an e-commerce manager based on this rule.
# print("\n--- Task 3: Business Recommendation ---")
# print("Recommendation: The analysis shows that customers who buy the 'ALARM CLOCK BAKELIKE RED' are extremely")
# print("likely (with 90% confidence) to also purchase the 'ALARM CLOCK BAKELIKE GREEN'.")
# print("Action: On the product page for the red alarm clock, we should actively recommend the green alarm clock")
# print("as a complementary item under a 'Customers Also Bought' or 'Complete the Set' section. This could significantly boost sales of the green clock.")
Part 7: Advanced Topics & Discussion¶
How the FP-Tree Works (Intuition): The FP-Tree is the core of the algorithm's efficiency. Imagine a tree where each node is an item, and the path from the root to a node represents a transaction. More frequent items are placed closer to the root. Each node also contains a counter for how many transactions share that path. Crucially, a "header table" stores pointers to all nodes of a particular item, allowing the algorithm to quickly traverse all transactions containing that item without rescanning the database.
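To make the structure concrete, below is a deliberately simplified sketch of the tree-building step. This is illustrative toy code, not mlxtend's internal implementation, and the recursive conditional mining is omitted:

# A simplified, illustrative FP-Tree builder.
from collections import defaultdict

class FPNode:
    def __init__(self, item, parent):
        self.item = item        # item name (None for the root)
        self.count = 1          # number of transactions sharing this path
        self.parent = parent
        self.children = {}      # item -> FPNode

def build_fp_tree(transactions, min_count):
    # Pass 1: count item frequencies and keep only the frequent items.
    counts = defaultdict(int)
    for t in transactions:
        for item in t:
            counts[item] += 1
    frequent = {i: c for i, c in counts.items() if c >= min_count}

    root = FPNode(None, None)
    header_table = defaultdict(list)  # item -> all nodes holding that item

    # Pass 2: insert each transaction, most frequent items first,
    # so that common prefixes share nodes near the root.
    for t in transactions:
        items = sorted((i for i in t if i in frequent),
                       key=lambda i: (-frequent[i], i))
        node = root
        for item in items:
            if item in node.children:
                node.children[item].count += 1
            else:
                child = FPNode(item, node)
                node.children[item] = child
                header_table[item].append(child)
            node = node.children[item]
    return root, header_table

root, header = build_fp_tree([['milk', 'bread', 'butter'],
                              ['milk', 'bread'],
                              ['bread', 'butter']], min_count=2)
# The header table lets the miner jump straight to every occurrence of an item.
print({item: [n.count for n in nodes] for item, nodes in header.items()})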
FP-Growth vs. Apriori - When to Choose Which?
- FP-Growth: This should be your default choice. It is generally much faster, especially on large datasets or when using low support thresholds. Its main potential drawback is memory usage, as the FP-Tree must be held in memory.
- Apriori: While slower, its implementation can be more straightforward if building from scratch, and it may be memory-efficient for certain sparse datasets where the number of candidates generated at each step is not overwhelming. For most practical applications using libraries like mlxtend, FP-Growth is superior.
Modern Frequent Pattern Mining: For truly massive datasets ("Big Data") that do not fit into the memory of a single machine, frequent pattern mining is performed on distributed computing platforms. Apache Spark has a popular, built-in parallelized implementation of FP-Growth that can scale to terabytes of data by distributing the construction and mining of the FP-Tree across a cluster of computers.
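As an illustrative sketch of Spark's built-in implementation (assuming a local pyspark installation; the three-transaction dataset is made up for the example):

# A minimal PySpark FP-Growth sketch (pip install pyspark).
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.appName("fp-growth-demo").getOrCreate()

# Each row holds one transaction as an array of items.
data = spark.createDataFrame([
    (0, ['milk', 'bread', 'butter']),
    (1, ['milk', 'bread']),
    (2, ['bread', 'butter']),
], ['id', 'items'])

fp = FPGrowth(itemsCol='items', minSupport=0.5, minConfidence=0.6)
model = fp.fit(data)          # builds and mines the FP-Tree across the cluster
model.freqItemsets.show()     # frequent itemsets with their counts
model.associationRules.show() # rules with confidence and lift
spark.stop()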
Prepared By
Md. Atikuzzaman
Lecturer
Department of Computer Science and Engineering
Green University of Bangladesh
Email: atik@cse.green.edu.bd