Python案例如何实现数据关联规则?

wen python案例 1

本文目录导读:

Python案例如何实现数据关联规则?

  1. 使用mlxtend库实现Apriori算法
  2. 自定义Apriori算法实现
  3. 实际应用案例:电商购物篮分析
  4. 使用FP-Growth算法(大数据集推荐)
  5. 实用技巧和建议

下面详细介绍用Python实现关联规则挖掘(如Apriori算法和FP-Growth算法)的几种方法。

使用mlxtend库实现Apriori算法

安装所需库

pip install mlxtend pandas

完整案例代码

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# Sample data: market-basket transactions, one list of purchased items each.
dataset = [
    ['牛奶', '面包', '黄油'],
    ['啤酒', '面包'],
    ['牛奶', '尿布', '啤酒', '可乐'],
    ['面包', '牛奶', '尿布', '啤酒'],
    ['面包', '黄油', '牛奶'],
    ['啤酒', '尿布'],
    ['牛奶', '尿布', '啤酒', '面包'],
    ['尿布', '啤酒', '可乐']
]
# 1. Preprocessing: convert the transaction lists into the one-hot encoded
#    DataFrame that is passed to apriori() below.
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print("转换后的数据格式:")
print(df.head())
# 2. Mine frequent itemsets.
# Minimum support threshold handed to apriori().
min_support = 0.3
frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
print(f"\n频繁项集(支持度 >= {min_support}):")
print(frequent_itemsets)
# 3. Generate association rules.
# Only a confidence threshold (0.6) is applied here; no lift threshold is set.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print("\n关联规则:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# 4. Generate rules again, this time thresholding on lift (>= 1.0), and print
#    additional evaluation columns (leverage, conviction).
rules_with_metrics = association_rules(
    frequent_itemsets, 
    metric="lift", 
    min_threshold=1.0
)
print("\n带有完整评估指标的规则:")
print(rules_with_metrics[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']])

自定义Apriori算法实现

from itertools import combinations
from collections import Counter
import pandas as pd
class AprioriCustom:
    """Small, dependency-free Apriori miner: frequent itemsets plus rules.

    Attributes populated by ``fit``:
        transactions: the input exactly as it was passed in.
        num_transactions: ``len(transactions)``.
        frequent_itemsets_1: ``{frozenset([item]): support}`` for every single
            item whose support reaches ``min_support``.
        frequent_itemsets: list of dicts; element ``k - 1`` maps each frequent
            k-itemset (a ``frozenset``) to its support.
        rules: list of dicts with the keys ``antecedents`` (``set``),
            ``consequents`` (``frozenset``), ``support``, ``confidence`` and
            ``lift``.
    """

    def __init__(self, min_support=0.3, min_confidence=0.6):
        """Store the thresholds.

        Args:
            min_support: minimum fraction of transactions an itemset must
                appear in to be kept.
            min_confidence: minimum confidence a rule must reach to be kept.
        """
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.frequent_itemsets = []
        self.rules = []

    def fit(self, transactions):
        """Mine frequent itemsets and association rules.

        Args:
            transactions: sized collection of iterables of hashable items.

        Returns:
            ``self``, so calls can be chained.
        """
        self.transactions = transactions
        self.num_transactions = len(transactions)

        # Step 1: count, per item, the number of transactions containing it.
        # An item repeated inside one transaction must count once, otherwise
        # its support could exceed 1 and disagree with _calculate_support().
        # The de-duplicated transactions are cached for later support queries.
        item_counts = Counter()
        self._transaction_sets = []
        for transaction in transactions:
            seen = set()
            for item in transaction:
                if item not in seen:
                    seen.add(item)
                    item_counts[item] += 1
            self._transaction_sets.append(frozenset(seen))

        # Keep only the single items that reach the support threshold.
        self.frequent_itemsets_1 = {
            frozenset([item]): count / self.num_transactions
            for item, count in item_counts.items()
            if count / self.num_transactions >= self.min_support
        }
        self.frequent_itemsets = [self.frequent_itemsets_1]

        # Step 2: grow itemsets one item at a time until a level is empty.
        k = 2
        current_itemsets = list(self.frequent_itemsets_1.keys())
        while current_itemsets:
            frequent_k = {}
            for candidate in self._generate_candidates(current_itemsets, k):
                support = self._calculate_support(candidate)
                if support >= self.min_support:
                    frequent_k[candidate] = support
            if not frequent_k:
                break
            self.frequent_itemsets.append(frequent_k)
            current_itemsets = list(frequent_k.keys())
            k += 1

        # Step 3: derive rules from the frequent itemsets.
        self._generate_rules()
        return self

    def _generate_candidates(self, itemsets, k):
        """Return candidate k-itemsets built from frequent (k-1)-itemsets.

        Two (k-1)-itemsets are joined when their union has exactly ``k``
        items; the union is kept only if every (k-1)-subset of it is itself
        in ``itemsets``.
        """
        known = set(itemsets)  # O(1) membership for the pruning check
        candidates = set()
        for left, right in combinations(itemsets, 2):
            candidate = left | right
            if len(candidate) != k:
                continue
            if all(frozenset(subset) in known
                   for subset in combinations(candidate, k - 1)):
                candidates.add(candidate)
        return list(candidates)

    def _calculate_support(self, itemset):
        """Return the fraction of transactions that contain ``itemset``.

        Returns 0.0 when there are no transactions.
        """
        if not self.num_transactions:
            return 0.0
        # Prefer the sets cached by fit(); fall back to the raw transactions
        # when this helper is used on an instance that was never fitted.
        transaction_sets = getattr(self, "_transaction_sets", None)
        if transaction_sets is None:
            transaction_sets = (set(t) for t in self.transactions)
        count = sum(1 for t in transaction_sets if itemset.issubset(t))
        return count / self.num_transactions

    def _generate_rules(self):
        """Fill ``self.rules`` from the frequent itemsets of size >= 2.

        Every split of a frequent itemset into a non-empty antecedent and a
        non-empty consequent is tried; a rule is kept when its confidence
        reaches ``min_confidence``.
        """
        self.rules = []

        # Supports already computed during mining, flattened for lookup.
        known_support = {}
        for level in self.frequent_itemsets:
            known_support.update(level)

        def support_of(items):
            cached = known_support.get(frozenset(items))
            return self._calculate_support(items) if cached is None else cached

        for itemset_dict in self.frequent_itemsets[1:]:  # skip 1-itemsets
            for itemset, support in itemset_dict.items():
                for size in range(1, len(itemset)):
                    for combo in combinations(itemset, size):
                        antecedent = set(combo)
                        consequent = itemset - antecedent
                        antecedent_support = support_of(antecedent)
                        consequent_support = support_of(consequent)
                        # Confidence / lift are undefined for a side that
                        # never occurs (only reachable with min_support <= 0).
                        if not antecedent_support or not consequent_support:
                            continue
                        confidence = support / antecedent_support
                        if confidence < self.min_confidence:
                            continue
                        self.rules.append({
                            'antecedents': antecedent,
                            'consequents': consequent,
                            'support': support,
                            'confidence': confidence,
                            'lift': confidence / consequent_support,
                        })
# Demo: run the custom Apriori implementation on the sample baskets.
if __name__ == "__main__":
    # Same sample dataset as the mlxtend example.
    dataset = [
        ['牛奶', '面包', '黄油'],
        ['啤酒', '面包'],
        ['牛奶', '尿布', '啤酒', '可乐'],
        ['面包', '牛奶', '尿布', '啤酒'],
        ['面包', '黄油', '牛奶'],
        ['啤酒', '尿布'],
        ['牛奶', '尿布', '啤酒', '面包'],
        ['尿布', '啤酒', '可乐']
    ]
    # Train the model. It is bound to `model` rather than `apriori` so it
    # cannot shadow mlxtend's `apriori` function when the snippets of this
    # article are placed in one module.
    model = AprioriCustom(min_support=0.3, min_confidence=0.6)
    model.fit(dataset)
    # Show the frequent itemsets, grouped by itemset size.
    print("频繁项集:")
    for size, itemset_dict in enumerate(model.frequent_itemsets, start=1):
        print(f"\n{size}-项集:")
        for itemset, support in itemset_dict.items():
            print(f"  {set(itemset)}: 支持度={support:.2f}")
    # Show the association rules.
    print(f"\n关联规则(置信度 >= {model.min_confidence}):")
    for rule in model.rules:
        # Convert both sides to plain sets so they print uniformly instead
        # of mixing `{...}` with `frozenset({...})`.
        antecedents = set(rule['antecedents'])
        consequents = set(rule['consequents'])
        print(f"{antecedents} -> {consequents}: "
              f"支持度={rule['support']:.2f}, "
              f"置信度={rule['confidence']:.2f}, "
              f"提升度={rule['lift']:.2f}")

实际应用案例:电商购物篮分析

import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
def create_sample_ecommerce_data():
    """Build the mock e-commerce order table used by the basket analysis.

    Returns:
        A DataFrame in long format with the columns ``Transaction_ID`` and
        ``Product``: one row per purchased product, 21 rows over 8 orders.
    """
    # Orders written basket-by-basket, then flattened into two parallel
    # columns in the same row order.
    baskets = (
        (1, ('手机', '手机壳', '耳机')),
        (2, ('手机', '充电器')),
        (3, ('笔记本电脑', '鼠标', '键盘', '电脑包')),
        (4, ('手机', '耳机', '充电器')),
        (5, ('笔记本电脑', '鼠标')),
        (6, ('手机', '手机壳')),
        (7, ('笔记本电脑', '鼠标', '键盘')),
        (8, ('耳机', '充电器')),
    )
    transaction_ids = []
    products = []
    for transaction_id, items in baskets:
        transaction_ids.extend([transaction_id] * len(items))
        products.extend(items)
    return pd.DataFrame({'Transaction_ID': transaction_ids, 'Product': products})
def ecommerce_basket_analysis():
    """Run a market-basket analysis on the mock e-commerce data and print it.

    Steps: load the sample orders, group products per transaction, one-hot
    encode the baskets, mine frequent itemsets (min support 0.2), generate
    rules with lift >= 1.0, then report the rules with lift > 1.5 and
    confidence > 0.5. If matplotlib can be imported, two charts are shown
    as well. Returns nothing; all results are printed.
    """
    # Load the data.
    df = create_sample_ecommerce_data()
    # Collect the products of each transaction into one list per basket.
    basket = df.groupby('Transaction_ID')['Product'].apply(list).tolist()
    # One-hot encode the baskets.
    te = TransactionEncoder()
    te_ary = te.fit(basket).transform(basket)
    df_basket = pd.DataFrame(te_ary, columns=te.columns_)
    # Mine frequent itemsets.
    frequent_itemsets = apriori(df_basket, min_support=0.2, use_colnames=True)
    # Generate association rules.
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    # Keep only the high-quality rules.
    high_quality_rules = rules[(rules['lift'] > 1.5) &
                              (rules['confidence'] > 0.5)]
    print("=== 电商购物篮分析结果 ===")
    print(f"\n总交易数: {len(df_basket)}")
    print(f"发现频繁项集: {len(frequent_itemsets)}")
    print(f"发现关联规则: {len(rules)}")
    print(f"高质量规则: {len(high_quality_rules)}")
    print("\n=== 高质量关联规则 ===")
    # Number the rules with a running counter: the DataFrame index still
    # carries the labels from the unfiltered `rules`, so using it would
    # print non-sequential rule numbers.
    for number, (_, rule) in enumerate(high_quality_rules.iterrows(), start=1):
        antecedents = set(rule['antecedents'])
        consequents = set(rule['consequents'])
        print(f"\n规则 {number}: {antecedents} -> {consequents}")
        print(f"  支持度: {rule['support']:.2%}")
        print(f"  置信度: {rule['confidence']:.2%}")
        print(f"  提升度: {rule['lift']:.2f}")
    # Optional visualization of all generated rules.
    try:
        import matplotlib.pyplot as plt
        _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
        # Scatter plot: support vs. confidence.
        ax1.scatter(rules['support'], rules['confidence'], alpha=0.5)
        ax1.set_xlabel('支持度')
        ax1.set_ylabel('置信度')
        ax1.set_title('关联规则:支持度 vs 置信度')
        # Histogram of lift values.
        ax2.hist(rules['lift'], bins=20, alpha=0.7)
        ax2.set_xlabel('提升度')
        ax2.set_ylabel('频率')
        ax2.set_title('提升度分布')
        plt.tight_layout()
        plt.show()
    except ImportError:
        print("\n请安装matplotlib以查看可视化结果")
# Run the analysis.
if __name__ == "__main__":
    ecommerce_basket_analysis()

使用FP-Growth算法(大数据集推荐)

from mlxtend.frequent_patterns import fpgrowth
import pandas as pd
def fpgrowth_example():
    """Mine frequent itemsets with mlxtend's FP-Growth and print the rules.

    Uses a five-transaction sample, a minimum support of 0.3 for the
    itemsets and a minimum confidence of 0.6 for the rules. Returns nothing;
    the itemsets and rules are printed.
    """
    # Imported locally so this snippet runs on its own: the imports above it
    # only bring in `fpgrowth`, which left `association_rules` undefined.
    from mlxtend.frequent_patterns import association_rules
    from mlxtend.preprocessing import TransactionEncoder
    # Sample data.
    dataset = [
        ['牛奶', '面包', '黄油'],
        ['啤酒', '面包'],
        ['牛奶', '尿布', '啤酒', '可乐'],
        ['面包', '牛奶', '尿布', '啤酒'],
        ['面包', '黄油', '牛奶']
    ]
    # One-hot encode the transactions.
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # Mine frequent itemsets with FP-Growth.
    frequent_itemsets = fpgrowth(df, min_support=0.3, use_colnames=True)
    print("FP-Growth挖掘到的频繁项集:")
    print(frequent_itemsets)
    # Generate association rules from the frequent itemsets.
    rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
    print("\n生成的关联规则:")
    print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# Run the FP-Growth example.
if __name__ == "__main__":
    fpgrowth_example()

实用技巧和建议

参数调优建议

def parameter_tuning_tips():
    """Print tuning guidance for the three association-rule thresholds.

    For each of ``min_support``, ``min_confidence`` and ``min_lift`` the
    typical range, a suggestion and the effect of the setting are printed.
    Returns nothing.
    """
    # Labels shared by every parameter, followed by one row per parameter:
    # (name, range, suggestion, effect).
    labels = ("范围", "建议", "影响")
    guide = (
        (
            "min_support",
            "0.01-0.5",
            "数据量大用较小值(0.01-0.1),数据量小用较大值(0.1-0.3)",
            "值越小,发现的规则越多,但可能包含噪声",
        ),
        (
            "min_confidence",
            "0.5-0.9",
            "一般从0.6开始尝试",
            "值越大,规则越可靠,但可能遗漏重要规则",
        ),
        (
            "min_lift",
            "1.0-3.0",
            "大于1.3表示规则有实际意义",
            "提升度>1表示正相关,<1表示负相关",
        ),
    )
    print("=== 参数调优指南 ===")
    for name, *texts in guide:
        print(f"\n{name}:")
        for label, text in zip(labels, texts):
            print(f"  {label}: {text}")
# Usage
parameter_tuning_tips()

性能优化建议

def performance_tips():
    """Print a numbered list of performance suggestions for rule mining.

    Returns nothing; the header and the six tips are written to stdout.
    """
    tips = (
        "对于大数据集,优先使用FP-Growth而非Apriori",
        "适当提高min_support可以减少计算量",
        "使用数据采样进行初步探索",
        "考虑使用数据分区策略",
        "对稀疏数据使用压缩表示方法",
        "使用并行计算框架(如Spark MLlib)处理超大数据集",
    )
    print("=== 性能优化建议 ===")
    # Reproduce the layout of an indented block: a leading blank line, each
    # tip indented by four spaces, and a final line holding only the indent.
    numbered = "".join(
        f"    {position}. {tip}\n" for position, tip in enumerate(tips, start=1)
    )
    print("\n" + numbered + "    ")
# Usage
performance_tips()

Python实现关联规则挖掘的主要步骤:

  1. 数据预处理:将交易数据转换为One-Hot编码
  2. 频繁项集挖掘:使用Apriori或FP-Growth算法
  3. 规则生成:根据频繁项集生成关联规则
  4. 规则评估:使用支持度、置信度、提升度等指标筛选规则
  5. 结果解释:解释和可视化发现的规则

选择合适的算法:

  • 小数据集:Apriori(简单易用)
  • 大数据集:FP-Growth(更高效)
  • 生产环境:优先使用mlxtend库中的实现(可靠、功能完善),而不是自行实现算法

抱歉,评论功能暂时关闭!