Python案例如何逐步优化

wen python案例 2026-06-05 85

本文目录导读：

Python案例如何逐步优化

初始版本：最直接的实现
第一次优化：使用更高效的字典方法
第二次优化：使用defaultdict
第三次优化：使用Counter类
第四次优化：处理更复杂的文本（大小写和标点）
第五次优化：性能优化（使用生成器）
第六次优化：处理大文件（流式处理）
第七次优化：并行处理
性能对比测试
优化建议

下面通过一个具体的案例，逐步演示 Python 代码的优化过程。我们以“统计文本中单词出现频率”为例。

初始版本：最直接的实现

def count_words(text):
    """Return a dict mapping each whitespace-separated word to its count.

    This is the baseline version: words are tallied with an explicit
    membership test on a plain dict. Keys appear in first-seen order.
    """
    tally = {}
    for token in text.split():
        if token not in tally:
            # First time this word is seen.
            tally[token] = 1
        else:
            tally[token] += 1
    return tally
# Quick demo: count the words of a short sample sentence and print the result.
text = "hello world hello python world python hello"
result = count_words(text)
print(result)  # {'hello': 3, 'world': 2, 'python': 2}

第一次优化：使用更高效的字典方法

def count_words_v1(text):
    """Count word frequencies using ``dict.get`` instead of a membership test.

    Args:
        text: The string to analyse; words are separated by whitespace.

    Returns:
        A dict mapping each word to the number of times it occurs.
    """
    word_count = {}
    for word in text.split():
        # get() supplies 0 for unseen words, so no explicit key check is needed.
        word_count[word] = word_count.get(word, 0) + 1
    # Return the dict built here (the previous version returned an undefined
    # local name, `result`).
    return word_count

第二次优化：使用defaultdict

from collections import defaultdict
def count_words_v2(text):
    """Count word frequencies with a ``defaultdict(int)``.

    Missing keys start at 0 automatically, so no key check is required.
    The result is converted back to a plain ``dict`` before returning.
    """
    tally = defaultdict(int)
    tokens = text.split()
    for token in tokens:
        tally[token] = tally[token] + 1
    return dict(tally)

第三次优化：使用Counter类

from collections import Counter
def count_words_v3(text):
    """Count word frequencies by handing the split words to ``Counter``.

    Returns a ``Counter`` keyed by each whitespace-separated word.
    """
    tokens = text.split()
    frequencies = Counter(tokens)
    return frequencies

第四次优化：处理更复杂的文本（大小写和标点）

import re
from collections import Counter
def count_words_v4(text):
    """Count words case-insensitively, ignoring punctuation.

    The text is lowercased, every character that is neither a word
    character nor whitespace is deleted, and what remains is split on
    whitespace and counted.
    """
    lowered = text.lower()
    # Delete (not replace) punctuation, so "don't" becomes "dont".
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    tokens = without_punctuation.split()
    return Counter(tokens)

第五次优化：性能优化（使用生成器）

import re
from collections import Counter
def count_words_v5(text):
    """Count lowercase words while producing them lazily, one at a time.

    Args:
        text: The string to analyse.

    Returns:
        A ``Counter`` mapping each lowercase word (a run of ``\\w``
        characters) to its number of occurrences.
    """
    # re.finditer yields matches lazily; re.findall would build the complete
    # word list first, which defeats the purpose of the generator expression.
    # Matches of \w+ never contain punctuation, so no stripping is needed.
    words = (match.group().lower() for match in re.finditer(r'\b\w+\b', text))
    return Counter(words)

第六次优化：处理大文件（流式处理）

import re
from collections import Counter
import mmap
def count_words_streaming(filename):
    """Count lowercase word frequencies in a UTF-8 file, chunk by chunk.

    The file is memory-mapped read-only and consumed ``chunk_size`` bytes at
    a time, so the decoded text is never held in memory all at once.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A plain ``dict`` mapping each lowercase word to its count; ``{}``
        for an empty file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Imported locally so this function does not rely on a file-level import.
    import codecs

    word_counts = Counter()
    chunk_size = 8192
    # Read-only access is enough; 'r+b' would fail on write-protected files.
    with open(filename, 'rb') as f:
        # mmap cannot map a zero-length file, so handle that case up front.
        if not f.read(1):
            return {}
        # An incremental decoder buffers a multi-byte UTF-8 sequence that is
        # split across two chunks instead of raising UnicodeDecodeError.
        decoder = codecs.getincrementaldecoder('utf-8')()
        pending_word = ""
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mmapped_file:
            for chunk in iter(lambda: mmapped_file.read(chunk_size), b''):
                text = pending_word + decoder.decode(chunk).lower()
                words = re.findall(r'\b\w+\b', text)
                # A word touching the end of the chunk may continue in the
                # next one, so hold it back instead of counting it now.
                if words and text.endswith(words[-1]):
                    pending_word = words.pop()
                else:
                    pending_word = ""
                word_counts.update(words)
        # Flush the decoder (raises on a truncated sequence at end of file)
        # and count the word that was held back from the last chunk.
        text = pending_word + decoder.decode(b'', final=True).lower()
        word_counts.update(re.findall(r'\b\w+\b', text))
    return dict(word_counts)

第七次优化：并行处理

from collections import Counter
import re
from multiprocessing import Pool
import os
def process_chunk(chunk):
    """Return a ``Counter`` of the lowercase words found in one text chunk."""
    lowered = chunk.lower()
    counts = Counter()
    counts.update(re.findall(r'\b\w+\b', lowered))
    return counts
def _read_through_whitespace(f):
    """Read from binary file *f* up to and including the next ASCII whitespace byte, or to EOF."""
    tail = bytearray()
    while True:
        byte = f.read(1)
        tail += byte
        if not byte or byte.isspace():
            return bytes(tail)


def count_words_parallel(filename, num_processes=4):
    """Count lowercase word frequencies in a UTF-8 file using worker processes.

    The file is read once, sequentially and in binary mode, and cut into at
    most ``num_processes`` pieces. Every piece except the last is extended to
    the next ASCII whitespace byte, so no word and no multi-byte UTF-8
    character is split between two pieces, and no data is skipped or read
    twice. Each piece is counted by ``process_chunk`` in a process pool and
    the partial counts are merged.

    Args:
        filename: Path of a UTF-8 encoded text file.
        num_processes: Number of pieces and worker processes (at least 1).

    Returns:
        A plain ``dict`` mapping each lowercase word to its count; ``{}``
        for an empty file.

    Raises:
        ValueError: If ``num_processes`` is smaller than 1.
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    if num_processes < 1:
        raise ValueError("num_processes must be at least 1")
    file_size = os.path.getsize(filename)
    # Nominal piece size in bytes; pieces grow slightly to end on whitespace.
    chunk_size = file_size // num_processes
    chunks = []
    with open(filename, 'rb') as f:
        for i in range(num_processes):
            if i < num_processes - 1:
                data = f.read(chunk_size)
                # ASCII whitespace bytes never occur inside a multi-byte
                # UTF-8 sequence, so cutting there is always safe.
                if data and not data[-1:].isspace():
                    data += _read_through_whitespace(f)
            else:
                # The last piece takes whatever is left.
                data = f.read()
            if data:
                chunks.append(data.decode('utf-8'))
    if not chunks:
        return {}
    # Count the pieces in parallel.
    with Pool(num_processes) as pool:
        counters = pool.map(process_chunk, chunks)
    # Merge the per-piece counts.
    final_counter = Counter()
    for c in counters:
        final_counter.update(c)
    return dict(final_counter)

性能对比测试

import time
def performance_test():
    """Time each in-memory word-count variant on the same generated text.

    Every variant is called 100 times on a 300,000-word string and the total
    elapsed time per variant is printed.
    """
    # Generate the test data.
    test_text = " ".join(["hello world python" for _ in range(100000)])
    functions = [
        ("基本版本", count_words),
        ("get优化", count_words_v1),
        ("defaultdict", count_words_v2),
        ("Counter", count_words_v3),
        ("处理标点", count_words_v4),
        ("生成器优化", count_words_v5),
    ]
    for name, func in functions:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump when the system clock
        # is adjusted.
        start_time = time.perf_counter()
        for _ in range(100):
            func(test_text)
        end_time = time.perf_counter()
        print(f"{name}: {end_time - start_time:.4f}秒")

版本	改进点	适用场景
v0	基础实现	小数据量，代码可读性优先
v1	使用get()	微优化，减少代码行数
v2	使用defaultdict	需要频繁插入字典
v3	使用Counter	通用计数场景
v4	文本清洗	真实文本处理
v5	生成器优化	内存受限场景
v6	流式处理	大文件处理
v7	并行处理	多核CPU，超大文件

优化建议

先功能，后优化：确保代码正确性
使用合适的工具：如Counter替代手动计数
考虑数据规模：不同规模有不同的优化策略
使用内置函数：Python内置函数通常比循环快
减少属性访问：将频繁访问的方法赋值给局部变量
使用profiler：先找出瓶颈再优化

# 使用cProfile进行性能分析示例
import cProfile
def profile_example():
    """Profile ``count_words_v3`` on a generated text and print the statistics."""
    text = "hello world " * 100000
    # cProfile.run() executes the statement in the __main__ namespace, where
    # this function's local `text` is not visible. runctx() takes explicit
    # namespaces, so the local text is passed in directly.
    cProfile.runctx('count_words_v3(text)', globals(), {'text': text})

通过这个案例，你可以看到如何从简单的实现逐步优化到高性能的版本。优化的关键在于理解业务场景和数据规模，选择最合适的方案。