Python案例如何分割长文本内容？

wen python案例 2026-06-08 58

本文目录导读：

Python案例如何分割长文本内容？

基础分割方法
智能分割方法
使用自然语言处理库
实际应用场景
完整工具类

下面介绍几种用 Python 分割长文本的常用方法，从简单到复杂：

基础分割方法

按标点符号分割

import re
def split_by_punctuation(text, max_length=100):
    """Pack whole sentences into chunks of at most ``max_length`` characters.

    A sentence is a run of text followed by its terminating punctuation
    (any of ``。！？.!?``). The original terminators and inner whitespace are
    kept exactly as written; nothing is substituted or appended. Segments
    that contain only whitespace are dropped.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of chunks. Sentences are kept intact whenever they fit; a
        single sentence longer than ``max_length`` is cut into
        ``max_length``-sized pieces so that no chunk exceeds the limit.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    terminators = '。！？.!?'
    # Each match is a sentence body plus the terminator(s) that close it, so
    # the punctuation the author wrote travels with its sentence.
    sentences = re.findall(r'[^。！？.!?]+[。！？.!?]*', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Skip segments whose body is only whitespace.
        if not sentence.rstrip(terminators).strip():
            continue
        # The sentence already includes its terminator, so this comparison
        # accounts for every character that will end up in the chunk.
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # A sentence that cannot fit in any chunk is cut by length.
        while len(sentence) > max_length:
            chunks.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Example: split a sample sentence with max_length=20 and print each chunk
text = "这是一个长文本，需要分成多个段落，每个段落包含完整句子，这样便于处理，或者用于其他用途。"
result = split_by_punctuation(text, max_length=20)
for i, chunk in enumerate(result, start=1):
    print(f"段落{i}: {chunk}")

按字数分割（简单粗暴）

def split_by_length(text, chunk_size=50):
    """Cut text into consecutive slices of ``chunk_size`` characters.

    Sentence and word boundaries are ignored; the last slice may be shorter.

    Args:
        text: The text to split.
        chunk_size: Number of characters per slice (must be >= 1).

    Returns:
        A list of slices whose concatenation equals ``text``; an empty list
        for empty text.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently drop the text,
    # so reject non-positive sizes explicitly.
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)]
# Example: repeat a sample sentence five times and cut it into 30-character slices
text = "这是一个用于测试的长文本内容，按固定长度分割，不考虑语义完整性。" * 5
chunks = split_by_length(text, chunk_size=30)
for i, chunk in enumerate(chunks, start=1):
    print(f"第{i}段: {chunk}")

智能分割方法

优化版：保持句子完整

def smart_split(text, max_length=100):
    """Split text into chunks of at most ``max_length`` characters, keeping sentences whole where possible.

    The text is first cut into sentences at any of ``。！？.!?`` or a newline;
    each sentence keeps its terminator and is stripped of surrounding
    whitespace (so whitespace between sentences is not preserved). Sentences
    are then packed greedily into chunks.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of chunks. A single sentence longer than ``max_length`` is cut
        into ``max_length``-sized pieces.

    Raises:
        ValueError: If ``max_length`` is smaller than 1. Without this check
            the forced-split loop below would never make progress.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # Pass 1: cut the text into stripped sentences.
    sentences = []
    buffer = []
    for char in text:
        buffer.append(char)
        if char in '。！？.!?\n':
            sentence = ''.join(buffer).strip()
            if sentence:
                sentences.append(sentence)
            buffer = []
    tail = ''.join(buffer).strip()
    if tail:
        sentences.append(tail)

    # Pass 2: pack sentences into chunks.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # A sentence that exceeds the limit on its own is cut by length.
        while len(sentence) > max_length:
            chunks.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Example: a multi-line sample passed through smart_split with max_length=50
text = """
Python是一种流行的编程语言，它被广泛用于数据科学、人工智能等领域。
这个例子展示了如何智能分割长文本，保持句子的完整性很重要。
当处理大量文本时，好的分割策略可以大大提高处理效率。
"""
chunks = smart_split(text, max_length=50)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")

使用自然语言处理库

使用 jieba 分词（适合中文）

import jieba
def split_by_words(text, max_words=30):
    """Group the tokens produced by ``jieba.cut`` into chunks of ``max_words`` tokens.

    Tokens are joined back together without a separator; the final chunk
    holds whatever tokens remain.
    """
    chunks = []
    pending = []
    for token in jieba.cut(text):
        pending.append(token)
        if len(pending) < max_words:
            continue
        # The group is full: flush it and start a new one.
        chunks.append(''.join(pending))
        pending = []
    if pending:
        chunks.append(''.join(pending))
    return chunks
# Example: group a Chinese sentence into chunks of 8 tokens
text = "Python是一种强大的编程语言，广泛应用于数据分析、机器学习和Web开发等领域。"
chunks = split_by_words(text, max_words=8)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")

使用 TextBlob（适合英文）

from textblob import TextBlob
def split_by_sentences(text, max_sentences=3):
    """Group the sentences found by TextBlob into chunks of ``max_sentences`` sentences.

    Sentences inside a chunk are joined with a single space.
    """
    sentences = list(TextBlob(text).sentences)
    return [
        ' '.join(str(s) for s in sentences[start:start + max_sentences])
        for start in range(0, len(sentences), max_sentences)
    ]
# English example: two sentences per chunk
text = """
Python is a versatile language. It's used in many fields. 
Data science and machine learning are popular applications. 
Web development also heavily relies on Python.
"""
chunks = split_by_sentences(text, max_sentences=2)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")

实际应用场景

处理大文件

def split_large_file(file_path, chunk_size=1000):
    """Read a UTF-8 text file and group its lines into chunks of at most ``chunk_size`` characters.

    Lines (including their newline characters) are appended to the current
    chunk while they fit. A line longer than ``chunk_size`` is cut into
    ``chunk_size``-sized pieces.

    Args:
        file_path: Path of the file to read.
        chunk_size: Maximum number of characters per chunk (must be >= 1).

    Returns:
        A list of chunks; an empty list for an empty file. All chunks are
        held in memory at once.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check
            the long-line loop below would never make progress.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    chunks = []
    with open(file_path, 'r', encoding='utf-8') as f:
        chunk = ""
        for line in f:
            if len(chunk) + len(line) <= chunk_size:
                chunk += line
                continue
            if chunk:
                chunks.append(chunk)
            # Cut an over-long line by length; the remainder starts the next chunk.
            while len(line) > chunk_size:
                chunks.append(line[:chunk_size])
                line = line[chunk_size:]
            chunk = line
        if chunk:
            chunks.append(chunk)
    return chunks
# 使用示例
# chunks = split_large_file('large_file.txt', chunk_size=500)

API文本长度限制

def prepare_for_api(text, max_tokens=200, tokens_per_char=1.5):
    """Split text into chunks whose estimated token count fits ``max_tokens``.

    Token usage is estimated as ``len(text) * tokens_per_char``. This is a
    rough heuristic, not a real tokenizer; the default assumes about 1.5
    tokens per character.

    Args:
        text: The text to send.
        max_tokens: Token budget for a single chunk.
        tokens_per_char: Estimated number of tokens consumed per character
            (must be > 0).

    Returns:
        ``[text]`` when the whole text fits the budget; otherwise a list of
        chunks, each no longer than ``int(max_tokens / tokens_per_char)``
        characters.

    Raises:
        ValueError: If ``tokens_per_char`` is not positive, or if the budget
            is too small to hold even one character of a text that needs
            splitting.
    """
    if tokens_per_char <= 0:
        raise ValueError("tokens_per_char must be positive")
    estimated_tokens = len(text) * tokens_per_char
    if estimated_tokens <= max_tokens:
        return [text]
    # Convert the token budget into a character budget.
    char_limit = int(max_tokens / tokens_per_char)
    if char_limit < 1:
        raise ValueError("max_tokens is too small to hold a single character")
    chunks = []
    for piece in split_by_punctuation(text, char_limit):
        # Re-check every piece against the limit so the budget holds no
        # matter how the sentence splitter treats an oversized sentence.
        chunks.extend(piece[start:start + char_limit]
                      for start in range(0, len(piece), char_limit))
    return chunks
# Example: build a long text and report how many chunks it is split into
long_text = (
    "这是一段很长的文本" * 100
    + "需要发送给API进行处理，它超出了API的限制，因此我们需要分割它。"
)
chunks = prepare_for_api(long_text, max_tokens=100)
print(f"被分割为{len(chunks)}个段落")

完整工具类

class TextSplitter:
    """Facade that selects one of the splitting strategies by name.

    Supported ``method`` values:
        * ``'simple'``: fixed-length slices.
        * ``'smart'``: delegates to ``smart_split``.
        * ``'punctuation'``: delegates to ``split_by_punctuation``.
    """

    def __init__(self, method='smart', max_length=100):
        """Store the strategy name and the maximum chunk length."""
        self.method = method
        self.max_length = max_length

    def split(self, text):
        """Split ``text`` with the configured strategy and return the chunks.

        Raises:
            ValueError: If ``self.method`` is not a supported strategy name
                (previously this case silently returned ``None``).
        """
        strategies = {
            'simple': self._simple_split,
            'smart': self._smart_split,
            'punctuation': self._punctuation_split,
        }
        try:
            strategy = strategies[self.method]
        except KeyError:
            supported = ", ".join(repr(name) for name in strategies)
            raise ValueError(
                f"unknown split method {self.method!r}; expected one of {supported}"
            ) from None
        return strategy(text)

    def _simple_split(self, text):
        """Cut ``text`` into consecutive slices of ``self.max_length`` characters."""
        return [text[i:i+self.max_length]
                for i in range(0, len(text), self.max_length)]

    def _smart_split(self, text):
        """Delegate to ``smart_split`` with ``self.max_length``."""
        return smart_split(text, self.max_length)

    def _punctuation_split(self, text):
        """Delegate to ``split_by_punctuation`` with ``self.max_length``."""
        return split_by_punctuation(text, self.max_length)
# Usage: split a sample sentence with the 'smart' strategy and print numbered chunks
splitter = TextSplitter(method='smart', max_length=50)
text = "这是测试文本，需要分割成段落，保持语义完整，方便后续处理。"
result = splitter.split(text)
numbered = [f"{i}: {chunk}" for i, chunk in enumerate(result, start=1)]
print("\n".join(numbered))

选择哪种方法取决于你的具体需求：

简单分割：按固定长度
语义分割：保持句子完整
专业分割：使用NLP工具

建议根据实际应用场景选择合适的方法。