Python案例如何分割长文本内容?

wen python案例 18

本文目录导读:

Python案例如何分割长文本内容?

  1. 基础分割方法
  2. 智能分割方法
  3. 使用自然语言处理库
  4. 实际应用场景
  5. 完整工具类

我来介绍几种Python分割长文本的常用方法,从简单到复杂:

基础分割方法

按标点符号分割

import re
def split_by_punctuation(text, max_length=100):
    """Split text at sentence-ending punctuation and pack sentences into chunks.

    Sentences end at a run of ``。``, ``!``, ``?``, ``.`` or the full-width
    ``!`` / ``?`` (Chinese text normally uses the full-width forms). Each
    sentence keeps its own terminator; nothing is substituted or invented
    for text that has no trailing punctuation. Whitespace-only sentences
    are dropped.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk (>= 1).

    Returns:
        A list of chunks, each at most ``max_length`` characters long.
        Sentences are never merged across the limit; a single sentence that
        is longer than ``max_length`` is cut into ``max_length``-sized pieces.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # The capturing group makes re.split() return the terminators as well:
    # [body, terminator, body, terminator, ..., body].
    parts = re.split(r'([。!?.\uFF01\uFF1F]+)', text)
    bodies = parts[0::2]
    # The final body has no terminator after it; pad so zip() keeps it.
    terminators = parts[1::2] + [""]
    sentences = [
        body + terminator
        for body, terminator in zip(bodies, terminators)
        if body.strip()
    ]

    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # A sentence that cannot fit in any chunk is cut at the hard limit.
        while len(sentence) > max_length:
            chunks.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Demo: run split_by_punctuation on a sample sentence with max_length=20.
text = "这是一个长文本,需要分成多个段落,每个段落包含完整句子,这样便于处理,或者用于其他用途。"
result = split_by_punctuation(text, 20)
for idx, piece in enumerate(result, start=1):
    print(f"段落{idx}: {piece}")

按字数分割(简单粗暴)

def split_by_length(text, chunk_size=50):
    """Cut text into consecutive slices of ``chunk_size`` characters.

    Sentence and word boundaries are ignored. The last slice holds whatever
    remains and may be shorter than ``chunk_size``.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces
# Demo: fixed-width slicing of a repeated sample sentence, 30 characters each.
text = "这是一个用于测试的长文本内容,按固定长度分割,不考虑语义完整性。" * 5
chunks = split_by_length(text, chunk_size=30)
for idx, piece in enumerate(chunks, start=1):
    print(f"第{idx}段: {piece}")

智能分割方法

优化版:保持句子完整

def smart_split(text, max_length=100):
    """Split text into chunks, keeping sentences whole where possible.

    The text is first cut into sentences at ``。``, ``!``, ``?``, ``.``, the
    full-width ``!`` / ``?`` and at newlines. Each sentence is stripped of
    surrounding whitespace and empty ones are dropped. Sentences are then
    concatenated, without any separator, into chunks of at most
    ``max_length`` characters.

    Note: because sentences are stripped and joined directly, whitespace
    between sentences (including line breaks) does not appear in the output.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk (>= 1).

    Returns:
        A list of chunks. A sentence longer than ``max_length`` is cut into
        ``max_length``-sized pieces.

    Raises:
        ValueError: If ``max_length`` is smaller than 1. Without this check
            the forced-split loop below would never terminate.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    terminators = '。!?.\uff01\uff1f\n'

    # Step 1: cut the text into stripped, non-empty sentences. Slicing from
    # the last boundary avoids rebuilding a string one character at a time.
    sentences = []
    start = 0
    for pos, char in enumerate(text):
        if char in terminators:
            sentence = text[start:pos + 1].strip()
            if sentence:
                sentences.append(sentence)
            start = pos + 1
    tail = text[start:].strip()
    if tail:
        sentences.append(tail)

    # Step 2: pack sentences into chunks that respect max_length.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # A single sentence longer than the limit is cut at the hard limit.
        while len(sentence) > max_length:
            chunks.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Demo: pack a three-sentence Chinese paragraph with max_length=50.
text = """
Python是一种流行的编程语言,它被广泛用于数据科学、人工智能等领域。
这个例子展示了如何智能分割长文本,保持句子的完整性很重要。
当处理大量文本时,好的分割策略可以大大提高处理效率。
"""
chunks = smart_split(text, 50)
for idx, piece in enumerate(chunks, start=1):
    print(f"段落{idx}: {piece}")

使用自然语言处理库

使用 jieba 分词(适合中文)

import jieba
def split_by_words(text, max_words=30):
    """Group the tokens produced by ``jieba.cut(text)`` into chunks.

    Tokens are collected in order; as soon as ``max_words`` tokens have
    accumulated they are concatenated (no separator) into one chunk. Any
    tokens left over at the end form a final, shorter chunk.
    """
    pieces = []
    buffer = []
    for token in jieba.cut(text):
        buffer.append(token)
        if len(buffer) < max_words:
            continue
        # Buffer is full: flush it as one chunk and start a new one.
        pieces.append(''.join(buffer))
        buffer = []
    if buffer:
        pieces.append(''.join(buffer))
    return pieces
# Demo: word-based chunking of a Chinese sentence, 8 tokens per chunk.
text = "Python是一种强大的编程语言,广泛应用于数据分析、机器学习和Web开发等领域。"
chunks = split_by_words(text, 8)
for idx, piece in enumerate(chunks, start=1):
    print(f"段落{idx}: {piece}")

使用 TextBlob(适合英文)

from textblob import TextBlob
def split_by_sentences(text, max_sentences=3):
    """Group the sentences TextBlob finds in ``text`` into chunks.

    Every chunk is made of up to ``max_sentences`` consecutive sentences,
    converted to ``str`` and joined with a single space. The last chunk may
    contain fewer sentences.
    """
    sentence_texts = [str(sentence) for sentence in TextBlob(text).sentences]
    return [
        ' '.join(sentence_texts[start:start + max_sentences])
        for start in range(0, len(sentence_texts), max_sentences)
    ]
# Demo (English text): group sentences two at a time.
text = """
Python is a versatile language. It's used in many fields. 
Data science and machine learning are popular applications. 
Web development also heavily relies on Python.
"""
chunks = split_by_sentences(text, 2)
for idx, piece in enumerate(chunks, start=1):
    print(f"段落{idx}: {piece}")

实际应用场景

处理大文件

def split_large_file(file_path, chunk_size=1000):
    """Read a UTF-8 text file and group its lines into chunks.

    Lines are read one at a time and appended to the current chunk while
    the chunk stays within ``chunk_size`` characters. Lines keep their
    trailing newline characters, so concatenating the returned chunks
    reproduces the text that was read.

    Args:
        file_path: Path of the file to read.
        chunk_size: Maximum number of characters per chunk (>= 1).

    Returns:
        A list of chunks, each at most ``chunk_size`` characters long. A
        line longer than ``chunk_size`` is cut into ``chunk_size``-sized
        pieces. All chunks are held in memory at once.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check
            the over-long-line loop below would never terminate.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    with open(file_path, 'r', encoding='utf-8') as f:
        chunk = ""
        for line in f:
            if len(chunk) + len(line) <= chunk_size:
                chunk += line
                continue
            if chunk:
                chunks.append(chunk)
            # A line that cannot fit in any chunk is cut at the hard limit.
            while len(line) > chunk_size:
                chunks.append(line[:chunk_size])
                line = line[chunk_size:]
            chunk = line
        if chunk:
            chunks.append(chunk)
    return chunks
# 使用示例
# chunks = split_large_file('large_file.txt', chunk_size=500)

API文本长度限制

def prepare_for_api(text, max_tokens=200):
    """Return ``text`` as a list of chunks sized for a token-limited API.

    The token count is only a rough estimate: the code assumes 1.5 tokens
    per character. Text whose estimate fits within ``max_tokens`` is
    returned unchanged as a one-element list. Otherwise it is handed to
    ``split_by_punctuation`` with a character limit of
    ``int(max_tokens / 1.5)``.
    """
    tokens_per_char = 1.5
    if len(text) * tokens_per_char > max_tokens:
        # Over budget: convert the token budget into a character budget.
        return split_by_punctuation(text, int(max_tokens / tokens_per_char))
    return [text]
# Demo: a long repeated string whose token estimate exceeds max_tokens=100.
long_text = "这是一段很长的文本" * 100 + "需要发送给API进行处理,它超出了API的限制,因此我们需要分割它。"
chunks = prepare_for_api(long_text, 100)
print(f"被分割为{len(chunks)}个段落")

完整工具类

class TextSplitter:
    """Text-splitting helper that selects a strategy by name.

    Supported ``method`` values:
        'simple'       fixed-length slices of ``max_length`` characters
        'smart'        delegates to ``smart_split``
        'punctuation'  delegates to ``split_by_punctuation``
    """

    def __init__(self, method='smart', max_length=100):
        # Strategy name; checked each time split() is called.
        self.method = method
        # Character limit passed on to the selected strategy.
        self.max_length = max_length

    def split(self, text):
        """Split ``text`` with the configured strategy and return the chunks.

        Raises:
            ValueError: If ``self.method`` is not one of the supported names
                (previously this case silently returned ``None``).
        """
        if self.method == 'simple':
            return self._simple_split(text)
        if self.method == 'smart':
            return self._smart_split(text)
        if self.method == 'punctuation':
            return self._punctuation_split(text)
        raise ValueError(
            f"unknown split method {self.method!r}; "
            "expected 'simple', 'smart' or 'punctuation'"
        )

    def _simple_split(self, text):
        """Cut ``text`` into consecutive slices of ``max_length`` characters."""
        return [text[i:i+self.max_length]
                for i in range(0, len(text), self.max_length)]

    def _smart_split(self, text):
        """Split with the module-level ``smart_split`` function."""
        return smart_split(text, self.max_length)

    def _punctuation_split(self, text):
        """Split with the module-level ``split_by_punctuation`` function."""
        return split_by_punctuation(text, self.max_length)
# Usage: build a splitter with the 'smart' strategy and print numbered chunks.
splitter = TextSplitter(max_length=50, method='smart')
text = "这是测试文本,需要分割成段落,保持语义完整,方便后续处理。"
result = splitter.split(text)
print("\n".join(f"{idx}: {piece}" for idx, piece in enumerate(result, start=1)))

选择哪种方法取决于你的具体需求:

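  • 简单分割:按固定长度切分(split_by_length),实现最简单,但可能把句子从中间截断
  • 语义分割:按句子边界切分(split_by_punctuation、smart_split),尽量保持句子完整
  • 专业分割:借助NLP工具(jieba 分词、TextBlob 分句),按词数或句子数量切分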

建议根据实际应用场景选择合适的方法。

抱歉,评论功能暂时关闭!