本文目录导读:

我来介绍几种Python分割长文本的常用方法,从简单到复杂:
基础分割方法
按标点符号分割
import re
def split_by_punctuation(text, max_length=100):
    """Split *text* into chunks made of whole sentences.

    A sentence is a run of characters followed by the terminator(s) that end
    it: ``。``, ``.``, ``!``, ``?`` or the full-width ``!`` / ``?``.
    Consecutive sentences are packed into one chunk for as long as the chunk,
    terminators included, stays within ``max_length`` characters.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk.

    Returns:
        A list of chunks in their original order. Every sentence keeps its
        own terminator, and trailing text without a terminator is returned
        unchanged. Sentences are never cut, so a single sentence longer than
        ``max_length`` comes back as one over-long chunk.
    """
    # Group 1 is the sentence body, group 2 its (possibly empty) terminators.
    # \uff01 and \uff1f are the full-width "!" and "?" used in Chinese text.
    sentence_pattern = r'([^。.!?\uff01\uff1f]+)([。.!?\uff01\uff1f]*)'

    chunks = []
    current_chunk = ""
    for match in re.finditer(sentence_pattern, text):
        if not match.group(1).strip():
            # Whitespace-only fragments carry no content.
            continue
        sentence = match.group(0)
        # Measure the sentence with its terminator so the limit is honoured.
        if current_chunk and len(current_chunk) + len(sentence) > max_length:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk += sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Example: split a short text with max_length=20 and print each chunk.
text = "这是一个长文本,需要分成多个段落,每个段落包含完整句子,这样便于处理,或者用于其他用途。"
result = split_by_punctuation(text, max_length=20)
for i, chunk in enumerate(result, start=1):
    print(f"段落{i}: {chunk}")
按字数分割(简单粗暴)
def split_by_length(text, chunk_size=50):
    """Cut *text* into consecutive pieces of ``chunk_size`` characters.

    The cut positions ignore sentence and word boundaries.

    Args:
        text: The text to split.
        chunk_size: Number of characters per piece; must be positive.

    Returns:
        A list of pieces in original order. Every piece has exactly
        ``chunk_size`` characters except possibly the last one. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop the text.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    return [text[start:start + chunk_size]
            for start in range(0, len(text), chunk_size)]
text = "这是一个用于测试的长文本内容,按固定长度分割,不考虑语义完整性。" * 5
chunks = split_by_length(text, 30)
for i, chunk in enumerate(chunks, 1):
print(f"第{i}段: {chunk}")
智能分割方法
优化版:保持句子完整
def smart_split(text, max_length=100):
    """Split *text* into chunks of at most ``max_length`` characters.

    The text is first cut into sentences after each ``。``, ``.``, ``!``,
    ``?``, full-width ``!`` / ``?`` or newline. Surrounding whitespace is
    stripped from every sentence and empty sentences are dropped, so
    sentences are concatenated without any separator. Sentences are then
    packed into chunks; a sentence longer than ``max_length`` is cut at
    fixed offsets.

    Args:
        text: The text to split.
        max_length: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty chunks in original order.

    Raises:
        ValueError: If ``max_length`` is not positive.
    """
    # With a non-positive limit the hard-split loop below could never shrink
    # the sentence and would spin forever.
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")

    # \uff01 and \uff1f are the full-width "!" and "?" used in Chinese text.
    terminators = frozenset('。.!?\uff01\uff1f\n')

    # Step 1: cut the text after every terminator.
    sentences = []
    start = 0
    for index, char in enumerate(text):
        if char in terminators:
            sentence = text[start:index + 1].strip()
            if sentence:
                sentences.append(sentence)
            start = index + 1
    tail = text[start:].strip()
    if tail:
        sentences.append(tail)

    # Step 2: pack sentences into chunks.
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence
            continue
        if current_chunk:
            chunks.append(current_chunk)
        # A single sentence over the limit is cut at fixed offsets.
        while len(sentence) > max_length:
            chunks.append(sentence[:max_length])
            sentence = sentence[max_length:]
        current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk)
    return chunks
# Example: run smart_split on a multi-line text and print each chunk.
text = """
Python是一种流行的编程语言,它被广泛用于数据科学、人工智能等领域。
这个例子展示了如何智能分割长文本,保持句子的完整性很重要。
当处理大量文本时,好的分割策略可以大大提高处理效率。
"""
chunks = smart_split(text, max_length=50)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")
使用自然语言处理库
使用 jieba 分词(适合中文)
import jieba
def split_by_words(text, max_words=30):
    """Group the tokens produced by ``jieba.cut(text)`` into chunks.

    Tokens are collected in order; as soon as ``max_words`` tokens have
    accumulated they are joined without a separator and emitted as a chunk.
    Any leftover tokens form the final chunk.

    Args:
        text: The text to tokenize and split.
        max_words: Number of tokens per chunk.

    Returns:
        A list of chunk strings in original order.
    """
    pieces = []
    pending = []
    for token in jieba.cut(text):
        pending.append(token)
        if len(pending) >= max_words:
            pieces.append(''.join(pending))
            pending = []
    if pending:
        pieces.append(''.join(pending))
    return pieces
# Example: split a sentence into chunks of 8 jieba tokens and print them.
text = "Python是一种强大的编程语言,广泛应用于数据分析、机器学习和Web开发等领域。"
chunks = split_by_words(text, max_words=8)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")
使用 TextBlob(适合英文)
from textblob import TextBlob
def split_by_sentences(text, max_sentences=3):
    """Group the sentences found by TextBlob into fixed-size chunks.

    ``TextBlob(text).sentences`` supplies the sentences. Each one is
    converted with ``str()``, and every run of ``max_sentences`` consecutive
    sentences is joined with a single space.

    Args:
        text: The text to split.
        max_sentences: Number of sentences per chunk.

    Returns:
        A list of chunk strings in original order; the last chunk may hold
        fewer than ``max_sentences`` sentences.
    """
    sentence_texts = [str(sentence) for sentence in TextBlob(text).sentences]
    return [
        ' '.join(sentence_texts[start:start + max_sentences])
        for start in range(0, len(sentence_texts), max_sentences)
    ]
# English example: two sentences per chunk.
text = """
Python is a versatile language. It's used in many fields.
Data science and machine learning are popular applications.
Web development also heavily relies on Python.
"""
chunks = split_by_sentences(text, max_sentences=2)
for i, chunk in enumerate(chunks, start=1):
    print(f"段落{i}: {chunk}")
实际应用场景
处理大文件
def split_large_file(file_path, chunk_size=1000):
    """Read a UTF-8 text file and return its content as size-limited chunks.

    Whole lines are packed into a chunk while it stays within ``chunk_size``
    characters. A line longer than ``chunk_size`` is cut at fixed offsets.
    Line endings are kept, so joining the chunks reproduces the text that
    was read.

    Note that all chunks are collected in one list, so the whole file
    content ends up in memory.

    Args:
        file_path: Path of the file to read.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty chunks in file order.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # With a non-positive limit the over-long-line loop below could never
    # shrink the line and would spin forever.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    chunks = []
    chunk = ""
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if len(chunk) + len(line) <= chunk_size:
                chunk += line
                continue
            if chunk:
                chunks.append(chunk)
            # Cut a line that alone exceeds the limit.
            while len(line) > chunk_size:
                chunks.append(line[:chunk_size])
                line = line[chunk_size:]
            chunk = line
    if chunk:
        chunks.append(chunk)
    return chunks
# 使用示例
# chunks = split_large_file('large_file.txt', chunk_size=500)
API文本长度限制
def prepare_for_api(text, max_tokens=200):
    """Split *text* into chunks that fit an API token budget.

    Token usage is estimated, not measured: every character is counted as
    1.5 tokens. Text whose estimate fits the budget is returned as a single
    chunk. Otherwise it is split on sentence punctuation, and any chunk that
    is still too long is cut at fixed offsets.

    Args:
        text: The text to send.
        max_tokens: Token budget per chunk.

    Returns:
        A list of chunks in original order. When splitting is needed, no
        chunk is longer than ``int(max_tokens / 1.5)`` characters (at least
        one character).
    """
    # Rough estimate: 1.5 tokens per character.
    tokens_per_char = 1.5
    if len(text) * tokens_per_char <= max_tokens:
        return [text]

    # Never let the limit drop to 0, which would make slicing impossible.
    char_limit = max(1, int(max_tokens / tokens_per_char))
    chunks = []
    for chunk in split_by_punctuation(text, char_limit):
        # Sentence splitting does not cut over-long sentences, so enforce
        # the hard limit here.
        chunks.extend(chunk[start:start + char_limit]
                      for start in range(0, len(chunk), char_limit))
    return chunks
# Example: a long text is passed through prepare_for_api with a 100-token budget.
long_text = (
    "这是一段很长的文本" * 100
    + "需要发送给API进行处理,它超出了API的限制,因此我们需要分割它。"
)
chunks = prepare_for_api(long_text, max_tokens=100)
print(f"被分割为{len(chunks)}个段落")
完整工具类
class TextSplitter:
    """Text splitting helper that selects a strategy by name.

    Supported ``method`` values:
        'simple'       fixed-length slices of ``max_length`` characters
        'smart'        delegates to ``smart_split``
        'punctuation'  delegates to ``split_by_punctuation``
    """

    def __init__(self, method='smart', max_length=100):
        """Store the strategy name and the per-chunk length limit."""
        self.method = method
        self.max_length = max_length

    def split(self, text):
        """Split *text* with the configured strategy.

        Returns:
            The list of chunks produced by the selected strategy.

        Raises:
            ValueError: If ``self.method`` is not a supported strategy name.
        """
        strategies = {
            'simple': self._simple_split,
            'smart': self._smart_split,
            'punctuation': self._punctuation_split,
        }
        try:
            strategy = strategies[self.method]
        except KeyError:
            # Fail loudly instead of returning None for a mistyped name.
            raise ValueError(
                f"unknown split method {self.method!r}; "
                f"expected one of {sorted(strategies)}"
            ) from None
        return strategy(text)

    def _simple_split(self, text):
        """Cut *text* into consecutive slices of ``max_length`` characters."""
        return [text[i:i + self.max_length]
                for i in range(0, len(text), self.max_length)]

    def _smart_split(self, text):
        """Split via the module-level ``smart_split``."""
        return smart_split(text, self.max_length)

    def _punctuation_split(self, text):
        """Split via the module-level ``split_by_punctuation``."""
        return split_by_punctuation(text, self.max_length)
# Usage: build a splitter, split a text and print the numbered chunks.
splitter = TextSplitter(method='smart', max_length=50)
text = "这是测试文本,需要分割成段落,保持语义完整,方便后续处理。"
result = splitter.split(text)
print("\n".join([f"{i}: {chunk}" for i, chunk in enumerate(result, start=1)]))
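选择哪种方法取决于你的具体需求:
- 简单分割:按固定长度切分(split_by_length),实现最简单,但可能把句子从中间截断
- 语义分割:保持句子完整(split_by_punctuation、smart_split)
- 专业分割:使用 NLP 工具(基于 jieba 的 split_by_words、基于 TextBlob 的 split_by_sentences)
建议根据实际应用场景选择合适的方法。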