如何用Python案例实现文本分类?

wen python案例 4

本文目录导读:

如何用Python案例实现文本分类?

  1. 基础案例:使用朴素贝叶斯分类新闻
  2. 实战案例:情感分析(好评/差评)
  3. 进阶案例:使用深度学习(PyTorch)
  4. 完整项目:垃圾短信分类
  5. 使用建议

我来介绍几个用Python实现文本分类的实用案例,从简单到复杂逐步展开。

基础案例:使用朴素贝叶斯分类新闻

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Toy corpus: eight news headlines, each paired with its category label.
data = {
    'text': [
        '苹果发布新款iPhone手机',
        '华为推出5G折叠屏手机',
        '中国男足击败日本队',
        'NBA总决赛湖人夺冠',
        '人工智能改变生活方式',
        '量子计算最新突破',
        '长三角经济快速发展',
        '央行调整存款利率'
    ],
    'category': ['科技', '科技', '体育', '体育', '科技', '科技', '经济', '经济']
}
df = pd.DataFrame(data)
# Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from the training rows only (no test-set leakage).
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=0.3, random_state=42
)
# Feature extraction.
# Chinese is written without spaces, so the default word tokenizer would turn
# each whole headline into a single token and unseen text would share no
# features with the training data. Character 1- and 2-grams need no external
# word segmenter and let headlines overlap on shared characters.
vectorizer = TfidfVectorizer(
    max_features=1000,
    analyzer='char',
    ngram_range=(1, 2)
)
X_train = vectorizer.fit_transform(text_train)
# Only transform the held-out rows; never re-fit on them.
X_test = vectorizer.transform(text_test)
# Train the model.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)
# Predict on the held-out rows.
y_pred = classifier.predict(X_test)
# Evaluate. With so few test rows the per-class numbers are illustrative only.
print("分类报告:")
print(classification_report(y_test, y_pred))
# Classify unseen headlines with the already-fitted vectorizer.
new_texts = ['北京冬奥会中国获金牌', '特斯拉股价上涨']
X_new = vectorizer.transform(new_texts)
predictions = classifier.predict(X_new)
print("\n新文本预测结果:")
for text, pred in zip(new_texts, predictions):
    print(f"'{text}' -> {pred}")

实战案例:情感分析(好评/差评)

import nltk
import re
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Download the NLTK 'stopwords' corpus. This runs every time the module is
# executed or imported.
# NOTE(review): presumably needs network access on first use — confirm for
# offline environments.
nltk.download('stopwords')
class TextClassifier:
    """TF-IDF + logistic-regression classifier for short Chinese/English texts.

    Every public method first runs ``preprocess`` on its input, so training
    and inference always see text cleaned in the same way.

    Features are character 1- and 2-grams rather than words. Chinese has no
    spaces between words, and ``preprocess`` also strips punctuation, so the
    default word tokenizer would see each whole sentence as one token and no
    two different sentences would ever share a feature. No stop-word list is
    passed because scikit-learn only applies ``stop_words`` when the analyzer
    is ``'word'``.
    """

    # Matches every character that is NOT a CJK ideograph in U+4E00-U+9FA5,
    # an ASCII letter, or whitespace. Compiled once and shared by instances.
    _NON_TEXT = re.compile(r'[^\u4e00-\u9fa5a-zA-Z\s]')

    def __init__(self):
        """Build the (unfitted) vectorizer -> classifier pipeline."""
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(
                max_features=5000,
                analyzer='char',
                ngram_range=(1, 2)
            )),
            ('classifier', LogisticRegression(
                C=1.0,
                max_iter=1000
            ))
        ])

    def preprocess(self, texts):
        """Return a new list with each text cleaned and lower-cased.

        Args:
            texts: Iterable of strings.

        Returns:
            A list of strings in the same order, with digits, punctuation
            and other symbols removed and ASCII letters lower-cased.
        """
        return [self._NON_TEXT.sub('', text).lower() for text in texts]

    def train(self, texts, labels):
        """Fit the pipeline on ``texts`` and their ``labels``.

        Args:
            texts: Iterable of raw strings.
            labels: Sequence of class labels aligned with ``texts``.
        """
        self.pipeline.fit(self.preprocess(texts), labels)
        print("模型训练完成!")

    def predict(self, texts):
        """Return the predicted label for each raw text."""
        return self.pipeline.predict(self.preprocess(texts))

    def predict_proba(self, texts):
        """Return the per-class probabilities for each raw text."""
        return self.pipeline.predict_proba(self.preprocess(texts))
# Example: sentiment analysis of movie reviews
def sentiment_demo():
    """Train a TextClassifier on eight labelled reviews and score three new ones.

    Prints one line per test review with the predicted sentiment and the
    highest class probability expressed as a percentage.
    """
    # Labelled training reviews: 1 = positive, 0 = negative.
    labelled_reviews = [
        ('这部电影太精彩了,演员演技很棒', 1),
        ('剧情很无聊,浪费了两个小时', 0),
        ('非常感人的故事,推荐观看', 1),
        ('特效很烂,剧情逻辑不通', 0),
        ('值得反复观看的好电影', 1),
        ('导演水平太差,完全不推荐', 0),
        ('画面精美,配乐动听', 1),
        ('演员表演极其做作,看不下去', 0),
    ]
    train_texts = [review for review, _ in labelled_reviews]
    train_labels = [label for _, label in labelled_reviews]

    # Build and fit the classifier.
    classifier = TextClassifier()
    classifier.train(train_texts, train_labels)

    # Reviews to score after training.
    test_texts = ['非常棒的观影体验', '这部电影差强人意', '演技浮夸,剧情老套']
    predictions = classifier.predict(test_texts)
    probabilities = classifier.predict_proba(test_texts)

    print("\n情感分析结果:")
    for index, text in enumerate(test_texts):
        if predictions[index] == 1:
            sentiment = '好评'
        else:
            sentiment = '差评'
        # Confidence is the largest class probability, as a percentage.
        confidence = max(probabilities[index]) * 100
        print(f"'{text}' -> {sentiment} (置信度: {confidence:.1f}%)")


if __name__ == "__main__":
    sentiment_demo()

进阶案例:使用深度学习(PyTorch)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on demand.

    Args:
        texts: Indexable collection of texts (each is passed through ``str``).
        labels: Indexable collection of integer class ids aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')``; its
            result must be indexable by ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length passed to the tokenizer for truncation and padding.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened token tensors and the label for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # The tokenizer output is flattened to 1-D for each per-item tensor.
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample
class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: Number of output classes (size of the logits vector).
        pretrained_name: Checkpoint name or path handed to
            ``BertModel.from_pretrained``. Defaults to ``'bert-base-chinese'``.
        dropout: Dropout probability applied to the encoder's pooled output
            before the linear layer. Defaults to ``0.1``.
    """

    def __init__(self, n_classes, pretrained_name='bert-base-chinese', dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        # Input width is read from the loaded encoder's config, so the head
        # matches whichever checkpoint was requested.
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the unnormalized class logits for a batch.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Mask tensor forwarded to the encoder.

        Returns:
            The linear layer's output computed from the encoder's
            ``pooler_output`` after dropout.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled = self.dropout(outputs.pooler_output)
        return self.classifier(pooled)
def train_deep_learning_model():
    """Fine-tune a BERT classifier on a four-sentence toy set, then classify one text.

    Trains for three epochs with AdamW (lr=2e-5) and cross-entropy loss,
    printing the mean batch loss after each epoch, and finally prints the
    predicted class of a single unseen sentence. Runs on CUDA when available,
    otherwise on CPU.
    """
    # One sequence length shared by the training dataset and the inference
    # call, so both are tokenized identically.
    max_length = 128

    # Tokenizer and model.
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BERTClassifier(n_classes=2)  # binary classification

    # Toy data.
    texts = [
        '苹果发布新款iPhone手机',
        '华为推出5G折叠屏手机',
        '这部电影太精彩了',
        '剧情很无聊,浪费时间'
    ]
    labels = [1, 1, 0, 0]  # by assumption: 1 = technology, 0 = not technology

    # Dataset and loader.
    dataset = TextClassificationDataset(texts, labels, tokenizer, max_length=max_length)
    dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    # Training setup.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)

    # Training loop.
    model.train()
    for epoch in range(3):
        total_loss = 0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Separate name so the per-batch tensor does not overwrite the
            # `labels` list the dataset was built from.
            batch_labels = batch['label'].to(device)
            optimizer.zero_grad()
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, batch_labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}')
    print("深度学习模型训练完成!")

    # Inference example: eval mode disables dropout, no_grad skips autograd.
    model.eval()
    test_text = "三星发布新款手机"
    encoding = tokenizer(
        test_text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    with torch.no_grad():
        input_ids = encoding['input_ids'].to(device)
        attention_mask = encoding['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask)
        prediction = torch.argmax(outputs, dim=1)
    print(f"预测类别: {'科技' if prediction[0] == 1 else '非科技'}")
if __name__ == "__main__":
    # Banner for the naive Bayes example; no call follows it inside this guard.
    print("=== 基础案例:朴素贝叶斯分类 ===")
    # Remaining demos: print each banner, then run the matching function.
    for banner, run_demo in (
        ("\n=== 情感分析案例 ===", sentiment_demo),
        ("\n=== 深度学习案例 ===", train_deep_learning_model),
    ):
        print(banner)
        run_demo()

完整项目:垃圾短信分类

import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
class SpamClassifier:
    """Binary spam detector: TF-IDF character n-grams fed to a linear SVM.

    Labels are expected to be ``0`` (normal message) and ``1`` (spam);
    ``evaluate`` and ``predict`` both rely on that encoding.

    Character 1- and 2-grams are used because Chinese text has no spaces.
    With the default word tokenizer almost every clause becomes a unique
    token, and ``min_df=2`` would then prune the whole vocabulary and make
    ``fit_transform`` raise "no terms remain".
    """

    def __init__(self):
        """Create the unfitted vectorizer and SVM."""
        self.vectorizer = TfidfVectorizer(
            max_features=3000,
            analyzer='char',
            ngram_range=(1, 2),
            min_df=2,
            max_df=0.8
        )
        # probability=True is required for predict_proba in `predict`.
        self.classifier = SVC(kernel='linear', probability=True)

    def train(self, X_train, y_train):
        """Fit the vectorizer and the SVM on the training messages.

        Args:
            X_train: Sequence of raw message strings.
            y_train: Sequence of 0/1 labels aligned with ``X_train``.
        """
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        self.classifier.fit(X_train_tfidf, y_train)
        print(f"训练完成,训练集大小: {len(X_train)}")

    def evaluate(self, X_test, y_test):
        """Print accuracy and the confusion matrix, plot it, and return accuracy.

        Args:
            X_test: Sequence of raw message strings.
            y_test: Sequence of 0/1 labels aligned with ``X_test``.

        Returns:
            The accuracy on the given test set.
        """
        X_test_tfidf = self.vectorizer.transform(X_test)
        y_pred = self.classifier.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        # Fix the label order so the matrix is always 2x2. Without `labels`,
        # a test set containing a single class yields a 1x1 matrix and the
        # indexing below raises IndexError.
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
        print(f"准确率: {accuracy:.2%}")
        print("\n混淆矩阵:")
        print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
        print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")
        # Visualize the confusion matrix.
        # NOTE(review): the Chinese title and axis labels presumably need a
        # CJK-capable matplotlib font to render — confirm in the target
        # environment.
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        return accuracy

    def predict(self, texts):
        """Classify new messages.

        Args:
            texts: Sequence of raw message strings.

        Returns:
            A list with one dict per message, holding ``'text'`` (the input
            string), ``'is_spam'`` (``bool`` of the predicted label) and
            ``'confidence'`` (the largest class probability).
        """
        texts_tfidf = self.vectorizer.transform(texts)
        predictions = self.classifier.predict(texts_tfidf)
        probabilities = self.classifier.predict_proba(texts_tfidf)
        return [
            {
                'text': text,
                'is_spam': bool(pred),
                'confidence': max(prob)
            }
            for text, pred, prob in zip(texts, predictions, probabilities)
        ]
# Usage example
def spam_detection_demo():
    """Train a SpamClassifier on six toy messages and classify three new ones.

    Splits the labelled messages into train and test parts, trains, evaluates
    on the held-out part when it is non-empty, and prints a verdict with the
    reported confidence for each new message.
    """
    # Toy data.
    ham_texts = [
        '明天下午三点开会,请准时参加',
        '晚上一起吃饭吧',
        '作业已经提交,请查收'
    ]
    spam_texts = [
        '恭喜您中奖了!点击链接领取奖品',
        '只需支付99元,即可获得万元大礼包',
        '您的账户异常,请立即点击链接验证'
    ]
    all_texts = ham_texts + spam_texts
    all_labels = [0, 0, 0, 1, 1, 1]  # 0 = normal, 1 = spam

    # Stratify the split. With only six samples a plain random split can put
    # a single class into the test set, which makes the evaluation
    # meaningless.
    X_train, X_test, y_train, y_test = train_test_split(
        all_texts, all_labels, test_size=0.3, random_state=42,
        stratify=all_labels
    )

    # Train the classifier.
    classifier = SpamClassifier()
    classifier.train(X_train, y_train)

    # Evaluate only when there is held-out data.
    if X_test:
        classifier.evaluate(X_test, y_test)

    # Classify unseen messages.
    new_messages = [
        '亲爱的用户,您的积分即将过期',
        '项目进度汇报:已完成80%',
        '加微信领红包,速速联系'
    ]
    results = classifier.predict(new_messages)
    print("\n短信检测结果:")
    for result in results:
        status = "🚫 垃圾短信" if result['is_spam'] else "✅ 正常短信"
        print(f"[{status}] {result['text']} (确信度: {result['confidence']:.1%})")


if __name__ == "__main__":
    print("=== 垃圾短信分类器 ===")
    spam_detection_demo()

使用建议

  1. 数据准备:确保每个类别都有足够的标注数据(建议每类不少于100条)
  2. 特征选择:根据文本语言选择合适的分词工具(例如中文词与词之间没有空格,应先用 jieba 等工具分词,或改用字符级 n-gram 特征;否则 TfidfVectorizer 的默认分词会把整句当成一个词)
  3. 模型选择
    • 小数据集:朴素贝叶斯、SVM
    • 大数据集:深度学习模型
  4. 评估指标:使用准确率、精确率、召回率、F1分数
  5. 优化策略
    • 调整文本预处理参数
    • 优化特征提取参数
    • 尝试不同模型
    • 使用交叉验证

这些案例覆盖了从基础到进阶的文本分类实现,你可以根据实际需求选择合适的方案。

抱歉,评论功能暂时关闭!