Python案例如何实现语音识别？

wen python案例 2026-06-18 10

本文目录导读：

Python案例如何实现语音识别？

使用SpeechRecognition库（推荐新手）
使用百度AI语音识别（需要API Key）
使用Whisper（OpenAI开源模型，离线可用）
完整的实时语音识别系统
语音识别优化建议

我将为您详细介绍Python实现语音识别的几种方法，并提供完整的可运行案例。

使用SpeechRecognition库（推荐新手）

安装依赖

pip install SpeechRecognition
pip install pyaudio  # 麦克风输入需要

基础语音识别示例

import speech_recognition as sr
def voice_recognition_from_microphone():
    """Capture one utterance from the microphone and transcribe it.

    The recording is sent to the Google Web Speech API (network access is
    required) and recognized as Mandarin Chinese (``zh-CN``).

    Returns:
        The recognized text, or ``None`` when no speech started within the
        timeout, the speech was unintelligible, or the API request failed.
    """
    r = sr.Recognizer()
    with sr.Microphone() as source:
        print("请说话...")
        # Calibrate the energy threshold against one second of ambient noise.
        r.adjust_for_ambient_noise(source, duration=1)
        try:
            # ``timeout`` bounds how long to wait for speech to *start*;
            # listen() raises WaitTimeoutError when nothing is heard in time.
            audio = r.listen(source, timeout=5)
        except sr.WaitTimeoutError:
            print("等待超时，未检测到语音")
            return None
    try:
        text = r.recognize_google(audio, language='zh-CN')
    except sr.UnknownValueError:
        # The service could not make sense of the audio.
        print("无法识别语音")
    except sr.RequestError as e:
        # Network failure or the API rejected the request.
        print(f"请求错误：{e}")
    else:
        print(f"识别结果：{text}")
        return text
    return None
def voice_recognition_from_file(file_path, language='en-US'):
    """Transcribe a recorded audio file with the Google Web Speech API.

    Args:
        file_path: Path to an audio file that ``sr.AudioFile`` can open.
        language: Language tag handed to the recognizer. Defaults to
            ``'en-US'`` (the value that used to be hard-coded); pass
            ``'zh-CN'`` for Mandarin recordings.

    Returns:
        The recognized text, or ``None`` if recognition failed.
    """
    r = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        # record() without a duration reads the source through to its end.
        audio = r.record(source)
    try:
        text = r.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        # This exception carries no message of its own, so printing it
        # verbatim would show an empty error; report the cause explicitly.
        print("错误：无法识别语音")
    except Exception as e:
        # Best-effort demo helper: report any other failure (for example a
        # network/API error) instead of propagating it.
        print(f"错误：{e}")
    else:
        print(f"文件识别结果：{text}")
        return text
    return None
# Demo entry point: run the microphone example when executed as a script.
if __name__ == "__main__":
    voice_recognition_from_microphone()

使用百度AI语音识别（需要API Key）

安装

pip install baidu-aip

代码实现

from aip import AipSpeech
import wave
import os
class BaiduVoiceRecognizer:
    """Speech-to-text helper built on the Baidu AI ``AipSpeech`` client."""

    def __init__(self, app_id='你的APP_ID', api_key='你的API_KEY',
                 secret_key='你的SECRET_KEY'):
        """Create the Baidu speech client.

        Args:
            app_id: Application ID from the Baidu AI open platform.
            api_key: API key of that application.
            secret_key: Secret key of that application.

        The defaults are placeholders and must be replaced with real
        credentials before any request can succeed.
        """
        self.client = AipSpeech(app_id, api_key, secret_key)

    def recognize_from_file(self, audio_path):
        """Recognize the speech contained in an audio file.

        Args:
            audio_path: Path of the file to upload. It is sent as 16 kHz
                ``wav`` data.

        Returns:
            The first recognition candidate on success, otherwise a string
            starting with ``识别失败：`` that describes the failure.
        """
        with open(audio_path, 'rb') as f:
            audio_data = f.read()
        # dev_pid 1537 selects the Mandarin model.
        result = self.client.asr(audio_data, 'wav', 16000, {
            'dev_pid': 1537,
        })
        # Use .get() so an unexpected response shape yields a failure
        # message instead of a KeyError/IndexError.
        if result.get('err_msg') != 'success.':
            return f"识别失败：{result.get('err_msg', '未知错误')}"
        candidates = result.get('result') or []
        if not candidates:
            return "识别失败：未返回识别结果"
        return candidates[0]

    def recognize_from_microphone(self, duration=5):
        """Record from the microphone and recognize the recording.

        Args:
            duration: Recording length in seconds.

        Returns:
            Whatever :meth:`recognize_from_file` returns for the recording.
        """
        import tempfile

        import pyaudio

        # Recording parameters: 16-bit mono at 16 kHz, matching the rate
        # that recognize_from_file() declares to the API.
        CHUNK = 1024
        FORMAT = pyaudio.paInt16
        CHANNELS = 1
        RATE = 16000

        p = pyaudio.PyAudio()
        try:
            sample_width = p.get_sample_size(FORMAT)
            stream = p.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
            try:
                print(f"开始录音 {duration} 秒...")
                frames = [
                    stream.read(CHUNK)
                    for _ in range(int(RATE / CHUNK * duration))
                ]
                print("录音结束")
            finally:
                # Release the audio device even if reading fails.
                stream.stop_stream()
                stream.close()
        finally:
            p.terminate()

        # Use a unique temporary file rather than a fixed name in the
        # working directory, and always delete it, even when recognition
        # raises.
        fd, temp_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            with wave.open(temp_file, 'wb') as wf:
                wf.setnchannels(CHANNELS)
                wf.setsampwidth(sample_width)
                wf.setframerate(RATE)
                wf.writeframes(b''.join(frames))
            return self.recognize_from_file(temp_file)
        finally:
            os.remove(temp_file)
# Usage example: record from the microphone (default duration) and print
# the recognition result.
recognizer = BaiduVoiceRecognizer()
result = recognizer.recognize_from_microphone()
print(f"识别结果：{result}")

使用Whisper（OpenAI开源模型，离线可用）

安装

pip install openai-whisper
pip install sounddevice soundfile  # 麦克风录音示例需要
# 另需在系统中安装 ffmpeg（Whisper 读取音频文件时依赖）

代码实现

import whisper
import numpy as np
class WhisperVoiceRecognizer:
    """Speech-to-text wrapper around a locally loaded Whisper model."""

    def __init__(self, model_size="base"):
        """Load the Whisper model.

        Args:
            model_size: Model size name (tiny, base, small, medium, large).
        """
        print(f"加载{model_size}模型...")
        self.model = whisper.load_model(model_size)
        print("模型加载完成")

    def recognize_from_file(self, audio_path, language=None):
        """Transcribe an audio file.

        Args:
            audio_path: Path of the audio file to transcribe.
            language: Optional language hint (e.g. ``"zh"``). When ``None``
                the model is called exactly as before, without a hint.

        Returns:
            The transcribed text.
        """
        options = {} if language is None else {"language": language}
        result = self.model.transcribe(audio_path, **options)
        return result["text"]

    def recognize_from_microphone(self, duration=5, language=None):
        """Record from the microphone and transcribe the recording.

        Args:
            duration: Recording length in seconds.
            language: Optional language hint, see :meth:`recognize_from_file`.

        Returns:
            The transcribed text.
        """
        # Only sounddevice is needed here; the recording is handed to the
        # model in memory, so no audio-file library is imported.
        import sounddevice as sd

        sample_rate = 16000
        print(f"开始录音 {duration} 秒...")
        recording = sd.rec(int(duration * sample_rate),
                           samplerate=sample_rate,
                           channels=1)
        sd.wait()  # block until the recording has finished
        print("录音结束")
        # sd.rec returns a (frames, channels) array; flatten it to a 1-D
        # float32 waveform before passing it to the model.
        audio_array = recording.flatten().astype(np.float32)
        options = {} if language is None else {"language": language}
        result = self.model.transcribe(audio_array, **options)
        return result["text"]
# Usage example: construct the recognizer with the "base" model size.
recognizer = WhisperVoiceRecognizer(model_size="base")
# Transcribe an existing audio file.
result = recognizer.recognize_from_file("audio.wav")
print(f"文件识别结果：{result}")
# Record 5 seconds from the microphone and transcribe the recording.
result = recognizer.recognize_from_microphone(5)
print(f"麦克风识别结果：{result}")

完整的实时语音识别系统

import speech_recognition as sr
import threading
import queue
import time
class RealTimeVoiceRecognizer:
    """Continuous microphone transcription with a recorder/recognizer thread pair.

    One thread captures phrases into ``audio_queue``; a second thread sends
    them to the Google Web Speech API and puts ``(status, text)`` tuples on
    ``result_queue``; ``start()`` consumes those results on the caller's
    thread.
    """

    def __init__(self, language='zh-CN'):
        """Set up the recognizer and the queues.

        Args:
            language: Language tag passed to the recognition service.
        """
        self.recognizer = sr.Recognizer()
        self.language = language
        self.is_running = False
        self.audio_queue = queue.Queue()
        self.result_queue = queue.Queue()
        # Worker threads created by start(); kept so stop() can join them.
        self._threads = []

    def record_audio(self):
        """Recorder thread body: capture phrases until the system stops."""
        with sr.Microphone() as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
            print("录音线程已启动...")
            while self.is_running:
                try:
                    # The short timeout lets the loop re-check is_running
                    # about once a second while nobody is speaking.
                    audio = self.recognizer.listen(source, timeout=1, phrase_time_limit=5)
                except sr.WaitTimeoutError:
                    continue
                self.audio_queue.put(audio)

    def recognize_audio(self):
        """Recognizer thread body: transcribe queued audio.

        Keeps running after a stop request until ``audio_queue`` is drained.
        """
        print("识别线程已启动...")
        while self.is_running or not self.audio_queue.empty():
            try:
                audio = self.audio_queue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                text = self.recognizer.recognize_google(audio, language=self.language)
            except sr.UnknownValueError:
                self.result_queue.put(("error", "无法识别"))
            except sr.RequestError as e:
                self.result_queue.put(("error", f"请求错误：{e}"))
            else:
                self.result_queue.put(("success", text))

    def start(self):
        """Start both worker threads and block, printing results.

        Returns once a recognized phrase contains ``退出`` or ``结束``, or
        once ``stop()`` is called.
        """
        self.is_running = True
        self._threads = [
            threading.Thread(target=self.record_audio, daemon=True),
            threading.Thread(target=self.recognize_audio, daemon=True),
        ]
        for worker in self._threads:
            worker.start()
        print("语音识别系统已启动，说'退出'来结束程序")
        while self.is_running:
            try:
                status, text = self.result_queue.get(timeout=0.1)
            except queue.Empty:
                continue
            if status != "success":
                # Surface failures (e.g. no network) instead of dropping
                # them, otherwise the program just appears to hang.
                print(f"识别出错：{text}")
                continue
            print(f"识别到：{text}")
            if "退出" in text or "结束" in text:
                self.stop()
                break

    def stop(self):
        """Request shutdown and briefly wait for the worker threads."""
        self.is_running = False
        current = threading.current_thread()
        for worker in self._threads:
            # Bounded wait so the recorder can leave its ``with`` block and
            # release the microphone; never join the calling thread itself.
            if worker is not current and worker.is_alive():
                worker.join(timeout=2)
        self._threads = []
        print("语音识别系统已停止")
# Usage example: start() blocks in its result loop; Ctrl+C
# (KeyboardInterrupt) is turned into a stop() call.
if __name__ == "__main__":
    recognizer = RealTimeVoiceRecognizer(language='zh-CN')
    try:
        recognizer.start()
    except KeyboardInterrupt:
        recognizer.stop()

语音识别优化建议

提高识别准确率的技巧

import speech_recognition as sr
def enhanced_voice_recognition():
    """Record one utterance and transcribe it with several engines.

    Returns:
        A dict mapping engine name (``'google'``, ``'sphinx'``, ``'wit'``)
        to the text that engine recognized. Engines that fail are left out;
        the dict is empty when no speech started within the timeout.
    """
    r = sr.Recognizer()
    # Configure the recognizer BEFORE calibration: adjust_for_ambient_noise()
    # rewrites energy_threshold, so assigning the threshold afterwards would
    # throw the calibrated value away.
    r.energy_threshold = 300  # starting sensitivity (library default: 300)
    r.dynamic_energy_threshold = True
    r.pause_threshold = 0.8  # seconds of silence that end a phrase (default: 0.8)
    with sr.Microphone() as source:
        print("准备中...")
        r.adjust_for_ambient_noise(source, duration=2)
        print("请说话...")
        try:
            audio = r.listen(source, timeout=5, phrase_time_limit=10)
        except sr.WaitTimeoutError:
            # Nobody started speaking within ``timeout`` seconds.
            print("等待超时，未检测到语音")
            return {}

    # Try every engine independently; each one is optional.
    engines = {
        'google': lambda: r.recognize_google(audio, language='zh-CN'),
        # Sphinx runs offline.
        'sphinx': lambda: r.recognize_sphinx(audio, language='zh-CN'),
        'wit': lambda: r.recognize_wit(audio, key='YOUR_WIT_KEY'),
    }
    results = {}
    for name, recognize in engines.items():
        try:
            results[name] = recognize()
        except Exception as e:
            # Best-effort: one failing engine must not block the others.
            # Unlike a bare ``except``, this lets KeyboardInterrupt and
            # SystemExit through, and the failure is reported.
            print(f"{name} 识别失败：{e}")
    return results
# Audio pre-processing
def preprocess_audio(audio_path, output_path="processed_audio.wav"):
    """Resample, pre-emphasize and peak-normalize an audio file.

    Args:
        audio_path: Path of the audio file to process.
        output_path: Where to write the processed audio. Defaults to
            ``"processed_audio.wav"`` (the previously hard-coded name).

    Returns:
        ``output_path``.
    """
    import librosa
    import numpy as np
    import soundfile as sf

    # Load the audio resampled to 16 kHz. The rate is kept in
    # ``sample_rate`` rather than ``sr`` so it does not shadow the
    # speech_recognition alias.
    samples, sample_rate = librosa.load(audio_path, sr=16000)
    # 1. Pre-emphasis filter. NOTE: this boosts high frequencies; it is not
    #    a real noise-reduction step.
    emphasized = librosa.effects.preemphasis(samples)
    # 2. Peak normalization. Skip it for silent or empty input, where the
    #    peak is 0 and the division would produce NaN.
    peak = np.max(np.abs(emphasized)) if emphasized.size else 0.0
    normalized = emphasized / peak if peak > 0 else emphasized
    # 3. Save the processed audio.
    sf.write(output_path, normalized, sample_rate)
    return output_path

选择哪种方法取决于您的需求：

SpeechRecognition：最简单，适合快速原型开发
百度AI：中文识别效果好，但需要网络和API Key
Whisper：离线可用，支持多语言，准确率高；较大的模型建议使用GPU（CPU也可运行，但速度较慢）
实时系统：适合需要持续语音识别的应用

建议初学者从SpeechRecognition开始，熟悉后再根据实际需求选择其他方案。