本文目录导读:

我将为您详细介绍Python实现语音识别的几种方法,并提供完整的可运行案例。
使用SpeechRecognition库(推荐新手)
安装依赖
pip install SpeechRecognition
pip install pyaudio  # 麦克风输入需要
基础语音识别示例
import speech_recognition as sr
def voice_recognition_from_microphone():
    """Capture one utterance from the microphone and transcribe it.

    Recognition uses the Google Web Speech API with ``language='zh-CN'``.

    Returns:
        The recognized text, or ``None`` when no speech started within the
        timeout, the audio could not be understood, or the request failed.
    """
    r = sr.Recognizer()

    with sr.Microphone() as source:
        # Calibrate first and prompt afterwards, so the speaker's voice is not
        # sampled as "ambient noise" during the one-second calibration window.
        r.adjust_for_ambient_noise(source, duration=1)
        print("请说话...")
        try:
            # listen() raises WaitTimeoutError when no speech starts in time.
            audio = r.listen(source, timeout=5)
        except sr.WaitTimeoutError:
            print("等待超时:未检测到语音")
            return None

    try:
        text = r.recognize_google(audio, language='zh-CN')
    except sr.UnknownValueError:
        print("无法识别语音")
    except sr.RequestError as e:
        print(f"请求错误:{e}")
    else:
        print(f"识别结果:{text}")
        return text
    return None
def voice_recognition_from_file(file_path, language='en-US'):
    """Transcribe an audio file with the Google Web Speech API.

    Args:
        file_path: Path of an audio file that ``sr.AudioFile`` can open.
        language: Language tag handed to the recognizer. Defaults to
            ``'en-US'`` (the previous hard-coded value); pass ``'zh-CN'``
            for Mandarin recordings.

    Returns:
        The recognized text, or ``None`` if recognition failed.
    """
    r = sr.Recognizer()

    with sr.AudioFile(file_path) as source:
        audio = r.record(source)

    try:
        text = r.recognize_google(audio, language=language)
    except sr.UnknownValueError:
        # This exception is typically raised without a message, so printing
        # it alone would show an empty, unhelpful "错误:" line.
        print("错误:无法识别语音")
    except Exception as e:
        # Best-effort helper: report any other failure instead of crashing.
        print(f"错误:{e}")
    else:
        print(f"文件识别结果:{text}")
        return text
    return None
# Demo entry point: run the microphone recognition example when executed as a script
if __name__ == "__main__":
    voice_recognition_from_microphone()
使用百度AI语音识别(需要API Key)
安装
pip install baidu-aip
代码实现
from aip import AipSpeech
import wave
import os
class BaiduVoiceRecognizer:
    """Speech-to-text helper built on the Baidu AI ``AipSpeech`` client."""

    def __init__(self, app_id='你的APP_ID', api_key='你的API_KEY',
                 secret_key='你的SECRET_KEY'):
        """Create the Baidu speech client.

        Args:
            app_id: Application ID from the Baidu AI open platform.
            api_key: API key of that application.
            secret_key: Secret key of that application.

        The defaults are placeholders and must be replaced with real
        credentials before any request can succeed.
        """
        self.client = AipSpeech(app_id, api_key, secret_key)

    def recognize_from_file(self, audio_path):
        """Send an audio file to the recognition API.

        Args:
            audio_path: Path of the audio file; it is submitted as 16 kHz
                'wav' data.

        Returns:
            The first recognition candidate on success, otherwise a string
            starting with "识别失败:" followed by the reported reason.
        """
        with open(audio_path, 'rb') as f:
            audio_data = f.read()

        # dev_pid 1537 selects the Mandarin model.
        result = self.client.asr(audio_data, 'wav', 16000, {'dev_pid': 1537})

        succeeded = result.get('err_no') == 0 or result.get('err_msg') == 'success.'
        if succeeded and result.get('result'):
            return result['result'][0]

        # Service errors carry 'err_msg'; SDK-level failures (e.g. timeouts)
        # appear to use 'error_msg' instead, so neither key is assumed present.
        reason = result.get('err_msg') or result.get('error_msg') or str(result)
        return f"识别失败:{reason}"

    def recognize_from_microphone(self, duration=5):
        """Record from the microphone and recognize the recording.

        Args:
            duration: Recording length in seconds.

        Returns:
            Whatever ``recognize_from_file`` returns for the recording.
        """
        import tempfile

        import pyaudio

        chunk = 1024
        sample_format = pyaudio.paInt16
        channels = 1
        rate = 16000

        audio = pyaudio.PyAudio()
        try:
            # Query the sample width while the PyAudio instance is still live.
            sample_width = audio.get_sample_size(sample_format)
            stream = audio.open(format=sample_format,
                                channels=channels,
                                rate=rate,
                                input=True,
                                frames_per_buffer=chunk)
            try:
                print(f"开始录音 {duration} 秒...")
                frames = [stream.read(chunk)
                          for _ in range(int(rate / chunk * duration))]
                print("录音结束")
            finally:
                # Release the input device even if a read fails midway.
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

        # A unique temp file avoids clobbering an existing "temp_audio.wav"
        # and collisions between concurrent runs.
        fd, temp_file = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        try:
            with wave.open(temp_file, 'wb') as wf:
                wf.setnchannels(channels)
                wf.setsampwidth(sample_width)
                wf.setframerate(rate)
                wf.writeframes(b''.join(frames))
            return self.recognize_from_file(temp_file)
        finally:
            # Always clean up, including when recognition raises.
            os.remove(temp_file)
# Usage example. Guarded so that importing this module does not start a
# microphone recording as a side effect.
if __name__ == "__main__":
    recognizer = BaiduVoiceRecognizer()
    result = recognizer.recognize_from_microphone()
    print(f"识别结果:{result}")
使用Whisper(OpenAI开源模型,离线可用)
安装
pip install openai-whisper
代码实现
import whisper
import numpy as np
class WhisperVoiceRecognizer:
    """Offline speech-to-text helper built on an OpenAI Whisper model."""

    def __init__(self, model_size="base"):
        """Load the Whisper model.

        Args:
            model_size: Model size name (tiny, base, small, medium, large).
        """
        print(f"加载{model_size}模型...")
        self.model = whisper.load_model(model_size)
        print("模型加载完成")

    def recognize_from_file(self, audio_path):
        """Transcribe an audio file.

        Args:
            audio_path: Path of the audio file handed to ``transcribe``.

        Returns:
            The ``"text"`` field of the transcription result.
        """
        result = self.model.transcribe(audio_path)
        return result["text"]

    def recognize_from_microphone(self, duration=5):
        """Record from the microphone and transcribe the recording in memory.

        Args:
            duration: Recording length in seconds.

        Returns:
            The ``"text"`` field of the transcription result.
        """
        # Only sounddevice is needed here; the previously imported soundfile
        # was unused and made this method fail when it was not installed.
        import sounddevice as sd

        sample_rate = 16000
        print(f"开始录音 {duration} 秒...")
        recording = sd.rec(int(duration * sample_rate),
                           samplerate=sample_rate,
                           channels=1,
                           dtype="float32")
        sd.wait()  # block until the recording buffer is filled
        print("录音结束")

        # The recording is (frames, 1); flatten to a 1-D float32 waveform
        # before passing it straight to the model.
        audio_array = recording.flatten().astype(np.float32, copy=False)
        result = self.model.transcribe(audio_array)
        return result["text"]
# Usage example. Guarded so that importing this module does not load a model
# or start recording as a side effect.
if __name__ == "__main__":
    recognizer = WhisperVoiceRecognizer(model_size="base")

    # Recognize from a file
    result = recognizer.recognize_from_file("audio.wav")
    print(f"文件识别结果:{result}")

    # Recognize from the microphone
    result = recognizer.recognize_from_microphone(5)
    print(f"麦克风识别结果:{result}")
完整的实时语音识别系统
import speech_recognition as sr
import threading
import queue
import time
class RealTimeVoiceRecognizer:
    """Continuous speech recognition using a capture thread and a recognition thread.

    Captured phrases flow through ``audio_queue`` to the recognition thread,
    which publishes ``(status, text)`` tuples on ``result_queue``.
    """

    def __init__(self, language='zh-CN'):
        """Set up the recognizer and the queues.

        Args:
            language: Language tag passed to the Google recognizer.
        """
        self.recognizer = sr.Recognizer()
        self.language = language
        self.is_running = False
        self.audio_queue = queue.Queue()
        self.result_queue = queue.Queue()

    def record_audio(self):
        """Capture thread body: push recorded phrases onto ``audio_queue``."""
        with sr.Microphone() as source:
            self.recognizer.adjust_for_ambient_noise(source, duration=0.5)
            print("录音线程已启动...")
            while self.is_running:
                try:
                    # The short timeout lets the loop re-check is_running.
                    audio = self.recognizer.listen(
                        source, timeout=1, phrase_time_limit=5)
                except sr.WaitTimeoutError:
                    continue
                self.audio_queue.put(audio)

    def recognize_audio(self):
        """Recognition thread body: turn queued audio into result tuples.

        Keeps draining ``audio_queue`` after ``is_running`` is cleared so
        already-captured phrases are still recognized.
        """
        print("识别线程已启动...")
        while self.is_running or not self.audio_queue.empty():
            try:
                audio = self.audio_queue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                text = self.recognizer.recognize_google(
                    audio, language=self.language)
            except sr.UnknownValueError:
                self.result_queue.put(("error", "无法识别"))
            except sr.RequestError as e:
                self.result_queue.put(("error", f"请求错误:{e}"))
            else:
                self.result_queue.put(("success", text))

    def start(self):
        """Start both worker threads and block while printing results.

        Returns when a recognized phrase contains "退出" or "结束", or when
        ``stop()`` is called. Does nothing if the system is already running.
        """
        if self.is_running:
            # Avoid spawning a second pair of workers on a repeated call.
            return
        self.is_running = True

        workers = [
            threading.Thread(target=self.record_audio, daemon=True),
            threading.Thread(target=self.recognize_audio, daemon=True),
        ]
        for worker in workers:
            worker.start()
        print("语音识别系统已启动,说'退出'来结束程序")

        try:
            while self.is_running:
                try:
                    status, text = self.result_queue.get(timeout=0.1)
                except queue.Empty:
                    continue
                if status != "success":
                    # Surface failures instead of silently discarding them.
                    print(f"识别出错:{text}")
                    continue
                print(f"识别到:{text}")
                # Spoken exit command
                if "退出" in text or "结束" in text:
                    self.stop()
                    break
        finally:
            # Also reached on KeyboardInterrupt: signal the workers and give
            # them a bounded chance to exit cleanly (releasing the
            # microphone); the daemon flag remains the fallback.
            self.is_running = False
            for worker in workers:
                worker.join(timeout=2)

    def stop(self):
        """Signal the worker threads and the main loop to stop."""
        self.is_running = False
        print("语音识别系统已停止")
# Usage example: start the real-time recognizer when executed as a script
if __name__ == "__main__":
    recognizer = RealTimeVoiceRecognizer(language='zh-CN')
    try:
        recognizer.start()
    except KeyboardInterrupt:
        # Ctrl+C: shut down through stop() rather than ending with a traceback
        recognizer.stop()
语音识别优化建议
提高识别准确率的技巧
import speech_recognition as sr
def enhanced_voice_recognition():
    """Record one utterance and transcribe it with several engines.

    Returns:
        A dict mapping engine name ('google', 'sphinx', 'wit') to the text it
        recognized. Engines that fail are omitted; the dict is empty when no
        speech started within the timeout.
    """
    r = sr.Recognizer()

    # Configure sensitivity BEFORE calibrating: adjust_for_ambient_noise()
    # updates energy_threshold, so assigning 300 afterwards would throw the
    # calibration away.
    r.energy_threshold = 300
    r.dynamic_energy_threshold = True
    # Seconds of silence that mark the end of a phrase.
    r.pause_threshold = 0.8

    with sr.Microphone() as source:
        print("准备中...")
        r.adjust_for_ambient_noise(source, duration=2)
        print("请说话...")
        try:
            audio = r.listen(source, timeout=5, phrase_time_limit=10)
        except sr.WaitTimeoutError:
            print("等待超时:未检测到语音")
            return {}

    engines = (
        # Google Web Speech
        ('google', lambda: r.recognize_google(audio, language='zh-CN')),
        # Sphinx (offline)
        ('sphinx', lambda: r.recognize_sphinx(audio, language='zh-CN')),
        # Wit.ai
        ('wit', lambda: r.recognize_wit(audio, key='YOUR_WIT_KEY')),
    )

    results = {}
    for name, recognize in engines:
        try:
            results[name] = recognize()
        except Exception as exc:
            # Best effort across engines: one failing engine must not block
            # the others. Unlike a bare except, this lets KeyboardInterrupt
            # and SystemExit propagate, and the failure is reported.
            print(f"{name} 识别失败:{exc!r}")
    return results
# Audio preprocessing
def preprocess_audio(audio_path, output_path="processed_audio.wav"):
    """Pre-emphasize and peak-normalize an audio file before recognition.

    Args:
        audio_path: Path of the input audio file.
        output_path: Where the processed audio is written. Defaults to
            "processed_audio.wav".

    Returns:
        ``output_path``.
    """
    import librosa
    import numpy as np
    import soundfile as sf

    # Load at 16 kHz. The rate is named sample_rate so it does not shadow the
    # module-level speech_recognition alias `sr`.
    samples, sample_rate = librosa.load(audio_path, sr=16000)

    # 1. Pre-emphasis: a filter applied to the waveform, not real noise
    #    reduction.
    emphasized = librosa.effects.preemphasis(samples)

    # 2. Peak normalization; skipped for empty or all-zero audio to avoid
    #    dividing by zero and writing NaNs.
    peak = np.max(np.abs(emphasized)) if emphasized.size else 0.0
    normalized = emphasized / peak if peak > 0 else emphasized

    # 3. Save the processed audio.
    sf.write(output_path, normalized, sample_rate)
    return output_path
选择哪种方法取决于您的需求:
- SpeechRecognition:最简单,适合快速原型开发
- 百度AI:中文识别效果好,但需要网络和API Key
- Whisper:离线可用,支持多语言,准确度高;CPU 也能运行,但较大的模型建议使用GPU
- 实时系统:适合需要持续语音识别的应用
建议初学者从SpeechRecognition开始,熟悉后再根据实际需求选择其他方案。