本文目录导读:

我来详细介绍使用Python实现动作识别的几种常见方法,并提供完整案例。
基于MediaPipe的姿态估计+规则判断
这是最简单实用的方法,适合识别特定动作。
import cv2
import mediapipe as mp
import numpy as np
class SimpleActionRecognizer:
    """Rule-based action recognizer on top of MediaPipe Pose landmarks.

    Each frame is run through MediaPipe Pose; joint angles computed from the
    resulting 2D landmarks are compared against fixed thresholds to decide
    whether an arm raise or a squat is visible. Only the left side of the
    body is inspected.
    """

    # Elbow angle (degrees) above which the arm counts as straight.
    ARM_STRAIGHT_ANGLE = 160
    # Knee angle (degrees) below which the pose counts as a squat.
    SQUAT_KNEE_ANGLE = 90

    def __init__(self):
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )
        self.mp_draw = mp.solutions.drawing_utils

    def _point(self, landmarks, member):
        """Return ``[x, y]`` of the landmark addressed by a PoseLandmark member."""
        landmark = landmarks[member.value]
        return [landmark.x, landmark.y]

    def calculate_angle(self, a, b, c):
        """Return the angle at vertex ``b`` formed by points ``a``-``b``-``c``.

        Args:
            a, b, c: 2D points as ``[x, y]`` sequences.

        Returns:
            The unsigned angle in degrees, folded into the range [0, 180].
        """
        a = np.array(a)
        b = np.array(b)
        c = np.array(c)
        radians = (np.arctan2(c[1] - b[1], c[0] - b[0])
                   - np.arctan2(a[1] - b[1], a[0] - b[0]))
        angle = np.abs(radians * 180.0 / np.pi)
        # The difference of two arctan2 values spans up to 360 degrees; fold
        # it back so the result is the interior angle.
        if angle > 180.0:
            angle = 360 - angle
        return angle

    def detect_arm_raise(self, landmarks):
        """Return True when the left arm is straight AND raised overhead.

        A straight elbow alone is not enough: an arm hanging at the side is
        also straight. The wrist must additionally be above the shoulder.
        Image y coordinates grow downward, so "above" means a smaller y.

        Args:
            landmarks: indexable collection of pose landmarks exposing
                ``.x`` and ``.y``.
        """
        members = self.mp_pose.PoseLandmark
        shoulder = self._point(landmarks, members.LEFT_SHOULDER)
        elbow = self._point(landmarks, members.LEFT_ELBOW)
        wrist = self._point(landmarks, members.LEFT_WRIST)

        elbow_angle = self.calculate_angle(shoulder, elbow, wrist)
        is_straight = elbow_angle > self.ARM_STRAIGHT_ANGLE
        is_overhead = wrist[1] < shoulder[1]
        return bool(is_straight and is_overhead)

    def detect_squat(self, landmarks):
        """Return True when the left knee angle is below the squat threshold.

        Args:
            landmarks: indexable collection of pose landmarks exposing
                ``.x`` and ``.y``.
        """
        members = self.mp_pose.PoseLandmark
        hip = self._point(landmarks, members.LEFT_HIP)
        knee = self._point(landmarks, members.LEFT_KNEE)
        ankle = self._point(landmarks, members.LEFT_ANKLE)

        knee_angle = self.calculate_angle(hip, knee, ankle)
        return bool(knee_angle < self.SQUAT_KNEE_ANGLE)

    def process_frame(self, frame):
        """Detect actions in one BGR frame and draw the skeleton onto it.

        Args:
            frame: BGR image; it is annotated in place when a pose is found.

        Returns:
            ``(frame, actions)`` where ``actions`` is the list of labels
            detected in this frame (empty when no pose is found).
        """
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(rgb_frame)
        actions = []
        if results.pose_landmarks:
            self.mp_draw.draw_landmarks(
                frame,
                results.pose_landmarks,
                self.mp_pose.POSE_CONNECTIONS,
            )
            landmarks = results.pose_landmarks.landmark
            if self.detect_arm_raise(landmarks):
                actions.append("手臂上举")
            if self.detect_squat(landmarks):
                actions.append("深蹲")
        return frame, actions
# 使用示例
def main():
    """Run the webcam demo: show the annotated feed until 'q' is pressed.

    The camera and the preview window are released even if processing a
    frame raises.
    """
    recognizer = SimpleActionRecognizer()
    cap = cv2.VideoCapture(0)  # default camera
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Mirror the image so it behaves like a selfie view.
            frame = cv2.flip(frame, 1)
            processed_frame, actions = recognizer.process_frame(frame)
            # NOTE(review): OpenCV's Hershey fonts are expected to render
            # non-ASCII labels as '?'; confirm and switch to ASCII labels or
            # a TrueType renderer if readable text is needed.
            for i, action in enumerate(actions):
                cv2.putText(processed_frame, action, (10, 30 + i * 30),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow('Action Recognition', processed_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    main()
基于LSTM的时间序列分类
适用于识别连续动作序列:
import torch
import torch.nn as nn
import numpy as np
from collections import deque
class ActionLSTM(nn.Module):
    """LSTM classifier over sequences of pose keypoints.

    Args:
        input_size: features per time step (default 33 * 2).
        hidden_size: LSTM hidden state size.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output classes.
    """

    def __init__(self, input_size=33*2, hidden_size=128, num_layers=2, num_classes=5):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout only exists between stacked layers; passing
            # a non-zero value with a single layer just triggers a warning.
            dropout=0.2 if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_size, num_classes)
        # Not used by forward(); kept so existing attribute access still works.
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: tensor laid out as (batch, time, features), matching
                ``batch_first=True``.

        Returns:
            Tensor of shape (batch, num_classes) computed from the LSTM
            output at the last time step.
        """
        lstm_out, _ = self.lstm(x)
        last_output = lstm_out[:, -1, :]
        return self.fc(last_output)
class ActionSequenceRecognizer:
    """Sliding-window action classifier over per-frame pose keypoints.

    Keypoints from the most recent ``sequence_length`` frames are buffered
    and fed to an ``ActionLSTM``; the arg-max class is mapped to a label.
    """

    def __init__(self, sequence_length=30):
        self.sequence_length = sequence_length
        # Bounded buffer: appending beyond maxlen drops the oldest frame.
        self.sequence = deque(maxlen=sequence_length)
        # Class index -> label returned by predict_action.
        self.actions = {
            0: "站立",
            1: "走路",
            2: "跑步",
            3: "跳跃",
            4: "坐下",
        }
        # NOTE: no weights are loaded here, so predictions are meaningless
        # until train_model() has been run or a checkpoint is loaded.
        self.model = ActionLSTM()
        # Modules start in training mode; switch to eval so dropout is off
        # for inference.
        self.model.eval()

    def extract_keypoints(self, landmarks):
        """Flatten pose landmarks into a 1-D ``[x0, y0, x1, y1, ...]`` array.

        Args:
            landmarks: object with a ``.landmark`` iterable whose items expose
                ``.x`` and ``.y``, or a falsy value when no pose was found.

        Returns:
            A NumPy array of the flattened coordinates, or 66 zeros when
            ``landmarks`` is falsy.
        """
        if not landmarks:
            return np.zeros(33 * 2)
        keypoints = []
        for landmark in landmarks.landmark:
            keypoints.extend((landmark.x, landmark.y))
        return np.array(keypoints)

    def predict_action(self, landmarks):
        """Buffer the current frame and classify the window once it is full.

        Args:
            landmarks: same input as ``extract_keypoints``.

        Returns:
            The predicted label, or ``"检测中..."`` while fewer than
            ``sequence_length`` frames have been buffered.
        """
        self.sequence.append(self.extract_keypoints(landmarks))
        if len(self.sequence) != self.sequence_length:
            return "检测中..."

        window = np.array(self.sequence)
        input_tensor = torch.as_tensor(window, dtype=torch.float32).unsqueeze(0)
        # Guard against the model having been left in training mode, which
        # would keep dropout active and make predictions nondeterministic.
        self.model.eval()
        with torch.no_grad():
            output = self.model(input_tensor)
        predicted = torch.argmax(output, dim=1).item()
        return self.actions[predicted]

    def train_model(self, X_train, y_train, epochs=50):
        """Train the model with full-batch Adam and cross-entropy loss.

        Args:
            X_train: array-like of input sequences, converted to a float tensor.
            y_train: array-like of integer class labels.
            epochs: number of full-batch optimisation steps.

        The loss is printed every 10 epochs. The model is switched back to
        eval mode afterwards so it is ready for ``predict_action``.
        """
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=0.001)
        X_train = torch.FloatTensor(X_train)
        y_train = torch.LongTensor(y_train)

        self.model.train()
        for epoch in range(epochs):
            optimizer.zero_grad()
            outputs = self.model(X_train)
            loss = criterion(outputs, y_train)
            loss.backward()
            optimizer.step()
            if (epoch + 1) % 10 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')
        self.model.eval()
OpenPose + 动作模板匹配(下方示例只包含模板匹配部分,关键点编号沿用MediaPipe Pose的33点定义;若改用OpenPose提取骨架,需要相应调整编号)
基于人体骨架的模板匹配方法:
import cv2
import numpy as np
from scipy.spatial.distance import cosine
class SkeletonActionMatcher:
    """Match joint-angle feature vectors against stored action templates.

    NOTE(review): the built-in 'wave' template holds 5-element angle
    sequences while ``extract_skeleton_features`` yields at most 4 angles,
    so the two cannot be compared directly as written; callers presumably
    build their own time series of a single joint angle -- confirm.
    """

    # (first, vertex, last) landmark indices of each measured joint.
    JOINT_TRIPLES = (
        (11, 13, 15),  # left shoulder - left elbow - left wrist
        (12, 14, 16),  # right shoulder - right elbow - right wrist
        (23, 25, 27),  # left hip - left knee - left ankle
        (24, 26, 28),  # right hip - right knee - right ankle
    )

    def __init__(self):
        # Template name -> {feature name -> sequence of angles in degrees}.
        self.templates = {
            'wave': {
                'left_arm_angle': [30, 60, 90, 120, 150],
                'right_arm_angle': [150, 120, 90, 60, 30],
            }
        }

    def extract_skeleton_features(self, landmarks):
        """Return the joint angles that can be computed from ``landmarks``.

        Args:
            landmarks: mapping or sequence from landmark index to a
                coordinate vector (list, tuple or NumPy array).

        Returns:
            1-D NumPy array with one angle (degrees) per joint whose three
            landmarks are all present; joints with a missing landmark are
            skipped.
        """
        features = []
        for triple in self.JOINT_TRIPLES:
            try:
                a, b, c = (landmarks[i] for i in triple)
            except (KeyError, IndexError):
                # Works for dicts (missing key) and sequences (short list).
                continue
            features.append(self.calculate_angle(a, b, c))
        return np.array(features)

    def calculate_angle(self, a, b, c):
        """Return the angle in degrees at vertex ``b`` between ``a`` and ``c``.

        Args:
            a, b, c: coordinate vectors of equal length.

        Returns:
            Angle in [0, 180]. When ``a`` or ``c`` coincides with ``b`` the
            angle is undefined and 0.0 is returned instead of NaN.
        """
        a, b, c = (np.asarray(p, dtype=float) for p in (a, b, c))
        ba = a - b
        bc = c - b
        denominator = np.linalg.norm(ba) * np.linalg.norm(bc)
        if denominator == 0:
            return 0.0
        # Rounding can push the ratio just outside [-1, 1], which would make
        # arccos return NaN.
        cosine_angle = np.clip(np.dot(ba, bc) / denominator, -1.0, 1.0)
        return float(np.degrees(np.arccos(cosine_angle)))

    def match_action(self, current_features, template_name='wave'):
        """Return the best cosine similarity against one template.

        Args:
            current_features: 1-D sequence of feature values.
            template_name: key into ``self.templates``.

        Returns:
            The highest ``1 - cosine_distance`` over the template's feature
            sequences of matching length, never below 0.0. Returns 0.0 for an
            unknown template, when no sequence has the same length, or when
            ``current_features`` is all zeros.
        """
        template = self.templates.get(template_name)
        if template is None:
            return 0.0
        # Cosine distance is undefined for a zero vector.
        if not np.any(current_features):
            return 0.0

        best_similarity = 0.0
        for template_feature in template.values():
            if len(current_features) != len(template_feature):
                continue
            similarity = 1 - cosine(current_features, template_feature)
            if np.isnan(similarity):
                continue
            best_similarity = max(best_similarity, similarity)
        return best_similarity
完整的实时动作识别系统
整合以上思路(本示例以基于规则的姿态判断为主,并加入动作计数与防抖):
import cv2
import mediapipe as mp
import numpy as np
from collections import deque
import threading
class RealTimeActionRecognition:
    """Real-time rule-based action detector with debounced counters.

    Each frame is run through MediaPipe Pose; arm raise, wave and squat are
    scored from joint angles, and every action scoring above the confidence
    threshold increments its counter. After counting, an action is locked
    for ``lock_duration`` frames so one movement is not counted repeatedly.
    """

    # Scores above this value count as a detection.
    CONFIDENCE_THRESHOLD = 0.7
    # Frame size (height, width) assumed when the caller does not supply one.
    DEFAULT_FRAME_SHAPE = (480, 640)

    def __init__(self):
        self.mp_pose = mp.solutions.pose
        self.pose = self.mp_pose.Pose(
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )
        self.mp_draw = mp.solutions.drawing_utils
        # Not read anywhere in this class; kept for external users.
        self.action_buffer = deque(maxlen=30)
        self.current_action = None
        # Action name -> number of times it has been counted.
        self.action_counts = {
            'arm_raise': 0,
            'squat': 0,
            'wave': 0,
        }
        # Actions currently blocked from being counted again.
        self.locked_actions = set()
        # How many frames an action stays locked after being counted.
        self.lock_duration = 20
        # Action name -> frames left until its lock expires.
        self._lock_remaining = {}

    def calculate_angle(self, a, b, c):
        """Return the angle at vertex ``b`` formed by points ``a``-``b``-``c``.

        Args:
            a, b, c: 2D points as ``[x, y]`` sequences.

        Returns:
            The unsigned angle in degrees, folded into the range [0, 180].
        """
        a = np.array(a)
        b = np.array(b)
        c = np.array(c)
        radians = (np.arctan2(c[1] - b[1], c[0] - b[0])
                   - np.arctan2(a[1] - b[1], a[0] - b[0]))
        angle = np.abs(radians * 180.0 / np.pi)
        if angle > 180.0:
            angle = 360 - angle
        return angle

    @staticmethod
    def _to_pixels(landmark, width, height):
        """Scale a normalized landmark to ``[x, y]`` pixel coordinates."""
        return [landmark.x * width, landmark.y * height]

    def detect_actions(self, landmarks, frame_shape=None):
        """Score each known action for one set of pose landmarks.

        Args:
            landmarks: indexable collection of landmarks exposing normalized
                ``.x`` and ``.y``.
            frame_shape: optional ``(height, width, ...)`` of the frame, used
                to undo the normalization so angles are not distorted by the
                aspect ratio. Defaults to 480x640.

        Returns:
            Dict mapping action name to a confidence score. Actions whose
            landmarks could not be read are missing from the dict.
        """
        h, w = self.DEFAULT_FRAME_SHAPE if frame_shape is None else frame_shape[:2]
        actions = {}
        try:
            left_shoulder, left_elbow, left_wrist = (
                self._to_pixels(landmarks[i], w, h) for i in (11, 13, 15)
            )
            right_shoulder, right_elbow, right_wrist = (
                self._to_pixels(landmarks[i], w, h) for i in (12, 14, 16)
            )
            left_arm_angle = self.calculate_angle(
                left_shoulder, left_elbow, left_wrist
            )
            right_arm_angle = self.calculate_angle(
                right_shoulder, right_elbow, right_wrist
            )

            # A straight elbow alone also matches an arm hanging at the side,
            # so the wrist must additionally be above the shoulder (image y
            # grows downward, hence the smaller-than comparison).
            left_raised = left_arm_angle > 160 and left_wrist[1] < left_shoulder[1]
            right_raised = right_arm_angle > 160 and right_wrist[1] < right_shoulder[1]
            actions['arm_raise'] = 1.0 if (left_raised or right_raised) else 0.0

            # Wave heuristic: left elbow partially bent.
            actions['wave'] = 0.8 if 90 < left_arm_angle < 160 else 0.0

            # Squat: left knee bent below 90 degrees.
            left_hip, left_knee, left_ankle = (
                self._to_pixels(landmarks[i], w, h) for i in (23, 25, 27)
            )
            knee_angle = self.calculate_angle(left_hip, left_knee, left_ankle)
            actions['squat'] = 1.0 if knee_angle < 90 else 0.0
        except (IndexError, KeyError, AttributeError, TypeError) as e:
            # Best effort: keep whatever was scored before the failure.
            print(f"Error detecting actions: {e}")
        return actions

    def _tick_locks(self):
        """Advance every active lock by one frame and release expired ones."""
        for action in list(self.locked_actions):
            remaining = self._lock_remaining.get(action, 0)
            if remaining <= 0:
                self.unlock_action(action)
            else:
                self._lock_remaining[action] = remaining - 1

    def update_action_counts(self, actions):
        """Count confidently detected actions, debounced per frame.

        Call once per frame. An action is counted when its score exceeds the
        threshold and it is not locked; it is then locked for the following
        ``lock_duration`` calls.

        Args:
            actions: dict mapping action name to confidence score.
        """
        self._tick_locks()
        for action, confidence in actions.items():
            if confidence <= self.CONFIDENCE_THRESHOLD:
                continue
            if action in self.locked_actions:
                continue
            self.action_counts[action] = self.action_counts.get(action, 0) + 1
            self.locked_actions.add(action)
            self._lock_remaining[action] = self.lock_duration

    def unlock_action(self, action):
        """Release the lock on ``action`` so it can be counted again."""
        self.locked_actions.discard(action)
        self._lock_remaining.pop(action, None)

    def process_frame(self, frame):
        """Detect actions in one BGR frame and draw the overlay onto it.

        Args:
            frame: BGR image; annotated in place.

        Returns:
            The annotated frame.
        """
        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = self.pose.process(rgb_frame)

        # Stays empty when no pose is found, so the code below never sees an
        # unbound name.
        actions = {}
        if results.pose_landmarks:
            self.mp_draw.draw_landmarks(
                frame,
                results.pose_landmarks,
                self.mp_pose.POSE_CONNECTIONS,
                self.mp_draw.DrawingSpec(color=(0, 0, 255), thickness=2),
                self.mp_draw.DrawingSpec(color=(0, 255, 0), thickness=2),
            )
            actions = self.detect_actions(
                results.pose_landmarks.landmark, frame.shape
            )
        # Called on every frame so lock countdowns advance even without a pose.
        self.update_action_counts(actions)

        y_offset = 30
        for action, count in self.action_counts.items():
            text = f"{action}: {count}"
            cv2.putText(frame, text, (10, y_offset),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 255, 0), 2)
            y_offset += 30

        detected_actions = [
            name for name, score in actions.items()
            if score > self.CONFIDENCE_THRESHOLD
        ]
        if detected_actions:
            cv2.putText(frame, f"Action: {detected_actions[0]}",
                        (10, frame.shape[0] - 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)
        return frame
def main():
    """Run the real-time demo on the default camera until 'q' is pressed.

    The camera and the preview window are released even if processing a
    frame raises.
    """
    recognizer = RealTimeActionRecognition()
    cap = cv2.VideoCapture(0)
    # Request 640x480 capture; the driver may ignore the request.
    cap.set(cv2.CAP_PROP_FRAME_WIDTH, 640)
    cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 480)
    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break
            # Mirror the image so it behaves like a selfie view.
            frame = cv2.flip(frame, 1)
            processed_frame = recognizer.process_frame(frame)
            cv2.imshow('Real-time Action Recognition', processed_frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


if __name__ == "__main__":
    # Required packages:
    # pip install opencv-python mediapipe numpy torch scipy
    main()
关键实现要点
环境配置
安装必要库(一条命令即可):
pip install opencv-python mediapipe numpy torch scipy
性能优化建议
- 降低分辨率:处理640x480即可
- 跳帧处理:每2-3帧处理一次
- 使用GPU:如果使用深度学习模型
- 优化检测频率:不需要每帧都检测
动作识别准确率提升方法
- 数据增强:增加旋转、缩放等
- 多帧投票:对连续多帧的识别结果进行投票,或对置信度取平均后再判定
- 置信度阈值:设置合理的阈值
- 防抖处理:减少误检测
这个实现包含了从简单规则到深度学习的多种方法,你可以根据实际需求选择合适的方案。