如何用实用脚本自动提取邮件附件?

wen 实用脚本 3

本文目录导读:

如何用实用脚本自动提取邮件附件?

  1. IMAP协议提取附件(Python + imaplib)
  2. POP3协议提取(适用于简单场景)
  3. 使用Outlook API(Exchange/Office 365)
  4. 使用Shell脚本(Linux/Mac)
  5. 安装和使用建议

我来为您介绍几种实用的邮件附件自动提取脚本方案,涵盖不同邮件协议和场景。

IMAP协议提取附件(Python + imaplib)

#!/usr/bin/env python3
"""
自动提取邮箱附件脚本 - IMAP协议
"""
import imaplib
import email
import os
from email.header import decode_header
import re
from datetime import datetime, timedelta
class EmailAttachmentExtractor:
    """Download attachments from recent INBOX messages over IMAP (SSL).

    Typical use: build an instance with the account credentials and call
    extract_attachments(); the connection is opened and closed inside that
    single call.
    """

    # Month abbreviations for IMAP SEARCH dates. They are spelled out here
    # because strftime('%b') follows the process locale and may yield
    # non-English month names, while the IMAP date syntax is fixed.
    _IMAP_MONTHS = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    def __init__(self, email_address, password, imap_server='imap.gmail.com'):
        """Store the credentials; no network access happens here."""
        self.email_address = email_address
        self.password = password
        self.imap_server = imap_server
        self.mail = None

    def connect(self):
        """Open the SSL connection and log in.

        Returns:
            True on success, False on failure (the error is printed).
        """
        try:
            self.mail = imaplib.IMAP4_SSL(self.imap_server)
            self.mail.login(self.email_address, self.password)
            return True
        except (imaplib.IMAP4.error, OSError) as e:
            print(f"连接失败: {e}")
            # A failed login still leaves an open socket behind; release it.
            self._disconnect()
            return False

    def _disconnect(self):
        """Log out and forget the connection, tolerating a broken session."""
        if self.mail is None:
            return
        try:
            self.mail.logout()
        except (imaplib.IMAP4.error, OSError):
            pass
        finally:
            self.mail = None

    def decode_mime_words(self, text):
        """Decode an RFC 2047 encoded header value (subject, filename) to str.

        Returns an empty string for a missing header (None). If the value
        cannot be parsed at all it is returned unchanged as str(text).
        """
        from email.errors import HeaderParseError

        if text is None:
            return ''
        try:
            decoded_parts = decode_header(text)
        except (HeaderParseError, TypeError, ValueError):
            return str(text)

        pieces = []
        for part, charset in decoded_parts:
            if not isinstance(part, bytes):
                pieces.append(part)
            elif charset:
                try:
                    pieces.append(part.decode(charset, errors='replace'))
                except LookupError:
                    # Unknown or bogus charset label: fall back to UTF-8
                    # instead of giving up on the whole header.
                    pieces.append(part.decode('utf-8', errors='ignore'))
            else:
                pieces.append(part.decode('utf-8', errors='ignore'))
        return ''.join(pieces)

    def sanitize_filename(self, filename):
        """Replace characters that are unsafe in file names with '_'.

        Path separators and control characters are replaced, so the result
        is always a single path component. Names that end up empty or
        consisting only of dots/spaces (e.g. '..') become 'attachment'.
        """
        cleaned = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '_', filename)
        if cleaned.strip(' .') == '':
            return 'attachment'
        return cleaned

    def extract_attachments(self, save_dir='downloads',
                          max_emails=50, days_ago=7,
                          allowed_extensions=None):
        """Save attachments of recent INBOX messages to a local directory.

        Args:
            save_dir: Directory to write attachments to (created if missing).
            max_emails: Maximum number of messages to process, newest first.
            days_ago: Only messages received within the last N days are used.
            allowed_extensions: Optional iterable of extensions to keep, e.g.
                ['pdf', 'xlsx']. Matching ignores case and a leading dot.
                A falsy value keeps every attachment.

        Returns:
            The number of attachments saved, or None when connecting,
            selecting the mailbox or searching failed.
        """
        if not self.connect():
            return None
        try:
            return self._extract_from_inbox(
                save_dir, max_emails, days_ago,
                self._normalize_extensions(allowed_extensions))
        finally:
            # Always release the session, including on early return or error.
            self._disconnect()

    @staticmethod
    def _normalize_extensions(allowed_extensions):
        """Return the allowed extensions as a lowercase set, or None for 'all'."""
        if not allowed_extensions:
            return None
        if isinstance(allowed_extensions, str):
            allowed_extensions = [allowed_extensions]
        return {ext.lower().lstrip('.') for ext in allowed_extensions}

    def _extract_from_inbox(self, save_dir, max_emails, days_ago, allowed):
        """Search INBOX and save matching attachments; needs an open session."""
        os.makedirs(save_dir, exist_ok=True)

        # NOTE(review): fetching RFC822 from a read-write mailbox normally
        # marks the messages as seen on the server - confirm this is wanted.
        status, _ = self.mail.select('INBOX')
        if status != 'OK':
            print("选择收件箱失败")
            return None

        since = datetime.now() - timedelta(days=days_ago)
        since_date = (f"{since.day:02d}-"
                      f"{self._IMAP_MONTHS[since.month - 1]}-"
                      f"{since.year:04d}")
        status, messages = self.mail.search(None, f'(SINCE {since_date})')
        if status != 'OK':
            print("搜索邮件失败")
            return None

        # The server lists ids oldest first; process the newest ones first.
        email_ids = messages[0].split() if messages and messages[0] else []
        email_ids.reverse()
        email_ids = email_ids[:max_emails]

        extracted_count = 0
        for email_id in email_ids:
            try:
                status, msg_data = self.mail.fetch(email_id, '(RFC822)')
                if (status != 'OK' or not msg_data
                        or not isinstance(msg_data[0], tuple)):
                    continue
                msg = email.message_from_bytes(msg_data[0][1])
                subject = self.decode_mime_words(msg['Subject'])
                print(f"处理邮件: {subject}")
                # walk() also yields the message itself, so a single-part
                # message that is just an attachment is handled as well.
                for part in msg.walk():
                    saved_path = self._save_part(part, save_dir, allowed)
                    if saved_path:
                        print(f"  ✓ 已保存: {os.path.basename(saved_path)}")
                        extracted_count += 1
            except Exception as e:
                # Best effort: one broken message must not stop the run.
                print(f"处理邮件 {email_id} 时出错: {e}")
                continue

        print(f"\n提取完成!共保存 {extracted_count} 个附件到 '{save_dir}' 目录")
        return extracted_count

    def _save_part(self, part, save_dir, allowed):
        """Write one MIME part to disk if it is a wanted attachment.

        Returns the path written, or None when the part was skipped.
        """
        if part.get_content_maintype() == 'multipart':
            return None
        if part.get('Content-Disposition') is None:
            return None
        filename = part.get_filename()
        if not filename:
            return None
        filename = self.sanitize_filename(self.decode_mime_words(filename))

        if allowed is not None:
            _, dot, extension = filename.rpartition('.')
            # A name without a dot has no extension and never matches.
            if not dot or extension.lower() not in allowed:
                return None

        payload = part.get_payload(decode=True)
        if payload is None:
            return None

        # Mode 'x' refuses to overwrite, so an existing file is never
        # clobbered; on a clash try name_1.ext, name_2.ext, ...
        stem, suffix = os.path.splitext(filename)
        candidate = filename
        counter = 1
        while True:
            filepath = os.path.join(save_dir, candidate)
            try:
                with open(filepath, 'xb') as f:
                    f.write(payload)
                return filepath
            except FileExistsError:
                candidate = f"{stem}_{counter}{suffix}"
                counter += 1
# 使用示例
if __name__ == "__main__":
    import os

    # Settings for the example run. The account details are read from the
    # EMAIL, PASSWORD and IMAP_SERVER environment variables when they are
    # set, so real credentials do not have to be written into this file;
    # the placeholders below are used only as a fallback.
    config = {
        'email': os.getenv('EMAIL', 'your_email@gmail.com'),
        # Account password or, preferably, an app-specific password
        'password': os.getenv('PASSWORD', 'your_password'),
        'imap_server': os.getenv('IMAP_SERVER', 'imap.gmail.com'),
        'save_dir': 'downloaded_attachments',
        'max_emails': 100,
        'days_ago': 30,
        # Only attachments with these extensions are saved
        'allowed_extensions': ['pdf', 'docx', 'xlsx', 'zip'],
    }

    extractor = EmailAttachmentExtractor(
        config['email'],
        config['password'],
        config['imap_server'],
    )
    extractor.extract_attachments(
        save_dir=config['save_dir'],
        max_emails=config['max_emails'],
        days_ago=config['days_ago'],
        allowed_extensions=config['allowed_extensions'],
    )

POP3协议提取(适用于简单场景)

#!/usr/bin/env python3
"""
POP3协议提取附件脚本
"""
import poplib
import email
import os
from email.header import decode_header
import re
def extract_attachments_pop3(email_address, password, pop3_server='pop.gmail.com',
                           port=995, save_dir='pop3_downloads', max_emails=20):
    """Save attachments from the newest messages of a POP3 (SSL) mailbox.

    Args:
        email_address: Login name for the POP3 server.
        password: Account password or app-specific password.
        pop3_server: Host name of the POP3-over-SSL server.
        port: TCP port of the server.
        save_dir: Directory to write attachments to (created if missing).
        max_emails: Maximum number of messages to inspect, newest first.

    Returns:
        None. Progress and errors are reported with print().
    """
    try:
        mail = poplib.POP3_SSL(pop3_server, port)
    except (poplib.error_proto, OSError) as e:
        print(f"连接失败: {e}")
        return

    try:
        try:
            mail.user(email_address)
            mail.pass_(password)
            num_messages = len(mail.list()[1])
        except (poplib.error_proto, OSError) as e:
            print(f"连接失败: {e}")
            return
        print(f"连接成功!邮箱有 {num_messages} 封邮件")

        os.makedirs(save_dir, exist_ok=True)
        # Message numbers run from 1 (oldest) to num_messages (newest).
        start = max(1, num_messages - max_emails + 1)
        extracted_count = 0
        for i in range(num_messages, start - 1, -1):
            try:
                # retr() returns (response, lines, octets); only the lines
                # are needed. The third item must not be bound to a name
                # like `bytes`, which would shadow the builtin type.
                _, lines, _ = mail.retr(i)
                msg = email.message_from_bytes(b'\n'.join(lines))
                # walk() also yields the message itself, so a single-part
                # message that is just an attachment is handled as well.
                for part in msg.walk():
                    saved_name = _save_pop3_attachment(part, save_dir)
                    if saved_name:
                        print(f"已保存: {saved_name}")
                        extracted_count += 1
            except Exception as e:
                # Best effort: one broken message must not stop the run.
                print(f"处理邮件 {i} 时出错: {e}")
                continue
        print(f"\n完成!共提取 {extracted_count} 个附件")
    finally:
        # Always end the session, including on early return or error.
        try:
            mail.quit()
        except (poplib.error_proto, OSError):
            pass


def _decode_pop3_filename(raw_name):
    """Decode an RFC 2047 encoded attachment name into text."""
    from email.errors import HeaderParseError

    try:
        chunks = decode_header(raw_name)
    except (HeaderParseError, TypeError, ValueError):
        return str(raw_name)
    decoded = []
    for value, charset in chunks:
        if isinstance(value, bytes):
            try:
                value = value.decode(charset or 'utf-8', errors='replace')
            except LookupError:
                # Unknown charset label: fall back to UTF-8.
                value = value.decode('utf-8', errors='ignore')
        decoded.append(value)
    return ''.join(decoded)


def _save_pop3_attachment(part, save_dir):
    """Write one MIME part to save_dir if it is a named attachment.

    Returns the file name used on disk, or None when the part was skipped.
    """
    if part.get_content_maintype() == 'multipart':
        return None
    if part.get('Content-Disposition') is None:
        return None
    filename = part.get_filename()
    if not filename:
        return None
    payload = part.get_payload(decode=True)
    if payload is None:
        return None

    filename = re.sub(r'[<>:"/\\|?*]', '_', _decode_pop3_filename(filename))
    if filename.strip(' .') == '':
        # Covers '', '.' and '..', which are not usable file names.
        filename = 'attachment'

    # Mode 'x' refuses to overwrite, so same-named attachments from
    # different messages are kept as name_1.ext, name_2.ext, ...
    stem, suffix = os.path.splitext(filename)
    candidate = filename
    counter = 1
    while True:
        try:
            with open(os.path.join(save_dir, candidate), 'xb') as f:
                f.write(payload)
            return candidate
        except FileExistsError:
            candidate = f"{stem}_{counter}{suffix}"
            counter += 1
# 使用示例
# extract_attachments_pop3('your_email@gmail.com', 'your_password')

使用Outlook API(Exchange/Office 365)

#!/usr/bin/env python3
"""
使用Microsoft Graph API提取附件(适用于Office 365/Outlook)
需要安装: pip install requests msal
"""
import requests
import msal
import os
from datetime import datetime, timedelta
class OutlookAttachmentExtractor:
    """Download mail attachments through the Microsoft Graph REST API.

    The token is requested with MSAL's acquire_token_for_client(), i.e. the
    application authenticates as itself with a client id and secret.
    """

    GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0"
    # Seconds to wait for each HTTP request before giving up; without a
    # timeout a stalled connection would block the script forever.
    REQUEST_TIMEOUT = 30
    # Number of messages requested per page.
    PAGE_SIZE = 50

    def __init__(self, client_id, client_secret, tenant_id):
        """Store the app registration details; no network access here."""
        self.client_id = client_id
        self.client_secret = client_secret
        self.tenant_id = tenant_id
        self.access_token = None

    def get_access_token(self):
        """Request an OAuth2 access token and cache it on the instance.

        Returns:
            True when a token was obtained, False otherwise (the error
            description from the token endpoint is printed).
        """
        authority = f"https://login.microsoftonline.com/{self.tenant_id}"
        app = msal.ConfidentialClientApplication(
            self.client_id,
            authority=authority,
            client_credential=self.client_secret
        )
        result = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
        if "access_token" in result:
            self.access_token = result['access_token']
            return True
        print(f"获取令牌失败: {result.get('error_description')}")
        return False

    def _ensure_token(self):
        """Return True when a token is available, acquiring one if needed."""
        return self.access_token is not None or self.get_access_token()

    def get_user_messages(self, user_email, days_back=7, max_messages=50):
        """List recent messages of a mailbox that have attachments.

        Args:
            user_email: Mailbox address (or user id) to read.
            days_back: Only messages received in the last N days are listed.
            max_messages: Upper bound on the number of messages returned.
                Further result pages are followed until it is reached;
                None means no limit.

        Returns:
            A list of message dicts with 'id', 'subject' and
            'receivedDateTime'; empty when the first request fails.
        """
        from datetime import timezone

        if not self._ensure_token():
            return []
        headers = {
            'Authorization': f'Bearer {self.access_token}',
            'Content-Type': 'application/json'
        }
        # Timezone-aware UTC timestamp, formatted with second precision.
        since = datetime.now(timezone.utc) - timedelta(days=days_back)
        since_date = since.strftime('%Y-%m-%dT%H:%M:%SZ')

        if max_messages is None:
            page_size = self.PAGE_SIZE
        else:
            page_size = max(1, min(self.PAGE_SIZE, max_messages))
        url = f"{self.GRAPH_BASE_URL}/users/{user_email}/messages"
        params = {
            '$filter': f"receivedDateTime ge {since_date} and hasAttachments eq true",
            '$select': 'id,subject,receivedDateTime',
            '$top': page_size
        }

        messages = []
        while url and (max_messages is None or len(messages) < max_messages):
            try:
                response = requests.get(url, headers=headers, params=params,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as e:
                print(f"获取邮件失败: {e}")
                break
            if response.status_code != 200:
                print(f"获取邮件失败: {response.text}")
                break
            payload = response.json()
            messages.extend(payload.get('value', []))
            # The next-page link already carries the query options, so the
            # original parameters must not be sent again.
            url = payload.get('@odata.nextLink')
            params = None
        return messages if max_messages is None else messages[:max_messages]

    def download_attachments(self, user_email, message_id, save_dir='outlook_downloads'):
        """Save every file attachment of one message into save_dir.

        Only entries of type '#microsoft.graph.fileAttachment' are written.
        Existing files are never overwritten; a numeric suffix is added to
        the name instead.

        Args:
            user_email: Mailbox address (or user id) that owns the message.
            message_id: Id of the message, as returned by get_user_messages().
            save_dir: Directory to write attachments to (created if missing).
        """
        import base64

        if not self._ensure_token():
            return
        headers = {
            'Authorization': f'Bearer {self.access_token}'
        }
        url = f"{self.GRAPH_BASE_URL}/users/{user_email}/messages/{message_id}/attachments"
        try:
            response = requests.get(url, headers=headers,
                                    timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            print(f"获取附件列表失败: {e}")
            return
        if response.status_code != 200:
            print(f"获取附件列表失败: {response.text}")
            return

        file_attachments = [
            item for item in response.json().get('value', [])
            if item.get('@odata.type') == '#microsoft.graph.fileAttachment'
        ]
        if not file_attachments:
            return
        os.makedirs(save_dir, exist_ok=True)

        for attachment in file_attachments:
            content = attachment.get('contentBytes')
            # The name comes from the sender, so keep only its last path
            # component to stop it from escaping save_dir.
            raw_name = str(attachment.get('name') or '')
            filename = os.path.basename(raw_name.replace('\\', '/'))
            if filename.strip(' .') == '':
                filename = 'attachment'
            if not content:
                print(f"跳过无内容的附件: {filename}")
                continue

            data = base64.b64decode(content)
            stem, suffix = os.path.splitext(filename)
            candidate = filename
            counter = 1
            while True:
                try:
                    # Mode 'x' refuses to overwrite an existing file.
                    with open(os.path.join(save_dir, candidate), 'xb') as f:
                        f.write(data)
                    break
                except FileExistsError:
                    candidate = f"{stem}_{counter}{suffix}"
                    counter += 1
            print(f"已保存: {candidate}")
# 使用示例(需要Azure AD注册应用)
# extractor = OutlookAttachmentExtractor('client_id', 'client_secret', 'tenant_id')
# extractor.get_access_token()
# messages = extractor.get_user_messages('user@company.com')
# for msg in messages:
#     extractor.download_attachments('user@company.com', msg['id'])

使用Shell脚本(Linux/Mac)

#!/bin/bash
# Extract attachments from locally stored mail files (for use together with
# getmail or fetchmail) and keep only the wanted file types.
# Requires: munpack; iconv is used for the optional renaming step.

# --- Configuration ---
MAIL_DIR="$HOME/Maildir"
ATTACH_DIR="$HOME/attachments"
LOG_FILE="$HOME/attachment_extract.log"
# Extensions to keep: an extended-regex alternation, matched case-insensitively
EXTENSIONS="pdf|docx|xlsx|zip|jpg|png"

# Fail early and visibly: the extraction step hides munpack's stderr, so a
# missing tool would otherwise go unnoticed.
if ! command -v munpack >/dev/null 2>&1; then
    echo "错误: 未找到 munpack,请先安装" | tee -a "$LOG_FILE" >&2
    exit 1
fi

mkdir -p "$ATTACH_DIR"

# --- Step 1: unpack attachments from every mail file ---
# The two -name tests are grouped so that "-type f" applies to both; without
# the parentheses, -o binds more loosely than the implicit AND and "*.mail"
# would match directories as well.
# -print0 with "read -d ''" keeps names containing spaces, backslashes or
# newlines intact.
find "$MAIL_DIR" -type f \( -name "*.eml" -o -name "*.mail" \) -print0 |
while IFS= read -r -d '' mailfile; do
    echo "处理: $mailfile" >> "$LOG_FILE"
    # Options are placed before the file name so they are parsed as options
    # even where option parsing stops at the first non-option argument.
    munpack -f -t -C "$ATTACH_DIR" "$mailfile" 2>/dev/null
    # Alternative extractor:
    # ripmime -i "$mailfile" -d "$ATTACH_DIR" 2>/dev/null
done

# --- Step 2: delete everything whose extension is not in EXTENSIONS ---
# WARNING: this removes ANY non-matching file under ATTACH_DIR, including
# files that were already there before this run.
find "$ATTACH_DIR" -type f -print0 |
while IFS= read -r -d '' file; do
    name="${file##*/}"
    ext="${name##*.}"
    # A name without a dot has no extension at all.
    [ "$ext" = "$name" ] && ext=""
    if ! printf '%s\n' "$ext" | grep -qiE "^($EXTENSIONS)$"; then
        rm -- "$file"
        echo "删除不需要的文件: $file" >> "$LOG_FILE"
    fi
done

# --- Step 3: transliterate file names to ASCII (avoids garbled names) ---
# Only the base name is converted; the directory part is left untouched so
# a non-ASCII path to ATTACH_DIR cannot break the move.
find "$ATTACH_DIR" -type f -print0 |
while IFS= read -r -d '' file; do
    dir="${file%/*}"
    base="${file##*/}"
    newbase=$(printf '%s' "$base" | iconv -f utf-8 -t ascii//TRANSLIT 2>/dev/null)
    # Skip the file when iconv failed or produced nothing.
    [ -n "$newbase" ] || continue
    # Make sure the new name stays a single path component.
    newbase="${newbase//\//_}"
    if [ "$base" != "$newbase" ]; then
        # -n: never overwrite another attachment that maps to the same name.
        mv -n -- "$file" "$dir/$newbase" 2>/dev/null
    fi
done

echo "完成!附件保存到: $ATTACH_DIR" | tee -a "$LOG_FILE"

安装和使用建议

Python脚本依赖安装

# Install the required libraries
pip install python-dotenv  # used to manage environment variables
# The IMAP/POP3 scripts use only the standard library; nothing extra to install
# For the Microsoft Graph API script
pip install requests msal

环境变量配置(推荐)

# Create a .env file with the following entries:
# EMAIL=your_email@gmail.com
# PASSWORD=your_app_password
# IMAP_SERVER=imap.gmail.com
# Then load it in the script:
from dotenv import load_dotenv
import os
load_dotenv()
EMAIL = os.getenv('EMAIL')
PASSWORD = os.getenv('PASSWORD')
# Also read the server entry documented above; fall back to Gmail's IMAP
# host when it is not set.
IMAP_SERVER = os.getenv('IMAP_SERVER', 'imap.gmail.com')

安全注意事项

  1. 使用应用专用密码(Gmail等支持)
  2. 使用环境变量或加密配置,不要硬编码密码
  3. OAuth2认证更安全(推荐用于企业邮箱)
  4. 设置合理的权限,只读取必要的邮件
  5. 处理敏感信息时注意合规性

自动化设置(crontab示例)

# Option A: run every day at 02:00
0 2 * * * /usr/bin/python3 /path/to/extract_attachments.py
# Option B: run once an hour, on the hour (use instead of option A, not in addition)
0 * * * * /usr/bin/python3 /path/to/extract_attachments.py

选择哪种方案取决于您的邮件服务提供商和使用场景:Gmail、QQ邮箱等个人邮箱推荐使用IMAP方案;企业邮箱(Exchange/Office 365)推荐使用API方案。

抱歉,评论功能暂时关闭!