Python案例做爬虫?

wen 网络安全 51

本文目录导读:

Python案例做爬虫?

  1. 基础案例:爬取网页标题
  2. 中级案例:抓取新闻列表
  3. 高级案例:带代理和反反爬的爬虫
  4. 实战案例:爬取电商商品信息
  5. 异步爬虫案例
  6. 安装所需库
  7. 注意事项

我来为您介绍几个Python爬虫的实用案例,从简单到复杂。

基础案例:爬取网页标题

import requests
from bs4 import BeautifulSoup
# Fetch a single web page and print the text of its <title> element.
url = "https://www.example.com"
# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
try:
    response = requests.get(url, headers=headers, timeout=10)
    # Force UTF-8 decoding of the body before reading response.text.
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # A page without a <title> makes soup.title None; the resulting
    # AttributeError is reported by the handler below.
    title = soup.title.string
    print(f"网页标题: {title}")
except Exception as e:
    # Top-level boundary of the script: report any failure and stop.
    print(f"爬取失败: {e}")

中级案例:抓取新闻列表

import requests
from bs4 import BeautifulSoup
import csv
import time
def crawl_news():
    """Scrape headline titles and links from the NetEase news front page.

    Fetches ``https://news.163.com/``, collects the text and ``href`` of
    every anchor matching ``.data_row .news_title a``, writes the rows to
    ``news.csv`` (UTF-8 with BOM, preceded by a header row) and prints how
    many rows were collected.

    Returns:
        A list of ``[title, link]`` pairs. An empty list is returned, after
        printing the error, when the HTTP request fails, the server answers
        with an error status, or the CSV file cannot be written.
    """
    url = "https://news.163.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        # Without a timeout requests may wait indefinitely on a stalled server.
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures instead of parsing an error page.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = []
        # Adjust this selector to the page's actual structure.
        for item in soup.select('.data_row .news_title a'):
            title = item.get_text().strip()
            link = item.get('href')
            # Skip anchors that lack either visible text or a target.
            if title and link:
                news_list.append([title, link])
        # utf-8-sig writes a BOM at the start of the file.
        with open('news.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['标题', '链接'])
            writer.writerows(news_list)
        print(f"成功爬取 {len(news_list)} 条新闻")
        return news_list
    except (requests.RequestException, OSError) as e:
        print(f"爬取失败: {e}")
        return []
# Run the crawler; the returned list of [title, link] rows (empty on
# failure) is kept in news_data.
news_data = crawl_news()

高级案例:带代理和反反爬的爬虫

import requests
from bs4 import BeautifulSoup
import random
import time
from fake_useragent import UserAgent
class AdvancedSpider:
    """Crawler with randomized request headers, optional proxies and retries.

    Instance attributes (set in ``__init__``):
        ua: ``UserAgent`` object whose ``random`` attribute supplies the
            User-Agent header.
        proxies: candidate proxy mappings in the format ``requests`` accepts.
        use_proxy: whether ``retry_request`` sends requests through a
            randomly chosen entry of ``proxies``.
    """

    def __init__(self, use_proxy=False):
        """Set up the User-Agent source and the proxy candidates.

        Args:
            use_proxy: When true, every request is routed through a random
                entry of ``self.proxies``. Defaults to ``False`` (direct
                connection).
        """
        self.ua = UserAgent()
        # NOTE(review): these look like placeholder hosts; replace them with
        # real proxies before enabling use_proxy. Only the 'http' scheme is
        # mapped, so https:// URLs would not use these proxies.
        self.proxies = [
            {'http': 'http://proxy1.com:8080'},
            {'http': 'http://proxy2.com:8080'},
        ]
        self.use_proxy = use_proxy

    def get_random_headers(self):
        """Return request headers with a randomly chosen User-Agent."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

    def retry_request(self, url, max_retries=3):
        """GET ``url``, retrying up to ``max_retries`` times.

        Fresh random headers are generated for every attempt. Between two
        attempts the method sleeps a random 1-3 seconds, whether the
        previous attempt raised a request error or returned a non-200
        status; no sleep follows the final attempt.

        Args:
            url: Address to fetch.
            max_retries: Maximum number of attempts.

        Returns:
            The response whose status code is 200, or ``None`` when every
            attempt failed.
        """
        for i in range(max_retries):
            proxy = random.choice(self.proxies) if self.use_proxy else None
            try:
                response = requests.get(
                    url,
                    headers=self.get_random_headers(),
                    proxies=proxy,
                    timeout=10,
                )
            except requests.RequestException as e:
                print(f"第{i+1}次尝试失败: {e}")
            else:
                if response.status_code == 200:
                    return response
                print(f"请求失败,状态码: {response.status_code}")
            # Back off before the next attempt, but not after the last one.
            if i < max_retries - 1:
                time.sleep(random.uniform(1, 3))
        return None

    def parse_content(self, url):
        """Fetch ``url`` and return the page parsed with ``html.parser``.

        Returns:
            A ``BeautifulSoup`` object, or ``None`` when the page could not
            be fetched.
        """
        response = self.retry_request(url)
        if response is None:
            return None
        # Page-specific extraction logic would be added here.
        return BeautifulSoup(response.text, 'html.parser')
# Usage example: create a spider and fetch one page; content holds the
# parsed page, or None if the page could not be fetched.
spider = AdvancedSpider()
content = spider.parse_content("https://example.com")

实战案例:爬取电商商品信息

import requests
from bs4 import BeautifulSoup
import json
import time
import random
def crawl_product_info(keyword, pages=5):
    """Scrape product listings from JD.com search results.

    Requests result pages 1..``pages`` of ``https://search.jd.com/Search``
    for ``keyword`` and extracts title, price, shop name and link from every
    ``.gl-item`` element. It sleeps a random 1-3 seconds after each page
    that was fetched successfully, then writes all collected products to
    ``{keyword}_products.json`` in the current working directory.

    Args:
        keyword: Search term; also used as the prefix of the output file name.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``shop`` and
        ``link``. Pages that fail to download are reported and skipped.
    """
    # Function-scope import so the block does not depend on file-level imports.
    from urllib.parse import urljoin

    base_url = "https://search.jd.com/Search"
    products = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.jd.com/',
    }
    for page in range(1, pages + 1):
        params = {
            'keyword': keyword,
            'page': page,
            'enc': 'utf-8',
        }
        try:
            response = requests.get(
                base_url, params=params, headers=headers, timeout=10
            )
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"爬取第{page}页失败: {e}")
            continue
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        # Each matching element is one product entry in the result list.
        items = soup.select('.gl-item')
        for item in items:
            try:
                href = item.select('.p-name a')[0].get('href', '')
                product = {
                    'title': item.select('.p-name em')[0].text.strip(),
                    'price': item.select('.p-price i')[0].text.strip(),
                    'shop': item.select('.p-shop a')[0].text.strip(),
                    # urljoin resolves protocol-relative, relative and
                    # absolute hrefs alike.
                    'link': urljoin(base_url, href) if href else '',
                }
            except IndexError:
                # An expected sub-element is missing; skip this entry.
                continue
            products.append(product)
        print(f"已爬取第{page}页,共{len(items)}个商品")
        time.sleep(random.uniform(1, 3))  # random delay between pages
    # Save the results; ensure_ascii=False keeps non-ASCII text readable.
    with open(f'{keyword}_products.json', 'w', encoding='utf-8') as f:
        json.dump(products, f, ensure_ascii=False, indent=2)
    print(f"共爬取 {len(products)} 个商品")
    return products
# Run: crawl the first 3 result pages for the keyword and keep the
# returned list of product dicts in products.
products = crawl_product_info("Python书籍", pages=3)

异步爬虫案例

import aiohttp
import asyncio
from bs4 import BeautifulSoup
import time
async def fetch_url(session, url):
    """Download ``url`` through ``session`` and return the body as text.

    Args:
        session: An open ``aiohttp.ClientSession``.
        url: Address to fetch.

    Returns:
        The decoded response body, or ``None`` (after printing the error)
        when the request fails, times out, or the server answers with an
        HTTP error status.
    """
    # aiohttp expects a ClientTimeout object; a bare number is deprecated.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with session.get(url, timeout=timeout) as response:
            # Treat 4xx/5xx answers as failures instead of returning an
            # error page as content.
            response.raise_for_status()
            return await response.text()
    except Exception as e:
        # Deliberately broad: this is the per-URL best-effort boundary, so a
        # single bad URL never aborts the other concurrent downloads.
        print(f"请求失败: {url}, 错误: {e}")
        return None
async def parse_page(session, url):
    """Fetch ``url`` and extract its page title.

    Args:
        session: Session object passed through to ``fetch_url``.
        url: Address to fetch and parse.

    Returns:
        ``{'url': url, 'title': title}``, where ``title`` is ``"No title"``
        when the page has no usable ``<title>`` text; ``None`` when no HTML
        was retrieved.
    """
    html = await fetch_url(session, url)
    if not html:
        return None
    soup = BeautifulSoup(html, 'html.parser')
    title_tag = soup.title
    # .string is None for an empty <title> or one containing nested tags, so
    # fall back in that case as well as when the tag is missing.
    if title_tag is not None and title_tag.string:
        title = title_tag.string
    else:
        title = "No title"
    return {'url': url, 'title': title}
async def crawl_multiple_pages(urls):
    """Fetch and parse every address in ``urls`` concurrently.

    A single aiohttp session is shared by all requests. Falsy results
    (pages for which ``parse_page`` returned ``None``) are dropped.

    Returns:
        A list of the remaining ``parse_page`` results.
    """
    async with aiohttp.ClientSession() as http:
        pending = []
        for link in urls:
            pending.append(parse_page(http, link))
        pages = await asyncio.gather(*pending)
    # filter(None, ...) keeps only the truthy results.
    return list(filter(None, pages))
# Usage example
async def main():
    """Crawl three sample sites concurrently and print titles and timing."""
    targets = [
        'https://www.python.org',
        'https://www.github.com',
        'https://stackoverflow.com',
    ]
    began = time.time()
    results = await crawl_multiple_pages(targets)
    elapsed = time.time() - began
    print(f"爬取完成,用时: {elapsed:.2f}秒")
    # One output line per page that was fetched and parsed.
    for result in results:
        print(f"URL: {result['url']}, 标题: {result['title']}")
# Run the asynchronous crawler: asyncio.run executes the main() coroutine.
asyncio.run(main())

安装所需库

pip install requests beautifulsoup4 lxml fake-useragent aiohttp

注意事项

  1. 遵守Robots协议:查看网站的robots.txt文件
  2. 控制请求频率:添加适当的延时,避免对服务器造成压力
  3. 动态IP处理:对于反爬严格的网站,可能需要使用代理池
  4. 数据存储:考虑使用数据库存储大量数据
  5. 异常处理:为网络请求和数据解析建立完善的错误处理机制

这些案例涵盖了从基础到高级的爬虫技术,您可以根据实际需求选择适合的方案。请记得在爬取数据时遵守相关法律法规和网站的使用条款。

抱歉,评论功能暂时关闭!