本文目录导读:

在 Python 中解析 HTML 标签,最常用的库有两个:BeautifulSoup(最适合新手)和 lxml。下面通过具体案例来说明。
使用 BeautifulSoup(推荐)
安装
pip install beautifulsoup4 lxml
基础案例:解析简单的HTML
from bs4 import BeautifulSoup
# Sample HTML document used by the lookups below
html_content = """
<html>
<head><title>我的页面</title></head>
<body>
<div class="content" id="main">
<h1>欢迎光临</h1>
<p class="text">这是一段文字</p>
<p class="text">这是另一段文字</p>
<a href="http://example.com">点击这里</a>
<ul>
<li>苹果</li>
<li>香蕉</li>
<li>橘子</li>
</ul>
</div>
</body>
</html>
"""

# Build the parse tree; 'html.parser' can be passed instead of 'lxml'
soup = BeautifulSoup(html_content, 'lxml')

# 1. Page title
print("标题:", soup.title.string)

# 2. find() returns only the first matching tag
first_p = soup.find('p')
print("第一个段落:", first_p.text)

# 3. find_all() returns every matching tag
all_p = soup.find_all('p')
print("\n所有段落:")
for i, p in enumerate(all_p, 1):
    print(f" 段落{i}: {p.text}")

# 4. Filter by class (class_ because `class` is a Python keyword)
text_paragraphs = soup.find_all('p', class_='text')
print("\nclass=text的段落:")
for p in text_paragraphs:
    print(f" {p.text}")

# 5. Filter by id
main_div = soup.find('div', id='main')
print("\nid=main的div:", main_div.name)

# 6. Link text and its href attribute
link = soup.find('a')
print(f"\n链接文本: {link.text}")
print(f"链接地址: {link['href']}")

# 7. List items
items = soup.find_all('li')
print("\n列表项:")
for item in items:
    print(f" - {item.text}")
访问真实网页案例
import requests
from bs4 import BeautifulSoup
# Fetch the page; the timeout keeps the script from hanging on a stalled server
url = "https://httpbin.org/html"
response = requests.get(url, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page
response.raise_for_status()
response.encoding = 'utf-8'

# Parse the HTML
soup = BeautifulSoup(response.text, 'lxml')

# Print every link that actually carries an href
# NOTE(review): the demo page may contain no <a>/<img> tags, in which case
# nothing is printed -- confirm, or point `url` at a page that has them.
links = soup.find_all('a')
for link in links:
    href = link.get('href')
    text = link.text.strip()
    if href:
        print(f"链接: {text} -> {href}")

# Print every image that has a src; alt falls back to a placeholder
images = soup.find_all('img')
for img in images:
    src = img.get('src')
    alt = img.get('alt', '无描述')
    if src:
        print(f"图片: {alt} -> {src}")
使用 CSS选择器
BeautifulSoup 支持 CSS 选择器,可以让代码更简洁:
from bs4 import BeautifulSoup
html = """
<div class="container">
<p id="first" class="highlight">第一个段落</p>
<p class="highlight">第二个段落</p>
<p class="normal">普通段落</p>
<div class="inner">
<p>嵌套段落</p>
</div>
</div>
"""
soup = BeautifulSoup(html, 'lxml')

# CSS selector examples
# select_one() returns the first match; select() returns a list of all matches
print("id=first的标签:", soup.select_one('#first').text)

print("\nclass=highlight的所有标签:")
for elem in soup.select('.highlight'):
    print(f" {elem.text}")

# 'div p' is the descendant combinator: any <p> nested anywhere inside a <div>
print("\ndiv内所有p标签:")
for p in soup.select('div p'):
    print(f" {p.text}")

# 'div > p' is the child combinator: only <p> tags that are direct children
print("\ndiv > p (直接子元素):")
for p in soup.select('div > p'):
    print(f" {p.text}")
高级解析技巧
处理表格数据
from bs4 import BeautifulSoup
html_table = """
<table>
<tr>
<th>姓名</th>
<th>年龄</th>
<th>城市</th>
</tr>
<tr>
<td>张三</td>
<td>25</td>
<td>北京</td>
</tr>
<tr>
<td>李四</td>
<td>30</td>
<td>上海</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_table, 'lxml')
table = soup.find('table')

# Column names come from the <th> cells
headers = [th.text for th in table.find_all('th')]
print("表头:", headers)

# Turn each data row into a {header: cell text} dict
rows = []
for tr in table.find_all('tr')[1:]:  # the first <tr> is the header row
    row_data = [cell.text for cell in tr.find_all('td')]
    rows.append(dict(zip(headers, row_data)))

print("\n数据:")
for row in rows:
    print(f" {row}")
使用正则表达式
import re
from bs4 import BeautifulSoup
html = """
<div class="price">¥99.99</div>
<div class="price">¥199.50</div>
<div class="old-price">¥299.00</div>
<p>联系电话: 138-8888-8888</p>
"""
soup = BeautifulSoup(html, 'lxml')

# Match any <div> whose class contains "price"; because the pattern is a
# substring search, "old-price" is matched as well as "price"
price_divs = soup.find_all('div', class_=re.compile('price'))
for div in price_divs:
    print(f"价格: {div.text}")

# Find the <p> whose text contains a phone number shaped like 3-4-4 digits
phone = soup.find('p', string=re.compile(r'\d{3}-\d{4}-\d{4}'))
if phone:
    print(f"电话: {phone.text}")
实际应用:爬取文章列表
import requests
from bs4 import BeautifulSoup
def parse_article_list(url):
    """Fetch an article-list page and extract its entries.

    The page is expected to wrap each article in ``<div class="article">``
    containing an ``<h2>`` title, an ``<a>`` link and, optionally, a
    ``<p class="summary">``.

    Args:
        url: Address of the article-list page.

    Returns:
        A list of dicts with the keys ``'title'``, ``'link'`` and
        ``'summary'``. Entries missing a title or a link are skipped. An
        empty list is returned if the request or the parsing fails.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'lxml')

        articles = []
        # Assumes each article lives in a <div class="article">
        for article_div in soup.find_all('div', class_='article'):
            title_tag = article_div.find('h2')
            link_tag = article_div.find('a')
            if not (title_tag and link_tag):
                continue
            summary_tag = article_div.find('p', class_='summary')
            articles.append({
                'title': title_tag.text.strip(),
                'link': link_tag.get('href', ''),
                'summary': summary_tag.text.strip() if summary_tag else '',
            })
        return articles
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and hand the
        # caller an empty list instead of raising.
        print(f"解析失败: {e}")
        return []
# 使用示例
# articles = parse_article_list('https://example.com/articles')
# for article in articles:
# print(f"标题: {article['title']}")
使用 lxml 直接解析(更高效)
from lxml import etree
html = """
<html>
<body>
<div class="content">
<p id="p1">段落1</p>
<p id="p2">段落2</p>
</div>
</body>
</html>
"""
# Parse the HTML into an element tree
tree = etree.HTML(html)

# XPath: every <p> anywhere in the document
paragraphs = tree.xpath('//p')
for p in paragraphs:
    print(f"XPath结果: {p.text}")

# Look up by id; xpath() returns a list, so check it before indexing
p1 = tree.xpath('//p[@id="p1"]')
if p1:
    print(f"id=p1: {p1[0].text}")

# Look up by class; @class="content" matches only when the attribute is
# exactly "content"
content = tree.xpath('//div[@class="content"]')
if content:
    # encoding='unicode' returns a str with the Chinese text intact; the
    # default byte serialization is ASCII and would emit &#NNNN; references
    markup = etree.tostring(content[0], pretty_print=True, encoding='unicode')
    print(f"class=content: {markup}")
| 方法 | 优点 | 适用场景 |
|---|---|---|
| BeautifulSoup | 简单易用,容错性强 | 大多数Web解析任务 |
| lxml | 速度快,支持XPath | 需要高性能的场景 |
| CSS选择器 | 简洁直观 | 熟悉CSS的开发人员 |
| 正则表达式 | 灵活精确 | 特定模式匹配 |
最佳实践建议:
- 新手首选 BeautifulSoup + lxml 组合
- 简单解析用 `find()` 和 `find_all()`
- 复杂选择用 CSS 选择器
- 性能要求高时考虑 lxml
有任何具体场景需要解析,欢迎继续提问!