How do you parse list data out of a web page with Python? Read on.
1. Introduction
When you need to fetch list data from certain web pages on a schedule, how do you actually get at that data? This post is a short summary of three approaches.
2. Methods
- BeautifulSoup4
- lxml
- requests-html
3. Examples
Target site (Base64-encoded): aHR0cHM6Ly9oZWxwLmFsaXl1bi5jb20vbm90aWNlbGlzdC5odG1s
Sample list item; from each one we want the title, the link, and the timestamp:
<li class="notice-li">
<a href="/noticelist/articleid/1064842204.html">【升级】消息队列RabbitMQ版 2023年11月升级通知</a>
<span class="y-right">2023-10-26<span
class="time">17:58:27</span></span>
</li>
3.1 BeautifulSoup4 example
3.1.1 Installation
pip install beautifulsoup4
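A side note: BeautifulSoup delegates the actual parsing to a pluggable backend. The example below uses the standard library's html.parser, which needs no extra install; if lxml is available it can be swapped in for speed. A minimal sketch (the html_text fragment is made up for illustration):

from bs4 import BeautifulSoup

html_text = '<ul><li class="notice-li">demo</li></ul>'
soup = BeautifulSoup(html_text, 'html.parser')  # stdlib parser, no extra dependency
# soup = BeautifulSoup(html_text, 'lxml')       # faster; requires pip install lxml
print(soup.find('li').text)  # demo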
3.1.2 Fetching the notice list
#!/usr/bin/python
# coding=utf-8
__author__ = 'testerzhang'

import traceback

import requests
from loguru import logger as log
from bs4 import BeautifulSoup

log.add('logs/bs4_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxx"
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp_text = resp.text
        # log.debug(f"resp_text:{resp_text}")
    except Exception:
        log.error(f"Failed to fetch the list: {traceback.format_exc()}")
        resp_text = ""
    return resp_text


def resolve_html(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    # Find every <li class="notice-li">
    notice_li_elems = soup.find_all('li', class_='notice-li')
    for notice_li_elem in notice_li_elems:
        # log.debug(f"notice_li_elem:{notice_li_elem}")
        # <a href="/noticelist/articleid/1064842204.html">【升级】消息队列RabbitMQ版 2023年11月升级通知</a>
        a_elem = notice_li_elem.find('a')
        title = a_elem.text
        # a_elem.attrs holds all of the tag's attributes
        link = a_elem.attrs["href"]
        # <span class="y-right">2023-10-26<span class="time">17:58:27</span></span>
        span_elem = notice_li_elem.find('span')
        title_time = span_elem.text
        # The text reads e.g. "2023-10-1813:31:48": the date part is a fixed
        # 10 characters, so re-insert the space between date and time
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_text = fetch_list()
    resolve_html(resp_text)


if __name__ == '__main__':
    main()
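If the timestamp is needed as a real datetime value rather than a string, the spaced-out text parses directly with the standard library. A minimal sketch (raw_time mirrors the sample data above):

from datetime import datetime

# The concatenated text of the two <span> tags, as scraped above
raw_time = "2023-10-2617:58:27"
# The date part is a fixed 10 characters; re-insert the space, then parse
spaced = f"{raw_time[:10]} {raw_time[10:]}"
parsed = datetime.strptime(spaced, "%Y-%m-%d %H:%M:%S")
print(parsed)  # 2023-10-26 17:58:27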
3.2 lxml
3.2.1 Installation
pip install lxml
Commonly used members of an lxml node object (see the sketch after this list):
- xpath(): run an XPath query relative to this node; the result is always a list
- text: the node's own text
- itertext(): iterate over the text of this node and all of its descendants; for example, ''.join(node.itertext()) collects all of the text inside a node
- attrib: the node's attribute dict; for an a node, node.attrib['href'] returns its URL
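A quick self-contained demonstration of these four members (the HTML fragment is made up for illustration):

from lxml import etree

root = etree.HTML('<ul><li class="notice-li">'
                  '<a href="/n/1.html">Notice title</a>'
                  '<span>2023-10-26<span class="time">17:58:27</span></span>'
                  '</li></ul>')

li = root.xpath('//li[@class="notice-li"]')[0]  # xpath() always returns a list
a_elem = li.xpath('./a')[0]
print(a_elem.text)                   # Notice title
print(a_elem.attrib['href'])         # /n/1.html
span = li.xpath('./span')[0]
print(span.text)                     # 2023-10-26  (the node's own text only)
print(''.join(span.itertext()))      # 2023-10-2617:58:27  (includes nested text)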
3.2.2 Fetching the notice list
#!/usr/bin/python
# coding=utf-8
__author__ = 'testerzhang'

import traceback

import requests
from loguru import logger as log
from lxml import etree

log.add('logs/lxml_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxxx"
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    try:
        resp = requests.get(url, headers=headers)
        resp_text = resp.text
        # log.debug(f"resp_text:{resp_text}")
    except Exception:
        log.error(f"Failed to fetch the list: {traceback.format_exc()}")
        resp_text = ""
    return resp_text


def resolve_html(html_text):
    root = etree.HTML(html_text)
    # Find every <li class="notice-li">
    notice_li_elems = root.xpath('//li[@class="notice-li"]')
    for notice_li_elem in notice_li_elems:
        # log.debug(f"notice_li_elem:{notice_li_elem}")
        a_elems = notice_li_elem.xpath('./a')
        if len(a_elems) == 0:
            continue
        a_elem = a_elems[0]
        title = a_elem.text
        link = a_elem.attrib["href"]
        span_elems = notice_li_elem.xpath('./span')
        if len(span_elems) == 0:
            continue
        span_elem = span_elems[0]
        # itertext() also picks up the nested <span class="time"> text
        title_time = ''.join(span_elem.itertext())
        # The date part is a fixed 10 characters (e.g. "2023-10-1813:31:48");
        # re-insert the space between date and time
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_text = fetch_list()
    resolve_html(resp_text)


if __name__ == '__main__':
    main()
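As a design note, XPath in lxml can also return strings directly (text() and @href results), which collapses the per-node queries above into a couple of list expressions. A hedged sketch assuming every notice-li contains exactly one a tag (resolve_html_flat is a hypothetical helper, not part of the original script):

def resolve_html_flat(html_text):
    root = etree.HTML(html_text)
    # Both queries return plain strings; they line up only if every <li> has an <a>
    titles = root.xpath('//li[@class="notice-li"]/a/text()')
    links = root.xpath('//li[@class="notice-li"]/a/@href')
    for title, link in zip(titles, links):
        log.debug(f"title:{title},link:{link}")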
3.3 requests-html
3.3.1 Installation
pip install requests-html
3.3.2 Fetching the notice list
#!/usr/bin/python
# coding=utf-8
__author__ = 'testerzhang'

import traceback

from requests_html import HTMLSession
from loguru import logger as log

log.add('logs/request_html_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxxx"
    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }
    try:
        session = HTMLSession()
        resp = session.get(url, headers=headers)
        resp_html = resp.html
    except Exception:
        log.error(f"Failed to fetch the list: {traceback.format_exc()}")
        resp_html = None
    return resp_html


def resolve_html(resp_html):
    # xpath() works just like lxml's
    notice_li_elems = resp_html.xpath('''//li[@class="notice-li"]''')
    for notice_li_elem in notice_li_elems:
        # find() takes a CSS selector; first=True returns a single element or None
        a_elem = notice_li_elem.find("a", first=True)
        if a_elem is None:
            continue
        title = a_elem.text
        link = a_elem.attrs["href"]
        span_elem = notice_li_elem.find('span', first=True)
        if span_elem is None:
            continue
        title_time = span_elem.text
        # The date part is a fixed 10 characters (e.g. "2023-10-1813:31:48");
        # re-insert the space between date and time
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_html = fetch_list()
    # Guard against a failed fetch before parsing
    if resp_html is None:
        return
    resolve_html(resp_html)


if __name__ == '__main__':
    main()
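One thing requests-html offers that the other two approaches do not: if the list were rendered by JavaScript instead of being present in the raw HTML, the page can be rendered in a headless browser before parsing. A minimal sketch, reusing the resp object from fetch_list() (note the first call downloads a Chromium build):

resp.html.render()  # execute the page's JavaScript first
notice_li_elems = resp.html.xpath('//li[@class="notice-li"]')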