Python Web Scraping: Methods and Code

How do you parse web page data with Python? Read on below.

1. Introduction

When we need to fetch list data from certain web pages on a schedule, how do we go about it? This article gives a brief summary.

2. Methods

  • BeautifulSoup4
  • lxml
  • requests-html

3. Examples

Target site (Base64-encoded): aHR0cHM6Ly9oZWxwLmFsaXl1bi5jb20vbm90aWNlbGlzdC5odG1s
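
The address above is Base64-encoded. A minimal sketch of recovering it with the standard library (variable names are illustrative):

import base64

encoded = "aHR0cHM6Ly9oZWxwLmFsaXl1bi5jb20vbm90aWNlbGlzdC5odG1s"
url = base64.b64decode(encoded).decode("utf-8")  # yields the target URL as a string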

Sample list data (note that the date and the nested time span have no separator between them, which is why each example below re-inserts a space after the first 10 characters):

<li class="notice-li">
<a href="/noticelist/articleid/1064842204.html">【升级】消息队列RabbitMQ版 2023年11月升级通知</a>
<span class="y-right">2023-10-26<span class="time">17:58:27</span></span>
</li>

3.1 BeautifulSoup4 Example

3.1.1 Installation

pip install beautifulsoup4
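
To sanity-check the installation, here is a minimal sketch parsing an illustrative fragment (the HTML string is made up for the demo):

from bs4 import BeautifulSoup

html = '<li class="notice-li"><a href="/x.html">example title</a></li>'
soup = BeautifulSoup(html, 'html.parser')
a = soup.find('a')
print(a.text, a.attrs['href'])  # example title /x.html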

3.1.2 Fetching the Notice List

#!/usr/bin/python
# coding=utf-8

__author__ = 'testerzhang'

import traceback

import requests
from loguru import logger as log
from bs4 import BeautifulSoup

log.add('logs/bs4_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxx"

    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }

    try:
        resp = requests.get(url, headers=headers, timeout=30)  # timeout guards against hanging on a stalled connection
        resp_text = resp.text
        # log.debug(f"resp_text:{resp_text}")

    except Exception:
        log.error(f"Exception while fetching the list: {traceback.format_exc()}")
        resp_text = ""

    return resp_text


def resolve_html(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    # Find every <li class="notice-li">
    notice_li_elems = soup.find_all('li', class_='notice-li')
    for notice_li_elem in notice_li_elems:
        # log.debug(f"notice_li_elem:{notice_li_elem}")
        # <a href="/noticelist/articleid/1064842204.html">【升级】消息队列RabbitMQ版 2023年11月升级通知</a>
        a_elem = notice_li_elem.find('a')
        title = a_elem.text
        # a_elem.attrs holds all attributes of the node
        link = a_elem.attrs["href"]

        # <span class="y-right">2023-10-26<span class="time">17:58:27</span></span>
        span_elem = notice_li_elem.find('span')
        title_time = span_elem.text
        # e.g. "2023-10-1813:31:48": the date part is a fixed 10 characters, so re-insert the missing space
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_text = fetch_list()
    resolve_html(resp_text)

if __name__ == '__main__':
    main()
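
Run against the sample <li> shown earlier, each iteration logs a line of the form (loguru timestamps aside):

title:【升级】消息队列RabbitMQ版 2023年11月升级通知,link:/noticelist/articleid/1064842204.html,title_time:2023-10-26 17:58:27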

3.2 lxml

3.2.1 Installation

pip install lxml

Commonly used methods and attributes on lxml node objects (a short sketch follows this list):

  • xpath(): query nodes with an XPath expression; the result is a list
  • text: the node's own text
  • itertext(): iterate over the text of the node and all of its descendants; e.g. ''.join(node.itertext()) gathers all text inside the node
  • attrib: the node's attribute dict; for an a node, node.attrib['href'] gives its URL
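
A minimal sketch exercising these methods on an illustrative fragment (the HTML string is made up for the demo):

from lxml import etree

html = '<li><a href="/x.html">example title</a><span>2023-10-26<span class="time">17:58:27</span></span></li>'
tree = etree.HTML(html)
a = tree.xpath('//a')[0]
print(a.text, a.attrib['href'])   # example title /x.html
span = tree.xpath('//li/span')[0]
print(''.join(span.itertext()))   # 2023-10-2617:58:27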

3.2.2 Fetching the Notice List

#!/usr/bin/python
# coding=utf-8

__author__ = 'testerzhang'

import traceback

import requests
from loguru import logger as log
from lxml import etree

log.add('logs/lxml_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxxx"

    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }

    try:
        resp = requests.get(url, headers=headers, timeout=30)  # timeout guards against hanging on a stalled connection
        resp_text = resp.text
        # log.debug(f"resp_text:{resp_text}")

    except Exception:
        log.error(f"Exception while fetching the list: {traceback.format_exc()}")
        resp_text = ""

    return resp_text


def resolve_html(html_text):
    tree = etree.HTML(html_text)
    # Find every <li class="notice-li">
    notice_li_elems = tree.xpath('//li[@class="notice-li"]')
    for notice_li_elem in notice_li_elems:
        # log.debug(f"notice_li_elem:{notice_li_elem}")
        a_elems = notice_li_elem.xpath('./a')
        if len(a_elems) == 0:
            continue
        a_elem = a_elems[0]

        title = a_elem.text
        link = a_elem.attrib["href"]

        span_elems = notice_li_elem.xpath('./span')
        if len(span_elems) == 0:
            continue
        span_elem = span_elems[0]

        title_time = ''.join(span_elem.itertext())
        # e.g. "2023-10-1813:31:48": the date part is a fixed 10 characters, so re-insert the missing space
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_text = fetch_list()
    resolve_html(resp_text)

if __name__ == '__main__':
    main()

3.3 requests-html

3.3.1 Installation

pip install requests-html
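
requests-html builds on requests and lxml/pyquery and exposes a similar find API. A minimal sketch parsing an illustrative fragment with no network call (the HTML string is made up for the demo):

from requests_html import HTML

html = HTML(html='<li class="notice-li"><a href="/x.html">example title</a></li>')
a = html.find('a', first=True)
print(a.text, a.attrs['href'])  # example title /x.html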

3.3.2 Fetching the Notice List

#!/usr/bin/python
# coding=utf-8

__author__ = 'testerzhang'

import traceback

from requests_html import HTMLSession
from loguru import logger as log

log.add('logs/request_html_demo_{time:YYYY-MM-DD}.log')


def fetch_list():
    # Fill in the target URL here
    url = "xxxx"

    headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "authority": "help.aliyun.com",
        "cache-control": "no-cache",
        "pragma": "no-cache",
        "referer": "https://help.aliyun.com/noticelist.html",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
    }

    try:
        session = HTMLSession()
        resp = session.get(url, headers=headers, timeout=30)  # timeout guards against hanging on a stalled connection
        resp_html = resp.html

    except Exception:
        log.error(f"Exception while fetching the list: {traceback.format_exc()}")
        resp_html = None

    return resp_html


def resolve_html(resp_html):
    notice_li_elems = resp_html.xpath('''//li[@class="notice-li"]''')
    for notice_li_elem in notice_li_elems:
        a_elem = notice_li_elem.find("a", first=True)
        if a_elem is None:
            continue

        title = a_elem.text
        link = a_elem.attrs["href"]

        span_elem = notice_li_elem.find('span', first=True)
        if span_elem is None:
            continue

        title_time = span_elem.text
        # e.g. "2023-10-1813:31:48": the date part is a fixed 10 characters, so re-insert the missing space
        title_time = f"{title_time[:10]} {title_time[10:]}"
        log.debug(f"title:{title},link:{link},title_time:{title_time}")


def main():
    resp_html = fetch_list()
    resolve_html(resp_html)

if __name__ == '__main__':
    main()

This article is not licensed to any organization, company, or individual for republication; reprinting without the author's permission is prohibited!

Feel free to follow my WeChat official account, testerzhang, where original technical articles are pushed as soon as they are published.


Updated: 2023-10-30