用Python写的一个crawler

在我之前的文章中有写过:Python其实很适合写爬虫,因为其拥有庞大的方法库,当然也包括爬虫必备的各类库。我也尝试用PHP写过,现在打算用Python写一个。上次我们爬取了acfun,这次我们爬取一下pixiv吧。(ps:pixiv是一个插画网站,我们这次要爬取的就是上面的图片)

准备工作

建立一个完整的Python环境并配置好pip。如下所示,我使用的是python version = 3.8.4、pip version = 20.2.3的环境

```shell
# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:10:56]
$ python --version
Python 3.8.4

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:26:23]
$ pip --version
pip 20.2.3 from /usr/local/lib/python3.8/site-packages/pip (python 3.8)
```


加载库类

首先我们要明白:爬虫要分析网页内容,首先得能下载到网页内容,然后再对下载的内容进行解析,得到想要的数据。这个在之前用PHP写爬虫的文章中有说过。这里我用的是requests(HTTP请求库)、lxml(web内容解析库)、os(系统方法库)和re(正则匹配库),如下所示

```python
import requests
from lxml import etree
import os
import re
```

安装就是用pip快速安装

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install requests
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting requests
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61 kB)
     |████████████████████████████████| 61 kB 167 kB/s
Collecting chardet<4,>=3.0.2
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/9f/f0/a391d1463ebb1b233795cabfc0ef38d3db4442339de68f847026199e69d7/urllib3-1.25.10-py2.py3-none-any.whl (127 kB)
     |████████████████████████████████| 127 kB 264 kB/s
Collecting certifi>=2017.4.17
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/5e/c4/6c4fe722df5343c33226f0b4e0bb042e4dc13483228b4718baf286f86d87/certifi-2020.6.20-py2.py3-none-any.whl (156 kB)
     |████████████████████████████████| 156 kB 254 kB/s
Collecting idna<3,>=2.5
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a2/38/928ddce2273eaa564f6f50de919327bf3a00f091b5baba8dfa9460f3a8a8/idna-2.10-py2.py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 88 kB/s
Installing collected packages: chardet, urllib3, certifi, idna, requests
Successfully installed certifi-2020.6.20 chardet-3.0.4 idna-2.10 requests-2.24.0 urllib3-1.25.10

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install lxml
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lxml
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/25/64/10836b5790d37e3400a219f1e09bdc9c99bf93533f2edbaddc19796d84cd/lxml-4.5.2-cp38-cp38-macosx_10_9_x86_64.whl (4.5 MB)
     |████████████████████████████████| 4.5 MB 67.2 MB/s
Installing collected packages: lxml
Successfully installed lxml-4.5.2

获取web全部内容

def req_open_pixiv(url_main):
    """Request the pixiv ranking page and return the requests Response.

    The header values are copied from a real browser session (browser
    console -> Network tab) so pixiv serves the normal HTML page.
    """
    headersData = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    # Plain GET request carrying the browser-like headers.
    res_data = requests.get(url_main, headers=headersData)
    return res_data


# Entry point: fetch the pixiv daily ranking page
# (https://www.pixiv.net/ranking.php).
# Fix: the article's Markdown rendering stripped the underscores and showed
# `if name == "main"`; the correct guard is `__name__ == "__main__"`.
if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)


获取图片内容并写入

def req_open_pixiv(url_main):
    """Request the pixiv ranking page and return the requests Response."""
    # Browser-like headers (from the browser's network inspector).
    headersData = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    res_data = requests.get(url_main, headers=headersData)
    return res_data


def req_open_image(url):
    """Download one image. The referer header is sent because pixiv's
    image server appears to reject hotlinked requests without it."""
    url_headers = {
        'referer': 'https://www.pixiv.net/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    img_res = requests.get(url, headers=url_headers)
    return img_res


def check_finder(image_dir):
    """Create the download directory if it does not already exist."""
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)


def res_image_downloads(image_name, image_dir, image_res):
    """Write the downloaded bytes to image_dir/image_name and return a
    status message."""
    with open(image_dir + "/" + image_name, 'wb') as fw:
        fw.write(image_res.content)
    msg = image_name + "下载成功"
    return msg


if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code

    url_list = []
    title_list = []
    id_list = []

    if res_code == 200:
        res_text = res_data.text
        res_etree = etree.HTML(res_text)
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title = section.xpath('./@data-title')[0]
            title_list.append(title)

            # Renamed from "id" so the builtin is not shadowed.
            illust_id = section.xpath('./@data-id')[0]
            id_list.append(illust_id)

            # The thumbnail URL embeds the original image path between
            # "img/" and "_p0"; rebuild the full-size URL from that part.
            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall('img/(.*?)_p0', breviary_url)[0]
            url = "https://i.pximg.net/img-original/img/" + url_joint + "_p0.jpg"
            url_list.append(str(url))

        # Make sure the target folder exists before writing into it.
        image_dir = './pixiv_img'
        check_finder(image_dir)

        # enumerate replaces the original manual "i = -1; i = i + 1" counter.
        for i, url in enumerate(url_list):
            img_res = req_open_image(url)
            img_name = title_list[i] + ".png"
            # Hand off to the download helper and report its message.
            msg = res_image_downloads(img_name, image_dir, img_res)
            print(msg)
    else:
        msg = '请求失败,状态码为:' + str(res_code)
        print(msg)

对404图片进行处理(全部代码)

```python
import requests
from lxml import etree
import os
import re
```

def req_open_pixiv(url_main):
    """Request the pixiv ranking page and return the requests Response."""
    # Browser-like headers (from the browser's network inspector).
    headersData = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    res_data = requests.get(url_main, headers=headersData)
    return res_data


def req_open_image(url):
    """Download one image. The referer header is sent because pixiv's
    image server appears to reject hotlinked requests without it."""
    url_headers = {
        'referer': 'https://www.pixiv.net/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    img_res = requests.get(url, headers=url_headers)

    return img_res


def check_finder(image_dir):
    """Create the download directory if it does not already exist."""
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)


def res_image_downloads(image_name, image_dir, image_res):
    """Write the downloaded bytes to image_dir/image_name and return a
    status message."""
    with open(image_dir + "/" + image_name, 'wb') as fw:
        fw.write(image_res.content)
    msg = image_name + "下载成功"

    return msg


# Fix: Markdown rendering stripped the dunders ("if name == main");
# the guard must be `__name__ == "__main__"`.
if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code

    url_list = []
    title_list = []
    id_list = []

    if res_code == 200:
        res_text = res_data.text
        res_etree = etree.HTML(res_text)
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title = section.xpath('./@data-title')[0]
            title_list.append(title)

            # Renamed from "id" so the builtin is not shadowed.
            illust_id = section.xpath('./@data-id')[0]
            id_list.append(illust_id)

            # The thumbnail URL embeds the original image path between
            # "img/" and "_p0"; rebuild the full-size URL from that part.
            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall('img/(.*?)_p0', breviary_url)[0]
            url = "https://i.pximg.net/img-original/img/" + url_joint + "_p0.jpg"
            url_list.append(str(url))

        # Make sure the target folder exists before writing into it.
        image_dir = './pixiv_img'
        check_finder(image_dir)

        # NOTE: the 404 branch appends a ".png" retry URL while this loop
        # runs; Python's list iterator picks up items appended during
        # iteration, so the retries are processed at the end of the same
        # loop. enumerate replaces the original manual "i = -1" counter
        # and behaves identically under that growth.
        for i, url in enumerate(url_list):
            img_res = req_open_image(url)
            img_res_code = img_res.status_code
            if img_res_code == 200:
                img_name = title_list[i] + ".png"
                msg = res_image_downloads(img_name, image_dir, img_res)
            elif img_res_code == 404:
                # Some originals are .png, not .jpg: strip the extension and
                # queue a .png retry. Fix: the dot is now escaped — the
                # original '(.*?).jpg' let "." match any character.
                prefix_url = re.findall(r'(.*?)\.jpg', url)[0]
                new_url = prefix_url + ".png"

                url_list.append(new_url)
                title_list.append(title_list[i])
                id_list.append(id_list[i])
                msg = "请求 图片" + title_list[i] + "失败!改成png"
            else:
                msg = "请求 图片" + title_list[i] + "失败!状态码为:" + str(img_res_code)

            print(str(i) + msg)
    else:
        msg = '请求失败,状态码为:' + str(res_code)
        print(msg)

运行结果

如下所示

```shell
# ctexthuang @ ctexthuangdeiMac in ~/Desktop/python/crawler [18:55:46] C:1
$ python pixiv.py
0ReadySteady.png下载成功
...
65Ch'en.png下载成功
```

----------end

本文为ctexthuang原创文章,转载请注明来自ctexthuang_blog

Edit with Markdown
召唤看板娘