A Crawler Written in Python

Saturday, October 3, 2020

As I mentioned in an earlier article, Python is well suited to writing crawlers because of its huge ecosystem of libraries, including every kind of library a crawler needs. I have also tried writing one in PHP; this time I plan to write one in Python. Last time we crawled acfun, so this time let's crawl pixiv. (P.S. In case you're wondering what pixiv is: it's an illustration site, and those illustrations are exactly what we'll be scraping.)

Preparation

Set up a working Python environment with pip configured. As shown below, I'm using Python 3.8.4 and pip 20.2.3:

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:10:56]
$ python --version
Python 3.8.4

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:26:23]
$ pip --version
pip 20.2.3 from /usr/local/lib/python3.8/site-packages/pip (python 3.8)
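
If you'd rather check from inside Python, a minimal sketch (not part of the original post) gives the same information:

import sys
from importlib import metadata  # importlib.metadata is available from Python 3.8

print(sys.version)              # interpreter version, e.g. 3.8.4
print(metadata.version("pip"))  # installed pip version, e.g. 20.2.3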

Importing the libraries

First, understand that a crawler has to download the page content before it can extract anything; it then parses what it downloaded to pull out the data you want. I covered this in the earlier article on writing a crawler in PHP. Here I use requests (HTTP requests), lxml (HTML parsing), os (filesystem utilities), and re (regular expressions), as shown below:

import requests
from lxml import etree
import os
import re

Install them quickly with pip:

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install requests
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting requests
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61 kB)
     |████████████████████████████████| 61 kB 167 kB/s
Collecting chardet<4,>=3.0.2
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/9f/f0/a391d1463ebb1b233795cabfc0ef38d3db4442339de68f847026199e69d7/urllib3-1.25.10-py2.py3-none-any.whl (127 kB)
     |████████████████████████████████| 127 kB 264 kB/s
Collecting certifi>=2017.4.17
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/5e/c4/6c4fe722df5343c33226f0b4e0bb042e4dc13483228b4718baf286f86d87/certifi-2020.6.20-py2.py3-none-any.whl (156 kB)
     |████████████████████████████████| 156 kB 254 kB/s
Collecting idna<3,>=2.5
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a2/38/928ddce2273eaa564f6f50de919327bf3a00f091b5baba8dfa9460f3a8a8/idna-2.10-py2.py3-none-any.whl (58 kB)
     |████████████████████████████████| 58 kB 88 kB/s
Installing collected packages: chardet, urllib3, certifi, idna, requests
Successfully installed certifi-2020.6.20 chardet-3.0.4 idna-2.10 requests-2.24.0 urllib3-1.25.10

# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install lxml
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lxml
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/25/64/10836b5790d37e3400a219f1e09bdc9c99bf93533f2edbaddc19796d84cd/lxml-4.5.2-cp38-cp38-macosx_10_9_x86_64.whl (4.5 MB)
     |████████████████████████████████| 4.5 MB 67.2 MB/s
Installing collected packages: lxml
Successfully installed lxml-4.5.2
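
With the libraries installed, the download-then-parse flow described above looks like this in miniature. This sketch uses https://example.com as a stand-in URL (not part of the original post) just to show the pattern; the real crawler below targets pixiv:

import requests
from lxml import etree

# Stand-in URL for illustration only
resp = requests.get("https://example.com")
if resp.status_code == 200:
    tree = etree.HTML(resp.text)            # parse the downloaded HTML
    print(tree.xpath("//title/text()"))     # extract data with an XPath query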

Fetching the full page content

def req_open_pixiv(url_main):
    # The header values can be copied from the browser devtools -> Network tab
    headersData = {
        'accept-language':'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    # GET request with requests, passing the headers
    res_data = requests.get(url_main,headers = headersData)
    return res_data

# Main entry: call req_open_pixiv to request the pixiv daily ranking page https://www.pixiv.net/ranking.php
if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
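
To confirm the request works before parsing anything, you can print the status code and a slice of the returned HTML. A quick check along these lines (not part of the original script; it reuses the req_open_pixiv function defined above):

res_data = req_open_pixiv('https://www.pixiv.net/ranking.php')
print(res_data.status_code)   # expect 200 on success
print(res_data.text[:200])    # first 200 characters of the downloaded HTML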

Fetching the images and writing them to disk

def req_open_pixiv(url_main):
    headersData = {
        'accept-language':'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    res_data = requests.get(url_main,headers = headersData)
    return res_data

def req_open_image(url):
    url_headers = {
        'referer':'https://www.pixiv.net/',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    img_res = requests.get(url,headers = url_headers)

    return img_res

def check_finder(image_dir):
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

def res_image_downloads(image_name,image_dir,image_res):
    with open(image_dir+"/"+image_name,'wb') as fw:
        fw.write(image_res.content)
        msg = image_name + " downloaded successfully"

        return msg

if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code

    url_list = []
    title_list = []
    id_list = []

    if res_code == 200:
        res_text = res_data.text
        res_etree = etree.HTML(res_text)
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title = section.xpath('./@data-title')[0]
            title_list.append(title)

            id = section.xpath('./@data-id')[0]
            id_list.append(id)

            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall('img/(.*?)_p0',breviary_url)[0]
            url = "https://i.pximg.net/img-original/img/"+url_joint+"_p0.jpg"
            url_list.append(str(url))

        # Create the output directory if it does not exist
        image_dir = './pixiv_img'
        check_finder(image_dir)

        i = -1
        for url in url_list:
            i = i + 1
            img_res = req_open_image(url)

            img_name = title_list[i] + ".png"
            # Call the download helper
            msg = res_image_downloads(img_name,image_dir,img_res)
            print(msg)
    else:
        msg = 'Request failed, status code: '+str(res_code)
        print(msg)
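
The key step above is rewriting the thumbnail URL taken from data-src into the original-image URL on i.pximg.net. Here is a worked example using a hypothetical thumbnail URL in the shape the ranking page used at the time (the exact format is an assumption):

import re

# Hypothetical thumbnail URL; only the "img/<date>/<id>_p0" part matters here
breviary_url = ("https://i.pximg.net/c/240x480/img-master/"
                "img/2020/10/03/00/00/00/12345678_p0_master1200.jpg")

url_joint = re.findall('img/(.*?)_p0', breviary_url)[0]   # "2020/10/03/00/00/00/12345678"
url = "https://i.pximg.net/img-original/img/" + url_joint + "_p0.jpg"
print(url)  # https://i.pximg.net/img-original/img/2020/10/03/00/00/00/12345678_p0.jpg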

Handling 404 images (full code)

Some originals on pixiv are .png files rather than .jpg, so a URL built with the .jpg extension can come back 404. When that happens, the script rebuilds the URL with a .png extension and appends it to the list so it gets retried at the end of the loop.

import requests
from lxml import etree
import os
import re

def req_open_pixiv(url_main):
    headersData = {
        'accept-language':'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }

    res_data = requests.get(url_main,headers = headersData)
    return res_data

def req_open_image(url):
    url_headers = {
        'referer':'https://www.pixiv.net/',
        'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    img_res = requests.get(url,headers = url_headers)

    return img_res

def check_finder(image_dir):
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)

def res_image_downloads(image_name,image_dir,image_res):
    with open(image_dir+"/"+image_name,'wb') as fw:
        fw.write(image_res.content)
        msg = image_name + " downloaded successfully"

        return msg

if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code

    url_list = []
    title_list = []
    id_list = []

    if res_code == 200:
        res_text = res_data.text
        res_etree = etree.HTML(res_text)
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title = section.xpath('./@data-title')[0]
            title_list.append(title)

            id = section.xpath('./@data-id')[0]
            id_list.append(id)

            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall('img/(.*?)_p0',breviary_url)[0]
            url = "https://i.pximg.net/img-original/img/"+url_joint+"_p0.jpg"
            url_list.append(str(url))

        image_dir = './pixiv_img'
        check_finder(image_dir)

        i = -1
        for url in url_list:
            i = i + 1
            img_res = req_open_image(url)
            img_res_code = img_res.status_code
            if img_res_code == 200:
                img_name = title_list[i] + ".png"
                msg = res_image_downloads(img_name,image_dir,img_res)
            elif img_res_code == 404:
                # 404 usually means the original is a .png, not a .jpg:
                # rebuild the URL with .png and queue it for another pass
                prefix_url = re.findall(r'(.*?)\.jpg',url)[0]
                new_url = prefix_url+".png"

                url_list.append(new_url)
                title_list.append(title_list[i])
                id_list.append(id_list[i])
                msg = "Request for image "+title_list[i]+" failed; retrying as .png"
            else:
                msg = "请求 图片"+title_list[i]+"失败!状态码为:"+str(img_res_code)

            print(str(i)+msg)
    else:
        msg = 'Request failed, status code: '+str(res_code)
        print(msg)
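
A note on how the retry actually happens: Python's for loop walks a list by index, so URLs appended inside the loop are still visited on a later pass. A tiny demonstration of that behaviour (not from the original post):

items = ["a.jpg"]
for item in items:
    print("visiting", item)
    if item.endswith(".jpg"):
        items.append(item[:-4] + ".png")   # the retry entry, visited after the originals

# prints:
# visiting a.jpg
# visiting a.png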

Running it

The output looks like this:

# ctexthuang @ ctexthuangdeiMac in ~/Desktop/python/crawler [18:55:46] C:1
$ python pixiv.py
0ReadySteady.png downloaded successfully
...
65Ch'en.png downloaded successfully