用Python写的一个crawler
在我之前的文章中有写过,Python其实很适合写爬虫,因为其拥有庞大的方法库,当然也包括爬虫必备的各类库。我也尝试用PHP写过,现在打算用Python写一个。上次我们爬取了AcFun,这次我们来爬取一下pixiv。(ps:pixiv是一个插画网站,我们要爬取的就是上面的图片)
准备工作
搭建好完整的Python环境并配置好pip。如下所示,我使用的环境是 python version = 3.8.4、pip version = 20.2.3
# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:10:56]
$ python --version
Python 3.8.4
# xxx @ xxxdeiMac in ~/Desktop/python/crawler [19:26:23]
$ pip --version
pip 20.2.3 from /usr/local/lib/python3.8/site-packages/pip (python 3.8)
加载库类
首先我们要明白,爬虫需要爬取网页内容首先要能下载到网页内容。然后对下载的内容进行分析得到想要的数据。这个在之前的php写爬虫中有说过 这里我用的是requests-http请求库、lxml-web内容解析库、os-系统方法库和re-正则匹配库 如下所示
import requests
from lxml import etree
import os
import re
安装就是用pip快速安装
# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install requests
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting requests
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/45/1e/0c169c6a5381e241ba7404532c16a21d86ab872c9bed8bdcd4c423954103/requests-2.24.0-py2.py3-none-any.whl (61 kB)
|████████████████████████████████| 61 kB 167 kB/s
Collecting chardet<4,>=3.0.2
Using cached https://pypi.tuna.tsinghua.edu.cn/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl (133 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/9f/f0/a391d1463ebb1b233795cabfc0ef38d3db4442339de68f847026199e69d7/urllib3-1.25.10-py2.py3-none-any.whl (127 kB)
|████████████████████████████████| 127 kB 264 kB/s
Collecting certifi>=2017.4.17
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/5e/c4/6c4fe722df5343c33226f0b4e0bb042e4dc13483228b4718baf286f86d87/certifi-2020.6.20-py2.py3-none-any.whl (156 kB)
|████████████████████████████████| 156 kB 254 kB/s
Collecting idna<3,>=2.5
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/a2/38/928ddce2273eaa564f6f50de919327bf3a00f091b5baba8dfa9460f3a8a8/idna-2.10-py2.py3-none-any.whl (58 kB)
|████████████████████████████████| 58 kB 88 kB/s
Installing collected packages: chardet, urllib3, certifi, idna, requests
Successfully installed certifi-2020.6.20 chardet-3.0.4 idna-2.10 requests-2.24.0 urllib3-1.25.10
# xxx @ xxxdeiMac in ~/Desktop/python/crawler [17:17:08]
$ pip install lxml
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting lxml
Downloading https://pypi.tuna.tsinghua.edu.cn/packages/25/64/10836b5790d37e3400a219f1e09bdc9c99bf93533f2edbaddc19796d84cd/lxml-4.5.2-cp38-cp38-macosx_10_9_x86_64.whl (4.5 MB)
|████████████████████████████████| 4.5 MB 67.2 MB/s
Installing collected packages: lxml
Successfully installed lxml-4.5.2
获取web全部内容
def req_open_pixiv(url_main):
    """GET the given pixiv page and return the requests Response.

    The headers imitate a desktop Chrome browser; the values were
    copied from the browser devtools -> Network tab.
    """
    browser_headers = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    return requests.get(url_main, headers=browser_headers)
# Entry point: call req_open_pixiv to request the pixiv daily ranking
# page at https://www.pixiv.net/ranking.php
if __name__ == "__main__":
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
获取图片内容并写入
def req_open_pixiv(url_main):
    """Fetch a pixiv HTML page and return the requests Response object."""
    # Browser-like headers (taken from devtools); pixiv serves the
    # normal desktop page with these.
    browser_headers = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    return requests.get(url_main, headers=browser_headers)
def req_open_image(url):
    """Download one image URL and return the requests Response.

    NOTE(review): the 'referer' header pointing at pixiv.net appears to
    be needed by the image host i.pximg.net — confirm before removing.
    """
    image_headers = {
        'referer': 'https://www.pixiv.net/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    return requests.get(url, headers=image_headers)
def check_finder(image_dir):
    """Ensure the download directory *image_dir* exists.

    Uses os.makedirs(..., exist_ok=True): unlike the original
    exists()+mkdir() pair it is race-free (no FileExistsError if the
    directory appears between the check and the create) and it also
    creates intermediate directories for nested paths.
    """
    os.makedirs(image_dir, exist_ok=True)
def res_image_downloads(image_name, image_dir, image_res):
    """Write the downloaded image bytes to image_dir/image_name.

    image_name: file name to save under.
    image_dir:  existing target directory.
    image_res:  a completed requests Response whose .content holds the
                raw image bytes.
    Returns a human-readable success message.
    """
    # os.path.join is portable; manual "/" concatenation is not.
    target_path = os.path.join(image_dir, image_name)
    with open(target_path, 'wb') as fw:
        fw.write(image_res.content)
    return image_name + "下载成功"
if __name__ == "__main__":
    # Fetch the pixiv daily ranking page and scrape every ranked work.
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code
    url_list = []
    title_list = []
    id_list = []
    if res_code == 200:
        res_etree = etree.HTML(res_data.text)
        # One <section> per ranked illustration.
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title_list.append(section.xpath('./@data-title')[0])
            # "work_id", not "id": avoid shadowing the builtin id().
            work_id = section.xpath('./@data-id')[0]
            id_list.append(work_id)
            # Rebuild the original-size image URL from the thumbnail URL.
            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall(r'img/(.*?)_p0', breviary_url)[0]
            url_list.append("https://i.pximg.net/img-original/img/" + url_joint + "_p0.jpg")
        # Create the target folder if it does not exist yet.
        image_dir = './pixiv_img'
        check_finder(image_dir)
        for i, url in enumerate(url_list):
            img_res = req_open_image(url)
            # Bug fix: the downloaded content is JPEG, so keep the URL's
            # real extension instead of always saving as ".png". Also
            # strip path separators from the title so open() cannot be
            # pointed outside image_dir by a title containing "/".
            safe_title = title_list[i].replace('/', '_').replace('\\', '_')
            img_name = safe_title + os.path.splitext(url)[1]
            msg = res_image_downloads(img_name, image_dir, img_res)
            print(msg)
    else:
        msg = '请求失败,状态码为:' + str(res_code)
        print(msg)
对404图片进行处理(全部代码)
import requests
from lxml import etree
import os
import re
def req_open_pixiv(url_main):
    """Request a pixiv HTML page with browser-like headers.

    Returns the requests Response; callers check .status_code and
    parse .text themselves.
    """
    page_headers = {
        'accept-language': 'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    page_res = requests.get(url_main, headers=page_headers)
    return page_res
def req_open_image(url):
    """GET one image URL and return the requests Response.

    NOTE(review): sends a pixiv.net 'referer' header — the image CDN
    presumably requires it; confirm before changing.
    """
    download_headers = {
        'referer': 'https://www.pixiv.net/',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
    }
    download_res = requests.get(url, headers=download_headers)
    return download_res
def check_finder(image_dir):
    """Create *image_dir* if missing (no-op when it already exists).

    os.makedirs(..., exist_ok=True) replaces the racy
    exists()+mkdir() pair and also handles nested paths.
    """
    os.makedirs(image_dir, exist_ok=True)
def res_image_downloads(image_name, image_dir, image_res):
    """Save image bytes from a requests Response to image_dir/image_name.

    Returns a success message containing the file name.
    """
    # Build the path portably instead of concatenating with "/".
    with open(os.path.join(image_dir, image_name), 'wb') as fw:
        fw.write(image_res.content)
    msg = image_name + "下载成功"
    return msg
if __name__ == "__main__":
    # Scrape the pixiv daily ranking and download every original image,
    # retrying .jpg URLs as .png on 404 (some originals are PNG).
    url_main = 'https://www.pixiv.net/ranking.php'
    res_data = req_open_pixiv(url_main)
    res_code = res_data.status_code
    url_list = []
    title_list = []
    id_list = []
    if res_code == 200:
        res_etree = etree.HTML(res_data.text)
        item_section = res_etree.xpath('//div[@class="ranking-items-container"]/div/section')
        for section in item_section:
            title_list.append(section.xpath('./@data-title')[0])
            # "work_id", not "id": avoid shadowing the builtin id().
            work_id = section.xpath('./@data-id')[0]
            id_list.append(work_id)
            # Derive the original-size URL from the thumbnail URL.
            breviary_url = section.xpath('.//div[@class="_layout-thumbnail"]/img/@data-src')[0]
            url_joint = re.findall(r'img/(.*?)_p0', breviary_url)[0]
            url_list.append("https://i.pximg.net/img-original/img/" + url_joint + "_p0.jpg")
        image_dir = './pixiv_img'
        check_finder(image_dir)
        # NOTE: a 404 appends a ".png" retry URL to url_list while we
        # iterate; Python's list iteration picks up items appended
        # during the loop, which is exactly how the retry works here.
        for i, url in enumerate(url_list):
            img_res = req_open_image(url)
            img_res_code = img_res.status_code
            if img_res_code == 200:
                # Bug fix: keep the URL's real extension (.jpg / .png)
                # instead of always saving as ".png", and strip path
                # separators from the title before using it as a name.
                safe_title = title_list[i].replace('/', '_').replace('\\', '_')
                img_name = safe_title + os.path.splitext(url)[1]
                msg = res_image_downloads(img_name, image_dir, img_res)
            elif img_res_code == 404:
                # Bug fix: escape the dot (r'\.jpg') — the original
                # unescaped '.' matched any character and only produced
                # the right prefix by luck.
                prefix_url = re.findall(r'(.*?)\.jpg', url)[0]
                url_list.append(prefix_url + ".png")
                title_list.append(title_list[i])
                id_list.append(id_list[i])
                msg = "请求 图片" + title_list[i] + "失败!改成png"
            else:
                msg = "请求 图片" + title_list[i] + "失败!状态码为:" + str(img_res_code)
            print(str(i) + msg)
    else:
        msg = '请求失败,状态码为:' + str(res_code)
        print(msg)
运行结果
如下所示
# ctexthuang @ ctexthuangdeiMac in ~/Desktop/python/crawler [18:55:46] C:1
$ python pixiv.py
0ReadySteady.png下载成功
...
65Ch'en.png下载成功