Introduction

I used a Python crawler while developing a mini program, so I am writing it down here; future crawlers can start from this as a template.

Code

User.py

This file exists to generate varying request headers. The User-Agent is a user agent string that every browser sends. If a crawler omits it, the site assumes the request did not come through a browser, treats it as a machine, and restricts access. Rotating User-Agent strings helps avoid being identified as a bot by the server and having the IP banned.

import random


def getuser():
    # Because requests are frequent, keep multiple User-Agent headers to avoid
    # being identified as a bot and getting the IP banned.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 "
        "Safari/537.36 Core/1.53.4295.400 ",
        # Opera
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 "
        "OPR/26.0.1656.60",
        "Opera/8.0 (Windows NT 5.1; U; en)",
        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
        # Firefox
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
        # Safari
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
        # Chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
        # 360
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # Taobao Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
        # Liebao (Cheetah) Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
        # QQ Browser
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
        # Sogou Browser
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
        # Maxthon Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
        # UC Browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
    ]
    # Pick one User-Agent at random for each request
    user_agent = random.choice(USER_AGENTS)
    return user_agent
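
For reference, here is a minimal usage sketch of my own (not part of the original files) showing how the header returned by getuser() might be attached to a request; the import path assumes the module sits in the recommend package, the way main.py below imports it:

import requests
import recommend.user as user  # assumed package layout, as in main.py below

# Attach a freshly chosen User-Agent to a single request
headers = {'User-Agent': user.getuser()}
resp = requests.get('https://book.douban.com/tag/小说', headers=headers)
print(resp.status_code)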

proxy.py

This file uses proxies: requests go out through highly anonymous proxy IPs instead of your own IP, which reduces the risk of being flagged by bot detection and keeps crawling safer. Two proxy sources are provided, so you can choose either one. This part stores the IP information for dynamic proxies. When a site sees the same IP fetching large amounts of data very quickly, it will also conclude that a machine rather than a normal user is at work and ban that IP, after which you cannot even open the site in a browser. Using dynamic proxies makes the requests appear to come from different IPs; there are several free proxy websites to pull them from.

# kuaidaili (free proxy site)
import requests
from lxml.html import etree

url = 'http://www.kuaidaili.com/free/inha/6'  # kuaidaili free proxy list
data = requests.get(url)
html = etree.HTML(data.text)

# Use high-anonymity proxy IPs for requests so our own IP does not get banned

# XPath expressions for the proxy table
ip_xpath = '//*[@id="list"]/table/tbody/tr/td[1]/text()'
port_xpath = '//*[@id="list"]/table/tbody/tr/td[2]/text()'
http_or_https_xpath = '//*[@id="list"]/table/tbody/tr/td[4]/text()'

# Extract the matching values
ip_list = html.xpath(ip_xpath)
port_list = html.xpath(port_xpath)
http_or_https_list = html.xpath(http_or_https_xpath)

# Combine them into requests-style proxy dicts
list_zip = zip(ip_list, port_list, http_or_https_list)
proxy_dict = {}
proxy_list = []
for ip, port, http_or_https in list_zip:
    # requests expects lowercase scheme keys ('http' / 'https')
    proxy_dict[http_or_https.lower()] = f'{ip}:{port}'
    proxy_list.append(proxy_dict)
    proxy_dict = {}

# xicidaili proxy (alternative source, kept for reference)
# import re
#
# import requests
# from bs4 import BeautifulSoup
#
# import user
#
# import random
#
#
# def getListProxies():
#     session = requests.session()
#     headers = {'User-Agent': user.getuser()}
#     proxies = random.choice(proxy_list)
#     page = session.get("http://www.xicidaili.com/nn/2", headers=headers, proxies=proxies)  # xicidaili proxy list
#     soup = BeautifulSoup(page.text, 'lxml')
#
#     proxyList = []
#     taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
#     for trtag in taglist:
#         tdlist = trtag.find_all('td')
#         proxy = {'http': tdlist[1].string + ':' + tdlist[2].string}
#
#         proxyList.append(proxy)
#         # Limit the number of proxy IPs collected
#         if len(proxyList) >= 20:
#             break
#
#     return proxyList
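
As a quick sanity check, here is a sketch of my own (not in the original post) of how proxy_list could be fed to requests; free proxies die quickly, so the error handling is the important part, and the module path is again assumed to match main.py:

import random
import requests
import recommend.proxy as proxy  # assumed package layout, as in main.py below

proxies = random.choice(proxy.proxy_list)  # e.g. {'http': '1.2.3.4:8080'}
try:
    # requests routes the call through the proxy whose key matches the URL scheme
    resp = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
    print(resp.text)
except requests.RequestException as exc:
    print('proxy failed:', exc)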

main.py

This is the main program. The comments inside are already quite thorough, so I won't repeat them here; I'll find time later to write a separate walkthrough of the XPath parsing.

from lxml import etree
import json
import random
import requests
import time
import recommend.proxy
import recommend.user as user
import urllib.parse


def getResqutes(tag, num):
    urlRequest = ""
    result_list = []
    # filename = tag + ".json"
    # with open(filename, 'a', encoding='utf-8') as file:
    #     file.write("[")
    # Tags that are already percent-encoded start with '%'; quote the others
    if tag[0] == '%':
        urlRequest = "https://book.douban.com/tag/" + urllib.parse.quote(tag) + "?start={}"
    else:
        urlRequest = "https://book.douban.com/tag/" + tag + "?start={}"
    # Request for fetching books in the given category (originally the poetry tag)
    # urls = [urlRequest.format(str(i)) for i in
    #         range(0, 1000, 20)]
    # # Douban lists 20 books per category page; to search 1000 books, add 20 to the offset after each page to jump to the next one
    url = urlRequest.format(str(random.randint(1, 50) * 20))
    # for url in urls
    # Rotate the request headers and proxy IP after every page of 20 books
    # Set the request headers dynamically
    headers = {'User-Agent': user.getuser()}
    # Set the proxy IP dynamically
    List = recommend.proxy.proxy_list
    proxies = random.choice(List)
    # Print the proxy IP used for this search
    print(proxies)
    data = requests.get(url, headers=headers, proxies=proxies)  # send the request
    html = etree.HTML(data.text)  # parse the page
    count = html.xpath("//li[@class='subject-item']")  # count is the list of the 20 books on this page; a loop over it could crawl all of them
    info = count[random.randint(0, len(count) - 1)]  # pick one book at random
    # Join the detail-page href into the string link used as the next request URL; some sites such as JD.com also require prepending "https://"
    link = ''.join(info.xpath("div[2]/h2/a/@href"))
    # Sleep for a random time after each book to mimic human behaviour
    time.sleep(random.random())
    # Print the detail-page URL to make bugs during crawling easier to spot
    print(link)
    # author_name could be taken from the category page, because the author block sits in different positions on different detail pages,
    # so it can be missed there and the whole book record would then fail
    # author_name = ''.join(info.xpath("div[2]/div[1]/text()")[0].split('/')[0]).replace(" ", "")
    # print(author_name)
    # author_name = author_name.split()
    link_data = requests.get(link, headers=headers, proxies=proxies)
    html = etree.HTML(link_data.text)
    # Book title
    book_name = html.xpath("//*[@id='mainpic']/a/@title")
    # Cover image URL
    book_img = html.xpath("//*[@id='mainpic']/a/img/@src")
    # Author info; check both locations because the position differs between pages
    author_name = html.xpath("//*[@id='info']/span[1]/a/text()")
    temp = ''.join(html.xpath("//*[@id='info']/span[1]/a/text()"))
    if temp is None or len(temp) == 0:
        author_name = html.xpath("//*[@id='info']/a[1]/text()")
    # When there is more than one author, separate them with '/' and strip extra spaces and newlines
    sum = ""
    if len(author_name) > 1:
        for item in author_name:
            sum += (str(item) + "/")
        author_name = sum
    else:
        author_name = author_name
    author_name = "".join(author_name)
    author_name = author_name.replace(" ", "")
    author_name = author_name.replace("\n", "")
    author_name = author_name.split()

    # Publisher
    press = html.xpath(u'//span[./text()="出版社:"]/following::text()[1]')
    # Publication year
    press_year = html.xpath(u'//span[./text()="出版年:"]/following::text()[1]')
    # Page count
    pages = html.xpath(u'//span[./text()="页数:"]/following::text()[1]')
    # Price
    price = html.xpath(u'//span[./text()="定价:"]/following::text()[1]')
    # ISBN
    ISBN = html.xpath(u'//span[./text()="ISBN:"]/following::text()[1]')
    # Rating
    score = html.xpath("//*[@id='interest_sectl']/div/div[2]/strong/text()")
    # Number of ratings
    number_reviewers = html.xpath("//*[@id='interest_sectl']/div/div[2]/div/div[2]/span/a/span/text()")
    # Book description
    introduction = html.xpath("//*[@class='intro']/p/text()")

    for book_name, book_img, author_name, press, press_year, pages, price, ISBN, score, number_reviewers, introduction in zip(
            book_name, book_img, author_name, press, press_year, pages, price, ISBN, score, number_reviewers,
            introduction):
        result = {
            "book_name": book_name,
            "book_img": book_img,
            "author_name": author_name,
            "press": press,
            "press_year": press_year,
            "pages": pages,
            "price": price,
            "ISBN": ISBN,
            "score": score,
            "number_reviewers": number_reviewers,
            "introduction": introduction
        }
        print(result)
        result_list.append(result)
        return result
        # Save the output as JSON
        # with open(filename, 'a', encoding='utf-8') as file:
        #     file.write(json.dumps(result, ensure_ascii=False) + ',' + '\n')
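
Finally, a hedged usage sketch of my own (not in the original): calling getResqutes with a Douban tag returns one book as a dict, which can then be dumped as JSON much like the commented-out file-writing code does; the import path again assumes the recommend package layout:

import json
from recommend.main import getResqutes  # assumed module path for main.py

# '小说' (fiction) is the Douban tag; the second argument is currently unused by the function
book = getResqutes('小说', 1)
if book:
    print(json.dumps(book, ensure_ascii=False, indent=2))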

Conclusion

I built this Python crawler by modifying one from a CSDN blog post. It is already quite capable: it can fetch essentially all of a book's information, handle Chinese tags directly as well as HTML-escaped tags, return the results in JSON format, and it also supports writing straight to a JSON file; the corresponding code is in the comments.

Reference

Original post: https://blog.csdn.net/qq_41821963/article/details/105446196