文章目录

摘要: 利用 python, requests, xpath 爬取花瓣网“美女”标签下的全部图片
# -*- coding: utf-8 -*-

'''
python 2.7.12
'''

import requests
from parsel import Selector
import time
import re, random, os


def scraw_pin_ids():
    """Page through the huaban.com 'beauty' favorite feed and collect pin ids.

    Uses the last pin id of each page as the 'max' cursor for the next
    request, and stops when the API returns an empty page.

    Returns:
        set: unique pin ids as returned by the API.
    """
    url = "http://huaban.com/favorite/beauty/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
        'Accept': 'application/json',
        'X-Request': 'JSON',
        'X-Requested-With': 'XMLHttpRequest',
    }
    pin_ids = []
    pin_id = '1068018182'  # seed cursor: paging starts below this pin id
    failures = 0
    while True:
        params = {
            'j0l4lymf': '',
            'max': pin_id,
            'limit': '20',
            'wfl': '1',
        }
        try:
            response = requests.get(url, params=params, headers=headers)
            pins = response.json()['pins']
        except (requests.RequestException, ValueError, KeyError):
            # Transient network/JSON failure: retry, but give up after several
            # consecutive errors instead of spinning forever (the original
            # bare `except: continue` could loop indefinitely).
            failures += 1
            if failures >= 5:
                break
            continue
        failures = 0
        if not pins:
            break  # empty page: reached the end of the feed
        for pin in pins:
            pin_ids.append(pin['pin_id'])
            print(pin['pin_id'])
        pin_id = pin_ids[-1]  # advance cursor to the oldest pin seen so far
        time.sleep(0.001)  # small delay to be polite to the server
    return set(pin_ids)

def scraw_urls(pin_ids):
    """Fetch each pin's detail page and scrape the image file keys from it.

    Args:
        pin_ids: iterable of pin ids (as returned by scraw_pin_ids).

    Returns:
        set: unique image keys embedded in the pin pages.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    # Compile once, outside the loop: keys sit in inline JSON like "key":"..."
    key_pattern = re.compile('"key":"(.*?)"', re.S)
    keys = []
    for pin_id in pin_ids:
        page_url = 'http://huaban.com/pins/' + str(pin_id) + '/'
        try:
            response = requests.get(page_url, headers=headers)
        except requests.RequestException:
            # Skip pins whose page cannot be fetched; keep scraping the rest.
            continue
        found = key_pattern.findall(response.text)
        keys.extend(found)
        print(found)
        print('============================================================================================================')
    return set(keys)

def download(urls, save_dir=None):
    """Download each image key into <save_dir>/huaban as 1.jpg, 2.jpg, ...

    Args:
        urls: iterable of image keys (as returned by scraw_urls).
        save_dir: base directory for the 'huaban' folder; defaults to the
            module-level `file_path`.

    Responses of 40 KB or less are skipped — presumably thumbnails or error
    pages rather than full photos (threshold kept from the original code).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
    }
    if save_dir is None:
        save_dir = file_path
    target_dir = os.path.join(save_dir, "huaban")
    # Create the target folder once, up front (the original re-checked and
    # os.chdir()'d on every loop iteration).
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    count = 1
    for key in set(urls):
        image_url = 'http://img.hb.aicdn.com/' + key
        try:
            response = requests.get(image_url, headers=headers)
        except requests.RequestException:
            continue  # skip unreachable images, keep downloading the rest
        if len(response.content) > 40000:
            # Write via an absolute path instead of chdir(), leaving the
            # process working directory untouched.
            with open(os.path.join(target_dir, str(count) + ".jpg"), 'wb') as fh:
                fh.write(response.content)
            print(u"第" + str(count) + u"张图片下载成功")
            count += 1

# Base directory for downloaded images. Raw string so the Windows
# backslashes can never be mis-read as escape sequences.
file_path = r'E:\selfprogress\programming\project\huaban'

if __name__ == '__main__':
    # Scrape pin ids, resolve them to image keys, then download everything.
    pin_ids = scraw_pin_ids()
    urls = scraw_urls(pin_ids)
    download(urls)