爬虫项目——final

selenium爬虫项目——完结

Introduction

过了这么些天,总算完成了第一次大作业的验收。我在一天之内把原来冗杂的代码重构了一遍,并上台演示了重构后的代码及其逻辑功能,现在对重构后的项目作最后的展示。
本项目实现了微博的自动登录、给定用户ID爬取该用户基本信息、爬取该用户的博文内容、爬取该用户的社交信息包括关注者和粉丝的基本信息(虽然严格来说selenium不算爬虫),并利用flask模板引擎对爬取的数据进行展示(没错,就是SSTI的经典flask)。重构之后的代码将爬虫写成了一个python库,称为My_selenium,里面包含登录、获取基本信息、获取博文信息、获取社交信息、获取所有信息等函数,外部通过my_flask.py调用该库获取数据进行渲染。值得注意的是因为诸如网页元素加载等问题,程序运行有时候会报错(比如在sleep的时间内未能加载出指定页面元素),可以调大对应的sleep的时间或者多试几次(属于是概率学)。
文件结构:

1
2
3
4
5
|-My_selenium.py
|-my_flask.py
|-templates
  |-login.html
  |-微博.html

My_selenium.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
from selenium import webdriver
from time import sleep
import sys
import threading
# Translation table for str.translate: every code point above the Basic
# Multilingual Plane (U+10000 .. sys.maxunicode) maps to U+FFFD.
non_bmp_map = {code_point: 0xfffd for code_point in range(0x10000, sys.maxunicode + 1)}



def mylogin(username, password,
            chrome_driver='C:\\Users\\lenovo\\Anaconda3\\Lib\\site-packages\\chromedriver.exe'):
    """Start Chrome, open Weibo and submit the login form.

    The created browser is stored in the module-level ``driver`` so that the
    other scraping functions of this module can reuse the session.

    Args:
        username: Text typed into the ``loginname`` field.
        password: Text typed into the password field.
        chrome_driver: Path handed to ``webdriver.Chrome`` as
            ``executable_path``. Defaults to the path that used to be
            hard-coded here.
    """
    global driver
    driver = webdriver.Chrome(executable_path=chrome_driver)

    driver.get("https://weibo.com/u/7791772275/home?wvr=5")
    sleep(10)
    driver.find_element_by_xpath('//a[@class="LoginBtn_btn_10QRY LoginBtn_btna_1hH9H loginBtn"]').click()
    driver.find_element_by_xpath('//*[@id="app"]/div[4]/div[1]/div/div[2]/div/div/div[5]/a[1]').click()
    # Close the current window and continue in the first remaining one.
    # NOTE(review): presumably the click above opened the login form in a new
    # window -- confirm.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    sleep(3)
    driver.find_element_by_id('loginname').click()
    driver.find_element_by_id('loginname').send_keys(username)
    driver.find_element_by_xpath('//div[@class="info_list password"]/div/span').click()
    driver.find_element_by_xpath('//div[@class="info_list password"]/div/input').send_keys(password)
    driver.find_element_by_xpath('//div[@class="info_list login_btn"]').click()
    sleep(3)
    # NOTE(review): "dmCheck" looks like a verification step shown after
    # submitting the form -- confirm against the live page.
    driver.find_element_by_id("dmCheck").click()
    driver.find_element_by_xpath('//button[@class="W_btn_a btn_34px"]').click()

def basic_info(nickname):
    """Search for *nickname*, open the first user result and read its profile.

    Uses the module-level ``driver`` (an already logged-in browser session).

    Args:
        nickname: Text typed into the site search box.

    Returns:
        A ``(dic, info)`` tuple. ``dic`` maps each counter label to its value,
        both taken from the text of the ``t_link S_txt1`` links (second word
        is the key, first word is the value). ``info`` is a list of
        ``(title, detail)`` string pairs from the detailed profile page, with
        non-BMP characters replaced via ``non_bmp_map``.
    """
    search_box = '//*[@id="plc_top"]/div/div[1]/div[2]/input'
    driver.find_element_by_xpath(search_box).click()
    driver.find_element_by_xpath(search_box).send_keys(nickname)
    driver.find_element_by_xpath('//a[@class="W_ficon ficon_search S_ficon"]').click()
    sleep(2)
    driver.find_element_by_xpath('/html/body/div[1]/div[2]/ul/li[2]/a').click()
    sleep(1)
    # div[1] selects the first entry of the user result list.
    driver.find_element_by_xpath('//*[@id="pl_user_feedList"]/div[1]/div[2]/div/a[1]').click()

    # Close the search window and continue in the first remaining one.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    sleep(1)
    driver.find_element_by_xpath('//div[@class="PCD_person_info"]/a[@class="WB_cardmore S_txt1 S_line1 clearfix"]').click()
    sleep(1)

    dic = {}
    for link in driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]'):
        parts = link.text.split()
        # Skip links whose text is not "<value> <label>" instead of raising
        # IndexError on them.
        if len(parts) >= 2:
            dic[parts[1]] = parts[0]

    titles = driver.find_elements_by_xpath('//span[@class="pt_title S_txt2"]')
    details = driver.find_elements_by_xpath('//span[contains(@class,"pt_detail")]')
    # zip stops at the shorter list, so a detail without a matching title is
    # dropped rather than crashing the whole scrape.
    info = [
        (title.text.translate(non_bmp_map), detail.text.translate(non_bmp_map))
        for title, detail in zip(titles, details)
    ]
    return dic, info

def _collect_comments(texts, times, praises, limit):
    """Build at most *limit* ``(text, time, likes)`` tuples from parallel element lists."""
    collected = []
    for text_el, time_el, praise_el in list(zip(texts, times, praises))[:limit]:
        likes = praise_el.text
        if likes == '赞':
            # A bare label without a number is reported as zero likes.
            likes = '0'
        collected.append((text_el.text.translate(non_bmp_map), time_el.text, likes))
    return collected


def blog_info(blog_num):
    """Scrape up to ten posts, with up to ten comments each, from the open profile.

    Uses the module-level ``driver``; the browser must already show the
    profile page of the user being scraped.

    Args:
        blog_num: Number of posts the user has; at most ``min(blog_num, 10)``
            posts are read.

    Returns:
        A list of ``(time, text, transmit, forward, comment, praise, comments)``
        tuples. ``transmit`` is 1 when the post text contains '转发微博', else
        0. ``forward``, ``comment`` and ``praise`` are the counter texts as
        shown on the page, except that a ``comment`` text of '评论' becomes the
        integer 0. ``comments`` is a list of ``(text, time, likes)`` tuples.
        The list is shorter than requested when the page has loaded fewer
        posts.
    """
    blog = []
    driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]')[2].click()
    sleep(5)
    driver.find_element_by_xpath('//li[contains(@class,"tab_li tab_li_first")]').click()
    driver.execute_script("window.scrollBy(0,30000)")
    sleep(5)
    search = min(blog_num, 10)

    # Click every comment toggle found on the page.
    buttons = driver.find_elements_by_xpath('//span[@class="line S_line1"]/span/em[@class="W_ficon ficon_repeat S_ficon"]')
    for button in buttons:
        driver.execute_script("arguments[0].click()", button)
    # NOTE(review): the pause is assumed to follow the loop (one wait for all
    # panels) -- confirm it was not meant to run after every click.
    sleep(2)

    count = 0  # offset of the current post's first counter in the flat counter list
    index = 0  # number of "view more" links already followed
    for i in range(search):
        # The lists are looked up again on every pass because the loop may
        # have opened and closed another window in between.
        times = driver.find_elements_by_xpath('//div[@class="WB_from S_txt2"]')[:search]
        texts = driver.find_elements_by_xpath('//div[@class="WB_text W_f14"]')[:search]
        counters = driver.find_elements_by_xpath('//span[@class="line S_line1"]/span/em[2]')
        repeat_lists = driver.find_elements_by_xpath('//div[@class="repeat_list"]')

        # Stop cleanly when fewer posts or panels are loaded than requested.
        if i >= min(len(times), len(texts), len(repeat_lists)) or count + 3 >= len(counters):
            break

        # Each post contributes four counters; the last three are used here.
        forward = counters[count + 1].text
        comment = counters[count + 2].text
        praise = counters[count + 3].text
        count += 4

        posted_at = times[i].text
        transmit = 1 if '转发微博' in texts[i].text else 0
        text = texts[i].text.translate(non_bmp_map)

        if comment == '评论':
            comment = 0
        try:
            comment_num = min(int(comment), 10)
        except ValueError:
            # Non-numeric counter text (e.g. abbreviated with '万'): there are
            # certainly more comments than the cap.
            comment_num = 10

        repeat = repeat_lists[i]
        if '查看更多' in repeat.text:
            # The full comment list is behind a "view more" link: read it in
            # the window that opens, then return to the profile.
            more_links = driver.find_elements_by_xpath('//div[@class="list_ul"]/a[contains(@class,"S_txt1 S_line1 clearfix")]')
            driver.execute_script("arguments[0].click()", more_links[index])
            driver.switch_to.window(driver.window_handles[-1])
            sleep(4)
            Comment = _collect_comments(
                driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_text"]'),
                driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_from S_txt2"]'),
                driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div/div/ul/li/span[@class="line S_line1"]/a/span/em[2]'),
                comment_num,
            )
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
            index += 1
        else:
            # All comments are inline in this post's own panel.
            Comment = _collect_comments(
                repeat.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_text"]'),
                repeat.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_from S_txt2"]'),
                repeat.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_handle W_fr"]/ul[@class="clearfix"]/li/span[@class="line S_line1"]/a/span/em[2]'),
                comment_num,
            )

        blog.append((posted_at, text, transmit, forward, comment, praise, Comment))
    return blog



def _read_open_profile(intro_xpath):
    """Read name, intro, URL, counters and detail pairs from the profile in the current window."""
    username = driver.find_element_by_xpath('//div[@class="pf_username"]/h1[@class="username"]').text
    intro = driver.find_element_by_xpath(intro_xpath).text
    url = driver.current_url
    cells = driver.find_elements_by_xpath('//td[@class="S_line1"]')
    # The first three cells are read as "<value> <label>"; fewer cells or a
    # different text layout raises IndexError for the caller to handle.
    dic = {}
    for position in range(3):
        parts = cells[position].text.split()
        dic[parts[1]] = parts[0]
    driver.find_element_by_xpath('//a[@class="WB_cardmore S_txt1 S_line1 clearfix"]').click()
    sleep(1)
    titles = driver.find_elements_by_xpath('//span[@class="pt_title S_txt2"]')
    details = driver.find_elements_by_xpath('//span[contains(@class,"pt_detail")]')
    # zip stops at the shorter list, so unmatched details are skipped.
    info = [
        (title.text.translate(non_bmp_map), detail.text.translate(non_bmp_map))
        for title, detail in zip(titles, details)
    ]
    return [[username.translate(non_bmp_map), intro.translate(non_bmp_map), url], dic, info]


def _scrape_profile_tabs(tab_count, intro_xpath):
    """Visit *tab_count* opened profile windows, newest first, and close each one."""
    from selenium.common.exceptions import NoSuchElementException

    profiles = []
    for _ in range(tab_count):
        driver.switch_to.window(driver.window_handles[-1])
        sleep(2)
        try:
            record = _read_open_profile(intro_xpath)
        except (NoSuchElementException, IndexError):
            # Page without the expected layout: skip it.
            record = None
        # Always close the window, including for skipped profiles; otherwise
        # the next pass would land on the same window again.
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        if record is not None:
            profiles.append(record)
    return profiles


def social_info(focus_num, fans_num):
    """Scrape up to ten followed accounts and up to ten fans of the open profile.

    Uses the module-level ``driver``; the browser must show the profile of the
    user being scraped.

    Args:
        focus_num: Number of accounts the user follows (capped at 10).
        fans_num: Number of fans the user has (capped at 10).

    Returns:
        ``(total_focus_statu, total_fans_statu)``. Each is a list of
        ``[[username, intro, url], dic, info]`` entries, where ``dic`` maps
        counter labels to values and ``info`` is a list of ``(title, detail)``
        pairs. Profiles whose page lacks the expected elements are omitted.
    """
    profile_links = '//div[@class="info_name W_fb W_f14"]/a[@class="S_txt1"]'

    # First counter link on the profile leads to the followed-accounts list.
    status = driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]')
    driver.execute_script("arguments[0].click()", status[0])
    sleep(3)
    focus = driver.find_elements_by_xpath(profile_links)[:min(focus_num, 10)]
    # Click every link first; the opened windows are then visited one by one.
    for link in focus:
        link.click()
    total_focus_statu = _scrape_profile_tabs(len(focus), '//div[@class="pf_intro"]')

    driver.find_element_by_xpath('//em[@class="W_ficon ficon_fans S_ficon"]').click()
    sleep(2)
    fans = driver.find_elements_by_xpath(profile_links)[:min(fans_num, 10)]
    for link in fans:
        link.click()
    total_fans_statu = _scrape_profile_tabs(
        len(fans), '//div[contains(@class,"shadow")]/div[@class="pf_intro"]'
    )

    return total_focus_statu, total_fans_statu

def _parse_count(raw, cap=10):
    """Turn a scraped counter string into an int; abbreviated values become *cap*."""
    # Large counters are abbreviated with a unit character and cannot be
    # passed to int(). Callers only ever use the first ten items, so the cap
    # is an adequate stand-in.
    if '万' in raw or '亿' in raw:
        return cap
    return int(raw)


def get_all_info(username):
    """Collect profile, posts, followed accounts and fans for *username*.

    Args:
        username: Nickname passed to ``basic_info`` to locate the user.

    Returns:
        ``(dic, info, blog, focus, fans)``: the two values returned by
        ``basic_info``, the list returned by ``blog_info`` and the two lists
        returned by ``social_info``.

    Raises:
        KeyError: If the scraped counters lack '微博', '关注' or '粉丝'.
    """
    dic, info = basic_info(username)
    blog = blog_info(_parse_count(dic['微博']))
    sleep(3)
    focus_num = _parse_count(dic['关注'])
    fans_num = _parse_count(dic['粉丝'])
    focus, fans = social_info(focus_num, fans_num)
    return dic, info, blog, focus, fans


if __name__ == '__main__':
    # Manual smoke run: log in, wait, then scrape one account and dump the
    # five result objects. Replace the placeholder credentials before running.
    mylogin('你的账号', '你的密码')
    sleep(25)
    username = "明天想遛猪"

    dic, info, blog, focus, fans = get_all_info(username)
    print(dic)
    print(info)
    print(blog)
    print(focus)
    print(fans)


my_flask.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from flask import Flask, redirect, url_for, request, render_template
from time import sleep
from My_selenium import *

# Flask application object; the route decorators below register views on it.
app = Flask(__name__)


@app.route('/')
def index():
    """Serve the login form (``login.html``)."""
    return render_template("login.html")



@app.route('/show', methods=['POST', 'GET'])
def show():
    """Log in to Weibo with the submitted credentials and render the scraped data.

    Expects the form fields ``username``, ``password`` and ``query``. A GET
    request, or a POST with a missing or empty field, is sent back to the
    login form instead of failing with 400 Bad Request.
    """
    if request.method != 'POST':
        return redirect(url_for('index'))

    username = request.form.get('username', '')
    password = request.form.get('password', '')
    query_name = request.form.get('query', '')
    if not (username and password and query_name):
        return redirect(url_for('index'))

    mylogin(username, password)
    # Fixed pause between submitting the login and starting the scrape.
    sleep(25)
    dic, info, blog, focus, fans = get_all_info(query_name)

    return render_template('微博.html', dic=dic, info=info, blog=blog, focus=focus, fans=fans)




@app.route('/login', methods=['POST', 'GET'])
def login():
    """Render the result page with no data.

    The template iterates over ``dic``, ``info``, ``blog``, ``focus`` and
    ``fans``, so empty containers are passed to keep it from failing on
    undefined variables.
    """
    return render_template('微博.html', dic={}, info=[], blog=[], focus=[], fans=[])


if __name__ == '__main__':
    # NOTE(review): debug=True enables Flask's interactive debugger and
    # reloader; keep it to local development only.
    app.run(debug=True)

login.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Document</title>
    <style>
        body {
            background: url('https://cdn.pixabay.com/photo/2018/08/14/13/23/ocean-3605547_1280.jpg') no-repeat;
            background-size: 100% 130%;
        }

        #login_box {
            width: 20%;
            height: 400px;
            background-color: #00000060;
            margin: auto;
            margin-top: 10%;
            text-align: center;
            border-radius: 10px;
            padding: 50px 50px;
        }

        h2 {
            color: #ffffff90;
            margin-top: 5%;
        }

        /* Applies to every input wrapper; the previous selector matched no element. */
        .input_box {
            margin-top: 5%;
        }

        span {
            color: #fff;
        }

        input {
            border: 0;
            width: 60%;
            font-size: 15px;
            color: #fff;
            background: transparent;
            border-bottom: 2px solid #fff;
            padding: 5px 10px;
            outline: none;
            margin-top: 10px;
        }

        button {
            margin-top: 50px;
            width: 60%;
            height: 30px;
            border-radius: 10px;
            border: 0;
            color: #fff;
            text-align: center;
            line-height: 30px;
            font-size: 15px;
            background-image: linear-gradient(to right, #30cfd0, #330867);
        }

        #sign_up {
            margin-top: 45%;
            margin-left: 60%;
        }

        a {
            color: #b94648;
        }
    </style>
</head>

<body>
    <!-- Posts username, password and query to the /show route. -->
    <form action="/show" method="post">
        <div id="login_box">
            <h2>LOGIN</h2>
            <div class="input_box">
                <input type="text" placeholder="请输入用户名" name="username" id="username">
            </div>
            <div class="input_box">
                <input type="password" placeholder="请输入密码" name="password" id="password">
            </div>
            <div class="input_box">
                <input type="text" placeholder="请输入查询ID" name="query" id="query">
            </div>
            <button type="submit">登录</button><br>
        </div>
    </form>
</body>

</html>

微博.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
<html>
<head>
<title>微博爬取</title>
</head>
<body>
{# Context: dic (label -> value), info (title/detail pairs), blog (post tuples), focus and fans (profile entries). #}
<h2 align="center">基本信息</h2>
{%for i in dic.items()%}<br>
    <h5 align="left">{{i[0]}}:{{i[1]}}</h5><br>
{%endfor%}
{%for i in info%}<br>
    <h5 align="left">{{i[0]}}:{{i[1]}}</h5><br>
{%endfor%}
{# Post tuple layout: 0 time, 1 text, 2 repost flag, 3 forwards, 4 comments, 5 likes, 6 comment list. #}
{%for i in blog%}<br>
    <h2 align="center">第{{loop.index}}篇博文:</h2><br>
    <h5 align="left">发帖时间:{{i[0]}}</h5><br>
    <h5 align="left">发帖内容:{{i[1]}}</h5><br>

    {%if i[2]==1%}
        <h5 align="left">是否转发:是</h5><br>
    {%else%}
        <h5 align="left">是否转发:否</h5><br>
    {%endif%}

    {# A counter showing only its label means the count is zero. #}
    {%if i[3]=="转发"%}
        <h5 align="left">转发数:0</h5><br>
    {%else%}
        <h5 align="left">转发数:{{i[3]}}</h5><br>
    {%endif%}

    {%if i[4]=="评论"%}
        <h5 align="left">评论数:0</h5><br>
    {%else%}
        <h5 align="left">评论数:{{i[4]}}</h5><br>
    {%endif%}

    {%if i[5]=="赞"%}
        <h5 align="left">点赞数:0</h5><br>
    {%else%}
        <h5 align="left">点赞数:{{i[5]}}</h5><br>
    {%endif%}

    <h2 align="center">评论</h2>
    {%for j in i[6]%}
        <h3 align="center">第{{loop.index}}条评论</h3><br>
        <h5 align="left">评论内容:{{j[0]}}</h5><br>
        <h5 align="left">评论时间:{{j[1]}}</h5><br>
        <h5 align="left">点赞数:{{j[2]}}</h5><br>
    {%endfor%}

{%endfor%}
<h2 align="center">关注列表</h2><br>
{%for i in focus%}
    <h3 align="center">第{{loop.index}}位关注者</h3><br>
    <h5 align="left">昵称:{{i[0][0]}}</h5><br>
    <h5 align="left">介绍:{{i[0][1]}}</h5><br>
    <h5 align="left">个性域名:{{i[0][2]}}</h5><br>
    {% for j in i[1].items()%}
        <h5 align="left">{{j[0]}}:{{j[1]}}</h5><br>
    {%endfor%}

    {% for j in i[2]%}
        <h5 align="left">{{j[0]}}:{{j[1]}}</h5><br>
    {%endfor%}

{%endfor%}

<h2 align="center">粉丝列表</h2><br>
{%for i in fans%}
    <h3 align="center">第{{loop.index}}位粉丝</h3><br>
    <h5 align="left">昵称:{{i[0][0]}}</h5><br>
    <h5 align="left">介绍:{{i[0][1]}}</h5><br>
    <h5 align="left">个性域名:{{i[0][2]}}</h5><br>
    {% for j in i[1].items()%}
        <h5 align="left">{{j[0]}}:{{j[1]}}</h5><br>
    {%endfor%}

    {% for j in i[2]%}
        <h5 align="left">{{j[0]}}:{{j[1]}}</h5><br>
    {%endfor%}

{%endfor%}
</body>
</html>

Tips

可使用 python my_flask.py 命令启动 Flask 应用,然后访问 http://127.0.0.1:5000(5000 为默认端口)。在登录界面输入账号、密码和需要爬取的用户ID后即可运行程序,程序会将爬取到的数据直接显示在微博.html页面上。


本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!