selenium爬虫项目——完结

Introduction

过了这么些天，总算是完成了第一次大作业的验收，我也成功在一天时间内把原来冗杂的代码重构了一遍，并将重构的代码及其逻辑功能上台演示了一遍，现在针对该项目重构后作最后的展示。
本项目实现了微博的自动登录、给定用户ID爬取该用户基本信息、爬取该用户的博文内容、爬取该用户的社交信息包括关注者和粉丝的基本信息（虽然严格来说selenium不算爬虫），并利用flask模板引擎对爬取的数据进行展示（没错，就是SSTI的经典flask）。重构之后的代码将爬虫写成了一个python库，称为My_selenium，里面包含登录、获取基本信息、获取博文信息、获取社交信息、获取所有信息等函数，外部通过my_flask.py调用该库获取数据进行渲染。值得注意的是因为诸如网页元素加载等问题，程序运行有时候会报错（比如在sleep的时间内未能加载出指定页面元素），可以调大对应的sleep的时间或者多试几次（属于是概率学）。
文件结构：

|-My_selenium.py
|-my_flask.py
|-templates
  |-login.html
  |-微博.html

My_selenium.py

from selenium import webdriver
from time import sleep
import sys
import threading
# Translation table for str.translate: maps every code point from U+10000 up to
# sys.maxunicode (i.e. everything outside the Basic Multilingual Plane) to
# U+FFFD. The scraping functions below apply it to text read from the page.
non_bmp_map=dict.fromkeys(range(0x10000,sys.maxunicode+1),0xfffd)



def mylogin(username: str,password: str) -> None:
    """Start Chrome, open Weibo and submit the login form.

    Creates the module-level ``driver`` used by every other function in this
    file, so it must be called first.

    Args:
        username: Account name typed into the ``loginname`` field.
        password: Password typed into the password field.

    NOTE(review): the chromedriver path is hard-coded to one machine, and the
    fixed ``sleep`` calls assume the page finishes loading in time — an element
    lookup raises if it does not.
    """
    global driver
    chrome_driver='C:\\Users\\lenovo\\Anaconda3\\Lib\\site-packages\\chromedriver.exe'
    driver = webdriver.Chrome(executable_path = chrome_driver)
    
    driver.get("https://weibo.com/u/7791772275/home?wvr=5")
    sleep(10)
    # Click the login button, then a link inside the dialog that appears.
    # Presumably the second click opens the classic login page in a new
    # window — TODO confirm.
    driver.find_element_by_xpath('//a[@class="LoginBtn_btn_10QRY LoginBtn_btna_1hH9H loginBtn"]').click()
    driver.find_element_by_xpath('//*[@id="app"]/div[4]/div[1]/div/div[2]/div/div/div[5]/a[1]').click()
    # Close the window currently in focus and continue in the first one left.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    sleep(3)
    # Fill in the credentials and submit.
    driver.find_element_by_id('loginname').click()
    driver.find_element_by_id('loginname').send_keys(username)
    driver.find_element_by_xpath('//div[@class="info_list password"]/div/span').click()
    driver.find_element_by_xpath('//div[@class="info_list password"]/div/input').send_keys(password)
    driver.find_element_by_xpath('//div[@class="info_list login_btn"]').click()
    sleep(3)
    # Presumably a secondary verification step (element "dmCheck" followed by a
    # confirm button) — TODO confirm what this dialog is.
    driver.find_element_by_id("dmCheck").click()
    driver.find_element_by_xpath('//button[@class="W_btn_a btn_34px"]').click()

def basic_info(nickname):
    """Search Weibo for *nickname*, open the first user hit and scrape the profile.

    Requires the module-level ``driver`` created by ``mylogin``.

    Args:
        nickname: Text typed into the site search box.

    Returns:
        A ``(dic, info)`` tuple. ``dic`` maps each counter label shown on the
        profile to its displayed value (both strings, taken from the
        ``t_link S_txt1`` links). ``info`` is a list of ``(title, detail)``
        string pairs from the profile's detail card, with characters outside
        the BMP replaced via ``non_bmp_map``.
    """
    global driver
    # Type the nickname into the top search box and run the search.
    driver.find_element_by_xpath('//*[@id="plc_top"]/div/div[1]/div[2]/input').click()
    driver.find_element_by_xpath('//*[@id="plc_top"]/div/div[1]/div[2]/input').send_keys(nickname)
    driver.find_element_by_xpath('//a[@class="W_ficon ficon_search S_ficon"]').click()
    sleep(2)
    # Second tab of the result page — presumably the "users" tab; TODO confirm.
    driver.find_element_by_xpath('/html/body/div[1]/div[2]/ul/li[2]/a').click()
    sleep(1)
    # Open the first entry of the user list (use div[2] after
    # pl_user_feedList in this XPath to pick the second entry instead).
    driver.find_element_by_xpath('//*[@id="pl_user_feedList"]/div[1]/div[2]/div/a[1]').click()

    # Close the window currently in focus and continue in the first one left.
    driver.close()
    driver.switch_to.window(driver.window_handles[0])
    sleep(1)
    driver.find_element_by_xpath('//div[@class="PCD_person_info"]/a[@class="WB_cardmore S_txt1 S_line1 clearfix"]').click()
    sleep(1)

    # Each counter link reads "<value> <label>"; links that do not split into
    # two tokens are skipped instead of raising IndexError.
    dic={}
    for link in driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]'):
        parts=link.text.split()
        if len(parts)>=2:
            dic[parts[1]]=parts[0]

    titles=driver.find_elements_by_xpath('//span[@class="pt_title S_txt2"]')
    details=driver.find_elements_by_xpath('//span[contains(@class,"pt_detail")]')
    # zip() stops at the shorter list, so a title/detail count mismatch can no
    # longer index past the end of the titles.
    info=[
        (title.text.translate(non_bmp_map),detail.text.translate(non_bmp_map))
        for title,detail in zip(titles,details)
    ]
    return dic,info

def blog_info(blog_num: int) -> list:
    """Scrape up to ten posts, with their comments, from the open profile.

    Requires the module-level ``driver`` to be showing a user's profile page.

    Args:
        blog_num: Number of posts to read; capped at 10.

    Returns:
        A list with one tuple per post:
        ``(time, text, transmit, forward, comment, praise, Comment)``.
        ``transmit`` is 1 when the post text contains '转发微博', else 0.
        ``forward`` and ``praise`` are the strings read from the page.
        ``comment`` is the string read from the page, or the int 0 when that
        string is the bare label '评论'. ``Comment`` is a list of
        ``(text, time, praise)`` tuples with at most 10 entries.

    NOTE(review): ``int(comment)`` raises ValueError if the displayed comment
    count is not a plain integer. ``Time[i]``, ``Text[i]`` and the ``Status``
    indexing raise IndexError if fewer posts were loaded than ``search``.
    """
    global driver
    blog=[]
    comments=[]
    # Third counter link on the profile — presumably the posts counter; TODO confirm.
    driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]')[2].click()
    sleep(5)
    driver.find_element_by_xpath('//li[contains(@class,"tab_li tab_li_first")]').click()
    # Scroll far down; presumably to make the page load more posts — TODO confirm.
    driver.execute_script("window.scrollBy(0,30000)")
    sleep(5)    
    search=min(blog_num,10)
    # Click every "ficon_repeat" icon through JavaScript. Presumably this expands
    # each post's comment list (the loop below reads div.repeat_list) — TODO confirm.
    button=driver.find_elements_by_xpath('//span[@class="line S_line1"]/span/em[@class="W_ficon ficon_repeat S_ficon"]')
    for i in button:
        driver.execute_script("arguments[0].click()",i)
    sleep(2)
    # count: offset into the flat Status list, advanced by 4 per post.
    # index: how many "查看更多" links have been followed so far.
    count=0
    index=0
    for i in range(search):
        # The element lists are re-queried on every iteration; the comment
        # branch below may switch windows in between.
        Time=driver.find_elements_by_xpath('//div[@class="WB_from S_txt2"]')[:search]
        Text=driver.find_elements_by_xpath('//div[@class="WB_text W_f14"]')[:search]
        Status=driver.find_elements_by_xpath('//span[@class="line S_line1"]/span/em[2]')
        # Assumes four em[2] entries per post, of which entries 1..3 are
        # forward / comment / praise — TODO confirm against the page layout.
        forward=Status[count+1].text
        comment=Status[count+2].text
        praise=Status[count+3].text
        
        Comment=[]
        transmit=0
        time=Time[i].text
        if '转发微博' in Text[i].text:
            transmit=1
            #count+=3
        text=Text[i].text.translate(non_bmp_map)

        # A bare '评论' label is treated as zero comments.
        if comment=='评论':
            comment=0
        comment_num=min(int(comment),10)
        count+=4
        repeat=driver.find_elements_by_xpath('//div[@class="repeat_list"]')[i]
        if '查看更多' in repeat.text:
            # The inline list is truncated: follow the "查看更多" link, read the
            # comments in the last window handle, then close that window.
            test=driver.find_elements_by_xpath('//div[@class="list_ul"]/a[contains(@class,"S_txt1 S_line1 clearfix")]')
            driver.execute_script("arguments[0].click()",test[index])
            driver.switch_to.window(driver.window_handles[-1])
            sleep(4)
            comment_text=driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_text"]')[:comment_num]
            comment_time=driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_from S_txt2"]')[:comment_num]
            comment_praise=driver.find_elements_by_xpath('//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div/div/ul/li/span[@class="line S_line1"]/a/span/em[2]')[:comment_num]
            for j in range(min(comment_num,len(comment_text))):
                # A bare '赞' label is stored as '0' likes.
                if comment_praise[j].text=='赞':
                    Comment.append((comment_text[j].text.translate(non_bmp_map),comment_time[j].text,'0'))
            
                else:
                    Comment.append((comment_text[j].text.translate(non_bmp_map),comment_time[j].text,comment_praise[j].text))
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
            index+=1
        else:
            # All comments are shown inline: read them relative to this post's
            # repeat_list element.
            temp=driver.find_elements_by_xpath('//div[@class="repeat_list"]')[i]
            comment_text=temp.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_text"]')
            comment_time=temp.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_from S_txt2"]')
            comment_praise=temp.find_elements_by_xpath('.//div[@class="list_box"]/div[@class="list_ul"]/div/div[@class="list_con"]/div[@class="WB_func clearfix"]/div[@class="WB_handle W_fr"]/ul[@class="clearfix"]/li/span[@class="line S_line1"]/a/span/em[2]')
            for _ in range(min(comment_num,len(comment_text))):
                if comment_praise[_].text=='赞':
                    Comment.append((comment_text[_].text.translate(non_bmp_map),comment_time[_].text,'0'))
                else:
                    Comment.append((comment_text[_].text.translate(non_bmp_map),comment_time[_].text,comment_praise[_].text))
                
        blog.append((time,text,transmit,forward,comment,praise,Comment))
    return blog
        
        

def _scrape_open_profile(intro_xpath):
    """Read one user profile from the window the driver is currently focused on.

    Args:
        intro_xpath: XPath of the element holding the user's introduction text.

    Returns:
        ``[[username, intro, url], counters, info]`` where ``counters`` maps
        each counter label to its displayed value and ``info`` is a list of
        ``(title, detail)`` pairs from the detail card.

    Raises:
        LookupError: The page does not show the three expected counter cells.
        selenium.common.exceptions.WebDriverException: A required element
            could not be located.
    """
    username=driver.find_element_by_xpath('//div[@class="pf_username"]/h1[@class="username"]').text
    intro=driver.find_element_by_xpath(intro_xpath).text
    url=driver.current_url

    cells=driver.find_elements_by_xpath('//td[@class="S_line1"]')
    if len(cells)<3:
        raise LookupError('profile counters not found')
    # Each cell reads "<value> <label>"; malformed cells are skipped.
    counters={}
    for cell in cells[:3]:
        parts=cell.text.split()
        if len(parts)>=2:
            counters[parts[1]]=parts[0]

    driver.find_element_by_xpath('//a[@class="WB_cardmore S_txt1 S_line1 clearfix"]').click()
    sleep(1)
    titles=driver.find_elements_by_xpath('//span[@class="pt_title S_txt2"]')
    details=driver.find_elements_by_xpath('//span[contains(@class,"pt_detail")]')
    # zip() stops at the shorter list, so a count mismatch cannot raise.
    info=[
        (title.text.translate(non_bmp_map),detail.text.translate(non_bmp_map))
        for title,detail in zip(titles,details)
    ]
    return [[username.translate(non_bmp_map),intro.translate(non_bmp_map),url],counters,info]


def _collect_profiles(limit,intro_xpath):
    """Open up to *limit* users from the list on screen and scrape each one.

    Every user link is clicked first, then the opened windows are processed
    from the last handle backwards. A profile that cannot be read is skipped,
    and its window is always closed so the next iteration sees the next one.
    """
    from selenium.common.exceptions import WebDriverException

    links=driver.find_elements_by_xpath('//div[@class="info_name W_fb W_f14"]/a[@class="S_txt1"]')[:limit]
    for link in links:
        link.click()

    profiles=[]
    for _ in links:
        handles=driver.window_handles
        if len(handles)<2:
            # Only the list window is left; never close it.
            break
        driver.switch_to.window(handles[-1])
        sleep(2)
        try:
            profiles.append(_scrape_open_profile(intro_xpath))
        except (LookupError,WebDriverException):
            # Best-effort scrape: an unreadable profile is left out of the result.
            continue
        finally:
            driver.close()
            driver.switch_to.window(driver.window_handles[0])
    return profiles


def social_info(focus_num,fans_num):
    """Scrape the users the open profile follows and the users following it.

    Requires the module-level ``driver`` to be showing a user's profile page.

    Args:
        focus_num: Number of followed users to read; capped at 10.
        fans_num: Number of fans to read; capped at 10.

    Returns:
        ``(total_focus_statu, total_fans_statu)``: two lists whose entries have
        the form ``[[username, intro, url], counters, info]``. Profiles that
        could not be read are omitted.
    """
    global driver
    # The first counter link opens the follow list.
    status=driver.find_elements_by_xpath('//a[@class="t_link S_txt1"]')
    driver.execute_script("arguments[0].click()",status[0])
    sleep(3)
    total_focus_statu=_collect_profiles(min(focus_num,10),'//div[@class="pf_intro"]')

    # Switch the list to fans and repeat with the fans-specific intro XPath.
    driver.find_element_by_xpath('//em[@class="W_ficon ficon_fans S_ficon"]').click()
    sleep(2)
    total_fans_statu=_collect_profiles(min(fans_num,10),'//div[contains(@class,"shadow")]/div[@class="pf_intro"]')

    return total_focus_statu,total_fans_statu

def _displayed_count(text,cap=10):
    """Convert a counter string scraped from the page into an int.

    Plain integers are returned as-is. Abbreviated values that still contain a
    digit (for example with a '万' suffix) return *cap*, which is enough
    because the scrapers never read more than 10 items. Text without any digit
    counts as 0.
    """
    text=str(text).strip()
    try:
        return int(text)
    except ValueError:
        return cap if any(ch.isdigit() for ch in text) else 0


def get_all_info(username):
    """Scrape profile, posts, followed users and fans for *username*.

    Args:
        username: Nickname passed to ``basic_info`` for the site search.

    Returns:
        ``(dic, info, blog, focus, fans)``: the two values from ``basic_info``,
        the list from ``blog_info`` and the two lists from ``social_info``.
    """
    dic,info=basic_info(username)
    # All three counters go through the same parser, so an abbreviated post
    # count no longer crashes int(); a missing counter is treated as 0.
    blog=blog_info(_displayed_count(dic.get('微博','0')))
    sleep(3)
    focus_num=_displayed_count(dic.get('关注','0'))
    fans_num=_displayed_count(dic.get('粉丝','0'))
    focus,fans=social_info(focus_num,fans_num)
    return dic,info,blog,focus,fans


if __name__ == '__main__':
    # Manual demo: log in, wait, then scrape one account and dump every part.
    mylogin('你的账号', '你的密码')
    # Presumably leaves time to finish any manual verification — TODO confirm.
    sleep(25)
    username = "明天想遛猪"

    dic, info, blog, focus, fans = get_all_info(username)
    for section in (dic, info, blog, focus, fans):
        print(section)

my_flask.py

from flask import Flask, redirect, url_for, request, render_template
from time import sleep
from My_selenium import *

# Flask application object; the route functions below register on it.
app = Flask(__name__)


@app.route('/')
def index():
    """Serve the login form template at the site root."""
    page = render_template("login.html")
    return page



@app.route('/show',methods = ['POST', 'GET'])
def show():
    """Log in to Weibo with the submitted credentials and render the scraped data.

    Expects the form fields ``username``, ``password`` and ``query``. A GET
    request, or a POST with any of the three fields missing or empty, is sent
    back to the login form instead of failing or starting the browser.
    """
    if request.method != 'POST':
        return redirect(url_for('index'))

    username = request.form.get('username', '').strip()
    password = request.form.get('password', '')
    query_name = request.form.get('query', '').strip()
    if not (username and password and query_name):
        return redirect(url_for('index'))

    mylogin(username, password)
    # Presumably leaves time to finish any manual verification — TODO confirm.
    sleep(25)
    dic, info, blog, focus, fans = get_all_info(query_name)

    return render_template('微博.html', dic=dic, info=info, blog=blog, focus=focus, fans=fans)

    


@app.route('/login',methods = ['POST', 'GET'])
def login():
    """Render the result page with no scraped data.

    The template iterates over ``dic``, ``info``, ``blog``, ``focus`` and
    ``fans``; empty containers are passed so it renders instead of failing on
    undefined variables.
    """
    return render_template('微博.html', dic={}, info=[], blog=[], focus=[], fans=[])


# Start the development server when this file is run directly.
if __name__ == '__main__':

    # NOTE(review): debug=True turns on Flask's debug mode; confirm this is
    # only ever used for local development.
    app.run(debug=True)

<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Document</title>
  <style>
    body {
      background: url('https://cdn.pixabay.com/photo/2018/08/14/13/23/ocean-3605547_1280.jpg') no-repeat;
      background-size: 100% 130%;
    }

    #login_box {
      width: 20%;
      height: 400px;
      background-color: #00000060;
      margin: auto;
      margin-top: 10%;
      text-align: center;
      border-radius: 10px;
      padding: 50px 50px;
    }

    h2 {
      color: #ffffff90;
      margin-top: 5%;
    }

    /* The markup uses id="input_box" and class="input_box" (underscore), so the
       original "#input-box" selector matched nothing. */
    #input_box,
    .input_box {
      margin-top: 5%;
    }

    span {
      color: #fff;
    }

    input {
      border: 0;
      width: 60%;
      font-size: 15px;
      color: #fff;
      background: transparent;
      border-bottom: 2px solid #fff;
      padding: 5px 10px;
      outline: none;
      margin-top: 10px;
    }

    button {
      margin-top: 50px;
      width: 60%;
      height: 30px;
      border-radius: 10px;
      border: 0;
      color: #fff;
      text-align: center;
      line-height: 30px;
      font-size: 15px;
      background-image: linear-gradient(to right, #30cfd0, #330867);
    }

    #sign_up {
      margin-top: 45%;
      margin-left: 60%;
    }

    a {
      color: #b94648;
    }
  </style>
</head>

<body>
	<form action="/show" method="post">
  <div id="login_box">
    <h2>LOGIN</h2>
    <div id="input_box">
      <input type="text" placeholder="请输入用户名" name="username" id="username">
    </div>
    <div class="input_box">
      <input type="password" placeholder="请输入密码" name="password" id="password">
    </div>
	<div class="input_box">
		<input type="text" placeholder="请输入查询ID" name="query" id="query">
	  </div>
    <button>登录</button><br>
  </div>
  </form>
</body>

</html>

微博.html

<!DOCTYPE html>
<html lang="zh-CN">
	<head>
		<meta charset="UTF-8">
		<title>微博爬取</title>
	</head>
	<body>
		{# Context: dic (label -> value), info [(title, detail)],
		   blog [(time, text, transmit, forward, comment, praise, comments)],
		   focus / fans [[[name, intro, url], counters, info]]. #}
		<h2 align="center">基本信息</h2>
		{%for i in dic.items()%}<br>
		<h5 align="left">{{i[0]}}：{{i[1]}}</h5><br>
		{%endfor%}
		{%for i in info%}<br>
		<h5 align="left">{{i[0]}}:{{i[1]}}</h5><br>
		{%endfor%}
		{%for i in blog%}<br>
		<h2 align="center">第{{loop.index}}篇博文：</h2><br>
		<h5 align="left">发帖时间：{{i[0]}}</h5><br>
		<h5 align="left">发帖内容：{{i[1]}}</h5><br>

		{%if i[2]==1%}
		<h5 align="left">是否转发：是</h5><br>
		{%else%}
		<h5 align="left">是否转发：否</h5><br>
		{%endif%}

		{# A bare label instead of a number means the count is zero. Each check
		   looks at its own field: i[3] forward, i[4] comment, i[5] praise. #}
		{%if i[3]=="转发"%}
		<h5 align="left">转发数：0</h5><br>
		{%else%}
		<h5 align="left">转发数：{{i[3]}}</h5><br>
		{%endif%}

		{%if i[4]=="评论"%}
		<h5 align="left">评论数：0</h5><br>
		{%else%}
		<h5 align="left">评论数：{{i[4]}}</h5><br>
		{%endif%}

		{%if i[5]=="赞"%}
		<h5 align="left">点赞数：0</h5><br>
		{%else%}
		<h5 align="left">点赞数：{{i[5]}}</h5><br>
		{%endif%}

		<h2 align="center">评论</h2>
		{%for j in i[6]%}
		<h3 align="center">第{{loop.index}}条评论</h3><br>
		<h5 align="left">评论内容：{{j[0]}}</h5><br>
		<h5 align="left">评论时间：{{j[1]}}</h5><br>
		<h5 align="left">点赞数：{{j[2]}}</h5><br>
		{%endfor%}

		{%endfor%}
		<h2 align="center">关注列表</h2><br>
		{%for i in focus%}
		<h3 align="center">第{{loop.index}}位关注者</h3><br>
		<h5 align="left">昵称：{{i[0][0]}}</h5><br>
		<h5 align="left">介绍：{{i[0][1]}}</h5><br>
		<h5 align="left">个性域名：{{i[0][2]}}</h5><br>
		{% for j in i[1].items()%}
		<h5 align="left">{{j[0]}}：{{j[1]}}</h5><br>
		{%endfor%}

		{% for j in i[2]%}
		<h5 align="left">{{j[0]}}：{{j[1]}}</h5><br>
		{%endfor%}

		{%endfor%}

		<h2 align="center">粉丝列表</h2><br>
		{%for i in fans%}
		<h3 align="center">第{{loop.index}}位粉丝</h3><br>
		<h5 align="left">昵称：{{i[0][0]}}</h5><br>
		<h5 align="left">介绍：{{i[0][1]}}</h5><br>
		<h5 align="left">个性域名：{{i[0][2]}}</h5><br>
		{% for j in i[1].items()%}
		<h5 align="left">{{j[0]}}：{{j[1]}}</h5><br>
		{%endfor%}

		{% for j in i[2]%}
		<h5 align="left">{{j[0]}}：{{j[1]}}</h5><br>
		{%endfor%}

		{%endfor%}
	</body>
</html>

Tips

可使用 `python my_flask.py` 命令启动 Flask 应用，然后在浏览器中访问 http://127.0.0.1:5000 （5000 是 Flask 的默认端口）。在登录界面输入微博账号、密码和需要爬取的用户ID（即用于搜索的昵称）即可进入程序，程序会将爬取到的数据直接在微博.html的页面上显示出来。

项目爬虫

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

php反序列化 Previous

爬虫项目(4)——微博动态信息获取 Next

爬虫项目——final