selenium爬虫项目（3）——微博社交信息获取

Introduction

最近事好像又挺多，也不知道在忙啥。本来计划周末写完这个大作业的，然而昨天被抓去校运会拍了一天照，晚上回来开始写，基本上把信息获取部分都给完成了，剩下的只有信息的可视化了，这一篇先介绍微博社交信息的部分，下一篇写动态信息的获取。

step1 获取关注列表和粉丝列表前十个的信息

Code:

from selenium import webdriver
import time
import sys

non_bmp_map=dict.fromkeys(range(0x10000,sys.maxunicode+1),0xfffd)
if __name__ == '__main__':

    chrome_driver='C:\\Users\\lenovo\\Anaconda3\\Lib\\site-packages\\chromedriver.exe'
    driver = webdriver.Chrome(executable_path = chrome_driver)
    driver.get('https://www.baidu.com')
    try:
        driver.find_element_by_xpath('//*[@id="kw"]').click()
        driver.find_element_by_xpath('//*[@id="kw"]').send_keys('微博')
        time.sleep(3)
        driver.find_element_by_xpath('//*[@id="su"]').click()
        time.sleep(2)
        
        #driver.find_element_by_xpath('//*[@id="1"]/div/div[1]/h3/a').click()
        driver.find_element_by_xpath('//*[@id="2"]/div/div[1]/h3/a').click()
        time.sleep(10)

        handles=driver.window_handles
        for handle in handles:
            if handle!=driver.current_window_handle:
                driver.close()
                driver.switch_to.window(handle)
        driver.execute_script("window.scrollBy(0,3000)")
        time.sleep(5)
        driver.find_element_by_xpath('//*[@id="app"]/div[1]/div[1]/div[2]/div[1]/div/div/div[3]/div[1]/div/a[1]').click()
        time.sleep(5)
        
        driver.find_element_by_xpath('//*[@id="app"]/div[4]/div[1]/div/div[2]/div/div/div[5]/a[1]').click()

        handles=driver.window_handles
        for handle in handles:
            if handle!=driver.current_window_handle:
                driver.close()
                driver.switch_to.window(handle)
        time.sleep(10)
        
        
        driver.find_element_by_id("loginname").click()
        driver.find_element_by_id("loginname").send_keys("你的账号")
        driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[2]/div/span').click()
        driver.find_element_by_xpath('//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input').send_keys('你的密码')
        time.sleep(5)
        driver.find_element_by_class_name('W_btn_a').click()
        time.sleep(5)
        
        driver.find_element_by_xpath('//*[@id="dmCheck"]').click()
        time.sleep(1)
        driver.find_element_by_id('send_dm_btn').click()
        time.sleep(20)
        
        driver.find_element_by_xpath('//*[@id="plc_top"]/div/div/div[2]/input').click()
        time.sleep(20)
        driver.find_element_by_xpath('//*[@id="plc_top"]/div/div/div[2]/input').send_keys("等风")
        time.sleep(10)
        driver.find_element_by_xpath('//*[@id="plc_top"]/div/div/div[2]/a').click()
        time.sleep(2)
        driver.find_element_by_xpath('/html/body/div[1]/div[2]/ul/li[2]/a').click()
        #driver.find_element_by_xpath('//*[@id="pl_user_feedList"]/div[1]/div[2]/div/a[1]').click()
        driver.find_element_by_xpath('//*[@id="pl_user_feedList"]/div[2]/div[2]/div/a[1]').click()
        time.sleep(2)
        
        handles=driver.window_handles
        for handle in handles:
            if handle!=driver.current_window_handle:
                driver.close()
                driver.switch_to.window(handle)
        time.sleep(10)
        driver.find_element_by_xpath('//*[@id="Pl_Core_UserInfo__6"]/div[2]/div[1]/div/a/span').click()
        time.sleep(5)
        handles=driver.window_handles
        for handle in handles:
            if handle!=driver.current_window_handle:
                driver.close()
                driver.switch_to.window(handle)
        time.sleep(10)
        source=driver.page_source
        with open("information1.txt","w",encoding='utf-8') as f:
            f.write(str(source).translate(non_bmp_map))
        
        driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__50"]/div/div/div/table/tbody/tr/td[1]/a/strong').click() #点开关注列表
        time.sleep(3)
        for i in range(1,11): #打开十个关注人的窗口
            driver.find_element_by_xpath(f'//*[@id="Pl_Official_HisRelation__56"]/div/div/div/div[2]/div[1]/ul/li[{i}]/dl/dd[1]/div[1]/a[1]').click()
            time.sleep(1)
        guanzhu=driver.window_handles[1:]
        
        for i in range(len(guanzhu)):
            time.sleep(1)
            driver.switch_to.window(guanzhu[len(guanzhu)-1-i])
            try:
                time.sleep(3)
                try:   #处理不同的格式
                    guanzhu1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[1]/strong')  #关注数
                    print(guanzhu1.text)
                    fans1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[2]/strong') #粉丝数
                    print(fans1.text)
                    blogs=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[3]/strong')  #博文数
                    print(blogs.text)
                except:
                    guanzhu1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[1]/a/strong')
                    print(guanzhu1.text)
                    fans1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[2]/a/strong')
                    print(fans1.text)
                    blogs=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[3]/a/strong')
                    print(blogs.text)
                driver.find_element_by_xpath('//*[@id="Pl_Core_UserInfo__6"]/div[2]/div[1]/div/a').click()
                time.sleep(3)
                source=driver.page_source
                with open(f"关注_{i}.txt","w",encoding='utf-8') as f:    #个人信息写入文件
                    f.write(str(source).translate(non_bmp_map))
                time.sleep(1)
                driver.close()
            except:
                print("no info")
                driver.close()

            time.sleep(2)
        driver.switch_to.window(driver.window_handles[0])
        driver.find_element_by_xpath('//*[@id="Pl_Official_HisRelationNav__55"]/div/div[2]/div[1]/div/div/div/div/ul/li[2]/a').click()  #打开粉丝列表
        time.sleep(5)
        
        
        for i in range(1,11):  #点开十个粉丝的窗口
            driver.find_element_by_xpath(f'//*[@id="Pl_Official_HisRelation__56"]/div/div/div/div[2]/div[1]/ul/li[{i}]/dl/dd[1]/div[1]/a[1]').click()
            time.sleep(1)
        fans=driver.window_handles[1:]
        for i in range(len(fans)):
            time.sleep(1)
            driver.switch_to.window(fans[len(fans)-1-i])
            try:   #处理不同的格式
                time.sleep(3)
                guanzhu1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[1]/a/strong')  #关注数
                print(guanzhu1.text)
                fans1=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[2]/a/strong')  #粉丝数
                print(fans1.text)
                blogs=driver.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[3]/a/strong') #博文数
                print(blogs.text)
                driver.find_element_by_xpath('//*[@id="Pl_Core_UserInfo__6"]/div[2]/div[1]/div/a').click()
                time.sleep(3)
                source=driver.page_source
                with open(f"粉丝_{i}.txt","w",encoding='utf-8') as f:  #个人信息写入文件
                    f.write(str(source).translate(non_bmp_map))
                time.sleep(1)
                driver.close()
            except:
                print("no info")
                driver.close()

            time.sleep(2)
        
    finally:
        time.sleep(30)
        driver.quit()

Explanation

这一步总体来说需要注意很多地方，首先是窗口句柄的转换，需要进入不同用户的主页进行信息采集，这一步需要将当前窗口句柄设置好；其次对于不同类型的用户，官方账号和普通用户，它们主页的格式有所不同，需要同时对这两种类型的用户考虑，此外还有关注人数、粉丝数是否大于10的判断，这里我们为了简便，直接采用try-except语句处理。

End

由于时间原因，而且上面的过程基本上也都是体力活（界面的跳转和格式的混乱确实烦人），所以不作过多解释，将代码调通，能获取到关注列表和粉丝列表的关注数、粉丝数、博文数和个人信息即可。
下面是获取到的关注数、粉丝数、博文数（个人信息已写入文件）：

项目爬虫

本博客所有文章除特别声明外，均采用 CC BY-SA 4.0 协议，转载请注明出处！

爬虫项目(4)——微博动态信息获取 Previous

Javascript初步 Next

爬虫项目(3)——微博社交信息获取

selenium爬虫项目（3）——微博社交信息获取

Introduction

step1 获取关注列表和粉丝列表前十个的信息

Code:

Explanation

End