使用python+selenium爬取抖音数据

同学找我帮忙,想爬取抖音的视频信息和用户评论做分类和词频分析。感觉用不了太长时间,趁着周末做了一个小应用。用的是python和selenium自动化测试工具。考虑到抖音页面会更改布局或者class,本程序可能会失效,需要修改定位代码。

功能介绍

  • 输入视频地址和采集视频数,需要进行手动扫码登录,模拟浏览器运行自动采集视频
  • 可选择只采集点赞数10w以上的视频信息
  • 采集三次出现异常自动保存并关闭程序
  • 采集的信息包括:标题、时间、点赞数、评论数、收藏数、评论内容(评论以'|'进行分隔)
  • 存储为excel 文件

准备

运行环境和所需包:

  • python 3.9.7
  • 依赖包——selenium
  • 依赖包——pandas
  • chrome浏览器,最好是最新版本
  • webdriver(chrome)

代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Douyin (TikTok CN) video/comment scraper driven by Selenium.
# Flow: prompt for a start URL and settings, wait for a manual QR login,
# then step through videos with the DOWN key, scraping counters, title,
# date and comments for each, and finally dump everything to an .xlsx file.
# NOTE(review): the class names used as locators (CE7XkkTw, z8_VexPf, ...)
# are obfuscated and break whenever Douyin redeploys — expect to update them.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import numpy as np
import pandas as pd
import traceback
import time

caps = DesiredCapabilities.CHROME
chrome_options = Options()
chrome_options.add_argument("--ignore-certificate-errors")
# Hide the "controlled by automated software" banner and suppress driver log spam.
chrome_options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
# Capture performance (network) logs — handy for inspecting XHR traffic.
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

# Path to the ChromeDriver binary; Selenium launches this executable.
# driver_path = 'E:\webdriver\chromedriver.exe'
driver_path = 'chromedriver.exe'
driver = webdriver.Chrome(options=chrome_options, executable_path=driver_path, desired_capabilities=caps)
driver.maximize_window()

url1 = input("请输入视频地址:\n")
# ex: https://www.douyin.com/video/7074795685694016775?modeFrom=userPost&secUid=MS4wLjABAAAA8U_l6rBzmy7bcy6xOJel4v0RzoR_wfAubGPeJimN__4
num = int(input("请输入要采集的视频个数:\n"))
# Half of the buffer waits for the page, the other half for comments to load.
t = int(input("请输入每个视频的缓冲时间,默认:10s,抓评论5s,缓冲5s\n") or 10) // 2
mode = int(input("是否只采集点赞10w以上的数据?1 or 0,默认:1\n") or 1)
print("等待浏览器加载完成。")
driver.get(url1)
# Give the user 30 seconds to finish the QR-code login by hand.
print("等待30s,请手动完成登录操作···")
for s_time in range(30, 0, -1):
    print(f"剩余时间{s_time}s")
    time.sleep(1)
print("采集开始,请勿操作浏览器并保持网络畅通!")
videos = []   # one dict per scraped video
i = 1         # 1-based progress counter for log messages
errors = 0    # consecutive failures; abort after 3 in a row

# The /follow feed uses different locators and needs the comment button
# clicked first; that variant was removed here — see the blog text.

# Main loop: one iteration per video.
while num > 0:
    if errors >= 3:
        print("连续出现错误,采集结束")
        break
    print(f"正在采集第【{i}】个视频")
    # Let the page render before locating elements.
    time.sleep(t)
    video = dict()
    try:
        # Like / comment / collect counters on the author-page player.
        # find_elements() returns a list and never raises
        # NoSuchElementException — a missing counter shows up as IndexError.
        try:
            nums = driver.find_elements_by_class_name("CE7XkkTw")
            video['like_num'] = nums[0].text
            video['comment_num'] = nums[1].text
            video['collect_num'] = nums[2].text
        except (NoSuchElementException, IndexError):
            # String sentinels keep the "'w' in like_num" check type-safe.
            video['like_num'] = '-1'
            video['comment_num'] = '-1'
            video['collect_num'] = '-1'
        # Optionally skip videos below 100k ("10w") likes.
        if mode == 1:
            skip = True
            if 'w' in video['like_num']:
                # e.g. "12.3w" -> 12.3 ; >= 10 means >= 100k likes.
                if float(video['like_num'][:-1]) >= 10:
                    skip = False
            if skip:
                print("视频点赞不足10w,跳过!")
                ActionChains(driver).send_keys(Keys.DOWN).perform()
                num -= 1
                i += 1
                continue
        # Scroll to the bottom to trigger lazy-loading of the comment list.
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(t)

        # Video title.
        try:
            title = driver.find_element_by_css_selector(".z8_VexPf span span span").text
        except NoSuchElementException:
            title = "定位失败"
        video['title'] = title

        # Publish date; the first 5 characters are a label prefix, dropped.
        try:
            _datetime = driver.find_element_by_css_selector(".aQoncqRg").text[5:]
        except NoSuchElementException:
            _datetime = "定位失败"
        video['datetime'] = _datetime

        # Collect unique, non-empty comment texts.
        content = driver.find_elements_by_css_selector("span.VD5Aa1A1")
        comments = []
        for div in content:
            try:
                text = div.find_element_by_css_selector("span>span>span>span>span")
            except NoSuchElementException:
                # Skip this node rather than reuse the previous iteration's
                # `text` (which would duplicate, or NameError on the first hit).
                continue
            if not (text.text == '' or text.text in comments):
                comments.append(text.text)
        # DOWN key switches the player to the next video.
        ActionChains(driver).send_keys(Keys.DOWN).perform()
        # '|'-joined comments fit in a single Excel cell.
        video['comments'] = '|'.join(comments)
        videos.append(video)
        # Success: reset the consecutive-error counter so only a streak
        # of 3 failures (as the abort message says) stops the run.
        errors = 0
    # Long runs occasionally hit "no such window / window was closed".
    except Exception:
        print(traceback.format_exc())
        errors += 1
        print(f"出现异常,跳过第{i}条采集")
    num -= 1
    i += 1
print("采集完成,正在写入文件.")
data = pd.DataFrame(videos)
file_name = time.strftime('%Y-%m-%d %H_%M_%S') + '.xlsx'
data.to_excel(file_name, index=False)
driver.quit()
print("写入文件完成,按Ctrl+C退出")

成品

proxy.exe

无需环境,需要将其与chromedriver.exe放在同一文件夹下

其他的尝试

本来想用selenium+ browsermob-proxy。使用代理来抓取xhr请求,但是抖音的评论数据解析出来没有text字段,其他的请求能收到text,就很奇怪。网上查了一些也没查出来个所以然,遂放弃。

1
2
3
4
5
6
7
8
9
10
11
import json

# Walk the HAR entries captured via browsermob-proxy and dump every
# response from the Douyin comment-list API endpoint.
# NOTE: `result` is the HAR dict returned by proxy.har (defined elsewhere).
for entry in result['log']['entries']:
    _url = entry['request']['url']
    # print(entry['request']['method'], entry['request']['url'])
    # Match only the comment-list data endpoint.
    if "/aweme/v1/web/comment/list" in _url:
        _response = entry['response']
        _content = _response['content']
        # Dump the whole response record; its 'content' lacks the
        # expected 'text' body field (see the discussion below).
        print(json.dumps(_response, indent=4, ensure_ascii=False))
        # print(_content)

返回内容:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
{
"status": 200,
"statusText": "OK",
"httpVersion": "HTTP/1.1",
"cookies": [],
"headers": [
{
"name": "Server",
"value": "Tengine"
},
···
],
"content": {
"size": 8233,
"mimeType": "application/json; charset=utf-8",
"comment": ""
//此处应该有text字段
},
"redirectURL": "",
"headersSize": 975,
"bodySize": 8233,
"comment": ""

具体实现可参考https://blog.csdn.net/thmail/article/details/101029519

只找到一例相同问题https://stackoverflow.com/questions/56048551/selenium-browsermob-proxy-cannot-capture-bodies-of-some-https-responses

评论