Crawling all comments from a specified Weibo post
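The script below grabs every comment under one or more Weibo posts, accepting both desktop (weibo.com) and mobile (m.weibo.cn) links, and writes the results to a CSV file. It starts with the imports, a small class holding shared crawl state, and a request helper that sends a browser User-Agent: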
```python
import csv
import json
import random
import time
import urllib.request


class Data:
    """Shared crawl state."""
    code = 'utf-8'   # response encoding
    id = 0           # post id
    uid = 0          # author uid
    max_id = 0       # pagination cursor returned by the comment API
    url = ''         # current request URL
    total = 0        # total comment count reported by the API
    retry = 0        # retry counter for the current page
    count = 0        # comments fetched so far
    filename = ''    # output CSV name (without extension)


over = Data()


def http_r(url, c=over.code):
    # Send the request with a browser User-Agent header
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(request).read().decode(c)
```
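setData() normalizes the link, extracts the post id and author uid (via the $render_data JSON embedded in mobile pages, or the ajax/statuses/show endpoint for desktop links), and creates the output CSV: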
```python
def setData(url):
    # url = 'https://m.weibo.cn/status/4639954594955978'
    # url = 'https://weibo.com/2803301701/KgLoTkNh8'
    over.count = 0
    over.total = 0
    # Normalize the link: mark the scheme, strip any #fragment and ?query, then split on '/'
    u = url.replace('https://', 'w.').replace('http://', 'w.')
    u = u.split('#')[0].split('?')[0].split('/')
    if u[0].find('m.weibo.') != -1:
        # Mobile link: the post JSON is embedded in the page as $render_data
        s = 'var $render_data = ['
        e = '][0] || {};'
        back = http_r(url)
        try:
            back = json.loads(back[back.find(s) + len(s):back.find(e)])
        except Exception:
            # Retry once; the render data is occasionally missing from the first response
            back = http_r(url)
            try:
                back = json.loads(back[back.find(s) + len(s):back.find(e)])
            except Exception:
                return False
        over.uid = back['status']['user']['id']
        over.id = u[2]
        newCsv(back['status']['status_title'], back['status']['user']['screen_name'])
        return True
    elif u[0].find('w.weibo.') != -1:
        # Desktop link: fetch the post metadata from the show endpoint
        back = json.loads(http_r('https://weibo.com/ajax/statuses/show?id=' + u[2]))
        over.uid = u[1]
        over.id = back['id']
        newCsv(back['text_raw'], back['user']['screen_name'])
        return True
    return False
```
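setUrl() assembles the buildComments API URL. The first page carries no cursor; every later page passes the max_id returned by the previous response: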
```python
def setUrl():
    if over.max_id > 0:
        # Later pages pass the max_id cursor from the previous response
        over.url = ('https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id='
                    + str(over.id) + '&is_show_bulletin=2&is_mix=0&max_id=' + str(over.max_id)
                    + '&count=20&uid=' + str(over.uid))
    elif over.max_id == 0:
        # First page: no cursor yet
        over.url = ('https://weibo.com/ajax/statuses/buildComments?is_reload=1&id='
                    + str(over.id) + '&is_show_bulletin=2&is_mix=0&count=20&uid=' + str(over.uid))
```
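getComment() fetches one page of comments, retrying up to ten times with a random delay when no data comes back, advances the max_id cursor, and hands the parsed rows to writeOut():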
```python
def getComment():
    # https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&count=20&uid=2803301701
    # https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&max_id=273267091222258&count=20&uid=2803301701
    setUrl()
    print(' Fetching ', end='')
    try:
        back = json.loads(http_r(over.url))
    except Exception:
        back = {}
    over.retry = 0
    # Retry up to 10 times while the response carries no comment data
    while ('data' not in back or len(back['data']) == 0) and over.retry < 10:
        print('-', end='')
        over.retry += 1
        time.sleep(random.uniform(1, 2.5))
        try:
            back = json.loads(http_r(over.url))
        except Exception:
            back = {}
    # Advance the pagination cursor for the next request
    if 'max_id' in back:
        over.max_id = back['max_id']
        setUrl()
    if 'data' not in back or len(back['data']) == 0:
        print('\n Fetch failed this time')
        return False
    s = len(back['data'])
    over.count += s
    print('\n Got ' + str(s) + ' new comments this time')
    if over.total == 0:
        over.total = back['total_number']
    comments = {'name': [], 'comment': []}
    for c in back['data']:
        comments['name'].append(c['user']['name'])
        # Flatten line breaks and tabs so each comment stays on one CSV row
        comments['comment'].append(c['text_raw'].replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'))
    writeOut(comments)
    return True
```
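writeOut() appends rows to the CSV; newCsv() creates it, naming the file after the text between 【 and 】 in the post title plus a timestamp: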
```python
def writeOut(info):  # Append the parsed rows to the output CSV
    # newline='' avoids the blank lines the csv module otherwise produces on Windows
    with open(over.filename + '.csv', 'a', encoding='utf-8', newline='') as file_obj:
        f_csv = csv.writer(file_obj)
        for i in range(len(info['name'])):
            f_csv.writerow([info['name'][i], info['comment'][i]])


def newCsv(title, author):
    # Name the file after the text between 【 and 】 in the title, plus a timestamp
    over.filename = title[title.find('【') + 1:title.find('】')].replace('#', '') + '-' + str(time.time())
    with open(over.filename + '.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow([title.replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'), author])
        f_csv.writerow(['commenter', 'comment'])
```
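start() drives the whole crawl. num caps how many comments to fetch per post; -1 means all of them: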
```python
def start(urls, num=-1):  # urls: post links; num: comments to fetch per post (-1 = all)
    for url in urls:
        if not setData(url):
            continue
        print('0/' + str(num))
        # for i in range(int(num / 20)):
        while over.count < num or num == -1:
            if not getComment():
                continue
            print(str(over.count) + '/' + str(num if num > 0 else over.total))
            if over.total <= over.count:
                print('This post is fully crawled')
                break
            time.sleep(random.uniform(1, 2.5))
```
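Finally, list the posts to crawl and kick things off: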
```python
url_list = [
    'https://weibo.com/1893892941/KlDpMDEnh',
    'https://m.weibo.cn/status/4639954594955978'
]
start(url_list)
```
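For reference, getComment() only touches a handful of fields in the buildComments response. Here is a minimal sketch of the shape the script relies on; the field names come from the parsing code above, while the values are made-up placeholders (the real payload carries many more fields):

```python
# Minimal sketch of the buildComments JSON this script relies on.
# Field names are taken from the parsing code above; values are placeholders.
example_response = {
    'total_number': 1234,        # total comments on the post (read once, into over.total)
    'max_id': 273267091222258,   # cursor for the next page; 0 means no more pages
    'data': [                    # up to count=20 comments per page
        {
            'user': {'name': 'some_user'},
            'text_raw': 'comment text',
        },
    ],
}
```

By default, start(url_list) keeps paging until over.count reaches the total reported by the API; start(url_list, 100) would stop once roughly 100 comments per post have been saved (pages arrive 20 at a time, so the count can slightly overshoot).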