Python Learning Journey: A Sina Weibo Comment Scraper

Scrape all comments from a specified Weibo post. The script resolves the post's numeric id and the author's uid from the post URL, pages through the buildComments API using its max_id cursor, and appends each batch of comments to a CSV file.

import csv
import json
import random
import time
import urllib.request

class Data:
    # Shared scraper state: post id, author uid, pagination cursor, counters
    code = 'utf-8'   # response encoding
    id = 0           # numeric id (mid) of the post
    uid = 0          # uid of the post's author
    max_id = 0       # pagination cursor returned by the comment API
    url = ''         # current API request URL
    total = 0        # total comment count reported by the API
    retry = 0        # retry counter for the current request
    count = 0        # comments fetched so far
    filename = ''    # output CSV filename (without extension)


over = Data()


def http_r(url, c=over.code):
    # Set a browser User-Agent so the request is not rejected outright
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return urllib.request.urlopen(request).read().decode(c)
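
http_r is just a thin wrapper over urllib.request, so it can be sanity-checked on its own. A minimal check might look like the sketch below, using the sample post URL that appears in the comments further down; note that Weibo may serve different markup to clients without login cookies, so treat this as illustrative rather than guaranteed:

html = http_r('https://m.weibo.cn/status/4639954594955978')
# The mobile page embeds its data as "var $render_data = [...]",
# which setData() below relies on
print('$render_data' in html)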


def setData(url):
    # url = 'https://m.weibo.cn/status/4639954594955978'
    # url = 'https://weibo.com/2803301701/KgLoTkNh8'

    over.count = 0
    over.total = 0

    # Normalize the URL: replace the scheme with a 'w.' prefix, strip any
    # #fragment and ?query parts, then split into path segments
    u = url.replace('https://', 'w.') \
        .replace('http://', 'w.') \
        .replace(url[url.find('#'): 0 if url.find('#') == -1 else None], '') \
        .replace(url[url.find('?'): 0 if url.find('?') == -1 else None], '') \
        .split('/')

    if u[0].find('m.weibo.') != -1:
        # Mobile page: the post JSON is embedded in the HTML as
        # "var $render_data = [ ... ][0] || {};"
        s = 'var $render_data = ['
        e = '][0] || {};'
        back = http_r(url)
        try:
            back = json.loads(back[back.find(s) + len(s):back.find(e)])
        except Exception:
            # One retry: the page occasionally comes back without the data block
            back = http_r(url)
            try:
                back = json.loads(back[back.find(s) + len(s):back.find(e)])
            except Exception:
                return False

        over.uid = back['status']['user']['id']
        over.id = u[2]
        newCsv(back['status']['status_title'], back['status']['user']['screen_name'])
        return True
    elif u[0].find('w.weibo.') != -1:
        # Desktop URL: the path id is alphanumeric, so resolve the numeric
        # post id via the statuses/show endpoint
        back = json.loads(http_r('https://weibo.com/ajax/statuses/show?id=' + u[2]))
        over.uid = u[1]
        over.id = back['id']
        newCsv(back['text_raw'], back['user']['screen_name'])
        return True
    return False
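
To make the normalization concrete, here is what the replace/split chain produces for the two sample URLs from the comments above (segments worked out by hand: host, uid or 'status', then the post id):

# 'https://weibo.com/2803301701/KgLoTkNh8'
#   -> ['w.weibo.com', '2803301701', 'KgLoTkNh8']        # host, uid, alphanumeric id
# 'https://m.weibo.cn/status/4639954594955978'
#   -> ['w.m.weibo.cn', 'status', '4639954594955978']    # host, 'status', numeric id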


def setUrl():
    # Build the buildComments API URL; pages after the first carry the
    # max_id cursor returned by the previous response
    if over.max_id > 0:
        over.url = 'https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=' \
                   + str(over.id) + '&is_show_bulletin=2&is_mix=0&max_id=' + str(over.max_id) \
                   + '&count=20&uid=' + str(over.uid)
    elif over.max_id == 0:
        over.url = 'https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=' \
                   + str(over.id) + '&is_show_bulletin=2&is_mix=0&count=20&uid=' + str(over.uid)


def getComment():
    # First page:
    # https://weibo.com/ajax/statuses/buildComments?is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&count=20&uid=2803301701
    # Subsequent pages (with max_id cursor):
    # https://weibo.com/ajax/statuses/buildComments?flow=0&is_reload=1&id=4639954594955978&is_show_bulletin=2&is_mix=0&max_id=273267091222258&count=20&uid=2803301701

    setUrl()
    print('    Fetching ', end='')
    try:
        back = json.loads(http_r(over.url))
    except Exception:
        back = {}

    over.retry = 0
    while ('data' not in back or len(back['data']) == 0) and over.retry < 10:
        print('-', end='')
        over.retry += 1
        time.sleep(random.uniform(1, 2.5))
        try:
            back = json.loads(http_r(over.url))
        except Exception:
            back = {}
        if 'max_id' in back:
            over.max_id = back['max_id']
        setUrl()

    if over.retry == 10:
        print('\n    This fetch failed')
        return False
    else:
        # Advance the pagination cursor; without this, a first-try success
        # would leave max_id unchanged and re-fetch the same page forever
        if 'max_id' in back:
            over.max_id = back['max_id']
        s = len(back['data'])
        over.count += s
        print('\n    Fetched ' + str(s) + ' new comments this round')

    if over.total == 0:
        over.total = back['total_number']

    comments = {'name': [], 'comment': []}
    for c in back['data']:
        comments['name'].append(c['user']['name'])
        # Strip CRs and escape newlines/tabs so each comment stays on one CSV row
        comments['comment'].append(c['text_raw'].replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'))
    writeOut(comments)
    return True
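
For reference, getComment only reads a handful of fields from the JSON it receives. Based on those accesses, the response is shaped roughly like this (values are illustrative, and fields the script does not read are omitted):

back = {
    'total_number': 137,            # total comments on the post
    'max_id': 273267091222258,      # cursor for the next page; 0 on the last page
    'data': [                       # up to 20 comments per page
        {'user': {'name': 'some_user'}, 'text_raw': 'first comment ...'},
    ],
}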


def writeOut(info):  # Append the parsed rows to the CSV created by newCsv()
    # newline='' keeps the csv module from emitting blank lines on Windows
    with open(over.filename + '.csv', 'a', encoding='utf-8', newline='') as file_obj:
        f_csv = csv.writer(file_obj)
        for i in range(len(info['name'])):
            f_csv.writerow([info['name'][i], info['comment'][i]])


def newCsv(title, author):
    # Name the file after the 【...】 topic in the post title, plus a timestamp
    over.filename = title[title.find('【') + 1:title.find('】')].replace('#', '') + '-' + str(time.time())
    with open(over.filename + '.csv', 'w', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow([title.replace('\r', '').replace('\n', '\\n').replace('\t', '\\t'), author])
        f_csv.writerow(['Commenter', 'Comment'])
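
Together, newCsv and writeOut produce a CSV whose first row holds the post title and author, followed by a header row and one row per comment. A resulting file looks roughly like this (contents illustrative):

【topic】post text ...,author_screen_name
Commenter,Comment
some_user,first comment ...
another_user,second comment ...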


def start(urls, num=-1):  # urls: post links; num: comments to fetch (-1 = all)
    for url in urls:
        if not setData(url):
            continue
        print('0/' + str(num))
        # for i in range(int(num / 20)):
        while over.count < num or num == -1:
            if not getComment():
                continue
            print(str(over.count) + '/' + str(num if num > 0 else over.total))
            if over.total <= over.count:
                print('All comments for this post fetched')
                break
            time.sleep(random.uniform(1, 2.5))

url_list = [
    'https://weibo.com/1893892941/KlDpMDEnh',
    'https://m.weibo.cn/status/4639954594955978'
]

start(url_list)
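
With the default num=-1, the progress line switches to over.total once the first response reveals it, so a run's console output looks roughly like this (counts illustrative):

0/-1
    Fetching 
    Fetched 20 new comments this round
20/137
    Fetching -
    Fetched 20 new comments this round
40/137
...
137/137
All comments for this post fetched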
