Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 5 additions & 12 deletions weibo_spider/parser/comment_parser.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,26 @@
import logging
import random
import requests
from time import sleep

from .parser import Parser
from .util import handle_garbled, handle_html
from .util import handle_html, get_long_weibo_detail

logger = logging.getLogger('spider.comment_parser')


class CommentParser(Parser):
def __init__(self, cookie, weibo_id):
    """Remember the cookie and weibo id, then load the comment page.

    Args:
        cookie: Cookie value handed to ``handle_html`` together with the URL.
        weibo_id: Id string appended to ``https://weibo.cn/comment/`` to
            build the page URL.
    """
    self.cookie, self.weibo_id = cookie, weibo_id
    comment_url = 'https://weibo.cn/comment/' + weibo_id
    self.url = comment_url
    # Load the comment page once up front and keep the result on the instance.
    self.selector = handle_html(cookie, comment_url)

def get_long_weibo(self):
"""Fetch the full text of a long original weibo.

NOTE(review): this text was captured from a rendered diff view; removed and
added lines appear together without +/- markers and the indentation is gone,
so the nesting of the statements below must be confirmed against the real
file before relying on these notes.
"""
try:
# Up to five attempts.
for i in range(5):
# Fetch the page at self.url again; the result is checked against None
# and then queried with .xpath() below.
self.selector = handle_html(self.cookie, self.url)
if self.selector is not None:
# Second <div class="c"> element in the page.
info = self.selector.xpath("//div[@class='c']")[1]
wb_content = handle_garbled(info)
# NOTE(review): the leading // presumably searches the whole document, not
# just `info` (lxml-style XPath assumed) -- confirm this is intended.
wb_time = info.xpath("//span[@class='ct']/text()")[0]
# Keep what lies after the first ':' and before the last occurrence of
# the timestamp text.
weibo_content = wb_content[wb_content.find(':') +
1:wb_content.rfind(wb_time)]
if weibo_content is not None:
return weibo_content
# Obtain the text from get_long_weibo_detail using the stored weibo id.
weibo_content = get_long_weibo_detail(self.cookie, self.weibo_id)
if weibo_content is not None:
return weibo_content
# Pause for a random 6-10 seconds (both ends inclusive).
sleep(random.randint(6, 10))
except Exception:
# Any exception is logged with its traceback and not re-raised; the
# message text means "network error".
logger.exception(u'网络出错')
Expand Down
14 changes: 14 additions & 0 deletions weibo_spider/parser/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import hashlib
import json
import logging
import re
import sys

import requests
Expand Down Expand Up @@ -118,3 +119,16 @@ def string_to_int(string):
elif string.endswith(u'亿'):
string = float(string[:-1]) * 100000000
return int(string)


def get_long_weibo_detail(cookie, id):
    """Fetch the full text of a long weibo from the m.weibo.cn status endpoint.

    Args:
        cookie: Value sent as the ``Cookie`` request header.
        id: Weibo id as a string; it is appended to the request URL. The name
            shadows the builtin but is kept so keyword callers keep working.

    Returns:
        The ``data.text`` field of the JSON response with ``<br />`` replaced
        by newlines and every remaining tag removed, or ``None`` when the
        response status is not 200 or any error occurs. Errors are logged,
        never raised to the caller.
    """
    try:
        user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
        # HTTP header names use hyphens: with the key 'User_Agent' the browser
        # string was sent under an unrelated header and never acted as the UA.
        headers = {'User-Agent': user_agent, 'Cookie': cookie}
        # requests has no default timeout; without one a stalled connection
        # would block the caller forever. A timeout raises and is logged below.
        resp = requests.get("https://m.weibo.cn/statuses/show?id=" + id,
                            headers=headers,
                            timeout=10)
        if resp.status_code == 200:
            content = resp.json()['data']['text'].replace("<br />", "\n")
            # Drop whatever markup is left after line breaks were converted.
            return re.sub(r"</?[^>]+>", "", content)
    except Exception as e:
        logger.exception(e)
    return None