Skip to content

Commit b393732

Browse files
authored
Create douban.py
1 parent 4a408d1 commit b393732

File tree

1 file changed

+137
-0
lines changed

1 file changed

+137
-0
lines changed

moumoubaimifan/douban/douban.py

Lines changed: 137 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,137 @@
1+
import json
2+
import requests
3+
import time
4+
import random
5+
import cpca
6+
import jieba
7+
from pyecharts import options as opts
8+
from pyecharts.charts import WordCloud
9+
from pyecharts.charts import Geo
10+
from pyecharts.globals import ChartType
11+
from collections import Counter
12+
13+
# Province name -> number of posts attributed to that province.
# Filled by readfile() and plotted by relitu().
addr_dic = {}
14+
# Every jieba word segment cut from the post texts.
# Filled by readfile() and counted by ciyun().
text_list = []
15+
16+
def main():
    """Download the hot posts of Douban gallery topic 18306 into douban.txt.

    Requests pages of 20 items at offsets 20, 40, ..., 680 and appends the
    ``target.status`` object of every item to ``douban.txt`` as one JSON
    document per line (``null`` when an item carries no target or status).
    Sleeps a random 1-10 seconds after every page.

    Raises:
        requests.RequestException: on a network failure or timeout.
        json.JSONDecodeError: if a response body is not valid JSON.
    """
    url_basic = 'https://m.douban.com/rexxar/api/v2/gallery/topic/18306/items?from_web=1&sort=hot&start={}&count=20&status_full_text=1&guest_only=0&ck=GStY'
    # NOTE(review): the Cookie below is a captured login session committed to
    # source control. It should be rotated and loaded from configuration.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'bid=n7vzKfXLoUA; douban-fav-remind=1; ll="108296"; __utmc=30149280; __utmz=30149280.1624276858.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; gr_user_id=ca8b9156-1926-4c82-9dda-27fc7f7ad51b; __utma=30149280.66080894.1623848440.1624276858.1624282580.3; __utmt=1; dbcl2="157316158:e4ojS8paSUc"; ck=GStY; push_doumail_num=0; __utmv=30149280.15731; frodotk="a187943e3a17e8bbe496bcbaae47ba31"; push_noty_num=0; __utmb=30149280.11.10.1624282580',
        'Host': 'm.douban.com',
        'Origin': 'https://www.douban.com',
        'Referer': 'https://www.douban.com/gallery/topic/18306/',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    }

    # NOTE(review): the first request uses start=20, so the items at offset 0
    # are never fetched, while the per-item counter below assumes page 1
    # begins at item 0. Kept as-is; confirm whether range(0, 35) was intended.
    for i in range(1, 35):
        # A timeout keeps one stalled connection from hanging the whole run.
        res = requests.get(url=url_basic.format(i * 20), headers=headers, timeout=30)
        res_json = json.loads(res.text)
        print("这是第 {} 页".format(i * 20))

        # An error payload has no 'items' key; treat it as an empty page.
        items = res_json.get('items') or []

        # Open the dump once per page instead of once per item. json.dumps
        # emits ASCII only, so the file stays readable under any encoding.
        with open('douban.txt', 'a', encoding='utf-8') as f:
            for index, item in enumerate(items):
                target = item.get('target') or {}
                status = target.get('status')
                print("这里是第 {} 个".format((i - 1) * 20 + index))
                f.write(json.dumps(status) + '\n')

        # Random pause between pages to avoid hammering the API.
        time.sleep(random.randint(1, 10))
52+
53+
54+
# Long-form region names matched by the original code -> short names used as
# keys of addr_dic.
_PROVINCE_ALIASES = {
    '广西壮族自治区': '广西',
    '香港特别行政区': '香港',
    '澳门特别行政区': '澳门',
}


def _extract_province(text):
    """Return the province cpca recognises in *text*, or None if there is none."""
    if not text:
        return None
    province = cpca.transform([text])['省'].iloc[0]
    # Depending on the cpca version "no match" may be '', None or NaN, so
    # anything that is not a non-empty string counts as no province.
    if not isinstance(province, str):
        return None
    province = province.split(' ')[0].rstrip('省')
    if not province:
        return None
    return _PROVINCE_ALIASES.get(province, province)


def _save_images(post_id, images):
    """Download every image of one post to ./image/<post_id>-<n>.jpg (n from 1)."""
    import os

    os.makedirs('./image', exist_ok=True)
    for index, image in enumerate(images, start=1):
        url = (image.get('large') or {}).get('url')
        if not url:
            continue
        r = requests.get(url, timeout=30)
        if not r.ok:
            # Do not store an HTTP error page under a .jpg name.
            print('skip image {}: HTTP {}'.format(url, r.status_code))
            continue
        with open('./image/{}-{}.jpg'.format(post_id, index), 'wb') as f:
            f.write(r.content)


def readfile():
    """Parse douban.txt and collect provinces, words and images from each post.

    Each line of ``douban.txt`` is one JSON document; ``null`` lines and
    blank lines are skipped. For every post:

    * the province is looked up in the post text and, failing that, in the
      author's location name, then counted in the module-level ``addr_dic``;
    * the text is segmented with jieba and the segments are appended to the
      module-level ``text_list``;
    * every attached image is downloaded into ``./image/``.

    Raises:
        OSError: if ``douban.txt`` cannot be opened.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        requests.RequestException: if an image download fails.
    """
    with open('douban.txt', 'r', encoding='utf-8') as file_object:
        for line in file_object:
            if not line.strip():
                continue
            item = json.loads(line)
            if item is None:
                continue

            author = item.get('author') or {}
            text = item.get('text') or ''
            images = item.get('images') or []
            post_id = item.get('id')

            # Prefer a province named in the post itself. Otherwise fall back
            # to the author's profile location. The original discarded the
            # fallback lookup and re-read the post-text result instead.
            addr = _extract_province(text)
            if addr is None:
                loc = author.get('loc') or {}
                addr = _extract_province(loc.get('name'))

            if addr is not None:
                addr_dic[addr] = addr_dic.get(addr, 0) + 1

            text_list.extend(jieba.cut(text, cut_all=False))

            if images:
                _save_images(post_id, images)
101+
102+
103+
104+
105+
106+
def ciyun():
    """Render the 500 most frequent entries of text_list as a word cloud.

    The chart is written to ``词云.html`` in the current directory.
    """
    # Word frequencies as a list of (word, count) pairs, most common first.
    top_words = Counter(text_list).most_common(500)

    cloud = WordCloud()
    cloud.add(series_name="", data_pair=top_words, word_size_range=[20, 66])
    cloud.render("词云.html")
117+
118+
def relitu():
    """Render the per-province counts in addr_dic as a heat map of China.

    The chart is written to ``热力图.html`` in the current directory.
    """
    # Geo expects [name, value] pairs.
    pairs = [[province, count] for province, count in addr_dic.items()]

    heatmap = Geo()
    heatmap.add_schema(maptype="china")
    heatmap.add("", pairs, type_=ChartType.HEATMAP)
    heatmap.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    heatmap.set_global_opts(visualmap_opts=opts.VisualMapOpts())
    heatmap.render("热力图.html")
132+
133+
if __name__ == '__main__':
    # Order matters: scrape, parse the dump, then render both charts.
    for step in (main, readfile, relitu, ciyun):
        step()

0 commit comments

Comments
 (0)