|
| 1 | +import json |
| 2 | +import requests |
| 3 | +import time |
| 4 | +import random |
| 5 | +import cpca |
| 6 | +import jieba |
| 7 | +from pyecharts import options as opts |
| 8 | +from pyecharts.charts import WordCloud |
| 9 | +from pyecharts.charts import Geo |
| 10 | +from pyecharts.globals import ChartType |
| 11 | +from collections import Counter |
| 12 | + |
# Province name -> number of posts attributed to that province.
# Filled in by readfile() and plotted by relitu().
addr_dic: dict = {}
# Every token jieba cut out of the post texts, in reading order.
# Filled in by readfile() and counted by ciyun().
text_list: list = []
| 15 | + |
def main():
    """Download the hot posts of Douban gallery topic 18306 into ``douban.txt``.

    Requests 34 consecutive pages of 20 items from the rexxar API, starting at
    offset 0, and appends the ``status`` object of every returned item to
    ``douban.txt`` as one JSON document per line (``null`` when an item has no
    status). Sleeps a random 1-10 seconds after each page.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        requests.RequestException: on connection failure or timeout.
        ValueError: if a response body is not valid JSON.
    """
    url_basic = 'https://m.douban.com/rexxar/api/v2/gallery/topic/18306/items?from_web=1&sort=hot&start={}&count=20&status_full_text=1&guest_only=0&ck=GStY'
    # NOTE(review): the Cookie below (and the ck= query parameter above) look
    # like credentials of a logged-in session committed to source control;
    # they should be rotated and loaded from configuration instead.
    # NOTE(review): 'br' in Accept-Encoding is only decoded by requests when a
    # brotli package is installed -- confirm, or drop it from the header.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'bid=n7vzKfXLoUA; douban-fav-remind=1; ll="108296"; __utmc=30149280; __utmz=30149280.1624276858.2.2.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ap_v=0,6.0; gr_user_id=ca8b9156-1926-4c82-9dda-27fc7f7ad51b; __utma=30149280.66080894.1623848440.1624276858.1624282580.3; __utmt=1; dbcl2="157316158:e4ojS8paSUc"; ck=GStY; push_doumail_num=0; __utmv=30149280.15731; frodotk="a187943e3a17e8bbe496bcbaae47ba31"; push_noty_num=0; __utmb=30149280.11.10.1624282580',
        'Host': 'm.douban.com',
        'Origin': 'https://www.douban.com',
        'Referer': 'https://www.douban.com/gallery/topic/18306/',
        'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
        'sec-ch-ua-mobile': '?0',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
    }

    page_size = 20   # must match count=20 in url_basic
    page_total = 34

    # One append handle for the whole run instead of reopening per item.
    with open('douban.txt', 'a', encoding='utf-8') as f:
        for page in range(page_total):
            # The first page lives at start=0. The previous loop began at
            # start=20 and therefore never fetched the first 20 items, even
            # though its item counter assumed it had.
            start = page * page_size

            # A timeout keeps one stalled connection from hanging the run.
            res = requests.get(url=url_basic.format(start), headers=headers, timeout=30)
            # Fail loudly on an error page instead of trying to parse it.
            res.raise_for_status()
            res_json = res.json()
            print("这是第 {} 页".format(start))

            # 'items' and 'target' may be absent or null; treat both as empty.
            for index, item in enumerate(res_json.get('items') or []):
                target = item.get('target') or {}
                status = target.get('status')
                print("这里是第 {} 个".format(start + index))
                f.write(json.dumps(status) + '\n')

            # Push the finished page to disk before the pause.
            f.flush()
            time.sleep(random.randint(1, 10))
| 52 | + |
| 53 | + |
# Long-form region names that are shortened before counting.
# NOTE(review): only these three are shortened; any other name that does not
# end in '省' passes through unchanged -- confirm the map drawn by relitu()
# accepts those names.
_PROVINCE_ALIASES = {
    '广西壮族自治区': '广西',
    '香港特别行政区': '香港',
    '澳门特别行政区': '澳门',
}


def _province_from_text(text):
    """Return the normalized province cpca finds in *text*, or None."""
    if not text:
        return None
    province = cpca.transform([text])['省'].iloc[0]
    # Anything that is not a string is treated as "no province found"
    # (presumably None or NaN depending on the cpca version -- verify).
    if not isinstance(province, str):
        return None
    province = province.split(' ')[0].rstrip('省')
    # An empty name also means "not found"; it must not become a map key.
    return _PROVINCE_ALIASES.get(province, province) or None


def _save_images(post_id, images):
    """Download each image's 'large' URL to ./image/<post_id>-<n>.jpg."""
    # <n> is the 1-based position in *images*, so names stay stable even when
    # an individual download is skipped.
    for number, image in enumerate(images, start=1):
        url = (image.get('large') or {}).get('url')
        if not url:
            continue
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException as err:
            # One broken image should not abort the analysis, and an error
            # page must not be written out as a .jpg.
            print("skip image {}: {}".format(url, err))
            continue
        with open('./image/{}-{}.jpg'.format(post_id, number), 'wb') as f:
            f.write(r.content)


def readfile():
    """Analyse the posts stored in ``douban.txt``.

    For every non-null JSON line of the file:

    * find a province in the post text, falling back to the name of the
      author's location, and increment its count in ``addr_dic``;
    * append the jieba tokens of the post text to ``text_list``;
    * download the post's images into ``./image/``.

    Raises:
        KeyError: if a record lacks 'author', 'text', 'images' or 'id'.
        ValueError: if a non-blank line is not valid JSON.
    """
    import os  # local import: the module's import block does not provide os

    # Image files are written below; make sure their directory exists.
    os.makedirs('./image', exist_ok=True)

    with open('douban.txt', 'r', encoding='utf-8') as file_object:
        for line in file_object:
            if not line.strip():
                continue
            item = json.loads(line)
            if item is None:
                continue

            author = item['author'] or {}
            text = item['text'] or ''
            images = item['images'] or []
            post_id = item['id']

            addr = _province_from_text(text)
            if addr is None:
                # Fall back to the author's location. The previous code threw
                # this lookup's result away and re-read the text's result, so
                # the fallback could never find anything.
                loc = author.get('loc') or {}
                addr = _province_from_text(loc.get('name'))

            if addr is not None:
                addr_dic[addr] = addr_dic.get(addr, 0) + 1

            text_list.extend(jieba.cut(text, cut_all=False))

            _save_images(post_id, images)
| 101 | + |
| 102 | + |
| 103 | + |
| 104 | + |
| 105 | + |
def ciyun():
    """Render a word cloud of the most frequent tokens to ``词云.html``.

    Counts the tokens collected in ``text_list`` and draws the 500 most
    common ones.
    """
    # Count token frequencies and keep the top 500 as (token, count) pairs.
    top_words = Counter(text_list).most_common(500)

    cloud = WordCloud()
    cloud.add(series_name="", data_pair=top_words, word_size_range=[20, 66])
    cloud.render("词云.html")
| 117 | + |
def relitu():
    """Render the per-province counts in ``addr_dic`` as a heat map.

    The chart is drawn on the "china" map and written to ``热力图.html``.
    """
    # Geo.add takes [name, value] pairs.
    province_counts = [[name, count] for name, count in addr_dic.items()]

    heatmap = Geo()
    heatmap.add_schema(maptype="china")
    heatmap.add("", province_counts, type_=ChartType.HEATMAP)
    heatmap.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    heatmap.set_global_opts(visualmap_opts=opts.VisualMapOpts())
    heatmap.render("热力图.html")
| 132 | + |
if __name__ == '__main__':
    # Pipeline order: scrape, analyse the scraped file, then draw both charts.
    for step in (main, readfile, relitu, ciyun):
        step()
0 commit comments