diff --git a/weiboSpider.py b/weiboSpider.py index af787c79..e424e743 100644 --- a/weiboSpider.py +++ b/weiboSpider.py @@ -10,35 +10,36 @@ import traceback class weibo: - cookie = {"Cookie": "your cookie"} #将your cookie替换成自己的cookie - #weibo类初始化 + cookie = {"Cookie": "your cookie"} #灏唝our cookie鏇挎崲鎴愯嚜宸辩殑cookie + #weibo绫诲垵濮嬪寲 def __init__(self,user_id,filter = 0): - self.user_id = user_id #用户id,即需要我们输入的数字,如昵称为“Dear-迪丽热巴”的id为1669879400 - self.filter = filter #取值范围为0、1,程序默认值为0,代表要爬取用户的全部微博,1代表只爬取用户的原创微博 - self.userName = '' #用户名,如“Dear-迪丽热巴” - self.weiboNum = 0 #用户全部微博数 - self.weiboNum2 = 0 #爬取到的微博数 - self.following = 0 #用户关注数 - self.followers = 0 #用户粉丝数 - self.weibos = [] #微博内容 - self.num_zan = [] #微博对应的点赞数 - self.num_forwarding = [] #微博对应的转发数 - self.num_comment = [] #微博对应的评论数 + self.user_id = user_id #鐢ㄦ埛id锛屽嵆闇瑕佹垜浠緭鍏ョ殑鏁板瓧锛屽鏄电О涓衡淒ear-杩附鐑反鈥濈殑id涓1669879400 + self.filter = filter #鍙栧艰寖鍥翠负0銆1锛岀▼搴忛粯璁ゅ间负0锛屼唬琛ㄨ鐖彇鐢ㄦ埛鐨勫叏閮ㄥ井鍗氾紝1浠h〃鍙埇鍙栫敤鎴风殑鍘熷垱寰崥 + self.userName = '' #鐢ㄦ埛鍚嶏紝濡傗淒ear-杩附鐑反鈥 + self.weiboNum = 0 #鐢ㄦ埛鍏ㄩ儴寰崥鏁 + self.weiboNum2 = 0 #鐖彇鍒扮殑寰崥鏁 + self.following = 0 #鐢ㄦ埛鍏虫敞鏁 + self.followers = 0 #鐢ㄦ埛绮変笣鏁 + self.weibos = [] #寰崥鍐呭 + self.num_zan = [] #寰崥瀵瑰簲鐨勭偣璧炴暟 + self.num_forwarding = [] #寰崥瀵瑰簲鐨勮浆鍙戞暟 + self.num_comment = [] #寰崥瀵瑰簲鐨勮瘎璁烘暟 + self.date = [] # 寰崥瀵瑰簲鐨勬椂闂 - #获取用户昵称 + #鑾峰彇鐢ㄦ埛鏄电О def getUserName(self): try: url = 'http://weibo.cn/%d/info'%(self.user_id) html = requests.get(url, cookies = weibo.cookie).content selector = etree.HTML(html) userName = selector.xpath("//title/text()")[0] - self.userName = userName[:-3].encode('gbk') - #print '用户昵称:' + self.userName + self.userName = userName[:-3].encode(sys.stdout.encoding) + #print '鐢ㄦ埛鏄电О锛' + self.userName except Exception,e: print "Error: ",e traceback.print_exc() - #获取用户微博数、关注数、粉丝数 + #鑾峰彇鐢ㄦ埛寰崥鏁般佸叧娉ㄦ暟銆佺矇涓濇暟 def getUserInfo(self): try: url = 'http://weibo.cn/u/%d?filter=%d&page=1'%(self.user_id,self.filter) @@ -46,31 +47,31 @@ def getUserInfo(self): selector = etree.HTML(html) pattern = r"\d+\.?\d*" - #微博数 + #寰崥鏁 str_wb = selector.xpath("//div[@class='tip2']/span[@class='tc']/text()")[0] guid = re.findall(pattern, str_wb, re.S|re.M) for value in guid: num_wb = int(value) break self.weiboNum = num_wb - #print '微博数: ' + str(self.weiboNum) + #print '寰崥鏁: ' + str(self.weiboNum) - #关注数 + #鍏虫敞鏁 str_gz = selector.xpath("//div[@class='tip2']/a/text()")[0] guid = re.findall(pattern, str_gz, re.M) self.following = int(guid[0]) - #print '关注数: ' + str(self.following) + #print '鍏虫敞鏁: ' + str(self.following) - #粉丝数 + #绮変笣鏁 str_fs = selector.xpath("//div[@class='tip2']/a/text()")[1] guid = re.findall(pattern, str_fs, re.M) self.followers = int(guid[0]) - #print '粉丝数: ' + str(self.followers) + #print '绮変笣鏁: ' + str(self.followers) except Exception,e: print "Error: ",e traceback.print_exc() - #获取用户微博内容及对应的点赞数、转发数、评论数 + #鑾峰彇鐢ㄦ埛寰崥鍐呭鍙婂搴旂殑鐐硅禐鏁般佽浆鍙戞暟銆佽瘎璁烘暟 def getWeiboInfo(self): try: url = 'http://weibo.cn/u/%d?filter=%d&page=1'%(self.user_id,self.filter) @@ -81,6 +82,7 @@ def getWeiboInfo(self): else: pageNum = (int)(selector.xpath('//input[@name="mp"]')[0].attrib['value']) pattern = r"\d+\.?\d*" + date_pattern = r'\d+-\d+-\d+.*\d+:\d+:\d+' for page in range(1,pageNum+1): url2 = 'http://weibo.cn/u/%d?filter=%d&page=%d'%(self.user_id,self.filter,page) html2 = requests.get(url2, cookies = weibo.cookie).content @@ -90,58 +92,64 @@ def getWeiboInfo(self): if len(info) > 3: for i in range(0,len(info)-2): self.weiboNum2 = self.weiboNum2 + 1 - #微博内容 + #寰崥鍐呭 str_t = info[i].xpath("div/span[@class='ctt']") - weibos = str_t[0].xpath('string(.)').encode('gbk','ignore') + weibos = str_t[0].xpath('string(.)').encode(sys.stdout.encoding,'ignore') self.weibos.append(weibos) - #print '微博内容:'+ weibos - #点赞数 + #print '寰崥鍐呭锛'+ weibos + #鐐硅禐鏁 str_zan = info[i].xpath("div/a/text()")[-4] guid = re.findall(pattern, str_zan, re.M) num_zan = int(guid[0]) self.num_zan.append(num_zan) - #print '点赞数: ' + str(num_zan) - #转发数 + #print '鐐硅禐鏁: ' + str(num_zan) + #杞彂鏁 forwarding = info[i].xpath("div/a/text()")[-3] guid = re.findall(pattern, forwarding, re.M) num_forwarding = int(guid[0]) self.num_forwarding.append(num_forwarding) - #print '转发数: ' + str(num_forwarding) - #评论数 + #print '杞彂鏁: ' + str(num_forwarding) + #璇勮鏁 comment = info[i].xpath("div/a/text()")[-2] guid = re.findall(pattern, comment, re.M) num_comment = int(guid[0]) self.num_comment.append(num_comment) - #print '评论数: ' + str(num_comment) + #print '璇勮鏁: ' + str(num_comment) + #鏃堕棿 + date = info[i].xpath("div[last()]/span[last()]/text()")[0] + match = re.findall(date_pattern, date, re.M) + date = str(match[0]) + self.date.append(date) + # print '鏃堕棿: ' + date if self.filter == 0: - print '共'+str(self.weiboNum2)+'条微博' + print '鍏'+str(self.weiboNum2)+'鏉″井鍗' else: - print '共'+str(self.weiboNum)+'条微博,其中'+str(self.weiboNum2)+'条为原创微博' + print '鍏'+str(self.weiboNum)+'鏉″井鍗氾紝鍏朵腑'+str(self.weiboNum2)+'鏉′负鍘熷垱寰崥' except Exception,e: print "Error: ",e traceback.print_exc() - #主程序 + #涓荤▼搴 def start(self): try: weibo.getUserName(self) weibo.getUserInfo(self) weibo.getWeiboInfo(self) - print '信息抓取完毕' + print '淇℃伅鎶撳彇瀹屾瘯' print '===========================================================================' except Exception,e: print "Error: ",e - #将爬取的信息写入文件 + #灏嗙埇鍙栫殑淇℃伅鍐欏叆鏂囦欢 def writeTxt(self): try: if self.filter == 1: - resultHeader = '\n\n原创微博内容:\n' + resultHeader = '\n\n鍘熷垱寰崥鍐呭锛歕n' else: - resultHeader = '\n\n微博内容:\n' - result = '用户信息\n用户昵称:' + self.userName + '\n用户id:' + str(self.user_id) + '\n微博数:' + str(self.weiboNum) + '\n关注数:' + str(self.following) + '\n粉丝数:' + str(self.followers) + resultHeader + resultHeader = '\n\n寰崥鍐呭锛歕n' + result = '鐢ㄦ埛淇℃伅\n鐢ㄦ埛鏄电О锛' + self.userName + '\n鐢ㄦ埛id锛' + str(self.user_id) + '\n寰崥鏁帮細' + str(self.weiboNum) + '\n鍏虫敞鏁帮細' + str(self.following) + '\n绮変笣鏁帮細' + str(self.followers) + resultHeader for i in range(1,self.weiboNum2 + 1): - text=str(i) + ':' + self.weibos[i-1] + '\n'+'点赞数:' + str(self.num_zan[i-1]) + ' 转发数:' + str(self.num_forwarding[i-1]) + ' 评论数:' + str(self.num_comment[i-1]) + '\n\n' + text=str(i) + ':' + self.weibos[i-1] + '\n'+'鐐硅禐鏁帮細' + str(self.num_zan[i-1]) + ' 杞彂鏁帮細' + str(self.num_forwarding[i-1]) + ' 璇勮鏁帮細' + str(self.num_comment[i-1]) + ' 鏃堕棿锛' + str(self.date[i-1])' + '\n\n' result = result + text if os.path.isdir('weibo') == False: os.mkdir('weibo') @@ -149,23 +157,23 @@ def writeTxt(self): f.write(result) f.close() file_path=os.getcwd()+"\weibo"+"\%d"%self.user_id+".txt" - print '微博写入文件完毕,保存路径%s'%(file_path) + print '寰崥鍐欏叆鏂囦欢瀹屾瘯锛屼繚瀛樿矾寰%s'%(file_path) except Exception,e: print "Error: ",e traceback.print_exc() -#使用实例,输入一个用户id,所有信息都会存储在wb实例中 -user_id = 1669879400 #可以改成任意合法的用户id(爬虫的微博id除外) -filter = 1 #值为0表示爬取全部的微博信息(原创微博+转发微博),值为1表示只爬取原创微博 -wb = weibo(user_id,filter) #调用weibo类,创建微博实例wb -wb.start() #爬取微博信息 -print '用户名:' + wb.userName -print '全部微博数:' + str(wb.weiboNum) -print '关注数:' + str(wb.following) -print '粉丝数:' + str(wb.followers) -print '最新一条微博为:' + wb.weibos[0] #若filter=1则为最新的原创微博,如果该用户微博数为0,即len(wb.weibos)==0,打印会出错,下同 -print '最新一条微博获得的点赞数:' + str(wb.num_zan[0]) -print '最新一条微博获得的转发数:' + str(wb.num_forwarding[0]) -print '最新一条微博获得的评论数:' + str(wb.num_comment[0]) -wb.writeTxt() #wb.writeTxt()只是把信息写到文件里,大家可以根据自己的需要重新编写writeTxt()函数 \ No newline at end of file +#浣跨敤瀹炰緥,杈撳叆涓涓敤鎴穒d锛屾墍鏈変俊鎭兘浼氬瓨鍌ㄥ湪wb瀹炰緥涓 +user_id = 1669879400 #鍙互鏀规垚浠绘剰鍚堟硶鐨勭敤鎴穒d锛堢埇铏殑寰崥id闄ゅ锛 +filter = 1 #鍊间负0琛ㄧず鐖彇鍏ㄩ儴鐨勫井鍗氫俊鎭紙鍘熷垱寰崥+杞彂寰崥锛夛紝鍊间负1琛ㄧず鍙埇鍙栧師鍒涘井鍗 +wb = weibo(user_id,filter) #璋冪敤weibo绫伙紝鍒涘缓寰崥瀹炰緥wb +wb.start() #鐖彇寰崥淇℃伅 +print '鐢ㄦ埛鍚嶏細' + wb.userName +print '鍏ㄩ儴寰崥鏁帮細' + str(wb.weiboNum) +print '鍏虫敞鏁帮細' + str(wb.following) +print '绮変笣鏁帮細' + str(wb.followers) +print '鏈鏂颁竴鏉″井鍗氫负锛' + wb.weibos[0] #鑻ilter=1鍒欎负鏈鏂扮殑鍘熷垱寰崥锛屽鏋滆鐢ㄦ埛寰崥鏁颁负0锛屽嵆len(wb.weibos)==0,鎵撳嵃浼氬嚭閿欙紝涓嬪悓 +print '鏈鏂颁竴鏉″井鍗氳幏寰楃殑鐐硅禐鏁帮細' + str(wb.num_zan[0]) +print '鏈鏂颁竴鏉″井鍗氳幏寰楃殑杞彂鏁帮細' + str(wb.num_forwarding[0]) +print '鏈鏂颁竴鏉″井鍗氳幏寰楃殑璇勮鏁帮細' + str(wb.num_comment[0]) +wb.writeTxt() #wb.writeTxt()鍙槸鎶婁俊鎭啓鍒版枃浠堕噷锛屽ぇ瀹跺彲浠ユ牴鎹嚜宸辩殑闇瑕侀噸鏂扮紪鍐檞riteTxt()鍑芥暟