From e5b5122ac8b7474e302f2fb68c46eacacfed0168 Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sat, 2 Jan 2016 21:10:42 +0800
Subject: [PATCH 1/3] Homework 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Everything is done except the dynamically generated view count; that part is
beyond my current ability, and I hope the instructor can shed some light on it.
---
 exercise_1.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 exercise_1.py

diff --git a/exercise_1.py b/exercise_1.py
new file mode 100644
index 0000000..4507801
--- /dev/null
+++ b/exercise_1.py
@@ -0,0 +1,88 @@
+# __author__ = 'xjlin'
+# -*- coding: utf-8 -*-
+
+import requests
+import time
+from bs4 import BeautifulSoup
+
+headers = {
+    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
+}
+
+url = 'http://bj.58.com/pbdn/?PGTID=0d100000-0000-1121-f41b-137aeef068b7&ClickID=6'
+
+wb_data = requests.get(url)
+soup = BeautifulSoup(wb_data.text, 'lxml')
+
+hrefs = soup.select('td.t > a')
+detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+def get_info(url, data = None):
+    time.sleep(2)
+    detail_data = requests.get(url, headers = headers)
+    soup_detail = BeautifulSoup(detail_data.text, 'lxml')
+    titles = soup_detail.select('div.col_sub.mainTitle > h1')
+    reviews = soup_detail.select('#index_show > ul.mtit_con_left.fl > li.count')
+    times = soup_detail.select('ul.mtit_con_left.fl > li.time')
+
+    prices = soup_detail.select('div.su_con > span.price.c_f50')
+    types = soup_detail.select('p.c_666 > span')
+    areas = soup_detail.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(1)')
+    categories = soup_detail.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
+
+    #print(titles, reviews, times, prices, types, areas, categories)
+    #print(categories)
+    #time.sleep(2)
+    if areas == []:
+        for title, review, tim, price, type, category in zip(titles, reviews, times, prices, types, categories):
+            if(type.get_text() == '\n'):
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '个人',
+                    'area' : '未指定区域',
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+            else:
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '商家',
+                    'area' : '未指定区域',
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+    else:
+        for title, review, tim, price, type, area, category in zip(titles, reviews, times, prices, types, areas, categories):
+            if(type.get_text() == '\n'):
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '个人',
+                    'area' : area.get_text(),
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+            else:
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '商家',
+                    'area' : area.get_text(),
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+
+
+for href in hrefs:
+    get_info(href.get('href'))
+    #print(href.get('href'))
+
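Note: the view count mentioned in the message above is filled in by JavaScript
after the page loads, so the static HTML that requests downloads never contains
the number. A minimal sketch to confirm this, reusing the sample detail URL and
the li.count selector from the script (both assume the 58.com markup of the time):

    import requests
    from bs4 import BeautifulSoup

    # Sample detail page from the script above; any live listing URL works.
    detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml'
    soup = BeautifulSoup(requests.get(detail_url).text, 'lxml')
    # The li.count element exists in the static markup, but its text is
    # injected client-side, so this prints an empty or placeholder string.
    for li in soup.select('#index_show > ul.mtit_con_left.fl > li.count'):
        print(repr(li.get_text(strip=True)))

Patch 3 below addresses this by querying the site's counter endpoint directly.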
From e0fd794038725bfd114a95c0816862d6cee1252d Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sat, 2 Jan 2016 21:50:16 +0800
Subject: [PATCH 2/3] Roster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 "\350\212\261\345\220\215\345\206\214" | 1 +
 1 file changed, 1 insertion(+)

diff --git "a/\350\212\261\345\220\215\345\206\214" "b/\350\212\261\345\220\215\345\206\214"
index ba788bb..f527855 100644
--- "a/\350\212\261\345\220\215\345\206\214"
+++ "b/\350\212\261\345\220\215\345\206\214"
@@ -1,3 +1,4 @@
 请在下面加上你的姓名和一句话感言,或者联系方式什么都行,注意一人一行哦:
 林茜茜 Hello,world!
+林行健 a burden
\ No newline at end of file
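Note: the patch below resolves the open question from patch 1 by asking the
site's own counter endpoint (http://jst1.58.com/counter?infoid=<id>) for the
view count instead of rendering JavaScript. Its get_id helper takes the
second-to-last run of digits in the URL, which only yields the listing id
because the scraped links carry an entinfo query parameter that repeats it;
a quick sanity check of that assumption:

    import re

    # A listing link of the era: the id in the path reappears in entinfo,
    # so the second-to-last digit run is the infoid.
    url = ('http://bj.58.com/pingbandiannao/24517179000509x.shtml'
           '?psid=170801987190281724635921968&entinfo=24517179000509_0')
    print(re.findall(r'\d+', url)[-2])  # -> '24517179000509'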
From 93dd12637864be760184c9e083f6b9dbd104060a Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sun, 3 Jan 2016 14:25:04 +0800
Subject: [PATCH 3/3] Major Assignment 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added a method for fetching the visitor count.
---
 exercise_1.py | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/exercise_1.py b/exercise_1.py
index 4507801..b44af54 100644
--- a/exercise_1.py
+++ b/exercise_1.py
@@ -3,42 +3,65 @@
 
 import requests
 import time
+import re
 from bs4 import BeautifulSoup
 
 headers = {
     'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
 }
+
 
 url = 'http://bj.58.com/pbdn/?PGTID=0d100000-0000-1121-f41b-137aeef068b7&ClickID=6'
 
 wb_data = requests.get(url)
+
 soup = BeautifulSoup(wb_data.text, 'lxml')
 
 hrefs = soup.select('td.t > a')
-detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+
+#detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+
+prefix_js = 'http://jst1.58.com/counter?infoid='
+
+
+
+def get_id(url):
+    info = re.findall(r'[\d]+', url)
+    return(info[-2])
+
+
+def get_rev(prefix_url, suffix_url):
+    str = prefix_url + suffix_url
+    jsdata = requests.get(str)
+    soup_js = BeautifulSoup(jsdata.text, 'lxml')
+    res = soup_js.select('p')
+    for rr in res:
+        a = re.findall(r'[\d|.]+', rr.get_text())
+        result = a[4]
+    return result
+
+
 def get_info(url, data = None):
     time.sleep(2)
     detail_data = requests.get(url, headers = headers)
     soup_detail = BeautifulSoup(detail_data.text, 'lxml')
     titles = soup_detail.select('div.col_sub.mainTitle > h1')
-    reviews = soup_detail.select('#index_show > ul.mtit_con_left.fl > li.count')
     times = soup_detail.select('ul.mtit_con_left.fl > li.time')
 
     prices = soup_detail.select('div.su_con > span.price.c_f50')
     types = soup_detail.select('p.c_666 > span')
     areas = soup_detail.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(1)')
     categories = soup_detail.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
 
-    #print(titles, reviews, times, prices, types, areas, categories)
     #print(categories)
     #time.sleep(2)
     if areas == []:
-        for title, review, tim, price, type, category in zip(titles, reviews, times, prices, types, categories):
+        for title, tim, price, type, category in zip(titles, times, prices, types, categories):
             if(type.get_text() == '\n'):
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '个人',
                     'area' : '未指定区域',
@@ -48,8 +71,8 @@ def get_info(url, data = None):
             else:
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '商家',
                     'area' : '未指定区域',
@@ -57,12 +80,12 @@ def get_info(url, data = None):
                 }
                 print(data)
     else:
-        for title, review, tim, price, type, area, category in zip(titles, reviews, times, prices, types, areas, categories):
+        for title, tim, price, type, area, category in zip(titles, times, prices, types, areas, categories):
             if(type.get_text() == '\n'):
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '个人',
                     'area' : area.get_text(),
@@ -72,8 +95,8 @@ def get_info(url, data = None):
             else:
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '商家',
                     'area' : area.get_text(),
@@ -86,3 +109,4 @@ def get_info(url, data = None):
     get_info(href.get('href'))
     #print(href.get('href'))
 
+
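Note: get_rev above works, but it shadows the built-in str and raises an
IndexError when the counter response holds fewer than five digit runs. A
slightly more defensive sketch of the same lookup (the index 4 into the digit
runs is carried over from the original code; the exact response format of
jst1.58.com is otherwise an assumption, and get_view_count is a hypothetical
helper name):

    import re
    import requests

    def get_view_count(detail_url,
                       counter_prefix='http://jst1.58.com/counter?infoid='):
        # As in get_id, the infoid is the second-to-last digit run, which
        # presumes the entinfo query parameter is present in the URL.
        runs = re.findall(r'\d+', detail_url)
        if len(runs) < 2:
            return None
        resp = requests.get(counter_prefix + runs[-2])
        # The counter endpoint returns a small JavaScript snippet; mirror the
        # original pattern (minus the stray '|') and take the fifth run.
        digits = re.findall(r'[\d.]+', resp.text)
        return digits[4] if len(digits) > 4 else None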