From e5b5122ac8b7474e302f2fb68c46eacacfed0168 Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sat, 2 Jan 2016 21:10:42 +0800
Subject: [PATCH 1/3] Homework 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Everything is done except the dynamically generated view count; that part is
beyond my current ability, and I hope the instructor can shed some light on it.
---
 exercise_1.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 exercise_1.py

diff --git a/exercise_1.py b/exercise_1.py
new file mode 100644
index 0000000..4507801
--- /dev/null
+++ b/exercise_1.py
@@ -0,0 +1,88 @@
+# __author__ = 'xjlin'
+# -*- coding: utf-8 -*-
+
+import requests
+import time
+from bs4 import BeautifulSoup
+
+headers = {
+    'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
+}
+
+url = 'http://bj.58.com/pbdn/?PGTID=0d100000-0000-1121-f41b-137aeef068b7&ClickID=6'
+
+wb_data = requests.get(url)
+soup = BeautifulSoup(wb_data.text, 'lxml')
+
+hrefs = soup.select('td.t > a')
+detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+def get_info(url, data = None):
+    time.sleep(2)
+    detail_data = requests.get(url, headers = headers)
+    soup_detail = BeautifulSoup(detail_data.text, 'lxml')
+    titles = soup_detail.select('div.col_sub.mainTitle > h1')
+    reviews = soup_detail.select('#index_show > ul.mtit_con_left.fl > li.count')
+    times = soup_detail.select('ul.mtit_con_left.fl > li.time')
+
+    prices = soup_detail.select('div.su_con > span.price.c_f50')
+    types = soup_detail.select('p.c_666 > span')
+    areas = soup_detail.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(1)')
+    categories = soup_detail.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
+
+    #print(titles, reviews, times, prices, types, areas, categories)
+    #print(categories)
+    #time.sleep(2)
+    if areas == []:
+        for title, review, tim, price, type, category in zip(titles, reviews, times, prices, types, categories):
+            if(type.get_text() == '\n'):
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '个人',
+                    'area' : '未指定区域',
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+            else:
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '商家',
+                    'area' : '未指定区域',
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+    else:
+        for title, review, tim, price, type, area, category in zip(titles, reviews, times, prices, types, areas, categories):
+            if(type.get_text() == '\n'):
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '个人',
+                    'area' : area.get_text(),
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+            else:
+                data = {
+                    'title' : title.get_text(),
+                    'review': review.get_text(),
+                    'time' : tim.get_text(),
+                    'price' : price.get_text(),
+                    'type' : '商家',
+                    'area' : area.get_text(),
+                    'category' : list(category.stripped_strings)
+                }
+                print(data)
+
+
+for href in hrefs:
+    get_info(href.get('href'))
+    #print(href.get('href'))
+
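Note: the view count mentioned in the message above is filled in by JavaScript
after the page loads, so the static HTML that requests downloads never contains
the number. A minimal sketch to confirm this, reusing the sample detail URL and
the li.count selector from the script (both assume the 58.com markup of the time):

    import requests
    from bs4 import BeautifulSoup

    # Sample detail page from the script above; any live listing URL works.
    detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml'
    soup = BeautifulSoup(requests.get(detail_url).text, 'lxml')
    # The li.count element exists in the static markup, but its text is
    # injected client-side, so this prints an empty or placeholder string.
    for li in soup.select('#index_show > ul.mtit_con_left.fl > li.count'):
        print(repr(li.get_text(strip=True)))

Patch 3 below addresses this by querying the site's counter endpoint directly.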
From e0fd794038725bfd114a95c0816862d6cee1252d Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sat, 2 Jan 2016 21:50:16 +0800
Subject: [PATCH 2/3] Roster
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 "\350\212\261\345\220\215\345\206\214" | 1 +
 1 file changed, 1 insertion(+)

diff --git "a/\350\212\261\345\220\215\345\206\214" "b/\350\212\261\345\220\215\345\206\214"
index ba788bb..f527855 100644
--- "a/\350\212\261\345\220\215\345\206\214"
+++ "b/\350\212\261\345\220\215\345\206\214"
@@ -1,3 +1,4 @@
 请在下面加上你的姓名和一句话感言,或者联系方式什么都行,注意一人一行哦:
 林茜茜 Hello,world!
+林行健 a burden
\ No newline at end of file
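Note: the patch below resolves the open question from patch 1 by asking the
site's own counter endpoint (http://jst1.58.com/counter?infoid=<id>) for the
view count instead of rendering JavaScript. Its get_id helper takes the
second-to-last run of digits in the URL, which only yields the listing id
because the scraped links carry an entinfo query parameter that repeats it;
a quick sanity check of that assumption:

    import re

    # A listing link of the era: the id in the path reappears in entinfo,
    # so the second-to-last digit run is the infoid.
    url = ('http://bj.58.com/pingbandiannao/24517179000509x.shtml'
           '?psid=170801987190281724635921968&entinfo=24517179000509_0')
    print(re.findall(r'\d+', url)[-2])  # -> '24517179000509'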
From 93dd12637864be760184c9e083f6b9dbd104060a Mon Sep 17 00:00:00 2001
From: xingjian lin
Date: Sun, 3 Jan 2016 14:25:04 +0800
Subject: [PATCH 3/3] Major Assignment 1
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Added a method for fetching the visitor count.
---
 exercise_1.py | 42 +++++++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/exercise_1.py b/exercise_1.py
index 4507801..b44af54 100644
--- a/exercise_1.py
+++ b/exercise_1.py
@@ -3,42 +3,65 @@
 
 import requests
 import time
+import re
 from bs4 import BeautifulSoup
 
 headers = {
     'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'
 }
+
 
 url = 'http://bj.58.com/pbdn/?PGTID=0d100000-0000-1121-f41b-137aeef068b7&ClickID=6'
 
 wb_data = requests.get(url)
+
 soup = BeautifulSoup(wb_data.text, 'lxml')
 
 hrefs = soup.select('td.t > a')
-detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+
+#detail_url = 'http://bj.58.com/pingbandiannao/24517179000509x.shtml?psid=170801987190281724635921968&entinfo=24517179000509_0&iuType=p_0&PGTID=0d305a36-0000-1d8b-4afe-dd3a99ef2c8f&ClickID=5'
+
+prefix_js = 'http://jst1.58.com/counter?infoid='
+
+
+
+def get_id(url):
+    info = re.findall(r'[\d]+', url)
+    return(info[-2])
+
+
+def get_rev(prefix_url, suffix_url):
+    str = prefix_url + suffix_url
+    jsdata = requests.get(str)
+    soup_js = BeautifulSoup(jsdata.text, 'lxml')
+    res = soup_js.select('p')
+    for rr in res:
+        a = re.findall(r'[\d|.]+', rr.get_text())
+        result = a[4]
+    return result
+
+
 def get_info(url, data = None):
     time.sleep(2)
     detail_data = requests.get(url, headers = headers)
     soup_detail = BeautifulSoup(detail_data.text, 'lxml')
     titles = soup_detail.select('div.col_sub.mainTitle > h1')
-    reviews = soup_detail.select('#index_show > ul.mtit_con_left.fl > li.count')
     times = soup_detail.select('ul.mtit_con_left.fl > li.time')
 
     prices = soup_detail.select('div.su_con > span.price.c_f50')
     types = soup_detail.select('p.c_666 > span')
     areas = soup_detail.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(1)')
     categories = soup_detail.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')
 
-    #print(titles, reviews, times, prices, types, areas, categories)
     #print(categories)
     #time.sleep(2)
     if areas == []:
-        for title, review, tim, price, type, category in zip(titles, reviews, times, prices, types, categories):
+        for title, tim, price, type, category in zip(titles, times, prices, types, categories):
             if(type.get_text() == '\n'):
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '个人',
                     'area' : '未指定区域',
@@ -48,8 +71,8 @@ def get_info(url, data = None):
             else:
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '商家',
                     'area' : '未指定区域',
@@ -57,12 +80,12 @@ def get_info(url, data = None):
                 }
                 print(data)
     else:
-        for title, review, tim, price, type, area, category in zip(titles, reviews, times, prices, types, areas, categories):
+        for title, tim, price, type, area, category in zip(titles, times, prices, types, areas, categories):
             if(type.get_text() == '\n'):
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '个人',
                     'area' : area.get_text(),
@@ -72,8 +95,8 @@ def get_info(url, data = None):
             else:
                 data = {
                     'title' : title.get_text(),
-                    'review': review.get_text(),
                     'time' : tim.get_text(),
+                    'review': get_rev(prefix_js, get_id(url)),
                     'price' : price.get_text(),
                     'type' : '商家',
                     'area' : area.get_text(),
@@ -86,3 +109,4 @@ def get_info(url, data = None):
     get_info(href.get('href'))
     #print(href.get('href'))
 
+
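Note: get_rev above works, but it shadows the built-in str and raises an
IndexError when the counter response holds fewer than five digit runs. A
slightly more defensive sketch of the same lookup (the index 4 into the digit
runs is carried over from the original code; the exact response format of
jst1.58.com is otherwise an assumption, and get_view_count is a hypothetical
helper name):

    import re
    import requests

    def get_view_count(detail_url,
                       counter_prefix='http://jst1.58.com/counter?infoid='):
        # As in get_id, the infoid is the second-to-last digit run, which
        # presumes the entinfo query parameter is present in the URL.
        runs = re.findall(r'\d+', detail_url)
        if len(runs) < 2:
            return None
        resp = requests.get(counter_prefix + runs[-2])
        # The counter endpoint returns a small JavaScript snippet; mirror the
        # original pattern (minus the stray '|') and take the fifth run.
        digits = re.findall(r'[\d.]+', resp.text)
        return digits[4] if len(digits) > 4 else None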