From 64b6467c03b25aff7b58c4c7e7ea2f4b08c646f2 Mon Sep 17 00:00:00 2001 From: frog7431 Date: Sat, 28 Apr 2018 10:59:32 +0800 Subject: [PATCH 1/5] GitHub commit test --- .idea/compiler.xml | 16 ++++++++++ .idea/encodings.xml | 6 ++++ .../Maven__com_github_abola_crawler_1_1_1.xml | 13 ++++++++ .../Maven__com_google_guava_guava_19_0.xml | 13 ++++++++ ...iversalchardet_juniversalchardet_1_0_3.xml | 13 ++++++++ ...com_mashape_unirest_unirest_java_1_4_9.xml | 13 ++++++++ ...Maven__commons_codec_commons_codec_1_2.xml | 13 ++++++++ ...mons_httpclient_commons_httpclient_3_1.xml | 13 ++++++++ .../Maven__commons_io_commons_io_2_5.xml | 13 ++++++++ ...n__commons_logging_commons_logging_1_2.xml | 13 ++++++++ ...g_apache_commons_commons_compress_1_12.xml | 13 ++++++++ ...n__org_apache_commons_commons_vfs2_2_1.xml | 13 ++++++++ ...e_httpcomponents_httpasyncclient_4_1_1.xml | 13 ++++++++ ...apache_httpcomponents_httpclient_4_5_2.xml | 13 ++++++++ ...g_apache_httpcomponents_httpcore_4_4_4.xml | 13 ++++++++ ...ache_httpcomponents_httpcore_nio_4_4_4.xml | 13 ++++++++ ...g_apache_httpcomponents_httpmime_4_5_2.xml | 13 ++++++++ .../Maven__org_json_json_20160212.xml | 13 ++++++++ .../Maven__org_jsoup_jsoup_1_9_2.xml | 13 ++++++++ .../Maven__org_mongodb_bson_2_13_3.xml | 13 ++++++++ ...__org_mongodb_mongo_java_driver_2_13_3.xml | 13 ++++++++ .idea/misc.xml | 13 ++++++++ .idea/modules.xml | 8 +++++ .idea/vcs.xml | 6 ++++ MyCrawlerExample.iml | 32 +++++++++++++++++++ 25 files changed, 328 insertions(+) create mode 100644 .idea/compiler.xml create mode 100644 .idea/encodings.xml create mode 100644 .idea/libraries/Maven__com_github_abola_crawler_1_1_1.xml create mode 100644 .idea/libraries/Maven__com_google_guava_guava_19_0.xml create mode 100644 .idea/libraries/Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml create mode 100644 .idea/libraries/Maven__com_mashape_unirest_unirest_java_1_4_9.xml create mode 100644 .idea/libraries/Maven__commons_codec_commons_codec_1_2.xml create mode 100644 .idea/libraries/Maven__commons_httpclient_commons_httpclient_3_1.xml create mode 100644 .idea/libraries/Maven__commons_io_commons_io_2_5.xml create mode 100644 .idea/libraries/Maven__commons_logging_commons_logging_1_2.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_compress_1_12.xml create mode 100644 .idea/libraries/Maven__org_apache_commons_commons_vfs2_2_1.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpasyncclient_4_1_1.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_4.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpcore_nio_4_4_4.xml create mode 100644 .idea/libraries/Maven__org_apache_httpcomponents_httpmime_4_5_2.xml create mode 100644 .idea/libraries/Maven__org_json_json_20160212.xml create mode 100644 .idea/libraries/Maven__org_jsoup_jsoup_1_9_2.xml create mode 100644 .idea/libraries/Maven__org_mongodb_bson_2_13_3.xml create mode 100644 .idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 MyCrawlerExample.iml diff --git a/.idea/compiler.xml b/.idea/compiler.xml new file mode 100644 index 0000000..c8fd901 --- /dev/null +++ b/.idea/compiler.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 0000000..b26911b --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_github_abola_crawler_1_1_1.xml b/.idea/libraries/Maven__com_github_abola_crawler_1_1_1.xml new file mode 100644 index 0000000..61069f6 --- /dev/null +++ b/.idea/libraries/Maven__com_github_abola_crawler_1_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_google_guava_guava_19_0.xml b/.idea/libraries/Maven__com_google_guava_guava_19_0.xml new file mode 100644 index 0000000..68e23cc --- /dev/null +++ b/.idea/libraries/Maven__com_google_guava_guava_19_0.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml b/.idea/libraries/Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml new file mode 100644 index 0000000..b127443 --- /dev/null +++ b/.idea/libraries/Maven__com_googlecode_juniversalchardet_juniversalchardet_1_0_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_mashape_unirest_unirest_java_1_4_9.xml b/.idea/libraries/Maven__com_mashape_unirest_unirest_java_1_4_9.xml new file mode 100644 index 0000000..d7792cc --- /dev/null +++ b/.idea/libraries/Maven__com_mashape_unirest_unirest_java_1_4_9.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_codec_commons_codec_1_2.xml b/.idea/libraries/Maven__commons_codec_commons_codec_1_2.xml new file mode 100644 index 0000000..fbcb992 --- /dev/null +++ b/.idea/libraries/Maven__commons_codec_commons_codec_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_1.xml b/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_1.xml new file mode 100644 index 0000000..66e6537 --- /dev/null +++ b/.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_io_commons_io_2_5.xml b/.idea/libraries/Maven__commons_io_commons_io_2_5.xml new file mode 100644 index 0000000..67c2ad2 --- /dev/null +++ b/.idea/libraries/Maven__commons_io_commons_io_2_5.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__commons_logging_commons_logging_1_2.xml b/.idea/libraries/Maven__commons_logging_commons_logging_1_2.xml new file mode 100644 index 0000000..eab40b3 --- /dev/null +++ b/.idea/libraries/Maven__commons_logging_commons_logging_1_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_compress_1_12.xml b/.idea/libraries/Maven__org_apache_commons_commons_compress_1_12.xml new file mode 100644 index 0000000..d28b2ea --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_compress_1_12.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_commons_commons_vfs2_2_1.xml b/.idea/libraries/Maven__org_apache_commons_commons_vfs2_2_1.xml new file mode 100644 index 0000000..eff22b6 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_commons_commons_vfs2_2_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpasyncclient_4_1_1.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpasyncclient_4_1_1.xml new file mode 100644 index 0000000..8484ecb --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpasyncclient_4_1_1.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_2.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_2.xml new file mode 100644 index 0000000..fdb7ead --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_4.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_4.xml new file mode 100644 index 0000000..3a5aa19 --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_nio_4_4_4.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_nio_4_4_4.xml new file mode 100644 index 0000000..1dab39e --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpcore_nio_4_4_4.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_apache_httpcomponents_httpmime_4_5_2.xml b/.idea/libraries/Maven__org_apache_httpcomponents_httpmime_4_5_2.xml new file mode 100644 index 0000000..4b2025a --- /dev/null +++ b/.idea/libraries/Maven__org_apache_httpcomponents_httpmime_4_5_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_json_json_20160212.xml b/.idea/libraries/Maven__org_json_json_20160212.xml new file mode 100644 index 0000000..44a278c --- /dev/null +++ b/.idea/libraries/Maven__org_json_json_20160212.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_jsoup_jsoup_1_9_2.xml b/.idea/libraries/Maven__org_jsoup_jsoup_1_9_2.xml new file mode 100644 index 0000000..f38a2fc --- /dev/null +++ b/.idea/libraries/Maven__org_jsoup_jsoup_1_9_2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mongodb_bson_2_13_3.xml b/.idea/libraries/Maven__org_mongodb_bson_2_13_3.xml new file mode 100644 index 0000000..9eed368 --- /dev/null +++ b/.idea/libraries/Maven__org_mongodb_bson_2_13_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml b/.idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml new file mode 100644 index 0000000..d375251 --- /dev/null +++ b/.idea/libraries/Maven__org_mongodb_mongo_java_driver_2_13_3.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..aca9be3 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,13 @@ + + + + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..f83b4e0 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/MyCrawlerExample.iml b/MyCrawlerExample.iml new file mode 100644 index 0000000..f3b83fd --- /dev/null +++ b/MyCrawlerExample.iml @@ -0,0 +1,32 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file From 3b5e727daf179b1df2ea6c5f33f0222b5823e72b Mon Sep 17 00:00:00 2001 From: frog7431 Date: Thu, 3 May 2018 11:56:10 +0800 Subject: [PATCH 2/5] update BDSE07 0503 Homework "Google Ditections API Test" --- .../example/bdse07/ExamGoogleMapApi.java | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 src/crawler/example/bdse07/ExamGoogleMapApi.java diff --git a/src/crawler/example/bdse07/ExamGoogleMapApi.java b/src/crawler/example/bdse07/ExamGoogleMapApi.java new file mode 100644 index 0000000..7c96fda --- /dev/null +++ b/src/crawler/example/bdse07/ExamGoogleMapApi.java @@ -0,0 +1,30 @@ +package crawler.example.bdse07; + +import com.github.abola.crawler.CrawlerPack; +import org.apache.commons.logging.impl.SimpleLog; + +/** + * 練習題:請使用 Google direction API (導航)功能,完成API call,以及印出 distance + * + * 完成後同學請記得兩步動作上傳 + * 1. git > add + * 2. git > commit file (一定要記得PUSH!!) + */ +public class ExamGoogleMapApi { + + public static void main(String[] args) { + CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_OFF); + + // 遠端資料路徑 (可先在postman 完成查詢,再貼上) + String uri = "https://maps.googleapis.com/maps/api/directions/json?origin=25.091896,121.518145&destination=25.033509,121.543516&key=AIzaSyCE3rhrAg9_Nuxr1i-lfwTnbZ48ECkc-9c"; + + // 完成下方 select 部份的內容,使其可取得 distance 的內容 + String distance = + CrawlerPack.start() + .getFromJson(uri) + .select("legs > distance text") + .text(); + + System.out.println("result: " + distance); + } +} From bef598063a0b965e0fe88252238c867a5ce83158 Mon Sep 17 00:00:00 2001 From: frog7431 Date: Thu, 3 May 2018 16:32:34 +0800 Subject: [PATCH 3/5] update BDSE07 0503 Homework "Google Ditections API Test" --- .../youtube/FullExampleIntergrationToELK.java | 227 ++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 src/crawler/example/youtube/FullExampleIntergrationToELK.java diff --git a/src/crawler/example/youtube/FullExampleIntergrationToELK.java b/src/crawler/example/youtube/FullExampleIntergrationToELK.java new file mode 100644 index 0000000..ce4b70b --- /dev/null +++ b/src/crawler/example/youtube/FullExampleIntergrationToELK.java @@ -0,0 +1,227 @@ +package crawler.example.youtube; + +import com.github.abola.crawler.CrawlerPack; +import com.google.common.base.Joiner; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.Table; +import com.mashape.unirest.http.Unirest; +import org.apache.commons.logging.impl.SimpleLog; +import org.json.JSONObject; +import org.jsoup.nodes.Element; + +import java.util.*; + +/** + * 透過 userid 找出相關的 channels + */ +public class FullExampleIntergrationToELK { + + static String elasticHost = "localhost" ; + static String elasticPort = "9200" ; + static String elasticIndex = "youtube-pewdata"; // 請在後方加入帳號(ex: youtube-abola),務必全小寫字母 + static String elasticIndexType = "data"; // 範例請不要改這行 + + // 設定使用者ID或頻道ID任一 + String username = ""; + String channelId = "UCEf_Bc-KVd7onSeifS3py9g"; + String api_key = "AIzaSyCE3rhrAg9_Nuxr1i-lfwTnbZ48ECkc-9c"; + + // 使用 Guava 物件 Table 資料會像以下 + // | row | column | value| + // |-----|--------|------| + // | id | item1 | aaa | + // | id | item2 | bbb | + // | id | item3 | ccc | + Table videoTable; + + + public FullExampleIntergrationToELK() throws Exception{ + // 確認要查詢 channels 清單 + List channels = getChannels(); + + // 讀取 channels 的 videos + for(String channelId: channels ){ + getVideos(channelId); + } + + // 更新每一個影片的統計資料 + getVideoStatistics( videoTable.rowKeySet() ); + + + // 將資料寫入 Elasticsearch + for(String row: videoTable.rowKeySet()){ + String elasticJson = new JSONObject(videoTable.row(row)).toString(); + sendPost("http://" + elasticHost + ":" + elasticPort + + "/" + elasticIndex + "/" + elasticIndexType + , elasticJson); + } + } + + + + /** + * 讀取指定 username or channelId 所有的頻道清單 + * @return + */ + public List getChannels() throws Exception{ + List channels = new ArrayList<>() ; + + // 有指定 username,就用 username來找channels + if (!"".equals(username)){ + // 讀取指定 username 所有的頻道清單 + String uri = "https://www.googleapis.com/youtube/v3/channels?forUsername=" + username + "&part=snippet,id&key=" + api_key; + + for (Element elem : CrawlerPack.start().getFromJson(uri).select("items id")) { + //System.out.println(elem); + String channelId = elem.select("id").text(); + //String channelTitle = elem.select("title").text(); + channels.add(channelId); + + } + } + // 沒有指定username,就用指定的 channelId + else if(!"".equals(channelId)){ + channels.add(channelId); + } + else{ + throw new Exception("未輸入有效的username或channelId"); + } + + return channels; + } + + /** + * 取得指定CHANNEL的影片清單 + * @param channelId + * @return + */ + public void getVideos(String channelId){ + getVideos(channelId, ""); + } + + /** + * 取得指定CHANNEL的影片清單 + * @param channelId + * @return + */ + public void getVideos(String channelId, String pageToken){ + + // 首次進入建立TABLE物件 + if (null == videoTable) { + videoTable = HashBasedTable.create(); + } + + String uri = "https://www.googleapis.com/youtube/v3/search?channelId="+channelId+ + "&fields=items(id(videoId),snippet(title,channelTitle)),nextPageToken" + + "&part=snippet&order=date&maxResults=50&key="+api_key; + + // 如果有指定換頁指標 + if( !"".equals(pageToken) ){ + uri += "&pageToken=" + pageToken; + } + + Element results = null; + // 如果已達最後一頁,會因為最後一頁無資料,出現IndexOutOfBoundsException + try { + results = CrawlerPack.start().getFromJson(uri); + } + catch(java.lang.IndexOutOfBoundsException outBounds){ + return ; + } + + for (Element elem : results.select("items")) { + String videoId = elem.select("id").text(); + String title = elem.select("title").text(); + String channelTitle = elem.select("channelTitle").text(); + + // 空ID資料不處理 + if ("".equals(videoId)) continue; + + videoTable.put(videoId, "videoid", videoId); + videoTable.put(videoId, "title", title); + videoTable.put(videoId, "channelTitle", channelTitle); + + } + + + String nextPageToken = results.select("nextPageToken").text(); + if ( !"".equals(nextPageToken) ){ + // return + getVideos(channelId, nextPageToken); + } + } + + /** + * 查詢每一部影片的統計資料,50筆資料送一次REQUEST,加速處理 + * + * @param videos + */ + public void getVideoStatistics(Set videos){ + int idsLimitCounter = 50; + List ids = new ArrayList<>(); + // 取得 video 的統計資訊 + for(String videoId: videos){ + ids.add(videoId); + // 計數,累計至最大值才執行 + idsLimitCounter--; + if ( 0 >= idsLimitCounter ){ + // reset counter + idsLimitCounter = 50; + // Guava 指令:將集合物件使用指定的符號合併成一個字串 + getVideoStatistics( Joiner.on(",").join(ids) ); + ids = new ArrayList<>(); + } + } + if (0 < ids.size()) getVideoStatistics( Joiner.on(",").join(ids) ); + } + + /** + * 查詢指定ID(s)的統計資料,並回填至 TABLE + * @param ids + */ + public void getVideoStatistics(String ids){ + System.out.println(ids); + String uri = "https://www.googleapis.com/youtube/v3/videos?id="+ids+ + "&part=snippet,statistics&fields=items(id,snippet(publishedAt),statistics)"+ + "&key="+api_key; + + for (Element elem : CrawlerPack.start().getFromJson(uri).select("items")) { + String videoId = elem.select("id").text(); + String publishedAt = elem.select("publishedAt").text(); + String viewCount = elem.select("viewCount").text(); + String likeCount = elem.select("likeCount").text(); + String dislikeCount = elem.select("dislikeCount").text(); + String commentCount = elem.select("commentCount").text(); + + videoTable.put(videoId, "publishedAt", publishedAt); + videoTable.put(videoId, "viewCount", viewCount); + videoTable.put(videoId, "likeCount", likeCount); + videoTable.put(videoId, "dislikeCount", dislikeCount); + videoTable.put(videoId, "commentCount", commentCount); + } + } + + + String sendPost(String url, String body){ + try{ + return Unirest.post(url) + .header("content-type", "text/plain") + .header("cache-control", "no-cache") + .body(body) + .asString().getBody(); + + }catch(Exception e){return "Error:" + e.getMessage();} + } + + + public static void main(String[] args) { + CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_OFF); + + try { + new FullExampleIntergrationToELK(); + }catch(Exception ex){ + ex.printStackTrace(); +// System.out.println(ex.getMessage()); + } + } +} From 0c695ab450e62a0dcd966e11f80d8dad5fbfe778 Mon Sep 17 00:00:00 2001 From: frog7431 Date: Thu, 3 May 2018 16:33:21 +0800 Subject: [PATCH 4/5] update BDSE07 0503 "PttTestFile" --- src/crawler/example/PttExample.java | 76 ++++++++++++++++++++++++++ src/crawler/example/PttGetContent.java | 28 ++++++++++ 2 files changed, 104 insertions(+) create mode 100644 src/crawler/example/PttExample.java create mode 100644 src/crawler/example/PttGetContent.java diff --git a/src/crawler/example/PttExample.java b/src/crawler/example/PttExample.java new file mode 100644 index 0000000..3a0037b --- /dev/null +++ b/src/crawler/example/PttExample.java @@ -0,0 +1,76 @@ +package crawler.example; + +import com.github.abola.crawler.CrawlerPack; +import org.apache.commons.logging.impl.SimpleLog; +import org.jsoup.nodes.Document; + +/** + * 爬蟲包程式的全貌,就只有這固定的模式 + * + * @author Abola Lee + * + */ +public class PttExample { + // commit test test + public static void main(String[] args) { + + // set to debug level + //CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_DEBUG); + + // turn off logging + CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_OFF); + + // 遠端資料路徑 + String uri = "https://www.ptt.cc/bbs/Gossiping/M.1524893176.A.DED.html"; + + /* + System.out.println( + CrawlerPack.start() + // 參數設定 + .addCookie("over18","1") // 設定cookie + //.setRemoteEncoding("big5")// 設定遠端資料文件編碼 + + // 選擇資料格式 (三選一) + //.getFromJson(uri) + .getFromHtml(uri) + //.getFromXml(uri) + + // 這兒開始是 Jsoup Document 物件操作 + //.select(".article-meta-value") + //.select("#main-content div.push:contains(噓) .f3.push-content") + //.select("#main-content ") + ); + */ + + System.out.println( + CrawlerPack.start() + // 參數設定 + .addCookie("over18","1") // 設定cookie + //.setRemoteEncoding("big5")// 設定遠端資料文件編碼 + + // 選擇資料格式 (三選一) + //.getFromJson(uri) + .getFromHtml(uri) + //.getFromXml(uri) + + // 這兒開始是 Jsoup Document 物件操作 + .select(".main-content div").remove() + .select(".main-content span").remove() + .select("#main-content").text() + //.select("#main-content div.push:contains(噓) .f3.push-content") + //.select("#main-content ") + ); + + + + /* + //只取內容,刪除div、span作法 + Document jsoupObject = CrawlerPack.start().addCookie("over18", "1").getFromHtml(uri); + + jsoupObject.select("#main-content div").remove(); + jsoupObject.select("#main-content span").remove(); + + System.out.println( jsoupObject.select("#main-content").text()); + */ + } +} diff --git a/src/crawler/example/PttGetContent.java b/src/crawler/example/PttGetContent.java new file mode 100644 index 0000000..c24c9fd --- /dev/null +++ b/src/crawler/example/PttGetContent.java @@ -0,0 +1,28 @@ +package crawler.example; + +import com.github.abola.crawler.CrawlerPack; +import org.jsoup.nodes.Document; + + +/** + * 簡易練習 + * + * 找出所有文章中按推的id + * + * @author Abola Lee + * + */ +public class PttGetContent { + + public static void main(String[] args) { + String uri = "https://www.ptt.cc/bbs/Gossiping/M.1525278814.A.571.html"; + + Document jsoupObject = CrawlerPack.start().addCookie("over18", "1").getFromHtml(uri); + + jsoupObject.select("#main-content div").remove(); + jsoupObject.select("#main-content span").remove(); + + + System.out.println( jsoupObject.select("#main-content").text()); + } +} From 3efc1c2a3a182b1dfac992ef7b945c019eb9815e Mon Sep 17 00:00:00 2001 From: frog7431 Date: Thu, 3 May 2018 16:34:31 +0800 Subject: [PATCH 5/5] update BDSE07 0503 FileAdjusted --- src/crawler/example/BasicExample.java | 9 ++++----- src/crawler/example/HighwayStaticInfo.java | 2 +- src/crawler/example/RealPrice.java | 6 +++++- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/crawler/example/BasicExample.java b/src/crawler/example/BasicExample.java index cbf327f..d53bfa2 100644 --- a/src/crawler/example/BasicExample.java +++ b/src/crawler/example/BasicExample.java @@ -20,7 +20,7 @@ public static void main(String[] args) { CrawlerPack.setLoggerLevel(SimpleLog.LOG_LEVEL_OFF); // 遠端資料路徑 - String uri = "http://.../"; + String uri = "https://tw.yahoo.com/"; System.out.println( CrawlerPack.start() @@ -30,13 +30,12 @@ public static void main(String[] args) { //.setRemoteEncoding("big5")// 設定遠端資料文件編碼 // 選擇資料格式 (三選一) - .getFromJson(uri) - //.getFromHtml(uri) + //.getFromJson(uri) + .getFromHtml(uri) //.getFromXml(uri) // 這兒開始是 Jsoup Document 物件操作 - .select(".css .selector ") - + .select(".Va-tt").get(0) ); } } diff --git a/src/crawler/example/HighwayStaticInfo.java b/src/crawler/example/HighwayStaticInfo.java index a19c42c..e860d1c 100644 --- a/src/crawler/example/HighwayStaticInfo.java +++ b/src/crawler/example/HighwayStaticInfo.java @@ -13,7 +13,7 @@ public class HighwayStaticInfo { public static void main(String[] args) { // 遠端資料路徑 - String uri = "gz:http://tisvcloud.freeway.gov.tw/cms_value.xml.gz"; + String uri = "gz:http://tisvcloud.freeway.gov.tw/roadlevel_threshold.xml.gz"; System.out.println( CrawlerPack.start() diff --git a/src/crawler/example/RealPrice.java b/src/crawler/example/RealPrice.java index d98f0df..ad51c2f 100644 --- a/src/crawler/example/RealPrice.java +++ b/src/crawler/example/RealPrice.java @@ -17,11 +17,15 @@ */ public class RealPrice { public static void main(String[] args) { - + + /* String uri = "zip:http://plvr.land.moi.gov.tw" + "/Download?type=zip&fileName=lvr_landxml.zip" + "!/A_LVR_LAND_A.XML"; + */ + String uri = "zip:http://plvr.land.moi.gov.tw/Download?type=zip&fileName=lvr_landxml.zip" + + "!/B_LVR_LAND_A.XML"; Document jsoupDoc = CrawlerPack.start() .getFromXml(uri);