From a2fb9e65d393eaabd59545319173c9f9cb8bf379 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 13:42:59 -0400 Subject: [PATCH 01/12] add client stats and client cluster model --- models/client_clusters.sql | 36 +++++++++++++++ views/client_stats.sql | 95 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 models/client_clusters.sql create mode 100644 views/client_stats.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql new file mode 100644 index 0000000..44972e1 --- /dev/null +++ b/models/client_clusters.sql @@ -0,0 +1,36 @@ +# This is a kmeans clustering of clients, to discover the clusters of clients, +# by metro, based on time of day and day of the week. +# This will only be meaningful for clients that test a significant number of +# times over the interval of interest, so it is beneficial to use fairly +# large intervals. We will try 13 week intervals, to get consistent number +# of days of the week. + +# bq query --use_legacy_sql=false < models/client_clusters.sql + +CREATE OR REPLACE MODEL + `mlab-sandbox.gfr.client_clusters_model` OPTIONS(model_type='kmeans', + num_clusters=20) AS + +WITH linear AS ( +SELECT LOG10(tests) AS logTests, +days, hours, +sunday/tests AS sunday, +monday/tests AS monday, +tuesday/tests AS tuesday, +wednesday/tests AS wednesday, +thursday/tests AS thursday, +friday/tests AS friday, +saturday/tests AS saturday, +t00/tests AS t00, +t03/tests AS t03, +t06/tests AS t06, +t09/tests AS t09, +t12/tests AS t12, +t15/tests AS t15, +t18/tests AS t18, +t21/tests AS t21, +FROM `mlab-sandbox.gfr.client_stats` +WHERE tests > 10 --AND metro = "lga" +) + +SELECT * FROM linear \ No newline at end of file diff --git a/views/client_stats.sql b/views/client_stats.sql new file mode 100644 index 0000000..e2ae007 --- /dev/null +++ b/views/client_stats.sql @@ -0,0 +1,95 @@ +# Create a view that tracks the count of tests by hour and day of the week. 
+# Uses server latitude to adjust the time of day and day of week. +# TODO add stats for intertest interval + +# bq query --use_legacy_sql=false < views/client_stats.sql + +CREATE OR REPLACE VIEW +`mlab-sandbox.gfr.client_stats` +--PARTITION BY metro +OPTIONS(description = 'per metro client test stats by day of week and hour of the day') + -- enable_refresh = true) +AS + +# Select ndt7 downloads (for now) +# Since this is client characterization, we count uploads and downloads, and don''t +# care whether the tests are completely valid + +WITH tests AS ( + SELECT + date, ID, raw.ClientIP, a, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, + IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + server.Geo.Longitude, # TODO should this be client or server? + LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, + REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + IF(raw.Download IS NULL, false, true) AS isDownload, + IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, + FROM `measurement-lab.ndt.ndt7` +), + +# This join is quite expensive - about 3 slot hours for 2 months of data, even if the clientName field is never used. 
+add_client_name AS ( + SELECT tests.*, clientName + FROM tests LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName + FROM tests, tests.tmpClientMetadata + WHERE Name = "client_name") USING (date, ID) +), + +add_client_os AS ( + SELECT add_client_name.*, clientOS + FROM add_client_name LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS + FROM add_client_name, add_client_name.tmpClientMetadata + WHERE Name = "client_os") USING (date, ID) +), + +solar AS ( + SELECT *, + TIMESTAMP_ADD(startTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + FROM add_client_os +), + +day_hour AS ( + SELECT + # TODO correct for latitude. + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL))) AS meanMinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + COUNTIF(isDownload) AS downloads, + COUNT(*) AS tests + FROM solar + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) + GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 +) + + +SELECT + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SUM(downloads) AS downloads, + EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, + EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, + SUM(tests) AS tests, + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + 
SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, +FROM day_hour +GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 +HAVING tests > 5 From 26834b12001cec1da5e72cd0b9f9227e6a7b58e7 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 15:25:45 -0400 Subject: [PATCH 02/12] tweak clusters, add labelled view --- models/client_clusters.sql | 4 ++-- views/labelled_client_summary.sql | 35 +++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 views/labelled_client_summary.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 44972e1..8e72888 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,8 +8,8 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model` OPTIONS(model_type='kmeans', - num_clusters=20) AS + `mlab-sandbox.gfr.client_clusters_model30` OPTIONS(model_type='kmeans', + num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql new file mode 100644 index 0000000..9d59cea --- /dev/null +++ b/views/labelled_client_summary.sql @@ -0,0 +1,35 @@ +# Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
+ +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` +AS + +WITH labelled AS ( +SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, + (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, +days, hours, +sunday/tests AS sunday, +monday/tests AS monday, +tuesday/tests AS tuesday, +wednesday/tests AS wednesday, +thursday/tests AS thursday, +friday/tests AS friday, +saturday/tests AS saturday, +t00/tests AS t00, +t03/tests AS t03, +t06/tests AS t06, +t09/tests AS t09, +t12/tests AS t12, +t15/tests AS t15, +t18/tests AS t18, +t21/tests AS t21, +FROM `mlab-sandbox.gfr.client_stats` +WHERE tests > 10 +))) + +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, +ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, +ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, +ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, +ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, +FROM labelled +GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2 From 4c107a7c4aae1ac9eea5e7fdeb0a94d37f90d9a8 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 21:32:22 -0400 Subject: [PATCH 03/12] hacking interval --- models/client_clusters.sql | 7 +- views/client_stats.sql | 18 +++-- views/client_stats_interval.sql | 118 ++++++++++++++++++++++++++++++ views/labelled_client_summary.sql | 4 +- 4 files changed, 138 insertions(+), 9 deletions(-) create mode 100644 views/client_stats_interval.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 8e72888..bde67f0 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,12 +8,13 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - 
`mlab-sandbox.gfr.client_clusters_model30` OPTIONS(model_type='kmeans', - num_clusters=30) AS + `mlab-sandbox.gfr.client_clusters_model_intervals_20` OPTIONS(model_type='kmeans', + num_clusters=20) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, +downloadInterval, downloadIntervalVariability, sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -29,7 +30,7 @@ t12/tests AS t12, t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats` +FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 --AND metro = "lga" ) diff --git a/views/client_stats.sql b/views/client_stats.sql index e2ae007..52ab39a 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -1,14 +1,17 @@ # Create a view that tracks the count of tests by hour and day of the week. # Uses server latitude to adjust the time of day and day of week. +# Client id is based on IP address, clientName, clientOS, and wscale. # TODO add stats for intertest interval +# NOTES: +# Anything that takes less than a couple slot hours we can probably just +# do in gardener after processing incoming data each day. 
+ # bq query --use_legacy_sql=false < views/client_stats.sql CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` ---PARTITION BY metro -OPTIONS(description = 'per metro client test stats by day of week and hour of the day') - -- enable_refresh = true) +OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS # Select ndt7 downloads (for now) @@ -20,7 +23,8 @@ WITH tests AS ( date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, - IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + a.TestTime, + --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, @@ -48,8 +52,12 @@ add_client_os AS ( solar AS ( SELECT *, - TIMESTAMP_ADD(startTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, FROM add_client_os + WINDOW + sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 + ORDER BY a.TestTime) ), day_hour AS ( diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql new file mode 100644 index 0000000..26a0fc3 --- /dev/null +++ b/views/client_stats_interval.sql @@ -0,0 +1,118 @@ +# Create a view that tracks the count of tests by hour and day of the week. +# Uses server latitude to adjust the time of day and day of week. +# Client id is based on IP address, clientName, clientOS, and wscale. 
+# TODO add stats for intertest interval + +# NOTES: +# Anything that takes less than a couple slot hours we can probably just +# do in gardener after processing incoming data each day. + +# bq query --use_legacy_sql=false < views/client_stats_interval.sql + +CREATE OR REPLACE VIEW +`mlab-sandbox.gfr.client_stats_interval` +OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') +AS + +# Select ndt7 downloads (for now) +# Since this is client characterization, we count uploads and downloads, and don''t +# care whether the tests are completely valid + +WITH tests AS ( + SELECT + date, ID, raw.ClientIP, a, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, + a.TestTime, + --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + server.Geo.Longitude, # TODO should this be client or server? + LEFT(server.Site, 3) AS metro, --server.Site AS site, server.Machine AS machine, + --REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + IF(raw.Download IS NULL, false, true) AS isDownload, + IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, + FROM `measurement-lab.ndt.ndt7` +), + +# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. 
+add_client_name AS ( + SELECT tests.*, clientName + FROM tests LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName + FROM tests, tests.tmpClientMetadata + WHERE Name = "client_name") USING (date, ID) +), + +add_client_os AS ( + SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS + FROM add_client_name LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS + FROM add_client_name, add_client_name.tmpClientMetadata + WHERE Name = "client_os") USING (date, ID) +), + +solar AS ( + SELECT * EXCEPT(Longitude), + TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + FROM add_client_os + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) + WINDOW + sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 + ORDER BY a.TestTime) +), + + +add_intervals AS ( + SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + FROM solar + GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, + MeanThroughputMBPS, MinRTT + WINDOW + client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) +), + +day_hour_counts AS ( + SELECT + # TODO correct for latitude. 
+ metro, ClientIP, clientName, clientOS, wscale1, wscale2, + EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + COUNTIF(isDownload) AS downloads, + COUNT(*) AS tests, + ANY_VALUE(downloadInterval) AS downloadInterval, + ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, + FROM add_intervals + GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 +) + +SELECT + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SUM(downloads) AS downloads, + EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, + EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, + SUM(tests) AS tests, + ANY_VALUE(downloadInterval) AS downloadInterval, + ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, +FROM day_hour_counts +GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 +HAVING tests > 5 diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 9d59cea..537b46d 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,6 +1,7 @@ # 
Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. -CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` +CREATE MATERIALIZED VIEW + `mlab-sandbox.gfr.client_cluster_summaries` AS WITH labelled AS ( @@ -29,6 +30,7 @@ WHERE tests > 10 SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, +# TODO - is this STDDEV computation valid? ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, FROM labelled From af267aaa8737fed99cb6ea8c034e1116d9d4db16 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:06:20 -0400 Subject: [PATCH 04/12] add up/down balance --- models/client_clusters.sql | 9 +++++---- views/client_stats_interval.sql | 25 +++++++++++++++---------- views/labelled_client_summary.sql | 5 +++-- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index bde67f0..0816bbe 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,13 +8,14 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_intervals_20` OPTIONS(model_type='kmeans', - num_clusters=20) AS + `mlab-sandbox.gfr.client_clusters_model_intervals_30` OPTIONS(model_type='kmeans', + num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, -downloadInterval, downloadIntervalVariability, +downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) +duBalance, # balance between downloads (+1) and uploads (-1) sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS 
tuesday, @@ -31,7 +32,7 @@ t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 --AND metro = "lga" +WHERE tests > 10 ) SELECT * FROM linear \ No newline at end of file diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index 26a0fc3..ee93198 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -14,21 +14,19 @@ CREATE OR REPLACE VIEW OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS -# Select ndt7 downloads (for now) +# Select ALL ndt7 tests # Since this is client characterization, we count uploads and downloads, and don''t # care whether the tests are completely valid - WITH tests AS ( SELECT date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, a.TestTime, - --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, --server.Site AS site, server.Machine AS machine, - --REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + LEFT(server.Site, 3) AS metro, IF(raw.Download IS NULL, false, true) AS isDownload, + # This is used later for extracting the client metadata. IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, FROM `measurement-lab.ndt.ndt7` ), @@ -50,10 +48,12 @@ add_client_os AS ( WHERE Name = "client_os") USING (date, ID) ), +# This adds the solar time, which is more useful for global clustering than UTC time. 
solar AS ( SELECT * EXCEPT(Longitude), TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + # Compute the time, in seconds, since the previous test of the same type (upload or download) + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, FROM add_client_os WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) WINDOW @@ -61,7 +61,7 @@ solar AS ( ORDER BY a.TestTime) ), - +# This adds the inter-test interval mean and stdev, for downloads only, to ALL tests add_intervals AS ( SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, @@ -73,14 +73,19 @@ add_intervals AS ( client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) ), +# This is intended to identify each test by the hour of the day, and the day of the week. +# It is currently grouping by both, whereas it really should group by each independently. +# This is ok, as later we sum by day, and sum by 3 hour interval, but this means that there +# are 24 * 7 groupings here, instead of fewer. day_hour_counts AS ( SELECT # TODO correct for latitude. metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, # Downloads only. 
EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, COUNTIF(isDownload) AS downloads, + COUNTIF(NOT isDownload) AS uploads, COUNT(*) AS tests, ANY_VALUE(downloadInterval) AS downloadInterval, ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, @@ -90,10 +95,10 @@ day_hour_counts AS ( SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, + SUM(downloads) AS downloads, SUM(uploads) AS uploads, SUM(tests) AS tests, + SUM(downloads-uploads)/SUM(downloads+uploads) AS duRatio, EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - SUM(tests) AS tests, ANY_VALUE(downloadInterval) AS downloadInterval, ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, COUNT(DISTINCT day) AS days, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 537b46d..5646370 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,6 +1,6 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
-CREATE MATERIALIZED VIEW +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` AS @@ -8,6 +8,7 @@ WITH labelled AS ( SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, days, hours, +downloadInterval, downloadIntervalVariability, sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -23,7 +24,7 @@ t12/tests AS t12, t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats` +FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 ))) From 56cc8ff657c979549f0e19b621ac0a38273da33a Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:18:24 -0400 Subject: [PATCH 05/12] cleaning up client_stats - broken --- views/client_stats_interval.sql | 74 ++++++++++++++++----------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index ee93198..d16313c 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -10,7 +10,7 @@ # bq query --use_legacy_sql=false < views/client_stats_interval.sql CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats_interval` +`mlab-sandbox.gfr.client_stats_2` OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS @@ -62,10 +62,17 @@ solar AS ( ), # This adds the inter-test interval mean and stdev, for downloads only, to ALL tests -add_intervals AS ( - SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, - AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, +some_client_stats AS ( + SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + STRUCT( + COUNT(*) AS tests, + COUNTIF(isDownload) AS downloads, + COUNTIF(NOT 
isDownload) AS uploads, + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT, # Downloads only. + AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability + ) AS client_stats, FROM solar GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, MeanThroughputMBPS, MinRTT @@ -79,45 +86,36 @@ add_intervals AS ( # are 24 * 7 groupings here, instead of fewer. day_hour_counts AS ( SELECT - # TODO correct for latitude. metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - COUNTIF(isDownload) AS downloads, - COUNTIF(NOT isDownload) AS uploads, COUNT(*) AS tests, - ANY_VALUE(downloadInterval) AS downloadInterval, - ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, - FROM add_intervals + ANY_VALUE(client_stats) AS client_stats, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + FROM some_client_stats GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 ) SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, SUM(uploads) AS uploads, SUM(tests) AS tests, - SUM(downloads-uploads)/SUM(downloads+uploads) AS duRatio, - EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, - EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - ANY_VALUE(downloadInterval) AS downloadInterval, - ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, - COUNT(DISTINCT day) AS days, - 
COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, + ANY_VALUE(client_stats) AS client_stats, + STRUCT( + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21 + ) AS timing_stats FROM day_hour_counts GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING tests > 5 +HAVING client_stats.tests > 5 From 5cfa1d8ec8dd7abcbdbd05da855707d86dcdd6d9 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:37:14 -0400 Subject: [PATCH 06/12] client stats cleanup - broken --- views/client_stats_interval.sql | 73 +++++++++++++++------------------ 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/views/client_stats_interval.sql 
b/views/client_stats_interval.sql index d16313c..ef5c1fb 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -61,61 +61,54 @@ solar AS ( ORDER BY a.TestTime) ), +# This adds all of the client aggregations. # This adds the inter-test interval mean and stdev, for downloads only, to ALL tests some_client_stats AS ( SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, STRUCT( + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT # Downloads only. + ) AS performance_stats, + STRUCT ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT, # Downloads only. 
+ (COUNTIF(isDownload) OVER client_win - COUNTIF(NOT isDownload) OVER client_win)/COUNT(*) OVER client_win AS duBalance, AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability - ) AS client_stats, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1) OVER client_win AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2) OVER client_win AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3) OVER client_win AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4) OVER client_win AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5) OVER client_win AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6) OVER client_win AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7) OVER client_win AS saturday, + + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2) OVER client_win AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5) OVER client_win AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8) OVER client_win AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10) OVER client_win AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14) OVER client_win AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17) OVER client_win AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20) OVER client_win AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23) OVER client_win AS t21 + ) AS training_stats FROM solar GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, MeanThroughputMBPS, MinRTT WINDOW client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) -), - -# This 
is intended to identify each test by the hour of the day, and the day of the week. -# It is currently grouping by both, whereas it really should group by each independently. -# This is ok, as later we sum by day, and sum by 3 hour interval, but this means that there -# are 24 * 7 groupings here, instead of fewer. -day_hour_counts AS ( - SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - COUNT(*) AS tests, - ANY_VALUE(client_stats) AS client_stats, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - FROM some_client_stats - GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 ) SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - ANY_VALUE(client_stats) AS client_stats, - STRUCT( - COUNT(DISTINCT day) AS days, - COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21 - ) AS timing_stats -FROM day_hour_counts + ANY_VALUE(performance_stats) AS performance_stats, + ANY_VALUE(training_stats) AS training_stats, +FROM some_client_stats GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING client_stats.tests > 5 +HAVING training_stats.tests > 5 From 48f206fe3e3a7a5328adbbf4ba5567ce3caf12e2 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 09:44:57 -0400 Subject: [PATCH 07/12] structs for stats, working --- 
models/client_clusters.sql | 14 ++++++-- views/client_stats_interval.sql | 64 +++++++++++++++------------------ 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 0816bbe..36465ac 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,14 +8,14 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_intervals_30` OPTIONS(model_type='kmeans', + `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) -duBalance, # balance between downloads (+1) and uploads (-1) +--duBalance, # balance between downloads (+1) and uploads (-1) sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -33,6 +33,14 @@ t18/tests AS t18, t21/tests AS t21, FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 +), + +alternate AS ( + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + training_stats.* EXCEPT(uploads, downloads, days, hours), + SAFE.LOG10(training_stats.tests) AS logTests, + FROM `mlab-sandbox.gfr.client_stats_2` + WHERE training_stats.tests > 10 ) -SELECT * FROM linear \ No newline at end of file +SELECT * FROM alternate \ No newline at end of file diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index ef5c1fb..e4107d7 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -62,53 +62,45 @@ solar AS ( ), # This adds all of the client aggregations. 
-# This adds the inter-test interval mean and stdev, for downloads only, to ALL tests some_client_stats AS ( - SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, STRUCT( - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. ) AS performance_stats, STRUCT ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - (COUNTIF(isDownload) OVER client_win - COUNTIF(NOT isDownload) OVER client_win)/COUNT(*) OVER client_win AS duBalance, - AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + (COUNTIF(isDownload) - COUNTIF(NOT isDownload))/COUNT(*) AS duBalance, - COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, - COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + # These characterize how often the client runs download tests, and how variable that is. 
+ AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL))/AVG(IF(isDownload,testInterval,NULL)) AS downloadIntervalVariability, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1) OVER client_win AS sunday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2) OVER client_win AS monday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3) OVER client_win AS tuesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4) OVER client_win AS wednesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5) OVER client_win AS thursday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6) OVER client_win AS friday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7) OVER client_win AS saturday, + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2) OVER client_win AS t00, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5) OVER client_win AS t03, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8) OVER client_win AS t06, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10) OVER client_win AS t09, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14) OVER client_win AS t12, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17) OVER client_win AS t15, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20) OVER client_win AS t18, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23) OVER client_win AS t21 + COUNTIF(EXTRACT(HOUR 
FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10)/COUNT(*) AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 ) AS training_stats FROM solar - GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, - MeanThroughputMBPS, MinRTT - WINDOW - client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) + GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 + HAVING training_stats.tests > 5 ) -SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - ANY_VALUE(performance_stats) AS performance_stats, - ANY_VALUE(training_stats) AS training_stats, -FROM some_client_stats -GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING training_stats.tests > 5 +SELECT * FROM some_client_stats From 6b662337f59ca1fa504f409a4da833d81f5b1ff0 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 11:31:46 -0400 Subject: [PATCH 08/12] various fixes for alt_30 --- models/client_clusters.sql | 2 +- views/client_stats_interval.sql | 2 +- views/labelled_client_summary.sql | 57 ++++++++++++++----------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 36465ac..f1a69af 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -36,7 +36,7 @@ WHERE tests > 10 ), alternate AS ( - SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SELECT --metro, ClientIP, clientName, clientOS, wscale1, 
wscale2, training_stats.* EXCEPT(uploads, downloads, days, hours), SAFE.LOG10(training_stats.tests) AS logTests, FROM `mlab-sandbox.gfr.client_stats_2` diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index e4107d7..823fe55 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -76,7 +76,7 @@ some_client_stats AS ( # These characterize how often the client runs download tests, and how variable that is. AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL))/AVG(IF(isDownload,testInterval,NULL)) AS downloadIntervalVariability, + SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 5646370..7c41b68 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,38 +1,33 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
CREATE OR REPLACE VIEW - `mlab-sandbox.gfr.client_cluster_summaries` + `mlab-sandbox.gfr.client_cluster_summaries_30` AS -WITH labelled AS ( -SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, - (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, -days, hours, -downloadInterval, downloadIntervalVariability, -sunday/tests AS sunday, -monday/tests AS monday, -tuesday/tests AS tuesday, -wednesday/tests AS wednesday, -thursday/tests AS thursday, -friday/tests AS friday, -saturday/tests AS saturday, -t00/tests AS t00, -t03/tests AS t03, -t06/tests AS t06, -t09/tests AS t09, -t12/tests AS t12, -t15/tests AS t15, -t18/tests AS t18, -t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 -))) +WITH +alternate AS ( +SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`, + (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*, + training_stats.*, + SAFE.LOG10(training_stats.tests) AS logTests, + FROM `mlab-sandbox.gfr.client_stats_2` + WHERE training_stats.tests > 10)) +) -SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, -ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, -ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, -# TODO - is this STDDEV computation valid? 
-ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, -ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, -FROM labelled +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, + SUM(tests*duBalance)/SUM(tests) AS duBalance, + SUM(tests*sunday)/SUM(tests) AS sunday, + SUM(tests*monday)/SUM(tests) AS monday, + SUM(tests*tuesday)/SUM(tests) AS tuesday, + SUM(tests*wednesday)/SUM(tests) AS wednesday, + SUM(tests*thursday)/SUM(tests) AS thursday, + SUM(tests*friday)/SUM(tests) AS friday, + SUM(tests*saturday)/SUM(tests) AS saturday, + ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, + ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, + # TODO - is this STDDEV computation valid? + ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, + ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, + FROM alternate GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2 + From e3d459a2ca81f462e331d14398f768c2097a93da Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 11:37:50 -0400 Subject: [PATCH 09/12] tweak weekend/weekday --- views/labelled_client_summary.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 7c41b68..c71b2c5 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -14,7 +14,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
WHERE training_stats.tests > 10)) ) -SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, SUM(tests*duBalance)/SUM(tests) AS duBalance, SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, From f375277abd1ee504b09522f9680947145929f311 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 17:34:37 -0400 Subject: [PATCH 10/12] change to dlFraction --- views/client_stats_interval.sql | 2 +- views/labelled_client_summary.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index 823fe55..14a2611 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -72,7 +72,7 @@ some_client_stats AS ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - (COUNTIF(isDownload) - COUNTIF(NOT isDownload))/COUNT(*) AS duBalance, + COUNTIF(isDownload)/COUNT(*) AS dlFraction, # These characterize how often the client runs download tests, and how variable that is. AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index c71b2c5..d2bc27e 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -15,7 +15,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*duBalance)/SUM(tests) AS duBalance, + SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday, From 899612f6d3a3c5c877f5de56c7f3e839869feb4f Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 17:55:16 -0400 Subject: [PATCH 11/12] cleanup --- models/client_clusters.sql | 32 ++------- views/client_stats.sql | 111 +++++++++++++++--------------- views/client_stats_interval.sql | 106 ---------------------------- views/labelled_client_summary.sql | 8 +-- 4 files changed, 63 insertions(+), 194 deletions(-) delete mode 100644 views/client_stats_interval.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index f1a69af..198097f 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -1,5 +1,5 @@ # This is a kmeans clustering of clients, to discover the clusters of clients, -# by metro, based on time of day and day of the week. +# by metro, based on time of day and day of the week, and other testing behaviors. # This will only be meaningful for clients that test a significant number of # times over the interval of interest, so it is beneficial to use fairly # large intervals. 
We will try 13 week intervals, to get consistent number @@ -11,36 +11,12 @@ CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', num_clusters=30) AS -WITH linear AS ( -SELECT LOG10(tests) AS logTests, -days, hours, -downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) ---duBalance, # balance between downloads (+1) and uploads (-1) -sunday/tests AS sunday, -monday/tests AS monday, -tuesday/tests AS tuesday, -wednesday/tests AS wednesday, -thursday/tests AS thursday, -friday/tests AS friday, -saturday/tests AS saturday, -t00/tests AS t00, -t03/tests AS t03, -t06/tests AS t06, -t09/tests AS t09, -t12/tests AS t12, -t15/tests AS t15, -t18/tests AS t18, -t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 -), - alternate AS ( - SELECT --metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SELECT training_stats.* EXCEPT(uploads, downloads, days, hours), SAFE.LOG10(training_stats.tests) AS logTests, - FROM `mlab-sandbox.gfr.client_stats_2` - WHERE training_stats.tests > 10 + FROM `mlab-sandbox.gfr.client_stats` + WHERE training_stats.tests > 5 ) SELECT * FROM alternate \ No newline at end of file diff --git a/views/client_stats.sql b/views/client_stats.sql index 52ab39a..834579b 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -1,39 +1,33 @@ -# Create a view that tracks the count of tests by hour and day of the week. +# Create a view that tracks the count of tests by hour and day of the week, +# and some other stats that might be useful for clustering. # Uses server latitude to adjust the time of day and day of week. # Client id is based on IP address, clientName, clientOS, and wscale. -# TODO add stats for intertest interval -# NOTES: -# Anything that takes less than a couple slot hours we can probably just -# do in gardener after processing incoming data each day. 
- -# bq query --use_legacy_sql=false < views/client_stats.sql +# bq query --use_legacy_sql=false < views/client_stats_interval.sql CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` -OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') +OPTIONS(description = 'per metro client test stats') AS -# Select ndt7 downloads (for now) +# Select ALL ndt7 tests # Since this is client characterization, we count uploads and downloads, and don''t # care whether the tests are completely valid - WITH tests AS ( SELECT date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, a.TestTime, - --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, - REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + LEFT(server.Site, 3) AS metro, IF(raw.Download IS NULL, false, true) AS isDownload, + # This is used later for extracting the client metadata. IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, FROM `measurement-lab.ndt.ndt7` ), -# This join is quite expensive - about 3 slot hours for 2 months of data, even if the clientName field is never used. +# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. 
add_client_name AS ( SELECT tests.*, clientName FROM tests LEFT JOIN ( @@ -43,61 +37,66 @@ add_client_name AS ( ), add_client_os AS ( - SELECT add_client_name.*, clientOS + SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS FROM add_client_name LEFT JOIN ( SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS FROM add_client_name, add_client_name.tmpClientMetadata WHERE Name = "client_os") USING (date, ID) ), +# This adds the solar time, which is more useful for global clustering than UTC time. solar AS ( - SELECT *, + SELECT * EXCEPT(Longitude), TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + # Compute the time, in seconds, since the previous test of the same type (upload or download) + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, FROM add_client_os + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) WINDOW sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 ORDER BY a.TestTime) ), -day_hour AS ( - SELECT - # TODO correct for latitude. - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL))) AS meanSpeed, - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL))) AS meanMinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - COUNTIF(isDownload) AS downloads, - COUNT(*) AS tests - FROM solar - WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 -) +# This adds all of the client aggregations. 
+client_stats AS ( + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + STRUCT( + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. + ) AS performance_stats, + STRUCT ( + COUNT(*) AS tests, + COUNTIF(isDownload) AS downloads, + COUNTIF(NOT isDownload) AS uploads, + COUNTIF(isDownload)/COUNT(*) AS dlFraction, + # These characterize how often the client runs download tests, and how variable that is. + AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, + SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, + + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, + + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 11)/COUNT(*) AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 + ) AS training_stats + FROM solar + GROUP BY metro, ClientIP, 
clientName, clientOS, wscale1, wscale2 + HAVING training_stats.tests > 5 # only bother with this for clients that have more than 5 tests +) -SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, - EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, - EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - SUM(tests) AS tests, - COUNT(DISTINCT day) AS days, - COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, -FROM day_hour -GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING tests > 5 +SELECT * FROM client_stats diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql deleted file mode 100644 index 14a2611..0000000 --- a/views/client_stats_interval.sql +++ /dev/null @@ -1,106 +0,0 @@ -# Create a view that tracks the count of tests by hour and day of the week. -# Uses server latitude to adjust the time of day and day of week. -# Client id is based on IP address, clientName, clientOS, and wscale. -# TODO add stats for intertest interval - -# NOTES: -# Anything that takes less than a couple slot hours we can probably just -# do in gardener after processing incoming data each day. 
- -# bq query --use_legacy_sql=false < views/client_stats_interval.sql - -CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats_2` -OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') -AS - -# Select ALL ndt7 tests -# Since this is client characterization, we count uploads and downloads, and don''t -# care whether the tests are completely valid -WITH tests AS ( - SELECT - date, ID, raw.ClientIP, a, - IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, - IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, - a.TestTime, - server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, - IF(raw.Download IS NULL, false, true) AS isDownload, - # This is used later for extracting the client metadata. - IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, - FROM `measurement-lab.ndt.ndt7` -), - -# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. -add_client_name AS ( - SELECT tests.*, clientName - FROM tests LEFT JOIN ( - SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName - FROM tests, tests.tmpClientMetadata - WHERE Name = "client_name") USING (date, ID) -), - -add_client_os AS ( - SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS - FROM add_client_name LEFT JOIN ( - SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS - FROM add_client_name, add_client_name.tmpClientMetadata - WHERE Name = "client_os") USING (date, ID) -), - -# This adds the solar time, which is more useful for global clustering than UTC time. 
-solar AS ( - SELECT * EXCEPT(Longitude), - TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - # Compute the time, in seconds, since the previous test of the same type (upload or download) - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, - FROM add_client_os - WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - WINDOW - sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 - ORDER BY a.TestTime) -), - -# This adds all of the client aggregations. -some_client_stats AS ( - SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - STRUCT( - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. - ) AS performance_stats, - STRUCT ( - COUNT(*) AS tests, - COUNTIF(isDownload) AS downloads, - COUNTIF(NOT isDownload) AS uploads, - COUNTIF(isDownload)/COUNT(*) AS dlFraction, - - # These characterize how often the client runs download tests, and how variable that is. 
- AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, - SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, - - COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, - COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, - - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, - - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10)/COUNT(*) AS t09, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 - ) AS training_stats - FROM solar - GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 - HAVING training_stats.tests > 5 -) - -SELECT * FROM some_client_stats diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index d2bc27e..0422f61 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -10,12 +10,12 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
(SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*, training_stats.*, SAFE.LOG10(training_stats.tests) AS logTests, - FROM `mlab-sandbox.gfr.client_stats_2` + FROM `mlab-sandbox.gfr.client_stats` WHERE training_stats.tests > 10)) ) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads + SUM(tests*duBalance)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday, @@ -23,9 +23,9 @@ SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS c SUM(tests*thursday)/SUM(tests) AS thursday, SUM(tests*friday)/SUM(tests) AS friday, SUM(tests*saturday)/SUM(tests) AS saturday, - ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, + # Use mean speed per client, so that each client contributes equal weight to the average. ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, - # TODO - is this STDDEV computation valid? + # Speed deviation across clients in a cluster. TODO - is this STDDEV computation valid? 
ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, FROM alternate From a3a44605bead044039ccce4296bca7a6afecfbd9 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 20 Oct 2021 09:39:26 -0400 Subject: [PATCH 12/12] minor formatting tweaks, dlFraction --- models/client_clusters.sql | 5 ++--- views/client_stats.sql | 3 +-- views/labelled_client_summary.sql | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 198097f..2f14946 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -7,9 +7,8 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql -CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', - num_clusters=30) AS +CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30` +  OPTIONS(model_type='kmeans', num_clusters=30) AS WITH alternate AS ( SELECT diff --git a/views/client_stats.sql b/views/client_stats.sql index 834579b..1d9cdc8 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -5,8 +5,7 @@ # bq query --use_legacy_sql=false < views/client_stats_interval.sql -CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats` +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` OPTIONS(description = 'per metro client test stats') AS diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 0422f61..317ddf8 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,7 +1,6 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
-CREATE OR REPLACE VIEW - `mlab-sandbox.gfr.client_cluster_summaries_30` +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries_30` AS WITH @@ -15,7 +14,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. ) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*duBalance)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads + SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday,