21 changes: 21 additions & 0 deletions models/client_clusters.sql
@@ -0,0 +1,21 @@
# K-means clustering of clients, by metro, based on time of day, day of the
# week, and other testing behaviors. The clusters are only meaningful for
# clients that test a significant number of times over the interval of
# interest, so fairly large intervals are beneficial. We use 13-week
# intervals so that each day of the week occurs the same number of times.

# bq query --use_legacy_sql=false < models/client_clusters.sql

CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`
OPTIONS(model_type='kmeans', num_clusters=30) AS

WITH alternate AS (
SELECT
training_stats.* EXCEPT(uploads, downloads, days, hours),
SAFE.LOG10(training_stats.tests) AS logTests,
FROM `mlab-sandbox.gfr.client_stats`
WHERE training_stats.tests > 5
)

SELECT * FROM alternate
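
After training, the model can be sanity-checked directly in BigQuery. A minimal sketch (illustrative, not part of this change) that lists the per-feature centroid values and the standard k-means evaluation metrics:

# List each centroid's learned feature values.
SELECT centroid_id, feature, ROUND(numerical_value, 3) AS value
FROM ML.CENTROIDS(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`)
ORDER BY centroid_id, feature;

# Report clustering quality (Davies-Bouldin index, mean squared distance).
SELECT * FROM ML.EVALUATE(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`);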
101 changes: 101 additions & 0 deletions views/client_stats.sql
@@ -0,0 +1,101 @@
# Create a view that tracks the count of tests by hour and day of the week,
# and some other stats that might be useful for clustering.
# Uses server longitude to adjust the time of day and day of the week.
# Client id is based on IP address, clientName, clientOS, and wscale.

# bq query --use_legacy_sql=false < views/client_stats.sql

CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats`
OPTIONS(description = 'per metro client test stats')
AS

# Select ALL ndt7 tests.
# Since this is client characterization, we count uploads and downloads, and
# don't care whether the tests are completely valid.
WITH tests AS (
SELECT
date, ID, raw.ClientIP, a,
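# The raw WScale byte packs two 4-bit TCP window-scale values; in the Linux
# tcp_info layout (an assumption here) the send scale is the low nibble and
# the receive scale the high nibble. Both are kept only as a client fingerprint.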
IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1,
IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2,
a.TestTime,
server.Geo.Longitude, # TODO should this be client or server?
LEFT(server.Site, 3) AS metro,
raw.Download IS NOT NULL AS isDownload,
# This is used later for extracting the client metadata.
IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData,
FROM `measurement-lab.ndt.ndt7`
),

# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used.
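# Each join unnests the repeated ClientMetadata array to pull out one named
# value, then rejoins it to the test rows on (date, ID).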
add_client_name AS (
SELECT tests.*, clientName
FROM tests LEFT JOIN (
SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName
FROM tests, tests.tmpClientMetadata
WHERE Name = "client_name") USING (date, ID)
),

add_client_os AS (
SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS
FROM add_client_name LEFT JOIN (
SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS
FROM add_client_name, add_client_name.tmpClientMetadata
WHERE Name = "client_os") USING (date, ID)
),

# This adds the solar time, which is more useful for global clustering than UTC time.
solar AS (
SELECT * EXCEPT(Longitude),
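# Local mean solar time is UTC plus 4 minutes per degree of east longitude
# (60/15 minutes per degree), so the offset sign follows the longitude sign.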
TIMESTAMP_ADD(testTime, INTERVAL CAST(60*Longitude/15 AS INT) MINUTE) AS solarTime,
# Compute the time, in seconds, since the previous test of the same type (upload or download)
TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval,
FROM add_client_os
WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
WINDOW
sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2
ORDER BY a.TestTime)
),

# This adds all of the client aggregations.
client_stats AS (
SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2,
STRUCT(
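# EXP(AVG(LN(x))) is the geometric mean, which is much less sensitive than
# the arithmetic mean to the heavy right tail of speed and RTT samples.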
EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only.
EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only.
) AS performance_stats,
STRUCT (
COUNT(*) AS tests,
COUNTIF(isDownload) AS downloads,
COUNTIF(NOT isDownload) AS uploads,
COUNTIF(isDownload)/COUNT(*) AS dlFraction,

# These characterize how often the client runs download tests, and how
# variable that interval is (STDDEV/AVG is the coefficient of variation).
AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval,
SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability,

COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days,
COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours,

COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday,

COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 11)/COUNT(*) AS t09,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21
) AS training_stats
FROM solar
GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2
HAVING training_stats.tests > 5 # only bother with this for clients that have more than 5 tests
)

SELECT * FROM client_stats
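
Before training against the view, it can be spot-checked with a small aggregate query. A sketch (illustrative only): client counts and the median tests-per-client, by metro:

# APPROX_QUANTILES(x, 2) returns [min, median, max]; OFFSET(1) is the median.
SELECT metro, COUNT(*) AS clients,
  APPROX_QUANTILES(training_stats.tests, 2)[OFFSET(1)] AS medianTests
FROM `mlab-sandbox.gfr.client_stats`
GROUP BY metro
ORDER BY clients DESC;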
32 changes: 32 additions & 0 deletions views/labelled_client_summary.sql
@@ -0,0 +1,32 @@
# Uses the clusters from client_clusters_model_alt_30 to label client groups,
# and computes aggregate group stats.

CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries_30`
AS

WITH
alternate AS (
SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`,
(SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*,
training_stats.*,
SAFE.LOG10(training_stats.tests) AS logTests,
FROM `mlab-sandbox.gfr.client_stats`
WHERE training_stats.tests > 10))
)

SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads,
SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads
SUM(tests*sunday)/SUM(tests) AS sunday,
SUM(tests*monday)/SUM(tests) AS monday,
SUM(tests*tuesday)/SUM(tests) AS tuesday,
SUM(tests*wednesday)/SUM(tests) AS wednesday,
SUM(tests*thursday)/SUM(tests) AS thursday,
SUM(tests*friday)/SUM(tests) AS friday,
SUM(tests*saturday)/SUM(tests) AS saturday,
# Use mean speed per client, so that each client contributes equal weight to the average.
ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed,
# Speed deviation across clients in a cluster. TODO - is this STDDEV computation valid?
ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev,
ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT,
FROM alternate
GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2
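
An example query against the summary view (a sketch; 'lga' is just an illustrative metro):

SELECT CENTROID_ID, clientName, clientOS, clients, tests,
  debiasedSpeed, speedDev, debiasedMinRTT
FROM `mlab-sandbox.gfr.client_cluster_summaries_30`
WHERE metro = 'lga'
ORDER BY clients DESC
LIMIT 10;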