From a2fb9e65d393eaabd59545319173c9f9cb8bf379 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 13:42:59 -0400 Subject: [PATCH 01/12] add client stats and client cluster model --- models/client_clusters.sql | 36 +++++++++++++++ views/client_stats.sql | 95 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 131 insertions(+) create mode 100644 models/client_clusters.sql create mode 100644 views/client_stats.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql new file mode 100644 index 0000000..44972e1 --- /dev/null +++ b/models/client_clusters.sql @@ -0,0 +1,36 @@ +# This is a kmeans clustering of clients, to discover the clusters of clients, +# by metro, based on time of day and day of the week. +# This will only be meaningful for clients that test a significant number of +# times over the interval of interest, so it is beneficial to use fairly +# large intervals. We will try 13 week intervals, to get consistent number +# of days of the week. + +# bq query --use_legacy_sql=false < models/client_clusters.sql + +CREATE OR REPLACE MODEL + `mlab-sandbox.gfr.client_clusters_model` OPTIONS(model_type='kmeans', + num_clusters=20) AS + +WITH linear AS ( +SELECT LOG10(tests) AS logTests, +days, hours, +sunday/tests AS sunday, +monday/tests AS monday, +tuesday/tests AS tuesday, +wednesday/tests AS wednesday, +thursday/tests AS thursday, +friday/tests AS friday, +saturday/tests AS saturday, +t00/tests AS t00, +t03/tests AS t03, +t06/tests AS t06, +t09/tests AS t09, +t12/tests AS t12, +t15/tests AS t15, +t18/tests AS t18, +t21/tests AS t21, +FROM `mlab-sandbox.gfr.client_stats` +WHERE tests > 10 --AND metro = "lga" +) + +SELECT * FROM linear \ No newline at end of file diff --git a/views/client_stats.sql b/views/client_stats.sql new file mode 100644 index 0000000..e2ae007 --- /dev/null +++ b/views/client_stats.sql @@ -0,0 +1,95 @@ +# Create a view that tracks the count of tests by hour and day of the week. 
+# Uses server latitude to adjust the time of day and day of week. +# TODO add stats for intertest interval + +# bq query --use_legacy_sql=false < views/client_stats.sql + +CREATE OR REPLACE VIEW +`mlab-sandbox.gfr.client_stats` +--PARTITION BY metro +OPTIONS(description = 'per metro client test stats by day of week and hour of the day') + -- enable_refresh = true) +AS + +# Select ndt7 downloads (for now) +# Since this is client characterization, we count uploads and downloads, and don''t +# care whether the tests are completely valid + +WITH tests AS ( + SELECT + date, ID, raw.ClientIP, a, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, + IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + server.Geo.Longitude, # TODO should this be client or server? + LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, + REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + IF(raw.Download IS NULL, false, true) AS isDownload, + IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, + FROM `measurement-lab.ndt.ndt7` +), + +# This join is quite expensive - about 3 slot hours for 2 months of data, even if the clientName field is never used. 
+add_client_name AS ( + SELECT tests.*, clientName + FROM tests LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName + FROM tests, tests.tmpClientMetadata + WHERE Name = "client_name") USING (date, ID) +), + +add_client_os AS ( + SELECT add_client_name.*, clientOS + FROM add_client_name LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS + FROM add_client_name, add_client_name.tmpClientMetadata + WHERE Name = "client_os") USING (date, ID) +), + +solar AS ( + SELECT *, + TIMESTAMP_ADD(startTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + FROM add_client_os +), + +day_hour AS ( + SELECT + # TODO correct for latitude. + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL))) AS meanMinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + COUNTIF(isDownload) AS downloads, + COUNT(*) AS tests + FROM solar + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) + GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 +) + + +SELECT + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SUM(downloads) AS downloads, + EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, + EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, + SUM(tests) AS tests, + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + 
SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, +FROM day_hour +GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 +HAVING tests > 5 From 26834b12001cec1da5e72cd0b9f9227e6a7b58e7 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 15:25:45 -0400 Subject: [PATCH 02/12] tweak clusters, add labelled view --- models/client_clusters.sql | 4 ++-- views/labelled_client_summary.sql | 35 +++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 views/labelled_client_summary.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 44972e1..8e72888 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,8 +8,8 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model` OPTIONS(model_type='kmeans', - num_clusters=20) AS + `mlab-sandbox.gfr.client_clusters_model30` OPTIONS(model_type='kmeans', + num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql new file mode 100644 index 0000000..9d59cea --- /dev/null +++ b/views/labelled_client_summary.sql @@ -0,0 +1,35 @@ +# Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
+ +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` +AS + +WITH labelled AS ( +SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, + (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, +days, hours, +sunday/tests AS sunday, +monday/tests AS monday, +tuesday/tests AS tuesday, +wednesday/tests AS wednesday, +thursday/tests AS thursday, +friday/tests AS friday, +saturday/tests AS saturday, +t00/tests AS t00, +t03/tests AS t03, +t06/tests AS t06, +t09/tests AS t09, +t12/tests AS t12, +t15/tests AS t15, +t18/tests AS t18, +t21/tests AS t21, +FROM `mlab-sandbox.gfr.client_stats` +WHERE tests > 10 +))) + +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, +ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, +ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, +ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, +ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, +FROM labelled +GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2 From 4c107a7c4aae1ac9eea5e7fdeb0a94d37f90d9a8 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 22 Sep 2021 21:32:22 -0400 Subject: [PATCH 03/12] hacking interval --- models/client_clusters.sql | 7 +- views/client_stats.sql | 18 +++-- views/client_stats_interval.sql | 118 ++++++++++++++++++++++++++++++ views/labelled_client_summary.sql | 4 +- 4 files changed, 138 insertions(+), 9 deletions(-) create mode 100644 views/client_stats_interval.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 8e72888..bde67f0 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,12 +8,13 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - 
`mlab-sandbox.gfr.client_clusters_model30` OPTIONS(model_type='kmeans', - num_clusters=30) AS + `mlab-sandbox.gfr.client_clusters_model_intervals_20` OPTIONS(model_type='kmeans', + num_clusters=20) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, +downloadInterval, downloadIntervalVariability, sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -29,7 +30,7 @@ t12/tests AS t12, t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats` +FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 --AND metro = "lga" ) diff --git a/views/client_stats.sql b/views/client_stats.sql index e2ae007..52ab39a 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -1,14 +1,17 @@ # Create a view that tracks the count of tests by hour and day of the week. # Uses server latitude to adjust the time of day and day of week. +# Client id is based on IP address, clientName, clientOS, and wscale. # TODO add stats for intertest interval +# NOTES: +# Anything that takes less than a couple slot hours we can probably just +# do in gardener after processing incoming data each day. 
+ # bq query --use_legacy_sql=false < views/client_stats.sql CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` ---PARTITION BY metro -OPTIONS(description = 'per metro client test stats by day of week and hour of the day') - -- enable_refresh = true) +OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS # Select ndt7 downloads (for now) @@ -20,7 +23,8 @@ WITH tests AS ( date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, - IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + a.TestTime, + --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, @@ -48,8 +52,12 @@ add_client_os AS ( solar AS ( SELECT *, - TIMESTAMP_ADD(startTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, FROM add_client_os + WINDOW + sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 + ORDER BY a.TestTime) ), day_hour AS ( diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql new file mode 100644 index 0000000..26a0fc3 --- /dev/null +++ b/views/client_stats_interval.sql @@ -0,0 +1,118 @@ +# Create a view that tracks the count of tests by hour and day of the week. +# Uses server latitude to adjust the time of day and day of week. +# Client id is based on IP address, clientName, clientOS, and wscale. 
+# TODO add stats for intertest interval + +# NOTES: +# Anything that takes less than a couple slot hours we can probably just +# do in gardener after processing incoming data each day. + +# bq query --use_legacy_sql=false < views/client_stats_interval.sql + +CREATE OR REPLACE VIEW +`mlab-sandbox.gfr.client_stats_interval` +OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') +AS + +# Select ndt7 downloads (for now) +# Since this is client characterization, we count uploads and downloads, and don''t +# care whether the tests are completely valid + +WITH tests AS ( + SELECT + date, ID, raw.ClientIP, a, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, + IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, + a.TestTime, + --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, + server.Geo.Longitude, # TODO should this be client or server? + LEFT(server.Site, 3) AS metro, --server.Site AS site, server.Machine AS machine, + --REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + IF(raw.Download IS NULL, false, true) AS isDownload, + IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, + FROM `measurement-lab.ndt.ndt7` +), + +# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. 
+add_client_name AS ( + SELECT tests.*, clientName + FROM tests LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName + FROM tests, tests.tmpClientMetadata + WHERE Name = "client_name") USING (date, ID) +), + +add_client_os AS ( + SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS + FROM add_client_name LEFT JOIN ( + SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS + FROM add_client_name, add_client_name.tmpClientMetadata + WHERE Name = "client_os") USING (date, ID) +), + +solar AS ( + SELECT * EXCEPT(Longitude), + TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + FROM add_client_os + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) + WINDOW + sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 + ORDER BY a.TestTime) +), + + +add_intervals AS ( + SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + FROM solar + GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, + MeanThroughputMBPS, MinRTT + WINDOW + client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) +), + +day_hour_counts AS ( + SELECT + # TODO correct for latitude. 
+ metro, ClientIP, clientName, clientOS, wscale1, wscale2, + EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + COUNTIF(isDownload) AS downloads, + COUNT(*) AS tests, + ANY_VALUE(downloadInterval) AS downloadInterval, + ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, + FROM add_intervals + GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 +) + +SELECT + metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SUM(downloads) AS downloads, + EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, + EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, + SUM(tests) AS tests, + ANY_VALUE(downloadInterval) AS downloadInterval, + ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, +FROM day_hour_counts +GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 +HAVING tests > 5 diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 9d59cea..537b46d 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,6 +1,7 @@ # 
Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. -CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` +CREATE MATERIALIZED VIEW + `mlab-sandbox.gfr.client_cluster_summaries` AS WITH labelled AS ( @@ -29,6 +30,7 @@ WHERE tests > 10 SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, +# TODO - is this STDDEV computation valid? ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, FROM labelled From af267aaa8737fed99cb6ea8c034e1116d9d4db16 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:06:20 -0400 Subject: [PATCH 04/12] add up/down balance --- models/client_clusters.sql | 9 +++++---- views/client_stats_interval.sql | 25 +++++++++++++++---------- views/labelled_client_summary.sql | 5 +++-- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index bde67f0..0816bbe 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,13 +8,14 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_intervals_20` OPTIONS(model_type='kmeans', - num_clusters=20) AS + `mlab-sandbox.gfr.client_clusters_model_intervals_30` OPTIONS(model_type='kmeans', + num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, -downloadInterval, downloadIntervalVariability, +downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) +duBalance, # balance between downloads (+1) and uploads (-1) sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS 
tuesday, @@ -31,7 +32,7 @@ t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 --AND metro = "lga" +WHERE tests > 10 ) SELECT * FROM linear \ No newline at end of file diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index 26a0fc3..ee93198 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -14,21 +14,19 @@ CREATE OR REPLACE VIEW OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS -# Select ndt7 downloads (for now) +# Select ALL ndt7 tests # Since this is client characterization, we count uploads and downloads, and don''t # care whether the tests are completely valid - WITH tests AS ( SELECT date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, a.TestTime, - --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, --server.Site AS site, server.Machine AS machine, - --REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + LEFT(server.Site, 3) AS metro, IF(raw.Download IS NULL, false, true) AS isDownload, + # This is used later for extracting the client metadata. IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, FROM `measurement-lab.ndt.ndt7` ), @@ -50,10 +48,12 @@ add_client_os AS ( WHERE Name = "client_os") USING (date, ID) ), +# This adds the solar time, which is more useful for global clustering than UTC time. 
solar AS ( SELECT * EXCEPT(Longitude), TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + # Compute the time, in seconds, since the previous test of the same type (upload or download) + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, FROM add_client_os WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) WINDOW @@ -61,7 +61,7 @@ solar AS ( ORDER BY a.TestTime) ), - +# This adds the inter-test interval mean and stdev, for downloads only, to ALL tests add_intervals AS ( SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, @@ -73,14 +73,19 @@ add_intervals AS ( client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) ), +# This is intended to identify each test by the hour of the day, and the day of the week. +# It is currently grouping by both, whereas it really should group by each independently. +# This is ok, as later we sum by day, and sum by 3 hour interval, but this means that there +# are 24 * 7 groupings here, instead of fewer. day_hour_counts AS ( SELECT # TODO correct for latitude. metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, + EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, # Downloads only. 
EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, COUNTIF(isDownload) AS downloads, + COUNTIF(NOT isDownload) AS uploads, COUNT(*) AS tests, ANY_VALUE(downloadInterval) AS downloadInterval, ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, @@ -90,10 +95,10 @@ day_hour_counts AS ( SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, + SUM(downloads) AS downloads, SUM(uploads) AS uploads, SUM(tests) AS tests, + SUM(downloads-uploads)/SUM(downloads+uploads) AS duRatio, EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - SUM(tests) AS tests, ANY_VALUE(downloadInterval) AS downloadInterval, ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, COUNT(DISTINCT day) AS days, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 537b46d..5646370 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,6 +1,6 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
-CREATE MATERIALIZED VIEW +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries` AS @@ -8,6 +8,7 @@ WITH labelled AS ( SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, days, hours, +downloadInterval, downloadIntervalVariability, sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -23,7 +24,7 @@ t12/tests AS t12, t15/tests AS t15, t18/tests AS t18, t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats` +FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 ))) From 56cc8ff657c979549f0e19b621ac0a38273da33a Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:18:24 -0400 Subject: [PATCH 05/12] cleaning up client_stats - broken --- views/client_stats_interval.sql | 74 ++++++++++++++++----------------- 1 file changed, 36 insertions(+), 38 deletions(-) diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index ee93198..d16313c 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -10,7 +10,7 @@ # bq query --use_legacy_sql=false < views/client_stats_interval.sql CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats_interval` +`mlab-sandbox.gfr.client_stats_2` OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') AS @@ -62,10 +62,17 @@ solar AS ( ), # This adds the inter-test interval mean and stdev, for downloads only, to ALL tests -add_intervals AS ( - SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, - AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, +some_client_stats AS ( + SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + STRUCT( + COUNT(*) AS tests, + COUNTIF(isDownload) AS downloads, + COUNTIF(NOT 
isDownload) AS uploads, + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT, # Downloads only. + AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability + ) AS client_stats, FROM solar GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, MeanThroughputMBPS, MinRTT @@ -79,45 +86,36 @@ add_intervals AS ( # are 24 * 7 groupings here, instead of fewer. day_hour_counts AS ( SELECT - # TODO correct for latitude. metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(MeanThroughputMBPS),NULL))) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(MinRTT),NULL))) AS meanMinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - COUNTIF(isDownload) AS downloads, - COUNTIF(NOT isDownload) AS uploads, COUNT(*) AS tests, - ANY_VALUE(downloadInterval) AS downloadInterval, - ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, - FROM add_intervals + ANY_VALUE(client_stats) AS client_stats, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + FROM some_client_stats GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 ) SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, SUM(uploads) AS uploads, SUM(tests) AS tests, - SUM(downloads-uploads)/SUM(downloads+uploads) AS duRatio, - EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, - EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - ANY_VALUE(downloadInterval) AS downloadInterval, - ANY_VALUE(downloadIntervalVariability) AS downloadIntervalVariability, - COUNT(DISTINCT day) AS days, - 
COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, + ANY_VALUE(client_stats) AS client_stats, + STRUCT( + COUNT(DISTINCT day) AS days, + COUNT(DISTINCT hour) AS hours, + SUM(IF(day = 1,tests,0)) AS sunday, + SUM(IF(day = 2,tests,0)) AS monday, + SUM(IF(day = 3,tests,0)) AS tuesday, + SUM(IF(day = 4,tests,0)) AS wednesday, + SUM(IF(day = 5,tests,0)) AS thursday, + SUM(IF(day = 6,tests,0)) AS friday, + SUM(IF(day = 7,tests,0)) AS saturday, + SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, + SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, + SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, + SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, + SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, + SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, + SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, + SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21 + ) AS timing_stats FROM day_hour_counts GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING tests > 5 +HAVING client_stats.tests > 5 From 5cfa1d8ec8dd7abcbdbd05da855707d86dcdd6d9 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 08:37:14 -0400 Subject: [PATCH 06/12] client stats cleanup - broken --- views/client_stats_interval.sql | 73 +++++++++++++++------------------ 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/views/client_stats_interval.sql 
b/views/client_stats_interval.sql index d16313c..ef5c1fb 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -61,61 +61,54 @@ solar AS ( ORDER BY a.TestTime) ), +# This adds all of the client aggregations. # This adds the inter-test interval mean and stdev, for downloads only, to ALL tests some_client_stats AS ( SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, + EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, STRUCT( + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT # Downloads only. + ) AS performance_stats, + STRUCT ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT, # Downloads only. 
+ (COUNTIF(isDownload) OVER client_win - COUNTIF(NOT isDownload) OVER client_win)/COUNT(*) OVER client_win AS duBalance, AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability - ) AS client_stats, + STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1) OVER client_win AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2) OVER client_win AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3) OVER client_win AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4) OVER client_win AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5) OVER client_win AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6) OVER client_win AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7) OVER client_win AS saturday, + + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2) OVER client_win AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5) OVER client_win AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8) OVER client_win AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10) OVER client_win AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14) OVER client_win AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17) OVER client_win AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20) OVER client_win AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23) OVER client_win AS t21 + ) AS training_stats FROM solar GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, MeanThroughputMBPS, MinRTT WINDOW client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) -), - -# This 
is intended to identify each test by the hour of the day, and the day of the week. -# It is currently grouping by both, whereas it really should group by each independently. -# This is ok, as later we sum by day, and sum by 3 hour interval, but this means that there -# are 24 * 7 groupings here, instead of fewer. -day_hour_counts AS ( - SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - COUNT(*) AS tests, - ANY_VALUE(client_stats) AS client_stats, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - FROM some_client_stats - GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 ) SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - ANY_VALUE(client_stats) AS client_stats, - STRUCT( - COUNT(DISTINCT day) AS days, - COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21 - ) AS timing_stats -FROM day_hour_counts + ANY_VALUE(performance_stats) AS performance_stats, + ANY_VALUE(training_stats) AS training_stats, +FROM some_client_stats GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING client_stats.tests > 5 +HAVING training_stats.tests > 5 From 48f206fe3e3a7a5328adbbf4ba5567ce3caf12e2 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 09:44:57 -0400 Subject: [PATCH 07/12] structs for stats, working --- 
models/client_clusters.sql | 14 ++++++-- views/client_stats_interval.sql | 64 +++++++++++++++------------------ 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 0816bbe..36465ac 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -8,14 +8,14 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_intervals_30` OPTIONS(model_type='kmeans', + `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', num_clusters=30) AS WITH linear AS ( SELECT LOG10(tests) AS logTests, days, hours, downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) -duBalance, # balance between downloads (+1) and uploads (-1) +--duBalance, # balance between downloads (+1) and uploads (-1) sunday/tests AS sunday, monday/tests AS monday, tuesday/tests AS tuesday, @@ -33,6 +33,14 @@ t18/tests AS t18, t21/tests AS t21, FROM `mlab-sandbox.gfr.client_stats_interval` WHERE tests > 10 +), + +alternate AS ( + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + training_stats.* EXCEPT(uploads, downloads, days, hours), + SAFE.LOG10(training_stats.tests) AS logTests, + FROM `mlab-sandbox.gfr.client_stats_2` + WHERE training_stats.tests > 10 ) -SELECT * FROM linear \ No newline at end of file +SELECT * FROM alternate \ No newline at end of file diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index ef5c1fb..e4107d7 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -62,53 +62,45 @@ solar AS ( ), # This adds all of the client aggregations. 
-# This adds the inter-test interval mean and stdev, for downloads only, to ALL tests some_client_stats AS ( - SELECT * EXCEPT(a), a.MeanThroughputMBPS, a.MinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, STRUCT( - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) OVER client_win) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) OVER client_win) AS meanMinRTT # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. ) AS performance_stats, STRUCT ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - (COUNTIF(isDownload) OVER client_win - COUNTIF(NOT isDownload) OVER client_win)/COUNT(*) OVER client_win AS duBalance, - AVG(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL)) OVER client_win AS downloadIntervalVariability, + (COUNTIF(isDownload) - COUNTIF(NOT isDownload))/COUNT(*) AS duBalance, - COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, - COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + # These characterize how often the client runs download tests, and how variable that is. 
+ AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, + STDDEV(IF(isDownload,testInterval,NULL))/AVG(IF(isDownload,testInterval,NULL)) AS downloadIntervalVariability, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1) OVER client_win AS sunday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2) OVER client_win AS monday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3) OVER client_win AS tuesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4) OVER client_win AS wednesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5) OVER client_win AS thursday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6) OVER client_win AS friday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7) OVER client_win AS saturday, + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2) OVER client_win AS t00, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5) OVER client_win AS t03, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8) OVER client_win AS t06, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10) OVER client_win AS t09, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14) OVER client_win AS t12, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17) OVER client_win AS t15, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20) OVER client_win AS t18, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23) OVER client_win AS t21 + COUNTIF(EXTRACT(HOUR 
FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10)/COUNT(*) AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 ) AS training_stats FROM solar - GROUP BY date, TestTime, solarTime, testInterval, ID, isDownload, metro, clientIP, clientName, clientOS, wscale1, wscale2, - MeanThroughputMBPS, MinRTT - WINDOW - client_win AS (PARTITION BY metro, ClientIP, clientName, clientOS, wscale1, wscale2) + GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 + HAVING training_stats.tests > 5 ) -SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - ANY_VALUE(performance_stats) AS performance_stats, - ANY_VALUE(training_stats) AS training_stats, -FROM some_client_stats -GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING training_stats.tests > 5 +SELECT * FROM some_client_stats From 6b662337f59ca1fa504f409a4da833d81f5b1ff0 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 11:31:46 -0400 Subject: [PATCH 08/12] various fixes for alt_30 --- models/client_clusters.sql | 2 +- views/client_stats_interval.sql | 2 +- views/labelled_client_summary.sql | 57 ++++++++++++++----------------- 3 files changed, 28 insertions(+), 33 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 36465ac..f1a69af 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -36,7 +36,7 @@ WHERE tests > 10 ), alternate AS ( - SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SELECT --metro, ClientIP, clientName, clientOS, wscale1, 
wscale2, training_stats.* EXCEPT(uploads, downloads, days, hours), SAFE.LOG10(training_stats.tests) AS logTests, FROM `mlab-sandbox.gfr.client_stats_2` diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index e4107d7..823fe55 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -76,7 +76,7 @@ some_client_stats AS ( # These characterize how often the client runs download tests, and how variable that is. AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, - STDDEV(IF(isDownload,testInterval,NULL))/AVG(IF(isDownload,testInterval,NULL)) AS downloadIntervalVariability, + SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 5646370..7c41b68 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,38 +1,33 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
CREATE OR REPLACE VIEW - `mlab-sandbox.gfr.client_cluster_summaries` + `mlab-sandbox.gfr.client_cluster_summaries_30` AS -WITH labelled AS ( -SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model`, - (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, downloads, meanSpeed, meanMinRTT, LOG10(tests) AS logTests, -days, hours, -downloadInterval, downloadIntervalVariability, -sunday/tests AS sunday, -monday/tests AS monday, -tuesday/tests AS tuesday, -wednesday/tests AS wednesday, -thursday/tests AS thursday, -friday/tests AS friday, -saturday/tests AS saturday, -t00/tests AS t00, -t03/tests AS t03, -t06/tests AS t06, -t09/tests AS t09, -t12/tests AS t12, -t15/tests AS t15, -t18/tests AS t18, -t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 -))) +WITH +alternate AS ( +SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`, + (SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*, + training_stats.*, + SAFE.LOG10(training_stats.tests) AS logTests, + FROM `mlab-sandbox.gfr.client_stats_2` + WHERE training_stats.tests > 10)) +) -SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, -ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, -ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, -# TODO - is this STDDEV computation valid? 
-ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, -ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, -FROM labelled +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, + SUM(tests*duBalance)/SUM(tests) AS duBalance, + SUM(tests*sunday)/SUM(tests) AS sunday, + SUM(tests*monday)/SUM(tests) AS monday, + SUM(tests*tuesday)/SUM(tests) AS tuesday, + SUM(tests*wednesday)/SUM(tests) AS wednesday, + SUM(tests*thursday)/SUM(tests) AS thursday, + SUM(tests*friday)/SUM(tests) AS friday, + SUM(tests*saturday)/SUM(tests) AS saturday, + ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, + ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, + # TODO - is this STDDEV computation valid? + ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, + ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, + FROM alternate GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2 + From e3d459a2ca81f462e331d14398f768c2097a93da Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 11:37:50 -0400 Subject: [PATCH 09/12] tweak weekend/weekday --- views/labelled_client_summary.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 7c41b68..c71b2c5 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -14,7 +14,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
WHERE training_stats.tests > 10)) ) -SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(downloads) AS downloads, +SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, SUM(tests*duBalance)/SUM(tests) AS duBalance, SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, From f375277abd1ee504b09522f9680947145929f311 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 17:34:37 -0400 Subject: [PATCH 10/12] change to dlFraction --- views/client_stats_interval.sql | 2 +- views/labelled_client_summary.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql index 823fe55..14a2611 100644 --- a/views/client_stats_interval.sql +++ b/views/client_stats_interval.sql @@ -72,7 +72,7 @@ some_client_stats AS ( COUNT(*) AS tests, COUNTIF(isDownload) AS downloads, COUNTIF(NOT isDownload) AS uploads, - (COUNTIF(isDownload) - COUNTIF(NOT isDownload))/COUNT(*) AS duBalance, + COUNTIF(isDownload)/COUNT(*) AS dlFraction, # These characterize how often the client runs download tests, and how variable that is. AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index c71b2c5..d2bc27e 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -15,7 +15,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*duBalance)/SUM(tests) AS duBalance, + SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday, From 899612f6d3a3c5c877f5de56c7f3e839869feb4f Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Thu, 23 Sep 2021 17:55:16 -0400 Subject: [PATCH 11/12] cleanup --- models/client_clusters.sql | 32 ++------- views/client_stats.sql | 111 +++++++++++++++--------------- views/client_stats_interval.sql | 106 ---------------------------- views/labelled_client_summary.sql | 8 +-- 4 files changed, 63 insertions(+), 194 deletions(-) delete mode 100644 views/client_stats_interval.sql diff --git a/models/client_clusters.sql b/models/client_clusters.sql index f1a69af..198097f 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -1,5 +1,5 @@ # This is a kmeans clustering of clients, to discover the clusters of clients, -# by metro, based on time of day and day of the week. +# by metro, based on time of day and day of the week, and other testing behaviors. # This will only be meaningful for clients that test a significant number of # times over the interval of interest, so it is beneficial to use fairly # large intervals. 
We will try 13 week intervals, to get consistent number @@ -11,36 +11,12 @@ CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', num_clusters=30) AS -WITH linear AS ( -SELECT LOG10(tests) AS logTests, -days, hours, -downloadInterval, downloadIntervalVariability, # interval should be loosely related to number of tests (and start and end date) ---duBalance, # balance between downloads (+1) and uploads (-1) -sunday/tests AS sunday, -monday/tests AS monday, -tuesday/tests AS tuesday, -wednesday/tests AS wednesday, -thursday/tests AS thursday, -friday/tests AS friday, -saturday/tests AS saturday, -t00/tests AS t00, -t03/tests AS t03, -t06/tests AS t06, -t09/tests AS t09, -t12/tests AS t12, -t15/tests AS t15, -t18/tests AS t18, -t21/tests AS t21, -FROM `mlab-sandbox.gfr.client_stats_interval` -WHERE tests > 10 -), - alternate AS ( - SELECT --metro, ClientIP, clientName, clientOS, wscale1, wscale2, + SELECT training_stats.* EXCEPT(uploads, downloads, days, hours), SAFE.LOG10(training_stats.tests) AS logTests, - FROM `mlab-sandbox.gfr.client_stats_2` - WHERE training_stats.tests > 10 + FROM `mlab-sandbox.gfr.client_stats` + WHERE training_stats.tests > 5 ) SELECT * FROM alternate \ No newline at end of file diff --git a/views/client_stats.sql b/views/client_stats.sql index 52ab39a..834579b 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -1,39 +1,33 @@ -# Create a view that tracks the count of tests by hour and day of the week. +# Create a view that tracks the count of tests by hour and day of the week, +# and some other stats that might be useful for clustering. # Uses server latitude to adjust the time of day and day of week. # Client id is based on IP address, clientName, clientOS, and wscale. -# TODO add stats for intertest interval -# NOTES: -# Anything that takes less than a couple slot hours we can probably just -# do in gardener after processing incoming data each day. 
- -# bq query --use_legacy_sql=false < views/client_stats.sql +# bq query --use_legacy_sql=false < views/client_stats_interval.sql CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` -OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') +OPTIONS(description = 'per metro client test stats') AS -# Select ndt7 downloads (for now) +# Select ALL ndt7 tests # Since this is client characterization, we count uploads and downloads, and don''t # care whether the tests are completely valid - WITH tests AS ( SELECT date, ID, raw.ClientIP, a, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, a.TestTime, - --IFNULL(raw.Download.StartTime, raw.Upload.StartTime) AS startTime, server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, server.Site AS site, server.Machine AS machine, - REGEXP_EXTRACT(ID, "(ndt-?.*)-.*") AS NDTVersion, + LEFT(server.Site, 3) AS metro, IF(raw.Download IS NULL, false, true) AS isDownload, + # This is used later for extracting the client metadata. IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, FROM `measurement-lab.ndt.ndt7` ), -# This join is quite expensive - about 3 slot hours for 2 months of data, even if the clientName field is never used. +# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. 
add_client_name AS ( SELECT tests.*, clientName FROM tests LEFT JOIN ( @@ -43,61 +37,66 @@ add_client_name AS ( ), add_client_os AS ( - SELECT add_client_name.*, clientOS + SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS FROM add_client_name LEFT JOIN ( SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS FROM add_client_name, add_client_name.tmpClientMetadata WHERE Name = "client_os") USING (date, ID) ), +# This adds the solar time, which is more useful for global clustering than UTC time. solar AS ( - SELECT *, + SELECT * EXCEPT(Longitude), TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval, + # Compute the time, in seconds, since the previous test of the same type (upload or download) + TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, FROM add_client_os + WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) WINDOW sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 ORDER BY a.TestTime) ), -day_hour AS ( - SELECT - # TODO correct for latitude. - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL))) AS meanSpeed, - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL))) AS meanMinRTT, - EXTRACT(DAYOFWEEK FROM solarTime) AS day, EXTRACT(HOUR FROM solarTime) AS hour, - COUNTIF(isDownload) AS downloads, - COUNT(*) AS tests - FROM solar - WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - GROUP BY metro, clientIP, clientName, clientOS, day, hour, wscale1, wscale2 -) +# This adds all of the client aggregations. 
+client_stats AS ( + SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, + STRUCT( + EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. + EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. + ) AS performance_stats, + STRUCT ( + COUNT(*) AS tests, + COUNTIF(isDownload) AS downloads, + COUNTIF(NOT isDownload) AS uploads, + COUNTIF(isDownload)/COUNT(*) AS dlFraction, + # These characterize how often the client runs download tests, and how variable that is. + AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, + SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, + + COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, + COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, + + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, + COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, + + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 11)/COUNT(*) AS t09, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, + COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 + ) AS training_stats + FROM solar + GROUP BY metro, ClientIP, 
clientName, clientOS, wscale1, wscale2 + HAVING training_stats.tests > 5 # only bother with this for clients that have more than 5 tests +) -SELECT - metro, ClientIP, clientName, clientOS, wscale1, wscale2, - SUM(downloads) AS downloads, - EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)) AS meanSpeed, - EXP(SUM(downloads*SAFE.LN(meanMinRTT))/SUM(downloads)) AS meanMinRTT, - SUM(tests) AS tests, - COUNT(DISTINCT day) AS days, - COUNT(DISTINCT hour) AS hours, - SUM(IF(day = 1,tests,0)) AS sunday, - SUM(IF(day = 2,tests,0)) AS monday, - SUM(IF(day = 3,tests,0)) AS tuesday, - SUM(IF(day = 4,tests,0)) AS wednesday, - SUM(IF(day = 5,tests,0)) AS thursday, - SUM(IF(day = 6,tests,0)) AS friday, - SUM(IF(day = 7,tests,0)) AS saturday, - SUM(IF(hour BETWEEN 0 AND 2,tests,0)) AS t00, - SUM(IF(hour BETWEEN 3 AND 5,tests,0)) AS t03, - SUM(IF(hour BETWEEN 6 AND 8,tests,0)) AS t06, - SUM(IF(hour BETWEEN 9 AND 11,tests,0)) AS t09, - SUM(IF(hour BETWEEN 12 AND 14,tests,0)) AS t12, - SUM(IF(hour BETWEEN 15 AND 17,tests,0)) AS t15, - SUM(IF(hour BETWEEN 18 AND 20,tests,0)) AS t18, - SUM(IF(hour BETWEEN 21 AND 23,tests,0)) AS t21, -FROM day_hour -GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 -HAVING tests > 5 +SELECT * FROM client_stats diff --git a/views/client_stats_interval.sql b/views/client_stats_interval.sql deleted file mode 100644 index 14a2611..0000000 --- a/views/client_stats_interval.sql +++ /dev/null @@ -1,106 +0,0 @@ -# Create a view that tracks the count of tests by hour and day of the week. -# Uses server latitude to adjust the time of day and day of week. -# Client id is based on IP address, clientName, clientOS, and wscale. -# TODO add stats for intertest interval - -# NOTES: -# Anything that takes less than a couple slot hours we can probably just -# do in gardener after processing incoming data each day. 
- -# bq query --use_legacy_sql=false < views/client_stats_interval.sql - -CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats_2` -OPTIONS(description = 'per metro client test stats - tests by day of week and hour of the day') -AS - -# Select ALL ndt7 tests -# Since this is client characterization, we count uploads and downloads, and don''t -# care whether the tests are completely valid -WITH tests AS ( - SELECT - date, ID, raw.ClientIP, a, - IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1, - IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2, - a.TestTime, - server.Geo.Longitude, # TODO should this be client or server? - LEFT(server.Site, 3) AS metro, - IF(raw.Download IS NULL, false, true) AS isDownload, - # This is used later for extracting the client metadata. - IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData, - FROM `measurement-lab.ndt.ndt7` -), - -# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used. -add_client_name AS ( - SELECT tests.*, clientName - FROM tests LEFT JOIN ( - SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName - FROM tests, tests.tmpClientMetadata - WHERE Name = "client_name") USING (date, ID) -), - -add_client_os AS ( - SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS - FROM add_client_name LEFT JOIN ( - SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS - FROM add_client_name, add_client_name.tmpClientMetadata - WHERE Name = "client_os") USING (date, ID) -), - -# This adds the solar time, which is more useful for global clustering than UTC time. 
-solar AS ( - SELECT * EXCEPT(Longitude), - TIMESTAMP_ADD(testTime, INTERVAL CAST(-60*Longitude/15 AS INT) MINUTE) AS solarTime, - # Compute the time, in seconds, since the previous test of the same type (upload or download) - TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND)AS testInterval, - FROM add_client_os - WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY) - WINDOW - sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2 - ORDER BY a.TestTime) -), - -# This adds all of the client aggregations. -some_client_stats AS ( - SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, - STRUCT( - EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only. - EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only. - ) AS performance_stats, - STRUCT ( - COUNT(*) AS tests, - COUNTIF(isDownload) AS downloads, - COUNTIF(NOT isDownload) AS uploads, - COUNTIF(isDownload)/COUNT(*) AS dlFraction, - - # These characterize how often the client runs download tests, and how variable that is. 
- AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval, - SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability, - - COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days, - COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours, - - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday, - COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday, - - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 10)/COUNT(*) AS t09, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18, - COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21 - ) AS training_stats - FROM solar - GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2 - HAVING training_stats.tests > 5 -) - -SELECT * FROM some_client_stats diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index d2bc27e..0422f61 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -10,12 +10,12 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. 
(SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*, training_stats.*, SAFE.LOG10(training_stats.tests) AS logTests, - FROM `mlab-sandbox.gfr.client_stats_2` + FROM `mlab-sandbox.gfr.client_stats` WHERE training_stats.tests > 10)) ) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads + SUM(tests*duBalance)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday, @@ -23,9 +23,9 @@ SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS c SUM(tests*thursday)/SUM(tests) AS thursday, SUM(tests*friday)/SUM(tests) AS friday, SUM(tests*saturday)/SUM(tests) AS saturday, - ROUND(EXP(SUM(downloads*SAFE.LN(meanSpeed))/SUM(downloads)),3) AS meanSpeed, + # Use mean speed per client, so that each client contributes equal weight to the average. ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed, - # TODO - is this STDDEV computation valid? + # Speed deviation across clients in a cluster. TODO - is this STDDEV computation valid? 
ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev, ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT, FROM alternate From a3a44605bead044039ccce4296bca7a6afecfbd9 Mon Sep 17 00:00:00 2001 From: Gregory Russell Date: Wed, 20 Oct 2021 09:39:26 -0400 Subject: [PATCH 12/12] minor formatting tweaks, dlFraction --- models/client_clusters.sql | 5 ++--- views/client_stats.sql | 3 +-- views/labelled_client_summary.sql | 5 ++--- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/models/client_clusters.sql b/models/client_clusters.sql index 198097f..2f14946 100644 --- a/models/client_clusters.sql +++ b/models/client_clusters.sql @@ -7,9 +7,8 @@ # bq query --use_legacy_sql=false < models/client_clusters.sql -CREATE OR REPLACE MODEL - `mlab-sandbox.gfr.client_clusters_model_alt_30` OPTIONS(model_type='kmeans', - num_clusters=30) AS +CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30` +  OPTIONS(model_type='kmeans', num_clusters=30) AS WITH alternate AS ( SELECT diff --git a/views/client_stats.sql b/views/client_stats.sql index 834579b..1d9cdc8 100644 --- a/views/client_stats.sql +++ b/views/client_stats.sql @@ -5,8 +5,7 @@ # bq query --use_legacy_sql=false < views/client_stats_interval.sql -CREATE OR REPLACE VIEW -`mlab-sandbox.gfr.client_stats` +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats` OPTIONS(description = 'per metro client test stats') AS diff --git a/views/labelled_client_summary.sql b/views/labelled_client_summary.sql index 0422f61..317ddf8 100644 --- a/views/labelled_client_summary.sql +++ b/views/labelled_client_summary.sql @@ -1,7 +1,6 @@ # Uses the clusters from client_clusters_model to label client groups, and compute aggregate group stats. 
-CREATE OR REPLACE VIEW - `mlab-sandbox.gfr.client_cluster_summaries_30` +CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries_30` AS WITH @@ -15,7 +14,7 @@ SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox. ) SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads, - SUM(tests*duBalance)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads + SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads SUM(tests*sunday)/SUM(tests) AS sunday, SUM(tests*monday)/SUM(tests) AS monday, SUM(tests*tuesday)/SUM(tests) AS tuesday,