21 changes: 21 additions & 0 deletions models/client_clusters.sql
@@ -0,0 +1,21 @@
# K-means clustering of clients, by metro, based on time of day, day of the
# week, and other testing behaviors. The clusters are only meaningful for
# clients that test a significant number of times over the interval of
# interest, so fairly large intervals are beneficial. We use 13-week
# intervals so that each day of the week occurs the same number of times.

# bq query --use_legacy_sql=false < models/client_clusters.sql

CREATE OR REPLACE MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`
OPTIONS(model_type='kmeans', num_clusters=30) AS

WITH alternate AS (
SELECT
training_stats.* EXCEPT(uploads, downloads, days, hours),
SAFE.LOG10(training_stats.tests) AS logTests,
FROM `mlab-sandbox.gfr.client_stats`
WHERE training_stats.tests > 5
)

SELECT * FROM alternate
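
After training, the model can be sanity-checked directly in BigQuery. A minimal sketch (illustrative, not part of this change) that lists the per-feature centroid values and the standard k-means evaluation metrics:

# List each centroid's learned feature values.
SELECT centroid_id, feature, ROUND(numerical_value, 3) AS value
FROM ML.CENTROIDS(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`)
ORDER BY centroid_id, feature;

# Report clustering quality (Davies-Bouldin index, mean squared distance).
SELECT * FROM ML.EVALUATE(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`);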
101 changes: 101 additions & 0 deletions views/client_stats.sql
@@ -0,0 +1,101 @@
# Create a view that tracks the count of tests by hour and day of the week,
# and some other stats that might be useful for clustering.
# Uses server longitude to adjust the time of day and day of the week.
# Client id is based on IP address, clientName, clientOS, and wscale.

# bq query --use_legacy_sql=false < views/client_stats.sql

CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_stats`
OPTIONS(description = 'per metro client test stats')
AS

# Select ALL ndt7 tests.
# Since this is client characterization, we count uploads and downloads, and
# don't care whether the tests are completely valid.
WITH tests AS (
SELECT
date, ID, raw.ClientIP, a,
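# The raw WScale byte packs two 4-bit TCP window-scale values; in the Linux
# tcp_info layout (an assumption here) the send scale is the low nibble and
# the receive scale the high nibble. Both are kept only as a client fingerprint.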
IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale & 0x0F AS WScale1,
IFNULL(raw.Download, raw.Upload).ServerMeasurements[SAFE_OFFSET(0)].TCPInfo.WScale >> 4 AS WScale2,
a.TestTime,
server.Geo.Longitude, # TODO should this be client or server?
LEFT(server.Site, 3) AS metro,
raw.Download IS NOT NULL AS isDownload,
# This is used later for extracting the client metadata.
IFNULL(raw.Download.ClientMetadata, raw.Upload.ClientMetadata) AS tmpClientMetaData,
FROM `measurement-lab.ndt.ndt7`
),

# These metadata joins are quite expensive - about 3 slot hours for 2 months of data, even if the field is never used.
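# Each join unnests the repeated ClientMetadata array to pull out one named
# value, then rejoins it to the test rows on (date, ID).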
add_client_name AS (
SELECT tests.*, clientName
FROM tests LEFT JOIN (
SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientName
FROM tests, tests.tmpClientMetadata
WHERE Name = "client_name") USING (date, ID)
),

add_client_os AS (
SELECT add_client_name.* EXCEPT(tmpClientMetaData), clientOS
FROM add_client_name LEFT JOIN (
SELECT * EXCEPT(tmpClientMetadata, Name, Value), Value AS clientOS
FROM add_client_name, add_client_name.tmpClientMetadata
WHERE Name = "client_os") USING (date, ID)
),

# This adds the solar time, which is more useful for global clustering than UTC time.
solar AS (
SELECT * EXCEPT(Longitude),
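# Local mean solar time is UTC plus 4 minutes per degree of east longitude
# (60/15 minutes per degree), so the offset sign follows the longitude sign.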
TIMESTAMP_ADD(testTime, INTERVAL CAST(60*Longitude/15 AS INT) MINUTE) AS solarTime,
# Compute the time, in seconds, since the previous test of the same type (upload or download)
TIMESTAMP_DIFF(testTime, LAG(testTime, 1) OVER sequence, SECOND) AS testInterval,
FROM add_client_os
WHERE date BETWEEN DATE_SUB(CURRENT_DATE(), INTERVAL 93 DAY) AND DATE_SUB(CURRENT_DATE(), INTERVAL 2 DAY)
WINDOW
sequence AS (PARTITION BY isDownload, metro, ClientIP, clientName, clientOS, wscale1, wscale2
ORDER BY a.TestTime)
),

# This adds all of the client aggregations.
client_stats AS (
SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2,
STRUCT(
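# EXP(AVG(LN(x))) is the geometric mean, which is much less sensitive than
# the arithmetic mean to the heavy right tail of speed and RTT samples.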
EXP(AVG(IF(isDownload,SAFE.LN(a.MeanThroughputMBPS),NULL)) ) AS meanSpeed, # Downloads only.
EXP(AVG(IF(isDownload,SAFE.LN(a.MinRTT),NULL)) ) AS meanMinRTT # Downloads only.
) AS performance_stats,
STRUCT (
COUNT(*) AS tests,
COUNTIF(isDownload) AS downloads,
COUNTIF(NOT isDownload) AS uploads,
COUNTIF(isDownload)/COUNT(*) AS dlFraction,

# These characterize how often the client runs download tests, and how
# variable that interval is (STDDEV/AVG is the coefficient of variation).
AVG(IF(isDownload,testInterval,NULL)) AS downloadInterval,
SAFE_DIVIDE(STDDEV(IF(isDownload,testInterval,NULL)),AVG(IF(isDownload,testInterval,NULL))) AS downloadIntervalVariability,

COUNT(DISTINCT EXTRACT(DAYOFWEEK FROM solarTime)) AS days,
COUNT(DISTINCT EXTRACT(HOUR FROM solarTime)) AS hours,

COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 1)/COUNT(*) AS sunday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 2)/COUNT(*) AS monday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 3)/COUNT(*) AS tuesday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 4)/COUNT(*) AS wednesday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 5)/COUNT(*) AS thursday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 6)/COUNT(*) AS friday,
COUNTIF(EXTRACT(DAYOFWEEK FROM solarTime) = 7)/COUNT(*) AS saturday,

COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 0 AND 2)/COUNT(*) AS t00,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 3 AND 5)/COUNT(*) AS t03,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 6 AND 8)/COUNT(*) AS t06,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 9 AND 11)/COUNT(*) AS t09,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 12 AND 14)/COUNT(*) AS t12,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 15 AND 17)/COUNT(*) AS t15,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 18 AND 20)/COUNT(*) AS t18,
COUNTIF(EXTRACT(HOUR FROM solarTime) BETWEEN 21 AND 23)/COUNT(*) AS t21
) AS training_stats
FROM solar
GROUP BY metro, ClientIP, clientName, clientOS, wscale1, wscale2
HAVING training_stats.tests > 5 # only bother with this for clients that have more than 5 tests
)

SELECT * FROM client_stats
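
Before training against the view, it can be spot-checked with a small aggregate query. A sketch (illustrative only): client counts and the median tests-per-client, by metro:

# APPROX_QUANTILES(x, 2) returns [min, median, max]; OFFSET(1) is the median.
SELECT metro, COUNT(*) AS clients,
  APPROX_QUANTILES(training_stats.tests, 2)[OFFSET(1)] AS medianTests
FROM `mlab-sandbox.gfr.client_stats`
GROUP BY metro
ORDER BY clients DESC;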
32 changes: 32 additions & 0 deletions views/labelled_client_summary.sql
@@ -0,0 +1,32 @@
# Uses the clusters from client_clusters_model_alt_30 to label client groups,
# and computes aggregate group stats.

CREATE OR REPLACE VIEW `mlab-sandbox.gfr.client_cluster_summaries_30`
AS

WITH
alternate AS (
SELECT * EXCEPT(NEAREST_CENTROIDS_DISTANCE) FROM ML.PREDICT(MODEL `mlab-sandbox.gfr.client_clusters_model_alt_30`,
(SELECT metro, ClientIP, clientName, clientOS, wscale1, wscale2, performance_stats.*,
training_stats.*,
SAFE.LOG10(training_stats.tests) AS logTests,
FROM `mlab-sandbox.gfr.client_stats`
WHERE training_stats.tests > 10))
)

SELECT metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2, COUNT(*) AS clients, SUM(tests) AS tests, SUM(downloads) AS downloads,
SUM(tests*dlFraction)/SUM(tests) AS dlFraction, # 0 to 1 indicating what fraction of tests are downloads
SUM(tests*sunday)/SUM(tests) AS sunday,
SUM(tests*monday)/SUM(tests) AS monday,
SUM(tests*tuesday)/SUM(tests) AS tuesday,
SUM(tests*wednesday)/SUM(tests) AS wednesday,
SUM(tests*thursday)/SUM(tests) AS thursday,
SUM(tests*friday)/SUM(tests) AS friday,
SUM(tests*saturday)/SUM(tests) AS saturday,
# Use mean speed per client, so that each client contributes equal weight to the average.
ROUND(EXP(AVG(SAFE.LN(meanSpeed))),3) AS debiasedSpeed,
# Speed deviation across clients in a cluster. TODO - is this STDDEV computation valid?
ROUND(100*SAFE_DIVIDE(EXP(STDDEV(SAFE.LN(meanSpeed))), EXP(AVG(SAFE.LN(meanSpeed)))),1) AS speedDev,
ROUND(EXP(AVG(SAFE.LN(meanMinRTT))),2) AS debiasedMinRTT,
FROM alternate
GROUP BY metro, CENTROID_ID, clientName, clientOS, wscale1, wscale2
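
An example query against the summary view (a sketch; 'lga' is just an illustrative metro):

SELECT CENTROID_ID, clientName, clientOS, clients, tests,
  debiasedSpeed, speedDev, debiasedMinRTT
FROM `mlab-sandbox.gfr.client_cluster_summaries_30`
WHERE metro = 'lga'
ORDER BY clients DESC
LIMIT 10;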