From a8e606bda664de84ee7fae4945d2d71d86983687 Mon Sep 17 00:00:00 2001 From: Sravanth Bangari Date: Tue, 30 Nov 2021 12:52:22 -0800 Subject: [PATCH 1/5] Add code for Monitoring Windows AKS Node for failures --- .../debug/monitoring/MonitorWindowsNode.ps1 | 118 ++++++++++++++++++ .../strategies/CopyLogsToBlobStorage.psm1 | 43 +++++++ .../LoadBalancerPolicyStrategy.psm1 | 46 +++++++ .../strategies/StrategyModuleTemplate.psm1 | 39 ++++++ 4 files changed, 246 insertions(+) create mode 100644 Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1 create mode 100644 Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 create mode 100644 Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 create mode 100644 Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 diff --git a/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1 b/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1 new file mode 100644 index 00000000..a1d3f5cd --- /dev/null +++ b/Kubernetes/windows/debug/monitoring/MonitorWindowsNode.ps1 @@ -0,0 +1,118 @@ +[CmdletBinding()] +param +( + # Path to the module defining the strategy to use for monitoring the node + [string] + $StrategyModulePath = "C:\k\debug\StrategyModulePath.psm1" +) + +function Start-HNSTrace +{ + .\collectlogs.ps1 + $sessionName = 'HnsCapture' + Write-Host "Starting HNS tracing" + + $curDir = Get-Location + # Generate a random directory to capture all the logs + $etlPath = [io.Path]::Combine($curDir.Path, "HNSTrace.etl") + .\starthnstrace.ps1 -NoPrompt -MaxFileSize 1024 -EtlFile $etlPath +} + +function Stop-HNSTrace +{ + # Stop the tracing + $sessionName = 'HnsCapture' + Write-Host "Stopping $sessionName." + Stop-NetEventSession $sessionName + + # Collect logs + .\collectlogs.ps1 + .\collect-windows-logs.ps1 + + # Take a HNS Process dump + $hnsProcessId = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Hns'" | Select-Object -ExpandProperty ProcessId + .\Procdump\Procdump.exe -ma $hnsProcessId /accepteula +} + +''' +Start-Monitoring + +Monitors Windows node for an error condition by polling every 15 seconds. +Gathers all the necessary logs if Windows node goes into an error/faulted state. +''' +function Start-Monitoring +{ + param + ( + # Path with filename where the configuration module is located + [string] + $StrategyModulePath = "C:\k\debug\StrategyModule.psm1", + + # Interval to poll for failure in seconds + [int] + $PollingInterval = 15, + + # Number of consecutive failures to declare the node is faulty + [int] + $FailureThreshold = 3 + ) + + $curDir = Get-Location + # Generate a random directory to capture all the logs + $outDir = [io.Path]::Combine($curDir.Path, [io.Path]::GetRandomFileName()) + md $outDir + pushd + cd $outDir + + # Download necessary files + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/collectlogs.ps1 -o collectlogs.ps1 + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/VFP.psm1 -o VFP.psm1 + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.psm1 -o HNS.psm1 + wget https://raw.githubusercontent.com/Azure/aks-engine/master/scripts/collect-windows-logs.ps1 -o collect-windows-logs.ps1 + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/debug/starthnstrace.ps1 -o starthnstrace.ps1 + wget https://download.sysinternals.com/files/Procdump.zip -o Procdump.zip + Expand-Archive .\Procdump.zip + wget $StrategyModulePath -o StrategyModule.psm1 + ipmo .\VFP.psm1 + ipmo .\HNS.psm1 + ipmo .\StrategyModule.psm1 + + Start-HNSTrace + $consecutiveFailures = 0 + + StartHandler + + LogMessage "Started Monitoring" + + while($true) + { + if(IsNodeFaulted) + { + $consecutiveFailures++ + # Number of consecutive failures to confirm that the Windows node is faulted for real + # and this is not an intermittent failure + if ($consecutiveFailures -ge $FailureThreshold) + { + Stop-HNSTrace + + popd + + TerminateHandler($outDir) + + LogMessage "Diagnostic logs are available at $outDir" + return + } + } + else + { + $consecutiveFailures = 0 + } + + # Adjust the sleep time to lower the polling frequency + Start-Sleep -Seconds $PollingInterval + } +} + +##### Start execution ######### + +Start-Monitoring -StrategyModulePath $StrategyModulePath \ No newline at end of file diff --git a/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 new file mode 100644 index 00000000..84f8721c --- /dev/null +++ b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 @@ -0,0 +1,43 @@ +#Implement these 4 methods: +# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file. +# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state) +# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state) +# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise + +function LogMessage +{ + param + ( + [string] $Message = "" + ) + + #re-implement if needed + $FilePath = "C:\k\debug\MonitorWindowsNode.txt" + Get-Date | Out-File -FilePath $FilePath -Append + $Message | Out-File -FilePath $FilePath -Append + +} + +function StartHandler +{ + #logic here +} + +function TerminateHandler +{ + param + ( + [string] $LogPath = "" + ) + + # copy the logs to Azure blob + Invoke-WebRequest https://azcopyvnext.azureedge.net/release20211027/azcopy_windows_amd64_10.13.0.zip -OutFile azcopyv10.zip + Expand-Archive .\azcopyv10.zip -Force + .\azcopyv10\azcopy_windows_amd64_10.13.0\azcopy.exe copy $LogPath "https://sban91storage.blob.core.windows.net/akslogs?sp=rw&st=2021-11-30T18:59:20Z&se=2021-12-12T02:59:20Z&spr=https&sv=2020-08-04&sr=c&sig=3uzRPB72k4NnM2q1k1vZ1xqugkjDSUSWSPMdiMQkwMI%3D" --recursive=true +} + +function IsNodeFaulted +{ + #logic here + return $true +} \ No newline at end of file diff --git a/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 new file mode 100644 index 00000000..8fb4c529 --- /dev/null +++ b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 @@ -0,0 +1,46 @@ +$ServiceIp = "192.168.0.10" +$ServicePort = 53 + +#Implement these 4 methods: +# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file. +# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state) +# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state) +# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise + +function LogMessage +{ + param + ( + [string] $Message = "" + ) + + #re-implement if needed + $FilePath = "C:\k\debug\MonitorWindowsNode.txt" + Get-Date | Out-File -FilePath $FilePath -Append + $Message | Out-File -FilePath $FilePath -Append + +} + +function StartHandler +{ + LogMessage "Capturing some information before the repro." + $hnsInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'hns'" + $kubeproxyInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Kubeproxy'" + LogMessage $hnsInfo + LogMessage $kubeproxyInfo +} + +function TerminateHandler +{ + LogMessage "Capturing some information after the repro." + $hnsInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'hns'" + $kubeproxyInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Kubeproxy'" + LogMessage $hnsInfo + LogMessage $kubeproxyInfo + LogMessage "HNS Policy for K8's Service with IP $ServiceIp and Port $ServicePort is missing" +} + +function IsNodeFaulted +{ + return ((Get-HnsPolicyList | where {($_.Policies.VIPs.Count -ge 1) -and $_.Policies.VIPs.Contains($ServiceIp) -and $_.Policies.ExternalPort -eq $ServicePort}) -eq $null) +} diff --git a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 new file mode 100644 index 00000000..10b967e3 --- /dev/null +++ b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 @@ -0,0 +1,39 @@ +#Implement these 4 methods: +# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file. +# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state) +# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state) +# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise + +function LogMessage +{ + param + ( + [string] $Message = "" + ) + + #re-implement if needed + $FilePath = "C:\k\debug\MonitorWindowsNode.txt" + Get-Date | Out-File -FilePath $FilePath -Append + $Message | Out-File -FilePath $FilePath -Append + +} + +function StartHandler +{ + #logic here +} + +function TerminateHandler +{ + param + ( + [string] $LogPath = "" + ) + #logic here +} + +function IsNodeFaulted +{ + #logic here + return $true +} \ No newline at end of file From 62db8eed95671acbe2d38a21abdcddaca8a93c73 Mon Sep 17 00:00:00 2001 From: Sravanth Bangari Date: Tue, 30 Nov 2021 12:57:08 -0800 Subject: [PATCH 2/5] Adding logpath to terminate handler --- .../monitoring/strategies/LoadBalancerPolicyStrategy.psm1 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 index 8fb4c529..518198ca 100644 --- a/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 +++ b/Kubernetes/windows/debug/monitoring/strategies/LoadBalancerPolicyStrategy.psm1 @@ -32,6 +32,10 @@ function StartHandler function TerminateHandler { + param + ( + [string] $LogPath = "" + ) LogMessage "Capturing some information after the repro." $hnsInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'hns'" $kubeproxyInfo = Get-WmiObject -Class Win32_Service -Filter "Name LIKE 'Kubeproxy'" From ac0219760e83ebf05045e46515d89be1d40e3ccf Mon Sep 17 00:00:00 2001 From: Sravanth Bangari Date: Wed, 1 Dec 2021 15:25:03 -0800 Subject: [PATCH 3/5] zip all the content --- .../debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 index 84f8721c..b08a87e4 100644 --- a/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 +++ b/Kubernetes/windows/debug/monitoring/strategies/CopyLogsToBlobStorage.psm1 @@ -33,7 +33,11 @@ function TerminateHandler # copy the logs to Azure blob Invoke-WebRequest https://azcopyvnext.azureedge.net/release20211027/azcopy_windows_amd64_10.13.0.zip -OutFile azcopyv10.zip Expand-Archive .\azcopyv10.zip -Force - .\azcopyv10\azcopy_windows_amd64_10.13.0\azcopy.exe copy $LogPath "https://sban91storage.blob.core.windows.net/akslogs?sp=rw&st=2021-11-30T18:59:20Z&se=2021-12-12T02:59:20Z&spr=https&sv=2020-08-04&sr=c&sig=3uzRPB72k4NnM2q1k1vZ1xqugkjDSUSWSPMdiMQkwMI%3D" --recursive=true + + $timeStamp = get-date -format 'yyyyMMdd-hhmmss' + $zipFileName = "$env:computername-$($timeStamp)_logs.zip" + Compress-Archive -LiteralPath $LogPath -DestinationPath $zipFileName + .\azcopyv10\azcopy_windows_amd64_10.13.0\azcopy.exe copy $zipFileName "https://sban91storage.blob.core.windows.net/akslogs?sp=rw&st=2021-11-30T18:59:20Z&se=2021-12-12T02:59:20Z&spr=https&sv=2020-08-04&sr=c&sig=3uzRPB72k4NnM2q1k1vZ1xqugkjDSUSWSPMdiMQkwMI%3D" } function IsNodeFaulted From d3f5429a14da5604f1ef94da3f0c0ba2c0aeead8 Mon Sep 17 00:00:00 2001 From: Albin Sumic Date: Fri, 22 Jul 2022 14:03:18 -0500 Subject: [PATCH 4/5] Updating failure collection to automate processs --- .../strategies/StrategyModuleTemplate.psm1 | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 index 10b967e3..d7c1edf4 100644 --- a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 +++ b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 @@ -20,7 +20,10 @@ function LogMessage function StartHandler { - #logic here + #download file + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.v2.psm1 -o HNS.V2.psm1 + + ipmo .\HNS.V2.psm1 } function TerminateHandler @@ -34,6 +37,26 @@ function TerminateHandler function IsNodeFaulted { - #logic here - return $true + #More specific lookup by azure name. Needs more testing before is used. + #((get-hnsnetwork | ? name -like azure)[0].Policies | Where-Object PolicyType -eq IPSET).count + $expectedNumPolicies = (((get-hnsnetwork | Select Policies)[1].Policies) | Where-Object PolicyType -eq IPSET).Count + if($expectedNumPolicies -eq 0){ + return $false + } + $EndpointPorts = Get-HnsEndpoint | %{$_.Resources.Allocators} | Where-Object Tag -eq "Endpoint Port" | Select -ExpandProperty EndpointPortGuid + foreach ($endPort in $EndpointPorts) + { + $currNumPolicies = (vfpctrl /port $endPort /list-tag | Select-String "Friendly Name").Count + #if difference is greater than or equal to 10% + if($currNumPolicies -le ($expectedNumPolicies - $expectedNumPolicies * .1)){ + + #get the virtualNetwork + $netId = Get-HnsEndpoint | where-object {$_.Resources.Allocators.EndPointPortGuid -eq $endPort} | Select -ExpandProperty VirtualNetwork + #send test policy to simplify log lookup + New-HNSSetPolicy -NetworkId $netId -setType 0 -setValues "10.22.0.44" -setName "spTestName" -setId "spTestId" -Verbose + + return $true + } + } + return $false } \ No newline at end of file From 95769655221018102639047e629533360f3086d4 Mon Sep 17 00:00:00 2001 From: Albin Sumic Date: Tue, 26 Jul 2022 11:52:10 -0500 Subject: [PATCH 5/5] Add compare hns and vfp endpoint strategy that will log when a difference is noted between the two --- .../strategies/CompareHnsAndVfpEndpoints.psm1 | 66 +++++++++++++++++++ .../strategies/StrategyModuleTemplate.psm1 | 29 +------- 2 files changed, 69 insertions(+), 26 deletions(-) create mode 100644 Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1 diff --git a/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1 b/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1 new file mode 100644 index 00000000..d8a962af --- /dev/null +++ b/Kubernetes/windows/debug/monitoring/strategies/CompareHnsAndVfpEndpoints.psm1 @@ -0,0 +1,66 @@ +#Implement these 4 methods: +# 1. LogMessage - Implements logic to log messages. Defaults to logging to a file. +# 2. StartHandler - Handler invoked after the monitoring starts (before the node is in repro state) +# 3. TerminateHandler - Handler invoked before the monitoring stops (after the node is in repro state) +# 4. IsNodeFaulted - Returns a $true when the node is in repro state, $false otherwise + +function LogMessage +{ + param + ( + [string] $Message = "" + ) + + #re-implement if needed + $FilePath = "C:\k\debug\MonitorWindowsNode.txt" + Get-Date | Out-File -FilePath $FilePath -Append + $Message | Out-File -FilePath $FilePath -Append + +} + +function StartHandler +{ + #download file + wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.v2.psm1 -o HNS.V2.psm1 + + ipmo .\HNS.V2.psm1 +} + +function TerminateHandler +{ + param + ( + [string] $LogPath = "" + ) + LogMessage "Capturing information after node failure" + LogMessage "Information has been logged: $LogPath" + + #TODO: add azure blob + #TODO: add way to notify user of issue +} + +function IsNodeFaulted +{ + #More specific lookup by azure name. Needs more testing before is used. + #((get-hnsnetwork | ? name -like azure)[0].Policies | Where-Object PolicyType -eq IPSET).count + $expectedNumPolicies = (((get-hnsnetwork | Select Policies)[1].Policies) | Where-Object PolicyType -eq IPSET).Count + if($expectedNumPolicies -eq 0){ + return $false + } + $EndpointPorts = Get-HnsEndpoint | %{$_.Resources.Allocators} | Where-Object Tag -eq "Endpoint Port" | Select -ExpandProperty EndpointPortGuid + foreach ($endPort in $EndpointPorts) + { + $currNumPolicies = (vfpctrl /port $endPort /list-tag | Select-String "Friendly Name").Count + #if difference is greater than or equal to 10% + if($currNumPolicies -le ($expectedNumPolicies - $expectedNumPolicies * .1)){ + + #get the virtualNetwork + $netId = Get-HnsEndpoint | where-object {$_.Resources.Allocators.EndPointPortGuid -eq $endPort} | Select -ExpandProperty VirtualNetwork + #send test policy to simplify log lookup + New-HNSSetPolicy -NetworkId $netId -setType 0 -setValues "10.22.0.44" -setName "spTestName" -setId "spTestId" + + return $true + } + } + return $false +} \ No newline at end of file diff --git a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 index d7c1edf4..10b967e3 100644 --- a/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 +++ b/Kubernetes/windows/debug/monitoring/strategies/StrategyModuleTemplate.psm1 @@ -20,10 +20,7 @@ function LogMessage function StartHandler { - #download file - wget https://raw.githubusercontent.com/microsoft/SDN/master/Kubernetes/windows/hns.v2.psm1 -o HNS.V2.psm1 - - ipmo .\HNS.V2.psm1 + #logic here } function TerminateHandler @@ -37,26 +34,6 @@ function TerminateHandler function IsNodeFaulted { - #More specific lookup by azure name. Needs more testing before is used. - #((get-hnsnetwork | ? name -like azure)[0].Policies | Where-Object PolicyType -eq IPSET).count - $expectedNumPolicies = (((get-hnsnetwork | Select Policies)[1].Policies) | Where-Object PolicyType -eq IPSET).Count - if($expectedNumPolicies -eq 0){ - return $false - } - $EndpointPorts = Get-HnsEndpoint | %{$_.Resources.Allocators} | Where-Object Tag -eq "Endpoint Port" | Select -ExpandProperty EndpointPortGuid - foreach ($endPort in $EndpointPorts) - { - $currNumPolicies = (vfpctrl /port $endPort /list-tag | Select-String "Friendly Name").Count - #if difference is greater than or equal to 10% - if($currNumPolicies -le ($expectedNumPolicies - $expectedNumPolicies * .1)){ - - #get the virtualNetwork - $netId = Get-HnsEndpoint | where-object {$_.Resources.Allocators.EndPointPortGuid -eq $endPort} | Select -ExpandProperty VirtualNetwork - #send test policy to simplify log lookup - New-HNSSetPolicy -NetworkId $netId -setType 0 -setValues "10.22.0.44" -setName "spTestName" -setId "spTestId" -Verbose - - return $true - } - } - return $false + #logic here + return $true } \ No newline at end of file