From 2267207acc004b6c1cbc7389e92980f3939bd321 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 10 Feb 2022 16:17:13 -0800 Subject: [PATCH 1/9] 1.0.14 (Dev) --- Build-SFPKGs.ps1 | 8 ++-- FabricHealer.nuspec.template | 10 +++-- FabricHealer/FabricHealer.csproj | 4 +- FabricHealer/FabricHealerManager.cs | 44 +++++++++++++++++-- .../Config/LogicRules/AppRules.config.txt | 4 +- .../Config/LogicRules/DiskRules.config.txt | 34 +++++++++++++- .../LogicRules/FabricNodeRules.config.txt | 4 +- .../Config/LogicRules/ReplicaRules.config.txt | 4 +- .../LogicRules/SystemAppRules.config.txt | 34 ++++++-------- .../Config/LogicRules/VmRules.config.txt | 5 +-- FabricHealer/PackageRoot/ServiceManifest.xml | 6 +-- FabricHealer/Repair/RepairConstants.cs | 2 + FabricHealer/Repair/RepairTaskManager.cs | 1 + FabricHealer/Utilities/FOErrorWarningCodes.cs | 15 ++++++- .../ApplicationManifest.xml | 6 +-- 15 files changed, 133 insertions(+), 48 deletions(-) diff --git a/Build-SFPKGs.ps1 b/Build-SFPKGs.ps1 index 69e237e0..c13a40b2 100644 --- a/Build-SFPKGs.ps1 +++ b/Build-SFPKGs.ps1 @@ -23,11 +23,11 @@ function Build-SFPkg { try { Push-Location $scriptPath - Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.0.13" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.0.13" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.0.14" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.0.14" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType" - Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.0.13" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType" - 
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.0.13" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.0.14" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType" + Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.0.14" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType" } finally { Pop-Location diff --git a/FabricHealer.nuspec.template b/FabricHealer.nuspec.template index f2ab1f84..6593c085 100644 --- a/FabricHealer.nuspec.template +++ b/FabricHealer.nuspec.template @@ -2,11 +2,13 @@ %PACKAGE_ID% - 1.0.13 + 1.0.14 -- Fixed bugs due to Guan 1.0.4 breaking changes. *This version requires Guan 1.0.4*. -- Updated Disk Repair feature. -- Updated logic rules. +- Updated Disk Repair feature to support new FabricObserver error codes related to Folder Size monitoring. +- Updated Disk logic rules with Folder Size Warning repair workflow. +- Added more descriptions to all rules files to help clarify how to compose successful related rules. +- Added ObserverName named argument to Mitigate CompoundTerm. +- Added GetRepairRulesForSupportedObserver function. Microsoft MIT diff --git a/FabricHealer/FabricHealer.csproj b/FabricHealer/FabricHealer.csproj index edeb2df9..3e980d5a 100644 --- a/FabricHealer/FabricHealer.csproj +++ b/FabricHealer/FabricHealer.csproj @@ -12,8 +12,8 @@ win-x64--> linux-x64;win-x64 FabricHealer - 1.0.13 - 1.0.13 + 1.0.14 + 1.0.14 true true FabricHealer.Program diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index 7455d6ec..0cf226ff 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -28,7 +28,7 @@ public sealed class FabricHealerManager : IDisposable internal static RepairData RepairHistory; // Folks often use their own version numbers. 
This is for internal diagnostic telemetry. - private const string InternalVersionNumber = "1.0.13"; + private const string InternalVersionNumber = "1.0.14"; private static FabricHealerManager singleton; private bool disposedValue; private readonly StatelessServiceContext serviceContext; @@ -1225,7 +1225,7 @@ private List GetRepairRulesFromFOCode(string foErrorCode, string app = n switch (foErrorCode) { - // App level. + // App repair (user and system). case FOErrorWarningCodes.AppErrorCpuPercent: case FOErrorWarningCodes.AppErrorMemoryMB: case FOErrorWarningCodes.AppErrorMemoryPercent: @@ -1244,7 +1244,7 @@ private List GetRepairRulesFromFOCode(string foErrorCode, string app = n repairPolicySectionName = app == RepairConstants.SystemAppName ? RepairConstants.SystemAppRepairPolicySectionName : RepairConstants.AppRepairPolicySectionName; break; - // Node level. (node = VM, not Fabric node) + // VM repair. case FOErrorWarningCodes.NodeErrorCpuPercent: case FOErrorWarningCodes.NodeErrorMemoryMB: case FOErrorWarningCodes.NodeErrorMemoryPercent: @@ -1261,10 +1261,13 @@ private List GetRepairRulesFromFOCode(string foErrorCode, string app = n repairPolicySectionName = RepairConstants.VmRepairPolicySectionName; break; + // Disk repair. case FOErrorWarningCodes.NodeWarningDiskSpaceMB: case FOErrorWarningCodes.NodeErrorDiskSpaceMB: case FOErrorWarningCodes.NodeWarningDiskSpacePercent: case FOErrorWarningCodes.NodeErrorDiskSpacePercent: + case FOErrorWarningCodes.NodeWarningFolderSizeMB: + case FOErrorWarningCodes.NodeErrorFolderSizeMB: repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName; break; @@ -1276,6 +1279,41 @@ private List GetRepairRulesFromFOCode(string foErrorCode, string app = n return GetRepairRulesFromConfiguration(repairPolicySectionName); } + private List GetRepairRulesForSupportedObserver(string observerName) + { + string repairPolicySectionName; + + switch (observerName) + { + // App repair (user). 
+ case RepairConstants.AppObserver: + + repairPolicySectionName = RepairConstants.AppRepairPolicySectionName; + break; + + // System service repair. + case RepairConstants.FabricSystemObserver: + repairPolicySectionName = RepairConstants.SystemAppRepairPolicySectionName; + break; + + // Disk repair + case RepairConstants.DiskObserver: + repairPolicySectionName = RepairConstants.DiskRepairPolicySectionName; + break; + + // VM repair. + case RepairConstants.NodeObserver: + + repairPolicySectionName = RepairConstants.VmRepairPolicySectionName; + break; + + default: + return null; + } + + return GetRepairRulesFromConfiguration(repairPolicySectionName); + } + private List GetRepairRulesFromConfiguration(string repairPolicySectionName) { // Get config filename and read lines from file. diff --git a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt index 7b422d45..55f53efc 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt @@ -1,18 +1,18 @@ ## Logic rules for Service Fabric Application-level repairs. -## Mitigate (the goal, Guan.CompoundTerm type) Named Arguments (CompoundTerm Arguments (of type Guan.Constant)) - Corresponding data is supplied by FabricObserver, renamed for brevity by FH. +## Applicable Named Arguments for user App service repair - Corresponding data is supplied by FabricObserver, renamed for brevity by FH. ## | Argument Name | Definition | ## |---------------------------|----------------------------------------------------------------------------------------------| ## | AppName | Name of the SF application, format is fabric:/SomeApp | ## | ServiceName | Name of the SF service, format is fabric:/SomeApp/SomeService | ## | NodeName | Name of the node | ## | NodeType | Type of node | +## | ObserverName | Name of Observer that generated the event. 
| ## | PartitionId | Id of the partition | ## | ReplicaOrInstanceId | Id of the replica or instance | ## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") | ## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) | ## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) | -## | SystemServiceProcessName | The name of a Fabric system service process supplied in FO health data | ## | OS | The name of the OS from which the FO data was collected (Linux or Windows) | ## Application-related Metric Names. diff --git a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt index 183c239a..4b793084 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt @@ -1,5 +1,29 @@ ## Logic rules for Disk repair. Only file management is supported (file deletion). +## Applicable Named Arguments for Disk repair - Corresponding data is supplied by FabricObserver, renamed for brevity by FH. +## | Argument Name | Definition | +## |---------------------------|----------------------------------------------------------------------------------------------| +## | NodeName | Name of the node | +## | NodeType | Type of node | +## | ObserverName | Name of Observer that generated the event | +## | FOErrorCode | Error Code emitted by FO (Disk codes are FO007-FO010, FO042, FO043) | +## | MetricName | Name of the metric supplied by FO | +## | MetricValue | Corresponding value for supplied metric name | +## | OS | The name of the OS from which the FO data was collected (Linux or Windows) | + +## Disk-related Metric Names. +## | Name | +## |---------------------------| +## | DiskSpacePercent | +## | DiskSpaceMB | +## | FolderSizeMB | + +## Currently implemented external predicates for use in Disk repair rules. 
+## | Name | Definition +## |---------------------------|---------------------------------------------------------------------------------------------------------------| +## | CheckFolderSize | Returns true if supplied folder is equal to or larger than specified value. | +## | DeleteFiles | Deletes files in a specified directory (full path) with optional arguments. | + ## First, check if we are inside run interval. If so, then cut (!). ## This is commented out by default. Just uncomment and set the global run interval for disk level repairs to suit your needs. @@ -39,4 +63,12 @@ Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHi Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, CheckFolderSize("C:\SFDevCluster\Log\Traces", MaxFolderSizeGB=20), - DeleteFiles("C:\SFDevCluster\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true, SearchPattern="SFBDMiniport_traces*"). \ No newline at end of file + DeleteFiles("C:\SFDevCluster\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true, SearchPattern="SFBDMiniport_traces*"). + +## Folder size Warning, check ErrorCode from FO (you could also check MetricName -> MetricName=FolderSizeMB, but we already do that in several places). +## See FOErrorWarningCodes.cs for list of codes and renaming function (GetMetricNameFromCode). +## Note: FH renames FO MetricNames to make it easier to use them in rules (often this means making them smaller in length without losing meaning). +Mitigate(FOErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00), + ?repairCount < 4, + CheckFolderSize("C:\SFDevCluster\Log\Traces", MaxFolderSizeMB=500), + DeleteFiles("C:\SFDevCluster\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=5, RecurseSubdirectories=false). 
\ No newline at end of file diff --git a/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt index c1eb7945..16a3bfcc 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt @@ -1,4 +1,6 @@ -## Logic rules for Service Fabric Node repairs. +## Logic rules for Service Fabric Node repairs. These are not used today. FabricObserver does not monitor Fabric nodes. +## Fabric nodes are only put into Warning or Error health state by FO, as configured by user, if the underlying VM is having issues. +## See SystemAppRules.config.txt for logic rules related to system service process issues detected by FabricObserver. ## First check if we are inside the run interval. If so, cut (!). ## This is commented out by default. Just uncomment and set the global run interval for app Fabric node level repairs to suit your needs. diff --git a/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt index a5da95ba..abc8cc55 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt @@ -1,4 +1,6 @@ -## These rules demonstrates a workflow that employs multiple internal and external predicates to get to a solution for a single unhealthy replica scenario: +## Experimental. + +## These rules demonstrate a workflow that employs multiple internal and external predicates to get to a solution for a single unhealthy replica scenario: ## [SourceId] ='System.RAP' reported Warning/Error for property... ## [Property] = 'IStatefulServiceReplica.ChangeRole(N)Duration'. ## [Description] = The api IStatefulServiceReplica.ChangeRole(N) on node [NodeName] is stuck. 
diff --git a/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt index 0e3c54c2..4d99f832 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt @@ -1,29 +1,24 @@ ## Logic rules for Service Fabric System Service repairs. -## Mitigate Named Arguments - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. -## | Argument Name | Definition | -## |---------------------------|----------------------------------------------------------------------------------------------| -## | AppName | Name of the SF application, format is fabric:/SomeApp | -## | ServiceName | Name of the SF service, format is fabric:/SomeApp/SomeService | -## | NodeName | Name of the node | -## | NodeType | Type of node | -## | PartitionId | Id of the partition | -## | ReplicaOrInstanceId | Id of the replica or instance | -## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") | -## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) | -## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) | -## | SystemServiceProcessName | The name of a Fabric system service process supplied in FO health data | -## | OS | The name of the OS from which the FO data was collected (Linux or Windows) | - - -## Application-related Metric Names. +## Applicable Named Arguments for System Service Repair - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. +## | Argument Name | Definition | +## |---------------------------|--------------------------------------------------------------------------------------------------------------| +## | AppName* | Name of the SF System Application. *This is always fabric:/System (FO monitors SF system service processes). 
| +## | NodeName | Name of the node | +## | NodeType | Type of node | +## | FOErrorCode | Error Code emitted by FO (e.g. "FO002") | +## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) | +## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) | +## | SystemServiceProcessName | The name of a Fabric system service process supplied in FO health data | +## | OS | The name of the OS from which the FO data was collected (Linux or Windows) | + +## System Service-related Metric Names. ## | Name | ## |---------------------------| ## | ActiveTcpPorts | ## | CpuPercent | ## | EphemeralPorts | ## | MemoryMB | -## | MemoryPercent | ## | FileHandles | ## | FileHandlesPercent | ## | Threads | @@ -115,5 +110,4 @@ Mitigate(MetricName="FileHandles", OS="Linux", SystemServiceProcessName=?SysProc ## Open File Handles - Linux, Fabric or FabricHost. In these cases, we want a safe (graceful) restart of the Fabric node; not just kill the process, which will restart the node, but not gracefully. ## Restart the Fabric node where the offending instance is running. -Mitigate(MetricName="FileHandles", OS="Linux", SystemServiceProcessName="Fabric") :- TimeScopedRestartFabricNode(2, 08:00:00). -Mitigate(MetricName="FileHandles", OS="Linux", SystemServiceProcessName="FabricHost") :- TimeScopedRestartFabricNode(2, 08:00:00). \ No newline at end of file +Mitigate(MetricName="FileHandles", OS="Linux", SystemServiceProcessName="Fabric") :- TimeScopedRestartFabricNode(2, 08:00:00). \ No newline at end of file diff --git a/FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt index 29b523e7..0d5ad331 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt @@ -1,6 +1,6 @@ ## Logic rules for Virtual Machine level repairs in the cluster. 
Only OS reboot is supported today. -## Mitigate Named Arguments related to VM repair - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. +## Applicable Named Arguments related to VM repair - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. ## | Argument Name | Definition | ## |---------------------------|----------------------------------------------------------------------------------------------| ## | NodeName | Name of the node | @@ -9,8 +9,7 @@ ## | MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) | ## | MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) | ## | OS | The name of the OS from which the FO data was collected (Linux or Windows) | - - + ## VM-related Metric Names. ## | Name | ## |---------------------------| diff --git a/FabricHealer/PackageRoot/ServiceManifest.xml b/FabricHealer/PackageRoot/ServiceManifest.xml index 9ec24ecd..dcdce89b 100644 --- a/FabricHealer/PackageRoot/ServiceManifest.xml +++ b/FabricHealer/PackageRoot/ServiceManifest.xml @@ -1,6 +1,6 @@  @@ -11,7 +11,7 @@ - + FabricHealer @@ -21,5 +21,5 @@ - + \ No newline at end of file diff --git a/FabricHealer/Repair/RepairConstants.cs b/FabricHealer/Repair/RepairConstants.cs index 40a3c11f..027d7840 100644 --- a/FabricHealer/Repair/RepairConstants.cs +++ b/FabricHealer/Repair/RepairConstants.cs @@ -82,6 +82,7 @@ public static class RepairConstants public const string CpuPercent = "CpuPercent"; public const string DiskAverageQueueLength = "DiskAverageQueueLength"; public const string DiskSpaceMB = "DiskSpaceMB"; + public const string FolderSizeMB = "FolderSizeMB"; public const string DiskSpacePercent = "DiskSpacePercent"; public const string EphemeralPorts = "EphemeralPorts"; public const string EndpointUnreachable = "EndpointUnreachable"; @@ -106,5 +107,6 @@ public static class RepairConstants public const string RepairData = "RepairData"; public const string 
RepairPolicy = "RepairPolicy"; public const string FabricHealer = "FabricHealer"; + public const string ObserverName = "ObserverName"; } } \ No newline at end of file diff --git a/FabricHealer/Repair/RepairTaskManager.cs b/FabricHealer/Repair/RepairTaskManager.cs index 030bbc97..57675dd2 100644 --- a/FabricHealer/Repair/RepairTaskManager.cs +++ b/FabricHealer/Repair/RepairTaskManager.cs @@ -214,6 +214,7 @@ public async Task RunGuanQueryAsync(TelemetryData foHealthData, List AppErrorCodesDictionary { get; @@ -92,7 +97,8 @@ public static Dictionary AppErrorCodesDictionary { AppErrorTooManyOpenFileHandles, "AppErrorTooManyOpenFileHandles" }, { AppWarningTooManyOpenFileHandles, "AppWarningTooManyOpenFileHandles" }, { AppErrorTooManyThreads, "AppErrorTooManyThreads" }, - { AppWarningTooManyThreads, "AppWarningTooManyThreads" } + { AppWarningTooManyThreads, "AppWarningTooManyThreads" }, + { AppWarningKvsLvidsPercentUsed, "AppWarningKvsLvidsPercentUsed"} }; public static Dictionary NodeErrorCodesDictionary @@ -111,6 +117,8 @@ public static Dictionary NodeErrorCodesDictionary { NodeWarningDiskSpaceMB, "NodeWarningDiskSpaceMB" }, { NodeErrorDiskAverageQueueLength, "NodeErrorDiskAverageQueueLength" }, { NodeWarningDiskAverageQueueLength, "NodeWarningDiskAverageQueueLength" }, + { NodeErrorFolderSizeMB, "NodeErrorFolderSizeMB" }, + { NodeWarningFolderSizeMB, "NodeWarningFolderSizeMB" }, { NodeErrorMemoryPercent, "NodeErrorMemoryPercent" }, { NodeWarningMemoryPercent, "NodeWarningMemoryPercent" }, { NodeErrorMemoryMB, "NodeErrorMemoryMB" }, @@ -174,6 +182,11 @@ public static string GetMetricNameFromCode(string code) return RepairConstants.DiskSpacePercent; } + if (GetIsResourceType(code, RepairConstants.FolderSizeMB)) + { + return RepairConstants.FolderSizeMB; + } + if (GetIsResourceType(code, RepairConstants.EndpointUnreachable)) { return RepairConstants.EndpointUnreachable; diff --git a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml 
b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml index b46b772a..1caf9fcd 100644 --- a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -1,5 +1,5 @@  - + @@ -15,7 +15,7 @@ - + @@ -25,7 +25,7 @@ should match the Name and Version attributes of the ServiceManifest element defined in the ServiceManifest.xml file. --> - + From 47eae0ee9ac9b4447d3f2a23c42abcf0ee7703b4 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Tue, 15 Feb 2022 12:51:42 -0800 Subject: [PATCH 2/9] 1.1.14 (RC) --- FHTest/FHUnitTests.cs | 40 ++++---- FHTest/testrules_wellformed | 6 ++ FabricHealer.nuspec.template | 10 +- FabricHealer/FabricHealer.csproj | 20 ++++ FabricHealer/FabricHealerManager.cs | 93 +++++++++---------- .../{AppRules.config.txt => AppRules.guan} | 0 .../{DiskRules.config.txt => DiskRules.guan} | 17 ++-- ...eRules.config.txt => FabricNodeRules.guan} | 0 ...licaRules.config.txt => ReplicaRules.guan} | 0 ...ppRules.config.txt => SystemAppRules.guan} | 0 .../{VmRules.config.txt => VmRules.guan} | 0 FabricHealer/PackageRoot/Config/Settings.xml | 12 +-- .../Guan/GetRepairHistoryPredicateType.cs | 6 +- FabricHealer/Repair/RepairTaskManager.cs | 2 +- .../ApplicationManifest.xml | 2 +- TelemetryLib/TelemetryLib.csproj | 4 +- 16 files changed, 119 insertions(+), 93 deletions(-) rename FabricHealer/PackageRoot/Config/LogicRules/{AppRules.config.txt => AppRules.guan} (100%) rename FabricHealer/PackageRoot/Config/LogicRules/{DiskRules.config.txt => DiskRules.guan} (86%) rename FabricHealer/PackageRoot/Config/LogicRules/{FabricNodeRules.config.txt => FabricNodeRules.guan} (100%) rename FabricHealer/PackageRoot/Config/LogicRules/{ReplicaRules.config.txt => ReplicaRules.guan} (100%) rename FabricHealer/PackageRoot/Config/LogicRules/{SystemAppRules.config.txt => SystemAppRules.guan} (100%) rename FabricHealer/PackageRoot/Config/LogicRules/{VmRules.config.txt => VmRules.guan} (100%) diff --git 
a/FHTest/FHUnitTests.cs b/FHTest/FHUnitTests.cs index c8bc59a0..6c3cc780 100644 --- a/FHTest/FHUnitTests.cs +++ b/FHTest/FHUnitTests.cs @@ -24,6 +24,7 @@ namespace FHTest { /// /// NOTE: Run these tests on your machine with a local SF dev cluster running. + /// TODO: More code coverage. /// [TestClass] @@ -73,6 +74,10 @@ private static bool IsLocalSFRuntimePresent() } } + /// + /// This function cancels the local repair tasks created by the tests. + /// + /// private static async Task CleanupTestRepairJobsAsync() { // Complete (Cancel) any existing Test Repair Jobs. @@ -80,6 +85,7 @@ private static async Task CleanupTestRepairJobsAsync() { var repairTasks = await fabricClient.RepairManager.GetRepairTaskListAsync(); var testRepairTasks = repairTasks.Where(r => r.TaskId.EndsWith("TEST_0")); + foreach (var repairTask in testRepairTasks) { if (repairTask.State != RepairTaskState.Completed) @@ -90,9 +96,7 @@ private static async Task CleanupTestRepairJobsAsync() } catch (FabricException) { -#if DEBUG throw; -#endif } } @@ -103,10 +107,10 @@ public static async Task TestClassCleanupAsync() } /* GuanLogic Tests */ - // TODO: Add more tests. + // Currently, the tests below validate logic rules and the successful scheduling of related local repair jobs. - // This test ensures your actual rule files contain legitimate rules. This will catch bugs in your - // logic. Of course, you should have caught these flaws in your end-to-end tests. This is just an extra precaution. + // This test ensures your shipping rule files (the guan files located in Config/LogicRules folder) + // contain correctly written rules and that the related local repair jobs are successfully created. 
[TestMethod] public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_QueryInitialized() { @@ -136,7 +140,7 @@ public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_Quer foreach (var file in Directory.GetFiles(FHRulesDirectory)) { - List repairRules = ParseRulesFile((await File.ReadAllLinesAsync(file, token)).ToList()); + List repairRules = ParseRulesFile((await File.ReadAllLinesAsync(file, token))); try { @@ -152,9 +156,8 @@ public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_Quer Assert.IsTrue(true); } - // This test ensures a given rule can successfully be turned into a GL query. - // This means that the rule is well-formed logic and that the referenced predicates exist. - // So, if the rule is malformed or not a logic rule or no predicate exists as written, this test will fail. + // This test ensures your test rules housed in the testrules_wellformed file contain correctly written rules + // and that the related local repair jobs are successfully created. 
[TestMethod] public async Task TestGuanLogicRule_GoodRule_QueryInitialized() { @@ -170,7 +173,7 @@ public async Task TestGuanLogicRule_GoodRule_QueryInitialized() string testRulesFilePath = Path.Combine(Environment.CurrentDirectory, "testrules_wellformed"); string[] rules = await File.ReadAllLinesAsync(testRulesFilePath, token).ConfigureAwait(true); - List repairRules = ParseRulesFile(rules.ToList()); + List repairRules = ParseRulesFile(rules); var foHealthData = new TelemetryData { ApplicationName = "fabric:/test0", @@ -208,7 +211,7 @@ public async Task TestGuanLogicRule_BadRule_ShouldThrowGuanException() }; string[] rules = await File.ReadAllLinesAsync(Path.Combine(Environment.CurrentDirectory, "testrules_malformed"), token).ConfigureAwait(true); - List repairAction = ParseRulesFile(rules.ToList()); + List repairAction = ParseRulesFile(rules); var foHealthData = new TelemetryData { @@ -231,12 +234,6 @@ public async Task TestGuanLogicRule_BadRule_ShouldThrowGuanException() await Assert.ThrowsExceptionAsync(async () => { await TestInitializeGuanAndRunQuery(foHealthData, repairAction, executorData); }); } - /* FH Repair Scheduler Tests */ - // TODO. - - /* FH Repair Excecutor Tests */ - // TODO. - /* private Helpers */ private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthData, List repairRules, RepairExecutorData executorData) @@ -299,13 +296,13 @@ private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthDat //return Task.FromResult(true); } - private List ParseRulesFile(List rules) + private static List ParseRulesFile(string[] rules) { var repairRules = new List(); - int ptr1 = 0; int ptr2 = 0; - rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToList(); + int ptr1 = 0, ptr2 = 0; + rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToArray(); - while (ptr1 < rules.Count && ptr2 < rules.Count) + while (ptr1 < rules.Length && ptr2 < rules.Length) { // Single line comments removal. 
if (rules[ptr2].TrimStart().StartsWith("##")) @@ -329,6 +326,7 @@ private List ParseRulesFile(List rules) { rule = rule + ' ' + rules[i].Replace('\t', ' ').TrimStart(' '); } + repairRules.Add(rule.Remove(rule.Length - 1, 1)); } ptr2++; diff --git a/FHTest/testrules_wellformed b/FHTest/testrules_wellformed index 6666b721..52c6e160 100644 --- a/FHTest/testrules_wellformed +++ b/FHTest/testrules_wellformed @@ -67,6 +67,12 @@ Mitigate(MetricName="MemoryMB", MetricValue=?MetricValue) :- ?MetricValue >= 102 ?HealthEventCount >= 3, TimeScopedRestartCodePackage(1, 01:00:00). +## Disk + +Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), + ?repairCount < 4, + CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), + DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). ## Ports diff --git a/FabricHealer.nuspec.template b/FabricHealer.nuspec.template index 6593c085..e0544e43 100644 --- a/FabricHealer.nuspec.template +++ b/FabricHealer.nuspec.template @@ -4,11 +4,13 @@ %PACKAGE_ID% 1.0.14 -- Updated Disk Repair feature to support new FabricObserver error codes related to Folder Size monitoring. - Updated Disk logic rules with Folder Size Warning repair workflow. -- Added more descriptions to all rules files to help clarify how to compose successful related rules. -- Added ObserverName named argument to Mitigate CompoundTerm. -- Added GetRepairRulesForSupportedObserver function. +- Added more descriptions to all rules files to help clarify how to compose successful related logic. +- Added ObserverName named argument to Mitigate CompoundTerm (e.g., Mitigate(ObserverName=DiskObserver) :- ...). +- Added GetRepairRulesForSupportedObserver function to add more flexibility to getting related rules Lists. This will help limit required FH code changes to support new FO capabilities. +- Renamed rules text files to '[repair type].guan'. 
Ex: AppRules.guan, DiskRules.guan, etc. +- EnableTelemetryProvider is now an Application Parameter. +- Code improvements. Microsoft MIT diff --git a/FabricHealer/FabricHealer.csproj b/FabricHealer/FabricHealer.csproj index 3e980d5a..c351c538 100644 --- a/FabricHealer/FabricHealer.csproj +++ b/FabricHealer/FabricHealer.csproj @@ -39,4 +39,24 @@ + + + Never + + + Never + + + Never + + + Never + + + Never + + + Never + + \ No newline at end of file diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index 0cf226ff..315c128d 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -341,12 +341,8 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( { try { - using var telemetryEvents = new TelemetryEvents( - fabricClient, - serviceContext, - ServiceEventSource.Current, - Token, - EtwEnabled); + using var telemetryEvents = + new TelemetryEvents(fabricClient, serviceContext, ServiceEventSource.Current, Token, EtwEnabled); var fhData = new FabricHealerCriticalErrorEventData { @@ -556,10 +552,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( } // Check to see if an Azure tenant update is in progress. Do not conduct repairs if so. 
- if (await UpgradeChecker.IsAzureTenantUpdateInProgress( - fabricClient, - serviceContext.NodeContext.NodeType, - Token).ConfigureAwait(false)) + if (await UpgradeChecker.IsAzureTenantUpdateInProgress(fabricClient, serviceContext.NodeContext.NodeType, Token).ConfigureAwait(false)) { return; } @@ -583,11 +576,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( { await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates).ConfigureAwait(false); } - catch (Exception e) when ( - e is FabricException || - e is OperationCanceledException || - e is TaskCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, @@ -596,7 +585,6 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( Token, null, ConfigSettings.EnableVerboseLogging); - } } else if (kind != null && kind.Contains("Application")) @@ -610,11 +598,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( { await ProcessApplicationHealthAsync(clusterHealth.ApplicationHealthStates).ConfigureAwait(false); } - catch (Exception e) when ( - e is FabricException || - e is OperationCanceledException || - e is TaskCanceledException || - e is TimeoutException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, @@ -637,11 +621,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( { await ProcessReplicaHealthAsync(evaluation).ConfigureAwait(false); } - catch (Exception e) when ( - e is FabricException || - e is TimeoutException || - e is TaskCanceledException || - e is OperationCanceledException) + catch (Exception e) when (e is FabricException || e is TimeoutException) { await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, @@ -654,11 +634,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( } } } - catch (Exception e) when 
(e is FabricException || e is OperationCanceledException || e is TimeoutException) - { - return; - } - catch (Exception e) + catch (Exception e) when (!(e is OperationCanceledException || e is TaskCanceledException)) { await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Error, @@ -814,7 +790,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( } } - repairRules = GetRepairRulesFromFOCode(foHealthData.Code, RepairConstants.SystemAppName); + repairRules = GetRepairRulesForSupportedObserver(RepairConstants.FabricSystemObserver); if (repairRules == null || repairRules?.Count == 0) { @@ -861,7 +837,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( continue; } - repairRules = GetRepairRulesFromFOCode(foHealthData.Code); + repairRules = GetRepairRulesForSupportedObserver(RepairConstants.AppObserver); // Nothing to do here. if (repairRules == null || repairRules?.Count == 0) @@ -1036,8 +1012,8 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( } } - // Get configuration settings related to supported Node repair. - var repairRules = GetRepairRulesFromFOCode(foHealthData.Code); + // Get repair rules related to supported Node repair. + var repairRules = GetRepairRulesForSupportedObserver(foHealthData.ObserverName); if (repairRules == null || repairRules.Count == 0) { @@ -1316,19 +1292,36 @@ private List GetRepairRulesForSupportedObserver(string observerName) private List GetRepairRulesFromConfiguration(string repairPolicySectionName) { - // Get config filename and read lines from file. 
- string logicRulesConfigFileName = GetSettingParameterValue( - serviceContext, - repairPolicySectionName, - RepairConstants.LogicRulesConfigurationFile); - - var configPath = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path; - var rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName); - var rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName); - List rules = File.ReadAllLines(rulesFilePath).ToList(); - List repairRules = ParseRulesFile(rules); + try + { + string logicRulesConfigFileName = GetSettingParameterValue( + serviceContext, + repairPolicySectionName, + RepairConstants.LogicRulesConfigurationFile); - return repairRules; + var configPath = serviceContext.CodePackageActivationContext.GetConfigurationPackageObject("Config").Path; + var rulesFolderPath = Path.Combine(configPath, RepairConstants.LogicRulesFolderName); + var rulesFilePath = Path.Combine(rulesFolderPath, logicRulesConfigFileName); + + if (!File.Exists(rulesFilePath)) + { + return null; + } + + string[] rules = File.ReadAllLines(rulesFilePath); + + if (rules.Length == 0) + { + return null; + } + + List repairRules = ParseRulesFile(rules); + return repairRules; + } + catch (Exception ex) when (ex is ArgumentException || ex is IOException) + { + return null; + } } private int GetEnabledRepairRuleCount() @@ -1372,13 +1365,13 @@ public void Dispose() Dispose(true); } - private static List ParseRulesFile(List rules) + private static List ParseRulesFile(string[] rules) { var repairRules = new List(); int ptr1 = 0, ptr2 = 0; - rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToList(); + rules = rules.Where(s => !string.IsNullOrWhiteSpace(s)).ToArray(); - while (ptr1 < rules.Count && ptr2 < rules.Count) + while (ptr1 < rules.Length && ptr2 < rules.Length) { // Single line comments removal. 
if (rules[ptr2].TrimStart().StartsWith("##")) diff --git a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan similarity index 100% rename from FabricHealer/PackageRoot/Config/LogicRules/AppRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan diff --git a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan similarity index 86% rename from FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan index 4b793084..b4ce2414 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.config.txt +++ b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan @@ -52,7 +52,7 @@ ## of the same argument values (less rules to write..). Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, - member(config(?X,?Y), [config("C:\SFDevCluster\Log\QueryTraces", 50), config("C:\fabric_observer_logs", 1), config("E:\temp", 10)]), + member(config(?X,?Y), [config("D:\SvcFab\Log\Traces", 50), config("C:\fabric_observer_logs", 1), config("E:\temp", 10)]), CheckFolderSize(?X, MaxFolderSizeGB=?Y), DeleteFiles(?X, SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true). @@ -62,13 +62,18 @@ Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHi ## that are also mathematical operators (*,+,/,-,%, etc...). Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, - CheckFolderSize("C:\SFDevCluster\Log\Traces", MaxFolderSizeGB=20), - DeleteFiles("C:\SFDevCluster\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true, SearchPattern="SFBDMiniport_traces*"). 
+ CheckFolderSize("E:\SvcFab\Log\QueryTraces", MaxFolderSizeGB=20), + DeleteFiles("E:\SvcFab\Log\QueryTraces", SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true, SearchPattern="SFBDMiniport_traces*"). ## Folder size Warning, check ErrorCode from FO (you could also check MetricName -> MetricName=FolderSizeMB, but we already do that in several places). ## See FOErrorWarningCodes.cs for list of codes and renaming function (GetMetricNameFromCode). -## Note: FH renames FO MetricNames to make is easier to use them in rules (often this means making them smaller in length without losing meaning). Mitigate(FOErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, - CheckFolderSize("C:\SFDevCluster\Log\Traces", MaxFolderSizeMB=500), - DeleteFiles("C:\SFDevCluster\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=5, RecurseSubdirectories=false). \ No newline at end of file + CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), + DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). + +## Constrain on folder size Error or Warning code. +Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), + ?repairCount < 4, + CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), + DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). 
\ No newline at end of file diff --git a/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.guan similarity index 100% rename from FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/FabricNodeRules.guan diff --git a/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.guan similarity index 100% rename from FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/ReplicaRules.guan diff --git a/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan similarity index 100% rename from FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan diff --git a/FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt b/FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan similarity index 100% rename from FabricHealer/PackageRoot/Config/LogicRules/VmRules.config.txt rename to FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan diff --git a/FabricHealer/PackageRoot/Config/Settings.xml b/FabricHealer/PackageRoot/Config/Settings.xml index a0e2c2d2..8b3c9107 100644 --- a/FabricHealer/PackageRoot/Config/Settings.xml +++ b/FabricHealer/PackageRoot/Config/Settings.xml @@ -32,26 +32,26 @@ Overridable Parameters (Enabled) must be set in ApplicationManifest.xml. -->
- +
- +
- +
- +
- +
- +
diff --git a/FabricHealer/Repair/Guan/GetRepairHistoryPredicateType.cs b/FabricHealer/Repair/Guan/GetRepairHistoryPredicateType.cs index 7159cdca..189a62ee 100644 --- a/FabricHealer/Repair/Guan/GetRepairHistoryPredicateType.cs +++ b/FabricHealer/Repair/Guan/GetRepairHistoryPredicateType.cs @@ -40,9 +40,11 @@ protected override async Task GetNextTermAsync() } else { - string message = "You must supply a valid TimeSpan string for TimeWindow argument of GetRepairHistoryPredicate. Default result has been supplied (0)."; + string message = + "You must supply a valid TimeSpan string for TimeWindow argument of GetRepairHistoryPredicate. " + + "Default result has been supplied (0)."; - await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, $"GetRepairHistoryPredicate::{FOHealthData.RepairId}", message, diff --git a/FabricHealer/Repair/RepairTaskManager.cs b/FabricHealer/Repair/RepairTaskManager.cs index 57675dd2..d4d1b215 100644 --- a/FabricHealer/Repair/RepairTaskManager.cs +++ b/FabricHealer/Repair/RepairTaskManager.cs @@ -505,7 +505,7 @@ public async Task ScheduleFabricHealerRepairTaskAsync(RepairConfigur return null; } - // Don't attempt a node level repair on a node where there is already an active node-level repair. + // Don't attempt a node-level repair on a node where there is already an active node-level repair. 
var currentlyExecutingRepairs = await FabricClientInstance.RepairManager.GetRepairTaskListAsync( RepairTaskEngine.FHTaskIdPrefix, diff --git a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml index 1caf9fcd..53f21730 100644 --- a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -15,7 +15,7 @@ - + diff --git a/TelemetryLib/TelemetryLib.csproj b/TelemetryLib/TelemetryLib.csproj index a23fdd94..e27e0722 100644 --- a/TelemetryLib/TelemetryLib.csproj +++ b/TelemetryLib/TelemetryLib.csproj @@ -7,8 +7,8 @@ TelemetryLib netstandard2.0 x64 - 1.0.0.0 - 1.0.0.0 + 2.0.0.0 + 2.0.0.0 Copyright © 2020 TelemetryLib AnyCPU;x64 From 8b850d5d09c46b96d7a990bfdc109fbdd9a6734b Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 16 Feb 2022 15:25:49 -0800 Subject: [PATCH 3/9] Code changes. New rules. Updated tests. --- FHTest/FHUnitTests.cs | 33 ++++++++++--------- FHTest/testrules_wellformed | 5 +++ FabricHealer/FabricHealerManager.cs | 2 +- .../Config/LogicRules/DiskRules.guan | 8 ++++- .../Guan/CheckFolderSizePredicateType.cs | 10 ++++++ .../Repair/Guan/DeleteFilesPredicateType.cs | 10 ++++++ .../Repair/Guan/GuanQueryDispatcher.cs | 6 ++-- FabricHealer/Repair/RepairTaskManager.cs | 6 ++-- .../ApplicationManifest.xml | 2 +- 9 files changed, 57 insertions(+), 25 deletions(-) diff --git a/FHTest/FHUnitTests.cs b/FHTest/FHUnitTests.cs index 6c3cc780..19bedb5c 100644 --- a/FHTest/FHUnitTests.cs +++ b/FHTest/FHUnitTests.cs @@ -110,7 +110,7 @@ public static async Task TestClassCleanupAsync() // Currently, the tests below validate logic rules and the successful scheduling of related local repair jobs. // This test ensures your shipping rule files (the guan files located in Config/LogicRules folder) - // contain correctly written rules and that the related local repair jobs are successfully created. 
+ // contain correctly written rules and that the related local repair job is successfully created. [TestMethod] public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_QueryInitialized() { @@ -124,6 +124,7 @@ public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_Quer TelemetryEnabled = false }; + // This will be the mock data used to create a repair task. var foHealthData = new TelemetryData { ApplicationName = "fabric:/test", @@ -131,6 +132,7 @@ public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_Quer RepairId = "Test42", Code = FOErrorWarningCodes.AppErrorMemoryMB, ServiceName = "fabric:/test0/service0", + Value = 1024.0 }; var executorData = new RepairExecutorData @@ -140,24 +142,20 @@ public async Task TestGuanLogic_AllRules_FabricHealer_EnsureWellFormedRules_Quer foreach (var file in Directory.GetFiles(FHRulesDirectory)) { - List repairRules = ParseRulesFile((await File.ReadAllLinesAsync(file, token))); + List repairRules = ParseRulesFile(await File.ReadAllLinesAsync(file, token)); try { - Assert.IsTrue(await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData).ConfigureAwait(true)); + await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData); } catch (GuanException ge) { - Console.WriteLine(ge.ToString()); - throw; + throw new AssertFailedException(ge.Message, ge); } } - - Assert.IsTrue(true); } - // This test ensures your test rules housed testrules_wellformed file contain correctly written rules - // and that the related local repair jobs are successfully created. + // This test ensures your test rules housed in testrules_wellformed file are in fact correct. 
[TestMethod] public async Task TestGuanLogicRule_GoodRule_QueryInitialized() { @@ -192,11 +190,17 @@ public async Task TestGuanLogicRule_GoodRule_QueryInitialized() RepairPolicy = new RepairPolicy { RepairAction = RepairActionType.RestartCodePackage }, }; - Assert.IsTrue(await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData).ConfigureAwait(true)); + try + { + await TestInitializeGuanAndRunQuery(foHealthData, repairRules, executorData); + } + catch (GuanException ge) + { + throw new AssertFailedException(ge.Message, ge); + } } - // All rules in target rules file are malformed. They should all lead to GuanExceptions. - // If they do not lead to a GuanException from TestInitializeGuanAndRunQuery, then this test will fail. + // This test ensures your test rules housed in testrules_malformed file are in fact incorrect. [TestMethod] public async Task TestGuanLogicRule_BadRule_ShouldThrowGuanException() { @@ -236,7 +240,7 @@ public async Task TestGuanLogicRule_BadRule_ShouldThrowGuanException() /* private Helpers */ - private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthData, List repairRules, RepairExecutorData executorData) + private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthData, List repairRules, RepairExecutorData executorData) { var fabricClient = new FabricClient(); var repairTaskManager = new RepairTaskManager(fabricClient, context, token); @@ -292,8 +296,7 @@ private async Task TestInitializeGuanAndRunQuery(TelemetryData foHealthDat compoundTerm.AddArgument(new Constant(Convert.ToInt64(foHealthData.Value)), RepairConstants.MetricValue); compoundTerms.Add(compoundTerm); - return await queryDispatcher.RunQueryAsync(compoundTerms).ConfigureAwait(false); - //return Task.FromResult(true); + await queryDispatcher.RunQueryAsync(compoundTerms).ConfigureAwait(false); } private static List ParseRulesFile(string[] rules) diff --git a/FHTest/testrules_wellformed b/FHTest/testrules_wellformed index 52c6e160..6493e9b8 
100644 --- a/FHTest/testrules_wellformed +++ b/FHTest/testrules_wellformed @@ -74,6 +74,11 @@ Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). +Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), + ?repairCount < 4, + CheckFolderSize("%SOMEPATHVAR%", MaxFolderSizeGB=50), + DeleteFiles("%SOMEPATHVAR%", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). + ## Ports ## Local Active TCP Ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Active Ports, then heal. There are no conditional checks (on MetricValue) to take place. diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index 315c128d..aec23d13 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -487,7 +487,7 @@ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync( Code = errorCode, }; - _ = await repairTaskManager.RunGuanQueryAsync(foHealthData, repairRules, repairExecutorData).ConfigureAwait(false); + await repairTaskManager.RunGuanQueryAsync(foHealthData, repairRules, repairExecutorData).ConfigureAwait(false); } } catch (Exception e) when (e is FabricException || e is OperationCanceledException || e is TaskCanceledException) diff --git a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan index b4ce2414..e131676c 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan @@ -76,4 +76,10 @@ Mitigate(FOErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00), Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", 
GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), - DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). \ No newline at end of file + DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). + +## Constrain on folder size Error or Warning code; use environment variable for/in supplied path. Note: Environment variable string must be enclosed in quotes. +Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), + ?repairCount < 4, + CheckFolderSize("%SOMEPATHVAR%", MaxFolderSizeGB=50), + DeleteFiles("%SOMEPATHVAR%", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). \ No newline at end of file diff --git a/FabricHealer/Repair/Guan/CheckFolderSizePredicateType.cs b/FabricHealer/Repair/Guan/CheckFolderSizePredicateType.cs index 39997213..35a5728e 100644 --- a/FabricHealer/Repair/Guan/CheckFolderSizePredicateType.cs +++ b/FabricHealer/Repair/Guan/CheckFolderSizePredicateType.cs @@ -10,6 +10,7 @@ using FabricHealer.Utilities.Telemetry; using System.Threading.Tasks; using System; +using System.Text.RegularExpressions; namespace FabricHealer.Repair.Guan { @@ -58,6 +59,15 @@ protected override async Task CheckAsync() } } + // Contains env variable(s)? 
+ if (folderPath.Contains('%')) + { + if (Regex.Match(folderPath, @"^%[a-zA-Z0-9_]+%").Success) + { + folderPath = Environment.ExpandEnvironmentVariables(folderPath); + } + } + if (!Directory.Exists(folderPath)) { await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( diff --git a/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs b/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs index 3ea5b919..f8b38723 100644 --- a/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs +++ b/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs @@ -9,6 +9,7 @@ using FabricHealer.Utilities.Telemetry; using System.Threading.Tasks; using System.IO; +using System.Text.RegularExpressions; namespace FabricHealer.Repair.Guan { @@ -50,6 +51,15 @@ protected override async Task CheckAsync() bool recurseSubDirectories = false; string path = Input.Arguments[0].Value.GetEffectiveTerm().GetStringValue(); + // Contains env variable(s)? + if (path.Contains('%')) + { + if (Regex.Match(path, @"^%[a-zA-Z0-9_]+%").Success) + { + path = Environment.ExpandEnvironmentVariables(path); + } + } + if (string.IsNullOrWhiteSpace(path)) { throw new GuanException("You must specify a full folder path as the first argument of DeleteFiles predicate."); diff --git a/FabricHealer/Repair/Guan/GuanQueryDispatcher.cs b/FabricHealer/Repair/Guan/GuanQueryDispatcher.cs index 9394c975..073a632d 100644 --- a/FabricHealer/Repair/Guan/GuanQueryDispatcher.cs +++ b/FabricHealer/Repair/Guan/GuanQueryDispatcher.cs @@ -17,7 +17,7 @@ public GuanQueryDispatcher(Module module) module_ = module; } - public async Task RunQueryAsync(string queryExpression) + public async Task RunQueryAsync(string queryExpression) { ResolveOrder order = ResolveOrder.None; ModuleProvider moduleProvider = new ModuleProvider(); @@ -26,10 +26,9 @@ public async Task RunQueryAsync(string queryExpression) queryContext.SetDirection(null, order); Query query = Query.Create(queryExpression, queryContext); await 
query.GetNextAsync().ConfigureAwait(false); - return true; } - public async Task RunQueryAsync(List queryExpressions) + public async Task RunQueryAsync(List queryExpressions) { ResolveOrder order = ResolveOrder.None; ModuleProvider moduleProvider = new ModuleProvider(); @@ -38,7 +37,6 @@ public async Task RunQueryAsync(List queryExpressions) queryContext.SetDirection(null, order); Query query = Query.Create(queryExpressions, queryContext, moduleProvider); await query.GetNextAsync().ConfigureAwait(false); - return true; } } } diff --git a/FabricHealer/Repair/RepairTaskManager.cs b/FabricHealer/Repair/RepairTaskManager.cs index d4d1b215..94f2e736 100644 --- a/FabricHealer/Repair/RepairTaskManager.cs +++ b/FabricHealer/Repair/RepairTaskManager.cs @@ -148,7 +148,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( try { - _ = await RunGuanQueryAsync(foHealthData, repairRules); + await RunGuanQueryAsync(foHealthData, repairRules); } catch (GuanException ge) { @@ -170,7 +170,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( /// Repair rules that are related to target SF entity /// Optional Repair data that is used primarily when some repair is being restarted (after an FH restart, for example) /// - public async Task RunGuanQueryAsync(TelemetryData foHealthData, List repairRules, RepairExecutorData repairExecutorData = null) + public async Task RunGuanQueryAsync(TelemetryData foHealthData, List repairRules, RepairExecutorData repairExecutorData = null) { // Add predicate types to functor table. Note that all health information data from FO are automatically passed to all predicates. 
FunctorTable functorTable = new FunctorTable(); @@ -225,7 +225,7 @@ public async Task RunGuanQueryAsync(TelemetryData foHealthData, List - + From 0f8ea6670775319b7093328709609e8e517d7697 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 16 Feb 2022 15:26:17 -0800 Subject: [PATCH 4/9] default config --- FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml index 1caf9fcd..53f21730 100644 --- a/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml +++ b/FabricHealerApp/ApplicationPackageRoot/ApplicationManifest.xml @@ -15,7 +15,7 @@ - + From 6e30b105535dda1d203147dc5b6f022d65e5d043 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 16 Feb 2022 16:59:01 -0800 Subject: [PATCH 5/9] Finer grained exception handling. --- FabricHealer/FabricHealerManager.cs | 67 +++++++++++++++++++---------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index aec23d13..5090442c 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -659,14 +659,28 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( private async Task ProcessApplicationHealthAsync(IEnumerable appHealthStates) { var supportedAppHealthStates = appHealthStates.Where(a => a.AggregatedHealthState == HealthState.Warning || a.AggregatedHealthState == HealthState.Error); - var nodeList = await fabricClient.QueryManager.GetNodeListAsync().ConfigureAwait(false); + var nodeList = await fabricClient.QueryManager.GetNodeListAsync(null, ConfigSettings.AsyncTimeout, Token).ConfigureAwait(false); foreach (var app in supportedAppHealthStates) { Token.ThrowIfCancellationRequested(); + + ApplicationHealth appHealth = null; + Uri appName = null; - var appHealth = await 
fabricClient.HealthManager.GetApplicationHealthAsync(app.ApplicationName).ConfigureAwait(false); - var appName = app.ApplicationName; + try + { + appHealth = + await fabricClient.HealthManager.GetApplicationHealthAsync( + app.ApplicationName, ConfigSettings.AsyncTimeout, Token).ConfigureAwait(false); + + appName = app.ApplicationName; + } + catch (Exception e) when (e is FabricException || e is TimeoutException) + { + // Application does not exist or health data retrieval fails/times out for some internal reason. Move to next app. + continue; + } // System app target? Do not proceed if system app repair is not enabled. if (appName.OriginalString == RepairConstants.SystemAppName && !ConfigSettings.EnableSystemAppRepair) @@ -682,31 +696,38 @@ private async Task ProcessApplicationHealthAsync(IEnumerable udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(fabricClient, appName, Token).ConfigureAwait(false); - string udText = string.Empty; + var appUpgradeStatus = await fabricClient.ApplicationManager.GetApplicationUpgradeProgressAsync(appName).ConfigureAwait(false); - // -1 means no upgrade in progress for application. - if (udInAppUpgrade.Any(ud => ud > -1)) + if (appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingBackInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardInProgress + || appUpgradeStatus.UpgradeState == ApplicationUpgradeState.RollingForwardPending) { - udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}"; - } + List udInAppUpgrade = await UpgradeChecker.GetUdsWhereApplicationUpgradeInProgressAsync(fabricClient, appName, Token).ConfigureAwait(false); + string udText = string.Empty; - string telemetryDescription = $"{appName} is upgrading {udText}. Will not attempt application repair at this time."; + // -1 means no upgrade in progress for application. 
+ if (udInAppUpgrade.Any(ud => ud > -1)) + { + udText = $"in UD {udInAppUpgrade.First(ud => ud > -1)}"; + } - await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( - LogLevel.Info, - "MonitorRepairableHealthEventsAsync::AppUpgradeDetected", - telemetryDescription, - Token, - null, - ConfigSettings.EnableVerboseLogging).ConfigureAwait(false); - continue; + string telemetryDescription = $"{appName} is upgrading {udText}. Will not attempt application repair at this time."; + + await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( + LogLevel.Info, + "MonitorRepairableHealthEventsAsync::AppUpgradeDetected", + telemetryDescription, + Token, + null, + ConfigSettings.EnableVerboseLogging).ConfigureAwait(false); + continue; + } + } + catch (FabricException) + { + // This upgrade check should not prevent moving forward if the fabric client call fails with an FE. } } From f431fdedae30173fba03c323c977ad4d9b33c8ce Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 23 Feb 2022 09:05:12 -0800 Subject: [PATCH 6/9] 1.0.14 --- FabricHealer/FabricHealerManager.cs | 2 +- .../PackageRoot/Config/LogicRules/AppRules.guan | 14 +++++++------- .../Config/LogicRules/DiskRules.guan | 8 ++++---- .../Config/LogicRules/SystemAppRules.guan | 2 +- .../PackageRoot/Config/LogicRules/VmRules.guan | 2 +- .../Repair/Guan/DeleteFilesPredicateType.cs | 17 ++++++++++++----- 6 files changed, 26 insertions(+), 19 deletions(-) diff --git a/FabricHealer/FabricHealerManager.cs b/FabricHealer/FabricHealerManager.cs index 5090442c..19c16942 100644 --- a/FabricHealer/FabricHealerManager.cs +++ b/FabricHealer/FabricHealerManager.cs @@ -1033,7 +1033,7 @@ await TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( } } - // Get repair rules related to supported Node repair. + // Get repair rules for supported source Observer. 
var repairRules = GetRepairRulesForSupportedObserver(foHealthData.ObserverName); if (repairRules == null || repairRules.Count == 0) diff --git a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan index 55f53efc..da03491f 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan @@ -1,6 +1,6 @@ ## Logic rules for Service Fabric Application-level repairs. -## Applicable Named Arguments for user App service repair - Corresponding data is supplied by FabricObserver, renamed for brevity by FH. +## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, renamed for brevity by FH. ## | Argument Name | Definition | ## |---------------------------|----------------------------------------------------------------------------------------------| ## | AppName | Name of the SF application, format is fabric:/SomeApp | @@ -125,20 +125,20 @@ Mitigate(MetricName="MemoryMB", MetricValue=?MetricValue) :- ?MetricValue >= 102 ?HealthEventCount >= 3, TimeScopedRestartCodePackage(1, 01:00:00). - ## Ports -## Local Active TCP Ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Active Ports, then heal. There are no conditional checks (on MetricValue) to take place. +## Local Active TCP Ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Active Ports, then heal. +## There are no conditional checks. Mitigate(MetricName="ActiveTcpPorts") :- TimeScopedRestartCodePackage(5, 05:00:00). -## Ephemeral Ports - Specific Application: any of its services, constrained on number of local ephemeral ports open. -## 5 repairs within 5 hour window. -Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPorts", MetricValue=?MetricValue) :- ?MetricValue > 5000, TimeScopedRestartCodePackage(5, 05:00:00). - ## Ephemeral TCP Ports - Any app service. 
5 repairs within 5 hour window. This means if FO warns on Ephemeral ports usage, then heal. ## There are no conditional checks. Mitigate(MetricName="EphemeralPorts") :- TimeScopedRestartCodePackage(5, 05:00:00). +## Ephemeral Ports - Specific Application: any of its services, constrained on number of local ephemeral ports open. +## 5 repairs within 5 hour window. +Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPorts", MetricValue=?MetricValue) :- ?MetricValue > 5000, TimeScopedRestartCodePackage(5, 05:00:00). + ## Threads ## Threads - Ignore specific application (FabricObserver, just for example - it's fine to target FO for repairs, generally), constrained on number of threads in use by the offending service process. diff --git a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan index e131676c..f5933dba 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/DiskRules.guan @@ -1,6 +1,6 @@ ## Logic rules for Disk repair. Only file management is supported (file deletion). -## Applicable Named Arguments for Disk repair - Corresponding data is supplied by FabricObserver, renamed for brevity by FH. +## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, renamed for brevity by FH. ## | Argument Name | Definition | ## |---------------------------|----------------------------------------------------------------------------------------------| ## | NodeName | Name of the node | @@ -52,7 +52,7 @@ ## of the same argument values (less rules to write..). 
Mitigate(MetricName=?MetricName) :- match(?MetricName, "DiskSpace"), GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, - member(config(?X,?Y), [config("D:\SvcFab\Log\Traces", 50), config("C:\fabric_observer_logs", 1), config("E:\temp", 10)]), + member(config(?X,?Y), [config("D:\SvcFab\Log\Traces", 50), config("E:\fabric_observer_logs", 1), config("E:\temp", 10)]), CheckFolderSize(?X, MaxFolderSizeGB=?Y), DeleteFiles(?X, SortOrder=Ascending, MaxFilesToDelete=10, RecurseSubdirectories=true). @@ -75,8 +75,8 @@ Mitigate(FOErrorCode=FO043) :- GetRepairHistory(?repairCount, 08:00:00), ## Constrain on folder size Error or Warning code. Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), ?repairCount < 4, - CheckFolderSize("E:\SvcFab\Log\Traces", MaxFolderSizeGB=50), - DeleteFiles("E:\SvcFab\Log\Traces", SortOrder=Ascending, MaxFilesToDelete=25, RecurseSubdirectories=false). + CheckFolderSize("C:\fabric_observer_logs", MaxFolderSizeMB=250), + DeleteFiles("C:\fabric_observer_logs", SortOrder=Ascending, MaxFilesToDelete=5, RecurseSubdirectories=true, SearchPattern="*.dmp"). ## Constrain on folder size Error or Warning code; use environment variable for/in supplied path. Note: Environment variable string must be enclosed in quotes. Mitigate(FOErrorCode=?FOErrorCode) :- ?FOErrorCode == "FO042" || ?FOErrorCode == "FO043", GetRepairHistory(?repairCount, 08:00:00), diff --git a/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan index 4d99f832..8a293b1f 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/SystemAppRules.guan @@ -1,6 +1,6 @@ ## Logic rules for Service Fabric System Service repairs. -## Applicable Named Arguments for System Service Repair - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. 
+## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. ## | Argument Name | Definition | ## |---------------------------|--------------------------------------------------------------------------------------------------------------| ## | AppName* | Name of the SF System Application. *This is always fabric:/System (FO monitors SF system service processes). | diff --git a/FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan index 0d5ad331..cb718e36 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/VmRules.guan @@ -1,6 +1,6 @@ ## Logic rules for Virtual Machine level repairs in the cluster. Only OS reboot is supported today. -## Applicable Named Arguments related to VM repair - Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. +## Applicable Named Arguments for Mitigate. Corresponding data is supplied by FabricObserver, Renamed for brevity by FH. 
## | Argument Name | Definition | ## |---------------------------|----------------------------------------------------------------------------------------------| ## | NodeName | Name of the node | diff --git a/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs b/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs index f8b38723..69b0848c 100644 --- a/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs +++ b/FabricHealer/Repair/Guan/DeleteFilesPredicateType.cs @@ -89,7 +89,7 @@ protected override async Task CheckAsync() break; case "recursesubdirectories": - _ = bool.TryParse(Input.Arguments[i].Value.GetStringValue(), out recurseSubDirectories); + recurseSubDirectories = (bool)Input.Arguments[i].Value.GetEffectiveTerm().GetObjectValue(); break; case "searchpattern": @@ -103,7 +103,7 @@ protected override async Task CheckAsync() if (searchPattern != null) { - if (!ValidateFileSearchPattern(searchPattern, path)) + if (!ValidateFileSearchPattern(searchPattern, path, recurseSubDirectories)) { await RepairTaskManager.TelemetryUtilities.EmitTelemetryEtwHealthEventAsync( LogLevel.Info, @@ -168,16 +168,23 @@ private DeleteFilesPredicateType(string name) } - private static bool ValidateFileSearchPattern(string searchPattern, string path) + private static bool ValidateFileSearchPattern(string searchPattern, string path, bool recurse) { if (string.IsNullOrWhiteSpace(searchPattern) || string.IsNullOrWhiteSpace(path) || !Directory.Exists(path)) { return false; } - if (Directory.GetFiles(path, searchPattern).Length > 0) + try { - return true; + if (Directory.GetFiles(path, searchPattern, recurse ? SearchOption.AllDirectories : SearchOption.TopDirectoryOnly).Length > 0) + { + return true; + } + } + catch (Exception e) when (e is IOException || e is UnauthorizedAccessException) + { + } return false; From 3cf53dbf1d73157175b83dc6767c6878c6658f8b Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Thu, 3 Mar 2022 11:04:51 -0800 Subject: [PATCH 7/9] Support for new FO metric. 
Updated related rules/FO err code processor. --- FabricHealer.nuspec.template | 15 ++++++------- .../Config/LogicRules/AppRules.guan | 21 ++++++++++++------- FabricHealer/Repair/RepairConstants.cs | 1 + FabricHealer/Utilities/FOErrorWarningCodes.cs | 13 ++++++++++++ 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/FabricHealer.nuspec.template b/FabricHealer.nuspec.template index e0544e43..5d203a78 100644 --- a/FabricHealer.nuspec.template +++ b/FabricHealer.nuspec.template @@ -4,13 +4,14 @@ %PACKAGE_ID% 1.0.14 -- Updated Disk logic rules with Folder Size Warning repair workflow. -- Added more descriptions to all rules files to help clarify how to compose successful related logic. -- Added ObserverName named argument to Mitigate CompoundTerm (e.g., Mitigate(ObserverName=DiskObserver) :- ...). -- Added GetRepairRulesForSupportedObserver function to add more flexibility to getting related rules Lists. This will help limit required FH code changes to support new FO capabilities. -- Renamed rules text files to '[repair type].guan'. Ex: AppRules.guan, DiskRules.guan, etc. -- EnableTelemetryProvider is now an Application Parameter. -- Code improvements. +Added support for new FabricObserver ephemeral ports metric (Percentage). +Updated Disk logic rules with Folder Size Warning repair workflow. +Added more descriptions to all rules files to help clarify how to compose successful related logic. +Added ObserverName named argument to Mitigate CompoundTerm (e.g., Mitigate(ObserverName=DiskObserver) :- ...). +Added GetRepairRulesForSupportedObserver function to add more flexibility to getting related rules Lists. This will help limit required FH code changes to support new FO capabilities. +Renamed rules text files to '[repair type].guan'. Ex: AppRules.guan, DiskRules.guan, etc. +EnableTelemetryProvider is now an Application Parameter. +Code improvements. 
Microsoft MIT diff --git a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan index da03491f..8cc0294e 100644 --- a/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan +++ b/FabricHealer/PackageRoot/Config/LogicRules/AppRules.guan @@ -20,7 +20,8 @@ ## |---------------------------| ## | ActiveTcpPorts | ## | CpuPercent | -## | EphemeralPorts | +## | EphemeralPorts | +## | EphemeralPortsPercent | ## | EndpointUnreachable* | ## | MemoryMB | ## | MemoryPercent | @@ -127,17 +128,21 @@ Mitigate(MetricName="MemoryMB", MetricValue=?MetricValue) :- ?MetricValue >= 102 ## Ports -## Local Active TCP Ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Active Ports, then heal. -## There are no conditional checks. +## Local Active TCP Ports. +## Any app service. 5 repairs within 5 hour window. This means if FO warns on Active Ports, then heal. Mitigate(MetricName="ActiveTcpPorts") :- TimeScopedRestartCodePackage(5, 05:00:00). -## Ephemeral TCP Ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Ephemeral ports usage, then heal. -## There are no conditional checks. -Mitigate(MetricName="EphemeralPorts") :- TimeScopedRestartCodePackage(5, 05:00:00). +## Local Ports in Dynamic Range (aka Ephemeral Ports) +## Percentage in use or Total Ephemeral ports - Any app service. 5 repairs within 5 hour window. This means if FO warns on Ephemeral ports usage, then heal. +Mitigate(MetricName=?MetricName) :- match(?MetricName, "EphemeralPorts"), TimeScopedRestartCodePackage(5, 05:00:00). -## Ephemeral Ports - Specific Application: any of its services, constrained on number of local ephemeral ports open. +## Total Ephemeral Ports - Specific Application: any of its services, constrained on number of local ephemeral ports open. ## 5 repairs within 5 hour window. 
-Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPorts", MetricValue=?MetricValue) :- ?MetricValue > 5000, TimeScopedRestartCodePackage(5, 05:00:00). +Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPorts", MetricValue=?MetricValue) :- ?MetricValue > 7000, TimeScopedRestartCodePackage(5, 05:00:00). + +## Percentage Ephemeral Ports - Specific Application: any of its services, constrained on ephemeral ports percent (of total) usage. +## 5 repairs within 5 hour window. +Mitigate(AppName="fabric:/MyApp42", MetricName="EphemeralPortsPercent", MetricValue=?MetricValue) :- ?MetricValue > 20, TimeScopedRestartCodePackage(5, 05:00:00). ## Threads diff --git a/FabricHealer/Repair/RepairConstants.cs b/FabricHealer/Repair/RepairConstants.cs index 027d7840..de2f59d9 100644 --- a/FabricHealer/Repair/RepairConstants.cs +++ b/FabricHealer/Repair/RepairConstants.cs @@ -85,6 +85,7 @@ public static class RepairConstants public const string FolderSizeMB = "FolderSizeMB"; public const string DiskSpacePercent = "DiskSpacePercent"; public const string EphemeralPorts = "EphemeralPorts"; + public const string EphemeralPortsPercent = "EphemeralPortsPercent"; public const string EndpointUnreachable = "EndpointUnreachable"; public const string FirewallRules = "FirewallRules"; public const string MemoryMB = "MemoryMB"; diff --git a/FabricHealer/Utilities/FOErrorWarningCodes.cs b/FabricHealer/Utilities/FOErrorWarningCodes.cs index c02398c1..f0c7e3bd 100644 --- a/FabricHealer/Utilities/FOErrorWarningCodes.cs +++ b/FabricHealer/Utilities/FOErrorWarningCodes.cs @@ -58,6 +58,10 @@ public static class FOErrorWarningCodes public const string AppWarningTooManyActiveEphemeralPorts = "FO030"; public const string NodeErrorTooManyActiveEphemeralPorts = "FO031"; public const string NodeWarningTooManyActiveEphemeralPorts = "FO032"; + public const string AppErrorActiveEphemeralPortsPercent = "FO044"; + public const string AppWarningActiveEphemeralPortsPercent = "FO045"; + public const 
string NodeErrorActiveEphemeralPortsPercent = "FO046"; + public const string NodeWarningActiveEphemeralPortsPercent = "FO047"; // Process owned File Handles / File Descriptors - Linux (File Descriptors) and Windows (File Handles) public const string AppErrorTooManyOpenFileHandles = "FO033"; @@ -94,6 +98,8 @@ public static Dictionary AppErrorCodesDictionary { AppWarningTooManyActiveTcpPorts, "AppWarningTooManyActiveTcpPorts" }, { AppErrorTooManyActiveEphemeralPorts, "AppErrorTooManyActiveEphemeralPorts" }, { AppWarningTooManyActiveEphemeralPorts, "AppWarningTooManyActiveEphemeralPorts" }, + { AppErrorActiveEphemeralPortsPercent, "AppErrorActiveEphemeralPortsPercent" }, + { AppWarningActiveEphemeralPortsPercent, "AppWarningActiveEphemeralPortsPercent" }, { AppErrorTooManyOpenFileHandles, "AppErrorTooManyOpenFileHandles" }, { AppWarningTooManyOpenFileHandles, "AppWarningTooManyOpenFileHandles" }, { AppErrorTooManyThreads, "AppErrorTooManyThreads" }, @@ -129,6 +135,8 @@ public static Dictionary NodeErrorCodesDictionary { WarningTooManyFirewallRules, "NodeWarningTooManyFirewallRules" }, { NodeErrorTooManyActiveEphemeralPorts, "NodeErrorTooManyActiveEphemeralPorts" }, { NodeWarningTooManyActiveEphemeralPorts, "NodeWarningTooManyActiveEphemeralPorts" }, + { NodeErrorActiveEphemeralPortsPercent, "NodeErrorActiveEphemeralPortsPercent" }, + { NodeWarningActiveEphemeralPortsPercent, "NodeWarningActiveEphemeralPortsPercent" }, { NodeErrorTotalOpenFileHandlesPercent, "NodeErrorTotalOpenFileHandlesPercent" }, { NodeWarningTotalOpenFileHandlesPercent, "NodeWarningTotalOpenFileHandlesPercent" }, { NodeErrorTooManyOpenFileHandles, "NodeErrorTooManyOpenFileHandles" }, @@ -192,6 +200,11 @@ public static string GetMetricNameFromCode(string code) return RepairConstants.EndpointUnreachable; } + if (GetIsResourceType(code, RepairConstants.EphemeralPortsPercent)) + { + return RepairConstants.EphemeralPortsPercent; + } + if (GetIsResourceType(code, RepairConstants.EphemeralPorts)) { return 
RepairConstants.EphemeralPorts; From 572bcd3284d51a8d958b86df160af631a49905ee Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Wed, 9 Mar 2022 10:41:53 -0800 Subject: [PATCH 8/9] nuspec++ --- FabricHealer.nuspec.template | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FabricHealer.nuspec.template b/FabricHealer.nuspec.template index 5d203a78..b539665a 100644 --- a/FabricHealer.nuspec.template +++ b/FabricHealer.nuspec.template @@ -4,7 +4,7 @@ %PACKAGE_ID% 1.0.14 -Added support for new FabricObserver ephemeral ports metric (Percentage). +Added support for new FabricObserver 3.1.25 - new ephemeral ports metric (Percentage in use of total dynamic ports configured for machine). Updated Disk logic rules with Folder Size Warning repair workflow. Added more descriptions to all rules files to help clarify how to compose successful related logic. Added ObserverName named argument to Mitigate CompoundTerm (e.g., Mitigate(ObserverName=DiskObserver) :- ...). From 5230c42580646da12eee5291956adc895005c838 Mon Sep 17 00:00:00 2001 From: Charles Torre Date: Mon, 14 Mar 2022 15:56:54 -0700 Subject: [PATCH 9/9] Bug fixes --- FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs | 2 +- FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs | 2 +- .../Repair/Guan/RestartFabricSystemProcessPredicateType.cs | 2 +- FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs | 2 +- FabricHealer/Repair/Guan/RestartVMPredicateType.cs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs b/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs index e702abd8..9bb01981 100644 --- a/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartCodePackagePredicateType.cs @@ -59,7 +59,7 @@ protected override async Task CheckAsync() break; case "Boolean": - repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetObjectValue(); + 
repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue(); break; default: diff --git a/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs b/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs index 8d680335..58a4c405 100644 --- a/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartFabricNodePredicateType.cs @@ -60,7 +60,7 @@ protected override async Task CheckAsync() break; case "Boolean": - repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetObjectValue(); + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue(); break; default: diff --git a/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs b/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs index 5d9d3174..281f7fe5 100644 --- a/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartFabricSystemProcessPredicateType.cs @@ -67,7 +67,7 @@ protected override async Task CheckAsync() break; case "Boolean": - repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetObjectValue(); + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue(); break; default: diff --git a/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs b/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs index 29843fec..426f6b1b 100644 --- a/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartReplicaPredicateType.cs @@ -56,7 +56,7 @@ protected override async Task CheckAsync() break; case "Boolean": - repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetObjectValue(); + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue(); break; default: diff --git a/FabricHealer/Repair/Guan/RestartVMPredicateType.cs 
b/FabricHealer/Repair/Guan/RestartVMPredicateType.cs index b60bae01..6d2ce982 100644 --- a/FabricHealer/Repair/Guan/RestartVMPredicateType.cs +++ b/FabricHealer/Repair/Guan/RestartVMPredicateType.cs @@ -57,7 +57,7 @@ protected override async Task CheckAsync() break; case "Boolean": - repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[0].Value.GetObjectValue(); + repairConfiguration.RepairPolicy.DoHealthChecks = (bool)Input.Arguments[i].Value.GetObjectValue(); break; default: