Skip to content

Commit

Permalink
FH 1.1.0.831
Browse files Browse the repository at this point in the history
FH 1.1.0.831
  • Loading branch information
GitTorre authored Jul 12, 2022
2 parents a67fbc4 + 1dd706c commit a93d872
Show file tree
Hide file tree
Showing 99 changed files with 6,354 additions and 2,822 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,5 @@ ASALocalRun/
# BeatPulse healthcheck temp database
healthchecksdb

**/PublishProfiles/Cloud.xml
**/PublishProfiles/Cloud.xml
/nuget.exe
15 changes: 15 additions & 0 deletions Build-FHProxy.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Publishes the FabricHealerProxy project in Release configuration.
# The script runs from its own directory and always restores the caller's
# location, even when the build fails.
$ErrorActionPreference = "Stop"

$Configuration="Release"
[string] $scriptPath = Split-Path -Parent $MyInvocation.MyCommand.Definition

try {
    Push-Location $scriptPath

    # Remove output from a previous build; errors from this cleanup step are
    # deliberately suppressed (-EA SilentlyContinue).
    Remove-Item $scriptPath\FabricHealerProxy\bin\release\netstandard2.0\ -Recurse -Force -EA SilentlyContinue

    # NOTE(review): -o is a relative path; on current .NET SDKs it is resolved against
    # the working directory ($scriptPath), not the project folder cleaned above - confirm
    # that this is the intended output location.
    dotnet publish $scriptPath\FabricHealerProxy\FabricHealerProxy.csproj -o bin\release\netstandard2.0 -c $Configuration

    # By default $ErrorActionPreference does not turn a native command's non-zero exit
    # code into an error, so a failed publish must be detected explicitly.
    if ($LASTEXITCODE -ne 0) {
        throw "dotnet publish failed with exit code $LASTEXITCODE."
    }
}
finally {
    Pop-Location
}
45 changes: 45 additions & 0 deletions Build-FHProxyNupkg.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Makes sure nuget.exe exists in the script's directory, downloading the latest
# build from nuget.org only when the file is not already present.
# Reads $scriptPath from the enclosing script scope.
function Install-Nuget {
    # nuget.exe is kept at the top level of the repo, next to this script.
    $nugetExePath = "$scriptPath\nuget.exe"

    # Already downloaded: nothing to do.
    if ([System.IO.File]::Exists($nugetExePath)) {
        return
    }

    # Location of the latest nuget.exe published on nuget.org.
    $latestNugetUrl = "https://dist.nuget.org/win-x86-commandline/latest/nuget.exe"

    Invoke-WebRequest -Uri $latestNugetUrl -OutFile $nugetExePath
}

# Creates a .nuspec from FabricHealerProxy.nuspec.template and packs it with the
# nuget.exe found in the current directory.
#
# Parameters:
#   packageId - substituted for %PACKAGE_ID% in the template; also used as the
#               file name of the generated .nuspec.
#   basePath  - passed to "nuget pack" as -basepath.
#
# Reads $scriptPath from the enclosing script scope (also substituted for
# %ROOT_PATH% in the template). Throws when "nuget pack" exits with a non-zero code.
function Build-Nuget {
    param (
        [string]
        $packageId,

        [string]
        $basePath
    )

    [string] $nugetSpecTemplate = [System.IO.File]::ReadAllText([System.IO.Path]::Combine($scriptPath, "FabricHealerProxy.nuspec.template"))

    # The generated spec is written into the proxy's build output folder.
    [string] $nugetSpecPath = "$scriptPath\FabricHealerProxy\bin\release\netstandard2.0\$($packageId).nuspec"

    [System.IO.File]::WriteAllText($nugetSpecPath, $nugetSpecTemplate.Replace("%PACKAGE_ID%", $packageId).Replace("%ROOT_PATH%", $scriptPath))

    .\nuget.exe pack $nugetSpecPath -basepath $basePath -OutputDirectory bin\release\FabricHealerProxy\Nugets -properties NoWarn=NU5100,NU5128

    # A native command's exit code is not surfaced as a PowerShell error by default,
    # so a failed pack would otherwise go unnoticed.
    if ($LASTEXITCODE -ne 0) {
        throw "nuget pack failed for '$packageId' with exit code $LASTEXITCODE."
    }
}

# Entry point: resolve the directory containing this script, then fetch nuget.exe
# (if needed) and build the FabricHealerProxy NuGet package from that directory.
[string] $scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent

try {
    Push-Location -Path $scriptPath

    Install-Nuget

    Build-Nuget -packageId "Microsoft.ServiceFabricApps.FabricHealerProxy" -basePath "$scriptPath\FabricHealerProxy\bin\release\netstandard2.0"
}
finally {
    # Restore the caller's location whether or not packaging succeeded.
    Pop-Location
}
8 changes: 4 additions & 4 deletions Build-SFPKGs.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ function Build-SFPkg {
try {
# Work from the script's directory; the finally block below restores the previous location.
Push-Location $scriptPath

# NOTE(review): this view is a diff hunk, so both the 1.0.15 and the 1.1.0.831 invocations appear.
# Presumably the 1.0.15 lines are the removed side and only the 1.1.0.831 lines remain in the file - confirm.
# Build-SFPkg is defined outside this view; each call passes a package name and a FabricHealerType folder.

# Linux packages (self-contained and framework-dependent).
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.0.15" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.0.15" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.SelfContained.1.1.0.831" "$scriptPath\bin\release\FabricHealer\linux-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Linux.FrameworkDependent.1.1.0.831" "$scriptPath\bin\release\FabricHealer\linux-x64\framework-dependent\FabricHealerType"

# Windows packages (self-contained and framework-dependent).
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.0.15" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.0.15" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.0.831" "$scriptPath\bin\release\FabricHealer\win-x64\self-contained\FabricHealerType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricHealer.Windows.FrameworkDependent.1.1.0.831" "$scriptPath\bin\release\FabricHealer\win-x64\framework-dependent\FabricHealerType"
}
finally {
Pop-Location
Expand Down
17 changes: 17 additions & 0 deletions Documentation/Deployment/Deploy-FabricHealer.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@

# Deploys FabricHealer into an existing resource group using the ARM template and
# parameter file named below. Replace every <...> placeholder before running.
$subscriptionId = "<YOUR-AZURE-SUBSCRIPTION-ID>"

# Try to select the subscription first; if that fails, sign in and then set the context.
try {
    Select-AzSubscription -SubscriptionId $subscriptionId -ErrorAction Stop
}
catch {
    Login-AzAccount
    Set-AzContext -SubscriptionId $subscriptionId
}

$resourceGroup = "<YOUR-CLUSTER-RESOURCE-NAME>"
$armTemplate = "service-fabric-healer.json"
$armTemplateParameters = "service-fabric-healer.v1.1.0.831.parameters.json"

# The template and parameter file names above are relative, so switch to their folder.
Set-Location "<LOCAL-FH-REPO-PATH>\Documentation\Deployment"

$deployment = @{
    Name                  = "deploy-service-fabric-healer"
    ResourceGroupName     = $resourceGroup
    TemplateFile          = $armTemplate
    TemplateParameterFile = $armTemplateParameters
    Mode                  = "Incremental"
    Verbose               = $true
}

New-AzResourceGroupDeployment @deployment

29 changes: 29 additions & 0 deletions Documentation/Deployment/Deployment.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Deploy FabricHealer using ARM

Like other foundational services (resource monitoring, Windows Update, scripts, scaling, and so on), FabricHealer should be installed as part of your initial Service Fabric cluster deployment.

There are two options:
1. Add the resources from the ARM template [service-fabric-healer.json](service-fabric-healer.json) to the template that also deploys the Service Fabric cluster.
To guarantee the correct deployment order, the first resource (the application type) has to depend on the cluster resource.
Using 'dependsOn' makes sure that the Service Fabric Resource Provider deploys the application only after the cluster deployment step has completed.

```json
{
"apiVersion": "[variables('sfrpApiVersion')]",
"type": "Microsoft.ServiceFabric/clusters/applicationTypes",
"name": "[concat(parameters('clusterName'), '/', variables('applicationTypeNameFabricHealer'))]",
"location": "[resourceGroup().location]",
"dependsOn": [
"[concat('Microsoft.ServiceFabric/clusters/', parameters('clusterName'))]"
],
"properties": {
"provisioningState": "Default"
}
},
```

2. The app can be deployed manually by using the provided PowerShell script file [Deploy-FabricHealer.ps1](Deploy-FabricHealer.ps1).


## Further reading
- [Upgrade the Service Fabric application by using Resource Manager](https://docs.microsoft.com/en-us/azure/service-fabric/service-fabric-concept-resource-model#upgrade-the-service-fabric-application-by-using-resource-manager)
116 changes: 116 additions & 0 deletions Documentation/Deployment/service-fabric-healer.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"clusterName": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "The Service Fabric cluster resource name from the Azure resource group. Example: servicefabriccluster123"
}
},
"applicationTypeVersionFabricHealer": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "Provide the app version number of FabricHealer. This must match the version of the package referenced by the packageUrlFabricHealer parameter. Example: '1.1.0.831'"
}
},
"packageUrlFabricHealer": {
"type": "string",
"defaultValue": "",
"metadata": {
"description": "This has to be a publicly accessible URL for the sfpkg file that contains the FabricHealer app package. Example: https://github.com/microsoft/service-fabric-healer/releases/download/48361039/Microsoft.ServiceFabricApps.FabricHealer.Windows.SelfContained.1.1.0.831.sfpkg"
}
}
},
"variables": {
"applicationTypeNameFabricHealer": "FabricHealerType",
"applicationNameFabricHealer": "FabricHealerApplication",
"serviceNameFabricHealer": "[concat(variables('applicationNameFabricHealer'), '~FabricHealerService')]",
"serviceTypeNameFabricHealer": "FabricHealerType",
"sfrpApiVersion": "2021-06-01"
},
"resources": [
{
"apiVersion": "[variables('sfrpApiVersion')]",
"type": "Microsoft.ServiceFabric/clusters/applicationTypes",
"name": "[concat(parameters('clusterName'), '/', variables('applicationTypeNameFabricHealer'))]",
"location": "[resourceGroup().location]",
"properties": {
"provisioningState": "Default"
}
},
{
"apiVersion": "[variables('sfrpApiVersion')]",
"type": "Microsoft.ServiceFabric/clusters/applicationTypes/versions",
"name": "[concat(parameters('clusterName'), '/', variables('applicationTypeNameFabricHealer'), '/', parameters('applicationTypeVersionFabricHealer'))]",
"location": "[resourceGroup().location]",
"dependsOn": [
"[concat('Microsoft.ServiceFabric/clusters/', parameters('clusterName'), '/applicationTypes/', variables('applicationTypeNameFabricHealer'))]"
],
"properties": {
"provisioningState": "Default",
"appPackageUrl": "[parameters('packageUrlFabricHealer')]"
}
},
{
"apiVersion": "[variables('sfrpApiVersion')]",
"type": "Microsoft.ServiceFabric/clusters/applications",
"name": "[concat(parameters('clusterName'), '/', variables('applicationNameFabricHealer'))]",
"location": "[resourceGroup().location]",
"dependsOn": [
"[concat('Microsoft.ServiceFabric/clusters/', parameters('clusterName'), '/applicationTypes/', variables('applicationTypeNameFabricHealer'), '/versions/', parameters('applicationTypeVersionFabricHealer'))]"
],
"properties": {
"provisioningState": "Default",
"typeName": "[variables('applicationTypeNameFabricHealer')]",
"typeVersion": "[parameters('applicationTypeVersionFabricHealer')]",
"parameters": {
"EnableDiskRepair": "true"
},
"upgradePolicy": {
"upgradeReplicaSetCheckTimeout": "01:00:00.0",
"forceRestart": "false",
"rollingUpgradeMonitoringPolicy": {
"healthCheckWaitDuration": "00:02:00.0",
"healthCheckStableDuration": "00:05:00.0",
"healthCheckRetryTimeout": "00:10:00.0",
"upgradeTimeout": "01:00:00.0",
"upgradeDomainTimeout": "00:20:00.0"
},
"applicationHealthPolicy": {
"considerWarningAsError": "false",
"maxPercentUnhealthyDeployedApplications": "50",
"defaultServiceTypeHealthPolicy": {
"maxPercentUnhealthyServices": "50",
"maxPercentUnhealthyPartitionsPerService": "50",
"maxPercentUnhealthyReplicasPerPartition": "50"
}
}
}
}
},
{
"apiVersion": "[variables('sfrpApiVersion')]",
"type": "Microsoft.ServiceFabric/clusters/applications/services",
"name": "[concat(parameters('clusterName'), '/', variables('applicationNameFabricHealer'), '/', variables('serviceNameFabricHealer'))]",
"location": "[resourceGroup().location]",
"dependsOn": [
"[concat('Microsoft.ServiceFabric/clusters/', parameters('clusterName'), '/applications/', variables('applicationNameFabricHealer'))]"
],
"properties": {
"provisioningState": "Default",
"serviceKind": "Stateless",
"serviceTypeName": "[variables('serviceTypeNameFabricHealer')]",
"instanceCount": "-1",
"partitionDescription": {
"partitionScheme": "Singleton"
},
"correlationScheme": [],
"serviceLoadMetrics": [],
"servicePlacementPolicies": []
}
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"$schema": "https://schema.management.azure.com/schemas/2015-01-01/deploymentParameters.json#",
"contentVersion": "1.0.0.0",
"parameters": {
"clusterName": {
"value": "<YOUR-CLUSTER-RESOURCE-NAME>"
},
"applicationTypeVersionFabricHealer": {
"value": "1.1.0.831"
},
"packageUrlFabricHealer": {
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICHEALER-SFPKG>"
}
}
}
29 changes: 15 additions & 14 deletions Documentation/LogicWorkflows.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ Mitigate(AppName="fabric:/App1", MetricName="MemoryPercent") :- RestartCodePacka

Don't be alarmed if you don't understand how to read that repair action! We will go more in-depth later about the syntax and semantics of Guan. The takeaway is that expressing a Guan repair workflow doesn't require a deep knowledge of Prolog programming to get started. Hopefully this also gives you a general idea about the kinds of repair workflows we can express with GuanLogic.

Each repair policy has its own corresponding configuration file:
Each repair policy has its own corresponding configuration file located in the FabricHealer project's PackageRoot/Config/LogicRules folder:

| Repair Policy | Configuration File Name |
|---------------------------|------------------------------|
| AppRepairPolicy | AppRules.config.txt |
| DiskRepairPolicy | DiskRules.config.txt |
| FabricNodeRepairPolicy | FabricNodeRules.config.txt |
| ReplicaRepairPolicy | ReplicaRules.config.txt |
| SystemAppRepairPolicy | SystemAppRules.config.txt |
| VMRepairPolicy | VmRules.config.txt |
| AppRepairPolicy | AppRules.guan |
| DiskRepairPolicy | DiskRules.guan |
| FabricNodeRepairPolicy | FabricNodeRules.guan |
| MachineRepairPolicy | MachineRules.guan |
| ReplicaRepairPolicy | ReplicaRules.guan |
| SystemServiceRepairPolicy | SystemServiceRules.guan |


Now let's look at *how* to actually define a Guan logic repair workflow, so that you will have the knowledge necessary to express your own.
Expand Down Expand Up @@ -113,17 +113,18 @@ By default, for logic-based repair workflows, FH will execute a query which call

| Argument Name | Definition |
|---------------------------|----------------------------------------------------------------------------------------------|
| AppName | Name of the SF application, format is fabric:/SomeApp |
| ServiceName | Name of the SF service, format is fabric:/SomeApp/SomeService |
| AppName | Name of the SF application, format is "fabric:/SomeApp" (Quotes are required) |
| ServiceName | Name of the SF service, format is "fabric:/SomeApp/SomeService" (Quotes are required) |
| NodeName | Name of the node |
| NodeType | Type of node |
| ObserverName | Name of Observer that generated the event (if the data comes from FabricObserver service) |
| PartitionId | Id of the partition |
| ReplicaOrInstanceId | Id of the replica or instance |
| FOErrorCode | Error Code emitted by FO (e.g. "FO002") |
| MetricName | Name of the resource supplied by FO (e.g., CpuPercent or MemoryMB, etc.) |
| MetricValue | Corresponding Metric Value supplied by FO (e.g. "85" indicating 85% CPU usage) |
| SystemServiceProcessName | The name of a Fabric system service process supplied in FO health data |
| OS | The name of the OS from which the FO data was collected (Linux or Windows) |
| ErrorCode | Supported Error Code emitted by caller (e.g. "FO002") |
| MetricName | Name of the metric (e.g., CpuPercent or MemoryMB, etc.) |
| MetricValue | Corresponding Metric Value (e.g. "85" indicating 85% CPU usage) |
| OS | The name of the OS from which the data was collected (Linux or Windows) |
| HealthState | The HealthState of the target entity: Error or Warning |

For example if you wanted to use AppName and ServiceName in your repair workflow you would specify them like so:
```
Expand Down
30 changes: 16 additions & 14 deletions Documentation/OperationalTelemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,27 +37,29 @@ Here is a full example of exactly what is sent in one of these telemetry events,

```JSON
{
"EventName": "OperationalEvent",
"TaskName": "FabricHealer",
"EventRunInterval": "1.00:00:00",
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
"ClusterType": "SFRP",
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
"FHVersion": "1.0.0",
"UpTime": "00:00:00.1956784",
"Timestamp": "2021-12-11T03:12:47.0410613Z",
"OS": "Windows",
"EnabledRepairCount": 2,
"TotalRepairAttempts": 5,
"SuccessfulRepairs": 5,
"FailedRepairs": 0
"EventName": "OperationalEvent",
"TaskName": "FabricHealer",
"EventRunInterval": "1.00:00:00",
"SFRuntimeVersion": "8.2.1620.9590",
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
"ClusterType": "SFRP",
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
"FHVersion": "1.1.0.831",
"UpTime": "00:00:00.2164523",
"Timestamp": "2022-07-08T21:45:25.2443014Z",
"OS": "Windows",
"EnabledRepairCount": 1,
"TotalRepairAttempts": 0,
"SuccessfulRepairs": 0,
"FailedRepairs": 0
}
```

Let's take a look at the data and why we think it is useful to share with us. We'll go through each object property in the JSON above.
- **EventName** - this is the name of the telemetry event.
- **TaskName** - this specifies that the event is from FabricHealer.
- **EventRunInterval** - this is how often this telemetry is sent from a node in a cluster.
- **SFRuntimeVersion** - this is the Service Fabric runtime version installed on the machine.
- **ClusterId** - this is used to both uniquely identify a telemetry event and to correlate data that comes from a cluster.
- **ClusterType** - this is the type of cluster: Standalone or SFRP.
- **NodeNameHash** - this is a sha256 hash of the name of the Fabric node from where the data originates. It is used to correlate data from specific nodes in a cluster (the hashed node name will be known to be part of the cluster with a specific cluster id).
Expand Down
9 changes: 5 additions & 4 deletions FHTest/FHTest.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.9.4" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.3" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.3" />
<PackageReference Include="coverlet.collector" Version="3.0.3">
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.2.0" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.10" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.10" />
<PackageReference Include="coverlet.collector" Version="3.1.2">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
</PackageReference>
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\FabricHealerProxy\FabricHealerProxy.csproj" />
<ProjectReference Include="..\FabricHealer\FabricHealer.csproj" />
</ItemGroup>

Expand Down
Loading

0 comments on commit a93d872

Please sign in to comment.