Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ocrvs 7687 #1190

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions infrastructure/monitoring/filebeat/filebeat.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ filebeat.inputs:
enabled: true
paths:
- /var/log/opencrvs-restore.log
- type: log
enabled: true
paths:
- /var/log/opencrvs-backup.error.log

# https://github.com/elastic/beats/blob/master/filebeat/filebeat.reference.yml

Expand Down
2 changes: 1 addition & 1 deletion infrastructure/monitoring/kibana/config.ndjson
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"metrics.inventory_threshold.fired","params":{"documents":["{\"@timestamp\":\"2024-08-06T07:57:35.644Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":"metrics.alert.inventory.threshold","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2023-11-17T12:01:46.420Z","createdBy":"elastic","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-08-06T08:02:44.568Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.18"},"muteAll":false,"mutedInstanceIds":[],"name":"Available disk space in root file system","notifyWhen":"onActionGroupChange","params":{"alertOnNoData":true,"criteria":[{"comparator":">=","customMetric":{"aggregation":"max","field":"system.filesystem.used.pct","id":"alert-custom-metric","type":"custom"},"metric":"custom","threshold":[0.7],"timeSize":1,"timeUnit":"h","warningComparator":">=","warningThreshold":[0.5]}],"filterQuery":"{\"bool\":{\"should\":[{\"match_phrase\":{\"system.filesystem.mount_point\":\"/hostfs\"}}],\"minimum_should_match\":1}}","filterQueryText":"system.filesystem.mount_point : \"/hostfs\"","nodeType":"host","sourceId":"default"},"schedule":{"interval":"1h"},"scheduledTaskId":null,"tags":["infra","opencrvs-builtin"],"throttle":null,"updatedAt":"2024-08-06T08:01:56.542Z","updatedBy":"elastic"},"coreMigrationVersion":"7.17.18","id":"14778650-8541-11ee-9002-2f37fdc4e5d5","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1722931316554,643],"type":"alert","updated_at":"2024-08-06T08:01:56.554Z","version":"WzM5MywxXQ=="}
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"threshold_met","params":{"documents":["{\"@timestamp\":\"2022-04-18T07:05:33.819Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":"apm.error_rate","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2022-06-01T11:30:27.033Z","createdBy":"opencrvs-admin","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-02-07T08:28:08.400Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.0"},"muteAll":false,"mutedInstanceIds":[],"name":"Error in service","notifyWhen":"onActionGroupChange","params":{"environment":"ENVIRONMENT_ALL","threshold":1,"windowSize":1,"windowUnit":"m"},"schedule":{"interval":"1m"},"scheduledTaskId":null,"tags":[],"throttle":null,"updatedAt":"2024-02-05T03:00:20.633Z","updatedBy":"opencrvs-admin"},"coreMigrationVersion":"7.17.0","id":"3b6722e0-e19e-11ec-ba8e-51649755648d","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1707273006619,214975],"type":"alert","updated_at":"2024-02-07T02:30:06.619Z","version":"WzQ5MjAzNCwxOV0="}
{"attributes":{"buildNum":46534,"defaultIndex":"metricbeat-*"},"coreMigrationVersion":"7.17.0","id":"7.17.0","migrationVersion":{"config":"7.13.0"},"references":[],"sort":[1707273006619,216009],"type":"config","updated_at":"2024-02-07T02:30:06.619Z","version":"WzQ5MjQyOCwxOV0="}
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"logs.threshold.fired","params":{"documents":["{\"@timestamp\":\"2023-11-22T08:25:47.329Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":"logs.alert.document.count","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2023-11-22T08:32:38.272Z","createdBy":"elastic","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-02-07T08:28:08.400Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.0"},"muteAll":false,"mutedInstanceIds":[],"name":"Error in backup logs","notifyWhen":"onActionGroupChange","params":{"count":{"comparator":"more than or equals","value":1},"criteria":[{"comparator":"matches","field":"message","value":"error"},{"comparator":"equals","field":"log.file.path","value":"/var/log/opencrvs-backup.log"}],"timeSize":1,"timeUnit":"h"},"schedule":{"interval":"1h"},"scheduledTaskId":null,"tags":[],"throttle":null,"updatedAt":"2024-02-07T02:27:22.558Z","updatedBy":"elastic"},"coreMigrationVersion":"7.17.0","id":"b166fcb0-8911-11ee-8111-2f3be9e93efc","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1707294436464,232766],"type":"alert","updated_at":"2024-02-07T08:27:16.464Z","version":"WzQ5NTQzNCwxOV0="}
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"logs.threshold.fired","params":{"documents":["{\"@timestamp\":\"2023-11-22T08:25:47.329Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":"logs.alert.document.count","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2023-11-22T08:32:38.272Z","createdBy":"elastic","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-02-07T08:28:08.400Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.0"},"muteAll":false,"mutedInstanceIds":[],"name":"Error in backup logs","notifyWhen":"onActionGroupChange","params":{"count":{"comparator":"more than or equals","value":1},"criteria":[{"comparator":"equals","field":"log.file.path","value":"/var/log/opencrvs-backup.error.log"}],"timeSize":1,"timeUnit":"h"},"schedule":{"interval":"1h"},"scheduledTaskId":null,"tags":[],"throttle":null,"updatedAt":"2024-02-07T02:27:22.558Z","updatedBy":"elastic"},"coreMigrationVersion":"7.17.0","id":"b166fcb0-8911-11ee-8111-2f3be9e93efc","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1707294436464,232766],"type":"alert","updated_at":"2024-02-07T08:27:16.464Z","version":"WzQ5NTQzNCwxOV0="}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be 100% sure no errors are ending up in /var/log/opencrvs-backup.log anymore, we'd need to test the old error scenarios and verify the indeed end up in the right file. In the backup script, especially these can be a little dangerous and might not correctly output their error.

Question you can think about is:
How could we test the different error scenarios are now properly sending an alert and write errors to the opencrvs-backup.error.log?

As testing this can be quite time consuming, my suggestion is: let's also keep the old alert rule in place that tries to find the word "error" in the /var/log/opencrvs-backup.log. We can then at some point come back to this task and see what type of stuff each file contains and remove this "extra" error rule.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you end up using a different name for the "deprecated" (so the old) alert rule, please note that you need to add a new filter for Elastalert too

https://github.com/opencrvs/opencrvs-countryconfig/blob/242878c13e0ce335c096eed10c59f77be812f805/infrastructure/monitoring/elastalert/rules/log-alert.yaml#L24-L30

Elastalert is a service that

  1. Listens Elasticsearch indices for the alerts triggered
  2. Sends a HTTP request to our country config package's POST /email endpoint
  3. Country config then sends an email (which is in our environments configured to be a Slack email inbox)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually I like this idea to keep the old alert rule, that will ensure us about the correctness of our work without taking risk.

{"attributes":{"anomalyThreshold":50,"description":"","fields":{"container":"container.id","host":"host.name","message":["message","@message"],"pod":"kubernetes.pod.uid","tiebreaker":"_doc","timestamp":"@timestamp"},"inventoryDefaultView":"0","logColumns":[{"timestampColumn":{"id":"5e7f964a-be8a-40d8-88d2-fbcfbdca0e2f"}},{"fieldColumn":{"field":"event.dataset","id":" eb9777a8-fcd3-420e-ba7d-172fff6da7a2"}},{"messageColumn":{"id":"b645d6da-824b-4723-9a2a-e8cece1645c0"}}],"logIndices":{"indexName":"logs-*,filebeat-*,kibana_sample_data_logs*,logstash*,ecs-logstash*","type":"index_name"},"metricAlias":"metrics-*,metricbeat-*","metricsExplorerDefaultView":"0","name":"Default"},"coreMigrationVersion":"7.17.0","id":"default","migrationVersion":{"infrastructure-ui-source":"7.16.2"},"references":[],"sort":[1707273006619,217714],"type":"infrastructure-ui-source","updated_at":"2024-02-07T02:30:06.619Z","version":"WzQ5MzAyNywxOV0="}
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"query matched","params":{"documents":["{\"@timestamp\":\"2023-11-20T10:19:30.521Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":".es-query","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2023-11-20T09:12:19.237Z","createdBy":"elastic","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-02-07T08:28:08.400Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.0"},"muteAll":false,"mutedInstanceIds":[],"name":"Successful SSH login","notifyWhen":"onActionGroupChange","params":{"esQuery":"{ \"query\": { \"bool\": { \"must\": [ \n { \"term\": { \"log.file.path\": \"/var/log/auth.log\" } },\n { \"term\": { \"event.outcome\": \"success\" }}\n ] } } }","index":["filebeat-*"],"size":100,"threshold":[1],"thresholdComparator":">=","timeField":"@timestamp","timeWindowSize":1,"timeWindowUnit":"m"},"schedule":{"interval":"1m"},"scheduledTaskId":null,"tags":[],"throttle":null,"updatedAt":"2024-02-07T02:27:19.537Z","updatedBy":"elastic"},"coreMigrationVersion":"7.17.0","id":"e79aaa90-8784-11ee-b9ba-89bbe73df7ff","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1707294457367,232778],"type":"alert","updated_at":"2024-02-07T08:27:37.367Z","version":"WzQ5NTQ0MCwxOV0="}
{"attributes":{"actions":[{"actionRef":"preconfigured:preconfigured-alert-history-es-index","actionTypeId":".index","group":"metrics.inventory_threshold.fired","params":{"documents":["{\"@timestamp\":\"2022-06-20T06:16:33.414Z\",\"tags\":\"{{rule.tags}}\",\"rule\":{\"id\":\"{{rule.id}}\",\"name\":\"{{rule.name}}\",\"params\":{\"{{rule__type}}\":\"{{params}}\"},\"space\":\"{{rule.spaceId}}\",\"type\":\"{{rule.type}}\"},\"kibana\":{\"alert\":{\"id\":\"{{alert.id}}\",\"context\":{\"{{rule__type}}\":\"{{context}}\"},\"actionGroup\":\"{{alert.actionGroup}}\",\"actionGroupName\":\"{{alert.actionGroupName}}\"}},\"event\":{\"kind\":\"alert\"}}"]}}],"alertTypeId":"metrics.alert.inventory.threshold","apiKey":null,"apiKeyOwner":null,"consumer":"alerts","createdAt":"2022-05-31T10:10:47.084Z","createdBy":"opencrvs-admin","enabled":false,"executionStatus":{"error":null,"lastExecutionDate":"2024-02-07T08:28:08.400Z","status":"pending"},"legacyId":null,"meta":{"versionApiKeyLastmodified":"7.17.0"},"muteAll":false,"mutedInstanceIds":[],"name":"CPU under heavy load","notifyWhen":"onActionGroupChange","params":{"criteria":[{"comparator":">","customMetric":{"aggregation":"avg","field":"","id":"alert-custom-metric","type":"custom"},"metric":"cpu","threshold":[70],"timeSize":1,"timeUnit":"m"}],"nodeType":"host","sourceId":"default"},"schedule":{"interval":"1m"},"scheduledTaskId":null,"tags":["infra","opencrvs-builtin"],"throttle":null,"updatedAt":"2024-02-07T02:27:30.573Z","updatedBy":"elastic"},"coreMigrationVersion":"7.17.0","id":"f022bee0-e0c9-11ec-99b8-dbfd54551fda","migrationVersion":{"alert":"7.16.0"},"references":[],"sort":[1707294457362,232774],"type":"alert","updated_at":"2024-02-07T08:27:37.362Z","version":"WzQ5NTQzOCwxOV0="}
Expand Down
8 changes: 8 additions & 0 deletions infrastructure/server-setup/tasks/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@
state: touch
mode: 'u+rwX,g+rwX,o-rwx'

- name: Create backup logfile for errors
ansible.builtin.file:
path: /var/log/opencrvs-backup.error.log
owner: '{{ ansible_user }}'
group: 'application'
state: touch
mode: 'u+rwX,g+rwX,o-rwx'

- name: Create restore logfile
ansible.builtin.file:
path: /var/log/opencrvs-restore.log
Expand Down
2 changes: 1 addition & 1 deletion infrastructure/server-setup/tasks/backups/crontab.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
name: 'backup opencrvs'
minute: '0'
hour: '0'
job: 'bash {{ crontab_user_home }}/backup.sh --passphrase={{ backup_encryption_passphrase }} --ssh_user={{ backup_server_user }} --ssh_host={{ backup_hostname }} --ssh_port={{ backup_port }} --remote_dir={{ backup_server_remote_target_directory }} --replicas=1 >> /var/log/opencrvs-backup.log 2>&1'
job: 'bash {{ crontab_user_home }}/backup.sh --passphrase={{ backup_encryption_passphrase }} --ssh_user={{ backup_server_user }} --ssh_host={{ backup_hostname }} --ssh_port={{ backup_port }} --remote_dir={{ backup_server_remote_target_directory }} --replicas=1 >> /var/log/opencrvs-backup.log 2>> /var/log/opencrvs-backup.error.log'
state: "{{ 'present' if (backup_hostname is defined and backup_encryption_passphrase and (enable_backups | default(false))) else 'absent' }}"

##
Expand Down
Loading