Dead man's switch via healthchecks.io integration (#223) + new monitoring documentation.

This commit is contained in:
Dan Helfman 2019-10-15 10:49:14 -07:00
parent b1941bcce9
commit 128ebf04ce
13 changed files with 255 additions and 88 deletions

8
NEWS
View file

@ -1,3 +1,11 @@
1.3.25
* #223: Dead man's switch to detect when backups start failing silently, implemented via
healthchecks.io hook integration. See the documentation for more information:
https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook
* Documentation on monitoring and alerting options for borgmatic backups:
https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/
* Automatically rewrite links when developing on documentation locally.
1.3.24
* #86: Add "borgmatic list --successful" flag to only list successful (non-checkpoint) archives.
* Add a suggestion form to all documentation pages, so users can submit ideas for improving the

View file

@ -63,6 +63,7 @@ href="https://asciinema.org/a/203761" target="_blank">screencast</a>.
* [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups/)
* [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/)
* [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups/)
* [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/)
* [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/)
* [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups/)
* [Upgrade borgmatic](https://torsion.org/borgmatic/docs/how-to/upgrade/)
@ -116,8 +117,3 @@ your thing. In general, contributions are very welcome. We don't bite!
Also, please check out the [borgmatic development
how-to](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic/) for
info on cloning source code, running tests, etc.
<script>
var links = document.getElementsByClassName("referral");
links[Math.floor(Math.random() * links.length)].style.display = "none";
</script>

View file

@ -60,6 +60,9 @@ def run_configuration(config_filename, config, arguments):
'pre-backup',
global_arguments.dry_run,
)
hook.ping_healthchecks(
hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'start'
)
except (OSError, CalledProcessError) as error:
encountered_error = error
yield from make_error_log_records(
@ -95,6 +98,9 @@ def run_configuration(config_filename, config, arguments):
'post-backup',
global_arguments.dry_run,
)
hook.ping_healthchecks(
hooks.get('healthchecks'), config_filename, global_arguments.dry_run
)
except (OSError, CalledProcessError) as error:
encountered_error = error
yield from make_error_log_records(
@ -113,6 +119,9 @@ def run_configuration(config_filename, config, arguments):
error=encountered_error,
output=getattr(encountered_error, 'output', ''),
)
hook.ping_healthchecks(
hooks.get('healthchecks'), config_filename, global_arguments.dry_run, 'fail'
)
except (OSError, CalledProcessError) as error:
yield from make_error_log_records(
'{}: Error running on-error hook'.format(config_filename), error

View file

@ -337,8 +337,8 @@ map:
example: false
hooks:
desc: |
Shell commands or scripts to execute at various points during a borgmatic run.
IMPORTANT: All provided commands and scripts are executed with user permissions of
Shell commands, scripts, or integrations to execute at various points during a borgmatic
run. IMPORTANT: All provided commands and scripts are executed with user permissions of
borgmatic. Do not forget to set secure permissions on this configuration file (chmod
0600) as well as on any script called from a hook (chmod 0700) to prevent potential
shell injection or privilege escalation.
@ -363,10 +363,17 @@ map:
seq:
- type: str
desc: |
List of one or more shell commands or scripts to execute when an exception occurs
during a backup or when running a before_backup or after_backup hook.
List of one or more shell commands or scripts to execute when an exception
occurs during a backup or when running a before_backup or after_backup hook.
example:
- echo "Error while creating a backup or running a backup hook."
healthchecks:
type: str
desc: |
Healthchecks ping URL or UUID to notify when a backup begins, ends, or errors.
Create an account at https://healthchecks.io if you'd like to use this service.
example:
https://hc-ping.com/your-uuid-here
before_everything:
seq:
- type: str

View file

@ -1,6 +1,8 @@
import logging
import os
import requests
from borgmatic import execute
logger = logging.getLogger(__name__)
@ -69,3 +71,34 @@ def execute_hook(commands, umask, config_filename, description, dry_run, **conte
finally:
if original_umask:
os.umask(original_umask)
def ping_healthchecks(ping_url_or_uuid, config_filename, dry_run, append=None):
'''
Ping the given healthchecks.io URL or UUID, appending the append string if any. Use the given
configuration filename in any log entries. If this is a dry run, then don't actually ping
anything.
'''
if not ping_url_or_uuid:
logger.debug('{}: No healthchecks hook set'.format(config_filename))
return
ping_url = (
ping_url_or_uuid
if ping_url_or_uuid.startswith('http')
else 'https://hc-ping.com/{}'.format(ping_url_or_uuid)
)
dry_run_label = ' (dry run; not actually pinging)' if dry_run else ''
if append:
ping_url = '{}/{}'.format(ping_url, append)
logger.info(
'{}: Pinging healthchecks.io{}{}'.format(
config_filename, ' ' + append if append else '', dry_run_label
)
)
logger.debug('{}: Using healthchecks.io ping URL {}'.format(config_filename, ping_url))
logging.getLogger('urllib3').setLevel(logging.ERROR)
requests.get(ping_url)

View file

@ -48,8 +48,8 @@ a backup or a backup hook, but not if an error occurs during a
`before_everything` hook.
borgmatic also runs `on_error` hooks if an error occurs, either when creating
a backup or running a backup hook. See the [error alerting
documentation](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
a backup or running a backup hook. See the [monitoring and alerting
documentation](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
for more information.
## Hook output
@ -73,3 +73,4 @@ invoked by hooks.
* [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
* [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md)
* [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
* [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)

View file

@ -22,7 +22,7 @@ borgmatic --verbosity 2
## Backup summary
If you're less concerned with progress during a backup, and you just want to
If you're less concerned with progress during a backup, and you only want to
see the summary of archive statistics at the end, you can use the stats
option when performing a backup:
@ -83,78 +83,10 @@ Note that the [sample borgmatic systemd service
file](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#systemd)
already has this rate limit disabled.
## Error alerting
When an error occurs during a backup, borgmatic can run configurable shell
commands to fire off custom error notifications or take other actions, so you
can get alerted as soon as something goes wrong. Here's a not-so-useful
example:
```yaml
hooks:
on_error:
- echo "Error while creating a backup or running a backup hook."
```
The `on_error` hook supports interpolating particular runtime variables into
the hook command. Here's an example that assumes you provide a separate shell
script to handle the alerting:
```yaml
hooks:
on_error:
- send-text-message.sh "{configuration_filename}" "{repository}"
```
In this example, when the error occurs, borgmatic interpolates a few runtime
values into the hook command: the borgmatic configuration filename, and the
path of the repository. Here's the full set of supported variables you can use
here:
* `configuration_filename`: borgmatic configuration filename in which the
error occurred
* `repository`: path of the repository in which the error occurred (may be
blank if the error occurs in a hook)
* `error`: the error message itself
* `output`: output of the command that failed (may be blank if an error
occurred without running a command)
Note that borgmatic does not run `on_error` hooks if an error occurs within a
`before_everything` or `after_everything` hook. For more about hooks, see the
[borgmatic hooks
documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md),
especially the security information.
## Scripting borgmatic
To consume the output of borgmatic in other software, you can include an
optional `--json` flag with `create`, `list`, or `info` to get the output
formatted as JSON.
Note that when you specify the `--json` flag, Borg's other non-JSON output is
suppressed so as not to interfere with the captured JSON. Also note that JSON
output only shows up at the console, and not in syslog.
### Successful backups
`borgmatic list` includes support for a `--successful` flag that only lists
successful (non-checkpoint) backups. Combined with a built-in Borg flag like
`--last`, you can list the last successful backup for use in your monitoring
scripts. Here's an example combined with `--json`:
```bash
borgmatic list --successful --last 1 --json
```
Note that this particular combination will only work if you've got a single
backup "series" in your repository. If you're instead backing up, say, from
multiple different hosts into a single repository, then you'll need to get
fancier with your archive listing. See `borg list --help` for more flags.
## Related documentation
* [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
* [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
* [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md)
* [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md)

View file

@ -0,0 +1,152 @@
---
title: How to monitor your backups
---
## Monitoring and alerting
Having backups is great, but they won't do you a lot of good unless you have
confidence that they're running on a regular basis. That's where monitoring
and alerting comes in.
There are several different ways you can monitor your backups and find out
whether they're succeeding. Which of these you choose to do is up to you and
your particular infrastructure:
1. **Job runner alerts**: The easiest place to start is with failure alerts
from the [scheduled job
runner](https://torsion.org/borgmatic/docs/how-to/set-up-backups/#autopilot) (cron,
systemd, etc.) that's running borgmatic. But note that if the job doesn't even
get scheduled (e.g. due to the job runner not running), you probably won't get
an alert at all! Still, this is a decent first line of defense, especially
when combined with some of the other approaches below.
2. **borgmatic error hooks**: The `on_error` hook allows you to run an arbitrary
command or script when borgmatic itself encounters an error running your
backups. So for instance, you can run a script to send yourself a text message
alert. But note that if borgmatic doesn't actually run, this alert won't fire.
See [error
hooks](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#error-hooks)
below for how to configure this.
4. **borgmatic Healthchecks hook**: This feature integrates with the
[Healthchecks](https://healthchecks.io/) service, and pings Healthchecks
whenever borgmatic runs. That way, Healthchecks can alert you when something
goes wrong or it doesn't hear from borgmatic for a configured interval. (See
[Healthchecks
hook](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#healthchecks-hook)
below for how to configure this.)
3. **Third-party monitoring software**: You can use traditional monitoring
software to consume borgmatic JSON output and track when the last
successful backup occurred. See [scripting
borgmatic](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups/#scripting-borgmatic)
below for how to configure this.
5. **Borg hosting providers**: Most [Borg hosting
providers](https://torsion.org/borgmatic/#hosting-providers) include
monitoring and alerting as part of their offering. This gives you a dashboard
to check on all of your backups, and can alert you if the service doesn't hear
from borgmatic for a configured interval.
6. **borgmatic consistency checks**: While not strictly part of monitoring, if you
really want confidence that your backups are not only running but are
restorable as well, you can configure particular [consistency
checks](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups/#consistency-check-configuration)
or even script full [restore
tests](https://torsion.org/borgmatic/docs/how-to/restore-a-backup/).
## Error hooks
When an error occurs during a backup, borgmatic can run configurable shell
commands to fire off custom error notifications or take other actions, so you
can get alerted as soon as something goes wrong. Here's a not-so-useful
example:
```yaml
hooks:
on_error:
- echo "Error while creating a backup or running a backup hook."
```
The `on_error` hook supports interpolating particular runtime variables into
the hook command. Here's an example that assumes you provide a separate shell
script to handle the alerting:
```yaml
hooks:
on_error:
- send-text-message.sh "{configuration_filename}" "{repository}"
```
In this example, when the error occurs, borgmatic interpolates a few runtime
values into the hook command: the borgmatic configuration filename, and the
path of the repository. Here's the full set of supported variables you can use
here:
* `configuration_filename`: borgmatic configuration filename in which the
error occurred
* `repository`: path of the repository in which the error occurred (may be
blank if the error occurs in a hook)
* `error`: the error message itself
* `output`: output of the command that failed (may be blank if an error
occurred without running a command)
Note that borgmatic does not run `on_error` hooks if an error occurs within a
`before_everything` or `after_everything` hook. For more about hooks, see the
[borgmatic hooks
documentation](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md),
especially the security information.
## Healthchecks hook
[Healthchecks](https://healthchecks.io/) is a service that provides "instant
alerts when your cron jobs fail silently", and borgmatic has built-in
integration with it. Once you create a Healthchecks account and project on
their site, all you need to do is configure borgmatic with the unique "Ping
URL" for your project. Here's an example:
```yaml
hooks:
healthchecks: https://hc-ping.com/addffa72-da17-40ae-be9c-ff591afb942a
```
With this hook in place, borgmatic will ping your Healthchecks project when a
backup begins, ends, or errors. Then you can configure Healthchecks to notify
you by a [variety of
mechanisms](https://healthchecks.io/#welcome-integrations) when backups fail
or it doesn't hear from borgmatic for a certain period of time.
## Scripting borgmatic
To consume the output of borgmatic in other software, you can include an
optional `--json` flag with `create`, `list`, or `info` to get the output
formatted as JSON.
Note that when you specify the `--json` flag, Borg's other non-JSON output is
suppressed so as not to interfere with the captured JSON. Also note that JSON
output only shows up at the console, and not in syslog.
### Successful backups
`borgmatic list` includes support for a `--successful` flag that only lists
successful (non-checkpoint) backups. Combined with a built-in Borg flag like
`--last`, you can list the last successful backup for use in your monitoring
scripts. Here's an example combined with `--json`:
```bash
borgmatic list --successful --last 1 --json
```
Note that this particular combination will only work if you've got a single
backup "series" in your repository. If you're instead backing up, say, from
multiple different hosts into a single repository, then you'll need to get
fancier with your archive listing. See `borg list --help` for more flags.
## Related documentation
* [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
* [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
* [Add preparation and cleanup steps to backups](https://torsion.org/borgmatic/docs/how-to/add-preparation-and-cleanup-steps-to-backups.md)
* [Restore a backup](https://torsion.org/borgmatic/docs/how-to/restore-a-backup.md)
* [Develop on borgmatic](https://torsion.org/borgmatic/docs/how-to/develop-on-borgmatic.md)

View file

@ -65,3 +65,4 @@ Like a whole-archive restore, this also restores into the current directory.
* [Set up backups with borgmatic](https://torsion.org/borgmatic/docs/how-to/set-up-backups.md)
* [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
* [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)

View file

@ -228,7 +228,7 @@ found character that cannot start any token
in "config.yaml", line 230, column 1
```
YAML does not allow tabs. So to fix this, simply replace any tabs in your
YAML does not allow tabs. So to fix this, replace any tabs in your
configuration file with the requisite number of spaces.
### libyaml compilation errors
@ -247,10 +247,6 @@ it.
* [Make per-application backups](https://torsion.org/borgmatic/docs/how-to/make-per-application-backups.md)
* [Deal with very large backups](https://torsion.org/borgmatic/docs/how-to/deal-with-very-large-backups.md)
* [Inspect your backups](https://torsion.org/borgmatic/docs/how-to/inspect-your-backups.md)
* [Monitor your backups](https://torsion.org/borgmatic/docs/how-to/monitor-your-backups.md)
* [borgmatic configuration reference](https://torsion.org/borgmatic/docs/reference/configuration.md)
* [borgmatic command-line reference](https://torsion.org/borgmatic/docs/reference/command-line.md)
<script>
var links = document.getElementsByClassName("referral");
links[Math.floor(Math.random() * links.length)].style.display = "none";
</script>

View file

@ -1,6 +1,6 @@
from setuptools import find_packages, setup
VERSION = '1.3.24'
VERSION = '1.3.25'
setup(
@ -31,6 +31,7 @@ setup(
obsoletes=['atticmatic'],
install_requires=(
'pykwalify>=1.6.0,<14.06',
'requests',
'ruamel.yaml>0.15.0,<0.17.0',
'setuptools',
'colorama>=0.4.1,<0.5',

View file

@ -20,5 +20,6 @@ pytest==5.1.2
pytest-cov==2.7.1
python-dateutil==2.8.0
PyYAML==5.1.2
requests==2.22.0
ruamel.yaml>0.15.0,<0.17.0
toml==0.10.0

View file

@ -79,3 +79,33 @@ def test_execute_hook_on_error_logs_as_error():
).once()
module.execute_hook([':'], None, 'config.yaml', 'on-error', dry_run=False)
def test_ping_healthchecks_hits_ping_url():
ping_url = 'https://example.com'
flexmock(module.requests).should_receive('get').with_args(ping_url)
module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False)
def test_ping_healthchecks_without_ping_url_does_not_raise():
flexmock(module.requests).should_receive('get').never()
module.ping_healthchecks(ping_url_or_uuid=None, config_filename='config.yaml', dry_run=False)
def test_ping_healthchecks_with_ping_uuid_hits_corresponding_url():
ping_uuid = 'abcd-efgh-ijkl-mnop'
flexmock(module.requests).should_receive('get').with_args(
'https://hc-ping.com/{}'.format(ping_uuid)
)
module.ping_healthchecks(ping_uuid, 'config.yaml', dry_run=False)
def test_ping_healthchecks_hits_ping_url_with_append():
ping_url = 'https://example.com'
append = 'failed-so-hard'
flexmock(module.requests).should_receive('get').with_args('{}/{}'.format(ping_url, append))
module.ping_healthchecks(ping_url, 'config.yaml', dry_run=False, append=append)