Skip to content

Commit

Permalink
Cleanup script to search and remove rageshakes from applications base…
Browse files Browse the repository at this point in the history
…d on a time (#61)

Cleanup script to search and remove applications that we do not want.

Co-authored-by: Richard van der Hoff <[email protected]>
  • Loading branch information
michaelkaye and richvdh authored Jan 16, 2023
1 parent 0f14dc8 commit 3f03f64
Show file tree
Hide file tree
Showing 3 changed files with 259 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/.idea
/bin
/bugs
/pkg
Expand Down
1 change: 1 addition & 0 deletions changelog.d/61.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Add a zero-dependency python script to cleanup old rageshakes.
257 changes: 257 additions & 0 deletions scripts/cleanup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
#!/usr/bin/env python3
import argparse
import glob
import gzip
import os
import sys
from datetime import datetime, timedelta
from typing import Dict, Iterable, List, Set


# Cleanup for rageshake server output files
#
# Example usage:
#
# ./cleanup.py --dry-run --path /home/rageshakes/store --max-days 100 element-auto-uisi:90
#
# No dependencies required beyond a modern python3.


class Cleanup:
    """
    Cleanup a rageshake bug repository.

    Once created, call cleanup() to begin the actual operation. The statistics
    attributes (deleted, checked, disk_saved, excluded_count_by_user) are
    available after cleanup completes.
    """

    def __init__(
        self,
        limits: Dict[str, int],
        days_to_check: Iterable[int],
        dry_run: bool,
        root_path: str,
        mxids_to_exclude: List[str],
    ):
        """
        Set options for a cleanup run of a rageshake repository.

        @param limits: Map of app name to integer number of days that application's rageshakes should be retained
        @param days_to_check: List of ints each representing "days ago" that should be checked for rageshakes to delete
        @param dry_run: If set, perform all actions but do not complete deletion of files
        @param root_path: Base path to rageshake bug repository
        @param mxids_to_exclude: Rageshakes sent by this list of mxids should always be preserved.
        """
        self._limits = limits
        self._days_to_check = days_to_check
        self._dry_run = dry_run
        self._root_path = root_path
        self._mxids_to_exclude = mxids_to_exclude
        # Count of files we deleted or would delete (dry-run)
        self.deleted = 0
        # Count of files we checked
        self.checked = 0
        # Sum of bytes in files we deleted or would delete (dry-run)
        self.disk_saved = 0
        # History of how many times a given mxid saved a file.
        self.excluded_count_by_user = {mxid: 0 for mxid in mxids_to_exclude}

    def cleanup(self) -> None:
        """
        Check for rageshakes to remove according to settings.

        Do not run multiple times as statistics are generated internally during each call.
        """
        today = datetime.today()
        for days_ago in self._days_to_check:
            target = today - timedelta(days=days_ago)
            # Rageshakes are stored in one folder per day, named YYYY-MM-DD.
            folder_name = target.strftime("%Y-%m-%d")
            # An application is eligible on this day only if the day is
            # strictly older than the application's retention limit.
            applications = {
                name for name, limit in self._limits.items() if limit < days_ago
            }
            self._check_date(os.path.join(self._root_path, folder_name), applications)

    def _check_date(self, folder_name: str, applications_to_delete: Set[str]) -> None:
        """
        Check all rageshakes on a given date (folder).

        @param folder_name: Path of the per-day folder to scan
        @param applications_to_delete: Names of applications whose rageshakes in this folder should be removed
        """
        if not applications_to_delete:
            print(f"W Not checking {folder_name}, no applications would be removed")
            return

        # isdir (rather than exists) so that a stray plain file at this path
        # is reported here instead of crashing os.scandir below.
        if not os.path.isdir(folder_name):
            print(f"W Not checking {folder_name}, not present or not a directory")
            return

        checked = 0
        deleted = 0
        with os.scandir(folder_name) as rageshakes:
            for rageshake in rageshakes:
                # NB: os.path.join, not os.pathsep. pathsep is the separator
                # for PATH-style lists (":" on POSIX), not for directories.
                rageshake_path = os.path.join(folder_name, rageshake.name)
                if rageshake.is_dir():
                    checked += 1
                    if self._check_rageshake(rageshake_path, applications_to_delete):
                        deleted += 1
                else:
                    print(
                        f"W File in rageshake tree {rageshake_path} is not a directory"
                    )

        print(
            f"I Checked {folder_name} for {applications_to_delete}, "
            f"{'would delete' if self._dry_run else 'deleted'} {deleted}/{checked} rageshakes"
        )

        self.deleted += deleted
        self.checked += checked
        # optionally delete folder if we deleted 100% of rageshakes, but for now it's fine.

    def _check_rageshake(
        self, rageshake_folder_path: str, applications_to_delete: Set[str]
    ) -> bool:
        """
        Checks a given rageshake folder against the application and userid lists.

        If the folder matches, and dryrun mode is disabled, the folder is deleted.
        Folders whose details file is missing or unreadable are left untouched.

        @param rageshake_folder_path: Path of the individual rageshake folder
        @param applications_to_delete: Names of applications whose rageshakes should be removed
        @returns: True if the rageshake matched, False if it was skipped.
        """
        app_name = None
        mxid = None

        details_path = os.path.join(rageshake_folder_path, "details.log.gz")
        try:
            with gzip.open(details_path) as details:
                # Stream line by line; only the "Application" and "user_id"
                # headers are of interest.
                for raw_line in details:
                    key, _, value = raw_line.decode("utf-8").partition(":")
                    if key == "Application":
                        app_name = value.strip()
                    elif key == "user_id":
                        mxid = value.strip()
        except FileNotFoundError as e:
            print(
                f"W Unable to open {e.filename} to check for application name. Ignoring this folder."
            )
            return False
        except (OSError, EOFError, UnicodeDecodeError) as e:
            # A corrupt or truncated upload must not abort the whole run;
            # skipping is the safe choice because nothing gets deleted.
            print(f"W Unable to read {details_path}: {e}. Ignoring this folder.")
            return False

        if app_name in applications_to_delete:
            if mxid in self._mxids_to_exclude:
                # Preserved on behalf of an excluded user; record it.
                self.excluded_count_by_user[mxid] += 1
            else:
                self._delete(rageshake_folder_path)
                return True

        return False

    def _delete(self, rageshake_folder_path: str) -> None:
        """
        Delete a given rageshake folder, unless dryrun mode is enabled.

        The size of every file is added to disk_saved in both modes.

        @param rageshake_folder_path: Path of the rageshake folder to remove
        """
        # os.listdir (unlike glob "*") also returns dotfiles, which would
        # otherwise be left behind and make os.rmdir fail.
        for name in sorted(os.listdir(rageshake_folder_path)):
            file = os.path.join(rageshake_folder_path, name)
            self.disk_saved += os.stat(file).st_size
            if self._dry_run:
                print(f"I would delete {file}")
            else:
                print(f"I deleting {file}")
                os.unlink(file)

        if self._dry_run:
            print(f"I would remove directory {rageshake_folder_path}")
        else:
            print(f"I removing directory {rageshake_folder_path}")
            os.rmdir(rageshake_folder_path)


def main():
    """
    Command-line entry point: parse arguments, run a Cleanup and print statistics.

    Exits with status 1 (after printing an "E ..." message to stderr) if a
    LIMIT argument, the --days-to-check list, or the --exclude-mxids-file
    contents cannot be parsed.
    """
    parser = argparse.ArgumentParser(description="Cleanup rageshake files on disk")
    parser.add_argument(
        "limits",
        metavar="LIMIT",
        type=str,
        nargs="+",
        help="application_name retention limits in days (each formatted app-name:10)",
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        "--max-days",
        dest="max_days",
        type=int,
        help="Search all days until this maximum",
    )
    group.add_argument(
        "--days-to-check",
        dest="days_to_check",
        type=str,
        help="Explicitly supply days in the past to check for deletion, eg '1,2,3,5'",
    )
    parser.add_argument(
        "--exclude-mxids-file",
        dest="exclude_mxids_file",
        type=str,
        help="Supply a text file containing one mxid per line to exclude from cleanup. Blank lines and lines starting # are ignored.",
    )
    parser.add_argument(
        "--dry-run", dest="dry_run", action="store_true", help="Dry run (do not delete)"
    )
    parser.add_argument(
        "--path",
        dest="path",
        type=str,
        required=True,
        help="Root path of rageshakes (eg /home/rageshakes/bugs/)",
    )

    args = parser.parse_args()

    # Each LIMIT is "app-name:days"; split on the last colon so that
    # application names may themselves contain colons.
    application_limits: Dict[str, int] = {}
    for limit_spec in args.limits:
        app_name, separator, raw_limit = limit_spec.rpartition(":")
        try:
            if not separator:
                raise ValueError("missing :")
            limit = int(raw_limit)
        except ValueError as e:
            print(f"E Malformed --limits argument: {e}", file=sys.stderr)
            sys.exit(1)

        application_limits[app_name] = limit

    days_to_check: Iterable[int] = []
    if args.max_days is not None:
        days_to_check = range(args.max_days)
    if args.days_to_check:
        # Parse eagerly so that a bad value is reported as an argument error
        # up front, rather than as a traceback part-way through the cleanup.
        try:
            days_to_check = [int(day) for day in args.days_to_check.split(",")]
        except ValueError as e:
            print(f"E Malformed --days-to-check argument: {e}", file=sys.stderr)
            sys.exit(1)

    mxids_to_exclude = []
    if args.exclude_mxids_file:
        with open(args.exclude_mxids_file, encoding="utf-8") as file:
            for lineno, data in enumerate(file, start=1):
                data = data.strip()
                if not data or data.startswith("#"):
                    # blank line or comment, ignore
                    continue
                if data.startswith("@"):
                    # mxid
                    mxids_to_exclude.append(data)
                else:
                    print(
                        f"E Unable to parse --exclude-mxids-file on line {lineno}: {data}",
                        file=sys.stderr,
                    )
                    sys.exit(1)

    cleanup = Cleanup(
        application_limits, days_to_check, args.dry_run, args.path, mxids_to_exclude
    )

    cleanup.cleanup()
    print(
        f"I Deleted {cleanup.deleted} of {cleanup.checked} rageshakes, "
        f"saving {cleanup.disk_saved} bytes. Dry run? {args.dry_run}"
    )
    print(f"I excluded count by user {cleanup.excluded_count_by_user}")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()

0 comments on commit 3f03f64

Please sign in to comment.