From 16cf916a040e344ab400bc995174c928a94b4787 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:39:20 -0400 Subject: [PATCH 01/24] Add script for disks that require user-supplied tsk options --- process_with_tsk_options.py | 473 ++++++++++++++++++++++++++++++++++++ 1 file changed, 473 insertions(+) create mode 100644 process_with_tsk_options.py diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py new file mode 100644 index 0000000..96b330a --- /dev/null +++ b/process_with_tsk_options.py @@ -0,0 +1,473 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Creates a SIP from any single disk image using options for +tsk_recover provided by the user. + +Will only work for disk images containing a file system +able to be parsed by TSK. + +Python 3 + +MIT License +(c) Tim Walsh 2017 +http://bitarchivist.net +""" + +import argparse +import csv +import datetime +import itertools +import math +import os +import shutil +import subprocess +import sys +import time + +def convert_size(size): + # convert size to human-readable form + if (size == 0): + return '0 bytes' + size_name = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") + i = int(math.floor(math.log(size,1024))) + p = math.pow(1024,i) + s = round(size/p) + s = str(s) + s = s.replace('.0', '') + return '%s %s' % (s,size_name[i]) + +def time_to_int(str_time): + dt = time.mktime(datetime.datetime.strptime(str_time, + "%Y-%m-%dT%H:%M:%S").timetuple()) + return dt + +def create_spreadsheet(files_only, exportall, sip_dir): + # process each SIP + current = os.path.abspath(sip_dir) + # test if entry if directory + if os.path.isdir(current): + + # intialize values + number_files = 0 + total_bytes = 0 + mtimes = [] + ctimes = [] + crtimes = [] + + # parse dfxml file + if args.bagfiles == True: + dfxml_file = os.path.abspath(os.path.join(current, + 'data', 'metadata', 'submissionDocumentation', 'dfxml.xml')) + else: + dfxml_file = os.path.abspath(os.path.join(current, + 'metadata', 
'submissionDocumentation', 'dfxml.xml')) + + # try to read DFXML file + try: + # gather info for each FileObject + for (event, obj) in Objects.iterparse(dfxml_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): + continue + + # skip directories and links + if obj.name_type: + if obj.name_type != "r": + continue + + # skip unallocated if args.exportall is False + if exportall == False: + if obj.unalloc: + if obj.unalloc == 1: + continue + + # gather info + number_files += 1 + + try: + mtime = obj.mtime + mtime = str(mtime) + mtimes.append(mtime) + except: + pass + + try: + ctime = obj.ctime + ctime = str(ctime) + ctimes.append(ctime) + except: + pass + + try: + crtime = obj.crtime + crtime = str(crtime) + crtimes.append(crtime) + except: + pass + + total_bytes += obj.filesize + + # filter 'None' values from date lists + for date_list in mtimes, ctimes, crtimes: + while 'None' in date_list: + date_list.remove('None') + + + # build extent statement + size_readable = convert_size(total_bytes) + if number_files == 1: + extent = "1 digital file (%s)" % size_readable + elif number_files == 0: + extent = "EMPTY" + else: + extent = "%d digital files (%s)" % (number_files, size_readable) + + # determine earliest and latest MAC dates from lists + date_earliest_m = "" + date_latest_m = "" + date_earliest_c = "" + date_latest_c = "" + date_earliest_cr = "" + date_latest_cr = "" + date_statement = "" + + if mtimes: + date_earliest_m = min(mtimes) + date_latest_m = max(mtimes) + if ctimes: + date_earliest_c = min(ctimes) + date_latest_c = max(ctimes) + if crtimes: + date_earliest_cr = min(crtimes) + date_latest_cr = max(crtimes) + + # determine which set of dates to use (logic: use set with earliest start date) + use_ctimes = False + use_crtimes = False + + if not date_earliest_m: + date_earliest_m = "N/A" + date_latest_m = "N/A" + date_to_use = date_earliest_m # default to date modified + + if date_earliest_c: + if date_earliest_c < date_to_use: + 
date_to_use = date_earliest_c + use_ctimes = True + if date_earliest_cr: + if date_earliest_cr < date_to_use: + date_to_use = date_earliest_cr + use_ctimes = False + use_crtimes = True + + # store date_earliest and date_latest values based on datetype used + if use_ctimes == True: + date_earliest = date_earliest_c[:10] + date_latest = date_latest_c[:10] + elif use_crtimes == True: + date_earliest = date_earliest_cr[:10] + date_latest = date_latest_cr[:10] + else: + date_earliest = date_earliest_m[:10] + date_latest = date_latest_m[:10] + + # write date statement + if date_earliest[:4] == date_latest[:4]: + date_statement = '%s' % date_earliest[:4] + else: + date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) + + # gather file system info, discern tool used + if args.bagfiles == True: + disktype = os.path.join(current, 'data', 'metadata', + 'submissionDocumentation', 'disktype.txt') + else: + disktype = os.path.join(current, 'metadata', + 'submissionDocumentation', 'disktype.txt') + # pull filesystem info from disktype.txt + disk_fs = '' + try: + for line in open(disktype, 'r'): + if "file system" in line: + disk_fs = line.strip() + except: # handle non-Unicode chars + for line in open(disktype, 'rb'): + if "file system" in line.decode('utf-8','ignore'): + disk_fs = line.decode('utf-8','ignore').strip() + + # save tool used to carve files + if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): + tool = "carved from the disk image using the Sleuth Kit command line utility tsk_recover" + elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): + tool = "carved from disk image using the HFSExplorer command line utility" + elif 'udf' in disk_fs.lower(): + tool = "copied from the mounted disk image" + else: + tool = "UNSUCCESSFULLY" + + # gather info from brunnhilde & write scope and content note + if extent == 'EMPTY': + scopecontent = '' + formatlist = '' + else: + fileformats = [] + 
formatlist = '' + fileformat_csv = '' + if args.bagfiles == True: + fileformat_csv = os.path.join(current, 'data', 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + else: + fileformat_csv = os.path.join(current, 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + try: + with open(fileformat_csv, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in itertools.islice(reader, 5): + fileformats.append(row[0]) + except: + fileformats.append("ERROR! No formats.csv file to pull formats from.") + # replace empty elements with 'Unidentified + fileformats = [element or 'Unidentified' for element in fileformats] + formatlist = ', '.join(fileformats) + + + # create scope and content note + if files_only == True: + scopecontent = 'File includes digital files %s. Most common file formats: %s' % (tool, formatlist) + else: + scopecontent = 'File includes both a disk image and digital files %s. Most common file formats: %s' % (tool, formatlist) + + # write csv row + writer.writerow(['', item, '', '', date_statement, date_earliest, date_latest, 'File', extent, + scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + + print('Described %s successfully.' % (current)) + + # if error reading DFXML file, report that + except: + # write error to csv + writer.writerow(['', item, '', '', 'Error', 'Error', 'Error', 'File', 'Error', + 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + + print('ERROR: DFXML file for %s not well-formed.' 
% (current)) + +def keep_logical_files_only(objects_dir): + # get list of files in files dir + files_dir = os.path.join(objects_dir, 'files') + fileList = os.listdir(files_dir) + fileList = [files_dir + '/' + filename for filename in fileList] + + # move files up one directory + for f in fileList: + shutil.move(f, objects_dir) + + # delete file and diskimage dirs + shutil.rmtree(files_dir) + shutil.rmtree(os.path.join(objects_dir, 'diskimage')) + +# MAIN FLOW + +# parse arguments +parser = argparse.ArgumentParser() +parser.add_argument("-b", "--bagfiles", help="Bag files instead of writing checksum.md5", action="store_true") +parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") +parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") +parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") +parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") +parser.add_argument("--imgtype", help="Disk image type (see tsk_recover man page for values)", action="store") +parser.add_argument("--fstype", help="File system type (see tsk_recover man page for values)", action="store") +parser.add_argument("--sector_offset", help="Sector offset of partition to parse (see tsk-recover man page for details)", action="store") +parser.add_argument("source", help="Source directory containing disk image (and related files)") +parser.add_argument("destination", help="Output destination") +args = parser.parse_args() + +destination = args.destination + +# create output directories +if not os.path.exists(destination): + os.makedirs(destination) + +# open description spreadsheet +try: + spreadsheet = open(os.path.join(destination,'description.csv'), 'w') + writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) + header_list = ['Parent ID', 
'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', + 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', + 'Appraisal, destruction, and scheduling information (optional)', 'Name access points (optional)', + 'Geographic access points (optional)', 'Conditions governing access (optional)', 'Conditions governing reproduction (optional)', + 'Language of material (optional)', 'Physical characteristics & technical requirements affecting use (optional)', + 'Finding aids (optional)', 'Related units of description (optional)', 'Archival history (optional)', + 'Immediate source of acquisition or transfer (optional)', "Archivists' note (optional)", 'General note (optional)', + 'Description status'] + writer.writerow(header_list) +except: + print('There was an error creating the processing spreadsheet.') + sys.exit() + +# iterate through files in source directory +for file in sorted(os.listdir(args.source)): + + # record filename in log + print('>>> NEW FILE: %s' % (file)) + + # determine if disk image + if file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): + + # save info about file + image_path = os.path.join(args.source, file) + image_id = os.path.splitext(file)[0] + image_ext = os.path.splitext(file)[1] + + # create new folders + sip_dir = os.path.join(destination, file) + object_dir = os.path.join(sip_dir, 'objects') + diskimage_dir = os.path.join(object_dir, 'diskimage') + files_dir = os.path.join(object_dir, 'files') + metadata_dir = os.path.join(sip_dir, 'metadata') + subdoc_dir = os.path.join(metadata_dir, 'submissionDocumentation') + + for folder in sip_dir, object_dir, diskimage_dir, files_dir, metadata_dir, subdoc_dir: + os.makedirs(folder) + + # disk image status + raw_image = False + + # check if disk image is ewf + if image_ext == ".E01": + # convert disk image to raw and write to /objects/diskimage + raw_out = os.path.join(diskimage_dir, image_id) 
+ try: + subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) + raw_image = True + os.rename(os.path.join(diskimage_dir, '%s.raw' % (image_id)), os.path.join(diskimage_dir, '%s.img' % image_id)) # change file extension from .raw to .img + os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, '%s.img.info' % image_id)) # rename sidecar md5 file + diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward + except subprocess.CalledProcessError: + print('ERROR: Disk image could not be converted to raw image format. Skipping disk.') + + else: + raw_image = True + for movefile in os.listdir(args.source): + # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) + if movefile.startswith(image_id): + # copy file to objects/diskimage + try: + shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimage_dir, movefile)) + except: + print('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) + diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward + + # if raw disk image, process + if raw_image == True: + + # use fiwalk to make dfxml + fiwalk_file = os.path.join(subdoc_dir, 'dfxml.xml') + try: + subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) + except subprocess.CalledProcessError as e: + print('ERROR: Fiwalk could not create DFXML for disk. 
STDERR: %s' % (e.output)) + + # carve images using tsk_recover with user-supplied options + if args.exportall == True: + carvefiles = ['tsk_recover', '-e', diskimage, files_dir] + else: + carvefiles = ['tsk_recover', '-a', diskimage, files_dir] + + if args.fstype: + carvefiles.insert(2, '-f') + carvefiles.insert(3, args.fstype) + if args.imgtype: + carvefiles.insert(2, '-i') + carvefiles.insert(3, args.imgtype) + if args.sector_offset: + carvefiles.insert(2, '-o') + carvefiles.insert(3, args.sector_offset) + + try: + subprocess.check_output(carvefiles) + except subprocess.CalledProcessError as e: + print('ERROR: tsk_recover could not carve files from disk. STDERR: %s' % (e.output)) + + # modify file permissions + subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) + subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + + # rewrite last modified dates of files based on values in DFXML + for (event, obj) in Objects.iterparse(fiwalk_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): + continue + + # skip directories and links + if obj.name_type: + if obj.name_type != "r": + continue + + # record filename + dfxml_filename = obj.filename + dfxml_filedate = int(time.time()) # default to current time + + # record last modified or last created date + try: + mtime = obj.mtime + mtime = str(mtime) + except: + pass + + try: + crtime = obj.crtime + crtime = str(crtime) + except: + pass + + # fallback to created date if last modified doesn't exist + if mtime and (mtime != 'None'): + mtime = time_to_int(mtime[:19]) + dfxml_filedate = mtime + elif crtime and (crtime != 'None'): + crtime = time_to_int(crtime[:19]) + dfxml_filedate = crtime + else: + continue + + # rewrite last modified date of corresponding file in objects/files + exported_filepath = os.path.join(files_dir, dfxml_filename) + if os.path.isfile(exported_filepath): + os.utime(exported_filepath, (dfxml_filedate, 
dfxml_filedate)) + + # run brunnhilde and write to submissionDocumentation + files_abs = os.path.abspath(files_dir) + if args.piiscan == True: # brunnhilde with bulk_extractor + subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + else: # brunnhilde without bulk_extractor + subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + + # if user selected 'filesonly', remove disk image files and repackage + if args.filesonly == True: + keep_logical_files_only(object_dir) + + # write checksums + if args.bagfiles == True: # bag entire SIP + subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) + else: # write metadata/checksum.md5 + subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) + + # write description spreadsheet + create_spreadsheet(args.filesonly, args.exportall, sip_dir) + + # no raw disk image + else: + print('NOTICE: No raw disk image. Skipping disk.') + + else: + # write skipped file to log + print('NOTICE: File is not a disk image. 
Skipping file.') + +# close files +spreadsheet.close() +log.close() \ No newline at end of file From 77b76c86ea263551d21945d052d5fc2f71608317 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:40:22 -0400 Subject: [PATCH 02/24] import Objects --- process_with_tsk_options.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 96b330a..e8545b5 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -26,6 +26,9 @@ import sys import time +#import Objects.py from python dfxml tools +import Objects + def convert_size(size): # convert size to human-readable form if (size == 0): From e2d00f8e500d9752c5b68878525a842497628d45 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:48:13 -0400 Subject: [PATCH 03/24] Remove resforks option --- process_with_tsk_options.py | 1 - 1 file changed, 1 deletion(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index e8545b5..4b2eff3 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -280,7 +280,6 @@ def keep_logical_files_only(objects_dir): parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") -parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") parser.add_argument("--imgtype", help="Disk image type (see tsk_recover man page for values)", action="store") parser.add_argument("--fstype", help="File system type (see tsk_recover man page for values)", action="store") parser.add_argument("--sector_offset", help="Sector offset of partition to parse (see tsk-recover man page for details)", action="store") From 
ae1f85696b12d98804e04d05455168611e5f99b1 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:51:22 -0400 Subject: [PATCH 04/24] Fix spreadsheet line --- process_with_tsk_options.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 4b2eff3..8a6a1da 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -46,7 +46,7 @@ def time_to_int(str_time): "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(files_only, exportall, sip_dir): +def create_spreadsheet(files_only, exportall, sip_dir, filename): # process each SIP current = os.path.abspath(sip_dir) # test if entry if directory @@ -245,7 +245,7 @@ def create_spreadsheet(files_only, exportall, sip_dir): scopecontent = 'File includes both a disk image and digital files %s. Most common file formats: %s' % (tool, formatlist) # write csv row - writer.writerow(['', item, '', '', date_statement, date_earliest, date_latest, 'File', extent, + writer.writerow(['', filename, '', '', date_statement, date_earliest, date_latest, 'File', extent, scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) print('Described %s successfully.' % (current)) @@ -253,7 +253,7 @@ def create_spreadsheet(files_only, exportall, sip_dir): # if error reading DFXML file, report that except: # write error to csv - writer.writerow(['', item, '', '', 'Error', 'Error', 'Error', 'File', 'Error', + writer.writerow(['', filename, '', '', 'Error', 'Error', 'Error', 'File', 'Error', 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) print('ERROR: DFXML file for %s not well-formed.' 
% (current)) @@ -460,7 +460,7 @@ def keep_logical_files_only(objects_dir): subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) # write description spreadsheet - create_spreadsheet(args.filesonly, args.exportall, sip_dir) + create_spreadsheet(args.filesonly, args.exportall, sip_dir, file) # no raw disk image else: From 26c1a4e041174505fc730cdaa2cee4f2cab2fc31 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:53:41 -0400 Subject: [PATCH 05/24] Hardcore tool info --- process_with_tsk_options.py | 32 ++------------------------------ 1 file changed, 2 insertions(+), 30 deletions(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 8a6a1da..bcd7420 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -183,34 +183,6 @@ def create_spreadsheet(files_only, exportall, sip_dir, filename): else: date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) - # gather file system info, discern tool used - if args.bagfiles == True: - disktype = os.path.join(current, 'data', 'metadata', - 'submissionDocumentation', 'disktype.txt') - else: - disktype = os.path.join(current, 'metadata', - 'submissionDocumentation', 'disktype.txt') - # pull filesystem info from disktype.txt - disk_fs = '' - try: - for line in open(disktype, 'r'): - if "file system" in line: - disk_fs = line.strip() - except: # handle non-Unicode chars - for line in open(disktype, 'rb'): - if "file system" in line.decode('utf-8','ignore'): - disk_fs = line.decode('utf-8','ignore').strip() - - # save tool used to carve files - if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): - tool = "carved from the disk image using the Sleuth Kit command line utility tsk_recover" - elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): - tool = "carved from disk image using the HFSExplorer command line utility" - elif 'udf' in 
disk_fs.lower(): - tool = "copied from the mounted disk image" - else: - tool = "UNSUCCESSFULLY" - # gather info from brunnhilde & write scope and content note if extent == 'EMPTY': scopecontent = '' @@ -240,9 +212,9 @@ def create_spreadsheet(files_only, exportall, sip_dir, filename): # create scope and content note if files_only == True: - scopecontent = 'File includes digital files %s. Most common file formats: %s' % (tool, formatlist) + scopecontent = 'File includes digital files carved from a disk image using tsk_recover. Most common file formats: %s' % (formatlist) else: - scopecontent = 'File includes both a disk image and digital files %s. Most common file formats: %s' % (tool, formatlist) + scopecontent = 'File includes both a disk image and digital files carved from the disk image using tsk_recover. Most common file formats: %s' % (formatlist) # write csv row writer.writerow(['', filename, '', '', date_statement, date_earliest, date_latest, 'File', extent, From bd24417b30719ab38108dca28c60ec651f24c0a3 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 16:57:06 -0400 Subject: [PATCH 06/24] Remove log close --- process_with_tsk_options.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index bcd7420..ef4c31e 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -443,5 +443,4 @@ def keep_logical_files_only(objects_dir): print('NOTICE: File is not a disk image. 
Skipping file.') # close files -spreadsheet.close() -log.close() \ No newline at end of file +spreadsheet.close() \ No newline at end of file From 92e9b62133e441647112e79c01d337f74febaaf9 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 17:17:31 -0400 Subject: [PATCH 07/24] Add process_with_tsk_options.py --- install.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/install.sh b/install.sh index 3cd04ec..74a6e65 100755 --- a/install.sh +++ b/install.sh @@ -24,6 +24,7 @@ fi # Move files into /usr/share/ccatools/diskimageprocessor sudo mv diskimageprocessor.py /usr/share/ccatools/diskimageprocessor sudo mv diskimageanalyzer.py /usr/share/ccatools/diskimageprocessor +sudo mv process_with_tsk_options.py /usr/share/ccatools/diskimageprocessor sudo mv main.py /usr/share/ccatools/diskimageprocessor sudo mv launch /usr/share/ccatools/diskimageprocessor sudo mv design.py /usr/share/ccatools/diskimageprocessor From d62eed6d2793bbd278df29826575249d68c7aacb Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 20:16:27 -0400 Subject: [PATCH 08/24] Restructure into main() --- process_with_tsk_options.py | 711 ++++++++++++++++++------------------ 1 file changed, 356 insertions(+), 355 deletions(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index ef4c31e..2da5ff8 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -46,189 +46,202 @@ def time_to_int(str_time): "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(files_only, exportall, sip_dir, filename): - # process each SIP - current = os.path.abspath(sip_dir) - # test if entry if directory - if os.path.isdir(current): - - # intialize values - number_files = 0 - total_bytes = 0 - mtimes = [] - ctimes = [] - crtimes = [] - - # parse dfxml file - if args.bagfiles == True: - dfxml_file = os.path.abspath(os.path.join(current, - 'data', 'metadata', 'submissionDocumentation', 'dfxml.xml')) - else: - dfxml_file = 
os.path.abspath(os.path.join(current, - 'metadata', 'submissionDocumentation', 'dfxml.xml')) - - # try to read DFXML file - try: - # gather info for each FileObject - for (event, obj) in Objects.iterparse(dfxml_file): - - # only work on FileObjects - if not isinstance(obj, Objects.FileObject): - continue +def create_spreadsheet(files_only, exportall, destination, sip_dir, filename): + # open description spreadsheet and write header + with open(os.path.join(destination,'description.csv'), 'w') as spreadsheet: + writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) + header_list = ['Parent ID', 'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', + 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', + 'Appraisal, destruction, and scheduling information (optional)', 'Name access points (optional)', + 'Geographic access points (optional)', 'Conditions governing access (optional)', 'Conditions governing reproduction (optional)', + 'Language of material (optional)', 'Physical characteristics & technical requirements affecting use (optional)', + 'Finding aids (optional)', 'Related units of description (optional)', 'Archival history (optional)', + 'Immediate source of acquisition or transfer (optional)', "Archivists' note (optional)", 'General note (optional)', + 'Description status'] + writer.writerow(header_list) + + # add info for SIP in new line + current = os.path.abspath(sip_dir) + # test if entry if directory + if os.path.isdir(current): + + # intialize values + number_files = 0 + total_bytes = 0 + mtimes = [] + ctimes = [] + crtimes = [] + + # parse dfxml file + if args.bagfiles == True: + dfxml_file = os.path.abspath(os.path.join(current, + 'data', 'metadata', 'submissionDocumentation', 'dfxml.xml')) + else: + dfxml_file = os.path.abspath(os.path.join(current, + 'metadata', 'submissionDocumentation', 'dfxml.xml')) - # skip directories and links - if 
obj.name_type: - if obj.name_type != "r": + # try to read DFXML file + try: + # gather info for each FileObject + for (event, obj) in Objects.iterparse(dfxml_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): continue - # skip unallocated if args.exportall is False - if exportall == False: - if obj.unalloc: - if obj.unalloc == 1: + # skip directories and links + if obj.name_type: + if obj.name_type != "r": continue - - # gather info - number_files += 1 - try: - mtime = obj.mtime - mtime = str(mtime) - mtimes.append(mtime) - except: - pass + # skip unallocated if args.exportall is False + if exportall == False: + if obj.unalloc: + if obj.unalloc == 1: + continue + + # gather info + number_files += 1 - try: - ctime = obj.ctime - ctime = str(ctime) - ctimes.append(ctime) - except: - pass + try: + mtime = obj.mtime + mtime = str(mtime) + mtimes.append(mtime) + except: + pass - try: - crtime = obj.crtime - crtime = str(crtime) - crtimes.append(crtime) - except: - pass - - total_bytes += obj.filesize - - # filter 'None' values from date lists - for date_list in mtimes, ctimes, crtimes: - while 'None' in date_list: - date_list.remove('None') - - - # build extent statement - size_readable = convert_size(total_bytes) - if number_files == 1: - extent = "1 digital file (%s)" % size_readable - elif number_files == 0: - extent = "EMPTY" - else: - extent = "%d digital files (%s)" % (number_files, size_readable) - - # determine earliest and latest MAC dates from lists - date_earliest_m = "" - date_latest_m = "" - date_earliest_c = "" - date_latest_c = "" - date_earliest_cr = "" - date_latest_cr = "" - date_statement = "" - - if mtimes: - date_earliest_m = min(mtimes) - date_latest_m = max(mtimes) - if ctimes: - date_earliest_c = min(ctimes) - date_latest_c = max(ctimes) - if crtimes: - date_earliest_cr = min(crtimes) - date_latest_cr = max(crtimes) - - # determine which set of dates to use (logic: use set with earliest start date) - use_ctimes = 
False - use_crtimes = False - - if not date_earliest_m: - date_earliest_m = "N/A" - date_latest_m = "N/A" - date_to_use = date_earliest_m # default to date modified - - if date_earliest_c: - if date_earliest_c < date_to_use: - date_to_use = date_earliest_c - use_ctimes = True - if date_earliest_cr: - if date_earliest_cr < date_to_use: - date_to_use = date_earliest_cr - use_ctimes = False - use_crtimes = True - - # store date_earliest and date_latest values based on datetype used - if use_ctimes == True: - date_earliest = date_earliest_c[:10] - date_latest = date_latest_c[:10] - elif use_crtimes == True: - date_earliest = date_earliest_cr[:10] - date_latest = date_latest_cr[:10] - else: - date_earliest = date_earliest_m[:10] - date_latest = date_latest_m[:10] + try: + ctime = obj.ctime + ctime = str(ctime) + ctimes.append(ctime) + except: + pass - # write date statement - if date_earliest[:4] == date_latest[:4]: - date_statement = '%s' % date_earliest[:4] - else: - date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) + try: + crtime = obj.crtime + crtime = str(crtime) + crtimes.append(crtime) + except: + pass + + total_bytes += obj.filesize - # gather info from brunnhilde & write scope and content note - if extent == 'EMPTY': - scopecontent = '' - formatlist = '' - else: - fileformats = [] - formatlist = '' - fileformat_csv = '' - if args.bagfiles == True: - fileformat_csv = os.path.join(current, 'data', 'metadata', 'submissionDocumentation', - 'brunnhilde', 'csv_reports', 'formats.csv') + # filter 'None' values from date lists + for date_list in mtimes, ctimes, crtimes: + while 'None' in date_list: + date_list.remove('None') + + + # build extent statement + size_readable = convert_size(total_bytes) + if number_files == 1: + extent = "1 digital file (%s)" % size_readable + elif number_files == 0: + extent = "EMPTY" else: - fileformat_csv = os.path.join(current, 'metadata', 'submissionDocumentation', - 'brunnhilde', 'csv_reports', 'formats.csv') - try: - 
with open(fileformat_csv, 'r') as f: - reader = csv.reader(f) - next(reader) - for row in itertools.islice(reader, 5): - fileformats.append(row[0]) - except: - fileformats.append("ERROR! No formats.csv file to pull formats from.") - # replace empty elements with 'Unidentified - fileformats = [element or 'Unidentified' for element in fileformats] - formatlist = ', '.join(fileformats) - - - # create scope and content note - if files_only == True: - scopecontent = 'File includes digital files carved from a disk image using tsk_recover. Most common file formats: %s' % (formatlist) + extent = "%d digital files (%s)" % (number_files, size_readable) + + # determine earliest and latest MAC dates from lists + date_earliest_m = "" + date_latest_m = "" + date_earliest_c = "" + date_latest_c = "" + date_earliest_cr = "" + date_latest_cr = "" + date_statement = "" + + if mtimes: + date_earliest_m = min(mtimes) + date_latest_m = max(mtimes) + if ctimes: + date_earliest_c = min(ctimes) + date_latest_c = max(ctimes) + if crtimes: + date_earliest_cr = min(crtimes) + date_latest_cr = max(crtimes) + + # determine which set of dates to use (logic: use set with earliest start date) + use_ctimes = False + use_crtimes = False + + if not date_earliest_m: + date_earliest_m = "N/A" + date_latest_m = "N/A" + date_to_use = date_earliest_m # default to date modified + + if date_earliest_c: + if date_earliest_c < date_to_use: + date_to_use = date_earliest_c + use_ctimes = True + if date_earliest_cr: + if date_earliest_cr < date_to_use: + date_to_use = date_earliest_cr + use_ctimes = False + use_crtimes = True + + # store date_earliest and date_latest values based on datetype used + if use_ctimes == True: + date_earliest = date_earliest_c[:10] + date_latest = date_latest_c[:10] + elif use_crtimes == True: + date_earliest = date_earliest_cr[:10] + date_latest = date_latest_cr[:10] else: - scopecontent = 'File includes both a disk image and digital files carved from the disk image using 
tsk_recover. Most common file formats: %s' % (formatlist) + date_earliest = date_earliest_m[:10] + date_latest = date_latest_m[:10] - # write csv row - writer.writerow(['', filename, '', '', date_statement, date_earliest, date_latest, 'File', extent, - scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) - - print('Described %s successfully.' % (current)) + # write date statement + if date_earliest[:4] == date_latest[:4]: + date_statement = '%s' % date_earliest[:4] + else: + date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) + + # gather info from brunnhilde & write scope and content note + if extent == 'EMPTY': + scopecontent = '' + formatlist = '' + else: + fileformats = [] + formatlist = '' + fileformat_csv = '' + if args.bagfiles == True: + fileformat_csv = os.path.join(current, 'data', 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + else: + fileformat_csv = os.path.join(current, 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + try: + with open(fileformat_csv, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in itertools.islice(reader, 5): + fileformats.append(row[0]) + except: + fileformats.append("ERROR! No formats.csv file to pull formats from.") + # replace empty elements with 'Unidentified + fileformats = [element or 'Unidentified' for element in fileformats] + formatlist = ', '.join(fileformats) + + + # create scope and content note + if files_only == True: + scopecontent = 'File includes digital files carved from a disk image using tsk_recover. Most common file formats: %s' % (formatlist) + else: + scopecontent = 'File includes both a disk image and digital files carved from the disk image using tsk_recover. 
Most common file formats: %s' % (formatlist) + + # write csv row + writer.writerow(['', filename, '', '', date_statement, date_earliest, date_latest, 'File', extent, + scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + + print('Described %s successfully.' % (current)) - # if error reading DFXML file, report that - except: - # write error to csv - writer.writerow(['', filename, '', '', 'Error', 'Error', 'Error', 'File', 'Error', - 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + # if error reading DFXML file, report that + except: + # write error to csv + writer.writerow(['', filename, '', '', 'Error', 'Error', 'Error', 'File', 'Error', + 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) - print('ERROR: DFXML file for %s not well-formed.' % (current)) + print('ERROR: DFXML file for %s not well-formed.' % (current)) def keep_logical_files_only(objects_dir): # get list of files in files dir @@ -244,203 +257,191 @@ def keep_logical_files_only(objects_dir): shutil.rmtree(files_dir) shutil.rmtree(os.path.join(objects_dir, 'diskimage')) -# MAIN FLOW - -# parse arguments -parser = argparse.ArgumentParser() -parser.add_argument("-b", "--bagfiles", help="Bag files instead of writing checksum.md5", action="store_true") -parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") -parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") -parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") -parser.add_argument("--imgtype", help="Disk image type (see tsk_recover man page for values)", action="store") -parser.add_argument("--fstype", help="File system type (see tsk_recover man page for values)", action="store") -parser.add_argument("--sector_offset", help="Sector offset of partition to 
parse (see tsk-recover man page for details)", action="store") -parser.add_argument("source", help="Source directory containing disk image (and related files)") -parser.add_argument("destination", help="Output destination") -args = parser.parse_args() - -destination = args.destination - -# create output directories -if not os.path.exists(destination): - os.makedirs(destination) - -# open description spreadsheet -try: - spreadsheet = open(os.path.join(destination,'description.csv'), 'w') - writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) - header_list = ['Parent ID', 'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', - 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', - 'Appraisal, destruction, and scheduling information (optional)', 'Name access points (optional)', - 'Geographic access points (optional)', 'Conditions governing access (optional)', 'Conditions governing reproduction (optional)', - 'Language of material (optional)', 'Physical characteristics & technical requirements affecting use (optional)', - 'Finding aids (optional)', 'Related units of description (optional)', 'Archival history (optional)', - 'Immediate source of acquisition or transfer (optional)', "Archivists' note (optional)", 'General note (optional)', - 'Description status'] - writer.writerow(header_list) -except: - print('There was an error creating the processing spreadsheet.') - sys.exit() - -# iterate through files in source directory -for file in sorted(os.listdir(args.source)): - - # record filename in log - print('>>> NEW FILE: %s' % (file)) - - # determine if disk image - if file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): - - # save info about file - image_path = os.path.join(args.source, file) - image_id = os.path.splitext(file)[0] - image_ext = os.path.splitext(file)[1] - - # create new folders - sip_dir = os.path.join(destination, file) - object_dir 
= os.path.join(sip_dir, 'objects') - diskimage_dir = os.path.join(object_dir, 'diskimage') - files_dir = os.path.join(object_dir, 'files') - metadata_dir = os.path.join(sip_dir, 'metadata') - subdoc_dir = os.path.join(metadata_dir, 'submissionDocumentation') - - for folder in sip_dir, object_dir, diskimage_dir, files_dir, metadata_dir, subdoc_dir: - os.makedirs(folder) - - # disk image status - raw_image = False - - # check if disk image is ewf - if image_ext == ".E01": - # convert disk image to raw and write to /objects/diskimage - raw_out = os.path.join(diskimage_dir, image_id) - try: - subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) +def _make_parser(): + + parser = argparse.ArgumentParser() + parser.add_argument("-b", "--bagfiles", help="Bag files instead of writing checksum.md5", action="store_true") + parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") + parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") + parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") + parser.add_argument("--imgtype", help="Disk image type (see tsk_recover man page for values)", action="store") + parser.add_argument("--fstype", help="File system type (see tsk_recover man page for values)", action="store") + parser.add_argument("--sector_offset", help="Sector offset of partition to parse (see tsk-recover man page for details)", action="store") + parser.add_argument("source", help="Source directory containing disk image (and related files)") + parser.add_argument("destination", help="Output destination") + + return parser + +def main(): + # parse args + parser = _make_parser(brunnhilde_version) + args = parser.parse_args() + source = os.path.abspath(args.source) + destination = os.path.abspath(args.destination) + + # create output 
directories + if not os.path.exists(destination): + os.makedirs(destination) + + # iterate through files in source directory + for file in sorted(os.listdir(source)): + + # record filename in log + print('>>> NEW FILE: %s' % (file)) + + # determine if disk image + if file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): + + # save info about file + image_path = os.path.join(source, file) + image_id = os.path.splitext(file)[0] + image_ext = os.path.splitext(file)[1] + + # create new folders + sip_dir = os.path.join(args.destination, file) + object_dir = os.path.join(sip_dir, 'objects') + diskimage_dir = os.path.join(object_dir, 'diskimage') + files_dir = os.path.join(object_dir, 'files') + metadata_dir = os.path.join(sip_dir, 'metadata') + subdoc_dir = os.path.join(metadata_dir, 'submissionDocumentation') + + for folder in sip_dir, object_dir, diskimage_dir, files_dir, metadata_dir, subdoc_dir: + os.makedirs(folder) + + # disk image status + raw_image = False + + # check if disk image is ewf + if image_ext == ".E01": + # convert disk image to raw and write to /objects/diskimage + raw_out = os.path.join(diskimage_dir, image_id) + try: + subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) + raw_image = True + os.rename(os.path.join(diskimage_dir, '%s.raw' % (image_id)), os.path.join(diskimage_dir, '%s.img' % image_id)) # change file extension from .raw to .img + os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, '%s.img.info' % image_id)) # rename sidecar md5 file + diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward + except subprocess.CalledProcessError: + print('ERROR: Disk image could not be converted to raw image format. 
Skipping disk.') + + else: raw_image = True - os.rename(os.path.join(diskimage_dir, '%s.raw' % (image_id)), os.path.join(diskimage_dir, '%s.img' % image_id)) # change file extension from .raw to .img - os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, '%s.img.info' % image_id)) # rename sidecar md5 file - diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward - except subprocess.CalledProcessError: - print('ERROR: Disk image could not be converted to raw image format. Skipping disk.') + for movefile in os.listdir(source): + # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) + if movefile.startswith(image_id): + # copy file to objects/diskimage + try: + shutil.copyfile(os.path.join(source, movefile), os.path.join(diskimage_dir, movefile)) + except: + print('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) + diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward + + # if raw disk image, process + if raw_image == True: + + # use fiwalk to make dfxml + fiwalk_file = os.path.join(subdoc_dir, 'dfxml.xml') + try: + subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) + except subprocess.CalledProcessError as e: + print('ERROR: Fiwalk could not create DFXML for disk. 
STDERR: %s' % (e.output)) + + # carve images using tsk_recover with user-supplied options + if args.exportall == True: + carvefiles = ['tsk_recover', '-e', diskimage, files_dir] + else: + carvefiles = ['tsk_recover', '-a', diskimage, files_dir] + + if args.fstype: + carvefiles.insert(2, '-f') + carvefiles.insert(3, args.fstype) + if args.imgtype: + carvefiles.insert(2, '-i') + carvefiles.insert(3, args.imgtype) + if args.sector_offset: + carvefiles.insert(2, '-o') + carvefiles.insert(3, args.sector_offset) - else: - raw_image = True - for movefile in os.listdir(args.source): - # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) - if movefile.startswith(image_id): - # copy file to objects/diskimage + try: + subprocess.check_output(carvefiles) + except subprocess.CalledProcessError as e: + print('ERROR: tsk_recover could not carve files from disk. STDERR: %s' % (e.output)) + + # modify file permissions + subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) + subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + + # rewrite last modified dates of files based on values in DFXML + for (event, obj) in Objects.iterparse(fiwalk_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): + continue + + # skip directories and links + if obj.name_type: + if obj.name_type != "r": + continue + + # record filename + dfxml_filename = obj.filename + dfxml_filedate = int(time.time()) # default to current time + + # record last modified or last created date try: - shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimage_dir, movefile)) + mtime = obj.mtime + mtime = str(mtime) except: - print('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) - diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward + pass - # if raw disk image, process - 
if raw_image == True: + try: + crtime = obj.crtime + crtime = str(crtime) + except: + pass + + # fallback to created date if last modified doesn't exist + if mtime and (mtime != 'None'): + mtime = time_to_int(mtime[:19]) + dfxml_filedate = mtime + elif crtime and (crtime != 'None'): + crtime = time_to_int(crtime[:19]) + dfxml_filedate = crtime + else: + continue - # use fiwalk to make dfxml - fiwalk_file = os.path.join(subdoc_dir, 'dfxml.xml') - try: - subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) - except subprocess.CalledProcessError as e: - print('ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) - - # carve images using tsk_recover with user-supplied options - if args.exportall == True: - carvefiles = ['tsk_recover', '-e', diskimage, files_dir] - else: - carvefiles = ['tsk_recover', '-a', diskimage, files_dir] - - if args.fstype: - carvefiles.insert(2, '-f') - carvefiles.insert(3, args.fstype) - if args.imgtype: - carvefiles.insert(2, '-i') - carvefiles.insert(3, args.imgtype) - if args.sector_offset: - carvefiles.insert(2, '-o') - carvefiles.insert(3, args.sector_offset) + # rewrite last modified date of corresponding file in objects/files + exported_filepath = os.path.join(files_dir, dfxml_filename) + if os.path.isfile(exported_filepath): + os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) - try: - subprocess.check_output(carvefiles) - except subprocess.CalledProcessError as e: - print('ERROR: tsk_recover could not carve files from disk. 
STDERR: %s' % (e.output)) + # run brunnhilde and write to submissionDocumentation + files_abs = os.path.abspath(files_dir) + if args.piiscan == True: # brunnhilde with bulk_extractor + subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + else: # brunnhilde without bulk_extractor + subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - # modify file permissions - subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) - subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + # if user selected 'filesonly', remove disk image files and repackage + if args.filesonly == True: + keep_logical_files_only(object_dir) - # rewrite last modified dates of files based on values in DFXML - for (event, obj) in Objects.iterparse(fiwalk_file): - - # only work on FileObjects - if not isinstance(obj, Objects.FileObject): - continue + # write checksums + if args.bagfiles == True: # bag entire SIP + subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) + else: # write metadata/checksum.md5 + subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) - # skip directories and links - if obj.name_type: - if obj.name_type != "r": - continue - - # record filename - dfxml_filename = obj.filename - dfxml_filedate = int(time.time()) # default to current time + # write description spreadsheet + populate_spreadsheet(args.filesonly, args.exportall, destination sip_dir, file) - # record last modified or last created date - try: - mtime = obj.mtime - mtime = str(mtime) - except: - pass + # no raw disk image + else: + print('NOTICE: No raw disk image. 
Skipping disk.') - try: - crtime = obj.crtime - crtime = str(crtime) - except: - pass - - # fallback to created date if last modified doesn't exist - if mtime and (mtime != 'None'): - mtime = time_to_int(mtime[:19]) - dfxml_filedate = mtime - elif crtime and (crtime != 'None'): - crtime = time_to_int(crtime[:19]) - dfxml_filedate = crtime - else: - continue - - # rewrite last modified date of corresponding file in objects/files - exported_filepath = os.path.join(files_dir, dfxml_filename) - if os.path.isfile(exported_filepath): - os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) - - # run brunnhilde and write to submissionDocumentation - files_abs = os.path.abspath(files_dir) - if args.piiscan == True: # brunnhilde with bulk_extractor - subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - else: # brunnhilde without bulk_extractor - subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - - # if user selected 'filesonly', remove disk image files and repackage - if args.filesonly == True: - keep_logical_files_only(object_dir) - - # write checksums - if args.bagfiles == True: # bag entire SIP - subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) - else: # write metadata/checksum.md5 - subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) - - # write description spreadsheet - create_spreadsheet(args.filesonly, args.exportall, sip_dir, file) - - # no raw disk image else: - print('NOTICE: No raw disk image. Skipping disk.') - - else: - # write skipped file to log - print('NOTICE: File is not a disk image. Skipping file.') + # write skipped file to log + print('NOTICE: File is not a disk image. 
Skipping file.') -# close files -spreadsheet.close() \ No newline at end of file +if __name__ == '__main__': + main() \ No newline at end of file From 0b818601f043b669d27624fdb02b5beed35f4e33 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 20:18:35 -0400 Subject: [PATCH 09/24] Fix typo --- process_with_tsk_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 2da5ff8..8b9f5c2 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -433,7 +433,7 @@ def main(): subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) # write description spreadsheet - populate_spreadsheet(args.filesonly, args.exportall, destination sip_dir, file) + populate_spreadsheet(args.filesonly, args.exportall, destination, sip_dir, file) # no raw disk image else: From 6aadcc753c7c523155cb3287b9f121b2bde74261 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 20:20:10 -0400 Subject: [PATCH 10/24] Fix arg parser --- process_with_tsk_options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 8b9f5c2..86d1a43 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -274,7 +274,7 @@ def _make_parser(): def main(): # parse args - parser = _make_parser(brunnhilde_version) + parser = _make_parser() args = parser.parse_args() source = os.path.abspath(args.source) destination = os.path.abspath(args.destination) From 4658f105abda820dadbaf488c9b3541e9ab7ba0a Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 20:23:00 -0400 Subject: [PATCH 11/24] Modify spreadsheet function --- process_with_tsk_options.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 86d1a43..549be68 100644 --- a/process_with_tsk_options.py +++ 
b/process_with_tsk_options.py @@ -433,7 +433,8 @@ def main(): subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) # write description spreadsheet - populate_spreadsheet(args.filesonly, args.exportall, destination, sip_dir, file) + print('Generating description spreadsheet for file %s...' % (file)) + create_spreadsheet(args.filesonly, args.exportall, destination, sip_dir, file) # no raw disk image else: From 1b5456e3806132c50c954a4a642e2b0e56998528 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Thu, 12 Oct 2017 20:27:46 -0400 Subject: [PATCH 12/24] Fix arg imports to create_spreadsheet function --- process_with_tsk_options.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/process_with_tsk_options.py b/process_with_tsk_options.py index 549be68..3c1cdf0 100644 --- a/process_with_tsk_options.py +++ b/process_with_tsk_options.py @@ -46,9 +46,10 @@ def time_to_int(str_time): "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(files_only, exportall, destination, sip_dir, filename): +def create_spreadsheet(args, destination, sip_dir, filename): # open description spreadsheet and write header with open(os.path.join(destination,'description.csv'), 'w') as spreadsheet: + writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) header_list = ['Parent ID', 'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', @@ -95,7 +96,7 @@ def create_spreadsheet(files_only, exportall, destination, sip_dir, filename): continue # skip unallocated if args.exportall is False - if exportall == False: + if args.exportall == False: if obj.unalloc: if obj.unalloc == 1: continue @@ -224,7 +225,7 @@ def create_spreadsheet(files_only, exportall, destination, sip_dir, filename): # create scope and content note - if files_only == True: + if args.filesonly == True: 
scopecontent = 'File includes digital files carved from a disk image using tsk_recover. Most common file formats: %s' % (formatlist) else: scopecontent = 'File includes both a disk image and digital files carved from the disk image using tsk_recover. Most common file formats: %s' % (formatlist) @@ -434,7 +435,7 @@ def main(): # write description spreadsheet print('Generating description spreadsheet for file %s...' % (file)) - create_spreadsheet(args.filesonly, args.exportall, destination, sip_dir, file) + create_spreadsheet(args, destination, sip_dir, file) # no raw disk image else: From 3ead8676fadd3c13d5d30b51b332bf589b3a7b3d Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:26:08 -0400 Subject: [PATCH 13/24] Refactor --- diskimageprocessor.py | 988 +++++++++++++++++++++--------------------- 1 file changed, 498 insertions(+), 490 deletions(-) diff --git a/diskimageprocessor.py b/diskimageprocessor.py index a417781..8e0fe94 100644 --- a/diskimageprocessor.py +++ b/diskimageprocessor.py @@ -40,12 +40,13 @@ import Objects def logandprint(message): + """ Print to log and terminal """ log.write('\n' + (time.strftime("%H:%M:%S %b %d, %Y - ", time.localtime())) + message) print(message) def convert_size(size): - # convert size to human-readable form + """ Convert size to human-readable form """ if (size == 0): return '0 bytes' size_name = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") @@ -57,228 +58,247 @@ def convert_size(size): return '%s %s' % (s,size_name[i]) def time_to_int(str_time): + """ Convert datetime to unix integer value """ dt = time.mktime(datetime.datetime.strptime(str_time, "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(files_only, exportall): - # process each SIP - for item in sorted(os.listdir(sips)): - current = os.path.join(sips, item) - # test if entry if directory - if os.path.isdir(current): - - # intialize values - number_files = 0 - total_bytes = 0 - mtimes = [] - ctimes = [] - crtimes = [] - - # 
parse dfxml file - if args.bagfiles == True: - dfxml_file = os.path.abspath(os.path.join(current, - 'data', 'metadata', 'submissionDocumentation', 'dfxml.xml')) - else: - dfxml_file = os.path.abspath(os.path.join(current, - 'metadata', 'submissionDocumentation', 'dfxml.xml')) - - # try to read DFXML file - try: - # gather info for each FileObject - for (event, obj) in Objects.iterparse(dfxml_file): - - # only work on FileObjects - if not isinstance(obj, Objects.FileObject): - continue +def create_spreadsheet(args): + """ Create csv describing created SIPs """ + + # open description spreadsheet + csv_path = os.path.abspath(os.path.join(args.destination, 'description.csv')) + with open(csv_path, 'w') as description_csv: + writer = csv.writer(description_csv, quoting=csv.QUOTE_NONNUMERIC) + + # write header + header_list = ['Parent ID', 'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', + 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', + 'Appraisal, destruction, and scheduling information (optional)', 'Name access points (optional)', + 'Geographic access points (optional)', 'Conditions governing access (optional)', 'Conditions governing reproduction (optional)', + 'Language of material (optional)', 'Physical characteristics & technical requirements affecting use (optional)', + 'Finding aids (optional)', 'Related units of description (optional)', 'Archival history (optional)', + 'Immediate source of acquisition or transfer (optional)', "Archivists' note (optional)", 'General note (optional)', + 'Description status'] + writer.writerow(header_list) + + # process each SIP + for item in sorted(os.listdir(sips)): + current = os.path.join(sips, item) + # test if entry if directory + if os.path.isdir(current): + + # intialize values + number_files = 0 + total_bytes = 0 + mtimes = [] + ctimes = [] + crtimes = [] + + # parse dfxml file + if args.bagfiles == True: + dfxml_file = 
os.path.abspath(os.path.join(current, + 'data', 'metadata', 'submissionDocumentation', 'dfxml.xml')) + else: + dfxml_file = os.path.abspath(os.path.join(current, + 'metadata', 'submissionDocumentation', 'dfxml.xml')) - # skip directories and links - if obj.name_type: - if obj.name_type != "r": + # try to read DFXML file + try: + # gather info for each FileObject + for (event, obj) in Objects.iterparse(dfxml_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): continue - # skip unallocated if args.exportall is False - if exportall == False: - if obj.unalloc: - if obj.unalloc == 1: + # skip directories and links + if obj.name_type: + if obj.name_type != "r": continue - - # gather info - number_files += 1 - - try: - mtime = obj.mtime - mtime = str(mtime) - mtimes.append(mtime) - except: - pass - try: - ctime = obj.ctime - ctime = str(ctime) - ctimes.append(ctime) - except: - pass + # skip unallocated if args.exportall is False + if args.exportall == False: + if obj.unalloc: + if obj.unalloc == 1: + continue + + # gather info + number_files += 1 + + try: + mtime = obj.mtime + mtime = str(mtime) + mtimes.append(mtime) + except: + pass + + try: + ctime = obj.ctime + ctime = str(ctime) + ctimes.append(ctime) + except: + pass + + try: + crtime = obj.crtime + crtime = str(crtime) + crtimes.append(crtime) + except: + pass + + total_bytes += obj.filesize - try: - crtime = obj.crtime - crtime = str(crtime) - crtimes.append(crtime) - except: - pass - - total_bytes += obj.filesize - - # filter 'None' values from date lists - for date_list in mtimes, ctimes, crtimes: - while 'None' in date_list: - date_list.remove('None') - - - # build extent statement - size_readable = convert_size(total_bytes) - if number_files == 1: - extent = "1 digital file (%s)" % size_readable - elif number_files == 0: - extent = "EMPTY" - else: - extent = "%d digital files (%s)" % (number_files, size_readable) - - # determine earliest and latest MAC dates from lists - 
date_earliest_m = "" - date_latest_m = "" - date_earliest_c = "" - date_latest_c = "" - date_earliest_cr = "" - date_latest_cr = "" - date_statement = "" - - if mtimes: - date_earliest_m = min(mtimes) - date_latest_m = max(mtimes) - if ctimes: - date_earliest_c = min(ctimes) - date_latest_c = max(ctimes) - if crtimes: - date_earliest_cr = min(crtimes) - date_latest_cr = max(crtimes) - - # determine which set of dates to use (logic: use set with earliest start date) - use_ctimes = False - use_crtimes = False - - if not date_earliest_m: - date_earliest_m = "N/A" - date_latest_m = "N/A" - date_to_use = date_earliest_m # default to date modified - - if date_earliest_c: - if date_earliest_c < date_to_use: - date_to_use = date_earliest_c - use_ctimes = True - if date_earliest_cr: - if date_earliest_cr < date_to_use: - date_to_use = date_earliest_cr - use_ctimes = False - use_crtimes = True - - # store date_earliest and date_latest values based on datetype used - if use_ctimes == True: - date_earliest = date_earliest_c[:10] - date_latest = date_latest_c[:10] - elif use_crtimes == True: - date_earliest = date_earliest_cr[:10] - date_latest = date_latest_cr[:10] - else: - date_earliest = date_earliest_m[:10] - date_latest = date_latest_m[:10] + # filter 'None' values from date lists + for date_list in mtimes, ctimes, crtimes: + while 'None' in date_list: + date_list.remove('None') - # write date statement - if date_earliest[:4] == date_latest[:4]: - date_statement = '%s' % date_earliest[:4] - else: - date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) - # gather file system info, discern tool used - if args.bagfiles == True: - disktype = os.path.join(current, 'data', 'metadata', - 'submissionDocumentation', 'disktype.txt') - else: - disktype = os.path.join(current, 'metadata', - 'submissionDocumentation', 'disktype.txt') - # pull filesystem info from disktype.txt - disk_fs = '' - try: - for line in open(disktype, 'r'): - if "file system" in line: - disk_fs = 
line.strip() - except: # handle non-Unicode chars - for line in open(disktype, 'rb'): - if "file system" in line.decode('utf-8','ignore'): - disk_fs = line.decode('utf-8','ignore').strip() + # build extent statement + size_readable = convert_size(total_bytes) + if number_files == 1: + extent = "1 digital file (%s)" % size_readable + elif number_files == 0: + extent = "EMPTY" + else: + extent = "%d digital files (%s)" % (number_files, size_readable) + + # determine earliest and latest MAC dates from lists + date_earliest_m = "" + date_latest_m = "" + date_earliest_c = "" + date_latest_c = "" + date_earliest_cr = "" + date_latest_cr = "" + date_statement = "" + + if mtimes: + date_earliest_m = min(mtimes) + date_latest_m = max(mtimes) + if ctimes: + date_earliest_c = min(ctimes) + date_latest_c = max(ctimes) + if crtimes: + date_earliest_cr = min(crtimes) + date_latest_cr = max(crtimes) + + # determine which set of dates to use (logic: use set with earliest start date) + use_ctimes = False + use_crtimes = False + + if not date_earliest_m: + date_earliest_m = "N/A" + date_latest_m = "N/A" + date_to_use = date_earliest_m # default to date modified + + if date_earliest_c: + if date_earliest_c < date_to_use: + date_to_use = date_earliest_c + use_ctimes = True + if date_earliest_cr: + if date_earliest_cr < date_to_use: + date_to_use = date_earliest_cr + use_ctimes = False + use_crtimes = True + + # store date_earliest and date_latest values based on datetype used + if use_ctimes == True: + date_earliest = date_earliest_c[:10] + date_latest = date_latest_c[:10] + elif use_crtimes == True: + date_earliest = date_earliest_cr[:10] + date_latest = date_latest_cr[:10] + else: + date_earliest = date_earliest_m[:10] + date_latest = date_latest_m[:10] - # save tool used to carve files - if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): - tool = "carved from the disk image using the Sleuth Kit command line utility 
tsk_recover" - elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): - tool = "carved from disk image using the HFSExplorer command line utility" - elif 'udf' in disk_fs.lower(): - tool = "copied from the mounted disk image" - else: - tool = "UNSUCCESSFULLY" + # write date statement + if date_earliest[:4] == date_latest[:4]: + date_statement = '%s' % date_earliest[:4] + else: + date_statement = '%s - %s' % (date_earliest[:4], date_latest[:4]) - # gather info from brunnhilde & write scope and content note - if extent == 'EMPTY': - scopecontent = '' - formatlist = '' - else: - fileformats = [] - formatlist = '' - fileformat_csv = '' + # gather file system info, discern tool used if args.bagfiles == True: - fileformat_csv = os.path.join(current, 'data', 'metadata', 'submissionDocumentation', - 'brunnhilde', 'csv_reports', 'formats.csv') + disktype = os.path.join(current, 'data', 'metadata', + 'submissionDocumentation', 'disktype.txt') else: - fileformat_csv = os.path.join(current, 'metadata', 'submissionDocumentation', - 'brunnhilde', 'csv_reports', 'formats.csv') - try: - with open(fileformat_csv, 'r') as f: - reader = csv.reader(f) - next(reader) - for row in itertools.islice(reader, 5): - fileformats.append(row[0]) - except: - fileformats.append("ERROR! No formats.csv file to pull formats from.") - # replace empty elements with 'Unidentified - fileformats = [element or 'Unidentified' for element in fileformats] - formatlist = ', '.join(fileformats) - - - # create scope and content note - if files_only == True: - scopecontent = 'File includes digital files %s. 
Most common file formats: %s' % (tool, formatlist) + disktype = os.path.join(current, 'metadata', + 'submissionDocumentation', 'disktype.txt') + # pull filesystem info from disktype.txt + disk_fs = '' + try: + for line in open(disktype, 'r'): + if "file system" in line: + disk_fs = line.strip() + except: # handle non-Unicode chars + for line in open(disktype, 'rb'): + if "file system" in line.decode('utf-8','ignore'): + disk_fs = line.decode('utf-8','ignore').strip() + + # save tool used to carve files + if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): + tool = "carved from the disk image using the Sleuth Kit command line utility tsk_recover" + elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): + tool = "carved from disk image using the HFSExplorer command line utility" + elif 'udf' in disk_fs.lower(): + tool = "copied from the mounted disk image" else: - scopecontent = 'File includes both a disk image and digital files %s. Most common file formats: %s' % (tool, formatlist) - - # write csv row - writer.writerow(['', item, '', '', date_statement, date_earliest, date_latest, 'File', extent, - scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) - - logandprint('Described %s successfully.' % (current)) - - # if error reading DFXML file, report that - except: - # write error to csv - writer.writerow(['', item, '', '', 'Error', 'Error', 'Error', 'File', 'Error', - 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + tool = "UNSUCCESSFULLY" - logandprint('ERROR: DFXML file for %s not well-formed.' 
% (current)) + # gather info from brunnhilde & write scope and content note + if extent == 'EMPTY': + scopecontent = '' + formatlist = '' + else: + fileformats = [] + formatlist = '' + fileformat_csv = '' + if args.bagfiles == True: + fileformat_csv = os.path.join(current, 'data', 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + else: + fileformat_csv = os.path.join(current, 'metadata', 'submissionDocumentation', + 'brunnhilde', 'csv_reports', 'formats.csv') + try: + with open(fileformat_csv, 'r') as f: + reader = csv.reader(f) + next(reader) + for row in itertools.islice(reader, 5): + fileformats.append(row[0]) + except: + fileformats.append("ERROR! No formats.csv file to pull formats from.") + # replace empty elements with 'Unidentified + fileformats = [element or 'Unidentified' for element in fileformats] + formatlist = ', '.join(fileformats) + + + # create scope and content note + if args.filesonly == True: + scopecontent = 'File includes digital files %s. Most common file formats: %s' % (tool, formatlist) + else: + scopecontent = 'File includes both a disk image and digital files %s. Most common file formats: %s' % (tool, formatlist) + + # write csv row + writer.writerow(['', item, '', '', date_statement, date_earliest, date_latest, 'File', extent, + scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + + logandprint('Described %s successfully.' % (current)) + # if error reading DFXML file, report that + except: + # write error to csv + writer.writerow(['', item, '', '', 'Error', 'Error', 'Error', 'File', 'Error', + 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) + logandprint('ERROR: DFXML file for %s not well-formed.' % (current)) - logandprint('All SIPs described in spreadsheet. 
Process complete.') + logandprint('Description CSV created.') def keep_logical_files_only(objects_dir): + """ Remove disk image from SIP and repackage """ + # get list of files in files dir files_dir = os.path.join(objects_dir, 'files') fileList = os.listdir(files_dir) @@ -292,326 +312,314 @@ def keep_logical_files_only(objects_dir): shutil.rmtree(files_dir) shutil.rmtree(os.path.join(objects_dir, 'diskimage')) +def _make_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("-b", "--bagfiles", help="Bag files instead of writing checksum.md5", action="store_true") + parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") + parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") + parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") + parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") + parser.add_argument("source", help="Source directory containing disk images (and related files)") + parser.add_argument("destination", help="Output destination") + + return parser + +def main(): + + parser = _make_parser() + args = parser.parse_args() + + destination = os.path.abspath(args.destination) + + # create output directories + if not os.path.exists(destination): + os.makedirs(destination) + + sips = os.path.join(destination, 'SIPs') + os.makedirs(sips) + + # open log file + log_file = os.path.join(destination, 'diskimageprocessor-log.txt') + try: + log = open(log_file, 'w') # open the log file + logandprint('Source of disk images: %s' % (args.source)) + except: + logandprint('There was an error creating the log file.') + + # make list for unprocessed disks + unprocessed = [] + + # iterate through files in source directory + for file in sorted(os.listdir(args.source)): + + # record filename in log + 
logandprint('>>> NEW FILE: %s' % (file)) + + # determine if disk image + if file.lower().endswith((".e01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): + + # save info about file + image_path = os.path.join(args.source, file) + image_id = os.path.splitext(file)[0] + image_ext = os.path.splitext(file)[1] + + # create new folders + sip_dir = os.path.join(sips, file) + object_dir = os.path.join(sip_dir, 'objects') + diskimage_dir = os.path.join(object_dir, 'diskimage') + files_dir = os.path.join(object_dir, 'files') + metadata_dir = os.path.join(sip_dir, 'metadata') + subdoc_dir = os.path.join(metadata_dir, 'submissionDocumentation') + + for folder in sip_dir, object_dir, diskimage_dir, files_dir, metadata_dir, subdoc_dir: + os.makedirs(folder) + + # disk image status + raw_image = False + + # check if disk image is ewf + if image_ext == ".E01": + # convert disk image to raw and write to /objects/diskimage + raw_out = os.path.join(diskimage_dir, image_id) + try: + subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) + raw_image = True + os.rename(os.path.join(diskimage_dir, '%s.raw' % (image_id)), os.path.join(diskimage_dir, '%s.img' % image_id)) # change file extension from .raw to .img + os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, '%s.img.info' % image_id)) # rename sidecar md5 file + diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward + except subprocess.CalledProcessError: + logandprint('ERROR: Disk image could not be converted to raw image format. 
Skipping disk.') -# MAIN FLOW - -# parse arguments -parser = argparse.ArgumentParser() -parser.add_argument("-b", "--bagfiles", help="Bag files instead of writing checksum.md5", action="store_true") -parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") -parser.add_argument("-f", "--filesonly", help="Include digital files only (not disk images) in SIPs", action="store_true") -parser.add_argument("-p", "--piiscan", help="Run bulk_extractor in Brunnhilde scan", action="store_true") -parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") -parser.add_argument("source", help="Source directory containing disk images (and related files)") -parser.add_argument("destination", help="Output destination") -args = parser.parse_args() - -destination = args.destination - -# create output directories -if not os.path.exists(destination): - os.makedirs(destination) - -sips = os.path.join(destination, 'SIPs') -os.makedirs(sips) - -# open log file -log_file = os.path.join(destination, 'diskimageprocessor-log.txt') -try: - log = open(log_file, 'w') # open the log file - logandprint('Source of disk images: %s' % (args.source)) -except: - sys.exit('There was an error creating the log file.') - -# open description spreadsheet -try: - spreadsheet = open(os.path.join(destination,'description.csv'), 'w') - writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) - header_list = ['Parent ID', 'Identifier', 'Title', 'Archive Creator', 'Date expression', 'Date start', 'Date end', - 'Level of description', 'Extent and medium', 'Scope and content', 'Arrangement (optional)', 'Accession number', - 'Appraisal, destruction, and scheduling information (optional)', 'Name access points (optional)', - 'Geographic access points (optional)', 'Conditions governing access (optional)', 'Conditions governing reproduction (optional)', - 'Language of material 
(optional)', 'Physical characteristics & technical requirements affecting use (optional)', - 'Finding aids (optional)', 'Related units of description (optional)', 'Archival history (optional)', - 'Immediate source of acquisition or transfer (optional)', "Archivists' note (optional)", 'General note (optional)', - 'Description status'] - writer.writerow(header_list) -except: - logandprint('There was an error creating the processing spreadsheet.') - sys.exit() - -# make list for unprocessed disks -unprocessed = [] - -# iterate through files in source directory -for file in sorted(os.listdir(args.source)): - - # record filename in log - logandprint('>>> NEW FILE: %s' % (file)) - - # determine if disk image - if file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): - - # save info about file - image_path = os.path.join(args.source, file) - image_id = os.path.splitext(file)[0] - image_ext = os.path.splitext(file)[1] - - # create new folders - sip_dir = os.path.join(sips, file) - object_dir = os.path.join(sip_dir, 'objects') - diskimage_dir = os.path.join(object_dir, 'diskimage') - files_dir = os.path.join(object_dir, 'files') - metadata_dir = os.path.join(sip_dir, 'metadata') - subdoc_dir = os.path.join(metadata_dir, 'submissionDocumentation') - - for folder in sip_dir, object_dir, diskimage_dir, files_dir, metadata_dir, subdoc_dir: - os.makedirs(folder) - - # disk image status - raw_image = False - - # check if disk image is ewf - if image_ext == ".E01": - # convert disk image to raw and write to /objects/diskimage - raw_out = os.path.join(diskimage_dir, image_id) - try: - subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) + else: raw_image = True - os.rename(os.path.join(diskimage_dir, '%s.raw' % (image_id)), os.path.join(diskimage_dir, '%s.img' % image_id)) # change file extension from .raw to .img - os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, 
'%s.img.info' % image_id)) # rename sidecar md5 file - diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward - except subprocess.CalledProcessError: - logandprint('ERROR: Disk image could not be converted to raw image format. Skipping disk.') + for movefile in os.listdir(args.source): + # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) + if movefile.startswith(image_id): + # copy file to objects/diskimage + try: + shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimage_dir, movefile)) + except: + logandprint('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) + diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward + + # raw disk image + if raw_image == True: + + # run disktype on disk image, save output to submissionDocumentation + disktype = os.path.join(subdoc_dir, 'disktype.txt') + subprocess.call("disktype '%s' > '%s'" % (diskimage, disktype), shell=True) - else: - raw_image = True - for movefile in os.listdir(args.source): - # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) 
- if movefile.startswith(image_id): - # copy file to objects/diskimage - try: - shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimage_dir, movefile)) - except: - logandprint('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) - diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward - - # raw disk image - if raw_image == True: - - # run disktype on disk image, save output to submissionDocumentation - disktype = os.path.join(subdoc_dir, 'disktype.txt') - subprocess.call("disktype '%s' > '%s'" % (diskimage, disktype), shell=True) - - # pull filesystem info from disktype.txt - disk_fs = '' - try: - for line in open(disktype, 'r'): - if "file system" in line: - disk_fs = line.strip() - except: # handle non-Unicode chars - for line in open(disktype, 'rb'): - if "file system" in line.decode('utf-8','ignore'): - disk_fs = line.decode('utf-8','ignore').strip() - logandprint('File system: %s' % (disk_fs)) - - # handle differently by file system - if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): - # use fiwalk to make dfxml - fiwalk_file = os.path.join(subdoc_dir, 'dfxml.xml') - try: - subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) - except subprocess.CalledProcessError as e: - logandprint('ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) - - # carve images using tsk_recover - carve_flag = '-a' # default to exporting allocated files - if args.exportall == True: - carve_flag = '-e' + # pull filesystem info from disktype.txt + disk_fs = '' try: - subprocess.check_output(['tsk_recover', carve_flag, diskimage, files_dir]) - except subprocess.CalledProcessError as e: - logandprint('ERROR: tsk_recover could not carve allocated files from disk. 
STDERR: %s' % (e.output)) - - # modify file permissions - subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) - subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + for line in open(disktype, 'r'): + if "file system" in line: + disk_fs = line.strip() + except: # handle non-Unicode chars + for line in open(disktype, 'rb'): + if "file system" in line.decode('utf-8','ignore'): + disk_fs = line.decode('utf-8','ignore').strip() + logandprint('File system: %s' % (disk_fs)) - # rewrite last modified dates of files based on values in DFXML - for (event, obj) in Objects.iterparse(fiwalk_file): + # handle differently by file system + if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): + # use fiwalk to make dfxml + fiwalk_file = os.path.join(subdoc_dir, 'dfxml.xml') + try: + subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) + except subprocess.CalledProcessError as e: + logandprint('ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) - # only work on FileObjects - if not isinstance(obj, Objects.FileObject): - continue + # carve images using tsk_recover + carve_flag = '-a' # default to exporting allocated files + if args.exportall == True: + carve_flag = '-e' + try: + subprocess.check_output(['tsk_recover', carve_flag, diskimage, files_dir]) + except subprocess.CalledProcessError as e: + logandprint('ERROR: tsk_recover could not carve allocated files from disk. 
STDERR: %s' % (e.output)) + + # modify file permissions + subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) + subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) - # skip directories and links - if obj.name_type: - if obj.name_type != "r": + # rewrite last modified dates of files based on values in DFXML + for (event, obj) in Objects.iterparse(fiwalk_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): continue - # record filename - dfxml_filename = obj.filename - dfxml_filedate = int(time.time()) # default to current time + # skip directories and links + if obj.name_type: + if obj.name_type != "r": + continue - # record last modified or last created date - try: - mtime = obj.mtime - mtime = str(mtime) - except: - pass + # record filename + dfxml_filename = obj.filename + dfxml_filedate = int(time.time()) # default to current time + + # record last modified or last created date + try: + mtime = obj.mtime + mtime = str(mtime) + except: + pass + + try: + crtime = obj.crtime + crtime = str(crtime) + except: + pass + + # fallback to created date if last modified doesn't exist + if mtime and (mtime != 'None'): + mtime = time_to_int(mtime[:19]) + dfxml_filedate = mtime + elif crtime and (crtime != 'None'): + crtime = time_to_int(crtime[:19]) + dfxml_filedate = crtime + else: + continue - try: - crtime = obj.crtime - crtime = str(crtime) - except: - pass - - # fallback to created date if last modified doesn't exist - if mtime and (mtime != 'None'): - mtime = time_to_int(mtime[:19]) - dfxml_filedate = mtime - elif crtime and (crtime != 'None'): - crtime = time_to_int(crtime[:19]) - dfxml_filedate = crtime - else: - continue + # rewrite last modified date of corresponding file in objects/files + exported_filepath = os.path.join(files_dir, dfxml_filename) + if os.path.isfile(exported_filepath): + os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) + + # run 
brunnhilde and write to submissionDocumentation + files_abs = os.path.abspath(files_dir) + if args.piiscan == True: # brunnhilde with bulk_extractor + subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + else: # brunnhilde without bulk_extractor + subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - # rewrite last modified date of corresponding file in objects/files - exported_filepath = os.path.join(files_dir, dfxml_filename) - if os.path.isfile(exported_filepath): - os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) + # if user selected 'filesonly', remove disk image files and repackage + if args.filesonly == True: + keep_logical_files_only(object_dir) - # run brunnhilde and write to submissionDocumentation - files_abs = os.path.abspath(files_dir) - if args.piiscan == True: # brunnhilde with bulk_extractor - subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - else: # brunnhilde without bulk_extractor - subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + # write checksums + if args.bagfiles == True: # bag entire SIP + subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) + else: # write metadata/checksum.md5 + subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) - # if user selected 'filesonly', remove disk image files and repackage - if args.filesonly == True: - keep_logical_files_only(object_dir) - # write checksums - if args.bagfiles == True: # bag entire SIP - subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) - else: # write metadata/checksum.md5 - subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) + elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): + # mount disk image + subprocess.call("sudo 
mount -t hfs -o loop,ro,noexec '%s' /mnt/diskid/" % (diskimage), shell=True) + # use walk_to_dfxml.py to make dfxml + dfxml_file = os.path.abspath(os.path.join(subdoc_dir, 'dfxml.xml')) + try: + subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) + except: + logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) - elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): - # mount disk image - subprocess.call("sudo mount -t hfs -o loop,ro,noexec '%s' /mnt/diskid/" % (diskimage), shell=True) + # unmount disk image + subprocess.call('sudo umount /mnt/diskid', shell=True) - # use walk_to_dfxml.py to make dfxml - dfxml_file = os.path.abspath(os.path.join(subdoc_dir, 'dfxml.xml')) - try: - subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) - except: - logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + # carve files using hfsexplorer + if args.resforks == True: + try: + subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-resforks', 'APPLEDOUBLE', '-o', files_dir, diskimage]) + except subprocess.CalledProcessError as e: + logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + else: + try: + subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-o', files_dir, diskimage]) + except subprocess.CalledProcessError as e: + logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + + # modify file permissions + subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) + subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + + # run brunnhilde and write to reports directory + files_abs = os.path.abspath(files_dir) + if args.piiscan 
== True: # brunnhilde with bulk_extractor + subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + else: # brunnhilde without bulk_extractor + subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + + # if user selected 'filesonly', remove disk image files and repackage + if args.filesonly == True: + keep_logical_files_only(object_dir) + + # write checksums + if args.bagfiles == True: # bag entire SIP + subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) + else: # write metadata/checksum.md5 + subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) + - # unmount disk image - subprocess.call('sudo umount /mnt/diskid', shell=True) + elif 'udf' in disk_fs.lower(): + # mount image + subprocess.call("sudo mount -t udf -o loop '%s' /mnt/diskid/" % (diskimage), shell=True) - # carve files using hfsexplorer - if args.resforks == True: + # use walk_to_dfxml.py to create dfxml + dfxml_file = os.path.abspath(os.path.join(subdoc_dir, 'dfxml.xml')) try: - subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-resforks', 'APPLEDOUBLE', '-o', files_dir, diskimage]) - except subprocess.CalledProcessError as e: - logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) - else: + subprocess.call("cd /mnt/diskid/ && python3 /usr/share/dfxml/python/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) + except: + logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + + # copy files from disk image to files dir + shutil.rmtree(files_dir) # delete to enable use of copytree try: - subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-o', files_dir, diskimage]) - except subprocess.CalledProcessError as e: - logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) - - # 
modify file permissions - subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) - subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) - - # run brunnhilde and write to reports directory - files_abs = os.path.abspath(files_dir) - if args.piiscan == True: # brunnhilde with bulk_extractor - subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - else: # brunnhilde without bulk_extractor - subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - - # if user selected 'filesonly', remove disk image files and repackage - if args.filesonly == True: - keep_logical_files_only(object_dir) - - # write checksums - if args.bagfiles == True: # bag entire SIP - subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) - else: # write metadata/checksum.md5 - subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) - - - elif 'udf' in disk_fs.lower(): - # mount image - subprocess.call("sudo mount -t udf -o loop '%s' /mnt/diskid/" % (diskimage), shell=True) - - # use walk_to_dfxml.py to create dfxml - dfxml_file = os.path.abspath(os.path.join(subdoc_dir, 'dfxml.xml')) - try: - subprocess.call("cd /mnt/diskid/ && python3 /usr/share/dfxml/python/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) - except: - logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) - - # copy files from disk image to files dir - shutil.rmtree(files_dir) # delete to enable use of copytree - try: - shutil.copytree('/mnt/diskid/', files_dir, symlinks=False, ignore=None) - except: - logandprint("ERROR: shutil.copytree unable to copy files from disk %s" % (diskimage)) + shutil.copytree('/mnt/diskid/', files_dir, symlinks=False, ignore=None) + except: + logandprint("ERROR: shutil.copytree unable to copy files from disk %s" % (diskimage)) - # unmount disk 
image - subprocess.call('sudo umount /mnt/diskid', shell=True) # unmount + # unmount disk image + subprocess.call('sudo umount /mnt/diskid', shell=True) # unmount - # modify file permissions - subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) - subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) + # modify file permissions + subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) + subprocess.call("sudo find '%s' -type f -exec chmod 644 {} \;" % (sip_dir), shell=True) - # run brunnhilde and write to submissionDocumentation - files_abs = os.path.abspath(files_dir) - if args.piiscan == True: # brunnhilde with bulk_extractor - subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - else: # brunnhilde without bulk_extractor - subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) - - # if user selected 'filesonly', remove disk image files and repackage - if args.filesonly == True: - keep_logical_files_only(object_dir) + # run brunnhilde and write to submissionDocumentation + files_abs = os.path.abspath(files_dir) + if args.piiscan == True: # brunnhilde with bulk_extractor + subprocess.call("brunnhilde.py -zb '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + else: # brunnhilde without bulk_extractor + subprocess.call("brunnhilde.py -z '%s' '%s' '%s'" % (files_abs, subdoc_dir, 'brunnhilde'), shell=True) + + # if user selected 'filesonly', remove disk image files and repackage + if args.filesonly == True: + keep_logical_files_only(object_dir) + + # write checksums + if args.bagfiles == True: # bag entire SIP + subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) + else: # write metadata/checksum.md5 + subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) - # write checksums - if 
args.bagfiles == True: # bag entire SIP - subprocess.call("bagit.py --processes 4 '%s'" % (sip_dir), shell=True) - else: # write metadata/checksum.md5 - subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) + else: + logandprint('NOTICE: Skipping processing of unknown disk type.') + unprocessed.append(file) + # no raw disk image else: - logandprint('NOTICE: Skipping processing of unknown disk type.') + logandprint('NOTICE: No raw disk image. Skipping disk.') unprocessed.append(file) - # no raw disk image else: - logandprint('NOTICE: No raw disk image. Skipping disk.') - unprocessed.append(file) + # write skipped file to log + logandprint('NOTICE: File is not a disk image. Skipping file.') + # print unprocessed list + if unprocessed: + skipped_disks = ', '.join(unprocessed) + logandprint('Processing complete. Skipped disks: %s' % (skipped_disks)) else: - # write skipped file to log - logandprint('NOTICE: File is not a disk image. Skipping file.') - -# print unprocessed list -if unprocessed: - skipped_disks = ', '.join(unprocessed) - logandprint('Processing complete. Skipped disks: %s' % (skipped_disks)) -else: - logandprint('Processing complete. All disk images processed. Results in %s.' % (destination)) - -# write description spreadsheet -create_spreadsheet(args.filesonly, args.exportall) - -# close files -spreadsheet.close() -log.close() + logandprint('Processing complete. All disk images processed. Results in %s.' 
% (destination)) + + # write description spreadsheet + create_spreadsheet(args) + + # close log + log.close() + +if __name__ == '__main__': + main() \ No newline at end of file From 5ede2b1b21f60cd984f470ae36de7ed20be50c43 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:26:20 -0400 Subject: [PATCH 14/24] Update version --- README.md | 6 +++++- main.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 54fe770..401b317 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Disk Image Processor Analyze disk images and/or create ready-to-ingest SIPs from a directory of disk images and related files. -Version: 0.6.1 (beta) +Version: 0.7.0 (beta) ## Usage @@ -63,6 +63,10 @@ The "metadata/submissionDocumentation" directory in each SIP contains: * Text output from "disktype" * Brunnhilde reports (including logs and reports from clamAV and, optionally, bulk_extractor) +### Process a single disk image, providing options to tsk_recover (CLI only) + +Also included is a Python 3 script `process_with_tsk_options.py`. This script allows the user to create a SIP and corresponding description for a single disk image (and accompanying files) while specifying the file system type, image type, and sector offset as needed for `tsk_recover`. This script may be useful for certain disks for which tsk_recover is unable to extract files using its autodetection methods. 
+ ## Supported file systems * NTFS diff --git a/main.py b/main.py index 0eb874e..0d574ae 100644 --- a/main.py +++ b/main.py @@ -24,7 +24,7 @@ def __init__(self, parent=None): def about_dialog(self): QMessageBox.information(self, "About", - "Disk Image Processor v0.6.1\nTim Walsh, 2017\nMIT License\nhttps://github.com/timothyryanwalsh/cca-diskimageprocessor") + "Disk Image Processor v0.7.0\nTim Walsh, 2017\nMIT License\nhttps://github.com/timothyryanwalsh/cca-diskimageprocessor") @pyqtSlot() def readStdOutput(self): From 86305df41ea3cc15a4ccafb51ca6e6b6d69945d4 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:32:32 -0400 Subject: [PATCH 15/24] Pass log to logandprint with message --- diskimageprocessor.py | 44 +++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/diskimageprocessor.py b/diskimageprocessor.py index 8e0fe94..f51a9c5 100644 --- a/diskimageprocessor.py +++ b/diskimageprocessor.py @@ -39,7 +39,7 @@ #import Objects.py from python dfxml tools import Objects -def logandprint(message): +def logandprint(log, message): """ Print to log and terminal """ log.write('\n' + (time.strftime("%H:%M:%S %b %d, %Y - ", time.localtime())) + message) @@ -284,7 +284,7 @@ def create_spreadsheet(args): writer.writerow(['', item, '', '', date_statement, date_earliest, date_latest, 'File', extent, scopecontent, '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) - logandprint('Described %s successfully.' % (current)) + logandprint(log, 'Described %s successfully.' % (current)) # if error reading DFXML file, report that except: @@ -292,9 +292,9 @@ def create_spreadsheet(args): writer.writerow(['', item, '', '', 'Error', 'Error', 'Error', 'File', 'Error', 'Error reading DFXML file.', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']) - logandprint('ERROR: DFXML file for %s not well-formed.' % (current)) + logandprint(log, 'ERROR: DFXML file for %s not well-formed.' 
% (current)) - logandprint('Description CSV created.') + logandprint(log, 'Description CSV created.') def keep_logical_files_only(objects_dir): """ Remove disk image from SIP and repackage """ @@ -342,9 +342,9 @@ def main(): log_file = os.path.join(destination, 'diskimageprocessor-log.txt') try: log = open(log_file, 'w') # open the log file - logandprint('Source of disk images: %s' % (args.source)) + logandprint(log, 'Source of disk images: %s' % (args.source)) except: - logandprint('There was an error creating the log file.') + logandprint(log, 'There was an error creating the log file.') # make list for unprocessed disks unprocessed = [] @@ -353,7 +353,7 @@ def main(): for file in sorted(os.listdir(args.source)): # record filename in log - logandprint('>>> NEW FILE: %s' % (file)) + logandprint(log, '>>> NEW FILE: %s' % (file)) # determine if disk image if file.lower().endswith((".e01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): @@ -388,7 +388,7 @@ def main(): os.rename(os.path.join(diskimage_dir, '%s.raw.info' % (image_id)), os.path.join(diskimage_dir, '%s.img.info' % image_id)) # rename sidecar md5 file diskimage = os.path.join(diskimage_dir, '%s.img' % (image_id)) # use raw disk image in objects/diskimage moving forward except subprocess.CalledProcessError: - logandprint('ERROR: Disk image could not be converted to raw image format. Skipping disk.') + logandprint(log, 'ERROR: Disk image could not be converted to raw image format. 
Skipping disk.') else: raw_image = True @@ -399,7 +399,7 @@ def main(): try: shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimage_dir, movefile)) except: - logandprint('ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) + logandprint(log, 'ERROR: File %s not successfully copied to %s' % (movefile, diskimage_dir)) diskimage = os.path.join(diskimage_dir, file) # use disk image in objects/diskimage moving forward # raw disk image @@ -419,7 +419,7 @@ def main(): for line in open(disktype, 'rb'): if "file system" in line.decode('utf-8','ignore'): disk_fs = line.decode('utf-8','ignore').strip() - logandprint('File system: %s' % (disk_fs)) + logandprint(log, 'File system: %s' % (disk_fs)) # handle differently by file system if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): @@ -428,7 +428,7 @@ def main(): try: subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) except subprocess.CalledProcessError as e: - logandprint('ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) + logandprint(log, 'ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) # carve images using tsk_recover carve_flag = '-a' # default to exporting allocated files @@ -437,7 +437,7 @@ def main(): try: subprocess.check_output(['tsk_recover', carve_flag, diskimage, files_dir]) except subprocess.CalledProcessError as e: - logandprint('ERROR: tsk_recover could not carve allocated files from disk. STDERR: %s' % (e.output)) + logandprint(log, 'ERROR: tsk_recover could not carve allocated files from disk. 
STDERR: %s' % (e.output)) # modify file permissions subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) @@ -514,7 +514,7 @@ def main(): try: subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) except: - logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + logandprint(log, 'ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) # unmount disk image subprocess.call('sudo umount /mnt/diskid', shell=True) @@ -524,12 +524,12 @@ def main(): try: subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-resforks', 'APPLEDOUBLE', '-o', files_dir, diskimage]) except subprocess.CalledProcessError as e: - logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + logandprint(log, 'ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) else: try: subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-o', files_dir, diskimage]) except subprocess.CalledProcessError as e: - logandprint('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + logandprint(log, 'ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) # modify file permissions subprocess.call("sudo find '%s' -type d -exec chmod 755 {} \;" % (sip_dir), shell=True) @@ -562,14 +562,14 @@ def main(): try: subprocess.call("cd /mnt/diskid/ && python3 /usr/share/dfxml/python/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) except: - logandprint('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + logandprint(log, 'ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) # copy files from disk image to files dir shutil.rmtree(files_dir) # delete to enable use of copytree try: shutil.copytree('/mnt/diskid/', files_dir, 
symlinks=False, ignore=None) except: - logandprint("ERROR: shutil.copytree unable to copy files from disk %s" % (diskimage)) + logandprint(log, "ERROR: shutil.copytree unable to copy files from disk %s" % (diskimage)) # unmount disk image subprocess.call('sudo umount /mnt/diskid', shell=True) # unmount @@ -596,24 +596,24 @@ def main(): subprocess.call("cd '%s' && md5deep -rl ../objects > checksum.md5" % (metadata_dir), shell=True) else: - logandprint('NOTICE: Skipping processing of unknown disk type.') + logandprint(log, 'NOTICE: Skipping processing of unknown disk type.') unprocessed.append(file) # no raw disk image else: - logandprint('NOTICE: No raw disk image. Skipping disk.') + logandprint(log, 'NOTICE: No raw disk image. Skipping disk.') unprocessed.append(file) else: # write skipped file to log - logandprint('NOTICE: File is not a disk image. Skipping file.') + logandprint(log, 'NOTICE: File is not a disk image. Skipping file.') # print unprocessed list if unprocessed: skipped_disks = ', '.join(unprocessed) - logandprint('Processing complete. Skipped disks: %s' % (skipped_disks)) + logandprint(log, 'Processing complete. Skipped disks: %s' % (skipped_disks)) else: - logandprint('Processing complete. All disk images processed. Results in %s.' % (destination)) + logandprint(log, 'Processing complete. All disk images processed. Results in %s.' 
% (destination)) # write description spreadsheet create_spreadsheet(args) From e13c50d29a34e30417b38c19f0a148de5dbc2c50 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:39:09 -0400 Subject: [PATCH 16/24] Pass sip_dir to spreadsheet function --- diskimageprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diskimageprocessor.py b/diskimageprocessor.py index f51a9c5..8a54092 100644 --- a/diskimageprocessor.py +++ b/diskimageprocessor.py @@ -63,7 +63,7 @@ def time_to_int(str_time): "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(args): +def create_spreadsheet(args, sips): """ Create csv describing created SIPs """ # open description spreadsheet @@ -616,7 +616,7 @@ def main(): logandprint(log, 'Processing complete. All disk images processed. Results in %s.' % (destination)) # write description spreadsheet - create_spreadsheet(args) + create_spreadsheet(args, sip_dir) # close log log.close() From 1aa5756edb20d8ae533b48d7d074fc1628d34921 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:42:33 -0400 Subject: [PATCH 17/24] Refactor --- diskimageanalyzer.py | 459 ++++++++++++++++++++++--------------------- 1 file changed, 231 insertions(+), 228 deletions(-) diff --git a/diskimageanalyzer.py b/diskimageanalyzer.py index 6ba0d9b..dbd7369 100644 --- a/diskimageanalyzer.py +++ b/diskimageanalyzer.py @@ -26,7 +26,7 @@ import Objects def convert_size(size): - """convert size to human-readable form""" + """ Convert size to human-readable form """ if (size == 0): return '0 bytes' size_name = ("bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") @@ -38,11 +38,12 @@ def convert_size(size): return '%s %s' % (s,size_name[i]) def time_to_int(str_time): + """ Convert datetime to unix integer value """ dt = time.mktime(datetime.datetime.strptime(str_time, "%Y-%m-%dT%H:%M:%S").timetuple()) return dt def write_to_spreadsheet(disk_result, spreadsheet_path, exportall): - """append info for current disk to 
analysis CSV""" + """ Append info for current disk to analysis CSV """ # open description spreadsheet spreadsheet = open(spreadsheet_path, 'a') @@ -228,258 +229,260 @@ def write_to_spreadsheet(disk_result, spreadsheet_path, exportall): spreadsheet.close() -# MAIN FLOW - -# parse arguments -parser = argparse.ArgumentParser() -parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") -parser.add_argument("-k", "--keepfiles", help="Retain exported logical files from each disk", action="store_true") -parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") -parser.add_argument("source", help="Path to folder containing disk images") -parser.add_argument("destination", help="Output destination") -args = parser.parse_args() - -source = args.source -destination = args.destination - -# make outdir disks -if not os.path.exists(destination): - os.makedirs(destination) -diskimages_dir = os.path.join(destination, 'diskimages') -files_dir = os.path.join(destination, 'files') -results_dir = os.path.join(destination, 'reports') -for new_dir in diskimages_dir, files_dir, results_dir: - os.makedirs(new_dir) - -# make list for unanalyzed disks -unanalyzed = [] - -# process each disk image -for file in sorted(os.listdir(source)): - - # determine if disk image - if file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): - - # save info about file - image_path = os.path.join(source, file) - image_id = os.path.splitext(file)[0] - image_ext = os.path.splitext(file)[1] - - # create new folders - disk_dir = os.path.join(results_dir, file) - os.makedirs(disk_dir) - - # disk image status - raw_image = False - - # check if disk image is ewf - if image_ext == ".E01": - # convert disk image to raw and write to diskimages_dir - raw_out = os.path.join(diskimages_dir, image_id) - try: - subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', 
'-o', '0', '-S', '0', '-u', image_path]) - raw_image = True - os.rename(os.path.join(diskimages_dir, '%s.raw' % image_id), os.path.join(diskimages_dir, '%s.img' % image_id)) # change file extension from .raw to .img - os.rename(os.path.join(diskimages_dir, '%s.raw.info' % image_id), os.path.join(diskimages_dir, '%s.img.info' % image_id)) # rename sidecar md5 file - diskimage = os.path.join(diskimages_dir, '%s.img' % image_id) # use raw disk image in diskimages_dir moving forward - except subprocess.CalledProcessError: - print('ERROR: Disk image could not be converted to raw image format. Skipping disk.') - - else: - raw_image = True - for movefile in os.listdir(args.source): - # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) - if movefile.startswith(image_id): - # copy file to objects/diskimage - shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimages_dir, movefile)) - diskimage = os.path.join(diskimages_dir, file) # use disk image in diskimages_dir moving forward - - # raw disk image - if raw_image == True: - - # run disktype on disk image, save output to disk_dir - disktype = os.path.join(disk_dir, 'disktype.txt') - subprocess.call("disktype '%s' > '%s'" % (diskimage, disktype), shell=True) +def _make_parser(): - # pull filesystem info from disktype.txt - disk_fs = '' - try: - for line in open(disktype, 'r'): - if "file system" in line: - disk_fs = line.strip() - except: # handle non-Unicode chars - for line in open(disktype, 'rb'): - if "file system" in line.decode('utf-8','ignore'): - disk_fs = line.decode('utf-8','ignore').strip() - - # handle differently by file system - if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): - # use fiwalk to make dfxml - fiwalk_file = os.path.abspath(os.path.join(disk_dir, 'dfxml.xml')) - try: - subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) - except 
subprocess.CalledProcessError as e: - print('ERROR: Fiwalk could not create DFXML for disk. STDERR: %s' % (e.output)) - - # carve files - disk_files_dir = os.path.join(files_dir, file) - if not os.path.exists(disk_files_dir): - os.makedirs(disk_files_dir) - # carve allocated or all files depending on option selected - if args.exportall == True: - try: - subprocess.check_output(['tsk_recover', '-e', diskimage, disk_files_dir]) - except subprocess.CalledProcessError as e: - print('ERROR: tsk_recover could not carve all files from disk. STDERR: %s' % (e.output)) - else: - try: - subprocess.check_output(['tsk_recover', '-a', diskimage, disk_files_dir]) - except subprocess.CalledProcessError as e: - print('ERROR: tsk_recover could not carve allocated files from disk. STDERR: %s' % (e.output)) + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exportall", help="Export all (not only allocated) with tsk_recover", action="store_true") + parser.add_argument("-k", "--keepfiles", help="Retain exported logical files from each disk", action="store_true") + parser.add_argument("-r", "--resforks", help="Export AppleDouble resource forks from HFS-formatted disks", action="store_true") + parser.add_argument("source", help="Path to folder containing disk images") + parser.add_argument("destination", help="Output destination") - # rewrite last modified dates of carved files based on values in DFXML - for (event, obj) in Objects.iterparse(fiwalk_file): - - # only work on FileObjects - if not isinstance(obj, Objects.FileObject): - continue +def main(): - # skip directories and links - if obj.name_type: - if obj.name_type != "r": - continue + parser = _make_parser() + args = parser.parse_args() - # record filename - dfxml_filename = obj.filename - dfxml_filedate = int(time.time()) # default to current time + source = os.path.abspath(args.source) + destination = os.path.abspath(args.destination) - # record last modified or last created date - try: - mtime = obj.mtime - 
mtime = str(mtime) - except: - pass - - try: - crtime = obj.crtime - crtime = str(crtime) - except: - pass - - # fallback to created date if last modified doesn't exist - if mtime and (mtime != 'None'): - mtime = time_to_int(mtime[:19]) - dfxml_filedate = mtime - elif crtime and (crtime != 'None'): - crtime = time_to_int(crtime[:19]) - dfxml_filedate = crtime - else: - continue + # make outdir disks + if not os.path.exists(destination): + os.makedirs(destination) + diskimages_dir = os.path.join(destination, 'diskimages') + files_dir = os.path.join(destination, 'files') + results_dir = os.path.join(destination, 'reports') + for new_dir in diskimages_dir, files_dir, results_dir: + os.makedirs(new_dir) + + # make list for unanalyzed disks + unanalyzed = [] - # rewrite last modified date of corresponding file in objects/files - exported_filepath = os.path.join(disk_files_dir, dfxml_filename) - if os.path.isfile(exported_filepath): - os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) + # process each disk image + for file in sorted(os.listdir(source)): + + # determine if disk image + if file.lower().endswith((".e01", ".000", ".001", ".raw", ".img", ".dd", ".iso")): - # run brunnhilde - subprocess.call("brunnhilde.py -zwb '%s' '%s' brunnhilde" % (disk_files_dir, disk_dir), shell=True) + # save info about file + image_path = os.path.join(source, file) + image_id = os.path.splitext(file)[0] + image_ext = os.path.splitext(file)[1] - # remove disk_files_dir unless keepfiles option selected - if args.keepfiles == False: - shutil.rmtree(disk_files_dir) + # create new folders + disk_dir = os.path.join(results_dir, file) + os.makedirs(disk_dir) - elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): - # mount disk image - subprocess.call("sudo mount -t hfs -o loop,ro,noexec '%s' /mnt/diskid/" % (diskimage), shell=True) + # disk image status + raw_image = False - # use walk_to_dfxml.py to make dfxml - dfxml_file = os.path.abspath(os.path.join(disk_dir, 
'dfxml.xml')) + # check if disk image is ewf + if image_ext == ".E01": + # convert disk image to raw and write to diskimages_dir + raw_out = os.path.join(diskimages_dir, image_id) try: - subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) - except: - print('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + subprocess.check_output(['ewfexport', '-t', raw_out, '-f', 'raw', '-o', '0', '-S', '0', '-u', image_path]) + raw_image = True + os.rename(os.path.join(diskimages_dir, '%s.raw' % image_id), os.path.join(diskimages_dir, '%s.img' % image_id)) # change file extension from .raw to .img + os.rename(os.path.join(diskimages_dir, '%s.raw.info' % image_id), os.path.join(diskimages_dir, '%s.img.info' % image_id)) # rename sidecar md5 file + diskimage = os.path.join(diskimages_dir, '%s.img' % image_id) # use raw disk image in diskimages_dir moving forward + except subprocess.CalledProcessError: + print('ERROR: Disk image could not be converted to raw image format. Skipping disk.') + + else: + raw_image = True + for movefile in os.listdir(args.source): + # if filename starts with disk image basename (this will also capture info and log files, multi-part disk images, etc.) 
+ if movefile.startswith(image_id): + # copy file to objects/diskimage + shutil.copyfile(os.path.join(args.source, movefile), os.path.join(diskimages_dir, movefile)) + diskimage = os.path.join(diskimages_dir, file) # use disk image in diskimages_dir moving forward + + # raw disk image + if raw_image == True: - # run brunnhilde - subprocess.call("brunnhilde.py -zwb /mnt/diskid/ '%s' brunnhilde" % (disk_dir), shell=True) + # run disktype on disk image, save output to disk_dir + disktype = os.path.join(disk_dir, 'disktype.txt') + subprocess.call("disktype '%s' > '%s'" % (diskimage, disktype), shell=True) - # unmount disk image - subprocess.call('sudo umount /mnt/diskid', shell=True) + # pull filesystem info from disktype.txt + disk_fs = '' + try: + for line in open(disktype, 'r'): + if "file system" in line: + disk_fs = line.strip() + except: # handle non-Unicode chars + for line in open(disktype, 'rb'): + if "file system" in line.decode('utf-8','ignore'): + disk_fs = line.decode('utf-8','ignore').strip() + + # handle differently by file system + if any(x in disk_fs.lower() for x in ('ntfs', 'fat', 'ext', 'iso9660', 'hfs+', 'ufs', 'raw', 'swap', 'yaffs2')): + # use fiwalk to make dfxml + fiwalk_file = os.path.abspath(os.path.join(disk_dir, 'dfxml.xml')) + try: + subprocess.check_output(['fiwalk', '-X', fiwalk_file, diskimage]) + except subprocess.CalledProcessError as e: + print('ERROR: Fiwalk could not create DFXML for disk. 
STDERR: %s' % (e.output)) - # export files to disk_files_dir if keepfiles selected - if args.keepfiles == True: + # carve files disk_files_dir = os.path.join(files_dir, file) if not os.path.exists(disk_files_dir): os.makedirs(disk_files_dir) - # carve with or without resource forks depending on option selected - if args.resforks == True: + # carve allocated or all files depending on option selected + if args.exportall == True: try: - subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-resforks', 'APPLEDOUBLE', '-o', disk_files_dir, diskimage]) + subprocess.check_output(['tsk_recover', '-e', diskimage, disk_files_dir]) except subprocess.CalledProcessError as e: - print('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + print('ERROR: tsk_recover could not carve all files from disk. STDERR: %s' % (e.output)) else: try: - subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-o', disk_files_dir, diskimage]) + subprocess.check_output(['tsk_recover', '-a', diskimage, disk_files_dir]) except subprocess.CalledProcessError as e: - print('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + print('ERROR: tsk_recover could not carve allocated files from disk. 
STDERR: %s' % (e.output)) + # rewrite last modified dates of carved files based on values in DFXML + for (event, obj) in Objects.iterparse(fiwalk_file): + + # only work on FileObjects + if not isinstance(obj, Objects.FileObject): + continue - elif 'udf' in disk_fs.lower(): - # mount image - subprocess.call("sudo mount -t udf -o loop '%s' /mnt/diskid/" % (diskimage), shell=True) + # skip directories and links + if obj.name_type: + if obj.name_type != "r": + continue - # use walk_to_dfxml.py to create dfxml - dfxml_file = os.path.abspath(os.path.join(disk_dir, 'dfxml.xml')) - try: - subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) - except: - print('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) - - # write files to tempdir - disk_files_dir = os.path.join(files_dir, file) - shutil.copytree('/mnt/diskid/', disk_files_dir, symlinks=False, ignore=None) + # record filename + dfxml_filename = obj.filename + dfxml_filedate = int(time.time()) # default to current time + + # record last modified or last created date + try: + mtime = obj.mtime + mtime = str(mtime) + except: + pass + + try: + crtime = obj.crtime + crtime = str(crtime) + except: + pass + + # fallback to created date if last modified doesn't exist + if mtime and (mtime != 'None'): + mtime = time_to_int(mtime[:19]) + dfxml_filedate = mtime + elif crtime and (crtime != 'None'): + crtime = time_to_int(crtime[:19]) + dfxml_filedate = crtime + else: + continue + + # rewrite last modified date of corresponding file in objects/files + exported_filepath = os.path.join(disk_files_dir, dfxml_filename) + if os.path.isfile(exported_filepath): + os.utime(exported_filepath, (dfxml_filedate, dfxml_filedate)) + + # run brunnhilde + subprocess.call("brunnhilde.py -zwb '%s' '%s' brunnhilde" % (disk_files_dir, disk_dir), shell=True) + + # remove disk_files_dir unless keepfiles option selected + if args.keepfiles == 
False: + shutil.rmtree(disk_files_dir) + + elif ('hfs' in disk_fs.lower()) and ('hfs+' not in disk_fs.lower()): + # mount disk image + subprocess.call("sudo mount -t hfs -o loop,ro,noexec '%s' /mnt/diskid/" % (diskimage), shell=True) + + # use walk_to_dfxml.py to make dfxml + dfxml_file = os.path.abspath(os.path.join(disk_dir, 'dfxml.xml')) + try: + subprocess.call("cd /mnt/diskid/ && python3 /usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) + except: + print('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + + # run brunnhilde + subprocess.call("brunnhilde.py -zwb /mnt/diskid/ '%s' brunnhilde" % (disk_dir), shell=True) + + # unmount disk image + subprocess.call('sudo umount /mnt/diskid', shell=True) + + # export files to disk_files_dir if keepfiles selected + if args.keepfiles == True: + disk_files_dir = os.path.join(files_dir, file) + if not os.path.exists(disk_files_dir): + os.makedirs(disk_files_dir) + # carve with or without resource forks depending on option selected + if args.resforks == True: + try: + subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-resforks', 'APPLEDOUBLE', '-o', disk_files_dir, diskimage]) + except subprocess.CalledProcessError as e: + print('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + else: + try: + subprocess.check_output(['bash', '/usr/share/hfsexplorer/bin/unhfs', '-v', '-o', disk_files_dir, diskimage]) + except subprocess.CalledProcessError as e: + print('ERROR: HFS Explorer could not carve the following files from image: %s' % (e.output)) + + + elif 'udf' in disk_fs.lower(): + # mount image + subprocess.call("sudo mount -t udf -o loop '%s' /mnt/diskid/" % (diskimage), shell=True) + + # use walk_to_dfxml.py to create dfxml + dfxml_file = os.path.abspath(os.path.join(disk_dir, 'dfxml.xml')) + try: + subprocess.call("cd /mnt/diskid/ && python3 
/usr/share/ccatools/diskimageprocessor/walk_to_dfxml.py > '%s'" % (dfxml_file), shell=True) + except: + print('ERROR: walk_to_dfxml.py unable to generate DFXML for disk %s' % (diskimage)) + + # write files to tempdir + disk_files_dir = os.path.join(files_dir, file) + shutil.copytree('/mnt/diskid/', disk_files_dir, symlinks=False, ignore=None) - # change file permissions in disk_files_dir - subprocess.call("find '%s' -type d -exec chmod 755 {} \;" % (disk_files_dir), shell=True) - subprocess.call("find '%s' -type f -exec chmod 644 {} \;" % (disk_files_dir), shell=True) + # change file permissions in disk_files_dir + subprocess.call("find '%s' -type d -exec chmod 755 {} \;" % (disk_files_dir), shell=True) + subprocess.call("find '%s' -type f -exec chmod 644 {} \;" % (disk_files_dir), shell=True) - # unmount disk image - subprocess.call('sudo umount /mnt/diskid', shell=True) + # unmount disk image + subprocess.call('sudo umount /mnt/diskid', shell=True) - # run brunnhilde - subprocess.call("brunnhilde.py -zwb '%s' '%s' brunnhilde" % (disk_files_dir, disk_dir), shell=True) + # run brunnhilde + subprocess.call("brunnhilde.py -zwb '%s' '%s' brunnhilde" % (disk_files_dir, disk_dir), shell=True) + + # remove disk_files_dir unless keepfiles option selected + if args.keepfiles == False: + shutil.rmtree(disk_files_dir) - # remove disk_files_dir unless keepfiles option selected - if args.keepfiles == False: - shutil.rmtree(disk_files_dir) - - else: - # add disk to unanalyzed list - unanalyzed.append(diskimage) - -# delete temp directories -shutil.rmtree(diskimages_dir) -if args.keepfiles == False: - shutil.rmtree(files_dir) - -# create analysis spreadsheet -spreadsheet_path = os.path.join(destination, 'analysis.csv') -# open description spreadsheet -spreadsheet = open(spreadsheet_path, 'w') -writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) -header_list = ['Disk image', 'File system', 'Date type', 'Date statement', 'Date begin', 'Date end', 'Extent', 'Virus 
found', 'File formats'] -writer.writerow(header_list) - -# close description spreadsheet -spreadsheet.close() - -# add info to description spreadsheet -for item in sorted(os.listdir(results_dir)): - disk_result = os.path.join(results_dir, item) - write_to_spreadsheet(disk_result, spreadsheet_path, args.exportall) - -# write closing message -if unanalyzed: - skipped_disks = ', '.join(unanalyzed) - print('Analysis complete. Skipped disks: %s.' % (skipped_disks)) -else: - print('Analysis complete. All disk images analyzed. Results in %s.' % (destination)) + else: + # add disk to unanalyzed list + unanalyzed.append(diskimage) + + # delete temp directories + shutil.rmtree(diskimages_dir) + if args.keepfiles == False: + shutil.rmtree(files_dir) + + # create analysis csv, write header, and close file + spreadsheet = open(os.path.join(destination, 'analysis.csv'), 'w') + writer = csv.writer(spreadsheet, quoting=csv.QUOTE_NONNUMERIC) + header_list = ['Disk image', 'File system', 'Date type', 'Date statement', 'Date begin', 'Date end', 'Extent', 'Virus found', 'File formats'] + writer.writerow(header_list) + spreadsheet.close() + + # add info to analysis csv for each SIP + for item in sorted(os.listdir(results_dir)): + disk_result = os.path.join(results_dir, item) + write_to_spreadsheet(disk_result, spreadsheet_path, args.exportall) + + # write closing message + if unanalyzed: + skipped_disks = ', '.join(unanalyzed) + print('Analysis complete. Skipped disks: %s.' % (skipped_disks)) + else: + print('Analysis complete. All disk images analyzed. Results in %s.' 
% (destination)) + +if __name__ == '__main__': + main() \ No newline at end of file From 8416a637fa3ac35c699abfafd0befcfaa7023a55 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:47:17 -0400 Subject: [PATCH 18/24] Pass log path to csv function --- diskimageprocessor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/diskimageprocessor.py b/diskimageprocessor.py index 8a54092..4a86a4a 100644 --- a/diskimageprocessor.py +++ b/diskimageprocessor.py @@ -63,7 +63,7 @@ def time_to_int(str_time): "%Y-%m-%dT%H:%M:%S").timetuple()) return dt -def create_spreadsheet(args, sips): +def create_spreadsheet(args, sips, log): """ Create csv describing created SIPs """ # open description spreadsheet @@ -616,7 +616,7 @@ def main(): logandprint(log, 'Processing complete. All disk images processed. Results in %s.' % (destination)) # write description spreadsheet - create_spreadsheet(args, sip_dir) + create_spreadsheet(args, sip_dir, log) # close log log.close() From 020d237f4ff7cb86dae44d43ecb80446394fece3 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:52:14 -0400 Subject: [PATCH 19/24] Pass sips path to csv function, not sip_dir --- diskimageprocessor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskimageprocessor.py b/diskimageprocessor.py index 4a86a4a..60d7619 100644 --- a/diskimageprocessor.py +++ b/diskimageprocessor.py @@ -616,7 +616,7 @@ def main(): logandprint(log, 'Processing complete. All disk images processed. Results in %s.' 
% (destination)) # write description spreadsheet - create_spreadsheet(args, sip_dir, log) + create_spreadsheet(args, sips, log) # close log log.close() From f3457af42c37f542524849693537eb0f58a24c8e Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:53:52 -0400 Subject: [PATCH 20/24] Return parser from _make_parser() --- diskimageanalyzer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/diskimageanalyzer.py b/diskimageanalyzer.py index dbd7369..8e2dac4 100644 --- a/diskimageanalyzer.py +++ b/diskimageanalyzer.py @@ -238,6 +238,8 @@ def _make_parser(): parser.add_argument("source", help="Path to folder containing disk images") parser.add_argument("destination", help="Output destination") + return parser + def main(): parser = _make_parser() From 6b0239e76af120b2c3417f03eafacc247904a704 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 17:59:13 -0400 Subject: [PATCH 21/24] Update README.md --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 401b317..c6ea727 100644 --- a/README.md +++ b/README.md @@ -90,9 +90,9 @@ Also included is a Python 3 script `process_with_tsk_options.py`. This script al ## Disk image extensions recognized -Disk Image Processor recognizes which files are disk images by their file extensions. Currently, it looks for the following extensions: +Disk Image Processor recognizes which files are disk images by their file extensions. 
Currently, it looks for the following extensions (case-insensitive): -* .E01 +* .e01 * .000 * .001 * .raw @@ -100,8 +100,6 @@ Disk Image Processor recognizes which files are disk images by their file extens * .dd * .iso -*To add extensions to this list, add them as elements in the tuple inside `file.endswith((".E01", ".000", ".001", ".raw", ".img", ".dd", ".iso"))` on line 353 of `diskimageprocessor.py` and/or line 261 of `diskimageanalyzer.py`.* - ## Installation and dependencies This utility is designed for easy use in BitCurator v1.8.0+. It requires Python 2.7 (to run the GUI) and Python 3.4+ (to run the scripts that analyze and process disk images), both of which are already included in BitCurator. From a37ffb1302d30a7be242cca8d601e1bb96638f32 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Tue, 17 Oct 2017 18:03:08 -0400 Subject: [PATCH 22/24] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c6ea727..f618f78 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,8 @@ Also included is a Python 3 script `process_with_tsk_options.py`. This script al * SWAP * YAFFS2 +For disks with exFAT file systems, you may need to use the `process_with_tsk_options.py` script and explicitly specify the file system type. This is due to disktype's inability to recognize exFAT file systems. + ## Supported disk image types * raw (dd, iso, img, etc.) 
From 06771941d545c51d4156b8c5d1f48ba5bc08d8ac Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Wed, 18 Oct 2017 10:19:47 -0400 Subject: [PATCH 23/24] Modify spreadsheet path to write_to_spreadsheet() --- diskimageanalyzer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskimageanalyzer.py b/diskimageanalyzer.py index 8e2dac4..9b80e25 100644 --- a/diskimageanalyzer.py +++ b/diskimageanalyzer.py @@ -477,7 +477,7 @@ def main(): # add info to analysis csv for each SIP for item in sorted(os.listdir(results_dir)): disk_result = os.path.join(results_dir, item) - write_to_spreadsheet(disk_result, spreadsheet_path, args.exportall) + write_to_spreadsheet(disk_result, os.path.join(destination, 'analysis.csv'), args.exportall) # write closing message if unanalyzed: From e52900c3492e0a96cb9b63e6127e6b38347488b0 Mon Sep 17 00:00:00 2001 From: Tim Walsh Date: Fri, 20 Oct 2017 15:55:47 -0400 Subject: [PATCH 24/24] Update dfxml --- deps/dfxml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deps/dfxml b/deps/dfxml index e75ef19..a95919f 160000 --- a/deps/dfxml +++ b/deps/dfxml @@ -1 +1 @@ -Subproject commit e75ef197d387ca165d6fc6676273b4ce534ba0f6 +Subproject commit a95919fa67f97f92077339898e6b2de31f24b974