diff --git a/scripts/input_data_prep/merge_omni_data.py b/scripts/input_data_prep/merge_omni_data.py index 2090ae1..6221b15 100644 --- a/scripts/input_data_prep/merge_omni_data.py +++ b/scripts/input_data_prep/merge_omni_data.py @@ -8,6 +8,7 @@ from pyfiglet import Figlet from termcolor import colored +from tqdm import tqdm import pandas as pd import os @@ -32,37 +33,43 @@ def merge_omni(): parser = argparse.ArgumentParser(description='Merging Omniweb Data', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('--input_dir', type=str, default='../../data/omniweb_data', help='Input directory of Omniweb data') - parser.add_argument('--output_dir', type=str, default='../../data/merged_datasets', help='Output directory of processed data') opt = parser.parse_args() - #create output directory if it does not exist - os.makedirs(opt.output_dir, exist_ok=True) - filenames=os.listdir(opt.input_dir) filenames_magnetic_field=[os.path.join(opt.input_dir,f) for f in filenames if f.endswith('.csv') and f.startswith('magnetic_field')] filenames_solar_wind=[os.path.join(opt.input_dir,f) for f in filenames if f.endswith('.csv') and f.startswith('solar_wind')] filenames_indices=[os.path.join(opt.input_dir,f) for f in filenames if f.endswith('.csv') and f.startswith('indices')] + #first check if it exists: + if os.path.exists(os.path.join(opt.input_dir,'merged_omni_magnetic_field.csv')): + print('merged_omni_magnetic_field.csv already exists, skipping') + else: + final_df_magnetic_field=create_df(filenames_magnetic_field) + final_df_magnetic_field.sort_values('all__dates_datetime__',inplace=True) + file_path=os.path.join(opt.input_dir,'merged_omni_magnetic_field.csv') + final_df_magnetic_field.to_csv(file_path,index=False) + print(f' OMNI magnetic field merged dataframe created at: {file_path}') + del final_df_magnetic_field + #first check if it exists: + if os.path.exists(os.path.join(opt.input_dir,'merged_omni_solar_wind.csv')): + 
print('merged_omni_solar_wind.csv already exists, skipping') + else: + final_df_solar_wind=create_df(filenames_solar_wind) + final_df_solar_wind.sort_values('all__dates_datetime__',inplace=True) + file_path=os.path.join(opt.input_dir,'merged_omni_solar_wind.csv') + final_df_solar_wind.to_csv(file_path,index=False) + print(f' OMNI Solar Wind merged dataframe created at: {file_path}') + del final_df_solar_wind - final_df_magnetic_field=create_df(filenames_magnetic_field) - final_df_magnetic_field.sort_values('all__dates_datetime__',inplace=True) - file_path=os.path.join(opt.input_dir,'merged_omni_magnetic_field.csv') - final_df_magnetic_field.to_csv(file_path,index=False) - print(f' OMNI magnetic field merged dataframe created at: {file_path}') - del final_df_magnetic_field - - final_df_solar_wind=create_df(filenames_solar_wind) - final_df_solar_wind.sort_values('all__dates_datetime__',inplace=True) - file_path=os.path.join(opt.input_dir,'merged_omni_solar_wind.csv') - final_df_solar_wind.to_csv(file_path,index=False) - print(f' OMNI Solar Wind merged dataframe created at: {file_path}') - del final_df_solar_wind - + #first check if it exists: + if os.path.exists(os.path.join(opt.input_dir,'merged_omni_indices.csv')): + print('merged_omni_indices.csv already exists, skipping') + else: - final_df_indices=create_df(filenames_indices) - final_df_indices.sort_values('all__dates_datetime__',inplace=True) - file_path=opt.path.join(opt.output_dir,'merged_omni_indices.csv') - final_df_indices.to_csv(file_path,index=False) + final_df_indices=create_df(filenames_indices) + final_df_indices.sort_values('all__dates_datetime__',inplace=True) + file_path=os.path.join(opt.input_dir,'merged_omni_indices.csv') + final_df_indices.to_csv(file_path,index=False) print(f' OMNI Indices merged dataframe created at: {file_path}') if __name__ == "__main__":