diff --git a/notebooks/CSO_events.ipynb b/notebooks/CSO_events.ipynb new file mode 100644 index 000000000..c329274ab --- /dev/null +++ b/notebooks/CSO_events.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subdivision of CSO observed data into events" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import plotly.graph_objects as go" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Import data\n", + "os.chdir('C:\\\\Users\\\\rpal\\\\Source\\\\modelskill\\\\tmp\\\\RPAL\\\\data\\\\obs_and_model_data_Rocco')\n", + "\n", + "CSO = pd.read_csv('CSO.csv', sep=',', header=0, index_col=0, parse_dates=True)\n", + "\n", + "# Remove all the rows where the observed or modelled value is missing\n", + "CSO = CSO[CSO['filtered'].notna() & CSO['model'].notna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create column for detection of events" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Define variable used to detect events\n", + "CSO['event_signal'] = np.max(CSO[['model', 'filtered']], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Identify events start and end based on defined threshold value" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Set threshold for event detection\n", + "det_thr = 0.001\n", + "\n", + "# Create empty DataFrame for storing events\n", + "events = pd.DataFrame(columns=['start','end'], index=pd.Index([])) \n", + "\n", + "# Find event starts = where obs goes from <= det_thr to > det_thr\n", + "start_idx = CSO['event_signal'].shift(1).le(det_thr) & CSO['event_signal'].gt(det_thr)\n", + "start_event = CSO.index[start_idx]\n", + "events['start'] = start_event\n", + "\n", + "# Find event ends = where obs goes from > det_thr to <= det_thr\n", + "end_idx = CSO['event_signal'].gt(det_thr) & CSO['event_signal'].shift(-1).le(det_thr)\n", + "end_event = CSO.index[end_idx]\n", + "events['end'] = end_event" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Aggregate events that are separated by gaps shorter than given value" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Set min gap between events\n", + "min_gap = '1 hour'\n", + "\n", + "# Calculate gap between events \n", + "events['diff'] = events['start'] - events['end'].shift(1)\n", + "\n", + "# Identify events based on min_gap\n", + "#events['check'] = (events['diff'] > min_gap)\n", + "events['ID'] = (events['diff'] > min_gap).cumsum( ) + 1\n", + "# events['fix'] = events.ID +1\n", + "\n", + "# Aggregate events\n", + "events = events.groupby('ID').agg({'start':'first', 'end':'last'})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assign event index to original series" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "CSO['event'] = 0\n", + "for e in events.index:\n", + " CSO.loc[events['start'][e]:events['end'][e],'event'] = e\n", + "\n", + "# remove columns event_signal from CSO\n", + "CSO = CSO.drop(columns=['event_signal'])\n", + "\n", + "CSO.to_csv('CSO_events.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute event signatures" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Event duration\n", + "events['duration'] = events['end'] - events['start']\n", + "events.duration\n", + "\n", + "# Peak observed value\n", + "events['obs_peak'] = CSO.groupby('event')['obs'].max()\n", + "\n", + "# Peak modelled value\n", + "events['mod_peak'] = CSO.groupby('event')['model'].max()\n", + "\n", + "# Index of peak observed value\n", + "events['obs_peak_idx'] = CSO.groupby('event')['obs'].idxmax()\n", + "\n", + "# Index of peak modelled value\n", + "events['mod_peak_idx'] = CSO.groupby('event')['model'].idxmax()\n", + "\n", + "# Find duration of observed values for each event\n", + "events['obs_dur'] = CSO.groupby('event')['obs'].apply(\n", + " lambda x: (x[x > 0].index[-1]) - (x[x > 0].index[0]) if len(x[x > 0]) > 0 else 0)\n", + "\n", + "# Find duration of modelled values for each event\n", + "events['mod_dur'] = CSO.groupby('event')['model'].apply(\n", + " lambda x: (x[x > 0].index[-1]) - (x[x > 0].index[0]) if len(x[x > 0]) > 0 else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert CSO index to regular column and call it timestep\n", + "CSO['timestamp'] = CSO.index\n", + "CSO['timestep'] = (CSO.timestamp - CSO.timestamp.shift(1)).dt.total_seconds()\n", + "\n", + "# Find area under the curve of observed values for each event\n", + "CSO['obs_AUC'] = CSO['filtered'] * CSO['timestep']\n", + "events['obs_AUC'] = CSO.groupby('event')['obs_AUC'].sum()\n", + "\n", + "# Find area under the curve of modelled values for each event\n", + "CSO['mod_AUC'] = CSO['model'] * CSO['timestep']\n", + "events['mod_AUC'] = CSO.groupby('event')['mod_AUC'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
startenddurationobs_peakmod_peakobs_peak_idxmod_peak_idxobs_durmod_durobs_AUCmod_AUC
ID
12022-08-27 09:45:002022-08-27 12:15:000 days 02:30:000.40300.44412022-08-27 11:15:002022-08-27 11:00:000 days 02:15:000 days 02:30:002269.982854.98
22022-09-18 13:45:002022-09-18 15:45:000 days 02:00:000.00000.76002022-09-18 13:45:002022-09-18 14:00:0000 days 02:00:000.004742.19
32022-09-28 16:00:002022-09-28 17:30:000 days 01:30:000.09190.44492022-09-28 17:00:002022-09-28 16:30:000 days 00:30:000 days 01:30:00170.371855.08
42022-09-28 21:00:002022-09-28 21:30:000 days 00:30:000.00000.04172022-09-28 21:00:002022-09-28 21:15:0000 days 00:30:000.0065.43
52022-10-01 15:45:002022-10-01 16:45:000 days 01:00:000.00000.23422022-10-01 15:45:002022-10-01 16:00:0000 days 01:00:000.00706.86
\n", + "
" + ], + "text/plain": [ + " start end duration obs_peak \\\n", + "ID \n", + "1 2022-08-27 09:45:00 2022-08-27 12:15:00 0 days 02:30:00 0.4030 \n", + "2 2022-09-18 13:45:00 2022-09-18 15:45:00 0 days 02:00:00 0.0000 \n", + "3 2022-09-28 16:00:00 2022-09-28 17:30:00 0 days 01:30:00 0.0919 \n", + "4 2022-09-28 21:00:00 2022-09-28 21:30:00 0 days 00:30:00 0.0000 \n", + "5 2022-10-01 15:45:00 2022-10-01 16:45:00 0 days 01:00:00 0.0000 \n", + "\n", + " mod_peak obs_peak_idx mod_peak_idx obs_dur \\\n", + "ID \n", + "1 0.4441 2022-08-27 11:15:00 2022-08-27 11:00:00 0 days 02:15:00 \n", + "2 0.7600 2022-09-18 13:45:00 2022-09-18 14:00:00 0 \n", + "3 0.4449 2022-09-28 17:00:00 2022-09-28 16:30:00 0 days 00:30:00 \n", + "4 0.0417 2022-09-28 21:00:00 2022-09-28 21:15:00 0 \n", + "5 0.2342 2022-10-01 15:45:00 2022-10-01 16:00:00 0 \n", + "\n", + " mod_dur obs_AUC mod_AUC \n", + "ID \n", + "1 0 days 02:30:00 2269.98 2854.98 \n", + "2 0 days 02:00:00 0.00 4742.19 \n", + "3 0 days 01:30:00 170.37 1855.08 \n", + "4 0 days 00:30:00 0.00 65.43 \n", + "5 0 days 01:00:00 0.00 706.86 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "events.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Export events to csv\n", + "events.to_csv('CSO_events_signatures.csv')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "keras", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}