From 14e824813877ff5782311f0ab434ec8c25015e02 Mon Sep 17 00:00:00 2001 From: numatis <141714391+numatis@users.noreply.github.com> Date: Tue, 14 Nov 2023 14:31:06 +0100 Subject: [PATCH] Test --- notebooks/CSO_events.ipynb | 389 +++++++++++++++++++++++++++++++++++++ 1 file changed, 389 insertions(+) create mode 100644 notebooks/CSO_events.ipynb diff --git a/notebooks/CSO_events.ipynb b/notebooks/CSO_events.ipynb new file mode 100644 index 000000000..c329274ab --- /dev/null +++ b/notebooks/CSO_events.ipynb @@ -0,0 +1,389 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Subdivision of CSO observed data into events" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import libraries\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import plotly.graph_objects as go" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Import data\n", + "os.chdir('C:\\\\Users\\\\rpal\\\\Source\\\\modelskill\\\\tmp\\\\RPAL\\\\data\\\\obs_and_model_data_Rocco')\n", + "\n", + "CSO = pd.read_csv('CSO.csv', sep=',', header=0, index_col=0, parse_dates=True)\n", + "\n", + "# Remove all the rows where the observed or modelled value is missing\n", + "CSO = CSO[CSO['filtered'].notna() & CSO['model'].notna()]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create column for detection of events" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Define variable used to detect events\n", + "CSO['event_signal'] = np.max(CSO[['model', 'filtered']], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Identify events start and end based on defined threshold value" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Set threshold for event detection\n", + "det_thr = 0.001\n", + "\n", + "# Create empty DataFrame for storing events\n", + "events = pd.DataFrame(columns=['start','end'], index=pd.Index([])) \n", + "\n", + "# Find event starts = where obs goes from <= det_thr to > det_thr\n", + "start_idx = CSO['event_signal'].shift(1).le(det_thr) & CSO['event_signal'].gt(det_thr)\n", + "start_event = CSO.index[start_idx]\n", + "events['start'] = start_event\n", + "\n", + "# Find event ends = where obs goes from > det_thr to <= det_thr\n", + "end_idx = CSO['event_signal'].gt(det_thr) & CSO['event_signal'].shift(-1).le(det_thr)\n", + "end_event = CSO.index[end_idx]\n", + "events['end'] = end_event" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Aggregate events that are separated by gaps shorter than given value" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Set min gap between events\n", + "min_gap = '1 hour'\n", + "\n", + "# Calculate gap between events \n", + "events['diff'] = events['start'] - events['end'].shift(1)\n", + "\n", + "# Identify events based on min_gap\n", + "#events['check'] = (events['diff'] > min_gap)\n", + "events['ID'] = (events['diff'] > min_gap).cumsum( ) + 1\n", + "# events['fix'] = events.ID +1\n", + "\n", + "# Aggregate events\n", + "events = events.groupby('ID').agg({'start':'first', 'end':'last'})\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Assign event index to original series" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "CSO['event'] = 0\n", + "for e in events.index:\n", + " CSO.loc[events['start'][e]:events['end'][e],'event'] = e\n", + "\n", + "# remove columns event_signal from CSO\n", + "CSO = CSO.drop(columns=['event_signal'])\n", + "\n", + "CSO.to_csv('CSO_events.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Compute event signatures" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Event duration\n", + "events['duration'] = events['end'] - events['start']\n", + "events.duration\n", + "\n", + "# Peak observed value\n", + "events['obs_peak'] = CSO.groupby('event')['obs'].max()\n", + "\n", + "# Peak modelled value\n", + "events['mod_peak'] = CSO.groupby('event')['model'].max()\n", + "\n", + "# Index of peak observed value\n", + "events['obs_peak_idx'] = CSO.groupby('event')['obs'].idxmax()\n", + "\n", + "# Index of peak modelled value\n", + "events['mod_peak_idx'] = CSO.groupby('event')['model'].idxmax()\n", + "\n", + "# Find duration of observed values for each event\n", + "events['obs_dur'] = CSO.groupby('event')['obs'].apply(\n", + " lambda x: (x[x > 0].index[-1]) - (x[x > 0].index[0]) if len(x[x > 0]) > 0 else 0)\n", + "\n", + "# Find duration of modelled values for each event\n", + "events['mod_dur'] = CSO.groupby('event')['model'].apply(\n", + " lambda x: (x[x > 0].index[-1]) - (x[x > 0].index[0]) if len(x[x > 0]) > 0 else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Convert CSO index to regular column and call it timestep\n", + "CSO['timestamp'] = CSO.index\n", + "CSO['timestep'] = (CSO.timestamp - CSO.timestamp.shift(1)).dt.total_seconds()\n", + "\n", + "# Find area under the curve of observed values for each event\n", + "CSO['obs_AUC'] = CSO['filtered'] * CSO['timestep']\n", + "events['obs_AUC'] = CSO.groupby('event')['obs_AUC'].sum()\n", + "\n", + "# Find area under the curve of modelled values for each event\n", + "CSO['mod_AUC'] = CSO['model'] * CSO['timestep']\n", + "events['mod_AUC'] = CSO.groupby('event')['mod_AUC'].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | start | \n", + "end | \n", + "duration | \n", + "obs_peak | \n", + "mod_peak | \n", + "obs_peak_idx | \n", + "mod_peak_idx | \n", + "obs_dur | \n", + "mod_dur | \n", + "obs_AUC | \n", + "mod_AUC | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|
ID | \n", + "\n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " | \n", + " |
1 | \n", + "2022-08-27 09:45:00 | \n", + "2022-08-27 12:15:00 | \n", + "0 days 02:30:00 | \n", + "0.4030 | \n", + "0.4441 | \n", + "2022-08-27 11:15:00 | \n", + "2022-08-27 11:00:00 | \n", + "0 days 02:15:00 | \n", + "0 days 02:30:00 | \n", + "2269.98 | \n", + "2854.98 | \n", + "
2 | \n", + "2022-09-18 13:45:00 | \n", + "2022-09-18 15:45:00 | \n", + "0 days 02:00:00 | \n", + "0.0000 | \n", + "0.7600 | \n", + "2022-09-18 13:45:00 | \n", + "2022-09-18 14:00:00 | \n", + "0 | \n", + "0 days 02:00:00 | \n", + "0.00 | \n", + "4742.19 | \n", + "
3 | \n", + "2022-09-28 16:00:00 | \n", + "2022-09-28 17:30:00 | \n", + "0 days 01:30:00 | \n", + "0.0919 | \n", + "0.4449 | \n", + "2022-09-28 17:00:00 | \n", + "2022-09-28 16:30:00 | \n", + "0 days 00:30:00 | \n", + "0 days 01:30:00 | \n", + "170.37 | \n", + "1855.08 | \n", + "
4 | \n", + "2022-09-28 21:00:00 | \n", + "2022-09-28 21:30:00 | \n", + "0 days 00:30:00 | \n", + "0.0000 | \n", + "0.0417 | \n", + "2022-09-28 21:00:00 | \n", + "2022-09-28 21:15:00 | \n", + "0 | \n", + "0 days 00:30:00 | \n", + "0.00 | \n", + "65.43 | \n", + "
5 | \n", + "2022-10-01 15:45:00 | \n", + "2022-10-01 16:45:00 | \n", + "0 days 01:00:00 | \n", + "0.0000 | \n", + "0.2342 | \n", + "2022-10-01 15:45:00 | \n", + "2022-10-01 16:00:00 | \n", + "0 | \n", + "0 days 01:00:00 | \n", + "0.00 | \n", + "706.86 | \n", + "