Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Trend, Outlier, Bias, and Nelson Rules Detection in _alarms.py #56

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.
Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

The first step to add new functionalities is to install the testing dependencies. To do this, run the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionalities, run the tests to ensure that the new code is working correctly. To do this, run the following command:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
147 changes: 147 additions & 0 deletions bibmon/_alarms.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,150 @@ def detecOutlier(data, lim, count = False, count_limit = 1):
alarm = +1
return alarm

def detectStdDevOutlier(data, threshold=2):
    """
    Flags points that fall outside a symmetric band around the mean.

    The band is centered on the (NaN-ignoring) mean and extends
    `threshold` standard deviations in each direction.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    threshold: float, optional
        The number of standard deviations from the mean to consider
        as an outlier. Default is 2.

    Returns
    ----------
    alarm: ndarray
        An array indicating the outliers (0 for values within
        threshold * stddev, 1 for outliers).
    """
    center = np.nanmean(data)
    half_width = threshold * np.nanstd(data)

    # Strictly outside the [center - half_width, center + half_width] band.
    is_outlier = (data > center + half_width) | (data < center - half_width)
    return np.where(is_outlier, 1, 0)


def detectTrend(data, window_size=5):
    """
    Detects trend in the data using a moving average.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int, optional
        The size of the moving window to calculate the trend. Default is 5.

    Returns
    ----------
    alarm: ndarray
        An array with the SAME length as `data` indicating the trend
        (1 for positive trend, -1 for negative trend, 0 for no trend).
        The first `window_size` positions are 0, since no smoothed
        difference is available there yet.
    """
    if len(data) < window_size:
        return np.zeros(len(data))

    # Smooth the series with a centered running mean, then classify the
    # sign of each consecutive difference of the smoothed values.
    moving_avg = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
    trend = np.diff(moving_avg)
    trend_alarm = np.where(trend > 0, 1, np.where(trend < 0, -1, 0))

    # BUGFIX: pad with exactly len(data) - len(trend_alarm) zeros
    # (== window_size). The previous version padded window_size - 1
    # zeros, producing an output one element shorter than the input.
    pad_width = len(data) - len(trend_alarm)
    return np.pad(trend_alarm, (pad_width, 0), mode='constant')


def detectMovingWindowOutlier(data, window_size=10, count_limit=1):
    """
    Detects outliers in a moving window.

    A point is counted as an "outlier" here when it lies strictly above
    the (NaN-ignoring) mean of its own window.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int
        The size of the moving window to analyze.
    count_limit: int
        The maximum number of outliers allowed within the window.

    Returns
    ----------
    alarm: ndarray
        An array indicating if the count of outliers exceeds
        the count_limit within each window (1 for alarm, 0 for no alarm).
    """
    n = len(data)
    alarm = np.zeros(n)

    # Slide the window by its right edge; the alarm is stamped on the
    # last element of each offending window.
    for end in range(window_size, n + 1):
        segment = data[end - window_size:end]
        above_mean = np.count_nonzero(segment > np.nanmean(segment))
        if above_mean > count_limit:
            alarm[end - 1] = 1
    return alarm

def detectBias(data, expected_value, threshold=0.1):
    """
    Detects bias in the data by comparing the mean to an expected value.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    expected_value: float
        The expected mean value to compare against.
    threshold: float, optional
        The threshold for deviation from the expected value. Default is 0.1.

    Returns
    ----------
    alarm: ndarray
        An array indicating if the bias exceeds the threshold (1 for alarm, 0 for no alarm).
    """
    # Absolute deviation of the (NaN-ignoring) sample mean from the target.
    deviation = np.abs(np.nanmean(data) - expected_value)
    return np.where(deviation > threshold, 1, 0)


def detectNelsonRules(data,threshold=1):
    """
    Detects anomalies in the data based on Nelson Rules 1, 2, and 3.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    threshold: float, optional
        Number of standard deviations from the mean used as the control
        limit for rules 1 and 2. Default is 1.

    Returns
    ----------
    alarms: dict
        A dictionary with alarms for each rule (1 for alarm, 0 for no alarm).
    """
    data = np.asarray(data, dtype=float)
    mean_value = np.nanmean(data)
    std_dev = np.nanstd(data)
    limit = threshold * std_dev

    # rule 1 = 1 point beyond the control limit
    rule_1_alarms = np.where(np.abs(data - mean_value) > limit, 1, 0)

    # rule 2 = 2 points in a row beyond the control limit
    rule_2_alarms = np.zeros(len(data))
    for i in range(1, len(data)):
        if (np.abs(data[i] - mean_value) > limit) and (np.abs(data[i-1] - mean_value) > limit):
            rule_2_alarms[i-1:i+1] = 1

    # rule 3 = 6 points in a row steadily increasing or decreasing.
    # BUGFIX: the previous implementation compared each point against
    # mean_value + threshold (dropping the std_dev factor used by rules
    # 1 and 2) and checked level, not monotonic runs, so it did not
    # implement the documented rule. It is now a run-length check.
    rule_3_alarms = np.zeros(len(data))
    increasing_run = 1  # points in the current strictly increasing run
    decreasing_run = 1  # points in the current strictly decreasing run
    for i in range(1, len(data)):
        increasing_run = increasing_run + 1 if data[i] > data[i-1] else 1
        decreasing_run = decreasing_run + 1 if data[i] < data[i-1] else 1
        if increasing_run >= 6 or decreasing_run >= 6:
            rule_3_alarms[i-5:i+1] = 1

    alarms = {
        'rule_1': rule_1_alarms,
        'rule_2': rule_2_alarms,
        'rule_3': rule_3_alarms
    }

    return alarms


85 changes: 84 additions & 1 deletion bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from typing import Literal

###############################################################################

Expand Down Expand Up @@ -692,4 +693,86 @@ def comparative_table (models, X_train, X_validation, X_test,

return_tables.append(times_df)

return return_tables
return return_tables

##############################################################################

def find_df_transitions(
    df: pd.DataFrame,
    threshold: float = 1,
    data_type: Literal["string", "number"] = "number",
    label: str = None,
) -> list[int]:
    """
    Finds transitions in a DataFrame. This can be used to find indices of interesting events in the data.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be analyzed.
    threshold: float, optional
        Threshold to be used in the transition detection, this is the minimum difference between two consecutive points. Will be used only if data_type is 'number'.
    data_type: str, optional
        Type of data to be analyzed. If 'number', the threshold will be used to detect transitions. If 'string', the function will look for changes in the values.
    label: str
        Label to be used in the transition detection.

    Returns
    ----------
    : list of ints
        Indices of the transitions.
    """

    # Without a column to inspect there is nothing to detect.
    if label is None:
        return []

    series = df[label]
    indices = []
    last_value = series.iloc[0]

    for idx in range(1, len(df)):
        current = series.iloc[idx]
        if data_type == "number":
            # Numeric mode: a jump larger than `threshold` is a transition.
            if abs(current - last_value) > threshold:
                indices.append(idx)
            last_value = current
        elif data_type == "string":
            # Categorical mode: any change of value is a transition.
            if current != last_value:
                indices.append(idx)
            last_value = current

    return indices

###############################################################################

def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
    """
    Splits a DataFrame into multiple DataFrames according to the given percentages, the sum of percentages must equal 1.

    For example, if percentage = [0.6, 0.2, 0.2], the function will return a list with three DataFrames, the first one with 60% of the data, the second one with 20% and the third one with 20%.

    Warning: This function may cause data loss if the split cannot be done exactly according to the percentages.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be split.
    percentages: list of floats
        List of percentages to be used in the split.

    Returns
    ----------
    : list of pandas.DataFrames
        List with the split DataFrames.

    Raises
    ----------
    ValueError
        If the percentages do not sum to 1 (within floating-point tolerance).
    """

    # BUGFIX: compare with a tolerance instead of exact float equality;
    # e.g. sum([0.1] * 10) == 0.9999999999999999 and would be rejected.
    if abs(sum(percentages) - 1.0) > 1e-9:
        raise ValueError("The sum of the percentages must be 1.")

    split_dfs = []
    start = 0

    for fraction in percentages:
        # int() truncates, so trailing rows may be dropped (see warning).
        end = start + int(fraction * len(df))
        split_dfs.append(df.iloc[start:end])
        start = end

    return split_dfs
Loading