Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Trend, Outlier, Bias, and Nelson Rules Detection in _alarms.py #56

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,5 @@ dmypy.json
# Pyre type checker
.pyre/

# Temporary files
temp.py
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"editor.formatOnSave": false
}
16 changes: 15 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,4 +109,18 @@ To apply the new logic in the library, it will be necessary to implement the use

### Additional Features

Preferably, use the `_bibmon_tools.py` file to implement additional features.
Preferably, use the `_bibmon_tools.py` file to implement additional features.

### Testing New Functionalities

The first step to add new functionalities is to install the testing dependencies. To do this, run the following command:

```bash
pip install -r test/requirements.txt
```

After implementing the new functionalities, run the tests to ensure that the new code is working correctly. To do this, run the following command:

```bash
pytest
```
4 changes: 2 additions & 2 deletions bibmon/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
from ._sbm import SBM
from ._sklearn_regressor import sklearnRegressor
from ._preprocess import PreProcess
from ._load_data import load_tennessee_eastman, load_real_data
from ._load_data import load_tennessee_eastman, load_real_data, load_3w
from ._bibmon_tools import train_val_test_split, complete_analysis, comparative_table, spearmanr_dendrogram, create_df_with_dates, create_df_with_noise, align_dfs_by_rows

__all__ = ['Autoencoder','PCA','ESN','SBM',
'sklearnRegressor', 'PreProcess',
'load_tennessee_eastman', 'load_real_data',
'load_tennessee_eastman', 'load_real_data', 'load_3w',
'train_val_test_split', 'complete_analysis', 'comparative_table',
'spearmanr_dendrogram', 'create_df_with_dates',
'create_df_with_noise', 'align_dfs_by_rows']
147 changes: 147 additions & 0 deletions bibmon/_alarms.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,150 @@ def detecOutlier(data, lim, count = False, count_limit = 1):
alarm = +1
return alarm

def detectStdDevOutlier(data, threshold=2):
    """
    Flags points that fall outside a symmetric band around the mean.

    The band is centered on the (NaN-ignoring) mean and extends
    `threshold` standard deviations in each direction.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    threshold: float, optional
        The number of standard deviations from the mean to consider
        as an outlier. Default is 2.

    Returns
    ----------
    alarm: ndarray
        An array indicating the outliers (0 for values within
        threshold * stddev, 1 for outliers).
    """
    center = np.nanmean(data)
    half_width = threshold * np.nanstd(data)

    # Strictly outside the [center - half_width, center + half_width] band.
    is_outlier = (data > center + half_width) | (data < center - half_width)
    return np.where(is_outlier, 1, 0)


def detectTrend(data, window_size=5):
    """
    Detects trend in the data using a moving average.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int, optional
        The size of the moving window to calculate the trend. Default is 5.

    Returns
    ----------
    alarm: ndarray
        An array with the SAME length as `data` indicating the trend
        (1 for positive trend, -1 for negative trend, 0 for no trend).
        The first `window_size` positions are 0, since no smoothed
        difference is available there yet.
    """
    if len(data) < window_size:
        return np.zeros(len(data))

    # Smooth the series with a centered running mean, then classify the
    # sign of each consecutive difference of the smoothed values.
    moving_avg = np.convolve(data, np.ones(window_size)/window_size, mode='valid')
    trend = np.diff(moving_avg)
    trend_alarm = np.where(trend > 0, 1, np.where(trend < 0, -1, 0))

    # BUGFIX: pad with exactly len(data) - len(trend_alarm) zeros
    # (== window_size). The previous version padded window_size - 1
    # zeros, producing an output one element shorter than the input.
    pad_width = len(data) - len(trend_alarm)
    return np.pad(trend_alarm, (pad_width, 0), mode='constant')


def detectMovingWindowOutlier(data, window_size=10, count_limit=1):
    """
    Detects outliers in a moving window.

    A point is counted as an "outlier" here when it lies strictly above
    the (NaN-ignoring) mean of its own window.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    window_size: int
        The size of the moving window to analyze.
    count_limit: int
        The maximum number of outliers allowed within the window.

    Returns
    ----------
    alarm: ndarray
        An array indicating if the count of outliers exceeds
        the count_limit within each window (1 for alarm, 0 for no alarm).
    """
    n = len(data)
    alarm = np.zeros(n)

    # Slide the window by its right edge; the alarm is stamped on the
    # last element of each offending window.
    for end in range(window_size, n + 1):
        segment = data[end - window_size:end]
        above_mean = np.count_nonzero(segment > np.nanmean(segment))
        if above_mean > count_limit:
            alarm[end - 1] = 1
    return alarm

def detectBias(data, expected_value, threshold=0.1):
    """
    Detects bias in the data by comparing the mean to an expected value.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    expected_value: float
        The expected mean value to compare against.
    threshold: float, optional
        The threshold for deviation from the expected value. Default is 0.1.

    Returns
    ----------
    alarm: ndarray
        An array indicating if the bias exceeds the threshold (1 for alarm, 0 for no alarm).
    """
    # Absolute deviation of the (NaN-ignoring) sample mean from the target.
    deviation = np.abs(np.nanmean(data) - expected_value)
    return np.where(deviation > threshold, 1, 0)


def detectNelsonRules(data,threshold=1):
    """
    Detects anomalies in the data based on Nelson Rules 1, 2, and 3.

    Parameters
    ----------
    data: array-like
        The input data to be analyzed.
    threshold: float, optional
        Number of standard deviations from the mean used as the control
        limit for rules 1 and 2. Default is 1.

    Returns
    ----------
    alarms: dict
        A dictionary with alarms for each rule (1 for alarm, 0 for no alarm).
    """
    data = np.asarray(data, dtype=float)
    mean_value = np.nanmean(data)
    std_dev = np.nanstd(data)
    limit = threshold * std_dev

    # rule 1 = 1 point beyond the control limit
    rule_1_alarms = np.where(np.abs(data - mean_value) > limit, 1, 0)

    # rule 2 = 2 points in a row beyond the control limit
    rule_2_alarms = np.zeros(len(data))
    for i in range(1, len(data)):
        if (np.abs(data[i] - mean_value) > limit) and (np.abs(data[i-1] - mean_value) > limit):
            rule_2_alarms[i-1:i+1] = 1

    # rule 3 = 6 points in a row steadily increasing or decreasing.
    # BUGFIX: the previous implementation compared each point against
    # mean_value + threshold (dropping the std_dev factor used by rules
    # 1 and 2) and checked level, not monotonic runs, so it did not
    # implement the documented rule. It is now a run-length check.
    rule_3_alarms = np.zeros(len(data))
    increasing_run = 1  # points in the current strictly increasing run
    decreasing_run = 1  # points in the current strictly decreasing run
    for i in range(1, len(data)):
        increasing_run = increasing_run + 1 if data[i] > data[i-1] else 1
        decreasing_run = decreasing_run + 1 if data[i] < data[i-1] else 1
        if increasing_run >= 6 or decreasing_run >= 6:
            rule_3_alarms[i-5:i+1] = 1

    alarms = {
        'rule_1': rule_1_alarms,
        'rule_2': rule_2_alarms,
        'rule_3': rule_3_alarms
    }

    return alarms


85 changes: 84 additions & 1 deletion bibmon/_bibmon_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from typing import Literal

###############################################################################

Expand Down Expand Up @@ -692,4 +693,86 @@ def comparative_table (models, X_train, X_validation, X_test,

return_tables.append(times_df)

return return_tables
return return_tables

##############################################################################

def find_df_transitions(
    df: pd.DataFrame,
    threshold: float = 1,
    data_type: Literal["string", "number"] = "number",
    label: str = None,
) -> list[int]:
    """
    Finds transitions in a DataFrame. This can be used to find indices of interesting events in the data.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be analyzed.
    threshold: float, optional
        Threshold to be used in the transition detection, this is the minimum difference between two consecutive points. Will be used only if data_type is 'number'.
    data_type: str, optional
        Type of data to be analyzed. If 'number', the threshold will be used to detect transitions. If 'string', the function will look for changes in the values.
    label: str
        Label to be used in the transition detection.

    Returns
    ----------
    : list of ints
        Indices of the transitions.
    """

    # Without a column to inspect there is nothing to detect.
    if label is None:
        return []

    series = df[label]
    indices = []
    last_value = series.iloc[0]

    for idx in range(1, len(df)):
        current = series.iloc[idx]
        if data_type == "number":
            # Numeric mode: a jump larger than `threshold` is a transition.
            if abs(current - last_value) > threshold:
                indices.append(idx)
            last_value = current
        elif data_type == "string":
            # Categorical mode: any change of value is a transition.
            if current != last_value:
                indices.append(idx)
            last_value = current

    return indices

###############################################################################

def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
    """
    Splits a DataFrame into multiple DataFrames according to the given percentages, the sum of percentages must equal 1.

    For example, if percentage = [0.6, 0.2, 0.2], the function will return a list with three DataFrames, the first one with 60% of the data, the second one with 20% and the third one with 20%.

    Warning: This function may cause data loss if the split cannot be done exactly according to the percentages.

    Parameters
    ----------
    df: pandas.DataFrame
        Data to be split.
    percentages: list of floats
        List of percentages to be used in the split.

    Returns
    ----------
    : list of pandas.DataFrames
        List with the split DataFrames.

    Raises
    ----------
    ValueError
        If the percentages do not sum to 1 (within floating-point tolerance).
    """

    # BUGFIX: compare with a tolerance instead of exact float equality;
    # e.g. sum([0.1] * 10) == 0.9999999999999999 and would be rejected.
    if abs(sum(percentages) - 1.0) > 1e-9:
        raise ValueError("The sum of the percentages must be 1.")

    split_dfs = []
    start = 0

    for fraction in percentages:
        # int() truncates, so trailing rows may be dropped (see warning).
        end = start + int(fraction * len(df))
        split_dfs.append(df.iloc[start:end])
        start = end

    return split_dfs
Loading