preprocessing changes

mpkrass7 · Jun 24, 2024 · c774575 · c774575
1 parent 15b2d83
commit c774575
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 4 deletions.
diff --git a/{{ cookiecutter.repo_name }}/conf/base/parameters.yml b/{{ cookiecutter.repo_name }}/conf/base/parameters.yml
@@ -40,6 +40,8 @@ deploy_forecast:
       datetime_partition_column: as_of_datetime
       multiseries_id_columns: [title]
       use_time_series: True
+      feature_derivation_window_start: -21
+      feature_derivation_window_end: 0
       forecast_window_start: 3 # Forecast Window start must be a multiple of the time step
       forecast_window_end: 6
       # TODO: what other configurations should we add? Should we add and leave empty?

diff --git a/...ter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/deploy_forecast/nodes.py b/...ter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/deploy_forecast/nodes.py
@@ -158,10 +158,10 @@ def ensure_deployment_settings(
 
     user_id = deployment.owners["preview"][0]["id"]  # type: ignore
 
-    client.patch(f"deployments/{deployment_id}/settings",
-                 json={
-                     "automaticActuals": True
-                 })
+    # client.patch(f"deployments/{deployment_id}/settings",
+    #              json={
+    #                  "automaticActuals": {"enabled": True}
+    #              })
 
     # set up retraining
     try:

diff --git a/...utter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/preprocessing/nodes.py b/...utter.repo_name }}/src/{{ cookiecutter.python_package }}/pipelines/preprocessing/nodes.py
@@ -50,10 +50,23 @@ def create_or_update_modeling_dataset(modeling_dataset_name: str,
     new_data["likeDiff"] = 0
     new_data["commentDiff"] = 0
 
+    def _round_to_nearest_half_hour(time):  # snap a timestamp onto a :00 or :30 boundary
+        # Only the minute field picks the bucket (seconds are ignored): [0, 15) -> :00, [15, 45) -> :30, [45, 60) -> next hour's :00
+        minutes = time.minute
+        if minutes < 15:
+            rounded_time = time.replace(minute=0, second=0, microsecond=0)  # back to the top of the current hour
+        elif minutes < 45:
+            rounded_time = time.replace(minute=30, second=0, microsecond=0)  # onto the half hour
+        else:  # minute >= 45: step into the next hour, then zero minute/second/microsecond
+            rounded_time = (time + pd.Timedelta(minutes=(60 - minutes))).replace(minute=0, second=0, microsecond=0)
+        return rounded_time
+
     modeling_dataset_id = _check_if_dataset_exists(modeling_dataset_name)
 
     # TODO: Should this be idempotent? (use hash?)
     if modeling_dataset_id is None:
+        new_data["as_of_datetime"] = pd.to_datetime(new_data['as_of_datetime'], errors='coerce')
+        new_data["as_of_datetime"] = new_data["as_of_datetime"].apply(_round_to_nearest_half_hour)
         dataset: Dataset = Dataset.create_from_in_memory_data(
             data_frame=new_data, use_cases=use_cases
         )