From e75917ed882fe1553aa7c6f0d9393bd418c44ada Mon Sep 17 00:00:00 2001
From: Joel Ye
Date: Fri, 5 Apr 2024 15:35:16 -0400
Subject: [PATCH] clean evaluator code

---
 decoder_demos/ndt2_decoder.py           | 19 ++-------
 decoder_demos/ndt2_sample.Dockerfile    |  3 +-
 decoder_demos/ndt2_sample.py            |  8 ++--
 decoder_demos/sklearn_sample.Dockerfile |  4 +-
 falcon_challenge/config.py              |  3 +-
 falcon_challenge/evaluator.py           | 54 +------------------------
 setup.py                                |  2 +-
 7 files changed, 15 insertions(+), 78 deletions(-)

diff --git a/decoder_demos/ndt2_decoder.py b/decoder_demos/ndt2_decoder.py
index 4d550b3..5be99c6 100644
--- a/decoder_demos/ndt2_decoder.py
+++ b/decoder_demos/ndt2_decoder.py
@@ -18,8 +18,7 @@
 from context_general_bci.config import RootConfig, propagate_config, DataKey, MetaKey
 from context_general_bci.dataset import DataAttrs, ContextAttrs
 from context_general_bci.subjects import SubjectName
-from context_general_bci.contexts.context_registry import context_registry
-from context_general_bci.contexts.context_info import FalconContextInfo, ExperimentalTask
+from context_general_bci.contexts.context_info import ExperimentalTask
 from context_general_bci.model import load_from_checkpoint
 from context_general_bci.model_slim import transfer_model
 
@@ -38,6 +37,7 @@ def __init__(
         model_ckpt_path: str,
         model_cfg_stem: str,
         zscore_path: str,
+        dataset_handles: List[str] = []
     ):
         r"""
             Loading NDT2 requires both weights and model config. Weight loading through a checkpoint is standard.
@@ -45,17 +45,6 @@ def __init__(
         """
         self._task_config = task_config
         self.exp_task = getattr(ExperimentalTask, f'falcon_{task_config.task.name}')
-
-        context_registry.register([
-            *FalconContextInfo.build_from_dir(
-                f'./data/{task_config.task.name}/eval',
-                task=self.exp_task,
-                suffix='eval'),
-            *FalconContextInfo.build_from_dir(
-                f'./data/{task_config.task.name}/minival',
-                task=self.exp_task,
-                suffix='minival')])
-
         try:
             initialize_config_module(
                 config_module="context_general_bci.config",
@@ -76,9 +65,7 @@ def __init__(
         context_idx = {
             MetaKey.array.name: [format_array_name(self.subject)],
             MetaKey.subject.name: [self.subject],
-            MetaKey.session.name: sorted([
-                self._task_config.hash_dataset(handle) for handle in task_config.dataset_handles
-            ]),
+            MetaKey.session.name: sorted([self._task_config.hash_dataset(handle) for handle in dataset_handles]),
             MetaKey.task.name: [self.exp_task],
         }
         data_attrs = DataAttrs.from_config(cfg.dataset, context=ContextAttrs(**context_idx))
diff --git a/decoder_demos/ndt2_sample.Dockerfile b/decoder_demos/ndt2_sample.Dockerfile
index 9449409..fc3a705 100644
--- a/decoder_demos/ndt2_sample.Dockerfile
+++ b/decoder_demos/ndt2_sample.Dockerfile
@@ -24,7 +24,7 @@ ENV EVALUATION_LOC remote
 
 # Add ckpt
 # Note that Docker cannot easily import across symlinks, make sure data is not symlinked
-ADD ./local_data/ndt2_h1_sample.pth data/decoder.pth
+ADD ./local_data/ndt2_h1_sample_nokey.pth data/decoder.pth
 ADD ./local_data/ndt2_zscore_h1.pt data/zscore.pt
 
 # Add runfile
@@ -38,6 +38,7 @@ ENV PHASE "test"
 
 # Make sure this matches the mounted data volume path. Generally leave as is.
 ENV EVAL_DATA_PATH "/dataset/evaluation_data"
+ADD ./falcon_challenge falcon_challenge
 
 # CMD specifies a default command to run when the container is launched.
 # It can be overridden with any cmd e.g. sudo docker run -it my_image /bin/bash
diff --git a/decoder_demos/ndt2_sample.py b/decoder_demos/ndt2_sample.py
index 0246f69..02d06a7 100644
--- a/decoder_demos/ndt2_sample.py
+++ b/decoder_demos/ndt2_sample.py
@@ -41,16 +41,14 @@ def main():
         split=args.split)
 
     task = getattr(FalconTask, args.split)
-    config = FalconConfig(
-        task=task,
-        dataset_handles=[x.stem for x in evaluator.get_eval_files(phase=args.phase)]
-    )
+    config = FalconConfig(task=task)
 
     decoder = NDT2Decoder(
         task_config=config,
         model_ckpt_path=args.model_path,
         model_cfg_stem=args.config_stem,
-        zscore_path=args.zscore_path
+        zscore_path=args.zscore_path,
+        dataset_handles=[x.stem for x in evaluator.get_eval_files(phase=args.phase)]
     )
 
 
diff --git a/decoder_demos/sklearn_sample.Dockerfile b/decoder_demos/sklearn_sample.Dockerfile
index 4bf4826..b8277e8 100644
--- a/decoder_demos/sklearn_sample.Dockerfile
+++ b/decoder_demos/sklearn_sample.Dockerfile
@@ -36,8 +36,8 @@ ADD ./decoder_demos/sklearn_sample.py decode.py
 ADD ./preproc/filtering.py filtering.py
 
 ENV SPLIT "h1"
-ENV PHASE "minival"
-# ENV PHASE "test"
+# ENV PHASE "minival"
+ENV PHASE "test"
 
 # Make sure this matches the mounted data volume path. Generally leave as is.
 ENV EVAL_DATA_PATH "/dataset/evaluation_data"
diff --git a/falcon_challenge/config.py b/falcon_challenge/config.py
index c67f084..a7319b6 100644
--- a/falcon_challenge/config.py
+++ b/falcon_challenge/config.py
@@ -1,4 +1,5 @@
 import enum
+from typing import Union
 from pathlib import Path
 from dataclasses import dataclass, field
 
@@ -52,7 +53,7 @@ def out_dim(self):
             return 2
         raise NotImplementedError(f"Task {self.task} not implemented.")
 
-    def hash_dataset(self, handle: str | Path):
+    def hash_dataset(self, handle: Union[str, Path]):
         r"""
             handle - path.stem of a datafile. Convenience function to help identify what "session" a datafile belongs to..
             If multiple files per session in real-world time, this may _not_ uniquely identify runfile.
diff --git a/falcon_challenge/evaluator.py b/falcon_challenge/evaluator.py
index b7eebb3..04b71a4 100644
--- a/falcon_challenge/evaluator.py
+++ b/falcon_challenge/evaluator.py
@@ -80,51 +80,6 @@
     'held_out': "Held Out",
 }
 
-# def evaluate(
-#         test_annotation_file: str, # The annotation file for the phase - but our labels are pulled from eval data.
-#         user_submission_file: str, # * JY: This appears to always be /submission/submission.csv on EvalAI. No matter - load it as a pickle.
-#         phase_codename: str, # e.g. minival or test
-#         **kwargs
-# ):
-#     r"""
-#         Evaluate payloads with potentially multiple splits worth of data
-#         - Low pri: can I provide all results or just one split's worth entry? Currently providing 1, examples just provide 1, but in general would be nice to provide all. User shouldn't be able to submit more than 1, though.
-#     """
-#     # ! Want: Locally, test_annotation should be somewhere safe (tmp)
-#     # ! Remotely, it shoudl be /submission/submission.csv exactly.
-#     # Ignore explicit annotations provided and directly search for concatenated answers
-#     logger.info(f"Evaluation: Docker side")
-#     logger.info(f"Loading GT from {test_annotation_file}")
-#     logger.info(f"Loading submission from {user_submission_file}")
-#     logger.info(f"Phase: {phase_codename}")
-
-#     result = []
-#     # Load pickles
-#     with open(test_annotation_file, 'rb') as test_annotation_file, open(user_submission_file, 'rb') as user_submission_file:
-#         test_annotations = pickle.load(test_annotation_file)
-#         user_submission = pickle.load(user_submission_file)
-#     for datasplit in user_submission: # datasplit e.g. h1, m1
-#         if datasplit not in test_annotations:
-#             raise ValueError(f"Missing {datasplit} in GT labels.")
-#         split_annotations = test_annotations[datasplit]
-#         split_result = {}
-#         split_result["Normalized Latency"] = user_submission[datasplit]["normalized_latency"]
-#         for in_or_out in split_annotations.keys():
-#             if f'{in_or_out}_pred' in user_submission[datasplit]:
-#                 pred = user_submission[datasplit][f'{in_or_out}_pred']
-#                 mask = user_submission[datasplit][f'{in_or_out}_eval_mask']
-#                 # User submission should be in an expected format because we force predictions through our codepack interface... right? They could hypothetically spoof. But we see dockerfile.
-#                 eval_fn = FalconEvaluator.compute_metrics_classification if 'h2' in datasplit else FalconEvaluator.compute_metrics_regression
-#                 metrics_held_in = eval_fn(pred, split_annotations[in_or_out], mask)
-#                 for k in metrics_held_in:
-#                     split_result[f'{HELDIN_OR_OUT_MAP[in_or_out]} {k}'] = metrics_held_in[k]
-#         result.append({datasplit: split_result})
-
-#     print(f"Returning result from phase: {phase_codename}: {result}")
-#     # Out struct according to https://evalai.readthedocs.io/en/latest/evaluation_scripts.html
-#     return {"result": result, 'submission_result': result[0]}
-
-
 def evaluate(
     test_annotation_file: str, # The annotation file for the phase
     user_submission_file: str, # * JY: This appears to always be /submission/submission.csv on EvalAI. No matter - load it as a pickle.
@@ -341,17 +296,12 @@ def evaluate(self, decoder: BCIDecoder, phase: str, held_out_only: bool = False,
                 truth_payload = {self.dataset.name: inner_tgt_spoof}
         else:
             pass
-            # TODO restore
-            # metrics_held_in = self.compute_metrics(all_preds_held_in, all_targets_held_in, all_eval_mask_held_in)
-            # metrics_held_out = self.compute_metrics(all_preds_held_out, all_targets_held_out, all_eval_mask_held_out)
-            # for k, v in metrics_held_in.items():
-            #     metrics[f'{HELDIN_OR_OUT_MAP["held_in"]} {k}'] = v
-            # for k, v in metrics_held_out.items():
-            #     metrics[f'{HELDIN_OR_OUT_MAP["held_out"]} {k}'] = v
 
         if USE_PKLS:
+            Path(prediction_path).parent.mkdir(parents=True, exist_ok=True)
             with open(prediction_path, 'wb') as f:
                 pickle.dump(pred_payload, f)
+            Path(gt_path).parent.mkdir(parents=True, exist_ok=True)
             with open(gt_path, 'wb') as f:
                 pickle.dump(truth_payload, f)
             import time
diff --git a/setup.py b/setup.py
index 8e6bd79..ac5166e 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='falcon_challenge',
-    version='0.2.6',
+    version='0.2.7',
     url='https://github.com/snel-repo/stability-benchmark',
     author='Joel Ye',
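Usage sketch: after this change, dataset handles are passed to NDT2Decoder rather than to FalconConfig, mirroring the updated decoder_demos/ndt2_sample.py. The snippet below is a minimal illustration of that wiring, not part of the patch; the checkpoint and z-score paths, the model_cfg_stem value, and the eval_remote keyword are illustrative assumptions.

    from falcon_challenge.config import FalconConfig, FalconTask
    from falcon_challenge.evaluator import FalconEvaluator

    from decoder_demos.ndt2_decoder import NDT2Decoder

    # Build the evaluator first so its file listing can seed the decoder's session context.
    evaluator = FalconEvaluator(eval_remote=False, split='h1')  # eval_remote kwarg assumed

    # dataset_handles no longer goes into FalconConfig.
    config = FalconConfig(task=getattr(FalconTask, 'h1'))

    decoder = NDT2Decoder(
        task_config=config,
        model_ckpt_path='data/decoder.pth',    # illustrative path (cf. ndt2_sample.Dockerfile)
        model_cfg_stem='falcon/h1/h1_sample',  # illustrative config stem
        zscore_path='data/zscore.pt',
        dataset_handles=[x.stem for x in evaluator.get_eval_files(phase='minival')],
    )

    evaluator.evaluate(decoder, phase='minival')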