diff --git a/.gitignore b/.gitignore
index 2b7dad7..48fb536 100644
--- a/.gitignore
+++ b/.gitignore
@@ -162,5 +162,5 @@ cython_debug/
 data/*
 data/
 example_data/*
-!example_data/prediction_v1.3.tgz
+!example_data/deepseek-67b-chat-hf-prediction.tgz
 inference/outputs
diff --git a/README.md b/README.md
index 97c1e10..de26650 100644
--- a/README.md
+++ b/README.md
@@ -190,6 +190,15 @@ The evaluation results of GPT-4 under `save_dir` is `jsonl`, and each line conta
 * 10 denotes the best performance
 * 8 denotes the comparable performance with our human-annotated high-quality critiques, and scores higher than 8 denotes the better performance of evaluated critiques.
+
+To compute the overall score, please run this command:
+```bash
+# the results of GPT-4 on the test set
+python compute_overall.py --mode obj --feedback_overall 63.54 --correction_overall 69.67 --comp_feedback_overall 57.33 --meta_feedback_overall 62.90
+```
+* replacing `--mode obj` with `--mode sub` computes the overall score for the subjective evaluation
+* ensure that the overall score of at least one of the four dimensions is provided
+
 
 ## Benchmark Results
 
 The subjective evaluation results of some representation LLMs are shown:
diff --git a/critic_bench/compute_overall.py b/critic_bench/compute_overall.py
new file mode 100644
index 0000000..03324c2
--- /dev/null
+++ b/critic_bench/compute_overall.py
@@ -0,0 +1,37 @@
+import argparse
+import numpy as np
+import math
+
+
+'''This script computes the overall score for the objective and subjective evaluations'''
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--mode", help="must be 'obj' or 'sub', representing the objective or subjective evaluation", default='obj')
+parser.add_argument("--feedback_overall", help="the overall score of the feedback dimension", default=math.inf, type=float)
+parser.add_argument("--comp_feedback_overall", help="the overall score of the comparison dimension", default=math.inf, type=float)
+parser.add_argument("--correction_overall", help="the overall score of the correction dimension", default=math.inf, type=float)
+parser.add_argument("--meta_feedback_overall", help="the overall score of the meta-feedback dimension", default=math.inf, type=float)
+args = parser.parse_args()
+
+
+def normalize(score):
+    # normalize the feedback and meta-feedback scores for the objective evaluation
+    return (score + 100) / 2
+
+
+if __name__ == "__main__":
+    scores = []
+    for index, score in enumerate([
+        args.feedback_overall,
+        args.comp_feedback_overall,
+        args.correction_overall,
+        args.meta_feedback_overall
+    ]):
+        if score != math.inf:
+            if args.mode == 'obj' and index in [0, 3]:
+                score = normalize(score)
+            scores.append(score)
+            print(scores)
+
+    assert len(scores) > 0, 'No valid scores'
+    print('Overall Scores:', round(np.mean(scores), 4))
diff --git a/example_data/deepseek-67b-chat-hf-prediction.tgz b/example_data/deepseek-67b-chat-hf-prediction.tgz
new file mode 100755
index 0000000..1807be8
Binary files /dev/null and b/example_data/deepseek-67b-chat-hf-prediction.tgz differ
diff --git a/example_data/prediction_v1.3.tgz b/example_data/prediction_v1.3.tgz
deleted file mode 100644
index 3235d50..0000000
Binary files a/example_data/prediction_v1.3.tgz and /dev/null differ
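For reference, here is a minimal sketch (not part of the patch) of the calculation `compute_overall.py` performs for the objective-mode example command in the README: the feedback and meta-feedback scores are mapped onto [0, 100] via `(score + 100) / 2`, and all provided dimension scores are then averaged. Variable names below are illustrative only.

```python
# Minimal sketch of the overall-score calculation for the README example
# (objective mode). As in compute_overall.py above, only the feedback and
# meta-feedback scores are normalized before averaging.
feedback, comp_feedback, correction, meta_feedback = 63.54, 57.33, 69.67, 62.90


def normalize(score: float) -> float:
    # map a score in [-100, 100] onto [0, 100]
    return (score + 100) / 2


scores = [normalize(feedback), comp_feedback, correction, normalize(meta_feedback)]
print(round(sum(scores) / len(scores), 4))  # prints 72.555 for these inputs
```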