Commit e1148ae
update list
neginraoof committed Jan 23, 2025
1 parent 590f12a commit e1148ae
Showing 7 changed files with 15 additions and 11 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -61,6 +61,9 @@ huggingface-cli login
- **AIME24**: [Math Reasoning Dataset](https://huggingface.co/datasets/AI-MO/aimo-validation-aime)
- **AMC23**: [Math Reasoning Dataset](https://huggingface.co/datasets/AI-MO/aimo-validation-amc)
- **MATH500**: [Math Reasoning Dataset](https://huggingface.co/datasets/HuggingFaceH4/MATH-500) split from [Let's Verify Step by Step](https://github.com/openai/prm800k/tree/main?tab=readme-ov-file#math-splits)
- **LiveCodeBench**: [Benchmark of LLMs for code](https://livecodebench.github.io/)
- **LiveBench**: [A benchmark for LLMs designed with test set contamination and objective evaluation in mind](https://livebench.ai/#/)
- **GPQA Diamond** (Coming soon): [A Graduate-Level Google-Proof Q&A Benchmark](https://huggingface.co/datasets/Idavidrein/gpqa)
- **Arena-Hard-Auto** (Coming soon): [Automatic evaluation tool for instruction-tuned LLMs](https://github.com/lmarena/arena-hard-auto)
- **SWE-Bench** (Coming soon): [Evaluating large language models on real-world software issues](https://github.com/princeton-nlp/SWE-bench)
- **SafetyBench** (Coming soon): [Evaluating the safety of LLMs](https://github.com/thu-coai/SafetyBench)
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/AIME24/eval_instruct.py
@@ -59,7 +59,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
all_instances = []
for idx, example in enumerate(examples):
messages = [
{"role": "system", "content": "You are a helpful and harmless assistant. You are DeepSeek R1 developed by DeepSeek. You should think step-by-step."},
{"role": "system", "content": "You are a helpful and harmless assistant. You should think step-by-step."},
{"role": "user", "content": PROMPT.format(problem=example["problem"])}
]
templated_messages = model.apply_chat_template(messages)
@@ -68,7 +68,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
"do_sample": False,
"max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": self.max_new_tokens
}

all_instances.append(
Instance(
"generate_until",
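All five edited eval_instruct.py files make the same one-line change: the system prompt no longer tells the model it is DeepSeek R1. Since the hunks above and below are truncated, here is a condensed sketch of the shared pattern they sit in; it is not verbatim repo code. The import paths and the `Instance(request_type, doc, arguments, idx)` call are assumptions based on lm-evaluation-harness conventions, and `prompt_template` / `max_new_tokens` stand in for each benchmark's own values.

```python
# Condensed sketch of the pattern shared by the edited benchmark files (not
# verbatim repo code). Import paths follow lm-evaluation-harness conventions
# and are an assumption about what these files use.
from lm_eval.api.instance import Instance
from lm_eval.models.vllm_causallms import VLLM

SYSTEM_PROMPT = "You are a helpful and harmless assistant. You should think step-by-step."

def build_generation_instances(model, examples, prompt_template, max_new_tokens):
    """Build one greedy-decoding request per example, mirroring the edited files."""
    all_instances = []
    for idx, example in enumerate(examples):
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": prompt_template.format(problem=example["problem"])},
        ]
        templated = model.apply_chat_template(messages)
        gen_kwargs = {
            "do_sample": False,  # greedy decoding, so pass@1 is reproducible
            # vLLM wrappers read `max_gen_toks`; HF wrappers read `max_new_tokens`.
            "max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": max_new_tokens,
        }
        # Assumed signature: Instance(request_type, doc, arguments, idx).
        all_instances.append(Instance("generate_until", example, (templated, gen_kwargs), idx))
    return all_instances
```

The conditional dictionary key keeps a single code path for both backends: only the name of the max-token kwarg differs between the vLLM and Hugging Face model wrappers.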
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/AMC23/eval_instruct.py
@@ -60,7 +60,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
all_instances = []
for idx, example in enumerate(examples):
messages = [
{"role": "system", "content": "You are a helpful and harmless assistant. You are DeepSeek R1 developed by DeepSeek. You should think step-by-step."},
{"role": "system", "content": "You are a helpful and harmless assistant. You should think step-by-step."},
{"role": "user", "content": PROMPT.format(problem=example["question"])}
]
templated_messages = model.apply_chat_template(messages)
@@ -69,7 +69,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
"do_sample": False,
"max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": self.max_new_tokens
}

all_instances.append(
Instance(
"generate_until",
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/GPQADiamond/eval_instruct.py
@@ -62,7 +62,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
example["answer"] = correct_answer

messages = [
{"role": "system", "content": "You are a helpful and harmless assistant. You are DeepSeek R1 developed by DeepSeek. You should think step-by-step."},
{"role": "system", "content": "You are a helpful and harmless assistant. You should think step-by-step."},
{"role": "user", "content": PROMPT.format(
problem=example["Question"],
options=multiple_choice_string
@@ -74,7 +74,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
"do_sample": False,
"max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": self.max_new_tokens
}

all_instances.append(
Instance(
"generate_until",
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/LiveCodeBench/eval_instruct.py
@@ -75,7 +75,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
prompt_text = "Generate an executable Python function generated from the given prompt. Return the function body without invoking it at the final solution." + example["prompt"]

messages = [
{"role": "system", "content": "You are a helpful and harmless assistant. You are DeepSeek R1 developed by DeepSeek. You should think step-by-step."},
{"role": "system", "content": "You are a helpful and harmless assistant. You should think step-by-step."},
{"role": "user", "content": prompt_text}
]

@@ -85,7 +85,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
"do_sample": False,
"max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": self.max_new_tokens
}

all_instances.append(
Instance(
"generate_until",
4 changes: 2 additions & 2 deletions eval/chat_benchmarks/MATH500/eval_instruct.py
@@ -60,7 +60,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
all_instances = []
for idx, example in enumerate(examples):
messages = [
{"role": "system", "content": "You are a helpful and harmless assistant. You are DeepSeek R1 developed by DeepSeek. You should think step-by-step."},
{"role": "system", "content": "You are a helpful and harmless assistant. You should think step-by-step."},
{"role": "user", "content": PROMPT.format(problem=example["problem"])}
]
templated_messages = model.apply_chat_template(messages)
@@ -69,7 +69,7 @@ def generate_responses(self, model: LM) -> Dict[str, Any]:
"do_sample": False,
"max_gen_toks" if isinstance(model, VLLM) else "max_new_tokens": self.max_new_tokens
}

all_instances.append(
Instance(
"generate_until",
3 changes: 2 additions & 1 deletion reproduced_benchmarks.md
@@ -68,4 +68,5 @@
| | | meta-llama/Meta-Llama-3.1-8B-Instruct | instruct (pass@1) | 30.7 | 32.8 | |
| | | | complete (pass@1) | 41.9 | 40.5 | |
| | | Qwen/Qwen2.5-7B-Instruct | instruct (pass@1) | 35.2 | 37.6 | |
-| | | | complete (pass@1) | 46.7 | 46.1 | |
+| | | | complete (pass@1) | 46.7 | 46.1 | |
+|LiveCodeBench| Negin | deepseek-ai/DeepSeek-R1-Distill-Qwen-7B | (pass@1) | 37.9 | 37.6 | [DeepSeek-R1 Paper](https://github.com/deepseek-ai/DeepSeek-R1/blob/main/DeepSeek_R1.pdf) Table 5 |
