Merge pull request #174 from trishullab/george

George
trishullab · Jul 24, 2024 · 807a2e5 · 807a2e5
2 parents 02ae505 + 4b5b2f5
commit 807a2e5
Show file tree

Hide file tree

Showing 5 changed files with 33 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ PutnamBench aims to support research in automated mathematical reasoning by prov
 
 PutnamBench includes factored solutions for problems which require exhibiting a numerical answer in addition to its proof of correctness. For these problems, one can attempt two tasks: proving the problem with the numerical answer written into the theorem statement, or additionally producing the answer along with the proof.
 
-We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/leaderboard.html) and will readily receive evaluation results which are accompanied by a preprint or publication. Do **not** include proofs as confirmation in any public setting. Please reach out to us privately for confirmation. 
+We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/leaderboard.html) and will readily receive evaluation results which are accompanied by a preprint or publication. **Do not** include proofs as confirmation in any public setting. Please reach out privately at `[email protected]` with any requests for additions to the leaderboard. 
 
 **We strongly encourage community feedback!** Please let us know if you have any comments for improving PutnamBench. If you notice any mistakes, please raise an issue on the repository and we will address it. We kindly ask that you do not write formal proofs for any of the problems in an effort to reduce contamination.
 
@@ -23,7 +23,7 @@ We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/lead
 | Isabelle      | 640            |
 | Coq           | 417            |
 
-We also report the number of problems in a certain category. Note that some problems fall under multiple categories. While the categories are intended to capture general features of the problems, we indicate that there is a high variance of problems inside an individual category.
+We also report the number of problems in a certain category. Note that some problems fall under multiple categories. While the categories are intended to capture general features of the problems, we also note that there is a high variance of problems inside an individual category.
 
 | Category         | Total Quantity | 
 | ---------------- | -------------- | 

diff --git a/docs/leaderboard.html b/docs/leaderboard.html
@@ -349,7 +349,7 @@ <h3>📝 Notes</h3>
       //   ],
       // };
 
-      const theaders = ["Model", "num-solved"];
+      const theaders = ["Model", "num-solved", "compute"];
 
       // score: 'average', 'humaneval', 'mbpp', 'humaneval+', 'mbpp+'
       const displayTable = (table, score) => {
@@ -425,18 +425,21 @@ <h3>📝 Notes</h3>
             modelCell.appendChild(promptedSymbol);
           }
           dataRow.appendChild(modelCell);
-          var passCell = document.createElement("td");
+          var solvedCell = document.createElement("td");
           // if (table == originTable) {
           //   passCell.classList.add("text-danger");
           // } else if (table == plusedTable) {
           //   passCell.textContent = "⚡";
           //   passCell.classList.add("text-success");
           // }
           if (row["num-solved"][score] != "NONE") {
-            passCell.classList.add("text-success");
-            passCell.textContent = ""+row["num-solved"][score];
+            solvedCell.classList.add("text-success");
+            solvedCell.textContent = ""+row["num-solved"][score];
           }
-          dataRow.appendChild(passCell);
+          var computeCell = document.createElement("td");
+          computeCell.textContent = row["condensed-compute-budget"];
+          dataRow.appendChild(solvedCell);
+          dataRow.appendChild(computeCell);
           tbody.appendChild(dataRow);
         });
         table.appendChild(tbody);

diff --git a/docs/results.json b/docs/results.json
@@ -6,6 +6,7 @@
         "isabelle-wsolution": 4
       },
       "size": 0,
+      "condensed-compute-budget": "pass@10",
       "note": "pass@10 w/ T=0.1"
     },
     "GPT-4o": {
@@ -17,6 +18,7 @@
         "coq-wsolution": 1
       },
       "size": 0,
+      "condensed-compute-budget": "pass@10",
       "note": "pass@10 w/ T=0.7"
     },
     "COPRA (GPT-4o)": {
@@ -27,6 +29,7 @@
         "coq-wsolution": 1
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ T=0.0"
     },
     "CoqHammer": {
@@ -36,6 +39,7 @@
         "coq-wsolution": 0
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ t=120s"
     },
     "Sledgehammer": {
@@ -45,6 +49,7 @@
         "isabelle-wsolution": 3
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ t=120s"
     },
     "ReProver w/ retrieval": {
@@ -54,6 +59,7 @@
         "lean-wsolution": 0
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ t=600s"
     },
     "ReProver w/o retrieval": {
@@ -63,6 +69,7 @@
         "lean-wsolution": 0
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ t=600s"
     },
     "Tactician (LSH)": {
@@ -72,6 +79,17 @@
         "coq-wsolution": 0
       },
       "size": 0,
+      "condensed-compute-budget": "pass@1",
       "note": "pass@1 w/ t=600s"
+    },
+    "InternLM 7B": {
+      "link": "https://github.com/InternLM/InternLM-Math",
+      "open-data": "NONE",
+      "num-solved": {
+        "lean-wsolution": 4
+      },
+      "size": 7,
+      "condensed-compute-budget": "pass@4096",
+      "note": "pass@2048 w/ T = 1.0 + pass@2048 w/ T = 0.7"
     }
 }
diff --git a/lean4/src/putnam_1965_a1.lean b/lean4/src/putnam_1965_a1.lean
@@ -11,7 +11,7 @@ theorem putnam_1965_a1
 (A B C X Y : EuclideanSpace ℝ (Fin 2))
 (hABC : ¬Collinear ℝ {A, B, C})
 (hangles : ∠ C A B < ∠ B C A ∧ ∠ B C A < π/2 ∧ π/2 < ∠ A B C)
-(hX : Collinear ℝ {X, B, C} ∧ ∠ X A B = (π - ∠ C A B)/2 ∧ Euclidean.dist A X = Euclidean.dist A B)
-(hY : Collinear ℝ {Y, C, A} ∧ ∠ Y B C = (π - ∠ A B C)/2 ∧ Euclidean.dist B Y = Euclidean.dist A B)
+(hX : Collinear ℝ {X, B, C} ∧ ∠ X A B = (π - ∠ C A B)/2 ∧ (A 0 - X 0)^2 + (A 1 - X 1)^2 = (A 0 - B 0)^2 + (A 1 - B 1)^2)
+(hY : Collinear ℝ {Y, C, A} ∧ ∠ Y B C = (π - ∠ A B C)/2 ∧ (B 0 - Y 0)^2 + (B 1 - Y 1)^2 = (A 0 - B 0)^2 + (A 1 - B 1)^2)
 : ∠ C A B = putnam_1965_a1_solution :=
 sorry
diff --git a/lean4/src/putnam_2013_a5.lean b/lean4/src/putnam_2013_a5.lean
@@ -1,7 +1,7 @@
 import Mathlib
 open BigOperators
 
-open Function Set
+open Function Set MeasureTheory
 
 -- Note: uses (Fin m → Fin m → Fin m → ℝ) instead of ensuring inputs are strictly increasing
 theorem putnam_2013_a5
@@ -11,8 +11,8 @@ theorem putnam_2013_a5
 (areadef2 : (Fin m → Fin m → Fin m → ℝ) → Prop)
 (areadef3 : (Fin m → Fin m → Fin m → ℝ) → Prop)
 (mge3 : m ≥ 3)
-(harea2 : ∀ a b c : Fin 2 → ℝ, area2 a b c = (MeasureTheory.volume (convexHull ℝ {a, b, c})).toReal)
-(harea3 : ∀ a b c : Fin 3 → ℝ, area3 a b c = (MeasureTheory.volume (convexHull ℝ {a, b, c})).toReal)
+(harea2 : ∀ a b c : Fin 2 → ℝ, area2 a b c = (volume (convexHull ℝ {a, b, c})).toReal)
+(harea3 : ∀ a b c : Fin 3 → ℝ, area3 a b c = (μH[2] (convexHull ℝ {a, b, c})).toReal)
 (hareadef2 : ∀ a : Fin m → Fin m → Fin m → ℝ, areadef2 a = ∀ A : Fin m → (Fin 2 → ℝ), (∑ i : Fin m, ∑ j : Fin m, ∑ k : Fin m, if (i < j ∧ j < k) then (a i j k * area2 (A i) (A j) (A k)) else 0) ≥ 0)
 (hareadef3 : ∀ a : Fin m → Fin m → Fin m → ℝ, areadef3 a = ∀ A : Fin m → (Fin 3 → ℝ), (∑ i : Fin m, ∑ j : Fin m, ∑ k : Fin m, if (i < j ∧ j < k) then (a i j k * area3 (A i) (A j) (A k)) else 0) ≥ 0)
 : ∀ a : Fin m → Fin m → Fin m → ℝ, areadef2 a → areadef3 a :=