diff --git a/README.md b/README.md
index 9613d75e..40405557 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ PutnamBench aims to support research in automated mathematical reasoning by prov
PutnamBench includes factored solutions for problems which require exhibiting a numerical answer in addition to its proof of correctness. For these problems, one can attempt two tasks: proving the problem with the numerical answer written into the theorem statement, or additionally producing the answer along with the proof.
-We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/leaderboard.html) and will readily receive evaluation results which are accompanied by a preprint or publication. Do **not** include proofs as confirmation in any public setting. Please reach out to us privately for confirmation.
+We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/leaderboard.html) and will readily receive evaluation results which are accompanied by a preprint or publication. **Do not** include proofs as confirmation in any public setting. Please reach out privately at `george.tsoukalas@utexas.edu` with any requests for additions to the leaderboard.
**We strongly encourage community feedback!** Please let us know if you have any comments for improving PutnamBench. If you notice any mistakes, please raise an issue on the repository and we will address it. We kindly ask that you do not write formal proofs for any of the problems in an effort to reduce contamination.
@@ -23,7 +23,7 @@ We are hosting a [**leaderboard**](https://trishullab.github.io/PutnamBench/lead
| Isabelle | 640 |
| Coq | 417 |
-We also report the number of problems in a certain category. Note that some problems fall under multiple categories. While the categories are intended to capture general features of the problems, we indicate that there is a high variance of problems inside an individual category.
+We also report the number of problems in a certain category. Note that some problems fall under multiple categories. While the categories are intended to capture general features of the problems, we also note that there is a high variance of problems inside an individual category.
| Category | Total Quantity |
| ---------------- | -------------- |
diff --git a/docs/leaderboard.html b/docs/leaderboard.html
index 496cc2fe..f14c5a48 100644
--- a/docs/leaderboard.html
+++ b/docs/leaderboard.html
@@ -349,7 +349,7 @@
π Notes
// ],
// };
- const theaders = ["Model", "num-solved"];
+ const theaders = ["Model", "num-solved", "compute"];
// score: 'average', 'humaneval', 'mbpp', 'humaneval+', 'mbpp+'
const displayTable = (table, score) => {
@@ -425,7 +425,7 @@ π Notes
modelCell.appendChild(promptedSymbol);
}
dataRow.appendChild(modelCell);
- var passCell = document.createElement("td");
+ var solvedCell = document.createElement("td");
// if (table == originTable) {
// passCell.classList.add("text-danger");
// } else if (table == plusedTable) {
@@ -433,10 +433,13 @@ π Notes
// passCell.classList.add("text-success");
// }
if (row["num-solved"][score] != "NONE") {
- passCell.classList.add("text-success");
- passCell.textContent = ""+row["num-solved"][score];
+ solvedCell.classList.add("text-success");
+ solvedCell.textContent = ""+row["num-solved"][score];
}
- dataRow.appendChild(passCell);
+ var computeCell = document.createElement("td");
+ computeCell.textContent = row["condensed-compute-budget"];
+ dataRow.appendChild(solvedCell);
+ dataRow.appendChild(computeCell);
tbody.appendChild(dataRow);
});
table.appendChild(tbody);
diff --git a/docs/results.json b/docs/results.json
index 8e9adbee..84706eba 100644
--- a/docs/results.json
+++ b/docs/results.json
@@ -6,6 +6,7 @@
"isabelle-wsolution": 4
},
"size": 0,
+ "condensed-compute-budget": "pass@10",
"note": "pass@10 w/ T=0.1"
},
"GPT-4o": {
@@ -17,6 +18,7 @@
"coq-wsolution": 1
},
"size": 0,
+ "condensed-compute-budget": "pass@10",
"note": "pass@10 w/ T=0.7"
},
"COPRA (GPT-4o)": {
@@ -27,6 +29,7 @@
"coq-wsolution": 1
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ T=0.0"
},
"CoqHammer": {
@@ -36,6 +39,7 @@
"coq-wsolution": 0
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ t=120s"
},
"Sledgehammer": {
@@ -45,6 +49,7 @@
"isabelle-wsolution": 3
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ t=120s"
},
"ReProver w/ retrieval": {
@@ -54,6 +59,7 @@
"lean-wsolution": 0
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ t=600s"
},
"ReProver w/o retrieval": {
@@ -63,6 +69,7 @@
"lean-wsolution": 0
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ t=600s"
},
"Tactician (LSH)": {
@@ -72,6 +79,17 @@
"coq-wsolution": 0
},
"size": 0,
+ "condensed-compute-budget": "pass@1",
"note": "pass@1 w/ t=600s"
+ },
+ "InternLM 7B": {
+ "link": "https://github.com/InternLM/InternLM-Math",
+ "open-data": "NONE",
+ "num-solved": {
+ "lean-wsolution": 4
+ },
+ "size": 7,
+ "condensed-compute-budget": "pass@4096",
+ "note": "pass@2048 w/ T = 1.0 + pass@2048 w/ T = 0.7"
}
}
\ No newline at end of file
diff --git a/lean4/src/putnam_1965_a1.lean b/lean4/src/putnam_1965_a1.lean
index 2636b691..6ca5f0bc 100644
--- a/lean4/src/putnam_1965_a1.lean
+++ b/lean4/src/putnam_1965_a1.lean
@@ -11,7 +11,7 @@ theorem putnam_1965_a1
(A B C X Y : EuclideanSpace β (Fin 2))
(hABC : Β¬Collinear β {A, B, C})
(hangles : β C A B < β B C A β§ β B C A < Ο/2 β§ Ο/2 < β A B C)
-(hX : Collinear β {X, B, C} β§ β X A B = (Ο - β C A B)/2 β§ Euclidean.dist A X = Euclidean.dist A B)
-(hY : Collinear β {Y, C, A} β§ β Y B C = (Ο - β A B C)/2 β§ Euclidean.dist B Y = Euclidean.dist A B)
+(hX : Collinear β {X, B, C} β§ β X A B = (Ο - β C A B)/2 β§ (A 0 - X 0)^2 + (A 1 - X 1)^2 = (A 0 - B 0)^2 + (A 1 - B 1)^2)
+(hY : Collinear β {Y, C, A} β§ β Y B C = (Ο - β A B C)/2 β§ (B 0 - Y 0)^2 + (B 1 - Y 1)^2 = (A 0 - B 0)^2 + (A 1 - B 1)^2)
: β C A B = putnam_1965_a1_solution :=
sorry
diff --git a/lean4/src/putnam_2013_a5.lean b/lean4/src/putnam_2013_a5.lean
index 8b481331..e4f83fdb 100644
--- a/lean4/src/putnam_2013_a5.lean
+++ b/lean4/src/putnam_2013_a5.lean
@@ -1,7 +1,7 @@
import Mathlib
open BigOperators
-open Function Set
+open Function Set MeasureTheory
-- Note: uses (Fin m β Fin m β Fin m β β) instead of ensuring inputs are strictly increasing
theorem putnam_2013_a5
@@ -11,8 +11,8 @@ theorem putnam_2013_a5
(areadef2 : (Fin m β Fin m β Fin m β β) β Prop)
(areadef3 : (Fin m β Fin m β Fin m β β) β Prop)
(mge3 : m β₯ 3)
-(harea2 : β a b c : Fin 2 β β, area2 a b c = (MeasureTheory.volume (convexHull β {a, b, c})).toReal)
-(harea3 : β a b c : Fin 3 β β, area3 a b c = (MeasureTheory.volume (convexHull β {a, b, c})).toReal)
+(harea2 : β a b c : Fin 2 β β, area2 a b c = (volume (convexHull β {a, b, c})).toReal)
+(harea3 : β a b c : Fin 3 β β, area3 a b c = (ΞΌH[2] (convexHull β {a, b, c})).toReal)
(hareadef2 : β a : Fin m β Fin m β Fin m β β, areadef2 a = β A : Fin m β (Fin 2 β β), (β i : Fin m, β j : Fin m, β k : Fin m, if (i < j β§ j < k) then (a i j k * area2 (A i) (A j) (A k)) else 0) β₯ 0)
(hareadef3 : β a : Fin m β Fin m β Fin m β β, areadef3 a = β A : Fin m β (Fin 3 β β), (β i : Fin m, β j : Fin m, β k : Fin m, if (i < j β§ j < k) then (a i j k * area3 (A i) (A j) (A k)) else 0) β₯ 0)
: β a : Fin m β Fin m β Fin m β β, areadef2 a β areadef3 a :=