Merge develop into feat/continuations (#250)

* ci: add cargo audit job (#236) * fix: Revert interpreter stack display (#238) * Fix clippy doc_lazy_continuation (#247) * perf: Improve blake2 precompile (#239) * Speed-up some blake2 components * Some more optimizations * Cleanup * Endline * Some more speed-up * Fix comments * Clippy --------- Co-authored-by: Robin Salen <[email protected]>
0xPolygonZero · May 27, 2024 · a76f3b6 · a76f3b6
1 parent 824bb0e
commit a76f3b6
Show file tree

Hide file tree

Showing 12 changed files with 157 additions and 186 deletions.
diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml
@@ -0,0 +1,13 @@
+name: Security audit
+on:
+  push:
+    paths: 
+      - '**/Cargo.toml'
+jobs:
+  security_audit:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: rustsec/audit-check@v1.4.1
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/evm_arithmetization/src/arithmetic/addcy.rs b/evm_arithmetization/src/arithmetic/addcy.rs
@@ -84,8 +84,9 @@ const GOLDILOCKS_INVERSE_65536: u64 = 18446462594437939201;
 ///
 /// If `N_LIMBS = 1`, then this amounts to verifying that either `x_0
 /// + y_0 = z_0` or `x_0 + y_0 == z_0 + cy*2^16` (this is `t` on line
-/// 127ff). Ok. Now assume the constraints are valid for `N_LIMBS =
-/// n-1`. Then by induction,
+/// 127ff). Ok. Now assume the constraints are valid for `N_LIMBS = n-1`.
+///
+/// Then by induction,
 ///
 /// \sum_{i=0}^{n-1} (x_i + y_i) * 2^(16*i) + (x_n + y_n)*2^(16*n) ==
 /// \sum_{i=0}^{n-1} z_i * 2^(16*i) + cy_{n-1}*2^(16*n) + z_n*2^(16*n)

diff --git a/evm_arithmetization/src/cpu/decode.rs b/evm_arithmetization/src/cpu/decode.rs
@@ -8,20 +8,24 @@ use starky::constraint_consumer::{ConstraintConsumer, RecursiveConstraintConsume
 use crate::cpu::columns::{CpuColumnsView, COL_MAP};
 
 /// List of opcode blocks
-///  Each block corresponds to exactly one flag, and each flag corresponds to
+/// Each block corresponds to exactly one flag, and each flag corresponds to
 /// exactly one block.  Each block of opcodes:
+///
 /// - is contiguous,
 /// - has a length that is a power of 2, and
 /// - its start index is a multiple of its length (it is aligned).
-///  These properties permit us to check if an opcode belongs to a block of
+///
+/// These properties permit us to check if an opcode belongs to a block of
 /// length 2^n by checking its top 8-n bits.
-///  Additionally, each block can be made available only to the user, only to
+///
+/// Additionally, each block can be made available only to the user, only to
 /// the kernel, or to both. This is mainly useful for making some instructions
 /// kernel-only, while still decoding to invalid for the user. We do this by
 /// making one kernel-only block and another user-only block. The exception is
 /// the PANIC instruction which is user-only without a corresponding kernel
 /// block. This makes the proof unverifiable when PANIC is executed in kernel
 /// mode, which is the intended behavior.
+///
 /// Note: invalid opcodes are not represented here. _Any_ opcode is permitted to
 /// decode to `is_invalid`. The kernel then verifies that the opcode was
 /// _actually_ invalid.

diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm
@@ -141,5 +141,5 @@ blake2_f_contd:
         // stack: addr_(i+1), h_(i+1)', ..., h_7', kexit_info
     %endrep
 
-    // stack: kexit_info    
+    // stack: kexit_info
     %jump(pop_and_return_success)
diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm
@@ -1,154 +1,137 @@
-%macro blake2_g_function
+%macro blake2_g_function(a, b, c, d)
     // Function to mix two input words, x and y, into the four words indexed by a, b, c, d (which
     // are in the range 0..16) in the internal state.
     // The internal state is stored in memory starting at the address start.
-    // stack: a, b, c, d, x, y, start
-    DUP4
-    DUP4
-    DUP4
-    DUP4
-    // stack: a, b, c, d, a, b, c, d, x, y, start
+    // stack: x, y, start
+
+    // Precompute final addresses
+    PUSH $d DUP4 ADD
+    PUSH $c DUP5 ADD
+    PUSH $b DUP6 ADD
+    PUSH $a DUP7 ADD
+
+    // stack: addr_a, addr_b, addr_c, addr_d, x, y, start
+    PUSH $a
+    PUSH $c
+    PUSH $b
+    PUSH $d
+    // stack: d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP11
-    // stack: start, a, b, c, d, a, b, c, d, x, y, start
+    // stack: start, d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     MLOAD_GENERAL
-    // stack: v[a], b, c, d, a, b, c, d, x, y, start
+    // stack: v[d], b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP1
-    // stack: b, v[a], c, d, a, b, c, d, x, y, start
+    // stack: b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP11
-    // stack: start, b, v[a], c, d, a, b, c, d, x, y, start
+    // stack: start, b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     MLOAD_GENERAL
-    // stack: v[b], v[a], c, d, a, b, c, d, x, y, start
+    // stack: v[b], v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP2
-    // stack: c, v[a], v[b], d, a, b, c, d, x, y, start
+    // stack: c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP11
-    // stack: start, c, v[a], v[b], d, a, b, c, d, x, y, start
+    // stack: start, c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     MLOAD_GENERAL
-    // stack: v[c], v[a], v[b], d, a, b, c, d, x, y, start
+    // stack: v[c], v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP3
-    // stack: d, v[a], v[b], v[c], a, b, c, d, x, y, start
+    // stack: a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP11
-    // stack: start, d, v[a], v[b], v[c], a, b, c, d, x, y, start
+    // stack: start, a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     MLOAD_GENERAL
-    // stack: v[d], v[a], v[b], v[c], a, b, c, d, x, y, start
-    %stack (vd, vs: 3) -> (vs, vd)
-    // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
-    DUP2
-    // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
+    // stack: v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
+    DUP3
+    // stack: v[b], v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP10
-    // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
+    // stack: x, v[b], v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     ADD
     %as_u64
-    // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start
-    %stack (a, b, c, d) -> (a, d, a, b, c, d)
-    // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start
+    // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
+    %stack (a, d, b, c) -> (a, d, a, b, c)
+    // stack: v[a]', v[d], v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     XOR
     %rotr_64(32)
-    // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start
-    %stack (top: 4, vd) -> (top)
-    // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start
+    // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
     %stack (d, a, b, c) -> (c, d, a, b, d)
-    // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start
+    // stack: v[c], v[d]', v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     %as_u64
-    // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start
+    // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     %stack (c, a, b, d) -> (b, c, a, c, d)
-    // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: v[b], v[c]', v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     XOR
     %rotr_64(24)
-    // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP1
-    // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP2
-    // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP11
-    // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     ADD
     %as_u64
-    // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start
+    // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP3
-    // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
+    // stack: v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP4
-    // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
+    // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     XOR
     %rotr_64(16)
-    // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
+    // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 16, v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP2
-    // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
+    // stack: v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP3
-    // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
+    // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     ADD
     %as_u64
-    // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
+    // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     DUP1
-    // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
+    // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     SWAP2
-    // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start
+    // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
     XOR
     %rotr_64(63)
-    // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start
-    %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd)
-    // stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]''
-    ADD
-    %swap_mstore
-    ADD
-    %swap_mstore
-    ADD
-    %swap_mstore
-    ADD
-    %swap_mstore
+    // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 63, v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
+
+    // Store resulting values at precomputed addresses
+    %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (va, a, vb, b, vc, c, vd, d)
+    %rep 4
+        MSTORE_GENERAL
+    %endrep
 %endmacro
 
 %macro call_blake2_g_function(a, b, c, d, x_idx, y_idx)
     // stack: round, start
-    PUSH $y_idx
     DUP2
-    // stack: round, y_idx, round, start
-    %blake2_permutation
-    // stack: s[y_idx], round, start
     %blake2_message_addr
+    DUP1
+    // stack: message_addr, message_addr, start, round, start
+    PUSH $y_idx
+    DUP5
+    // stack: round, y_idx, message_addr, message_addr, start, round, start
+    %blake2_permutation
+    // stack: s[y_idx], message_addr, message_addr, start, round, start
     ADD
     MLOAD_GENERAL
-    // stack: m[s[y_idx]], round, start
+    // stack: m[s[y_idx]], message_addr, start, round, start
+    SWAP1
+    // stack: message_addr, m[s[y_idx]], start, round, start
     PUSH $x_idx
-    DUP3
-    // stack: round, 2, m[s[y_idx]], round, start
+    DUP5
+    // stack: round, x_idx, message_addr, m[s[y_idx]], start, round, start
     %blake2_permutation
-    // stack: s[x_idx], m[s[y_idx]], round, start
-    %blake2_message_addr
+    // stack: s[x_idx], message_addr, m[s[y_idx]], start, round, start
     ADD
     MLOAD_GENERAL
-    // stack: m[s[x_idx]], m[s[y_idx]], round, start
-    %stack (ss: 2, r, s) -> (ss, s, r, s)
     // stack: m[s[x_idx]], m[s[y_idx]], start, round, start
-    PUSH $d
-    PUSH $c
-    PUSH $b
-    PUSH $a
-    // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start
-    %blake2_g_function
+    %blake2_g_function($a, $b, $c, $d)
     // stack: round, start
 %endmacro
 
-run_g_function_round:
-    // stack: round, start, retdest
-    %call_blake2_g_function(0, 4, 8, 12, 0, 1)
-    %call_blake2_g_function(1, 5, 9, 13, 2, 3)
-    %call_blake2_g_function(2, 6, 10, 14, 4, 5)
-    %call_blake2_g_function(3, 7, 11, 15, 6, 7)
-    %call_blake2_g_function(0, 5, 10, 15, 8, 9)
-    %call_blake2_g_function(1, 6, 11, 12, 10, 11)
-    %call_blake2_g_function(2, 7, 8, 13, 12, 13)
-    %call_blake2_g_function(3, 4, 9, 14, 14, 15)
-    %stack (r, s, ret) -> (ret, r, s)
-    // stack: retdest, round, start
-    JUMP
-
 global run_rounds_g_function:
     // stack: current_round, start, rounds, retdest
     DUP3
@@ -158,15 +141,17 @@ global run_rounds_g_function:
     EQ
     %jumpi(run_rounds_g_function_end)
     // stack: current_round, start, rounds, retdest
-    PUSH run_rounds_g_function_return
-    // stack: run_rounds_g_function_return, current_round, start, rounds, retdest
-    %stack (ret, r, s) -> (r, s, ret)
-    // stack: current_round, start, run_rounds_g_function_return, rounds, retdest
-    %jump(run_g_function_round)
-run_rounds_g_function_return:
-    // stack: round, start, rounds, retdest
+    %call_blake2_g_function(0, 4, 8, 12, 0, 1)
+    %call_blake2_g_function(1, 5, 9, 13, 2, 3)
+    %call_blake2_g_function(2, 6, 10, 14, 4, 5)
+    %call_blake2_g_function(3, 7, 11, 15, 6, 7)
+    %call_blake2_g_function(0, 5, 10, 15, 8, 9)
+    %call_blake2_g_function(1, 6, 11, 12, 10, 11)
+    %call_blake2_g_function(2, 7, 8, 13, 12, 13)
+    %call_blake2_g_function(3, 4, 9, 14, 14, 15)
+    // stack: current_round, start, rounds, retdest
     %increment
-    // stack: round + 1, start, rounds, retdest
+    // stack: current_round + 1, start, rounds, retdest
     %jump(run_rounds_g_function)
 run_rounds_g_function_end:
     // stack: current_round, start, rounds, retdest

diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm
@@ -8,14 +8,11 @@ blake2_generate_new_hash_value:
     MLOAD_GENERAL
     // stack: h_i, i, retdest
     %blake2_internal_state_addr
-    // stack: addr, h_i, i, retdest
-    DUP3
+    DUP1
+    // stack: addr, addr, h_i, i, retdest
+    DUP4
     ADD
     MLOAD_GENERAL
-    // stack: v_i, h_i, i, retdest
-    %blake2_internal_state_addr
-    // stack: addr, v_i, h_i, i, retdest
-    SWAP1
     // stack: v_i, addr, h_i, i, retdest
     SWAP3
     // stack: i, addr, h_i, v_i, retdest

diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm
@@ -35,17 +35,14 @@ global blake2_iv_const:
 
 global blake2_iv:
     // stack: i, retdest
-    PUSH blake2_iv_const
-    // stack: blake2_iv_const, i, retdest
-    SWAP1
-    // stack: i, blake2_iv_const, retdest
     %mul_const(8)
+    PUSH blake2_iv_const
     ADD
-    // stack: blake2_iv_const + 2 * i, retdest
+    // stack: blake2_iv_const + 8 * i, retdest
     DUP1
-    // stack: blake2_iv_const + 2 * i, blake2_iv_const + 2 * i, retdest
+    // stack: blake2_iv_const + 8 * i, blake2_iv_const + 8 * i, retdest
     %add_const(4)
-    // stack: blake2_iv_const + 2 * i + 1, blake2_iv_const + 2 * i, retdest
+    // stack: blake2_iv_const + 8 * i + 4, blake2_iv_const + 8 * i, retdest
     %mload_kernel_code_u32
     SWAP1
     %mload_kernel_code_u32

diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm
@@ -1,14 +1,14 @@
 // 64-bit right rotation
 %macro rotr_64(rot)
     // stack: value
+    DUP1
+    // stack: value, value
     PUSH $rot
-    // stack: rot, value
-    DUP2
-    DUP2
-    // stack: rot, value, rot, value
+    // stack: rot, value, value
     SHR
-    // stack: value >> rot, rot, value
-    %stack (shifted, rot, value) -> (rot, value, shifted)
+    // stack: value >> rot, value
+    SWAP1
+    PUSH $rot
     // stack: rot, value, value >> rot
     PUSH 64
     SUB

diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm
@@ -60,11 +60,12 @@ global permutation_9_constants:
 
 global blake2_permutation:
     // stack: i, round, retdest
+    PUSH 10 // round_mod
     PUSH permutation_0_constants
-    // stack: permutation_0_constants, i, round, retdest
-    SWAP2
-    // stack: round, i, permutation_0_constants, retdest
-    %mod_const(10)
+    // stack: permutation_0_constants, 10, i, round, retdest
+    SWAP3
+    // stack: round, 10, i, permutation_0_constants, retdest
+    MOD
     // stack: round % 10, i, permutation_0_constants, retdest
     %mul_const(16)
     ADD