From d763cf6d585686fdc26b0c435ff5544e87e9d326 Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Tue, 21 May 2024 18:56:23 -0400 Subject: [PATCH 1/6] Speed-up some blake2 components --- .../kernel/asm/core/precompiles/blake2_f.asm | 2 +- .../kernel/asm/hash/blake2/g_functions.asm | 96 ++++++++----------- .../src/cpu/kernel/asm/hash/blake2/hash.asm | 9 +- .../src/cpu/kernel/asm/hash/blake2/iv.asm | 11 +-- .../src/cpu/kernel/asm/hash/blake2/ops.asm | 14 +-- .../kernel/asm/hash/blake2/permutations.asm | 9 +- 6 files changed, 61 insertions(+), 80 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm index 3a7498482..b54cf3b16 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/core/precompiles/blake2_f.asm @@ -141,5 +141,5 @@ blake2_f_contd: // stack: addr_(i+1), h_(i+1)', ..., h_7', kexit_info %endrep - // stack: kexit_info + // stack: kexit_info %jump(pop_and_return_success) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm index d521da6d8..613f0567f 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -3,38 +3,36 @@ // are in the range 0..16) in the internal state. // The internal state is stored in memory starting at the address start. 
// stack: a, b, c, d, x, y, start - DUP4 - DUP4 - DUP4 - DUP4 - // stack: a, b, c, d, a, b, c, d, x, y, start + DUP1 + DUP5 + DUP5 + DUP5 + // stack: b, c, d, a, a, b, c, d, x, y, start DUP11 - // stack: start, a, b, c, d, a, b, c, d, x, y, start + // stack: start, b, c, d, a, a, b, c, d, x, y, start ADD MLOAD_GENERAL - // stack: v[a], b, c, d, a, b, c, d, x, y, start + // stack: v[b], c, d, a, a, b, c, d, x, y, start SWAP1 - // stack: b, v[a], c, d, a, b, c, d, x, y, start + // stack: c, v[b], d, a, a, b, c, d, x, y, start DUP11 - // stack: start, b, v[a], c, d, a, b, c, d, x, y, start + // stack: start, c, v[b], c, d, a, b, c, d, x, y, start ADD MLOAD_GENERAL - // stack: v[b], v[a], c, d, a, b, c, d, x, y, start + // stack: v[c], v[b], d, a, a, b, c, d, x, y, start SWAP2 - // stack: c, v[a], v[b], d, a, b, c, d, x, y, start + // stack: d, v[b], v[c], a, a, b, c, d, x, y, start DUP11 - // stack: start, c, v[a], v[b], d, a, b, c, d, x, y, start + // stack: start, d, v[b], v[c], a, a, b, c, d, x, y, start ADD MLOAD_GENERAL - // stack: v[c], v[a], v[b], d, a, b, c, d, x, y, start + // stack: v[d], v[b], v[c], a, a, b, c, d, x, y, start SWAP3 - // stack: d, v[a], v[b], v[c], a, b, c, d, x, y, start + // stack: a, v[b], v[c], v[d], a, b, c, d, x, y, start DUP11 - // stack: start, d, v[a], v[b], v[c], a, b, c, d, x, y, start + // stack: start, a, v[b], v[c], v[d], a, b, c, d, x, y, start ADD MLOAD_GENERAL - // stack: v[d], v[a], v[b], v[c], a, b, c, d, x, y, start - %stack (vd, vs: 3) -> (vs, vd) // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start DUP2 // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start @@ -44,13 +42,11 @@ ADD %as_u64 // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start - %stack (a, b, c, d) -> (a, d, a, b, c, d) - // stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start + %stack (a, b, c, d) -> (a, d, a, b, c) + // stack: v[a]', v[d], v[a]', v[b], v[c], a, b, c, d, x, y, start XOR 
%rotr_64(32) - // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start - %stack (top: 4, vd) -> (top) - // stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start + // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], a, b, c, d, x, y, start %stack (d, a, b, c) -> (c, d, a, b, d) // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start ADD @@ -106,25 +102,27 @@ %macro call_blake2_g_function(a, b, c, d, x_idx, y_idx) // stack: round, start - PUSH $y_idx DUP2 - // stack: round, y_idx, round, start - %blake2_permutation - // stack: s[y_idx], round, start %blake2_message_addr + DUP1 + // stack: message_addr, start, round, start + PUSH $y_idx + DUP5 + // stack: round, y_idx, message_addr, message_addr, start, round, start + %blake2_permutation + // stack: s[y_idx], message_addr, message_addr, start, round, start ADD MLOAD_GENERAL - // stack: m[s[y_idx]], round, start + // stack: m[s[y_idx]], message_addr, start, round, start + SWAP1 + // stack: message_addr, m[s[y_idx]], start, round, start PUSH $x_idx - DUP3 - // stack: round, 2, m[s[y_idx]], round, start + DUP5 + // stack: round, x_idx, message_addr, m[s[y_idx]], start, round, start %blake2_permutation - // stack: s[x_idx], m[s[y_idx]], round, start - %blake2_message_addr + // stack: s[x_idx], message_addr, m[s[y_idx]], start, round, start ADD MLOAD_GENERAL - // stack: m[s[x_idx]], m[s[y_idx]], round, start - %stack (ss: 2, r, s) -> (ss, s, r, s) // stack: m[s[x_idx]], m[s[y_idx]], start, round, start PUSH $d PUSH $c @@ -135,20 +133,6 @@ // stack: round, start %endmacro -run_g_function_round: - // stack: round, start, retdest - %call_blake2_g_function(0, 4, 8, 12, 0, 1) - %call_blake2_g_function(1, 5, 9, 13, 2, 3) - %call_blake2_g_function(2, 6, 10, 14, 4, 5) - %call_blake2_g_function(3, 7, 11, 15, 6, 7) - %call_blake2_g_function(0, 5, 10, 15, 8, 9) - %call_blake2_g_function(1, 6, 11, 12, 10, 11) - %call_blake2_g_function(2, 7, 8, 13, 12, 13) - 
%call_blake2_g_function(3, 4, 9, 14, 14, 15) - %stack (r, s, ret) -> (ret, r, s) - // stack: retdest, round, start - JUMP - global run_rounds_g_function: // stack: current_round, start, rounds, retdest DUP3 @@ -158,15 +142,17 @@ global run_rounds_g_function: EQ %jumpi(run_rounds_g_function_end) // stack: current_round, start, rounds, retdest - PUSH run_rounds_g_function_return - // stack: run_rounds_g_function_return, current_round, start, rounds, retdest - %stack (ret, r, s) -> (r, s, ret) - // stack: current_round, start, run_rounds_g_function_return, rounds, retdest - %jump(run_g_function_round) -run_rounds_g_function_return: - // stack: round, start, rounds, retdest + %call_blake2_g_function(0, 4, 8, 12, 0, 1) + %call_blake2_g_function(1, 5, 9, 13, 2, 3) + %call_blake2_g_function(2, 6, 10, 14, 4, 5) + %call_blake2_g_function(3, 7, 11, 15, 6, 7) + %call_blake2_g_function(0, 5, 10, 15, 8, 9) + %call_blake2_g_function(1, 6, 11, 12, 10, 11) + %call_blake2_g_function(2, 7, 8, 13, 12, 13) + %call_blake2_g_function(3, 4, 9, 14, 14, 15) + // stack: current_round, start, rounds, retdest %increment - // stack: round + 1, start, rounds, retdest + // stack: current_round + 1, start, rounds, retdest %jump(run_rounds_g_function) run_rounds_g_function_end: // stack: current_round, start, rounds, retdest diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm index ab0d24763..9cab75b5a 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm @@ -8,14 +8,11 @@ blake2_generate_new_hash_value: MLOAD_GENERAL // stack: h_i, i, retdest %blake2_internal_state_addr - // stack: addr, h_i, i, retdest - DUP3 + DUP1 + // stack: addr, addr, h_i, i, retdest + DUP4 ADD MLOAD_GENERAL - // stack: v_i, h_i, i, retdest - %blake2_internal_state_addr - // stack: addr, v_i, h_i, i, retdest - SWAP1 // stack: v_i, addr, h_i, i, retdest SWAP3 
// stack: i, addr, h_i, v_i, retdest diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm index 72058ae4a..0394de916 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm @@ -35,17 +35,14 @@ global blake2_iv_const: global blake2_iv: // stack: i, retdest - PUSH blake2_iv_const - // stack: blake2_iv_const, i, retdest - SWAP1 - // stack: i, blake2_iv_const, retdest %mul_const(8) + PUSH blake2_iv_const ADD - // stack: blake2_iv_const + 2 * i, retdest + // stack: blake2_iv_const + 8 * i, retdest DUP1 - // stack: blake2_iv_const + 2 * i, blake2_iv_const + 2 * i, retdest + // stack: blake2_iv_const + 8 * i, blake2_iv_const + 8 * i, retdest %add_const(4) - // stack: blake2_iv_const + 2 * i + 1, blake2_iv_const + 2 * i, retdest + // stack: blake2_iv_const + 8 * i + 4, blake2_iv_const + 8 * i, retdest %mload_kernel_code_u32 SWAP1 %mload_kernel_code_u32 diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm index 2b40db7f6..65773c77d 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm @@ -1,14 +1,14 @@ // 64-bit right rotation %macro rotr_64(rot) // stack: value + DUP1 + // stack: value, value PUSH $rot - // stack: rot, value - DUP2 - DUP2 - // stack: rot, value, rot, value + // stack: rot, value, value SHR - // stack: value >> rot, rot, value - %stack (shifted, rot, value) -> (rot, value, shifted) + // stack: value >> rot, value + SWAP1 + PUSH $rot // stack: rot, value, value >> rot PUSH 64 SUB @@ -18,4 +18,4 @@ %as_u64 // stack: (value << (64 - rot)) % (1 << 64), value >> rot ADD -%endmacro +%endmacro \ No newline at end of file diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm 
b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm index 44070b7ae..f7b101ce6 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/permutations.asm @@ -60,11 +60,12 @@ global permutation_9_constants: global blake2_permutation: // stack: i, round, retdest + PUSH 10 // round_mod PUSH permutation_0_constants - // stack: permutation_0_constants, i, round, retdest - SWAP2 - // stack: round, i, permutation_0_constants, retdest - %mod_const(10) + // stack: permutation_0_constants, 10, i, round, retdest + SWAP3 + // stack: round, 10, i, permutation_0_constants, retdest + MOD // stack: round % 10, i, permutation_0_constants, retdest %mul_const(16) ADD From 3c2c049b6fee9b970578a578443a786014f3a5f3 Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Tue, 21 May 2024 19:02:10 -0400 Subject: [PATCH 2/6] Some more optimizations --- .../kernel/asm/hash/blake2/g_functions.asm | 110 +++++++++--------- 1 file changed, 54 insertions(+), 56 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm index 613f0567f..07a0491d8 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -1,103 +1,106 @@ -%macro blake2_g_function +%macro blake2_g_function(a, b, c, d) // Function to mix two input words, x and y, into the four words indexed by a, b, c, d (which // are in the range 0..16) in the internal state. // The internal state is stored in memory starting at the address start. 
- // stack: a, b, c, d, x, y, start - DUP1 - DUP5 - DUP5 - DUP5 - // stack: b, c, d, a, a, b, c, d, x, y, start + // stack: x, y, start + + // Precompute final addresses + PUSH $d DUP4 ADD + PUSH $c DUP5 ADD + PUSH $b DUP6 ADD + PUSH $a DUP7 ADD + + // stack: a_fin, b_fin, c_fin, d_fin, x, y, start + PUSH $a + PUSH $d + PUSH $c + PUSH $b + // stack: b, c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start DUP11 - // stack: start, b, c, d, a, a, b, c, d, x, y, start + // stack: start, b, c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start ADD MLOAD_GENERAL - // stack: v[b], c, d, a, a, b, c, d, x, y, start + // stack: v[b], c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start SWAP1 - // stack: c, v[b], d, a, a, b, c, d, x, y, start + // stack: c, v[b], d, a, a_fin, b_fin, c_fin, d_fin, x, y, start DUP11 - // stack: start, c, v[b], c, d, a, b, c, d, x, y, start + // stack: start, c, v[b], c, d, a_fin, b_fin, c_fin, d_fin, x, y, start ADD MLOAD_GENERAL - // stack: v[c], v[b], d, a, a, b, c, d, x, y, start + // stack: v[c], v[b], d, a, a_fin, b_fin, c_fin, d_fin, x, y, start SWAP2 - // stack: d, v[b], v[c], a, a, b, c, d, x, y, start + // stack: d, v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start DUP11 - // stack: start, d, v[b], v[c], a, a, b, c, d, x, y, start + // stack: start, d, v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start ADD MLOAD_GENERAL - // stack: v[d], v[b], v[c], a, a, b, c, d, x, y, start + // stack: v[d], v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start SWAP3 - // stack: a, v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: a, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start DUP11 - // stack: start, a, v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: start, a, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start ADD MLOAD_GENERAL - // stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start DUP2 - // stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, 
y, start + // stack: v[b], v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start DUP10 - // stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: x, v[b], v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start ADD ADD %as_u64 - // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start + // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start %stack (a, b, c, d) -> (a, d, a, b, c) - // stack: v[a]', v[d], v[a]', v[b], v[c], a, b, c, d, x, y, start + // stack: v[a]', v[d], v[a]', v[b], v[c], a_fin, b_fin, c_fin, d_fin, x, y, start XOR %rotr_64(32) - // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], a, b, c, d, x, y, start + // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], a_fin, b_fin, c_fin, d_fin, x, y, start %stack (d, a, b, c) -> (c, d, a, b, d) - // stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start + // stack: v[c], v[d]', v[a]', v[b], v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start ADD %as_u64 - // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start + // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start %stack (c, a, b, d) -> (b, c, a, c, d) - // stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b], v[c]', v[a]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start XOR %rotr_64(24) - // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start SWAP1 - // stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start DUP2 - // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start DUP11 - // stack: y, v[b]', 
v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start ADD ADD %as_u64 - // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start + // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start SWAP3 - // stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[d]', v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start DUP4 - // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start XOR %rotr_64(16) - // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start + // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start SWAP2 - // stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start DUP3 - // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start ADD %as_u64 - // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start DUP1 - // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start SWAP2 - // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start + // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start XOR %rotr_64(63) - // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start - %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd) - 
// stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]'' - ADD - %swap_mstore - ADD - %swap_mstore - ADD - %swap_mstore - ADD - %swap_mstore + // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (va, a, vb, b, vc, c, vd, d) + MSTORE_GENERAL + MSTORE_GENERAL + MSTORE_GENERAL + MSTORE_GENERAL %endmacro %macro call_blake2_g_function(a, b, c, d, x_idx, y_idx) @@ -124,12 +127,7 @@ ADD MLOAD_GENERAL // stack: m[s[x_idx]], m[s[y_idx]], start, round, start - PUSH $d - PUSH $c - PUSH $b - PUSH $a - // stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start - %blake2_g_function + %blake2_g_function($a, $b, $c, $d) // stack: round, start %endmacro From 425b394741a57a3da0f8c492a2d936ff09062bbc Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Tue, 21 May 2024 19:17:22 -0400 Subject: [PATCH 3/6] Cleanup --- .../kernel/asm/hash/blake2/g_functions.asm | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm index 07a0491d8..4c66a451f 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -10,97 +10,98 @@ PUSH $b DUP6 ADD PUSH $a DUP7 ADD - // stack: a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: addr_a, addr_b, addr_c, addr_d, x, y, start PUSH $a PUSH $d PUSH $c PUSH $b - // stack: b, c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: b, c, d, a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, b, c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: start, b, c, d, a, addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[b], c, d, a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b], c, d, a, addr_a, addr_b, 
addr_c, addr_d, x, y, start SWAP1 - // stack: c, v[b], d, a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: c, v[b], d, a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, c, v[b], c, d, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: start, c, v[b], c, d, addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[c], v[b], d, a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c], v[b], d, a, addr_a, addr_b, addr_c, addr_d, x, y, start SWAP2 - // stack: d, v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: d, v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, d, v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: start, d, v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[d], v[b], v[c], a, a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[d], v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start SWAP3 - // stack: a, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: a, v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, a, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: start, a, v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start DUP2 - // stack: v[b], v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b], v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start DUP10 - // stack: x, v[b], v[a], v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: x, v[b], v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start ADD ADD %as_u64 - // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], 
v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start %stack (a, b, c, d) -> (a, d, a, b, c) - // stack: v[a]', v[d], v[a]', v[b], v[c], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a]', v[d], v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(32) - // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start %stack (d, a, b, c) -> (c, d, a, b, d) - // stack: v[c], v[d]', v[a]', v[b], v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c], v[d]', v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start ADD %as_u64 - // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start %stack (c, a, b, d) -> (b, c, a, c, d) - // stack: v[b], v[c]', v[a]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b], v[c]', v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(24) - // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start SWAP1 - // stack: v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start DUP2 - // stack: v[b]', v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start ADD ADD %as_u64 - // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', 
a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start SWAP3 - // stack: v[d]', v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start DUP4 - // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(16) - // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start SWAP2 - // stack: v[c]', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start DUP3 - // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start ADD %as_u64 - // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start DUP1 - // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start SWAP2 - // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(63) - // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a_fin, b_fin, c_fin, d_fin, x, y, start + // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start + + // Store resulting values at 
precomputed addresses %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (va, a, vb, b, vc, c, vd, d) - MSTORE_GENERAL - MSTORE_GENERAL - MSTORE_GENERAL - MSTORE_GENERAL + %rep 4 + MSTORE_GENERAL + %endrep %endmacro %macro call_blake2_g_function(a, b, c, d, x_idx, y_idx) From 2ec17c28cf2cb665ec82f715671d209c58abe502 Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Tue, 21 May 2024 19:23:41 -0400 Subject: [PATCH 4/6] Endline --- evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm index 65773c77d..331863f90 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm @@ -18,4 +18,4 @@ %as_u64 // stack: (value << (64 - rot)) % (1 << 64), value >> rot ADD -%endmacro \ No newline at end of file +%endmacro From 900a7a26c708a7884512f79ffeb2e223b448a509 Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Wed, 22 May 2024 06:53:16 -0400 Subject: [PATCH 5/6] Some more speed-up --- .../kernel/asm/hash/blake2/g_functions.asm | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm index 4c66a451f..cf9f330e3 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -12,45 +12,45 @@ // stack: addr_a, addr_b, addr_c, addr_d, x, y, start PUSH $a - PUSH $d PUSH $c PUSH $b - // stack: b, c, d, a, addr_a, addr_b, addr_c, addr_d, x, y, start + PUSH $d + // stack: d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, b, c, d, a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: start, d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, 
y, start ADD MLOAD_GENERAL - // stack: v[b], c, d, a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[d], b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start SWAP1 - // stack: c, v[b], d, a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, c, v[b], c, d, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: start, b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[c], v[b], d, a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[b], v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start SWAP2 - // stack: d, v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, d, v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: start, c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[d], v[b], v[c], a, addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[c], v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start SWAP3 - // stack: a, v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start DUP11 - // stack: start, a, v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: start, a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start ADD MLOAD_GENERAL - // stack: v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start - DUP2 - // stack: v[b], v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start + DUP3 + // stack: v[b], v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start DUP10 - // stack: x, v[b], v[a], v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: x, v[b], v[a], v[d], v[b], v[c], addr_a, addr_b,
addr_c, addr_d, x, y, start ADD ADD %as_u64 - // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], addr_a, addr_b, addr_c, addr_d, x, y, start - %stack (a, b, c, d) -> (a, d, a, b, c) + // stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start + %stack (a, d, b, c) -> (a, d, a, b, c) // stack: v[a]', v[d], v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(32) From 09e9866947dd1ffd4ae09058d11e70970a3b9e63 Mon Sep 17 00:00:00 2001 From: Robin Salen Date: Thu, 23 May 2024 10:54:23 -0400 Subject: [PATCH 6/6] Fix comments --- .../src/cpu/kernel/asm/hash/blake2/g_functions.asm | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm index cf9f330e3..33bf5cb1e 100644 --- a/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm +++ b/evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm @@ -81,7 +81,7 @@ // stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(16) - // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[d]'' = (v[a]'' ^ v[d]') >>> 16, v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start SWAP2 // stack: v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start DUP3 @@ -95,7 +95,7 @@ // stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start XOR %rotr_64(63) - // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start + // stack: v[b]'' = (v[b]' ^ v[c]'') >>> 63, v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start // Store resulting values at precomputed addresses %stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (va, a, vb, b, vc, c, vd, d) @@ -109,7 +109,7 @@ 
DUP2 %blake2_message_addr DUP1 - // stack: message_addr, start, round, start + // stack: message_addr, message_addr, start, round, start PUSH $y_idx DUP5 // stack: round, y_idx, message_addr, message_addr, start, round, start