Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: Improve blake2 precompile #239

Merged
merged 7 commits into from
May 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -141,5 +141,5 @@ blake2_f_contd:
// stack: addr_(i+1), h_(i+1)', ..., h_7', kexit_info
%endrep

// stack: kexit_info
// stack: kexit_info
%jump(pop_and_return_success)
175 changes: 80 additions & 95 deletions evm_arithmetization/src/cpu/kernel/asm/hash/blake2/g_functions.asm
Original file line number Diff line number Diff line change
@@ -1,154 +1,137 @@
%macro blake2_g_function
%macro blake2_g_function(a, b, c, d)
// Function to mix two input words, x and y, into the four words indexed by a, b, c, d (which
// are in the range 0..16) in the internal state.
// The internal state is stored in memory starting at the address start.
// stack: a, b, c, d, x, y, start
DUP4
DUP4
DUP4
DUP4
// stack: a, b, c, d, a, b, c, d, x, y, start
// stack: x, y, start

// Precompute final addresses
PUSH $d DUP4 ADD
PUSH $c DUP5 ADD
PUSH $b DUP6 ADD
PUSH $a DUP7 ADD

// stack: addr_a, addr_b, addr_c, addr_d, x, y, start
PUSH $a
PUSH $c
PUSH $b
PUSH $d
// stack: d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
DUP11
// stack: start, a, b, c, d, a, b, c, d, x, y, start
// stack: start, d, b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
MLOAD_GENERAL
// stack: v[a], b, c, d, a, b, c, d, x, y, start
// stack: v[d], b, c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP1
// stack: b, v[a], c, d, a, b, c, d, x, y, start
// stack: b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
DUP11
// stack: start, b, v[a], c, d, a, b, c, d, x, y, start
// stack: start, b, v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
MLOAD_GENERAL
// stack: v[b], v[a], c, d, a, b, c, d, x, y, start
// stack: v[b], v[d], c, a, addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP2
// stack: c, v[a], v[b], d, a, b, c, d, x, y, start
// stack: c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
DUP11
// stack: start, c, v[a], v[b], d, a, b, c, d, x, y, start
// stack: start, c, v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
MLOAD_GENERAL
// stack: v[c], v[a], v[b], d, a, b, c, d, x, y, start
// stack: v[c], v[d], v[b], a, addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP3
// stack: d, v[a], v[b], v[c], a, b, c, d, x, y, start
// stack: a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
DUP11
// stack: start, d, v[a], v[b], v[c], a, b, c, d, x, y, start
// stack: start, a, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
MLOAD_GENERAL
// stack: v[d], v[a], v[b], v[c], a, b, c, d, x, y, start
%stack (vd, vs: 3) -> (vs, vd)
// stack: v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
DUP2
// stack: v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
// stack: v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
DUP3
// stack: v[b], v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
DUP10
// stack: x, v[b], v[a], v[b], v[c], v[d], a, b, c, d, x, y, start
// stack: x, v[b], v[a], v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
ADD
%as_u64
// stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[b], v[c], v[d], a, b, c, d, x, y, start
%stack (a, b, c, d) -> (a, d, a, b, c, d)
// stack: v[a]', v[d], v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start
// stack: v[a]' = (v[a] + v[b] + x) % 2^64, v[d], v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
%stack (a, d, b, c) -> (a, d, a, b, c)
// stack: v[a]', v[d], v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
XOR
%rotr_64(32)
// stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], v[d], a, b, c, d, x, y, start
%stack (top: 4, vd) -> (top)
// stack: v[d]', v[a]', v[b], v[c], a, b, c, d, x, y, start
// stack: v[d]' = (v[d] ^ v[a]') >>> 32, v[a]', v[b], v[c], addr_a, addr_b, addr_c, addr_d, x, y, start
%stack (d, a, b, c) -> (c, d, a, b, d)
// stack: v[c], v[d]', v[a]', v[b], v[d]', a, b, c, d, x, y, start
// stack: v[c], v[d]', v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
%as_u64
// stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', a, b, c, d, x, y, start
// stack: v[c]' = (v[c] + v[d]') % 2^64, v[a]', v[b], v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
%stack (c, a, b, d) -> (b, c, a, c, d)
// stack: v[b], v[c]', v[a]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: v[b], v[c]', v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
XOR
%rotr_64(24)
// stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: v[b]' = (v[b] ^ v[c]') >>> 24, v[a]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP1
// stack: v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
DUP2
// stack: v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
DUP11
// stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: y, v[b]', v[a]', v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
ADD
%as_u64
// stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', a, b, c, d, x, y, start
// stack: v[a]'' = (v[a]' + v[b]' + y) % 2^64, v[b]', v[c]', v[d]', addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP3
// stack: v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
// stack: v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
DUP4
// stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
// stack: v[a]'', v[d]', v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
XOR
%rotr_64(16)
// stack: v[d]'' = (v[a]'' ^ v[d]') >>> 8, v[b]', v[c]', v[a]'', a, b, c, d, x, y, start
// stack: v[d]'' = (v[a]'' ^ v[d]') >>> 16, v[b]', v[c]', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP2
// stack: v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
// stack: v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
DUP3
// stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
// stack: v[d]'', v[c]', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
ADD
%as_u64
// stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
// stack: v[c]'' = (v[c]' + v[d]'') % 2^64, v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
DUP1
// stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', a, b, c, d, x, y, start
// stack: v[c]'', v[c]'', v[b]', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
SWAP2
// stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start
// stack: v[b]', v[c]'', v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start
XOR
%rotr_64(63)
// stack: v[b]'' = (v[b]' ^ v[c]'') >>> 7, v[c]'', v[d]'', v[a]'', a, b, c, d, x, y, start
%stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (start, a, va, start, b, vb, start, c, vc, start, d, vd)
// stack: start, a, v[a]'', start, b, v[b]'', start, c, v[c]'', start, d, v[d]''
ADD
%swap_mstore
ADD
%swap_mstore
ADD
%swap_mstore
ADD
%swap_mstore
// stack: v[b]'' = (v[b]' ^ v[c]'') >>> 63, v[c]'', v[d]'', v[a]'', addr_a, addr_b, addr_c, addr_d, x, y, start

// Store resulting values at precomputed addresses
%stack (vb, vc, vd, va, a, b, c, d, x, y, start) -> (va, a, vb, b, vc, c, vd, d)
%rep 4
hratoanina marked this conversation as resolved.
Show resolved Hide resolved
MSTORE_GENERAL
%endrep
%endmacro

%macro call_blake2_g_function(a, b, c, d, x_idx, y_idx)
// stack: round, start
PUSH $y_idx
DUP2
// stack: round, y_idx, round, start
%blake2_permutation
// stack: s[y_idx], round, start
%blake2_message_addr
DUP1
// stack: message_addr, message_addr, start, round, start
PUSH $y_idx
DUP5
// stack: round, y_idx, message_addr, message_addr, start, round, start
%blake2_permutation
// stack: s[y_idx], message_addr, message_addr, start, round, start
ADD
MLOAD_GENERAL
// stack: m[s[y_idx]], round, start
// stack: m[s[y_idx]], message_addr, start, round, start
SWAP1
// stack: message_addr, m[s[y_idx]], start, round, start
PUSH $x_idx
DUP3
// stack: round, x_idx, m[s[y_idx]], round, start
DUP5
// stack: round, x_idx, message_addr, m[s[y_idx]], start, round, start
%blake2_permutation
// stack: s[x_idx], m[s[y_idx]], round, start
%blake2_message_addr
// stack: s[x_idx], message_addr, m[s[y_idx]], start, round, start
ADD
MLOAD_GENERAL
// stack: m[s[x_idx]], m[s[y_idx]], round, start
%stack (ss: 2, r, s) -> (ss, s, r, s)
// stack: m[s[x_idx]], m[s[y_idx]], start, round, start
PUSH $d
PUSH $c
PUSH $b
PUSH $a
// stack: a, b, c, d, m[s[x_idx]], m[s[y_idx]], start, round, start
%blake2_g_function
%blake2_g_function($a, $b, $c, $d)
// stack: round, start
%endmacro

// Applies eight G-function mixes to the internal state for a single round, then
// returns to the caller-supplied destination.
//
// Each %call_blake2_g_function(a, b, c, d, x_idx, y_idx) expects `round, start` on
// top of the stack and leaves `round, start` behind, so the eight calls chain
// without any stack fix-up in between.
//   - The first four calls use state indices (i, i+4, i+8, i+12) for i = 0..3.
//   - The last four calls use the index sets (0,5,10,15), (1,6,11,12), (2,7,8,13),
//     (3,4,9,14).
//   - The (x_idx, y_idx) pairs walk through 0..15 two at a time.
//
// Pre stack:  round, start, retdest
// Post stack: round, start   (retdest is consumed by the final JUMP)
run_g_function_round:
// stack: round, start, retdest
%call_blake2_g_function(0, 4, 8, 12, 0, 1)
%call_blake2_g_function(1, 5, 9, 13, 2, 3)
%call_blake2_g_function(2, 6, 10, 14, 4, 5)
%call_blake2_g_function(3, 7, 11, 15, 6, 7)
%call_blake2_g_function(0, 5, 10, 15, 8, 9)
%call_blake2_g_function(1, 6, 11, 12, 10, 11)
%call_blake2_g_function(2, 7, 8, 13, 12, 13)
%call_blake2_g_function(3, 4, 9, 14, 14, 15)
// Bring the return destination to the top so JUMP can consume it, keeping
// `round, start` underneath for the caller.
%stack (r, s, ret) -> (ret, r, s)
// stack: retdest, round, start
JUMP

global run_rounds_g_function:
// stack: current_round, start, rounds, retdest
DUP3
Expand All @@ -158,15 +141,17 @@ global run_rounds_g_function:
EQ
%jumpi(run_rounds_g_function_end)
// stack: current_round, start, rounds, retdest
PUSH run_rounds_g_function_return
// stack: run_rounds_g_function_return, current_round, start, rounds, retdest
%stack (ret, r, s) -> (r, s, ret)
// stack: current_round, start, run_rounds_g_function_return, rounds, retdest
%jump(run_g_function_round)
run_rounds_g_function_return:
// stack: round, start, rounds, retdest
%call_blake2_g_function(0, 4, 8, 12, 0, 1)
%call_blake2_g_function(1, 5, 9, 13, 2, 3)
%call_blake2_g_function(2, 6, 10, 14, 4, 5)
%call_blake2_g_function(3, 7, 11, 15, 6, 7)
%call_blake2_g_function(0, 5, 10, 15, 8, 9)
%call_blake2_g_function(1, 6, 11, 12, 10, 11)
%call_blake2_g_function(2, 7, 8, 13, 12, 13)
%call_blake2_g_function(3, 4, 9, 14, 14, 15)
// stack: current_round, start, rounds, retdest
%increment
// stack: round + 1, start, rounds, retdest
// stack: current_round + 1, start, rounds, retdest
%jump(run_rounds_g_function)
run_rounds_g_function_end:
// stack: current_round, start, rounds, retdest
Expand Down
9 changes: 3 additions & 6 deletions evm_arithmetization/src/cpu/kernel/asm/hash/blake2/hash.asm
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,11 @@ blake2_generate_new_hash_value:
MLOAD_GENERAL
// stack: h_i, i, retdest
%blake2_internal_state_addr
// stack: addr, h_i, i, retdest
DUP3
DUP1
// stack: addr, addr, h_i, i, retdest
DUP4
ADD
MLOAD_GENERAL
// stack: v_i, h_i, i, retdest
%blake2_internal_state_addr
// stack: addr, v_i, h_i, i, retdest
SWAP1
// stack: v_i, addr, h_i, i, retdest
SWAP3
// stack: i, addr, h_i, v_i, retdest
Expand Down
11 changes: 4 additions & 7 deletions evm_arithmetization/src/cpu/kernel/asm/hash/blake2/iv.asm
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,14 @@ global blake2_iv_const:

global blake2_iv:
// stack: i, retdest
PUSH blake2_iv_const
// stack: blake2_iv_const, i, retdest
SWAP1
// stack: i, blake2_iv_const, retdest
%mul_const(8)
PUSH blake2_iv_const
ADD
// stack: blake2_iv_const + 2 * i, retdest
// stack: blake2_iv_const + 8 * i, retdest
DUP1
// stack: blake2_iv_const + 2 * i, blake2_iv_const + 2 * i, retdest
// stack: blake2_iv_const + 8 * i, blake2_iv_const + 8 * i, retdest
%add_const(4)
// stack: blake2_iv_const + 2 * i + 1, blake2_iv_const + 2 * i, retdest
// stack: blake2_iv_const + 8 * i + 4, blake2_iv_const + 8 * i, retdest
%mload_kernel_code_u32
SWAP1
%mload_kernel_code_u32
Expand Down
12 changes: 6 additions & 6 deletions evm_arithmetization/src/cpu/kernel/asm/hash/blake2/ops.asm
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
// 64-bit right rotation
%macro rotr_64(rot)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Identical modification to the one done for SHA2, except this one is a 64-bit rotation instead of 32.

// stack: value
DUP1
// stack: value, value
PUSH $rot
// stack: rot, value
DUP2
DUP2
// stack: rot, value, rot, value
// stack: rot, value, value
SHR
// stack: value >> rot, rot, value
%stack (shifted, rot, value) -> (rot, value, shifted)
// stack: value >> rot, value
SWAP1
PUSH $rot
// stack: rot, value, value >> rot
PUSH 64
SUB
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,12 @@ global permutation_9_constants:

global blake2_permutation:
// stack: i, round, retdest
PUSH 10 // round_mod
PUSH permutation_0_constants
// stack: permutation_0_constants, i, round, retdest
SWAP2
// stack: round, i, permutation_0_constants, retdest
%mod_const(10)
// stack: permutation_0_constants, 10, i, round, retdest
SWAP3
// stack: round, 10, i, permutation_0_constants, retdest
MOD
// stack: round % 10, i, permutation_0_constants, retdest
%mul_const(16)
ADD
Expand Down
Loading