diff --git a/common.vhdl b/common.vhdl index 14a8801d7..cc49e8f23 100644 --- a/common.vhdl +++ b/common.vhdl @@ -86,35 +86,24 @@ package common is -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); - -- Extended GPR index (can hold an SPR or a FPR) - subtype gspr_index_t is std_ulogic_vector(6 downto 0); + -- Extended GPR index (can hold a GPR or a FPR) + subtype gspr_index_t is std_ulogic_vector(5 downto 0); -- FPR indices subtype fpr_index_t is std_ulogic_vector(4 downto 0); - -- Some SPRs are stored in the register file, they use the magic - -- GPR numbers above 31. + -- FPRs are stored in the register file, using GSPR + -- numbers from 32 to 63. -- - -- The function fast_spr_num() returns the corresponding fast - -- pseudo-GPR number for a given SPR number. The result MSB - -- indicates if this is indeed a fast SPR. If clear, then - -- the SPR is not stored in the GPR file. - -- - -- FPRs are also stored in the register file, using GSPR - -- numbers from 64 to 95. - -- - function fast_spr_num(spr: spr_num_t) return gspr_index_t; -- Indices conversion functions function gspr_to_gpr(i: gspr_index_t) return gpr_index_t; function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; - function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; - function is_fast_spr(s: gspr_index_t) return std_ulogic; function fpr_to_gspr(f: fpr_index_t) return gspr_index_t; -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are -- in the CR file as a kind of CR extension (with a separate write - -- control). The rest is stored as a fast SPR. + -- control). The rest is stored in ctrl_t (effectively in execute1). 
type xer_common_t is record ca : std_ulogic; ca32 : std_ulogic; @@ -124,6 +113,48 @@ package common is end record; constant xerc_init : xer_common_t := (others => '0'); + -- Some SPRs are stored in a pair of small RAMs in execute1 + -- Even half: + subtype ramspr_index is natural range 0 to 7; + constant RAMSPR_SRR0 : ramspr_index := 0; + constant RAMSPR_HSRR0 : ramspr_index := 1; + constant RAMSPR_SPRG0 : ramspr_index := 2; + constant RAMSPR_SPRG2 : ramspr_index := 3; + constant RAMSPR_HSPRG0 : ramspr_index := 4; + constant RAMSPR_LR : ramspr_index := 5; -- must equal RAMSPR_CTR + constant RAMSPR_TAR : ramspr_index := 6; + -- Odd half: + constant RAMSPR_SRR1 : ramspr_index := 0; + constant RAMSPR_HSRR1 : ramspr_index := 1; + constant RAMSPR_SPRG1 : ramspr_index := 2; + constant RAMSPR_SPRG3 : ramspr_index := 3; + constant RAMSPR_HSPRG1 : ramspr_index := 4; + constant RAMSPR_CTR : ramspr_index := 5; -- must equal RAMSPR_LR + + type ram_spr_info is record + index : ramspr_index; + isodd : std_ulogic; + valid : std_ulogic; + end record; + constant ram_spr_info_init: ram_spr_info := (index => 0, others => '0'); + + subtype spr_selector is std_ulogic_vector(2 downto 0); + type spr_id is record + sel : spr_selector; + valid : std_ulogic; + ispmu : std_ulogic; + end record; + constant spr_id_init : spr_id := (sel => "000", others => '0'); + + constant SPRSEL_TB : spr_selector := 3x"0"; + constant SPRSEL_TBU : spr_selector := 3x"1"; + constant SPRSEL_DEC : spr_selector := 3x"2"; + constant SPRSEL_PVR : spr_selector := 3x"3"; + constant SPRSEL_LOGA : spr_selector := 3x"4"; + constant SPRSEL_LOGD : spr_selector := 3x"5"; + constant SPRSEL_CFAR : spr_selector := 3x"6"; + constant SPRSEL_XER : spr_selector := 3x"7"; + -- FPSCR bit numbers constant FPSCR_FX : integer := 63 - 32; constant FPSCR_FEX : integer := 63 - 33; @@ -192,7 +223,10 @@ package common is dec: std_ulogic_vector(63 downto 0); msr: std_ulogic_vector(63 downto 0); cfar: std_ulogic_vector(63 downto 0); + 
xer_low: std_ulogic_vector(17 downto 0); end record; + constant ctrl_t_init : ctrl_t := + (xer_low => 18x"0", others => (others => '0')); type Fetch1ToIcacheType is record req: std_ulogic; @@ -226,23 +260,35 @@ package common is stop_mark : std_ulogic; nia: std_ulogic_vector(63 downto 0); insn: std_ulogic_vector(31 downto 0); - ispr1: gspr_index_t; -- (G)SPR used for branch condition (CTR) or mfspr - ispr2: gspr_index_t; -- (G)SPR used for branch target (CTR, LR, TAR) - ispro: gspr_index_t; -- (G)SPR written with LR or CTR decode: decode_rom_t; br_pred: std_ulogic; -- Branch was predicted to be taken big_endian: std_ulogic; + spr_info : spr_id; + ram_spr : ram_spr_info; + reg_a : gspr_index_t; + reg_b : gspr_index_t; + reg_c : gspr_index_t; end record; constant Decode1ToDecode2Init : Decode1ToDecode2Type := (valid => '0', stop_mark => '0', nia => (others => '0'), insn => (others => '0'), - ispr1 => (others => '0'), ispr2 => (others => '0'), ispro => (others => '0'), - decode => decode_rom_init, br_pred => '0', big_endian => '0'); + decode => decode_rom_init, br_pred => '0', big_endian => '0', + spr_info => spr_id_init, ram_spr => ram_spr_info_init, + reg_a => (others => '0'), reg_b => (others => '0'), reg_c => (others => '0')); type Decode1ToFetch1Type is record redirect : std_ulogic; redirect_nia : std_ulogic_vector(63 downto 0); end record; + type Decode1ToRegisterFileType is record + reg_1_addr : gspr_index_t; + reg_2_addr : gspr_index_t; + reg_3_addr : gspr_index_t; + read_1_enable : std_ulogic; + read_2_enable : std_ulogic; + read_3_enable : std_ulogic; + end record; + type bypass_data_t is record tag : instr_tag_t; data : std_ulogic_vector(63 downto 0); @@ -266,6 +312,7 @@ package common is write_reg_enable: std_ulogic; read_reg1: gspr_index_t; read_reg2: gspr_index_t; + read_reg3: gspr_index_t; read_data1: std_ulogic_vector(63 downto 0); read_data2: std_ulogic_vector(63 downto 0); read_data3: std_ulogic_vector(63 downto 0); @@ -276,7 +323,6 @@ package common 
is rc: std_ulogic; oe: std_ulogic; invert_a: std_ulogic; - addm1 : std_ulogic; invert_out: std_ulogic; input_carry: carry_in_t; output_carry: std_ulogic; @@ -296,11 +342,21 @@ package common is sub_select : std_ulogic_vector(2 downto 0); -- sub-result selection repeat : std_ulogic; -- set if instruction is cracked into two ops second : std_ulogic; -- set if this is the second op + spr_select : spr_id; + spr_is_ram : std_ulogic; + ramspr_even_rdaddr : ramspr_index; + ramspr_odd_rdaddr : ramspr_index; + ramspr_rd_odd : std_ulogic; + ramspr_wraddr : ramspr_index; + ramspr_write_even : std_ulogic; + ramspr_write_odd : std_ulogic; + dbg_spr_access : std_ulogic; + dec_ctr : std_ulogic; end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', unit => NONE, fac => NONE, insn_type => OP_ILLEGAL, instr_tag => instr_tag_init, write_reg_enable => '0', - lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', addm1 => '0', + lr => '0', br_abs => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', output_xer => '0', is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', br_pred => '0', @@ -308,7 +364,13 @@ package common is read_data1 => (others => '0'), read_data2 => (others => '0'), read_data3 => (others => '0'), cr => (others => '0'), insn => (others => '0'), data_len => (others => '0'), result_sel => "000", sub_select => "000", - repeat => '0', second => '0', others => (others => '0')); + repeat => '0', second => '0', spr_select => spr_id_init, + spr_is_ram => '0', + ramspr_even_rdaddr => 0, ramspr_odd_rdaddr => 0, ramspr_rd_odd => '0', + ramspr_wraddr => 0, ramspr_write_even => '0', ramspr_write_odd => '0', + dbg_spr_access => '0', + dec_ctr => '0', + others => (others => '0')); type MultiplyInputType is record valid: std_ulogic; @@ -332,6 +394,7 @@ package common is type Execute1ToDividerType is record valid: std_ulogic; + flush: 
std_ulogic; dividend: std_ulogic_vector(63 downto 0); divisor: std_ulogic_vector(63 downto 0); is_signed: std_ulogic; @@ -340,9 +403,8 @@ package common is is_modulus: std_ulogic; neg_result: std_ulogic; end record; - constant Execute1ToDividerInit: Execute1ToDividerType := (valid => '0', is_signed => '0', is_32bit => '0', - is_extended => '0', is_modulus => '0', - neg_result => '0', others => (others => '0')); + constant Execute1ToDividerInit: Execute1ToDividerType := ( + dividend => 64x"0", divisor => 64x"0", others => '0'); type PMUEventType is record no_instr_avail : std_ulogic; @@ -391,11 +453,8 @@ package common is type Decode2ToRegisterFileType is record read1_enable : std_ulogic; - read1_reg : gspr_index_t; read2_enable : std_ulogic; - read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : gspr_index_t; end record; type RegisterFileToDecode2Type is record @@ -437,6 +496,7 @@ package common is is_32bit : std_ulogic; repeat : std_ulogic; second : std_ulogic; + e2stall : std_ulogic; msr : std_ulogic_vector(63 downto 0); end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := @@ -449,13 +509,12 @@ package common is write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', is_32bit => '0', - repeat => '0', second => '0', + repeat => '0', second => '0', e2stall => '0', msr => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; - in_progress : std_ulogic; - interrupt : std_ulogic; + l2stall : std_ulogic; end record; type Loadstore1ToDcacheType is record @@ -498,7 +557,9 @@ package common is iside : std_ulogic; load : std_ulogic; priv : std_ulogic; - sprn : std_ulogic_vector(9 downto 0); + ric : std_ulogic_vector(1 downto 0); + sprnf : std_ulogic; + sprnt : std_ulogic; addr : std_ulogic_vector(63 downto 0); rs : std_ulogic_vector(63 downto 0); end record; @@ -549,7 +610,6 @@ package common is store_done : std_ulogic; interrupt : std_ulogic; intr_vec : intr_vector_t; - srr0: 
std_ulogic_vector(63 downto 0); srr1: std_ulogic_vector(15 downto 0); end record; constant Loadstore1ToWritebackInit : Loadstore1ToWritebackType := @@ -557,7 +617,7 @@ package common is write_reg => (others => '0'), write_data => (others => '0'), xerc => xerc_init, rc => '0', store_done => '0', interrupt => '0', intr_vec => 0, - srr0 => (others => '0'), srr1 => (others => '0')); + srr1 => (others => '0')); type Loadstore1EventType is record load_complete : std_ulogic; @@ -602,29 +662,37 @@ package common is srr1 => (others => '0'), msr => (others => '0')); type Execute1ToFPUType is record - valid : std_ulogic; - op : insn_type_t; - nia : std_ulogic_vector(63 downto 0); - itag : instr_tag_t; - insn : std_ulogic_vector(31 downto 0); - single : std_ulogic; - fe_mode : std_ulogic_vector(1 downto 0); - fra : std_ulogic_vector(63 downto 0); - frb : std_ulogic_vector(63 downto 0); - frc : std_ulogic_vector(63 downto 0); - frt : gspr_index_t; - rc : std_ulogic; - out_cr : std_ulogic; + valid : std_ulogic; + op : insn_type_t; + nia : std_ulogic_vector(63 downto 0); + itag : instr_tag_t; + insn : std_ulogic_vector(31 downto 0); + single : std_ulogic; + is_signed : std_ulogic; + fe_mode : std_ulogic_vector(1 downto 0); + fra : std_ulogic_vector(63 downto 0); + frb : std_ulogic_vector(63 downto 0); + frc : std_ulogic_vector(63 downto 0); + frt : gspr_index_t; + rc : std_ulogic; + m32b : std_ulogic; + out_cr : std_ulogic; + oe : std_ulogic; + xerc : xer_common_t; + stall : std_ulogic; end record; constant Execute1ToFPUInit : Execute1ToFPUType := (valid => '0', op => OP_ILLEGAL, nia => (others => '0'), itag => instr_tag_init, - insn => (others => '0'), fe_mode => "00", rc => '0', + insn => (others => '0'), fe_mode => "00", rc => '0', fra => (others => '0'), frb => (others => '0'), frc => (others => '0'), frt => (others => '0'), - single => '0', out_cr => '0'); + single => '0', is_signed => '0', out_cr => '0', + m32b => '0', oe => '0', xerc => xerc_init, + stall => '0'); type 
FPUToExecute1Type is record busy : std_ulogic; + f2stall : std_ulogic; exception : std_ulogic; end record; constant FPUToExecute1Init : FPUToExecute1Type := (others => '0'); @@ -639,8 +707,9 @@ package common is write_cr_enable : std_ulogic; write_cr_mask : std_ulogic_vector(7 downto 0); write_cr_data : std_ulogic_vector(31 downto 0); + write_xerc : std_ulogic; + xerc : xer_common_t; intr_vec : intr_vector_t; - srr0 : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); end record; constant FPUToWritebackInit : FPUToWritebackType := @@ -648,6 +717,7 @@ package common is write_enable => '0', write_reg => (others => '0'), write_cr_enable => '0', write_cr_mask => (others => '0'), write_cr_data => (others => '0'), + write_xerc => '0', xerc => xerc_init, intr_vec => 0, srr1 => (others => '0'), others => (others => '0')); @@ -695,6 +765,11 @@ package common is write_cr_mask => (others => '0'), write_cr_data => (others => '0')); + type WritebackToExecute1Type is record + intr : std_ulogic; + srr1 : std_ulogic_vector(15 downto 0); + end record; + type WritebackEventType is record instr_complete : std_ulogic; fp_complete : std_ulogic; @@ -707,49 +782,6 @@ package body common is begin return to_integer(unsigned(insn(15 downto 11) & insn(20 downto 16))); end; - function fast_spr_num(spr: spr_num_t) return gspr_index_t is - variable n : integer range 0 to 31; - -- tmp variable introduced as workaround for VCS compilation - -- simulation was failing with subtype constraint mismatch error - -- see GitHub PR #173 - variable tmp : std_ulogic_vector(4 downto 0); - begin - case spr is - when SPR_LR => - n := 0; -- N.B. decode2 relies on this specific value - when SPR_CTR => - n := 1; -- N.B. 
decode2 relies on this specific value - when SPR_SRR0 => - n := 2; - when SPR_SRR1 => - n := 3; - when SPR_HSRR0 => - n := 4; - when SPR_HSRR1 => - n := 5; - when SPR_SPRG0 => - n := 6; - when SPR_SPRG1 => - n := 7; - when SPR_SPRG2 => - n := 8; - when SPR_SPRG3 | SPR_SPRG3U => - n := 9; - when SPR_HSPRG0 => - n := 10; - when SPR_HSPRG1 => - n := 11; - when SPR_XER => - n := 12; - when SPR_TAR => - n := 13; - when others => - n := 0; - return "0000000"; - end case; - tmp := std_ulogic_vector(to_unsigned(n, 5)); - return "01" & tmp; - end; function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is begin @@ -758,26 +790,12 @@ package body common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is begin - return "00" & i; - end; - - function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is - begin - if s(5) = '1' then - return s; - else - return gpr_to_gspr(g); - end if; - end; - - function is_fast_spr(s: gspr_index_t) return std_ulogic is - begin - return s(5); + return "0" & i; end; function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is begin - return "10" & f; + return "1" & f; end; function tag_match(tag1 : instr_tag_t; tag2 : instr_tag_t) return boolean is diff --git a/control.vhdl b/control.vhdl index 1d5551782..e8c8068fb 100644 --- a/control.vhdl +++ b/control.vhdl @@ -15,11 +15,9 @@ entity control is complete_in : in instr_tag_t; valid_in : in std_ulogic; - repeated : in std_ulogic; flush_in : in std_ulogic; - busy_in : in std_ulogic; deferred : in std_ulogic; - sgl_pipe_in : in std_ulogic; + serialize : in std_ulogic; stop_mark_in : in std_ulogic; gpr_write_valid_in : in std_ulogic; @@ -36,42 +34,38 @@ entity control is execute_next_tag : in instr_tag_t; execute_next_cr_tag : in instr_tag_t; + execute2_next_tag : in instr_tag_t; + execute2_next_cr_tag : in instr_tag_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; + ov_read_in : in std_ulogic; + ov_write_in : in std_ulogic; valid_out : out std_ulogic; - 
stall_out : out std_ulogic; stopped_out : out std_ulogic; - gpr_bypass_a : out std_ulogic; - gpr_bypass_b : out std_ulogic; - gpr_bypass_c : out std_ulogic; - cr_bypass : out std_ulogic; + gpr_bypass_a : out std_ulogic_vector(1 downto 0); + gpr_bypass_b : out std_ulogic_vector(1 downto 0); + gpr_bypass_c : out std_ulogic_vector(1 downto 0); + cr_bypass : out std_ulogic_vector(1 downto 0); instr_tag_out : out instr_tag_t ); end entity control; architecture rtl of control is - type state_type is (IDLE, WAIT_FOR_PREV_TO_COMPLETE, WAIT_FOR_CURR_TO_COMPLETE); - - type reg_internal_type is record - state : state_type; - outstanding : integer range -1 to PIPELINE_DEPTH+2; - end record; - constant reg_internal_init : reg_internal_type := (state => IDLE, outstanding => 0); - - signal r_int, rin_int : reg_internal_type := reg_internal_init; - signal gpr_write_valid : std_ulogic; signal cr_write_valid : std_ulogic; + signal ov_write_valid : std_ulogic; type tag_register is record wr_gpr : std_ulogic; reg : gspr_index_t; recent : std_ulogic; wr_cr : std_ulogic; + wr_ov : std_ulogic; + valid : std_ulogic; end record; type tag_regs_array is array(tag_number_t) of tag_register; @@ -81,30 +75,37 @@ architecture rtl of control is signal gpr_tag_stall : std_ulogic; signal cr_tag_stall : std_ulogic; + signal ov_tag_stall : std_ulogic; + signal serial_stall : std_ulogic; signal curr_tag : tag_number_t; signal next_tag : tag_number_t; signal curr_cr_tag : tag_number_t; + signal curr_ov_tag : tag_number_t; + signal prev_tag : tag_number_t; begin control0: process(clk) begin if rising_edge(clk) then - assert rin_int.outstanding >= 0 and rin_int.outstanding <= (PIPELINE_DEPTH+1) - report "Outstanding bad " & integer'image(rin_int.outstanding) severity failure; - r_int <= rin_int; for i in tag_number_t loop if rst = '1' or flush_in = '1' then tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).wr_ov <= '0'; + tag_regs(i).valid <= '0'; else if complete_in.valid = '1' and i = 
complete_in.tag then + assert tag_regs(i).valid = '1' report "spurious completion" severity failure; tag_regs(i).wr_gpr <= '0'; tag_regs(i).wr_cr <= '0'; + tag_regs(i).wr_ov <= '0'; + tag_regs(i).valid <= '0'; report "tag " & integer'image(i) & " not valid"; end if; - if gpr_write_valid = '1' and tag_regs(i).reg = gpr_write_in then + if instr_tag.valid = '1' and gpr_write_valid = '1' and + tag_regs(i).reg = gpr_write_in then tag_regs(i).recent <= '0'; if tag_regs(i).recent = '1' and tag_regs(i).wr_gpr = '1' then report "tag " & integer'image(i) & " not recent"; @@ -115,6 +116,8 @@ begin tag_regs(i).reg <= gpr_write_in; tag_regs(i).recent <= gpr_write_valid; tag_regs(i).wr_cr <= cr_write_valid; + tag_regs(i).wr_ov <= ov_write_valid; + tag_regs(i).valid <= '1'; if gpr_write_valid = '1' then report "tag " & integer'image(i) & " valid for gpr " & to_hstring(gpr_write_in); end if; @@ -124,11 +127,19 @@ begin if rst = '1' then curr_tag <= 0; curr_cr_tag <= 0; + curr_ov_tag <= 0; + prev_tag <= 0; else curr_tag <= next_tag; - if cr_write_valid = '1' then + if instr_tag.valid = '1' and cr_write_valid = '1' then curr_cr_tag <= instr_tag.tag; end if; + if instr_tag.valid = '1' and ov_write_valid = '1' then + curr_ov_tag <= instr_tag.tag; + end if; + if valid_out = '1' then + prev_tag <= instr_tag.tag; + end if; end if; end if; end process; @@ -141,11 +152,13 @@ begin variable tag_s : instr_tag_t; variable tag_t : instr_tag_t; variable incr_tag : tag_number_t; - variable byp_a : std_ulogic; - variable byp_b : std_ulogic; - variable byp_c : std_ulogic; + variable byp_a : std_ulogic_vector(1 downto 0); + variable byp_b : std_ulogic_vector(1 downto 0); + variable byp_c : std_ulogic_vector(1 downto 0); variable tag_cr : instr_tag_t; - variable byp_cr : std_ulogic; + variable byp_cr : std_ulogic_vector(1 downto 0); + variable tag_ov : instr_tag_t; + variable tag_prev : instr_tag_t; begin tag_a := instr_tag_init; for i in tag_number_t loop @@ -154,9 +167,6 @@ begin tag_a.tag := i; 
end if; end loop; - if tag_match(tag_a, complete_in) then - tag_a.valid := '0'; - end if; tag_b := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_b_read_in then @@ -164,9 +174,6 @@ begin tag_b.tag := i; end if; end loop; - if tag_match(tag_b, complete_in) then - tag_b.valid := '0'; - end if; tag_c := instr_tag_init; for i in tag_number_t loop if tag_regs(i).wr_gpr = '1' and tag_regs(i).recent = '1' and tag_regs(i).reg = gpr_c_read_in then @@ -174,30 +181,39 @@ begin tag_c.tag := i; end if; end loop; - if tag_match(tag_c, complete_in) then - tag_c.valid := '0'; - end if; - byp_a := '0'; + byp_a := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_a) then - byp_a := '1'; + byp_a := "01"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_a) then + byp_a := "10"; + elsif tag_match(complete_in, tag_a) then + byp_a := "11"; end if; - byp_b := '0'; + byp_b := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_b) then - byp_b := '1'; + byp_b := "01"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_b) then + byp_b := "10"; + elsif tag_match(complete_in, tag_b) then + byp_b := "11"; end if; - byp_c := '0'; + byp_c := "00"; if EX1_BYPASS and tag_match(execute_next_tag, tag_c) then - byp_c := '1'; + byp_c := "01"; + elsif EX1_BYPASS and tag_match(execute2_next_tag, tag_c) then + byp_c := "10"; + elsif tag_match(complete_in, tag_c) then + byp_c := "11"; end if; gpr_bypass_a <= byp_a; gpr_bypass_b <= byp_b; gpr_bypass_c <= byp_c; - gpr_tag_stall <= (tag_a.valid and not byp_a) or - (tag_b.valid and not byp_b) or - (tag_c.valid and not byp_c); + gpr_tag_stall <= (tag_a.valid and not (or (byp_a))) or + (tag_b.valid and not (or (byp_b))) or + (tag_c.valid and not (or (byp_c))); incr_tag := curr_tag; instr_tag.tag <= curr_tag; @@ -214,115 +230,59 @@ begin if tag_match(tag_cr, complete_in) then tag_cr.valid := '0'; end if; - byp_cr := '0'; + byp_cr := "00"; if EX1_BYPASS and 
tag_match(execute_next_cr_tag, tag_cr) then - byp_cr := '1'; + byp_cr := "10"; + elsif EX1_BYPASS and tag_match(execute2_next_cr_tag, tag_cr) then + byp_cr := "11"; end if; cr_bypass <= byp_cr; - cr_tag_stall <= tag_cr.valid and not byp_cr; + cr_tag_stall <= tag_cr.valid and not byp_cr(1); + + -- OV hazards + tag_ov.tag := curr_ov_tag; + tag_ov.valid := ov_read_in and tag_regs(curr_ov_tag).wr_ov; + if tag_match(tag_ov, complete_in) then + tag_ov.valid := '0'; + end if; + ov_tag_stall <= tag_ov.valid; + + tag_prev.tag := prev_tag; + tag_prev.valid := tag_regs(prev_tag).valid; + if tag_match(tag_prev, complete_in) then + tag_prev.valid := '0'; + end if; + serial_stall <= tag_prev.valid; end process; control1 : process(all) - variable v_int : reg_internal_type; variable valid_tmp : std_ulogic; - variable stall_tmp : std_ulogic; begin - v_int := r_int; - -- asynchronous valid_tmp := valid_in and not flush_in; - stall_tmp := '0'; - - if flush_in = '1' then - v_int.outstanding := 0; - elsif complete_in.valid = '1' then - v_int.outstanding := r_int.outstanding - 1; - end if; - if r_int.outstanding >= PIPELINE_DEPTH + 1 then - valid_tmp := '0'; - stall_tmp := '1'; - end if; if rst = '1' then gpr_write_valid <= '0'; cr_write_valid <= '0'; - v_int := reg_internal_init; valid_tmp := '0'; end if; -- Handle debugger stop - stopped_out <= '0'; - if stop_mark_in = '1' and v_int.outstanding = 0 then - stopped_out <= '1'; - end if; - - -- state machine to handle instructions that must be single - -- through the pipeline. 
- case r_int.state is - when IDLE => - if valid_tmp = '1' then - if (sgl_pipe_in = '1') then - if v_int.outstanding /= 0 then - v_int.state := WAIT_FOR_PREV_TO_COMPLETE; - stall_tmp := '1'; - else - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - end if; - else - -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_tag_stall; - end if; - end if; - - when WAIT_FOR_PREV_TO_COMPLETE => - if v_int.outstanding = 0 then - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - else - stall_tmp := '1'; - end if; + stopped_out <= stop_mark_in and not serial_stall; - when WAIT_FOR_CURR_TO_COMPLETE => - if v_int.outstanding = 0 then - v_int.state := IDLE; - -- XXX Don't replicate this - if valid_tmp = '1' then - if (sgl_pipe_in = '1') then - if v_int.outstanding /= 0 then - v_int.state := WAIT_FOR_PREV_TO_COMPLETE; - stall_tmp := '1'; - else - -- send insn out and wait on it to complete - v_int.state := WAIT_FOR_CURR_TO_COMPLETE; - end if; - else - -- let it go out if there are no GPR or CR hazards - stall_tmp := gpr_tag_stall or cr_tag_stall; - end if; - end if; - else - stall_tmp := '1'; - end if; - end case; - - if stall_tmp = '1' then + -- Don't let it go out if there are GPR or CR hazards + -- or we are waiting for the previous instruction to complete + if (gpr_tag_stall or cr_tag_stall or ov_tag_stall or + (serialize and serial_stall)) = '1' then valid_tmp := '0'; end if; gpr_write_valid <= gpr_write_valid_in and valid_tmp; cr_write_valid <= cr_write_in and valid_tmp; - - if valid_tmp = '1' and deferred = '0' then - v_int.outstanding := v_int.outstanding + 1; - end if; + ov_write_valid <= ov_write_in and valid_tmp; -- update outputs valid_out <= valid_tmp; - stall_out <= stall_tmp or deferred; - - -- update registers - rin_int <= v_int; end process; end; diff --git a/core.vhdl b/core.vhdl index b18f09afc..764141a86 100644 --- a/core.vhdl +++ b/core.vhdl @@ -63,6 
+63,7 @@ architecture behave of core is -- decode signals signal decode1_to_decode2: Decode1ToDecode2Type; signal decode1_to_fetch1: Decode1ToFetch1Type; + signal decode1_to_register_file: Decode1ToRegisterFileType; signal decode2_to_execute1: Decode2ToExecute1Type; -- register file signals @@ -79,6 +80,8 @@ architecture behave of core is signal execute1_to_writeback: Execute1ToWritebackType; signal execute1_bypass: bypass_data_t; signal execute1_cr_bypass: cr_bypass_data_t; + signal execute2_bypass: bypass_data_t; + signal execute2_cr_bypass: cr_bypass_data_t; -- load store signals signal execute1_to_loadstore1: Execute1ToLoadstore1Type; @@ -98,6 +101,10 @@ architecture behave of core is signal fpu_to_execute1: FPUToExecute1Type; signal fpu_to_writeback: FPUToWritebackType; + -- Writeback signals + signal writeback_bypass: bypass_data_t; + signal wb_interrupt: WritebackToExecute1Type; + -- local signals signal fetch1_stall_in : std_ulogic; signal icache_stall_out : std_ulogic; @@ -117,7 +124,6 @@ architecture behave of core is signal complete: instr_tag_t; signal terminate: std_ulogic; signal core_rst: std_ulogic; - signal do_interrupt: std_ulogic; -- Delayed/Latched resets and alt_reset signal rst_fetch1 : std_ulogic; @@ -133,6 +139,7 @@ architecture behave of core is signal rst_dbg : std_ulogic; signal alt_reset_d : std_ulogic; + signal sim_ex_dump: std_ulogic; signal sim_cr_dump: std_ulogic; -- Debug actions @@ -144,8 +151,16 @@ architecture behave of core is signal dbg_gpr_ack : std_ulogic; signal dbg_gpr_addr : gspr_index_t; signal dbg_gpr_data : std_ulogic_vector(63 downto 0); + signal dbg_spr_req : std_ulogic; + signal dbg_spr_ack : std_ulogic; + signal dbg_spr_addr : std_ulogic_vector(7 downto 0); + signal dbg_spr_data : std_ulogic_vector(63 downto 0); + signal dbg_ls_spr_req : std_ulogic; + signal dbg_ls_spr_ack : std_ulogic; + signal dbg_ls_spr_addr : std_ulogic_vector(1 downto 0); + signal dbg_ls_spr_data : std_ulogic_vector(63 downto 0); - signal msr : 
std_ulogic_vector(63 downto 0); + signal ctrl_debug : ctrl_t; -- PMU event bus signal icache_events : IcacheEventType; @@ -271,6 +286,7 @@ begin f_in => icache_to_decode1, d_out => decode1_to_decode2, f_out => decode1_to_fetch1, + r_out => decode1_to_register_file, log_out => log_data(109 downto 97) ); @@ -298,6 +314,11 @@ begin c_out => decode2_to_cr_file, execute_bypass => execute1_bypass, execute_cr_bypass => execute1_cr_bypass, + execute2_bypass => execute2_bypass, + execute2_cr_bypass => execute2_cr_bypass, + writeback_bypass => writeback_bypass, + dbg_spr_req => dbg_spr_req, + dbg_spr_addr => dbg_spr_addr, log_out => log_data(119 downto 110) ); decode2_busy_in <= ex1_busy_out; @@ -310,6 +331,8 @@ begin ) port map ( clk => clk, + stall => decode2_stall_out, + d1_in => decode1_to_register_file, d_in => decode2_to_register_file, d_out => register_file_to_decode2, w_in => writeback_to_register_file, @@ -318,7 +341,7 @@ begin dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, - sim_dump_done => sim_cr_dump, + sim_dump_done => sim_ex_dump, log_out => log_data(255 downto 184) ); @@ -333,11 +356,13 @@ begin d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, sim_dump => sim_cr_dump, + ctrl => ctrl_debug, log_out => log_data(183 downto 171) ); execute1_0: entity work.execute1 generic map ( + SIM => SIM, EX1_BYPASS => EX1_BYPASS, HAS_FPU => HAS_FPU, HAS_SHORT_MULT => HAS_SHORT_MULT, @@ -352,19 +377,27 @@ begin l_in => loadstore1_to_execute1, fp_in => fpu_to_execute1, ext_irq_in => ext_irq, - interrupt_in => do_interrupt, + interrupt_in => wb_interrupt, l_out => execute1_to_loadstore1, fp_out => execute1_to_fpu, e_out => execute1_to_writeback, bypass_data => execute1_bypass, bypass_cr_data => execute1_cr_bypass, + bypass2_data => execute2_bypass, + bypass2_cr_data => execute2_cr_bypass, icache_inval => ex1_icache_inval, - dbg_msr_out => msr, + dbg_ctrl_out => ctrl_debug, wb_events => writeback_events, ls_events => loadstore_events, 
dc_events => dcache_events, ic_events => icache_events, terminate_out => terminate, + dbg_spr_req => dbg_spr_req, + dbg_spr_ack => dbg_spr_ack, + dbg_spr_addr => dbg_spr_addr, + dbg_spr_data => dbg_spr_data, + sim_dump => sim_ex_dump, + sim_dump_done => sim_cr_dump, log_out => log_data(134 downto 120), log_rd_addr => log_rd_addr, log_rd_data => log_rd_data, @@ -377,6 +410,7 @@ begin port map ( clk => clk, rst => rst_fpu, + flush_in => flush, e_in => execute1_to_fpu, e_out => fpu_to_execute1, w_out => fpu_to_writeback @@ -406,6 +440,10 @@ begin m_in => mmu_to_loadstore1, dc_stall => dcache_stall_out, events => loadstore_events, + dbg_spr_req => dbg_ls_spr_req, + dbg_spr_ack => dbg_ls_spr_ack, + dbg_spr_addr => dbg_ls_spr_addr, + dbg_spr_data => dbg_ls_spr_data, log_out => log_data(149 downto 140) ); @@ -455,8 +493,9 @@ begin w_out => writeback_to_register_file, c_out => writeback_to_cr_file, f_out => writeback_to_fetch1, + wb_bypass => writeback_bypass, events => writeback_events, - interrupt_out => do_interrupt, + interrupt_out => wb_interrupt, complete_out => complete ); @@ -482,11 +521,19 @@ begin terminate => terminate, core_stopped => dbg_core_is_stopped, nia => fetch1_to_icache.nia, - msr => msr, + msr => ctrl_debug.msr, dbg_gpr_req => dbg_gpr_req, dbg_gpr_ack => dbg_gpr_ack, dbg_gpr_addr => dbg_gpr_addr, dbg_gpr_data => dbg_gpr_data, + dbg_spr_req => dbg_spr_req, + dbg_spr_ack => dbg_spr_ack, + dbg_spr_addr => dbg_spr_addr, + dbg_spr_data => dbg_spr_data, + dbg_ls_spr_req => dbg_ls_spr_req, + dbg_ls_spr_ack => dbg_ls_spr_ack, + dbg_ls_spr_addr => dbg_ls_spr_addr, + dbg_ls_spr_data => dbg_ls_spr_data, log_data => log_data, log_read_addr => log_rd_addr, log_read_data => log_rd_data, diff --git a/core_debug.vhdl b/core_debug.vhdl index ff99df4cb..c060f745f 100644 --- a/core_debug.vhdl +++ b/core_debug.vhdl @@ -33,12 +33,24 @@ entity core_debug is nia : in std_ulogic_vector(63 downto 0); msr : in std_ulogic_vector(63 downto 0); - -- GSPR register read port + -- 
GPR/FPR register read port dbg_gpr_req : out std_ulogic; dbg_gpr_ack : in std_ulogic; dbg_gpr_addr : out gspr_index_t; dbg_gpr_data : in std_ulogic_vector(63 downto 0); + -- SPR register read port for SPRs in execute1 + dbg_spr_req : out std_ulogic; + dbg_spr_ack : in std_ulogic; + dbg_spr_addr : out std_ulogic_vector(7 downto 0); + dbg_spr_data : in std_ulogic_vector(63 downto 0); + + -- SPR register read port for SPRs in loadstore1 and mmu + dbg_ls_spr_req : out std_ulogic; + dbg_ls_spr_ack : in std_ulogic; + dbg_ls_spr_addr : out std_ulogic_vector(1 downto 0); + dbg_ls_spr_data : in std_ulogic_vector(63 downto 0); + -- Core logging data log_data : in std_ulogic_vector(255 downto 0); log_read_addr : in std_ulogic_vector(31 downto 0); @@ -105,7 +117,10 @@ architecture behave of core_debug is signal do_icreset : std_ulogic; signal terminated : std_ulogic; signal do_gspr_rd : std_ulogic; - signal gspr_index : gspr_index_t; + signal gspr_index : std_ulogic_vector(7 downto 0); + signal gspr_data : std_ulogic_vector(63 downto 0); + + signal spr_index_valid : std_ulogic; signal log_dmi_addr : std_ulogic_vector(31 downto 0) := (others => '0'); signal log_dmi_data : std_ulogic_vector(63 downto 0) := (others => '0'); @@ -119,9 +134,7 @@ architecture behave of core_debug is begin -- Single cycle register accesses on DMI except for GSPR data dmi_ack <= dmi_req when dmi_addr /= DBG_CORE_GSPR_DATA - else dbg_gpr_ack; - dbg_gpr_req <= dmi_req when dmi_addr = DBG_CORE_GSPR_DATA - else '0'; + else dbg_gpr_ack or dbg_spr_ack or dbg_ls_spr_ack; -- Status register read composition stat_reg <= (2 => terminated, @@ -129,12 +142,17 @@ begin 0 => stopping, others => '0'); + gspr_data <= dbg_gpr_data when gspr_index(5) = '0' else + dbg_ls_spr_data when dbg_ls_spr_req = '1' else + dbg_spr_data when spr_index_valid = '1' else + (others => '0'); + -- DMI read data mux with dmi_addr select dmi_dout <= stat_reg when DBG_CORE_STAT, nia when DBG_CORE_NIA, msr when DBG_CORE_MSR, - dbg_gpr_data 
when DBG_CORE_GSPR_DATA, + gspr_data when DBG_CORE_GSPR_DATA, log_write_addr & log_dmi_addr when DBG_CORE_LOG_ADDR, log_dmi_data when DBG_CORE_LOG_DATA, log_dmi_trigger when DBG_CORE_LOG_TRIGGER, @@ -191,7 +209,7 @@ begin terminated <= '0'; end if; elsif dmi_addr = DBG_CORE_GSPR_INDEX then - gspr_index <= dmi_din(gspr_index_t'left downto 0); + gspr_index <= dmi_din(7 downto 0); elsif dmi_addr = DBG_CORE_LOG_ADDR then log_dmi_addr <= dmi_din(31 downto 0); do_dmi_log_rd <= '1'; @@ -226,7 +244,70 @@ begin end if; end process; - dbg_gpr_addr <= gspr_index; + gspr_access: process(clk) + variable valid : std_ulogic; + variable sel : spr_selector; + variable isram : std_ulogic; + variable raddr : ramspr_index; + variable odd : std_ulogic; + begin + if rising_edge(clk) then + dbg_gpr_req <= '0'; + dbg_spr_req <= '0'; + dbg_ls_spr_req <= '0'; + if rst = '0' and dmi_req = '1' and dmi_addr = DBG_CORE_GSPR_DATA then + if gspr_index(5) = '0' then + dbg_gpr_req <= '1'; + elsif gspr_index(4 downto 2) = "111" then + dbg_ls_spr_req <= '1'; + else + dbg_spr_req <= '1'; + end if; + end if; + + -- Map 0 - 0x1f to GPRs, 0x20 - 0x3f to SPRs, and 0x40 - 0x5f to FPRs + dbg_gpr_addr <= gspr_index(6) & gspr_index(4 downto 0); + dbg_ls_spr_addr <= gspr_index(1 downto 0); + + -- For SPRs, use the same mapping as when the fast SPRs were in the GPR file + valid := '1'; + sel := "000"; + isram := '1'; + raddr := 0; + odd := '0'; + case gspr_index(4 downto 0) is + when 5x"00" => + raddr := RAMSPR_LR; + when 5x"01" => + odd := '1'; + raddr := RAMSPR_CTR; + when 5x"02" | 5x"03" => + odd := gspr_index(0); + raddr := RAMSPR_SRR0; + when 5x"04" | 5x"05" => + odd := gspr_index(0); + raddr := RAMSPR_HSRR0; + when 5x"06" | 5x"07" => + odd := gspr_index(0); + raddr := RAMSPR_SPRG0; + when 5x"08" | 5x"09" => + odd := gspr_index(0); + raddr := RAMSPR_SPRG2; + when 5x"0a" | 5x"0b" => + odd := gspr_index(0); + raddr := RAMSPR_HSPRG0; + when 5x"0c" => + isram := '0'; + sel := SPRSEL_XER; + when 5x"0d" => + 
raddr := RAMSPR_TAR; + when others => + valid := '0'; + end case; + dbg_spr_addr <= isram & sel & std_ulogic_vector(to_unsigned(raddr, 3)) & odd; + spr_index_valid <= valid; + end if; + end process; -- Core control signals generated by the debug module core_stop <= stopping and not do_step; diff --git a/countbits.vhdl b/countbits.vhdl index b16baa0dd..87417a966 100644 --- a/countbits.vhdl +++ b/countbits.vhdl @@ -9,6 +9,7 @@ entity bit_counter is port ( clk : in std_logic; rs : in std_ulogic_vector(63 downto 0); + stall : in std_ulogic; count_right : in std_ulogic; do_popcnt : in std_ulogic; is_32bit : in std_ulogic; @@ -49,7 +50,7 @@ architecture behaviour of bit_counter is begin countzero_r: process(clk) begin - if rising_edge(clk) then + if rising_edge(clk) and stall = '0' then inp_r <= inp; sum_r <= sum; end if; @@ -88,7 +89,7 @@ begin popcnt_r: process(clk) begin - if rising_edge(clk) then + if rising_edge(clk) and stall = '0' then for i in 0 to 7 loop pc8_r(i) <= pc8(i); end loop; diff --git a/countbits_tb.vhdl b/countbits_tb.vhdl index c00a6b611..c945c573b 100644 --- a/countbits_tb.vhdl +++ b/countbits_tb.vhdl @@ -26,6 +26,7 @@ begin bitcounter_0: entity work.bit_counter port map ( clk => clk, + stall => '0', rs => rs, result => res, count_right => count_right, diff --git a/cr_file.vhdl b/cr_file.vhdl index e9788cb29..940b95bdd 100644 --- a/cr_file.vhdl +++ b/cr_file.vhdl @@ -18,6 +18,7 @@ entity cr_file is d_out : out CrFileToDecode2Type; w_in : in WritebackToCrFileType; + ctrl : in ctrl_t; -- debug sim_dump : in std_ulogic; @@ -65,7 +66,11 @@ begin crs <= crs_updated; end if; if w_in.write_xerc_enable = '1' then - report "Writing XERC"; + report "Writing XERC SO=" & std_ulogic'image(xerc_updated.so) & + " OV=" & std_ulogic'image(xerc_updated.ov) & + " CA=" & std_ulogic'image(xerc_updated.ca) & + " OV32=" & std_ulogic'image(xerc_updated.ov32) & + " CA32=" & std_ulogic'image(xerc_updated.ca32); xerc <= xerc_updated; end if; end if; @@ -84,9 +89,18 @@ begin 
sim_dump_test: if SIM generate dump_cr: process(all) + variable xer : std_ulogic_vector(31 downto 0); begin if sim_dump = '1' then report "CR 00000000" & to_hstring(crs); + xer := (others => '0'); + xer(31) := xerc.so; + xer(30) := xerc.ov; + xer(29) := xerc.ca; + xer(19) := xerc.ov32; + xer(18) := xerc.ca32; + xer(17 downto 0) := ctrl.xer_low; + report "XER 00000000" & to_hstring(xer); assert false report "end of test" severity failure; end if; end process; diff --git a/dcache.vhdl b/dcache.vhdl index 8f7af524a..2d5ebe39d 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -588,7 +588,7 @@ begin end if; if rst = '1' then r0_full <= '0'; - elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then + elsif r1.full = '0' and d_in.hold = '0' then r0 <= r; r0_full <= r.req.valid; elsif r0.d_valid = '0' then @@ -605,9 +605,9 @@ begin m_out.stall <= '0'; -- Hold off the request in r0 when r1 has an uncompleted request - r0_stall <= r0_full and (r1.full or d_in.hold); + r0_stall <= r1.full or d_in.hold; r0_valid <= r0_full and not r1.full and not d_in.hold; - stall_out <= r0_stall; + stall_out <= r1.full; events <= ev; diff --git a/decode1.vhdl b/decode1.vhdl index a38aee382..c4b77077a 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -5,6 +5,7 @@ use ieee.numeric_std.all; library work; use work.common.all; use work.decode_types.all; +use work.insn_helpers.all; entity decode1 is generic ( @@ -24,13 +25,13 @@ entity decode1 is f_in : in IcacheToDecode1Type; f_out : out Decode1ToFetch1Type; d_out : out Decode1ToDecode2Type; + r_out : out Decode1ToRegisterFileType; log_out : out std_ulogic_vector(12 downto 0) ); end entity decode1; architecture behaviour of decode1 is signal r, rin : Decode1ToDecode2Type; - signal s : Decode1ToDecode2Type; signal f, fin : Decode1ToFetch1Type; constant illegal_inst : decode_rom_t := @@ -38,6 +39,18 @@ architecture behaviour of decode1 is constant x_inst : decode_rom_t := (NONE, NONE, OP_ILLEGAL, NONE, NONE, NONE, NONE, 'X', 'X', 'X', 'X', ZERO, 
'X', NONE, 'X', 'X', 'X', 'X', 'X', 'X', NONE, 'X', 'X', NONE); + -- If we have an FPU, then it is used for integer divisions, + -- otherwise a dedicated divider in the ALU is used. + function divider_unit(hf : boolean) return unit_t is + begin + if hf then + return FPU; + else + return ALU; + end if; + end; + constant DVU : unit_t := divider_unit(HAS_FPU); + type reg_internal_t is record override : std_ulogic; override_decode: decode_rom_t; @@ -48,7 +61,6 @@ architecture behaviour of decode1 is (override => '0', override_decode => illegal_inst, override_unit => '0', force_single => '0'); signal ri, ri_in : reg_internal_t; - signal si : reg_internal_t; type br_predictor_t is record br_nia : std_ulogic_vector(61 downto 0); @@ -81,8 +93,8 @@ architecture behaviour of decode1 is 28 => (ALU, NONE, OP_AND, NONE, CONST_UI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andi. 29 => (ALU, NONE, OP_AND, NONE, CONST_UI_HI, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE, '0', '0', NONE), -- andis. 
0 => (ALU, NONE, OP_ATTN, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- attn - 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b - 16 => (ALU, NONE, OP_BC, SPR, CONST_BD, NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc + 18 => (ALU, NONE, OP_B, NONE, CONST_LI, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- b + 16 => (ALU, NONE, OP_BC, NONE, CONST_BD, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- bc 11 => (ALU, NONE, OP_CMP, RA, CONST_SI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- cmpi 10 => (ALU, NONE, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- cmpli 34 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lbz @@ -95,7 +107,6 @@ architecture behaviour of decode1 is 43 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhau 40 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhz 41 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzu - 56 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_DQ, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRTE), -- lq 32 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', 
'0', '0', '0', '0', NONE, '0', '0', NONE), -- lwz 33 => (LDST, NONE, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwzu 7 => (ALU, NONE, OP_MUL_L64, RA, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- mulli @@ -170,11 +181,11 @@ architecture behaviour of decode1 is -- addpcis 2#001# => (ALU, NONE, OP_ADD, CIA, CONST_DXHI4, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- bclr, bcctr, bctar - 2#100# => (ALU, NONE, OP_BCREG, SPR, SPR, NONE, SPR, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), + 2#100# => (ALU, NONE, OP_BCREG, NONE, NONE, NONE, NONE, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0', NONE), -- isync - 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), + 2#111# => (ALU, NONE, OP_ISYNC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- rfid - 2#101# => (ALU, NONE, OP_RFID, SPR, SPR, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), + 2#101# => (ALU, NONE, OP_RFID, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), others => illegal_inst ); @@ -225,31 +236,31 @@ architecture behaviour of decode1 is 2#1000111010# => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- cnttzd 2#1000011010# => (ALU, NONE, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- cnttzw 2#1011110011# => (ALU, NONE, OP_DARN, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', 
'0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- darn - 2#0001010110# => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbf - 2#0000110110# => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbst - 2#0100010110# => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbt - 2#0011110110# => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- dcbtst + 2#0001010110# => (ALU, NONE, OP_DCBF, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbf + 2#0000110110# => (ALU, NONE, OP_DCBST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbst + 2#0100010110# => (ALU, NONE, OP_DCBT, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbt + 2#0011110110# => (ALU, NONE, OP_DCBTST, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbtst 2#1111110110# => (LDST, NONE, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dcbz - 2#0110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeu - 2#1110001001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeuo - 2#0110001011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweu - 
2#1110001011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweuo - 2#0110101001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divde - 2#1110101001# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdeo - 2#0110101011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwe - 2#1110101011# => (ALU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divweo - 2#0111001001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdu - 2#1111001001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divduo - 2#0111001011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwu - 2#1111001011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwuo - 2#0111101001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divd - 2#1111101001# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdo - 2#0111101011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divw - 2#1111101011# => (ALU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), 
-- divwo + 2#0110001001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeu + 2#1110001001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdeuo + 2#0110001011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweu + 2#1110001011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divweuo + 2#0110101001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divde + 2#1110101001# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdeo + 2#0110101011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwe + 2#1110101011# => (DVU, NONE, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divweo + 2#0111001001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divdu + 2#1111001001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- divduo + 2#0111001011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwu + 2#1111001011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0', NONE), -- divwuo + 2#0111101001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', 
RC, '0', '0', NONE), -- divd + 2#1111101001# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- divdo + 2#0111101011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divw + 2#1111101011# => (DVU, NONE, OP_DIV, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- divwo 2#1100110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dss 2#0101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dst 2#0101110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- dstst - 2#1101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- eieio + 2#1101010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- eieio 2#0100011100# => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- eqv 2#1110111010# => (ALU, NONE, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- extsb 2#1110011010# => (ALU, NONE, OP_EXTS, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- extsh @@ -312,7 +323,6 @@ architecture behaviour of decode1 is 2#1100110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzcix 2#0100110111# => (LDST, 
NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lhzux 2#0100010111# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lhzx - 2#0100010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', DRTE), -- lqarx 2#0000010100# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0', NONE), -- lwarx 2#0101110101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '1', '0', '0', '0', NONE, '0', '0', DUPD), -- lwaux 2#0101010101# => (LDST, NONE, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0', NONE), -- lwax @@ -323,15 +333,15 @@ architecture behaviour of decode1 is 2#1001000000# => (ALU, NONE, OP_MCRXRX, NONE, NONE, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mcrxrx 2#0000010011# => (ALU, NONE, OP_MFCR, NONE, NONE, NONE, RT, '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfcr/mfocrf 2#0001010011# => (ALU, NONE, OP_MFMSR, NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mfmsr - 2#0101010011# => (ALU, NONE, OP_MFSPR, SPR, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr - 2#0100001001# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud - 2#0100001011# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw - 2#1100001001# => 
(ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd - 2#1100001011# => (ALU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw + 2#0101010011# => (ALU, NONE, OP_MFSPR, NONE, NONE, RS, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mfspr + 2#0100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- modud + 2#0100001011# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- moduw + 2#1100001001# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0', NONE), -- modsd + 2#1100001011# => (DVU, NONE, OP_MOD, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', NONE, '0', '0', NONE), -- modsw 2#0010010000# => (ALU, NONE, OP_MTCRF, NONE, NONE, RS, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtcrf/mtocrf - 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '1', NONE), -- mtmsr - 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- mtmsrd # ignore top bits and d - 2#0111010011# => (ALU, NONE, OP_MTSPR, NONE, NONE, RS, SPR, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr + 2#0010010010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- mtmsr + 2#0010110010# => (ALU, NONE, OP_MTMSRD, NONE, NONE, RS, NONE, '0', '0', '0', 
'0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtmsrd # ignore top bits and d + 2#0111010011# => (ALU, NONE, OP_MTSPR, NONE, NONE, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- mtspr 2#0001001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '1', RC, '0', '0', NONE), -- mulhd 2#0000001001# => (ALU, NONE, OP_MUL_H64, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- mulhdu 2#0001001011# => (ALU, NONE, OP_MUL_H32, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0', NONE), -- mulhw @@ -395,7 +405,6 @@ architecture behaviour of decode1 is 2#1011010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- sthcx 2#0110110111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- sthux 2#0110010111# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sthx - 2#0010110110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', DRSE), -- stqcx 2#1010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwbrx 2#1110010101# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- stwcix 2#0010010110# => (LDST, NONE, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0', NONE), -- stwcx @@ -411,13 +420,13 @@ architecture behaviour of 
decode1 is 2#1011101000# => (ALU, NONE, OP_ADD, RA, CONST_M1, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfmeo 2#0011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfze 2#1011001000# => (ALU, NONE, OP_ADD, RA, NONE, NONE, RT, '0', '0', '1', '0', CA, '1', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- subfzeo - 2#1001010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- sync + 2#1001010110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- sync 2#0001000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- td 2#0000000100# => (ALU, NONE, OP_TRAP, RA, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', NONE, '0', '0', NONE), -- tw 2#0100110010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbie 2#0100010010# => (LDST, NONE, OP_TLBIE, NONE, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbiel - 2#1000110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- tlbsync - 2#0000011110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1', NONE), -- wait + 2#1000110110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- tlbsync + 2#0000011110# => (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', 
ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- wait 2#0100111100# => (ALU, NONE, OP_XOR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0', NONE), -- xor others => illegal_inst ); @@ -454,7 +463,6 @@ architecture behaviour of decode1 is -- op in out A out in out len ext pipe 0 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE), -- std 1 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0', NONE), -- stdu - 2 => (LDST, NONE, OP_STORE, RA_OR_ZERO, CONST_DS, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0', DRSE), -- stq others => decode_rom_init ); @@ -521,32 +529,95 @@ architecture behaviour of decode1 is constant nop_instr : decode_rom_t := (ALU, NONE, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); constant fetch_fail_inst: decode_rom_t := (LDST, NONE, OP_FETCH_FAILED, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0', NONE); + function decode_ram_spr(sprn : spr_num_t) return ram_spr_info is + variable ret : ram_spr_info; + begin + ret := (index => 0, isodd => '0', valid => '1'); + case sprn is + when SPR_LR => + ret.index := RAMSPR_LR; + when SPR_CTR => + ret.index := RAMSPR_CTR; + ret.isodd := '1'; + when SPR_TAR => + ret.index := RAMSPR_TAR; + when SPR_SRR0 => + ret.index := RAMSPR_SRR0; + when SPR_SRR1 => + ret.index := RAMSPR_SRR1; + ret.isodd := '1'; + when SPR_HSRR0 => + ret.index := RAMSPR_HSRR0; + when SPR_HSRR1 => + ret.index := RAMSPR_HSRR1; + ret.isodd := '1'; + when SPR_SPRG0 => + ret.index := RAMSPR_SPRG0; + when SPR_SPRG1 => + ret.index := RAMSPR_SPRG1; + ret.isodd := '1'; + when SPR_SPRG2 => + ret.index := RAMSPR_SPRG2; + when 
SPR_SPRG3 | SPR_SPRG3U => + ret.index := RAMSPR_SPRG3; + ret.isodd := '1'; + when SPR_HSPRG0 => + ret.index := RAMSPR_HSPRG0; + when SPR_HSPRG1 => + ret.index := RAMSPR_HSPRG1; + ret.isodd := '1'; + when others => + ret.valid := '0'; + end case; + return ret; + end; + + function map_spr(sprn : spr_num_t) return spr_id is + variable i : spr_id; + begin + i.sel := "000"; + i.valid := '1'; + i.ispmu := '0'; + case sprn is + when SPR_TB => + i.sel := SPRSEL_TB; + when SPR_TBU => + i.sel := SPRSEL_TBU; + when SPR_DEC => + i.sel := SPRSEL_DEC; + when SPR_PVR => + i.sel := SPRSEL_PVR; + when 724 => -- LOG_ADDR SPR + i.sel := SPRSEL_LOGA; + when 725 => -- LOG_DATA SPR + i.sel := SPRSEL_LOGD; + when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | + SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR | + SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | + SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => + i.ispmu := '1'; + when SPR_CFAR => + i.sel := SPRSEL_CFAR; + when SPR_XER => + i.sel := SPRSEL_XER; + when others => + i.valid := '0'; + end case; + return i; + end; + begin decode1_0: process(clk) begin if rising_edge(clk) then if rst = '1' then r <= Decode1ToDecode2Init; - s <= Decode1ToDecode2Init; ri <= reg_internal_t_init; - si <= reg_internal_t_init; elsif flush_in = '1' then r.valid <= '0'; - s.valid <= '0'; - elsif s.valid = '1' then - if stall_in = '0' then - r <= s; - ri <= si; - s.valid <= '0'; - end if; - else - s <= rin; - si <= ri_in; - s.valid <= rin.valid and r.valid and stall_in; - if r.valid = '0' or stall_in = '0' then - r <= rin; - ri <= ri_in; - end if; + elsif stall_in = '0' then + r <= rin; + ri <= ri_in; end if; if rst = '1' then br.br_nia <= (others => '0'); @@ -557,10 +628,11 @@ begin end if; end if; end process; - busy_out <= s.valid; + busy_out <= stall_in; decode1_1: process(all) variable v : Decode1ToDecode2Type; + variable vr : 
Decode1ToRegisterFileType; variable vi : reg_internal_t; variable majorop : major_opcode_t; variable minor4op : std_ulogic_vector(10 downto 0); @@ -569,6 +641,9 @@ begin variable br_target : std_ulogic_vector(61 downto 0); variable br_offset : signed(23 downto 0); variable bv : br_predictor_t; + variable fprs, fprabc : std_ulogic; + variable in3rc : std_ulogic; + variable may_read_rb : std_ulogic; begin v := Decode1ToDecode2Init; vi := reg_internal_t_init; @@ -579,6 +654,11 @@ begin v.stop_mark := f_in.stop_mark; v.big_endian := f_in.big_endian; + fprs := '0'; + fprabc := '0'; + in3rc := '0'; + may_read_rb := '0'; + if f_in.valid = '1' then report "Decode insn " & to_hstring(f_in.insn) & " at " & to_hstring(f_in.nia); end if; @@ -586,76 +666,62 @@ begin br_offset := (others => '0'); majorop := unsigned(f_in.insn(31 downto 26)); - if is_X(majorop) then - v.decode := x_inst; - else - v.decode := major_decode_rom_array(to_integer(majorop)); - end if; + if is_X(majorop) then + v.decode := x_inst; + else + v.decode := major_decode_rom_array(to_integer(majorop)); + end if; + + sprn := decode_spr_num(f_in.insn); + v.spr_info := map_spr(sprn); + v.ram_spr := decode_ram_spr(sprn); case unsigned(majorop) is when "000100" => -- 4 -- major opcode 4, mostly VMX/VSX stuff but also some integer ops (madd*) minor4op := f_in.insn(5 downto 0) & f_in.insn(10 downto 6); - if is_X(minor4op) then - vi.override := 'X'; - else - vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op))); - end if; - - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); - end if; + vi.override := not decode_op_4_valid(to_integer(unsigned(minor4op))); + v.decode := decode_op_4_array(to_integer(unsigned(f_in.insn(5 downto 0)))); + in3rc := '1'; + may_read_rb := '1'; + + when "010111" => -- 23 + -- rlwnm[.] 
+ may_read_rb := '1'; when "011111" => -- 31 -- major opcode 31, lots of things - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); - end if; - - -- Work out ispr1/ispro independent of v.decode since they seem to be critical path - if is_X(f_in.insn) then - v.ispr1 := (others => 'X'); - v.ispro := (others => 'X'); - else - sprn := decode_spr_num(f_in.insn); - v.ispr1 := fast_spr_num(sprn); - v.ispro := fast_spr_num(sprn); - end if; + if is_X(f_in.insn) then + v.decode := x_inst; + else + v.decode := decode_op_31_array(to_integer(unsigned(f_in.insn(10 downto 1)))); + end if; + may_read_rb := '1'; if std_match(f_in.insn(10 downto 1), "01-1010011") then -- mfspr or mtspr - -- Make slow SPRs single issue - if is_fast_spr(v.ispr1) = '0' then - vi.force_single := '1'; - -- send MMU-related SPRs to loadstore1 - case sprn is - when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => - vi.override_decode.unit := LDST; - vi.override_unit := '1'; - when others => - end case; + -- Make mtspr to slow SPRs single issue + if v.spr_info.valid = '1' then + vi.force_single := f_in.insn(8); end if; + -- send MMU-related SPRs to loadstore1 + case sprn is + when SPR_DAR | SPR_DSISR | SPR_PID | SPR_PTCR => + vi.override_decode.unit := LDST; + vi.override_unit := '1'; + -- make mtspr to loadstore SPRs single-issue + if f_in.insn(8) = '1' then + vi.force_single := '1'; + end if; + when others => + end case; end if; - if std_match(f_in.insn(10 downto 1), "0100010100") then - -- lqarx, illegal if RA = RT or RB = RT - if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) or - f_in.insn(25 downto 21) = f_in.insn(15 downto 11) then - vi.override := '1'; - end if; + if HAS_FPU and std_match(f_in.insn(10 downto 1), "1----10111") then + -- lower half of column 23 has FP loads and stores + fprs := '1'; end if; when "010000" => -- 16 - -- CTR may be needed as input to bc - if f_in.insn(23) = '0' then - v.ispr1 := 
fast_spr_num(SPR_CTR); - v.ispro := fast_spr_num(SPR_CTR); - elsif f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; -- Predict backward branches as taken, forward as untaken v.br_pred := f_in.insn(15); br_offset := resize(signed(f_in.insn(15 downto 2)), 24); @@ -664,47 +730,18 @@ begin -- Unconditional branches are always taken v.br_pred := '1'; br_offset := signed(f_in.insn(25 downto 2)); - if f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; when "010011" => -- 19 - if is_X(f_in.insn) then - vi.override := 'X'; - else - vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); - end if; + if is_X(f_in.insn) then + vi.override := 'X'; + else + vi.override := not decode_op_19_valid(to_integer(unsigned(f_in.insn(5 downto 1) & f_in.insn(10 downto 6)))); + end if; op_19_bits := f_in.insn(5) & f_in.insn(3) & f_in.insn(2); - if is_X(op_19_bits) then - v.decode := x_inst; - else - v.decode := decode_op_19_array(to_integer(unsigned(op_19_bits))); - end if; - - -- Work out ispr1/ispr2 independent of v.decode since they seem to be critical path - if f_in.insn(2) = '0' then - -- Could be OP_BCREG: bclr, bcctr, bctar - -- Branch uses CTR as condition when BO(2) is 0. This is - -- also used to indicate that CTR is modified (they go - -- together). 
- -- bcctr doesn't update CTR or use it in the branch condition - if f_in.insn(23) = '0' and (f_in.insn(10) = '0' or f_in.insn(6) = '1') then - v.ispr1 := fast_spr_num(SPR_CTR); - v.ispro := fast_spr_num(SPR_CTR); - elsif f_in.insn(0) = '1' then - v.ispro := fast_spr_num(SPR_LR); - end if; - if f_in.insn(10) = '0' then - v.ispr2 := fast_spr_num(SPR_LR); - elsif f_in.insn(6) = '0' then - v.ispr2 := fast_spr_num(SPR_CTR); - else - v.ispr2 := fast_spr_num(SPR_TAR); - end if; + if is_X(op_19_bits) then + v.decode := x_inst; else - -- Could be OP_RFID - v.ispr1 := fast_spr_num(SPR_SRR1); - v.ispr2 := fast_spr_num(SPR_SRR0); + v.decode := decode_op_19_array(to_integer(unsigned(op_19_bits))); end if; when "011000" => -- 24 @@ -716,60 +753,85 @@ begin end if; when "011110" => -- 30 - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); - end if; - - when "111000" => -- 56 - -- lq, illegal if RA = RT - if f_in.insn(25 downto 21) = f_in.insn(20 downto 16) then - vi.override := '1'; + if is_X(f_in.insn) then + v.decode := x_inst; + else + v.decode := decode_op_30_array(to_integer(unsigned(f_in.insn(4 downto 1)))); + end if; + may_read_rb := f_in.insn(4); + + when "110100" | "110101" | "110110" | "110111" => -- 52, 53, 54, 55 + -- stfd[u] and stfs[u] + if HAS_FPU then + fprs := '1'; end if; when "111010" => -- 58 - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0)))); - end if; + if is_X(f_in.insn) then + v.decode := x_inst; + else + v.decode := decode_op_58_array(to_integer(unsigned(f_in.insn(1 downto 0)))); + end if; when "111011" => -- 59 if HAS_FPU then -- floating point operations, mostly single-precision - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1)))); - end if; + if is_X(f_in.insn) then + v.decode := x_inst; + else + v.decode := 
decode_op_59_array(to_integer(unsigned(f_in.insn(5 downto 1)))); + end if; if f_in.insn(5) = '0' and not std_match(f_in.insn(10 downto 1), "11-1001110") then vi.override := '1'; end if; + in3rc := '1'; + fprabc := '1'; + fprs := '1'; + may_read_rb := '1'; end if; when "111110" => -- 62 - if is_X(f_in.insn) then - v.decode := x_inst; - else - v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0)))); - end if; + if is_X(f_in.insn) then + v.decode := x_inst; + else + v.decode := decode_op_62_array(to_integer(unsigned(f_in.insn(1 downto 0)))); + end if; when "111111" => -- 63 if HAS_FPU then -- floating point operations, general and double-precision - if is_X(f_in.insn) then - v.decode := x_inst; - elsif f_in.insn(5) = '0' then + if is_X(f_in.insn) then + v.decode := x_inst; + elsif f_in.insn(5) = '0' then v.decode := decode_op_63l_array(to_integer(unsigned(f_in.insn(4 downto 1) & f_in.insn(10 downto 6)))); else v.decode := decode_op_63h_array(to_integer(unsigned(f_in.insn(4 downto 1)))); end if; + in3rc := '1'; + fprabc := '1'; + fprs := '1'; + may_read_rb := '1'; end if; when others => end case; + -- Work out GPR/FPR read addresses + vr.reg_1_addr := fprabc & insn_ra(f_in.insn); + vr.reg_2_addr := fprabc & insn_rb(f_in.insn); + if in3rc = '1' then + vr.reg_3_addr := fprabc & insn_rcreg(f_in.insn); + else + vr.reg_3_addr := fprs & insn_rs(f_in.insn); + end if; + vr.read_1_enable := f_in.valid and not f_in.fetch_failed; + vr.read_2_enable := f_in.valid and not f_in.fetch_failed and may_read_rb; + vr.read_3_enable := f_in.valid and not f_in.fetch_failed; + + v.reg_a := vr.reg_1_addr; + v.reg_b := vr.reg_2_addr; + v.reg_c := vr.reg_3_addr; + if f_in.fetch_failed = '1' then v.valid := '1'; vi.override := '1'; @@ -815,6 +877,8 @@ begin f_out.redirect <= br.predict; f_out.redirect_nia <= br_target & "00"; flush_out <= bv.predict or br.predict; + + r_out <= vr; end process; d1_log: if LOG_LENGTH > 0 generate diff --git a/decode2.vhdl b/decode2.vhdl index 
c24d8f5b2..e24ebb502 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -39,6 +39,13 @@ entity decode2 is execute_bypass : in bypass_data_t; execute_cr_bypass : in cr_bypass_data_t; + execute2_bypass : in bypass_data_t; + execute2_cr_bypass : in cr_bypass_data_t; + writeback_bypass : in bypass_data_t; + + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_addr : in std_ulogic_vector(7 downto 0); log_out : out std_ulogic_vector(9 downto 0) ); @@ -47,10 +54,22 @@ end entity decode2; architecture behaviour of decode2 is type reg_type is record e : Decode2ToExecute1Type; - repeat : std_ulogic; + repeat : repeat_t; + busy : std_ulogic; + sgl_pipe : std_ulogic; + prev_sgl : std_ulogic; + reg_a_valid : std_ulogic; + reg_b_valid : std_ulogic; + reg_c_valid : std_ulogic; + reg_o_valid : std_ulogic; + input_ov : std_ulogic; + output_ov : std_ulogic; + read_rspr : std_ulogic; end record; + constant reg_type_init : reg_type := + (e => Decode2ToExecute1Init, repeat => NONE, others => '0'); - signal r, rin : reg_type; + signal dc2, dc2in : reg_type; signal deferred : std_ulogic; @@ -59,49 +78,39 @@ architecture behaviour of decode2 is reg : gspr_index_t; data : std_ulogic_vector(63 downto 0); end record; + constant decode_input_reg_init : decode_input_reg_t := ('0', (others => '0'), (others => '0')); type decode_output_reg_t is record reg_valid : std_ulogic; reg : gspr_index_t; end record; + constant decode_output_reg_init : decode_output_reg_t := ('0', (others => '0')); function decode_input_reg_a (t : input_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); - ispr : gspr_index_t; instr_addr : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is begin if t = RA or (t = RA_OR_ZERO and insn_ra(insn_in) /= "00000") then - return ('1', gpr_to_gspr(insn_ra(insn_in)), reg_data); - elsif t = SPR then - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. 
- -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. - -- - assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode A says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr, reg_data); + return ('1', gpr_to_gspr(insn_ra(insn_in)), (others => '0')); elsif t = CIA then return ('0', (others => '0'), instr_addr); elsif HAS_FPU and t = FRA then - return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_fra(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; end; - function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0); - ispr : gspr_index_t) return decode_input_reg_t is + function decode_input_reg_b (t : input_reg_b_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_input_reg_t is variable ret : decode_input_reg_t; begin case t is when RB => - ret := ('1', gpr_to_gspr(insn_rb(insn_in)), reg_data); + ret := ('1', gpr_to_gspr(insn_rb(insn_in)), (others => '0')); when FRB => if HAS_FPU then - ret := ('1', fpr_to_gspr(insn_frb(insn_in)), reg_data); + ret := ('1', fpr_to_gspr(insn_frb(insn_in)), (others => '0')); else ret := ('0', (others => '0'), (others => '0')); end if; @@ -129,14 +138,6 @@ architecture behaviour of decode2 is ret := ('0', (others => '0'), x"00000000000000" & "00" & insn_in(1) & insn_in(15 downto 11)); when CONST_SH32 => ret := ('0', (others => '0'), x"00000000000000" & "000" & insn_in(15 downto 11)); - when SPR => - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. - -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. 
- assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode B says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - ret := (is_fast_spr(ispr), ispr, reg_data); when NONE => ret := ('0', (others => '0'), (others => '0')); end case; @@ -144,23 +145,23 @@ architecture behaviour of decode2 is return ret; end; - function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0); - reg_data : std_ulogic_vector(63 downto 0)) return decode_input_reg_t is + function decode_input_reg_c (t : input_reg_c_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_input_reg_t is begin case t is when RS => - return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rs(insn_in)), (others => '0')); when RCR => - return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); + return ('1', gpr_to_gspr(insn_rcreg(insn_in)), (others => '0')); when FRS => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frt(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; when FRC => if HAS_FPU then - return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data); + return ('1', fpr_to_gspr(insn_frc(insn_in)), (others => '0')); else return ('0', (others => '0'), (others => '0')); end if; @@ -169,8 +170,8 @@ architecture behaviour of decode2 is end case; end; - function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0); - ispr : gspr_index_t) return decode_output_reg_t is + function decode_output_reg (t : output_reg_a_t; insn_in : std_ulogic_vector(31 downto 0)) + return decode_output_reg_t is begin case t is when RT => @@ -181,18 +182,10 @@ architecture behaviour of decode2 is if HAS_FPU then return ('1', fpr_to_gspr(insn_frt(insn_in))); else - return ('0', "0000000"); + return ('0', "000000"); end if; - when SPR => - -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. 
- -- If it's all 0, we don't treat it as a dependency as slow SPRs - -- operations are single issue. - assert is_fast_spr(ispr) = '1' or ispr = "0000000" - report "Decode B says SPR but ISPR is invalid:" & - to_hstring(ispr) severity failure; - return (is_fast_spr(ispr), ispr); when NONE => - return ('0', "0000000"); + return ('0', "000000"); end case; end; @@ -228,14 +221,6 @@ architecture behaviour of decode2 is OP_SHR => "010", OP_EXTSWSLI => "010", OP_MUL_L64 => "011", -- muldiv_result - OP_MUL_H64 => "011", - OP_MUL_H32 => "011", - OP_DIV => "011", - OP_DIVE => "011", - OP_MOD => "011", - OP_CNTZ => "100", -- countbits_result - OP_POPCNT => "100", - OP_MFSPR => "101", -- spr_result OP_B => "110", -- next_nia OP_BC => "110", OP_BCREG => "110", @@ -270,30 +255,37 @@ architecture behaviour of decode2 is others => "000" ); + signal decoded_reg_a : decode_input_reg_t; + signal decoded_reg_b : decode_input_reg_t; + signal decoded_reg_c : decode_input_reg_t; + signal decoded_reg_o : decode_output_reg_t; + -- issue control signals signal control_valid_in : std_ulogic; signal control_valid_out : std_ulogic; - signal control_stall_out : std_ulogic; - signal control_sgl_pipe : std_logic; + signal control_serialize : std_logic; signal gpr_write_valid : std_ulogic; signal gpr_write : gspr_index_t; signal gpr_a_read_valid : std_ulogic; signal gpr_a_read : gspr_index_t; - signal gpr_a_bypass : std_ulogic; + signal gpr_a_bypass : std_ulogic_vector(1 downto 0); signal gpr_b_read_valid : std_ulogic; signal gpr_b_read : gspr_index_t; - signal gpr_b_bypass : std_ulogic; + signal gpr_b_bypass : std_ulogic_vector(1 downto 0); signal gpr_c_read_valid : std_ulogic; signal gpr_c_read : gspr_index_t; - signal gpr_c_bypass : std_ulogic; + signal gpr_c_bypass : std_ulogic_vector(1 downto 0); signal cr_read_valid : std_ulogic; signal cr_write_valid : std_ulogic; - signal cr_bypass : std_ulogic; + signal cr_bypass : std_ulogic_vector(1 downto 0); + + signal ov_read_valid : std_ulogic; + 
signal ov_write_valid : std_ulogic; signal instr_tag : instr_tag_t; @@ -308,11 +300,9 @@ begin complete_in => complete_in, valid_in => control_valid_in, - repeated => r.repeat, - busy_in => busy_in, deferred => deferred, flush_in => flush_in, - sgl_pipe_in => control_sgl_pipe, + serialize => control_serialize, stop_mark_in => d_in.stop_mark, gpr_write_valid_in => gpr_write_valid, @@ -329,13 +319,17 @@ begin execute_next_tag => execute_bypass.tag, execute_next_cr_tag => execute_cr_bypass.tag, + execute2_next_tag => execute2_bypass.tag, + execute2_next_cr_tag => execute2_cr_bypass.tag, cr_read_in => cr_read_valid, cr_write_in => cr_write_valid, cr_bypass => cr_bypass, + ov_read_in => ov_read_valid, + ov_write_in => ov_write_valid, + valid_out => control_valid_out, - stall_out => control_stall_out, stopped_out => stopped_out, gpr_bypass_a => gpr_a_bypass, @@ -345,223 +339,348 @@ begin instr_tag_out => instr_tag ); - deferred <= r.e.valid and busy_in; + deferred <= dc2.e.valid and busy_in; decode2_0: process(clk) begin if rising_edge(clk) then - if rst = '1' or flush_in = '1' or deferred = '0' then - if rin.e.valid = '1' then - report "execute " & to_hstring(rin.e.nia); + if rst = '1' or flush_in = '1' then + dc2 <= reg_type_init; + elsif deferred = '0' then + if dc2in.e.valid = '1' then + report "execute " & to_hstring(dc2in.e.nia) & + " tag=" & integer'image(dc2in.e.instr_tag.tag) & std_ulogic'image(dc2in.e.instr_tag.valid); end if; - r <= rin; + dc2 <= dc2in; + elsif dc2.read_rspr = '0' then + -- Update debug SPR access signals even when stalled + -- if the instruction in dc2.e doesn't read any SPRs. 
+ dc2.e.dbg_spr_access <= dc2in.e.dbg_spr_access; + dc2.e.ramspr_even_rdaddr <= dc2in.e.ramspr_even_rdaddr; + dc2.e.ramspr_odd_rdaddr <= dc2in.e.ramspr_odd_rdaddr; + dc2.e.ramspr_rd_odd <= dc2in.e.ramspr_rd_odd; + end if; + if d_in.valid = '1' then + assert decoded_reg_a.reg_valid = '0' or decoded_reg_a.reg = d_in.reg_a severity failure; + assert decoded_reg_b.reg_valid = '0' or decoded_reg_b.reg = d_in.reg_b severity failure; + assert decoded_reg_c.reg_valid = '0' or decoded_reg_c.reg = d_in.reg_c severity failure; end if; end if; end process; c_out.read <= d_in.decode.input_cr; + decode2_addrs: process(all) + begin + decoded_reg_a <= decode_input_reg_init; + decoded_reg_b <= decode_input_reg_init; + decoded_reg_c <= decode_input_reg_init; + decoded_reg_o <= decode_output_reg_init; + if d_in.valid = '1' then + decoded_reg_a <= decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, d_in.nia); + decoded_reg_b <= decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn); + decoded_reg_c <= decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn); + decoded_reg_o <= decode_output_reg (d_in.decode.output_reg_a, d_in.insn); + end if; + + r_out.read1_enable <= decoded_reg_a.reg_valid; + r_out.read2_enable <= decoded_reg_b.reg_valid; + r_out.read3_enable <= decoded_reg_c.reg_valid; + + end process; + decode2_1: process(all) variable v : reg_type; - variable mul_a : std_ulogic_vector(63 downto 0); - variable mul_b : std_ulogic_vector(63 downto 0); - variable decoded_reg_a : decode_input_reg_t; - variable decoded_reg_b : decode_input_reg_t; - variable decoded_reg_c : decode_input_reg_t; - variable decoded_reg_o : decode_output_reg_t; variable length : std_ulogic_vector(3 downto 0); variable op : insn_type_t; + variable valid_in : std_ulogic; + variable decctr : std_ulogic; + variable sprs_busy : std_ulogic; begin - v := r; - - v.e := Decode2ToExecute1Init; + v := dc2; - mul_a := (others => '0'); - mul_b := (others => '0'); + valid_in := d_in.valid or dc2.busy; - 
--v.e.input_cr := d_in.decode.input_cr; - v.e.output_cr := d_in.decode.output_cr; + if dc2.busy = '0' then + v.e := Decode2ToExecute1Init; - -- Work out whether XER common bits are set - v.e.output_xer := d_in.decode.output_carry; - case d_in.decode.insn_type is - when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => - -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only - if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then - v.e.oe := '1'; - v.e.output_xer := '1'; - end if; - when OP_MTSPR => - if is_X(d_in.insn) then - v.e.output_xer := 'X'; - elsif decode_spr_num(d_in.insn) = SPR_XER then - v.e.output_xer := '1'; - end if; - when others => - end case; + sprs_busy := '0'; - decoded_reg_a := decode_input_reg_a (d_in.decode.input_reg_a, d_in.insn, r_in.read1_data, d_in.ispr1, - d_in.nia); - decoded_reg_b := decode_input_reg_b (d_in.decode.input_reg_b, d_in.insn, r_in.read2_data, d_in.ispr2); - decoded_reg_c := decode_input_reg_c (d_in.decode.input_reg_c, d_in.insn, r_in.read3_data); - decoded_reg_o := decode_output_reg (d_in.decode.output_reg_a, d_in.insn, d_in.ispro); + if d_in.valid = '1' then + v.prev_sgl := dc2.sgl_pipe; + v.sgl_pipe := d_in.decode.sgl_pipe; + end if; - if d_in.decode.lr = '1' then - v.e.lr := insn_lk(d_in.insn); - -- b and bc have even major opcodes; bcreg is considered absolute - v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); - end if; - op := d_in.decode.insn_type; - - if d_in.decode.repeat /= NONE then - v.e.repeat := '1'; - v.e.second := r.repeat; - case d_in.decode.repeat is - when DRSE => - -- do RS|1,RS for LE; RS,RS|1 for BE - if r.repeat = d_in.big_endian then - decoded_reg_c.reg(0) := '1'; + v.e.input_cr := d_in.decode.input_cr; + v.e.output_cr := d_in.decode.output_cr; + + -- Work out whether XER SO/OV/OV32 bits are set + -- or used by this instruction + v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); + v.e.output_xer := d_in.decode.output_carry; + v.input_ov := d_in.decode.output_carry; + v.output_ov 
:= '0'; + if d_in.decode.input_carry = OV then + v.input_ov := '1'; + v.output_ov := '1'; + end if; + if v.e.rc = '1' and d_in.decode.facility /= FPU then + v.input_ov := '1'; + end if; + case d_in.decode.insn_type is + when OP_ADD | OP_MUL_L64 | OP_DIV | OP_DIVE => + -- OE field is valid in OP_ADD/OP_MUL_L64 with major opcode 31 only + if d_in.insn(31 downto 26) = "011111" and insn_oe(d_in.insn) = '1' then + v.e.oe := '1'; + v.e.output_xer := '1'; + v.output_ov := '1'; + v.input_ov := '1'; -- need SO state if setting OV to 0 end if; - when DRTE => - -- do RT|1,RT for LE; RT,RT|1 for BE - if r.repeat = d_in.big_endian then - decoded_reg_o.reg(0) := '1'; + when OP_MFSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.input_ov := '1'; end if; - when DUPD => - -- update-form loads, 2nd instruction writes RA - if r.repeat = '1' then - decoded_reg_o.reg := decoded_reg_a.reg; + when OP_MTSPR => + if decode_spr_num(d_in.insn) = SPR_XER then + v.e.output_xer := '1'; + v.output_ov := '1'; end if; + when OP_CMP | OP_MCRXRX => + v.input_ov := '1'; when others => end case; - elsif v.e.lr = '1' and decoded_reg_a.reg_valid = '1' then - -- bcl/bclrl/bctarl that needs to write both CTR and LR has to be doubled - v.e.repeat := '1'; - v.e.second := r.repeat; - -- first one does CTR, second does LR - decoded_reg_o.reg(0) := not r.repeat; - end if; - r_out.read1_enable <= decoded_reg_a.reg_valid and d_in.valid; - r_out.read1_reg <= decoded_reg_a.reg; - r_out.read2_enable <= decoded_reg_b.reg_valid and d_in.valid; - r_out.read2_reg <= decoded_reg_b.reg; - r_out.read3_enable <= decoded_reg_c.reg_valid and d_in.valid; - r_out.read3_reg <= decoded_reg_c.reg; - - case d_in.decode.length is - when is1B => - length := "0001"; - when is2B => - length := "0010"; - when is4B => - length := "0100"; - when is8B => - length := "1000"; - when NONE => - length := "0000"; - end case; + v.reg_a_valid := decoded_reg_a.reg_valid; + v.reg_b_valid := decoded_reg_b.reg_valid; + v.reg_c_valid := 
decoded_reg_c.reg_valid; + v.reg_o_valid := decoded_reg_o.reg_valid; - -- execute unit - v.e.nia := d_in.nia; - v.e.unit := d_in.decode.unit; - v.e.fac := d_in.decode.facility; - v.e.instr_tag := instr_tag; - v.e.read_reg1 := decoded_reg_a.reg; - v.e.read_reg2 := decoded_reg_b.reg; - v.e.write_reg := decoded_reg_o.reg; - v.e.write_reg_enable := decoded_reg_o.reg_valid; - v.e.rc := decode_rc(d_in.decode.rc, d_in.insn); - v.e.xerc := c_in.read_xerc_data; - v.e.invert_a := d_in.decode.invert_a; - v.e.addm1 := '0'; - v.e.insn_type := op; - v.e.invert_out := d_in.decode.invert_out; - v.e.input_carry := d_in.decode.input_carry; - v.e.output_carry := d_in.decode.output_carry; - v.e.is_32bit := d_in.decode.is_32bit; - v.e.is_signed := d_in.decode.is_signed; - v.e.insn := d_in.insn; - v.e.data_len := length; - v.e.byte_reverse := d_in.decode.byte_reverse; - v.e.sign_extend := d_in.decode.sign_extend; - v.e.update := d_in.decode.update; - v.e.reserve := d_in.decode.reserve; - v.e.br_pred := d_in.br_pred; - v.e.result_sel := result_select(op); - v.e.sub_select := subresult_select(op); - if op = OP_BC or op = OP_BCREG then - if d_in.insn(23) = '0' and r.repeat = '0' and - not (d_in.decode.insn_type = OP_BCREG and d_in.insn(10) = '0') then - -- decrement CTR if BO(2) = 0 and not bcctr - v.e.addm1 := '1'; - v.e.result_sel := "000"; -- select adder output + if d_in.decode.lr = '1' then + v.e.lr := insn_lk(d_in.insn); + -- b and bc have even major opcodes; bcreg is considered absolute + v.e.br_abs := insn_aa(d_in.insn) or d_in.insn(26); end if; - end if; + op := d_in.decode.insn_type; + + -- Does this instruction decrement CTR? + -- bc, bclr, bctar with BO(2) = 0 do, but not bcctr. + decctr := '0'; + if d_in.insn(23) = '0' and + (op = OP_BC or + (op = OP_BCREG and not (d_in.insn(10) = '1' and d_in.insn(6) = '0'))) then + decctr := '1'; + end if; + v.e.dec_ctr := decctr; - -- See if any of the operands can get their value via the bypass path. 
- case gpr_a_bypass is - when '1' => - v.e.read_data1 := execute_bypass.data; - when others => - v.e.read_data1 := decoded_reg_a.data; - end case; - case gpr_b_bypass is - when '1' => - v.e.read_data2 := execute_bypass.data; - when others => - v.e.read_data2 := decoded_reg_b.data; - end case; - case gpr_c_bypass is - when '1' => - v.e.read_data3 := execute_bypass.data; - when others => - v.e.read_data3 := decoded_reg_c.data; - end case; + v.repeat := d_in.decode.repeat; + if d_in.decode.repeat /= NONE then + v.e.repeat := '1'; + end if; + + v.e.spr_select := d_in.spr_info; + + if decctr = '1' then + -- read and write CTR + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + v.e.ramspr_wraddr := RAMSPR_CTR; + v.e.ramspr_write_odd := '1'; + sprs_busy := '1'; + end if; + if v.e.lr = '1' then + -- write LR + v.e.ramspr_wraddr := RAMSPR_LR; + v.e.ramspr_write_even := '1'; + end if; + + case op is + when OP_BCREG => + if d_in.insn(10) = '0' then + v.e.ramspr_even_rdaddr := RAMSPR_LR; + elsif d_in.insn(6) = '0' then + v.e.ramspr_odd_rdaddr := RAMSPR_CTR; + v.e.ramspr_rd_odd := '1'; + else + v.e.ramspr_even_rdaddr := RAMSPR_TAR; + end if; + sprs_busy := '1'; + when OP_MFSPR => + v.e.ramspr_even_rdaddr := d_in.ram_spr.index; + v.e.ramspr_odd_rdaddr := d_in.ram_spr.index; + v.e.ramspr_rd_odd := d_in.ram_spr.isodd; + v.e.spr_is_ram := d_in.ram_spr.valid; + sprs_busy := d_in.ram_spr.valid; + when OP_MTSPR => + v.e.ramspr_wraddr := d_in.ram_spr.index; + v.e.ramspr_write_even := d_in.ram_spr.valid and not d_in.ram_spr.isodd; + v.e.ramspr_write_odd := d_in.ram_spr.valid and d_in.ram_spr.isodd; + v.e.spr_is_ram := d_in.ram_spr.valid; + when OP_RFID => + v.e.ramspr_even_rdaddr := RAMSPR_SRR0; + v.e.ramspr_odd_rdaddr := RAMSPR_SRR1; + sprs_busy := '1'; + when others => + end case; + v.read_rspr := sprs_busy and d_in.valid; + + case d_in.decode.length is + when is1B => + length := "0001"; + when is2B => + length := "0010"; + when is4B => + length := "0100"; + when is8B => + length := "1000"; + 
when NONE => + length := "0000"; + end case; + + -- execute unit + v.e.nia := d_in.nia; + v.e.unit := d_in.decode.unit; + v.e.fac := d_in.decode.facility; + v.e.read_reg1 := d_in.reg_a; + v.e.read_reg2 := d_in.reg_b; + v.e.read_reg3 := d_in.reg_c; + v.e.write_reg := decoded_reg_o.reg; + v.e.write_reg_enable := decoded_reg_o.reg_valid; + v.e.invert_a := d_in.decode.invert_a; + v.e.insn_type := op; + v.e.invert_out := d_in.decode.invert_out; + v.e.input_carry := d_in.decode.input_carry; + v.e.output_carry := d_in.decode.output_carry; + v.e.is_32bit := d_in.decode.is_32bit; + v.e.is_signed := d_in.decode.is_signed; + v.e.insn := d_in.insn; + v.e.data_len := length; + v.e.byte_reverse := d_in.decode.byte_reverse; + v.e.sign_extend := d_in.decode.sign_extend; + v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; + v.e.br_pred := d_in.br_pred; + v.e.result_sel := result_select(op); + v.e.sub_select := subresult_select(op); + if op = OP_MFSPR then + if d_in.ram_spr.valid = '1' then + v.e.result_sel := "101"; -- ramspr_result + elsif d_in.spr_info.valid = '0' then + -- Privileged mfspr to invalid/unimplemented SPR numbers + -- writes the contents of RT back to RT (i.e. it's a no-op) + v.e.result_sel := "001"; -- logical_result + end if; + end if; - v.e.cr := c_in.read_cr_data; - if cr_bypass = '1' then - v.e.cr := execute_cr_bypass.data; + elsif dc2.e.valid = '1' then + -- dc2.busy = 1 and dc2.e.valid = 1, thus this must be a repeated instruction. 
+ -- Set up for the second iteration (if deferred = 1 this will all be ignored) + v.e.second := '1'; + -- DUPD is the only possibility here: + -- update-form loads, 2nd instruction writes RA + v.e.write_reg := dc2.e.read_reg1; end if; -- issue control - control_valid_in <= d_in.valid; - control_sgl_pipe <= d_in.decode.sgl_pipe; + control_valid_in <= valid_in; + control_serialize <= v.sgl_pipe or v.prev_sgl; - gpr_write_valid <= v.e.write_reg_enable; - gpr_write <= decoded_reg_o.reg; + gpr_write_valid <= v.reg_o_valid; + gpr_write <= v.e.write_reg; - gpr_a_read_valid <= decoded_reg_a.reg_valid; - gpr_a_read <= decoded_reg_a.reg; + gpr_a_read_valid <= v.reg_a_valid; + gpr_a_read <= v.e.read_reg1; - gpr_b_read_valid <= decoded_reg_b.reg_valid; - gpr_b_read <= decoded_reg_b.reg; + gpr_b_read_valid <= v.reg_b_valid; + gpr_b_read <= v.e.read_reg2; - gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= decoded_reg_c.reg; + gpr_c_read_valid <= v.reg_c_valid; + gpr_c_read <= v.e.read_reg3; - cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); + cr_write_valid <= v.e.output_cr or v.e.rc; -- Since ops that write CR only write some of the fields, -- any op that writes CR effectively also reads it. - cr_read_valid <= cr_write_valid or d_in.decode.input_cr; + cr_read_valid <= cr_write_valid or v.e.input_cr; - v.e.valid := control_valid_out; - if control_valid_out = '1' then - v.repeat := v.e.repeat and not r.repeat; + ov_read_valid <= v.input_ov; + ov_write_valid <= v.output_ov; + + -- See if any of the operands can get their value via the bypass path. 
+ if dc2.busy = '0' or gpr_a_bypass /= "00" then + case gpr_a_bypass is + when "01" => + v.e.read_data1 := execute_bypass.data; + when "10" => + v.e.read_data1 := execute2_bypass.data; + when "11" => + v.e.read_data1 := writeback_bypass.data; + when others => + if decoded_reg_a.reg_valid = '1' then + v.e.read_data1 := r_in.read1_data; + else + v.e.read_data1 := decoded_reg_a.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_b_bypass /= "00" then + case gpr_b_bypass is + when "01" => + v.e.read_data2 := execute_bypass.data; + when "10" => + v.e.read_data2 := execute2_bypass.data; + when "11" => + v.e.read_data2 := writeback_bypass.data; + when others => + if decoded_reg_b.reg_valid = '1' then + v.e.read_data2 := r_in.read2_data; + else + v.e.read_data2 := decoded_reg_b.data; + end if; + end case; + end if; + if dc2.busy = '0' or gpr_c_bypass /= "00" then + case gpr_c_bypass is + when "01" => + v.e.read_data3 := execute_bypass.data; + when "10" => + v.e.read_data3 := execute2_bypass.data; + when "11" => + v.e.read_data3 := writeback_bypass.data; + when others => + if decoded_reg_c.reg_valid = '1' then + v.e.read_data3 := r_in.read3_data; + else + v.e.read_data3 := decoded_reg_c.data; + end if; + end case; end if; - stall_out <= control_stall_out or v.repeat; + case cr_bypass is + when "10" => + v.e.cr := execute_cr_bypass.data; + when "11" => + v.e.cr := execute2_cr_bypass.data; + when others => + v.e.cr := c_in.read_cr_data; + end case; + v.e.xerc := c_in.read_xerc_data; - if rst = '1' or flush_in = '1' then - v.e := Decode2ToExecute1Init; - v.repeat := '0'; + v.e.valid := control_valid_out; + v.e.instr_tag := instr_tag; + v.busy := valid_in and (not control_valid_out or (v.e.repeat and not v.e.second)); + + stall_out <= dc2.busy or deferred; + + v.e.dbg_spr_access := dbg_spr_req and not v.read_rspr; + if v.e.dbg_spr_access = '1' then + v.e.ramspr_even_rdaddr := to_integer(unsigned(dbg_spr_addr(3 downto 1))); + v.e.ramspr_odd_rdaddr := 
to_integer(unsigned(dbg_spr_addr(3 downto 1))); + v.e.ramspr_rd_odd := dbg_spr_addr(0); end if; -- Update registers - rin <= v; + dc2in <= v; -- Update outputs - e_out <= r.e; + e_out <= dc2.e; end process; d2_log: if LOG_LENGTH > 0 generate @@ -570,13 +689,13 @@ begin dec2_log : process(clk) begin if rising_edge(clk) then - log_data <= r.e.nia(5 downto 2) & - r.e.valid & + log_data <= dc2.e.nia(5 downto 2) & + dc2.e.valid & stopped_out & stall_out & - gpr_a_bypass & - gpr_b_bypass & - gpr_c_bypass; + (gpr_a_bypass(1) xor gpr_a_bypass(0)) & + (gpr_b_bypass(1) xor gpr_b_bypass(0)) & + (gpr_c_bypass(1) xor gpr_c_bypass(0)); end if; end process; log_out <= log_data; diff --git a/decode_types.vhdl b/decode_types.vhdl index 885cc91d7..9ee329d75 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -22,11 +22,11 @@ package decode_types is OP_BCD, OP_ADDG6S, OP_FETCH_FAILED ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, CIA, FRA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, - CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); + CONST_DXHI4, CONST_DS, CONST_DQ, CONST_M1, CONST_SH, CONST_SH32, FRB); type input_reg_c_t is (NONE, RS, RCR, FRC, FRS); - type output_reg_a_t is (NONE, RT, RA, SPR, FRT); + type output_reg_a_t is (NONE, RT, RA, FRT); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, OV, ONE); @@ -53,8 +53,6 @@ package decode_types is type length_t is (NONE, is1B, is2B, is4B, is8B); type repeat_t is (NONE, -- instruction is not repeated - DRSE, -- double RS, endian twist - DRTE, -- double RT, endian twist DUPD); -- update-form load type decode_rom_t is record diff --git a/divider.vhdl b/divider.vhdl index 3f9b31280..55e3c5d2c 100644 --- a/divider.vhdl +++ b/divider.vhdl @@ -36,7 +36,7 @@ begin divider_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then + if rst = '1' or d_in.flush = '1' then 
dend <= (others => '0'); div <= (others => '0'); quot <= (others => '0'); diff --git a/execute1.vhdl b/execute1.vhdl index fc8ce2985..16a97a372 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -12,6 +12,7 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( + SIM : boolean := false; EX1_BYPASS : boolean := true; HAS_FPU : boolean := true; HAS_SHORT_MULT : boolean := false; @@ -31,7 +32,7 @@ entity execute1 is fp_in : in FPUToExecute1Type; ext_irq_in : std_ulogic; - interrupt_in : std_ulogic; + interrupt_in : WritebackToExecute1Type; -- asynchronous l_out : out Execute1ToLoadstore1Type; @@ -40,8 +41,10 @@ entity execute1 is e_out : out Execute1ToWritebackType; bypass_data : out bypass_data_t; bypass_cr_data : out cr_bypass_data_t; + bypass2_data : out bypass_data_t; + bypass2_cr_data : out cr_bypass_data_t; - dbg_msr_out : out std_ulogic_vector(63 downto 0); + dbg_ctrl_out : out ctrl_t; icache_inval : out std_ulogic; terminate_out : out std_ulogic; @@ -52,6 +55,16 @@ entity execute1 is dc_events : in DcacheEventType; ic_events : in IcacheEventType; + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_ack : out std_ulogic; + dbg_spr_addr : in std_ulogic_vector(7 downto 0); + dbg_spr_data : out std_ulogic_vector(63 downto 0); + + -- debug + sim_dump : in std_ulogic; + sim_dump_done : out std_ulogic; + log_out : out std_ulogic_vector(14 downto 0); log_rd_addr : out std_ulogic_vector(31 downto 0); log_rd_data : in std_ulogic_vector(63 downto 0); @@ -60,38 +73,97 @@ entity execute1 is end entity execute1; architecture behaviour of execute1 is - type reg_type is record + type side_effect_type is record + terminate : std_ulogic; + icache_inval : std_ulogic; + write_msr : std_ulogic; + write_xerlow : std_ulogic; + write_dec : std_ulogic; + write_cfar : std_ulogic; + write_loga : std_ulogic; + inc_loga : std_ulogic; + write_pmuspr : std_ulogic; + ramspr_write_even : std_ulogic; + ramspr_write_odd : std_ulogic; + end record; + constant 
side_effect_init : side_effect_type := (others => '0'); + + type actions_type is record + e : Execute1ToWritebackType; + se : side_effect_type; + complete : std_ulogic; + exception : std_ulogic; + trap : std_ulogic; + new_msr : std_ulogic_vector(63 downto 0); + take_branch : std_ulogic; + direct_branch : std_ulogic; + start_mul : std_ulogic; + start_div : std_ulogic; + do_trace : std_ulogic; + fp_intr : std_ulogic; + res2_sel : std_ulogic_vector(1 downto 0); + bypass_valid : std_ulogic; + ramspr_odd_data : std_ulogic_vector(63 downto 0); + end record; + constant actions_type_init : actions_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + new_msr => (others => '0'), res2_sel => "00", + ramspr_odd_data => 64x"0", others => '0'); + + type reg_stage1_type is record e : Execute1ToWritebackType; - cur_instr : Decode2ToExecute1Type; + se : side_effect_type; busy: std_ulogic; - terminate: std_ulogic; - intr_pending : std_ulogic; fp_exception_next : std_ulogic; trace_next : std_ulogic; prev_op : insn_type_t; - br_taken : std_ulogic; + oe : std_ulogic; + mul_select : std_ulogic_vector(1 downto 0); + res2_sel : std_ulogic_vector(1 downto 0); + spr_select : spr_id; + pmu_spr_num : std_ulogic_vector(4 downto 0); mul_in_progress : std_ulogic; mul_finish : std_ulogic; div_in_progress : std_ulogic; - cntz_in_progress : std_ulogic; no_instr_avail : std_ulogic; instr_dispatch : std_ulogic; ext_interrupt : std_ulogic; taken_branch_event : std_ulogic; br_mispredict : std_ulogic; - log_addr_spr : std_ulogic_vector(31 downto 0); + msr : std_ulogic_vector(63 downto 0); + xerc : xer_common_t; + xerc_valid : std_ulogic; + ramspr_wraddr : ramspr_index; + ramspr_odd_data : std_ulogic_vector(63 downto 0); end record; - constant reg_type_init : reg_type := - (e => Execute1ToWritebackInit, - cur_instr => Decode2ToExecute1Init, - busy => '0', terminate => '0', intr_pending => '0', - fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0', - 
mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0', + constant reg_stage1_type_init : reg_stage1_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + busy => '0', + fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, + oe => '0', mul_select => "00", res2_sel => "00", + spr_select => spr_id_init, pmu_spr_num => 5x"0", + mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', no_instr_avail => '0', instr_dispatch => '0', ext_interrupt => '0', taken_branch_event => '0', br_mispredict => '0', - others => (others => '0')); + msr => 64x"0", + xerc => xerc_init, xerc_valid => '0', + ramspr_wraddr => 0, ramspr_odd_data => 64x"0"); + + type reg_stage2_type is record + e : Execute1ToWritebackType; + se : side_effect_type; + ext_interrupt : std_ulogic; + taken_branch_event : std_ulogic; + br_mispredict : std_ulogic; + log_addr_spr : std_ulogic_vector(31 downto 0); + end record; + constant reg_stage2_type_init : reg_stage2_type := + (e => Execute1ToWritebackInit, se => side_effect_init, + log_addr_spr => 32x"0", others => '0'); - signal r, rin : reg_type; + signal ex1, ex1in : reg_stage1_type; + signal ex2, ex2in : reg_stage2_type; + signal actions : actions_type; signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0); signal cr_in : std_ulogic_vector(31 downto 0); @@ -99,8 +171,8 @@ architecture behaviour of execute1 is signal mshort_p : std_ulogic_vector(31 downto 0) := (others => '0'); signal valid_in : std_ulogic; - signal ctrl: ctrl_t; - signal ctrl_tmp: ctrl_t; + signal ctrl: ctrl_t := ctrl_t_init; + signal ctrl_tmp: ctrl_t := ctrl_t_init; signal right_shift, rot_clear_left, rot_clear_right: std_ulogic; signal rot_sign_ext: std_ulogic; signal rotator_result: std_ulogic_vector(63 downto 0); @@ -112,9 +184,10 @@ architecture behaviour of execute1 is signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); signal muldiv_result: 
std_ulogic_vector(63 downto 0); + signal shortmul_result: std_ulogic_vector(63 downto 0); signal spr_result: std_ulogic_vector(63 downto 0); signal next_nia : std_ulogic_vector(63 downto 0); - signal current: Decode2ToExecute1Type; + signal s1_sel : std_ulogic_vector(2 downto 0); signal carry_32 : std_ulogic; signal carry_64 : std_ulogic; @@ -132,7 +205,7 @@ architecture behaviour of execute1 is -- divider signals signal x_to_divider: Execute1ToDividerType; - signal divider_to_x: DividerToExecute1Type; + signal divider_to_x: DividerToExecute1Type := DividerToExecute1Init; -- random number generator signals signal random_raw : std_ulogic_vector(63 downto 0); @@ -147,6 +220,22 @@ architecture behaviour of execute1 is signal exception_log : std_ulogic; signal irq_valid_log : std_ulogic; + -- SPR-related signals + type ramspr_half_t is array(ramspr_index) of std_ulogic_vector(63 downto 0); + signal even_sprs : ramspr_half_t := (others => (others => '0')); + signal odd_sprs : ramspr_half_t := (others => (others => '0')); + signal ramspr_even : std_ulogic_vector(63 downto 0); + signal ramspr_odd : std_ulogic_vector(63 downto 0); + signal ramspr_result : std_ulogic_vector(63 downto 0); + signal ramspr_rd_odd : std_ulogic; + signal ramspr_wr_addr : ramspr_index; + signal ramspr_even_wr_data : std_ulogic_vector(63 downto 0); + signal ramspr_even_wr_enab : std_ulogic; + signal ramspr_odd_wr_data : std_ulogic_vector(63 downto 0); + signal ramspr_odd_wr_enab : std_ulogic; + + signal stage2_stall : std_ulogic; + type privilege_level is (USER, SUPER); type op_privilege_array is array(insn_type_t) of privilege_level; constant op_privilege: op_privilege_array := ( @@ -231,6 +320,18 @@ architecture behaviour of execute1 is return msr_out; end; + function intr_srr1(msr: std_ulogic_vector; flags: std_ulogic_vector) + return std_ulogic_vector is + variable srr1: std_ulogic_vector(63 downto 0); + begin + srr1(63 downto 31) := msr(63 downto 31); + srr1(30 downto 27) := flags(14 downto 
11); + srr1(26 downto 22) := msr(26 downto 22); + srr1(21 downto 16) := flags(5 downto 0); + srr1(15 downto 0) := msr(15 downto 0); + return srr1; + end; + -- Work out whether a signed value fits into n bits, -- that is, see if it is in the range -2^(n-1) .. 2^(n-1) - 1 function fits_in_n_bits(val: std_ulogic_vector; n: integer) return boolean is @@ -249,6 +350,13 @@ architecture behaviour of execute1 is return x(n - 1) = '1'; end; + function assemble_xer(xerc: xer_common_t; xer_low: std_ulogic_vector) + return std_ulogic_vector is + begin + return 32x"0" & xerc.so & xerc.ov & xerc.ca & "000000000" & + xerc.ov32 & xerc.ca32 & xer_low(17 downto 0); + end; + -- Tell vivado to keep the hierarchy for the random module so that the -- net names in the xdc file match. attribute keep_hierarchy : string; @@ -287,6 +395,7 @@ begin port map ( clk => clk, rs => c_in, + stall => stage2_stall, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, do_popcnt => do_popcnt, @@ -301,13 +410,15 @@ begin m_out => multiply_to_x ); - divider_0: entity work.divider - port map ( - clk => clk, - rst => rst, - d_in => x_to_divider, - d_out => divider_to_x - ); + divider_0: if not HAS_FPU generate + div_0: entity work.divider + port map ( + clk => clk, + rst => rst, + d_in => x_to_divider, + d_out => divider_to_x + ); + end generate; random_0: entity work.random port map ( @@ -336,8 +447,8 @@ begin ); end generate; - dbg_msr_out <= ctrl.msr; - log_rd_addr <= r.log_addr_spr; + dbg_ctrl_out <= ctrl; + log_rd_addr <= ex2.log_addr_spr; a_in <= e_in.read_data1; b_in <= e_in.read_data2; @@ -356,44 +467,119 @@ begin dtlb_miss_resolved => dc_events.dtlb_miss_resolved, icache_miss => ic_events.icache_miss, itlb_miss_resolved => ic_events.itlb_miss_resolved, - no_instr_avail => r.no_instr_avail, - dispatch => r.instr_dispatch, - ext_interrupt => r.ext_interrupt, - br_taken_complete => r.taken_branch_event, - br_mispredict => r.br_mispredict, + no_instr_avail => ex1.no_instr_avail, + dispatch => 
ex1.instr_dispatch, + ext_interrupt => ex2.ext_interrupt, + br_taken_complete => ex2.taken_branch_event, + br_mispredict => ex2.br_mispredict, others => '0'); - x_to_pmu.nia <= current.nia; + x_to_pmu.nia <= e_in.nia; x_to_pmu.addr <= (others => '0'); x_to_pmu.addr_v <= '0'; - x_to_pmu.spr_num <= e_in.insn(20 downto 16); - x_to_pmu.spr_val <= c_in; + x_to_pmu.spr_num <= ex1.pmu_spr_num; + x_to_pmu.spr_val <= ex1.e.write_data; x_to_pmu.run <= '1'; - -- XER forwarding. To avoid having to track XER hazards, we use - -- the previously latched value. Since the XER common bits - -- (SO, OV[32] and CA[32]) are only modified by instructions that are - -- handled here, we can just forward the result being sent to - -- writeback. - xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc; - - with e_in.unit select busy_out <= - l_in.busy or r.busy or fp_in.busy when LDST, - l_in.busy or l_in.in_progress or r.busy or fp_in.busy when others; - - valid_in <= e_in.valid and not busy_out and not flush_in; - - terminate_out <= r.terminate; + -- XER forwarding. The CA and CA32 bits are only modified by instructions + -- that are handled here, so for them we can just use the result most + -- recently sent to writeback, unless a pipeline flush has happened in the + -- meantime. + -- Hazards for SO/OV/OV32 are handled by control.vhdl as there may be other + -- units writing to them. No forwarding is done because performance of + -- instructions that alter them is not considered significant. + xerc_in.so <= e_in.xerc.so; + xerc_in.ov <= e_in.xerc.ov; + xerc_in.ov32 <= e_in.xerc.ov32; + xerc_in.ca <= ex1.xerc.ca when ex1.xerc_valid = '1' else e_in.xerc.ca; + xerc_in.ca32 <= ex1.xerc.ca32 when ex1.xerc_valid = '1' else e_in.xerc.ca32; + + -- N.B. the busy signal from each source includes the + -- stage2 stall from that source in it. 
+ busy_out <= l_in.busy or ex1.busy or fp_in.busy; + + valid_in <= e_in.valid and not (busy_out or flush_in or ex1.e.redirect or ex1.e.interrupt); + + -- SPRs stored in two small RAM arrays (two so that we can read and write + -- two SPRs in each cycle). + + ramspr_read: process(all) + variable even_rd_data, odd_rd_data : std_ulogic_vector(63 downto 0); + variable wr_addr : ramspr_index; + variable even_wr_enab, odd_wr_enab : std_ulogic; + variable even_wr_data, odd_wr_data : std_ulogic_vector(63 downto 0); + variable doit : std_ulogic; + begin + -- Read address mux and async RAM reading + even_rd_data := even_sprs(e_in.ramspr_even_rdaddr); + odd_rd_data := odd_sprs(e_in.ramspr_odd_rdaddr); + + -- Write address and data muxes + doit := ex1.e.valid and not stage2_stall and not flush_in; + even_wr_enab := (ex1.se.ramspr_write_even and doit) or interrupt_in.intr; + odd_wr_enab := (ex1.se.ramspr_write_odd and doit) or interrupt_in.intr; + if interrupt_in.intr = '1' then + wr_addr := RAMSPR_SRR0; + else + wr_addr := ex1.ramspr_wraddr; + end if; + if interrupt_in.intr = '1' then + even_wr_data := ex2.e.last_nia; + odd_wr_data := intr_srr1(ctrl.msr, interrupt_in.srr1); + else + even_wr_data := ex1.e.write_data; + odd_wr_data := ex1.ramspr_odd_data; + end if; + ramspr_wr_addr <= wr_addr; + ramspr_even_wr_data <= even_wr_data; + ramspr_even_wr_enab <= even_wr_enab; + ramspr_odd_wr_data <= odd_wr_data; + ramspr_odd_wr_enab <= odd_wr_enab; + + -- SPR RAM read with write data bypass + -- We assume no instruction executes in the cycle immediately following + -- an interrupt, so we don't need to bypass interrupt data + if ex1.se.ramspr_write_even = '1' and e_in.ramspr_even_rdaddr = ex1.ramspr_wraddr then + ramspr_even <= ex1.e.write_data; + else + ramspr_even <= even_rd_data; + end if; + if ex1.se.ramspr_write_odd = '1' and e_in.ramspr_odd_rdaddr = ex1.ramspr_wraddr then + ramspr_odd <= ex1.ramspr_odd_data; + else + ramspr_odd <= odd_rd_data; + end if; + if e_in.ramspr_rd_odd = 
'0' then + ramspr_result <= ramspr_even; + else + ramspr_result <= ramspr_odd; + end if; + end process; - current <= e_in when r.busy = '0' else r.cur_instr; + ramspr_write: process(clk) + begin + if rising_edge(clk) then + if ramspr_even_wr_enab = '1' then + even_sprs(ramspr_wr_addr) <= ramspr_even_wr_data; + report "writing even spr " & integer'image(ramspr_wr_addr) & " data=" & + to_hstring(ramspr_even_wr_data); + end if; + if ramspr_odd_wr_enab = '1' then + odd_sprs(ramspr_wr_addr) <= ramspr_odd_wr_data; + report "writing odd spr " & integer'image(ramspr_wr_addr) & " data=" & + to_hstring(ramspr_odd_wr_data); + end if; + end if; + end process; - -- Result mux - with current.result_sel select alu_result <= + -- First stage result mux + s1_sel <= e_in.result_sel when ex1.busy = '0' else "100"; + with s1_sel select alu_result <= adder_result when "000", logical_result when "001", rotator_result when "010", - muldiv_result when "011", - countbits_result when "100", - spr_result when "101", + shortmul_result when "011", + muldiv_result when "100", + ramspr_result when "101", next_nia when "110", misc_result when others; @@ -401,27 +587,50 @@ begin begin if rising_edge(clk) then if rst = '1' then - r <= reg_type_init; - ctrl.tb <= (others => '0'); - ctrl.dec <= (others => '0'); - ctrl.cfar <= (others => '0'); + ex1 <= reg_stage1_type_init; + ex2 <= reg_stage2_type_init; + ctrl <= ctrl_t_init; ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); + ex1.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0'); else - r <= rin; + ex1 <= ex1in; + ex2 <= ex2in; ctrl <= ctrl_tmp; if valid_in = '1' then report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) & - " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) & - " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid); + " wr=" & to_hstring(ex1in.e.write_reg) & " we=" & std_ulogic'image(ex1in.e.write_enable) & + " tag=" & 
integer'image(ex1in.e.instr_tag.tag) & std_ulogic'image(ex1in.e.instr_tag.valid); + end if; + -- We mustn't get stalled on a cycle where execute2 is + -- completing an instruction or generating an interrupt + if ex2.e.valid = '1' or ex2.e.interrupt = '1' then + assert stage2_stall = '0' severity failure; end if; end if; end if; end process; - -- Data path for integer instructions + ex_dbg_spr: process(clk) + begin + if rising_edge(clk) then + if rst = '0' and dbg_spr_req = '1' then + if e_in.dbg_spr_access = '1' and dbg_spr_ack = '0' then + if dbg_spr_addr(7) = '1' then + dbg_spr_data <= ramspr_result; + else + dbg_spr_data <= assemble_xer(xerc_in, ctrl.xer_low); + end if; + dbg_spr_ack <= '1'; + end if; + else + dbg_spr_ack <= '0'; + end if; + end if; + end process; + + -- Data path for integer instructions (first execute stage) execute1_dp: process(all) variable a_inv : std_ulogic_vector(63 downto 0); - variable b_or_m1 : std_ulogic_vector(63 downto 0); variable sum_with_carry : std_ulogic_vector(64 downto 0); variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); @@ -456,12 +665,7 @@ begin else a_inv := not a_in; end if; - if e_in.addm1 = '0' then - b_or_m1 := b_in; - else - b_or_m1 := (others => '1'); - end if; - sum_with_carry := ppc_adde(a_inv, b_or_m1, + sum_with_carry := ppc_adde(a_inv, b_in, decode_input_carry(e_in.input_carry, xerc_in)); adder_result <= sum_with_carry(63 downto 0); carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32); @@ -501,6 +705,7 @@ begin if e_in.insn_type = OP_MOD then x_to_divider.is_modulus <= '1'; end if; + x_to_divider.flush <= flush_in; addend := (others => '0'); if e_in.insn(26) = '0' then @@ -540,13 +745,10 @@ begin x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0)); end if; - case current.sub_select(1 downto 0) is + shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); + case ex1.mul_select is when "00" => - if HAS_SHORT_MULT and r.mul_in_progress = '0' then 
- muldiv_result <= std_ulogic_vector(resize(signed(mshort_p), 64)); - else - muldiv_result <= multiply_to_x.result(63 downto 0); - end if; + muldiv_result <= multiply_to_x.result(63 downto 0); when "01" => muldiv_result <= multiply_to_x.result(127 downto 64); when "10" => @@ -557,7 +759,7 @@ begin end case; -- Compute misc_result - case current.sub_select is + case e_in.sub_select is when "000" => misc_result <= (others => '0'); when "001" => @@ -599,7 +801,7 @@ begin misc_result <= darn; when "100" => -- mfmsr - misc_result <= ctrl.msr; + misc_result <= ex1.msr; when "101" => if e_in.insn(20) = '0' then -- mfcr @@ -679,7 +881,7 @@ begin bf := insn_bf(e_in.insn); crnum := to_integer(unsigned(bf)); newcrf := (others => '0'); - case current.sub_select is + case e_in.sub_select is when "000" => -- CMP and CMPL instructions if e_in.is_signed = '1' then @@ -692,7 +894,7 @@ begin when "010" => newcrf := ppc_cmpeqb(a_in, b_in); when "011" => - if current.insn(1) = '1' then + if e_in.insn(1) = '1' then -- CR logical instructions j := (7 - crnum) * 4; newcrf := cr_in(j + 3 downto j); @@ -723,7 +925,7 @@ begin newcrf := xerc_in.ov & xerc_in.ov32 & xerc_in.ca & xerc_in.ca32; when others => end case; - if current.insn_type = OP_MTCRF then + if e_in.insn_type = OP_MTCRF then if e_in.insn(20) = '0' then -- mtcrf write_cr_mask <= insn_fxm(e_in.insn); @@ -732,201 +934,97 @@ begin crnum := fxm_to_num(insn_fxm(e_in.insn)); write_cr_mask <= num_to_fxm(crnum); end if; - write_cr_data <= c_in(31 downto 0); - else + elsif e_in.output_cr = '1' then write_cr_mask <= num_to_fxm(crnum); - write_cr_data <= newcrf & newcrf & newcrf & newcrf & - newcrf & newcrf & newcrf & newcrf; + else + write_cr_mask <= (others => '0'); end if; + for i in 0 to 7 loop + if write_cr_mask(i) = '0' then + write_cr_data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); + elsif e_in.insn_type = OP_MTCRF then + write_cr_data(i*4 + 3 downto i*4) <= c_in(i*4 + 3 downto i*4); + else + write_cr_data(i*4 + 3 downto i*4) 
<= newcrf; + end if; + end loop; end process; - execute1_1: process(all) - variable v : reg_type; + execute1_actions: process(all) + variable v: actions_type; variable bo, bi : std_ulogic_vector(4 downto 0); - variable overflow : std_ulogic; - variable lv : Execute1ToLoadstore1Type; - variable irq_valid : std_ulogic; - variable exception : std_ulogic; variable illegal : std_ulogic; - variable is_branch : std_ulogic; - variable is_direct_branch : std_ulogic; - variable taken_branch : std_ulogic; - variable abs_branch : std_ulogic; - variable spr_val : std_ulogic_vector(63 downto 0); - variable do_trace : std_ulogic; - variable hold_wr_data : std_ulogic; - variable fv : Execute1ToFPUType; + variable privileged : std_ulogic; + variable slow_op : std_ulogic; + variable owait : std_ulogic; + variable srr1 : std_ulogic_vector(63 downto 0); begin - is_branch := '0'; - is_direct_branch := '0'; - taken_branch := '0'; - abs_branch := '0'; - hold_wr_data := '0'; - - v := r; - v.e := Execute1ToWritebackInit; - v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) & - not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF); + v := actions_type_init; + v.e.write_data := alu_result; + v.e.write_reg := e_in.write_reg; + v.e.write_enable := e_in.write_reg_enable; + v.e.rc := e_in.rc; + v.e.write_cr_data := write_cr_data; + v.e.write_cr_mask := write_cr_mask; + v.e.write_cr_enable := e_in.output_cr; + v.e.write_xerc_enable := e_in.output_xer; v.e.xerc := xerc_in; + v.new_msr := ex1.msr; + v.e.redir_mode := ex1.msr(MSR_IR) & not ex1.msr(MSR_PR) & + not ex1.msr(MSR_LE) & not ex1.msr(MSR_SF); + v.e.intr_vec := 16#700#; + v.e.mode_32bit := not ex1.msr(MSR_SF); + v.e.instr_tag := e_in.instr_tag; + v.e.last_nia := e_in.nia; + v.e.br_offset := 64x"4"; + + v.se.ramspr_write_even := e_in.ramspr_write_even; + v.se.ramspr_write_odd := e_in.ramspr_write_odd; + v.ramspr_odd_data := c_in; + if e_in.dec_ctr = '1' then + v.ramspr_odd_data := std_ulogic_vector(unsigned(ramspr_odd) - 1); + end if; - lv := 
Execute1ToLoadstore1Init; - fv := Execute1ToFPUInit; - - x_to_multiply.valid <= '0'; - x_to_divider.valid <= '0'; - v.mul_in_progress := '0'; - v.div_in_progress := '0'; - v.cntz_in_progress := '0'; - v.mul_finish := '0'; - v.ext_interrupt := '0'; - v.taken_branch_event := '0'; - v.br_mispredict := '0'; - - x_to_pmu.mfspr <= '0'; - x_to_pmu.mtspr <= '0'; - x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); - x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); - x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); - x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); - x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); - x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); - - spr_result <= (others => '0'); - spr_val := (others => '0'); - - ctrl_tmp <= ctrl; - -- FIXME: run at 512MHz not core freq - ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); - ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); - - irq_valid := ctrl.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); - - v.terminate := '0'; - icache_inval <= '0'; - v.busy := '0'; - - -- Next insn adder used in a couple of places - next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); - - -- rotator control signals - right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; - rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; - rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; - rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; - - do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + -- Note the difference between v.exception and v.trap: + -- v.exception signals a condition that prevents execution of the + -- instruction, and hence shouldn't depend on operand data, so as to + -- avoid timing chains through both data and control paths. + -- v.trap also means we want to generate an interrupt, but doesn't + -- cancel instruction execution (hence we need to avoid setting any + -- side-effect flags or write enables when generating a trap). 
+ -- With v.trap = 1 we will assert both ex1.e.valid and ex1.e.interrupt + -- to writeback, and it will complete the instruction and take + -- an interrupt. It is OK for v.trap to depend on operand data. illegal := '0'; - if r.intr_pending = '1' then - v.e.srr1 := r.e.srr1; - v.e.intr_vec := r.e.intr_vec; - end if; - if valid_in = '1' then - v.e.last_nia := e_in.nia; - else - v.e.last_nia := r.e.last_nia; - end if; + privileged := '0'; + slow_op := '0'; + owait := '0'; - v.e.mode_32bit := not ctrl.msr(MSR_SF); - v.e.instr_tag := current.instr_tag; - - do_trace := valid_in and ctrl.msr(MSR_SE); - if valid_in = '1' then - v.cur_instr := e_in; - v.prev_op := e_in.insn_type; + if ex1.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then + privileged := '1'; end if; - -- Determine if there is any interrupt to be taken - -- before/instead of executing this instruction - exception := r.intr_pending; - if valid_in = '1' and e_in.second = '0' and r.intr_pending = '0' then - if HAS_FPU and r.fp_exception_next = '1' then - -- This is used for FP-type program interrupts that - -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. - exception := '1'; - v.e.intr_vec := 16#700#; - v.e.srr1(47 - 43) := '1'; - v.e.srr1(47 - 47) := '1'; - elsif r.trace_next = '1' then - -- Generate a trace interrupt rather than executing the next instruction - -- or taking any asynchronous interrupt - exception := '1'; - v.e.intr_vec := 16#d00#; - v.e.srr1(47 - 33) := '1'; - if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or - r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then - v.e.srr1(47 - 35) := '1'; - elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then - v.e.srr1(47 - 36) := '1'; - end if; - - elsif irq_valid = '1' then - -- Don't deliver the interrupt until we have a valid instruction - -- coming in, so we have a valid NIA to put in SRR0. 
- if pmu_to_x.intr = '1' then - v.e.intr_vec := 16#f00#; - report "IRQ valid: PMU"; - elsif ctrl.dec(63) = '1' then - v.e.intr_vec := 16#900#; - report "IRQ valid: DEC"; - elsif ext_irq_in = '1' then - v.e.intr_vec := 16#500#; - report "IRQ valid: External"; - v.ext_interrupt := '1'; - end if; - exception := '1'; - - elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then - -- generate a program interrupt - exception := '1'; - v.e.intr_vec := 16#700#; - -- set bit 45 to indicate privileged instruction type interrupt - v.e.srr1(47 - 45) := '1'; - report "privileged instruction"; - - elsif not HAS_FPU and e_in.fac = FPU then - -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations - illegal := '1'; - - elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then - -- generate a floating-point unavailable interrupt - exception := '1'; - v.e.intr_vec := 16#800#; - report "FP unavailable interrupt"; - end if; - end if; - if exception = '1' and l_in.in_progress = '1' then - -- We can't send this interrupt to writeback yet because there are - -- still instructions in loadstore1 that haven't completed. - v.intr_pending := '1'; - v.busy := '1'; + if (not HAS_FPU and e_in.fac = FPU) or e_in.unit = NONE then + -- make lfd/stfd/lfs/stfs etc. 
illegal in no-FPU implementations + illegal := '1'; end if; - if l_in.interrupt = '1' then - v.intr_pending := '0'; - end if; - - v.no_instr_avail := not (e_in.valid or l_in.busy or l_in.in_progress or r.busy or fp_in.busy); - v.instr_dispatch := valid_in and not exception and not illegal; - - if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then - v.e.valid := '1'; - - case_0: case e_in.insn_type is + v.do_trace := ex1.msr(MSR_SE); + case_0: case e_in.insn_type is when OP_ILLEGAL => - -- we need two cycles to write srr0 and 1 - -- will need more when we have to write HEIR illegal := '1'; when OP_SC => -- check bit 1 of the instruction is 1 so we know this is sc; -- 0 would mean scv, so generate an illegal instruction interrupt - -- we need two cycles to write srr0 and 1 if e_in.insn(1) = '1' then - exception := '1'; + v.trap := '1'; v.e.intr_vec := 16#C00#; v.e.last_nia := next_nia; - report "sc"; + if e_in.valid = '1' then + report "sc"; + end if; else illegal := '1'; end if; @@ -934,13 +1032,15 @@ begin -- check bits 1-10 of the instruction to make sure it's attn -- if not then it is illegal if e_in.insn(10 downto 1) = "0100000000" then - v.terminate := '1'; - report "ATTN"; + v.se.terminate := '1'; + if e_in.valid = '1' then + report "ATTN"; + end if; else illegal := '1'; end if; when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT => - -- Do nothing + -- Do nothing when OP_ADD => if e_in.output_carry = '1' then if e_in.input_carry /= OV then @@ -961,194 +1061,191 @@ begin v.e.srr1(47 - 46) := '1'; if or (trapval and insn_to(e_in.insn)) = '1' then -- generate trap-type program interrupt - exception := '1'; - report "trap"; + v.trap := '1'; + if e_in.valid = '1' then + report "trap"; + end if; end if; when OP_ADDG6S => when OP_CMPRB => when OP_CMPEQB => when OP_AND | OP_OR | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS | - OP_BPERM | OP_BCD => + OP_BPERM | OP_BCD => when OP_B => - is_branch := '1'; - taken_branch := '1'; - 
is_direct_branch := '1'; - abs_branch := e_in.br_abs; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; + v.take_branch := '1'; + v.direct_branch := '1'; + v.e.br_last := '1'; + v.e.br_taken := '1'; + v.e.br_offset := b_in; + v.e.abs_br := insn_aa(e_in.insn); + if e_in.br_pred = '0' then + -- should never happen + v.e.redirect := '1'; end if; - v.taken_branch_event := '1'; - when OP_BC | OP_BCREG => - -- read_data1 is CTR - -- for OP_BCREG, read_data2 is target register (CTR, LR or TAR) - -- If this instruction updates both CTR and LR, then it is - -- doubled; the first instruction decrements CTR and determines - -- whether the branch is taken, and the second does the - -- redirect and the LR update. + if ex1.msr(MSR_BE) = '1' then + v.do_trace := '1'; + end if; + v.se.write_cfar := '1'; + when OP_BC => + -- If CTR is being decremented, it is in ramspr_odd. bo := insn_bo(e_in.insn); bi := insn_bi(e_in.insn); - if e_in.second = '0' then - taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in); - else - taken_branch := r.br_taken; + v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); + if v.take_branch = '1' then + v.e.br_offset := b_in; + v.e.abs_br := insn_aa(e_in.insn); end if; - v.br_taken := taken_branch; - v.taken_branch_event := taken_branch; - abs_branch := e_in.br_abs; - if e_in.repeat = '0' or e_in.second = '1' then - is_branch := '1'; - if e_in.insn_type = OP_BC then - is_direct_branch := '1'; - end if; - if ctrl.msr(MSR_BE) = '1' then - do_trace := '1'; - end if; + -- Mispredicted branches cause a redirect + if v.take_branch /= e_in.br_pred then + v.e.redirect := '1'; + end if; + v.direct_branch := '1'; + v.e.br_last := '1'; + v.e.br_taken := v.take_branch; + if ex1.msr(MSR_BE) = '1' then + v.do_trace := '1'; + end if; + v.se.write_cfar := v.take_branch; + when OP_BCREG => + -- If CTR is being decremented, it is in ramspr_odd. + -- The target address is in ramspr_result (LR, CTR or TAR). 
+ bo := insn_bo(e_in.insn); + bi := insn_bi(e_in.insn); + v.take_branch := ppc_bc_taken(bo, bi, cr_in, ramspr_odd); + if v.take_branch = '1' then + v.e.br_offset := ramspr_result; + v.e.abs_br := '1'; + end if; + -- Indirect branches are never predicted taken + v.e.redirect := v.take_branch; + v.e.br_taken := v.take_branch; + if ex1.msr(MSR_BE) = '1' then + v.do_trace := '1'; end if; + v.se.write_cfar := v.take_branch; when OP_RFID => - v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) & - not a_in(MSR_LE) & not a_in(MSR_SF); + srr1 := ramspr_odd; + v.e.redir_mode := (srr1(MSR_IR) or srr1(MSR_PR)) & not srr1(MSR_PR) & + not srr1(MSR_LE) & not srr1(MSR_SF); -- Can't use msr_copy here because the partial function MSR -- bits should be left unchanged, not zeroed. - ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31); - ctrl_tmp.msr(26 downto 22) <= a_in(26 downto 22); - ctrl_tmp.msr(15 downto 0) <= a_in(15 downto 0); - if a_in(MSR_PR) = '1' then - ctrl_tmp.msr(MSR_EE) <= '1'; - ctrl_tmp.msr(MSR_IR) <= '1'; - ctrl_tmp.msr(MSR_DR) <= '1'; + v.new_msr(63 downto 31) := srr1(63 downto 31); + v.new_msr(26 downto 22) := srr1(26 downto 22); + v.new_msr(15 downto 0) := srr1(15 downto 0); + if srr1(MSR_PR) = '1' then + v.new_msr(MSR_EE) := '1'; + v.new_msr(MSR_IR) := '1'; + v.new_msr(MSR_DR) := '1'; end if; - -- mark this as a branch so CFAR gets updated - is_branch := '1'; - taken_branch := '1'; - abs_branch := '1'; + v.se.write_msr := '1'; + v.e.br_offset := ramspr_result; + v.e.abs_br := '1'; + v.e.redirect := '1'; + v.se.write_cfar := '1'; if HAS_FPU then - v.fp_exception_next := fp_in.exception and - (a_in(MSR_FE0) or a_in(MSR_FE1)); + v.fp_intr := fp_in.exception and + (srr1(MSR_FE0) or srr1(MSR_FE1)); end if; - do_trace := '0'; + v.do_trace := '0'; when OP_CNTZ | OP_POPCNT => - v.e.valid := '0'; - v.cntz_in_progress := '1'; - v.busy := '1'; + v.res2_sel := "01"; + slow_op := '1'; when OP_ISEL => when OP_CROP => when OP_MCRXRX => when OP_DARN => when 
OP_MFMSR => when OP_MFSPR => - if not is_X(e_in.insn) then - report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(a_in); - end if; - if is_fast_spr(e_in.read_reg1) = '1' then - spr_val := a_in; - if is_X(e_in.insn) then - spr_val(63 downto 32) := (others => 'X'); - elsif decode_spr_num(e_in.insn) = SPR_XER then - -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer - spr_val(63 downto 32) := (others => '0'); - spr_val(63-32) := xerc_in.so; - spr_val(63-33) := xerc_in.ov; - spr_val(63-34) := xerc_in.ca; - spr_val(63-35 downto 63-43) := "000000000"; - spr_val(63-44) := xerc_in.ov32; - spr_val(63-45) := xerc_in.ca32; + if e_in.spr_is_ram = '1' then + if e_in.valid = '1' and not is_X(e_in.insn) then + report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(alu_result); + end if; + elsif e_in.spr_select.valid = '1' then + if e_in.valid = '1' and not is_X(e_in.insn) then + report "MFSPR to slow SPR " & integer'image(decode_spr_num(e_in.insn)); + end if; + slow_op := '1'; + if e_in.spr_select.ispmu = '0' then + case e_in.spr_select.sel is + when SPRSEL_LOGD => + v.se.inc_loga := '1'; + when others => + end case; + v.res2_sel := "10"; + else + v.res2_sel := "11"; + end if; + else + -- mfspr from unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + if e_in.valid = '1' and not is_X(e_in.insn) then + report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + " invalid"; + end if; + if ex1.msr(MSR_PR) = '1' then + illegal := '1'; end if; - else - spr_val := c_in; - case decode_spr_num(e_in.insn) is - when SPR_TB => - spr_val := ctrl.tb; - when SPR_TBU => - spr_val(63 downto 32) := (others => '0'); - spr_val(31 downto 0) := ctrl.tb(63 downto 32); - when SPR_DEC => - spr_val := ctrl.dec; - when SPR_CFAR => - spr_val := ctrl.cfar; - when SPR_PVR => - spr_val(63 downto 32) := (others => '0'); - spr_val(31 downto 0) := 
PVR_MICROWATT; - when 724 => -- LOG_ADDR SPR - spr_val := log_wr_addr & r.log_addr_spr; - when 725 => -- LOG_DATA SPR - spr_val := log_rd_data; - v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); - when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | - SPR_UMMCR0 | SPR_UMMCR1 | SPR_UMMCR2 | SPR_UMMCRA | SPR_USIER | SPR_USIAR | SPR_USDAR | - SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | - SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => - x_to_pmu.mfspr <= '1'; - spr_val := pmu_to_x.spr_val; - when others => - -- mfspr from unimplemented SPRs should be a nop in - -- supervisor mode and a program interrupt for user mode - if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then - illegal := '1'; - end if; - end case; end if; - spr_result <= spr_val; when OP_MFCR => when OP_MTCRF => when OP_MTMSRD => + v.se.write_msr := '1'; if e_in.insn(16) = '1' then -- just update EE and RI - ctrl_tmp.msr(MSR_EE) <= c_in(MSR_EE); - ctrl_tmp.msr(MSR_RI) <= c_in(MSR_RI); + v.new_msr(MSR_EE) := c_in(MSR_EE); + v.new_msr(MSR_RI) := c_in(MSR_RI); else -- Architecture says to leave out bits 3 (HV), 51 (ME) -- and 63 (LE) (IBM bit numbering) if e_in.is_32bit = '0' then - ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61); - ctrl_tmp.msr(59 downto 32) <= c_in(59 downto 32); + v.new_msr(63 downto 61) := c_in(63 downto 61); + v.new_msr(59 downto 32) := c_in(59 downto 32); end if; - ctrl_tmp.msr(31 downto 13) <= c_in(31 downto 13); - ctrl_tmp.msr(11 downto 1) <= c_in(11 downto 1); + v.new_msr(31 downto 13) := c_in(31 downto 13); + v.new_msr(11 downto 1) := c_in(11 downto 1); if c_in(MSR_PR) = '1' then - ctrl_tmp.msr(MSR_EE) <= '1'; - ctrl_tmp.msr(MSR_IR) <= '1'; - ctrl_tmp.msr(MSR_DR) <= '1'; + v.new_msr(MSR_EE) := '1'; + v.new_msr(MSR_IR) := '1'; + v.new_msr(MSR_DR) := '1'; end if; if HAS_FPU then - v.fp_exception_next := fp_in.exception and - (c_in(MSR_FE0) or c_in(MSR_FE1)); + v.fp_intr 
:= fp_in.exception and + (c_in(MSR_FE0) or c_in(MSR_FE1)); end if; end if; when OP_MTSPR => - report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & - "=" & to_hstring(c_in); - if is_fast_spr(e_in.write_reg) then - if decode_spr_num(e_in.insn) = SPR_XER then - v.e.xerc.so := c_in(63-32); - v.e.xerc.ov := c_in(63-33); - v.e.xerc.ca := c_in(63-34); - v.e.xerc.ov32 := c_in(63-44); - v.e.xerc.ca32 := c_in(63-45); - end if; - else - -- slow spr - case decode_spr_num(e_in.insn) is - when SPR_DEC => - ctrl_tmp.dec <= c_in; - when 724 => -- LOG_ADDR SPR - v.log_addr_spr := c_in(31 downto 0); - when SPR_UPMC1 | SPR_UPMC2 | SPR_UPMC3 | SPR_UPMC4 | SPR_UPMC5 | SPR_UPMC6 | - SPR_UMMCR0 | SPR_UMMCR2 | SPR_UMMCRA | - SPR_PMC1 | SPR_PMC2 | SPR_PMC3 | SPR_PMC4 | SPR_PMC5 | SPR_PMC6 | - SPR_MMCR0 | SPR_MMCR1 | SPR_MMCR2 | SPR_MMCRA | SPR_SIER | SPR_SIAR | SPR_SDAR => - x_to_pmu.mtspr <= '1'; - when others => - -- mtspr to unimplemented SPRs should be a nop in - -- supervisor mode and a program interrupt for user mode - if ctrl.msr(MSR_PR) = '1' then - illegal := '1'; - end if; - end case; + if e_in.valid = '1' and not is_X(e_in.insn) then + report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) & + "=" & to_hstring(c_in); + end if; + v.se.write_pmuspr := e_in.spr_select.ispmu; + if e_in.spr_select.valid = '1' and e_in.spr_select.ispmu = '0' then + case e_in.spr_select.sel is + when SPRSEL_XER => + v.e.xerc.so := c_in(63-32); + v.e.xerc.ov := c_in(63-33); + v.e.xerc.ca := c_in(63-34); + v.e.xerc.ov32 := c_in(63-44); + v.e.xerc.ca32 := c_in(63-45); + v.se.write_xerlow := '1'; + when SPRSEL_DEC => + v.se.write_dec := '1'; + when SPRSEL_LOGA => + v.se.write_loga := '1'; + when others => + end case; + end if; + if e_in.spr_select.valid = '0' and e_in.spr_is_ram = '0' then + -- mtspr to unimplemented SPRs should be a nop in + -- supervisor mode and a program interrupt for user mode + if ex1.msr(MSR_PR) = '1' then + illegal := '1'; + end if; end if; when OP_RLC | 
OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => if e_in.output_carry = '1' then @@ -1158,13 +1255,12 @@ begin when OP_ISYNC => v.e.redirect := '1'; - v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); when OP_ICBI => - icache_inval <= '1'; + v.se.icache_inval := '1'; - when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 => - if HAS_SHORT_MULT and e_in.insn_type = OP_MUL_L64 and e_in.insn(26) = '1' and + when OP_MUL_L64 => + if HAS_SHORT_MULT and e_in.insn(26) = '1' and fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then -- Operands fit into 16 bits, so use short multiplier if e_in.oe = '1' then @@ -1173,165 +1269,274 @@ begin end if; else -- Use standard multiplier - v.e.valid := '0'; - v.mul_in_progress := '1'; - v.busy := '1'; - x_to_multiply.valid <= '1'; + v.start_mul := '1'; + slow_op := '1'; + owait := '1'; end if; + when OP_MUL_H64 | OP_MUL_H32 => + v.start_mul := '1'; + slow_op := '1'; + owait := '1'; + when OP_DIV | OP_DIVE | OP_MOD => - v.e.valid := '0'; - v.div_in_progress := '1'; - v.busy := '1'; - x_to_divider.valid <= '1'; + if not HAS_FPU then + v.start_div := '1'; + slow_op := '1'; + owait := '1'; + end if; + + when OP_FETCH_FAILED => + -- Handling an ITLB miss doesn't count as having executed an instruction + v.do_trace := '0'; when others => - v.terminate := '1'; - report "illegal"; - end case; - - -- Mispredicted branches cause a redirect - if is_branch = '1' then - if taken_branch = '1' then - ctrl_tmp.cfar <= e_in.nia; + if e_in.valid = '1' and e_in.unit = ALU then + report "unhandled insn_type " & insn_type_t'image(e_in.insn_type); end if; - if taken_branch = '1' then - v.e.br_offset := b_in; - v.e.abs_br := abs_branch; - else - v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64)); + end case; + + if privileged = '1' then + -- generate a program interrupt + v.exception := '1'; + -- set bit 45 to indicate privileged instruction type interrupt + v.e.srr1(47 - 45) := '1'; + if e_in.valid = '1' then + report "privileged instruction"; + 
end if; + + elsif illegal = '1' then + v.exception := '1'; + -- Since we aren't doing Hypervisor emulation assist (0xe40) we + -- set bit 44 to indicate we have an illegal + v.e.srr1(47 - 44) := '1'; + if e_in.valid = '1' then + report "illegal instruction"; + end if; + + elsif HAS_FPU and ex1.msr(MSR_FP) = '0' and e_in.fac = FPU then + -- generate a floating-point unavailable interrupt + v.exception := '1'; + v.e.intr_vec := 16#800#; + if e_in.valid = '1' then + report "FP unavailable interrupt"; + end if; + end if; + + if e_in.unit = ALU then + v.complete := e_in.valid and not v.exception and not owait; + v.bypass_valid := e_in.valid and not v.exception and not slow_op; + end if; + + actions <= v; + end process; + + -- First execute stage + execute1_1: process(all) + variable v : reg_stage1_type; + variable overflow : std_ulogic; + variable lv : Execute1ToLoadstore1Type; + variable irq_valid : std_ulogic; + variable exception : std_ulogic; + variable fv : Execute1ToFPUType; + variable go : std_ulogic; + variable bypass_valid : std_ulogic; + begin + v := ex1; + if (ex1.busy or l_in.busy or fp_in.busy) = '0' then + v.e := actions.e; + v.e.valid := '0'; + v.oe := e_in.oe; + v.spr_select := e_in.spr_select; + v.pmu_spr_num := e_in.insn(20 downto 16); + v.mul_select := e_in.sub_select(1 downto 0); + v.se := side_effect_init; + v.ramspr_wraddr := e_in.ramspr_wraddr; + v.ramspr_odd_data := actions.ramspr_odd_data; + end if; + + lv := Execute1ToLoadstore1Init; + fv := Execute1ToFPUInit; + + x_to_multiply.valid <= '0'; + x_to_divider.valid <= '0'; + v.ext_interrupt := '0'; + v.taken_branch_event := '0'; + v.br_mispredict := '0'; + v.busy := '0'; + bypass_valid := '0'; + + irq_valid := ex1.msr(MSR_EE) and (pmu_to_x.intr or ctrl.dec(63) or ext_irq_in); + + -- Next insn adder used in a couple of places + next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4); + + -- rotator control signals + right_shift <= '1' when e_in.insn_type = OP_SHR else '0'; + rot_clear_left <= '1' 
when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0'; + rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; + rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; + + do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + + if valid_in = '1' then + v.prev_op := e_in.insn_type; + end if; + + -- Determine if there is any interrupt to be taken + -- before/instead of executing this instruction + exception := valid_in and actions.exception; + if valid_in = '1' and e_in.second = '0' then + if HAS_FPU and ex1.fp_exception_next = '1' then + -- This is used for FP-type program interrupts that + -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero. + exception := '1'; + v.e.intr_vec := 16#700#; + v.e.srr1 := (others => '0'); + v.e.srr1(47 - 43) := '1'; + v.e.srr1(47 - 47) := '1'; + elsif ex1.trace_next = '1' then + -- Generate a trace interrupt rather than executing the next instruction + -- or taking any asynchronous interrupt + exception := '1'; + v.e.intr_vec := 16#d00#; + v.e.srr1 := (others => '0'); + v.e.srr1(47 - 33) := '1'; + if ex1.prev_op = OP_LOAD or ex1.prev_op = OP_ICBI or ex1.prev_op = OP_ICBT or + ex1.prev_op = OP_DCBT or ex1.prev_op = OP_DCBST or ex1.prev_op = OP_DCBF then + v.e.srr1(47 - 35) := '1'; + elsif ex1.prev_op = OP_STORE or ex1.prev_op = OP_DCBZ or + ex1.prev_op = OP_DCBTST then + v.e.srr1(47 - 36) := '1'; end if; - if taken_branch /= e_in.br_pred then - v.e.redirect := '1'; - v.br_mispredict := is_direct_branch; + + elsif irq_valid = '1' then + -- Don't deliver the interrupt until we have a valid instruction + -- coming in, so we have a valid NIA to put in SRR0. 
+ if pmu_to_x.intr = '1' then + v.e.intr_vec := 16#f00#; + report "IRQ valid: PMU"; + elsif ctrl.dec(63) = '1' then + v.e.intr_vec := 16#900#; + report "IRQ valid: DEC"; + elsif ext_irq_in = '1' then + v.e.intr_vec := 16#500#; + report "IRQ valid: External"; + v.ext_interrupt := '1'; end if; - v.e.br_last := is_direct_branch; - v.e.br_taken := taken_branch; + v.e.srr1 := (others => '0'); + exception := '1'; + end if; + end if; + + v.no_instr_avail := not (e_in.valid or l_in.busy or ex1.busy or fp_in.busy); + + go := valid_in and not exception; + v.instr_dispatch := go; + + if go = '1' then + v.se := actions.se; + v.e.valid := actions.complete; + bypass_valid := actions.bypass_valid; + v.taken_branch_event := actions.take_branch; + v.trace_next := actions.do_trace; + v.fp_exception_next := actions.fp_intr; + v.res2_sel := actions.res2_sel; + v.msr := actions.new_msr; + x_to_multiply.valid <= actions.start_mul; + v.mul_in_progress := actions.start_mul; + x_to_divider.valid <= actions.start_div; + v.div_in_progress := actions.start_div; + v.br_mispredict := v.e.redirect and actions.direct_branch; + exception := actions.trap; + + -- Go busy while division is happening because the + -- divider is not pipelined. Also go busy while a + -- multiply is happening in order to stop following + -- instructions from using the wrong XER value + -- (and for simplicity in the OE=0 case). + v.busy := actions.start_div or actions.start_mul; - elsif valid_in = '1' and exception = '0' and illegal = '0' then -- instruction for other units, i.e. 
LDST if e_in.unit = LDST then lv.valid := '1'; - elsif e_in.unit = NONE then - illegal := '1'; - elsif HAS_FPU and e_in.unit = FPU then - fv.valid := '1'; end if; - -- Handling an ITLB miss doesn't count as having executed an instruction - if e_in.insn_type = OP_FETCH_FAILED then - do_trace := '0'; + if HAS_FPU and e_in.unit = FPU then + fv.valid := '1'; end if; end if; - -- The following cases all occur when r.busy = 1 and therefore - -- valid_in = 0. Hence they don't happen in the same cycle as any of - -- the cases above which depend on valid_in = 1. - if r.cntz_in_progress = '1' then - -- cnt[lt]z and popcnt* always take two cycles - v.e.valid := '1'; - elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then - if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or - (r.div_in_progress = '1' and divider_to_x.valid = '1') then - if r.mul_in_progress = '1' then - overflow := '0'; - else - overflow := divider_to_x.overflow; - end if; - if r.mul_in_progress = '1' and current.oe = '1' then - -- have to wait until next cycle for overflow indication - v.mul_finish := '1'; - v.busy := '1'; - else - -- We must test oe because the RC update code in writeback - -- will use the xerc value to set CR0:SO so we must not clobber - -- xerc if OE wasn't set. 
- if current.oe = '1' then - v.e.xerc.ov := overflow; - v.e.xerc.ov32 := overflow; - if overflow = '1' then - v.e.xerc.so := '1'; - end if; - end if; - v.e.valid := '1'; + if ex1.div_in_progress = '1' then + v.div_in_progress := not divider_to_x.valid; + v.busy := not divider_to_x.valid; + if divider_to_x.valid = '1' and ex1.oe = '1' then + v.e.xerc.ov := divider_to_x.overflow; + v.e.xerc.ov32 := divider_to_x.overflow; + if divider_to_x.overflow = '1' then + v.e.xerc.so := '1'; end if; - else - v.busy := '1'; - v.mul_in_progress := r.mul_in_progress; - v.div_in_progress := r.div_in_progress; - end if; - elsif r.mul_finish = '1' then - hold_wr_data := '1'; + end if; + v.e.valid := divider_to_x.valid; + v.e.write_data := alu_result; + bypass_valid := v.e.valid; + end if; + if ex1.mul_in_progress = '1' then + v.mul_in_progress := not multiply_to_x.valid; + v.mul_finish := multiply_to_x.valid and ex1.oe; + v.e.valid := multiply_to_x.valid and not ex1.oe; + v.busy := not v.e.valid; + v.e.write_data := alu_result; + bypass_valid := v.e.valid; + end if; + if ex1.mul_finish = '1' then + v.mul_finish := '0'; v.e.xerc.ov := multiply_to_x.overflow; v.e.xerc.ov32 := multiply_to_x.overflow; if multiply_to_x.overflow = '1' then v.e.xerc.so := '1'; end if; v.e.valid := '1'; - end if; - - if illegal = '1' then - exception := '1'; - v.e.intr_vec := 16#700#; - -- Since we aren't doing Hypervisor emulation assist (0xe40) we - -- set bit 44 to indicate we have an illegal - v.e.srr1(47 - 44) := '1'; - report "illegal"; end if; - v.e.interrupt := exception and not (l_in.in_progress or l_in.interrupt); - if v.e.interrupt = '1' then - v.intr_pending := '0'; + if v.e.write_xerc_enable = '1' and v.e.valid = '1' then + v.xerc := v.e.xerc; + v.xerc_valid := '1'; end if; - if do_trace = '1' then - v.trace_next := '1'; + if (ex1.busy or l_in.busy or fp_in.busy) = '0' then + v.e.interrupt := exception; end if; - - if interrupt_in = '1' then - ctrl_tmp.msr(MSR_SF) <= '1'; - ctrl_tmp.msr(MSR_EE) 
<= '0'; - ctrl_tmp.msr(MSR_PR) <= '0'; - ctrl_tmp.msr(MSR_SE) <= '0'; - ctrl_tmp.msr(MSR_BE) <= '0'; - ctrl_tmp.msr(MSR_FP) <= '0'; - ctrl_tmp.msr(MSR_FE0) <= '0'; - ctrl_tmp.msr(MSR_FE1) <= '0'; - ctrl_tmp.msr(MSR_IR) <= '0'; - ctrl_tmp.msr(MSR_DR) <= '0'; - ctrl_tmp.msr(MSR_RI) <= '0'; - ctrl_tmp.msr(MSR_LE) <= '1'; + if v.e.valid = '0' then + v.e.redirect := '0'; + v.e.br_last := '0'; + end if; + if flush_in = '1' then + v.e.valid := '0'; + v.e.interrupt := '0'; + v.e.redirect := '0'; + v.e.br_last := '0'; + v.busy := '0'; + v.div_in_progress := '0'; + v.mul_in_progress := '0'; + v.mul_finish := '0'; + v.xerc_valid := '0'; + end if; + if flush_in = '1' or interrupt_in.intr = '1' then + v.msr := ctrl_tmp.msr; + end if; + if interrupt_in.intr = '1' then v.trace_next := '0'; v.fp_exception_next := '0'; - v.intr_pending := '0'; end if; - if hold_wr_data = '0' then - v.e.write_data := alu_result; - else - v.e.write_data := r.e.write_data; - end if; - v.e.write_reg := current.write_reg; - v.e.write_enable := current.write_reg_enable and v.e.valid and not exception; - v.e.rc := current.rc and v.e.valid and not exception; - v.e.write_cr_data := write_cr_data; - v.e.write_cr_mask := write_cr_mask; - v.e.write_cr_enable := current.output_cr and v.e.valid and not exception; - v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception; + bypass_data.tag.valid <= v.e.write_enable and bypass_valid; + bypass_data.tag.tag <= v.e.instr_tag.tag; + bypass_data.data <= alu_result; - bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid; - bypass_data.tag.tag <= current.instr_tag.tag; - bypass_data.data <= v.e.write_data; - - bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid; - bypass_cr_data.tag.tag <= current.instr_tag.tag; - for i in 0 to 7 loop - if v.e.write_cr_mask(i) = '1' then - bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4); - else - 
bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4); - end if; - end loop; + bypass_cr_data.tag.valid <= v.e.write_cr_enable and bypass_valid; + bypass_cr_data.tag.tag <= v.e.instr_tag.tag; + bypass_cr_data.data <= v.e.write_cr_data; -- Outputs to loadstore1 (async) lv.op := e_in.insn_type; @@ -1342,7 +1547,7 @@ begin lv.data := c_in; lv.write_reg := e_in.write_reg; lv.length := e_in.data_len; - lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); + lv.byte_reverse := e_in.byte_reverse xnor ex1.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; lv.update := e_in.update; lv.xerc := xerc_in; @@ -1354,40 +1559,234 @@ begin e_in.insn(5 downto 1) = "10101" then lv.ci := '1'; end if; - lv.virt_mode := ctrl.msr(MSR_DR); - lv.priv_mode := not ctrl.msr(MSR_PR); - lv.mode_32bit := not ctrl.msr(MSR_SF); + lv.virt_mode := ex1.msr(MSR_DR); + lv.priv_mode := not ex1.msr(MSR_PR); + lv.mode_32bit := not ex1.msr(MSR_SF); lv.is_32bit := e_in.is_32bit; lv.repeat := e_in.repeat; lv.second := e_in.second; + lv.e2stall := fp_in.f2stall; -- Outputs to FPU fv.op := e_in.insn_type; - fv.nia := e_in.nia; fv.insn := e_in.insn; fv.itag := e_in.instr_tag; fv.single := e_in.is_32bit; - fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1); + fv.is_signed := e_in.is_signed; + fv.fe_mode := ex1.msr(MSR_FE0) & ex1.msr(MSR_FE1); fv.fra := a_in; fv.frb := b_in; fv.frc := c_in; fv.frt := e_in.write_reg; fv.rc := e_in.rc; fv.out_cr := e_in.output_cr; + fv.m32b := not ex1.msr(MSR_SF); + fv.oe := e_in.oe; + fv.xerc := xerc_in; + fv.stall := l_in.l2stall; -- Update registers - rin <= v; + ex1in <= v; -- update outputs l_out <= lv; - e_out <= r.e; - e_out.msr <= msr_copy(ctrl.msr); fp_out <= fv; - - exception_log <= exception; irq_valid_log <= irq_valid; end process; + -- Slow SPR read mux + with ex1.spr_select.sel select spr_result <= + ctrl.tb when SPRSEL_TB, + 32x"0" & ctrl.tb(63 downto 32) when SPRSEL_TBU, + ctrl.dec when SPRSEL_DEC, + 32x"0" & PVR_MICROWATT when SPRSEL_PVR, + 
log_wr_addr & ex2.log_addr_spr when SPRSEL_LOGA, + log_rd_data when SPRSEL_LOGD, + ctrl.cfar when SPRSEL_CFAR, + assemble_xer(ex1.e.xerc, ctrl.xer_low) when others; + + stage2_stall <= l_in.l2stall or fp_in.f2stall; + + -- Second execute stage control + execute2_1: process(all) + variable v : reg_stage2_type; + variable overflow : std_ulogic; + variable lv : Execute1ToLoadstore1Type; + variable fv : Execute1ToFPUType; + variable k : integer; + variable go : std_ulogic; + variable bypass_valid : std_ulogic; + variable rcresult : std_ulogic_vector(63 downto 0); + variable sprres : std_ulogic_vector(63 downto 0); + variable ex_result : std_ulogic_vector(63 downto 0); + variable cr_res : std_ulogic_vector(31 downto 0); + variable cr_mask : std_ulogic_vector(7 downto 0); + variable sign, zero : std_ulogic; + variable rcnz_hi, rcnz_lo : std_ulogic; + begin + v := ex2; + if stage2_stall = '0' then + v.e := ex1.e; + v.se := ex1.se; + v.ext_interrupt := ex1.ext_interrupt; + v.taken_branch_event := ex1.taken_branch_event; + v.br_mispredict := ex1.br_mispredict; + end if; + + ctrl_tmp <= ctrl; + -- FIXME: run at 512MHz not core freq + ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1); + ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1); + + x_to_pmu.mfspr <= '0'; + x_to_pmu.mtspr <= '0'; + x_to_pmu.tbbits(3) <= ctrl.tb(63 - 47); + x_to_pmu.tbbits(2) <= ctrl.tb(63 - 51); + x_to_pmu.tbbits(1) <= ctrl.tb(63 - 55); + x_to_pmu.tbbits(0) <= ctrl.tb(63 - 63); + x_to_pmu.pmm_msr <= ctrl.msr(MSR_PMM); + x_to_pmu.pr_msr <= ctrl.msr(MSR_PR); + + if v.e.valid = '0' or flush_in = '1' then + v.e.write_enable := '0'; + v.e.write_cr_enable := '0'; + v.e.write_xerc_enable := '0'; + v.e.redirect := '0'; + v.e.br_last := '0'; + v.se := side_effect_init; + v.taken_branch_event := '0'; + v.br_mispredict := '0'; + end if; + if flush_in = '1' then + v.e.valid := '0'; + v.e.interrupt := '0'; + v.ext_interrupt := '0'; + end if; + + -- This is split like this because mfspr doesn't have 
an Rc bit, + -- and we don't want the zero-detect logic to be after the + -- SPR mux for timing reasons. + if ex1.res2_sel(0) = '0' then + rcresult := ex1.e.write_data; + sprres := spr_result; + else + rcresult := countbits_result; + sprres := pmu_to_x.spr_val; + end if; + if ex1.res2_sel(1) = '0' then + ex_result := rcresult; + else + ex_result := sprres; + end if; + + cr_res := ex1.e.write_cr_data; + cr_mask := ex1.e.write_cr_mask; + if ex1.e.rc = '1' and ex1.e.write_enable = '1' then + rcnz_lo := or (rcresult(31 downto 0)); + if ex1.e.mode_32bit = '0' then + rcnz_hi := or (rcresult(63 downto 32)); + zero := not (rcnz_hi or rcnz_lo); + sign := ex_result(63); + else + zero := not rcnz_lo; + sign := ex_result(31); + end if; + cr_res(31) := sign; + cr_res(30) := not (sign or zero); + cr_res(29) := zero; + cr_res(28) := ex1.e.xerc.so; + cr_mask(7) := '1'; + end if; + + if stage2_stall = '0' then + v.e.write_data := ex_result; + v.e.write_cr_data := cr_res; + v.e.write_cr_mask := cr_mask; + if ex1.e.rc = '1' and ex1.e.write_enable = '1' and v.e.valid = '1' then + v.e.write_cr_enable := '1'; + end if; + + if ex1.se.write_msr = '1' then + ctrl_tmp.msr <= ex1.msr; + end if; + if ex1.se.write_xerlow = '1' then + ctrl_tmp.xer_low <= ex1.e.write_data(17 downto 0); + end if; + if ex1.se.write_dec = '1' then + ctrl_tmp.dec <= ex1.e.write_data; + end if; + if ex1.se.write_cfar = '1' then + ctrl_tmp.cfar <= ex1.e.last_nia; + end if; + if ex1.se.write_loga = '1' then + v.log_addr_spr := ex1.e.write_data(31 downto 0); + elsif ex1.se.inc_loga = '1' then + v.log_addr_spr := std_ulogic_vector(unsigned(ex2.log_addr_spr) + 1); + end if; + x_to_pmu.mtspr <= ex1.se.write_pmuspr; + end if; + + if interrupt_in.intr = '1' then + ctrl_tmp.msr(MSR_SF) <= '1'; + ctrl_tmp.msr(MSR_EE) <= '0'; + ctrl_tmp.msr(MSR_PR) <= '0'; + ctrl_tmp.msr(MSR_SE) <= '0'; + ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; + 
ctrl_tmp.msr(MSR_IR) <= '0'; + ctrl_tmp.msr(MSR_DR) <= '0'; + ctrl_tmp.msr(MSR_RI) <= '0'; + ctrl_tmp.msr(MSR_LE) <= '1'; + end if; + + bypass_valid := ex1.e.valid; + if stage2_stall = '1' and ex1.res2_sel(1) = '1' then + bypass_valid := '0'; + end if; + + bypass2_data.tag.valid <= ex1.e.write_enable and bypass_valid; + bypass2_data.tag.tag <= ex1.e.instr_tag.tag; + bypass2_data.data <= ex_result; + + bypass2_cr_data.tag.valid <= (ex1.e.write_cr_enable or (ex1.e.rc and ex1.e.write_enable)) + and bypass_valid; + bypass2_cr_data.tag.tag <= ex1.e.instr_tag.tag; + bypass2_cr_data.data <= cr_res; + + -- Update registers + ex2in <= v; + + -- update outputs + e_out <= ex2.e; + e_out.msr <= msr_copy(ctrl.msr); + + terminate_out <= ex2.se.terminate; + icache_inval <= ex2.se.icache_inval; + + exception_log <= v.e.interrupt; + end process; + + sim_dump_test: if SIM generate + dump_exregs: process(all) + variable xer : std_ulogic_vector(63 downto 0); + begin + if sim_dump = '1' then + report "LR " & to_hstring(even_sprs(RAMSPR_LR)); + report "CTR " & to_hstring(odd_sprs(RAMSPR_CTR)); + sim_dump_done <= '1'; + else + sim_dump_done <= '0'; + end if; + end process; + end generate; + + -- Keep GHDL synthesis happy + sim_dump_test_synth: if not SIM generate + sim_dump_done <= '0'; + end generate; + e1_log: if LOG_LENGTH > 0 generate signal log_data : std_ulogic_vector(14 downto 0); begin @@ -1398,12 +1797,12 @@ begin ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) & exception_log & irq_valid_log & - interrupt_in & + interrupt_in.intr & "000" & - r.e.write_enable & - r.e.valid & - (r.e.redirect or r.e.interrupt) & - r.busy & + ex2.e.write_enable & + ex2.e.valid & + (ex2.e.redirect or ex2.e.interrupt) & + ex1.busy & flush_in; end if; end process; diff --git a/fetch1.vhdl b/fetch1.vhdl index ca039f69f..c6d26d795 100644 --- a/fetch1.vhdl +++ b/fetch1.vhdl @@ -93,7 +93,7 @@ begin end if; -- always send the up-to-date stop mark and req r.stop_mark <= stop_in; - r.req <= not rst; + r.req <= not rst 
and not stop_in; end if; end process; log_out <= log_nia; diff --git a/fpu.vhdl b/fpu.vhdl index 479e3aa7d..b09568773 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -15,6 +15,7 @@ entity fpu is port ( clk : in std_ulogic; rst : in std_ulogic; + flush_in : in std_ulogic; e_in : in Execute1ToFPUType; e_out : out FPUToExecute1Type; @@ -27,15 +28,23 @@ architecture behaviour of fpu is type fp_number_class is (ZERO, FINITE, INFINITY, NAN); constant EXP_BITS : natural := 13; + constant UNIT_BIT : natural := 56; + constant QNAN_BIT : natural := UNIT_BIT - 1; + constant SP_LSB : natural := UNIT_BIT - 23; + constant SP_GBIT : natural := SP_LSB - 1; + constant SP_RBIT : natural := SP_LSB - 2; + constant DP_LSB : natural := UNIT_BIT - 52; + constant DP_GBIT : natural := DP_LSB - 1; + constant DP_RBIT : natural := DP_LSB - 2; type fpu_reg_type is record class : fp_number_class; negative : std_ulogic; exponent : signed(EXP_BITS-1 downto 0); -- unbiased - mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format + mantissa : std_ulogic_vector(63 downto 0); -- 8.56 format end record; - type state_t is (IDLE, + type state_t is (IDLE, DO_ILLEGAL, DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF, DO_FMR, DO_FMRG, DO_FCMP, DO_FTDIV, DO_FTSQRT, DO_FCFID, DO_FCTI, @@ -66,28 +75,43 @@ architecture behaviour of fpu is RENORM_A, RENORM_A2, RENORM_B, RENORM_B2, RENORM_C, RENORM_C2, - NAN_RESULT, EXC_RESULT); + NAN_RESULT, EXC_RESULT, + DO_IDIVMOD, + IDIV_NORMB, IDIV_NORMB2, IDIV_NORMB3, + IDIV_CLZA, IDIV_CLZA2, IDIV_CLZA3, + IDIV_NR0, IDIV_NR1, IDIV_NR2, IDIV_USE0_5, + IDIV_DODIV, IDIV_SH32, + IDIV_DIV, IDIV_DIV2, IDIV_DIV3, IDIV_DIV4, IDIV_DIV5, + IDIV_DIV6, IDIV_DIV7, IDIV_DIV8, IDIV_DIV9, + IDIV_EXT_TBH, IDIV_EXT_TBH2, IDIV_EXT_TBH3, + IDIV_EXT_TBH4, IDIV_EXT_TBH5, + IDIV_EXTDIV, IDIV_EXTDIV1, IDIV_EXTDIV2, IDIV_EXTDIV3, + IDIV_EXTDIV4, IDIV_EXTDIV5, IDIV_EXTDIV6, + IDIV_MODADJ, IDIV_MODSUB, IDIV_DIVADJ, IDIV_OVFCHK, IDIV_DONE, IDIV_ZERO); type reg_type is record state : state_t; busy : 
std_ulogic; + f2stall : std_ulogic; instr_done : std_ulogic; + complete : std_ulogic; do_intr : std_ulogic; illegal : std_ulogic; op : insn_type_t; insn : std_ulogic_vector(31 downto 0); - nia : std_ulogic_vector(63 downto 0); instr_tag : instr_tag_t; dest_fpr : gspr_index_t; fe_mode : std_ulogic; rc : std_ulogic; is_cmp : std_ulogic; single_prec : std_ulogic; + sp_result : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); + comm_fpscr : std_ulogic_vector(31 downto 0); -- committed FPSCR value a : fpu_reg_type; b : fpu_reg_type; c : fpu_reg_type; - r : std_ulogic_vector(63 downto 0); -- 10.54 format + r : std_ulogic_vector(63 downto 0); -- 8.56 format s : std_ulogic_vector(55 downto 0); -- extended fraction x : std_ulogic; p : std_ulogic_vector(63 downto 0); -- 8.56 format @@ -96,13 +120,18 @@ architecture behaviour of fpu is result_class : fp_number_class; result_exp : signed(EXP_BITS-1 downto 0); shift : signed(EXP_BITS-1 downto 0); - writing_back : std_ulogic; + writing_fpr : std_ulogic; + write_reg : gspr_index_t; + complete_tag : instr_tag_t; + writing_cr : std_ulogic; + writing_xer : std_ulogic; int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); old_exc : std_ulogic_vector(4 downto 0); update_fprf : std_ulogic; quieten_nan : std_ulogic; + nsnan_result : std_ulogic; tiny : std_ulogic; denorm : std_ulogic; round_mode : std_ulogic_vector(2 downto 0); @@ -122,6 +151,19 @@ architecture behaviour of fpu is invalid : std_ulogic; negate : std_ulogic; longmask : std_ulogic; + integer_op : std_ulogic; + divext : std_ulogic; + divmod : std_ulogic; + is_signed : std_ulogic; + int_ovf : std_ulogic; + div_close : std_ulogic; + inc_quot : std_ulogic; + a_hi : std_ulogic_vector(7 downto 0); + a_lo : std_ulogic_vector(55 downto 0); + m32b : std_ulogic; + oe : std_ulogic; + xerc : xer_common_t; + xerc_result : xer_common_t; end record; type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0); @@ -142,6 
+184,7 @@ architecture behaviour of fpu is signal lost_bits : std_ulogic; signal r_hi_nz : std_ulogic; signal r_lo_nz : std_ulogic; + signal r_gt_1 : std_ulogic; signal s_nz : std_ulogic; signal misc_sel : std_ulogic_vector(3 downto 0); signal f_to_multiply : MultiplyInputType; @@ -161,7 +204,7 @@ architecture behaviour of fpu is constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00"; constant BIN_R : std_ulogic_vector(1 downto 0) := "01"; constant BIN_RND : std_ulogic_vector(1 downto 0) := "10"; - constant BIN_PS6 : std_ulogic_vector(1 downto 0) := "11"; + constant BIN_PS8 : std_ulogic_vector(1 downto 0) := "11"; constant RES_SUM : std_ulogic_vector(1 downto 0) := "00"; constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01"; @@ -411,23 +454,27 @@ architecture behaviour of fpu is -- Split a DP floating-point number into components and work out its class. -- If is_int = 1, the input is considered an integer - function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is + function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic; + is_32bint: std_ulogic; is_signed: std_ulogic) return fpu_reg_type is variable r : fpu_reg_type; variable exp_nz : std_ulogic; variable exp_ao : std_ulogic; variable frac_nz : std_ulogic; + variable low_nz : std_ulogic; variable cls : std_ulogic_vector(2 downto 0); begin r.negative := fpr(63); exp_nz := or (fpr(62 downto 52)); exp_ao := and (fpr(62 downto 52)); frac_nz := or (fpr(51 downto 0)); + low_nz := or (fpr(31 downto 0)); if is_int = '0' then r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS); if exp_nz = '0' then r.exponent := to_signed(-1022, EXP_BITS); end if; - r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00"; + r.mantissa := std_ulogic_vector(shift_left(resize(unsigned(exp_nz & fpr(51 downto 0)), 64), + UNIT_BIT - 52)); cls := exp_ao & exp_nz & frac_nz; case cls is when "000" => r.class := ZERO; @@ -437,6 
+484,16 @@ architecture behaviour of fpu is when "110" => r.class := INFINITY; when others => r.class := NAN; end case; + elsif is_32bint = '1' then + r.negative := fpr(31); + r.mantissa(31 downto 0) := fpr(31 downto 0); + r.mantissa(63 downto 32) := (others => (is_signed and fpr(31))); + r.exponent := (others => '0'); + if low_nz = '1' then + r.class := FINITE; + else + r.class := ZERO; + end if; else r.mantissa := fpr; r.exponent := (others => '0'); @@ -460,22 +517,22 @@ architecture behaviour of fpu is case class is when ZERO => when FINITE => - if mantissa(54) = '1' then + if mantissa(UNIT_BIT) = '1' then -- normalized number result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023); end if; - result(51 downto 29) := mantissa(53 downto 31); + result(51 downto 29) := mantissa(UNIT_BIT - 1 downto SP_LSB); if single_prec = '0' then - result(28 downto 0) := mantissa(30 downto 2); + result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB); end if; when INFINITY => result(62 downto 52) := "11111111111"; when NAN => result(62 downto 52) := "11111111111"; - result(51) := quieten_nan or mantissa(53); - result(50 downto 29) := mantissa(52 downto 31); + result(51) := quieten_nan or mantissa(QNAN_BIT); + result(50 downto 29) := mantissa(QNAN_BIT - 1 downto SP_LSB); if single_prec = '0' then - result(28 downto 0) := mantissa(30 downto 2); + result(28 downto 0) := mantissa(SP_LSB - 1 downto DP_LSB); end if; end case; return result; @@ -483,8 +540,8 @@ architecture behaviour of fpu is -- Determine whether to increment when rounding -- Returns rounding_inc & inexact - -- Assumes x includes the bottom 29 bits of the mantissa already - -- if single_prec = 1 (usually arranged by setting set_x = 1 earlier). + -- If single_prec = 1, assumes x includes the bottom 31 (== SP_LSB - 2) + -- bits of the mantissa already (usually arranged by setting set_x = 1 earlier). 
function fp_rounding(mantissa: std_ulogic_vector(63 downto 0); x: std_ulogic; single_prec: std_ulogic; rn: std_ulogic_vector(2 downto 0); sign: std_ulogic) @@ -494,11 +551,11 @@ architecture behaviour of fpu is variable lsb : std_ulogic; begin if single_prec = '0' then - grx := mantissa(1 downto 0) & x; - lsb := mantissa(2); + grx := mantissa(DP_GBIT downto DP_RBIT) & (x or (or mantissa(DP_RBIT - 1 downto 0))); + lsb := mantissa(DP_LSB); else - grx := mantissa(30 downto 29) & x; - lsb := mantissa(31); + grx := mantissa(SP_GBIT downto SP_RBIT) & x; + lsb := mantissa(SP_LSB); end if; ret(1) := '0'; ret(0) := or (grx); @@ -546,17 +603,31 @@ begin fpu_0: process(clk) begin if rising_edge(clk) then - if rst = '1' then + if rst = '1' or flush_in = '1' then r.state <= IDLE; r.busy <= '0'; + r.f2stall <= '0'; r.instr_done <= '0'; + r.complete <= '0'; + r.illegal <= '0'; r.do_intr <= '0'; + r.writing_fpr <= '0'; + r.writing_cr <= '0'; + r.writing_xer <= '0'; r.fpscr <= (others => '0'); - r.writing_back <= '0'; - r.dest_fpr <= (others =>'0'); + r.write_reg <= (others =>'0'); + r.complete_tag.valid <= '0'; r.cr_mask <= (others =>'0'); r.cr_result <= (others =>'0'); r.instr_tag.valid <= '0'; + if rst = '1' then + r.fpscr <= (others => '0'); + r.comm_fpscr <= (others => '0'); + elsif r.do_intr = '0' then + -- flush_in = 1 and not due to us generating an interrupt, + -- roll back to committed fpscr + r.fpscr <= r.comm_fpscr; + end if; else assert not (r.state /= IDLE and e_in.valid = '1') severity failure; r <= rin; @@ -571,11 +642,11 @@ begin begin if rising_edge(clk) then if r.is_sqrt = '1' then - addrhi := r.b.mantissa(55 downto 54); + addrhi := r.b.mantissa(UNIT_BIT + 1 downto UNIT_BIT); else addrhi := "00"; end if; - addr := addrhi & r.b.mantissa(53 downto 46); + addr := addrhi & r.b.mantissa(UNIT_BIT - 1 downto UNIT_BIT - 8); if is_X(addr) then inverse_est <= (others => 'X'); else @@ -585,20 +656,26 @@ begin end process; e_out.busy <= r.busy; + e_out.f2stall <= r.f2stall; 
e_out.exception <= r.fpscr(FPSCR_FEX); - w_out.valid <= r.instr_done and not r.do_intr; - w_out.instr_tag <= r.instr_tag; - w_out.write_enable <= r.writing_back; - w_out.write_reg <= r.dest_fpr; + -- Note that the cycle where r.complete = 1 for an instruction can be as + -- late as the second cycle of the following instruction (i.e. in the state + -- following IDLE state). Hence it is important that none of the fields of + -- r that are used below are modified in IDLE state. + w_out.valid <= r.complete; + w_out.instr_tag <= r.complete_tag; + w_out.write_enable <= r.writing_fpr and r.complete; + w_out.write_reg <= r.write_reg; w_out.write_data <= fp_result; - w_out.write_cr_enable <= r.instr_done and (r.rc or r.is_cmp); + w_out.write_cr_enable <= r.writing_cr and r.complete; w_out.write_cr_mask <= r.cr_mask; w_out.write_cr_data <= r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result & r.cr_result; + w_out.write_xerc <= r.writing_xer and r.complete; + w_out.xerc <= r.xerc_result; w_out.interrupt <= r.do_intr; w_out.intr_vec <= 16#700#; - w_out.srr0 <= r.nia; w_out.srr1 <= (47-44 => r.illegal, 47-43 => not r.illegal, others => '0'); fpu_1: process(all) @@ -607,10 +684,10 @@ begin variable bdec : fpu_reg_type; variable cdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); - variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); variable int_input : std_ulogic; + variable is_32bint : std_ulogic; variable mask : std_ulogic_vector(63 downto 0); variable in_a0 : std_ulogic_vector(63 downto 0); variable in_b0 : std_ulogic_vector(63 downto 0); @@ -636,7 +713,12 @@ begin variable msb : std_ulogic; variable is_add : std_ulogic; variable set_a : std_ulogic; + variable set_a_exp : std_ulogic; + variable set_a_mant : std_ulogic; + variable set_a_hi : std_ulogic; + variable set_a_lo : std_ulogic; variable set_b : std_ulogic; + variable set_b_mant : std_ulogic; variable 
set_c : std_ulogic; variable set_y : std_ulogic; variable set_s : std_ulogic; @@ -644,41 +726,71 @@ begin variable px_nz : std_ulogic; variable pcmpb_eq : std_ulogic; variable pcmpb_lt : std_ulogic; + variable pcmpc_eq : std_ulogic; + variable pcmpc_lt : std_ulogic; variable pshift : std_ulogic; variable renorm_sqrt : std_ulogic; variable sqrt_exp : signed(EXP_BITS-1 downto 0); variable shiftin : std_ulogic; + variable shiftin0 : std_ulogic; variable mulexp : signed(EXP_BITS-1 downto 0); variable maddend : std_ulogic_vector(127 downto 0); variable sum : std_ulogic_vector(63 downto 0); variable round_inc : std_ulogic_vector(63 downto 0); + variable rbit_inc : std_ulogic; + variable mult_mask : std_ulogic; + variable sign_bit : std_ulogic; + variable rnd_b32 : std_ulogic; + variable int_result : std_ulogic; + variable illegal : std_ulogic; begin v := r; - illegal := '0'; - v.busy := '0'; + v.complete := '0'; + v.do_intr := '0'; int_input := '0'; + is_32bint := '0'; + + if r.complete = '1' or r.do_intr = '1' then + v.instr_done := '0'; + v.writing_fpr := '0'; + v.writing_cr := '0'; + v.writing_xer := '0'; + v.comm_fpscr := r.fpscr; + v.illegal := '0'; + end if; -- capture incoming instruction if e_in.valid = '1' then v.insn := e_in.insn; - v.nia := e_in.nia; v.op := e_in.op; v.instr_tag := e_in.itag; v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; - v.longmask := e_in.single; - v.int_result := '0'; + v.is_signed := e_in.is_signed; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; - if e_in.out_cr = '0' then - v.cr_mask := num_to_fxm(1); - else - v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); - end if; - int_input := '0'; - if e_in.op = OP_FPOP_I then + v.oe := e_in.oe; + v.m32b := e_in.m32b; + v.xerc := e_in.xerc; + v.longmask := '0'; + v.integer_op := '0'; + v.divext := '0'; + v.divmod := '0'; + if e_in.op = OP_FPOP or e_in.op = OP_FPOP_I then + v.longmask := e_in.single; + if e_in.op = OP_FPOP_I then + int_input := 
'1'; + end if; + else -- OP_DIV, OP_DIVE, OP_MOD + v.integer_op := '1'; int_input := '1'; + is_32bint := e_in.single; + if e_in.op = OP_DIVE then + v.divext := '1'; + elsif e_in.op = OP_MOD then + v.divmod := '1'; + end if; end if; v.quieten_nan := '1'; v.tiny := '0'; @@ -689,10 +801,12 @@ begin v.is_sqrt := '0'; v.add_bsmall := '0'; v.doing_ftdiv := "00"; + v.int_ovf := '0'; + v.div_close := '0'; - adec := decode_dp(e_in.fra, int_input); - bdec := decode_dp(e_in.frb, int_input); - cdec := decode_dp(e_in.frc, int_input); + adec := decode_dp(e_in.fra, int_input, is_32bint, e_in.is_signed); + bdec := decode_dp(e_in.frb, int_input, is_32bint, e_in.is_signed); + cdec := decode_dp(e_in.frc, int_input, '0', '0'); v.a := adec; v.b := bdec; v.c := cdec; @@ -705,10 +819,14 @@ begin if (adec.exponent + cdec.exponent + 1) >= bdec.exponent then v.madd_cmp := '1'; end if; + + v.a_hi := 8x"0"; + v.a_lo := 56x"0"; end if; - r_hi_nz <= or (r.r(55 downto 31)); - r_lo_nz <= or (r.r(30 downto 2)); + r_hi_nz <= or (r.r(UNIT_BIT + 1 downto SP_LSB)); + r_lo_nz <= or (r.r(SP_LSB - 1 downto DP_LSB)); + r_gt_1 <= or (r.r(63 downto 1)); s_nz <= or (r.s); if r.single_prec = '0' then @@ -743,20 +861,26 @@ begin end if; -- Compare P with zero and with B - px_nz := or (r.p(57 downto 4)); + px_nz := or (r.p(UNIT_BIT + 1 downto 4)); pcmpb_eq := '0'; - if r.p(59 downto 4) = r.b.mantissa(55 downto 0) then + if r.p(59 downto 4) = r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT) then pcmpb_eq := '1'; end if; pcmpb_lt := '0'; if is_X(r.p(59 downto 4)) or is_X(r.b.mantissa(55 downto 0)) then pcmpb_lt := 'X'; - elsif unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(55 downto 0)) then + elsif unsigned(r.p(59 downto 4)) < unsigned(r.b.mantissa(UNIT_BIT + 1 downto DP_RBIT)) then pcmpb_lt := '1'; end if; + pcmpc_eq := '0'; + if r.p = r.c.mantissa then + pcmpc_eq := '1'; + end if; + pcmpc_lt := '0'; + if unsigned(r.p) < unsigned(r.c.mantissa) then + pcmpc_lt := '1'; + end if; - v.writing_back := '0'; - 
v.instr_done := '0'; v.update_fprf := '0'; v.shift := to_signed(0, EXP_BITS); v.first := '0'; @@ -778,7 +902,12 @@ begin set_x := '0'; qnan_result := '0'; set_a := '0'; + set_a_exp := '0'; + set_a_mant := '0'; + set_a_hi := '0'; + set_a_lo := '0'; set_b := '0'; + set_b_mant := '0'; set_c := '0'; set_s := '0'; f_to_multiply.is_32bit <= '0'; @@ -791,6 +920,12 @@ begin pshift := '0'; renorm_sqrt := '0'; shiftin := '0'; + shiftin0 := '0'; + rbit_inc := '0'; + mult_mask := '0'; + rnd_b32 := '0'; + int_result := '0'; + illegal := '0'; case r.state is when IDLE => v.use_a := '0'; @@ -799,6 +934,7 @@ begin v.invalid := '0'; v.negate := '0'; if e_in.valid = '1' then + v.busy := '1'; case e_in.insn(5 downto 1) is when "00000" => if e_in.insn(8) = '1' then @@ -836,6 +972,10 @@ begin else v.state := DO_FRI; end if; + when "01001" | "01011" => + -- integer divides and mods, major opcode 31 + v.opsel_a := AIN_B; + v.state := DO_IDIVMOD; when "01100" => v.opsel_a := AIN_B; v.state := DO_FRSP; @@ -853,7 +993,7 @@ begin v.state := DO_FCTI; when "10010" => v.opsel_a := AIN_A; - if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + if v.b.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then v.opsel_a := AIN_B; end if; v.state := DO_FDIV; @@ -872,7 +1012,7 @@ begin when "11001" => v.is_multiply := '1'; v.opsel_a := AIN_A; - if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then + if v.c.mantissa(UNIT_BIT) = '0' and v.a.mantissa(UNIT_BIT) = '1' then v.opsel_a := AIN_C; end if; v.state := DO_FMUL; @@ -881,22 +1021,26 @@ begin v.opsel_a := AIN_B; v.state := DO_FRSQRTE; when "11100" | "11101" | "11110" | "11111" => - if v.a.mantissa(54) = '0' then + if v.a.mantissa(UNIT_BIT) = '0' then v.opsel_a := AIN_A; - elsif v.c.mantissa(54) = '0' then + elsif v.c.mantissa(UNIT_BIT) = '0' then v.opsel_a := AIN_C; else v.opsel_a := AIN_B; end if; v.state := DO_FMADD; when others => - illegal := '1'; + v.state := DO_ILLEGAL; end case; end if; v.x := '0'; v.old_exc := r.fpscr(FPSCR_VX 
downto FPSCR_XX); set_s := '1'; + when DO_ILLEGAL => + illegal := '1'; + v.instr_done := '1'; + when DO_MCRFS => j := to_integer(unsigned(insn_bfa(r.insn))); for i in 0 to 7 loop @@ -908,14 +1052,12 @@ begin end loop; v.fpscr := r.fpscr and (fpscr_mask or x"6007F8FF"); v.instr_done := '1'; - v.state := IDLE; when DO_FTDIV => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.a.class = INFINITY or r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(53) = '0') then + (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then v.cr_result(2) := '1'; end if; if r.a.class = NAN or r.a.class = INFINITY or @@ -931,10 +1073,9 @@ begin when DO_FTSQRT => v.instr_done := '1'; - v.state := IDLE; v.cr_result := "0000"; if r.b.class = ZERO or r.b.class = INFINITY or - (r.b.class = FINITE and r.b.mantissa(53) = '0') then + (r.b.class = FINITE and r.b.mantissa(UNIT_BIT) = '0') then v.cr_result(2) := '1'; end if; if r.b.class = NAN or r.b.class = INFINITY or r.b.class = ZERO @@ -946,11 +1087,10 @@ begin -- fcmp[uo] -- r.opsel_a = AIN_B v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; v.result_exp := r.b.exponent; - if (r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.b.class = NAN and r.b.mantissa(53) = '0') then + if (r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; if r.insn(6) = '1' and r.fpscr(FPSCR_VE) = '0' then @@ -1007,7 +1147,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := IDLE; when DO_MTFSFI => -- mtfsfi @@ -1021,20 +1160,17 @@ begin end loop; end if; v.instr_done := '1'; - v.state := IDLE; when DO_FMRG => -- fmrgew, fmrgow opsel_r <= RES_MISC; misc_sel <= "01" & r.insn(8) & '0'; - v.int_result := '1'; - v.writing_back := '1'; + int_result := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MFFS => - v.int_result := '1'; - v.writing_back := '1'; + v.writing_fpr 
:= '1'; opsel_r <= RES_MISC; case r.insn(20 downto 16) is when "00000" => @@ -1058,10 +1194,11 @@ begin -- mffsl fpscr_mask := x"0007F0FF"; when others => - illegal := '1'; + v.illegal := '1'; + v.writing_fpr := '0'; end case; + int_result := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_MTFSF => if r.insn(25) = '1' then @@ -1078,7 +1215,6 @@ begin end if; end loop; v.instr_done := '1'; - v.state := IDLE; when DO_FMR => -- r.opsel_a = AIN_B @@ -1096,9 +1232,8 @@ begin else v.result_sign := r.a.negative; -- fcpsgn end if; - v.writing_back := '1'; + v.writing_fpr := '1'; v.instr_done := '1'; - v.state := IDLE; when DO_FRI => -- fri[nzpm] -- r.opsel_a = AIN_B @@ -1107,7 +1242,7 @@ begin v.result_exp := r.b.exponent; v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; - if r.b.class = NAN and r.b.mantissa(53) = '0' then + if r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0' then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; @@ -1167,7 +1302,7 @@ begin invalid := '1'; end if; - v.int_result := '1'; + int_result := '1'; case r.b.class is when ZERO => arith_done := '1'; @@ -1178,7 +1313,7 @@ begin elsif r.b.exponent >= to_signed(52, EXP_BITS) then -- integer already, no rounding required, -- shift into final position - v.shift := r.b.exponent - to_signed(54, EXP_BITS); + v.shift := r.b.exponent - to_signed(UNIT_BIT, EXP_BITS); if r.insn(8) = '1' and r.b.negative = '1' then v.state := INT_OFLOW; else @@ -1202,7 +1337,7 @@ begin v.result_sign := '1'; end if; v.result_class := r.b.class; - v.result_exp := to_signed(54, EXP_BITS); + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); v.fpscr(FPSCR_FR) := '0'; v.fpscr(FPSCR_FI) := '0'; if r.b.class = ZERO then @@ -1274,9 +1409,9 @@ begin if r.a.class = FINITE and r.c.class = FINITE then v.result_exp := r.a.exponent + r.c.exponent; -- Renormalize denorm operands - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.c.mantissa(54) = '0' then + elsif 
r.c.mantissa(UNIT_BIT) = '0' then v.state := RENORM_C; else f_to_multiply.valid <= '1'; @@ -1313,9 +1448,9 @@ begin v.count := "00"; if r.a.class = FINITE and r.b.class = FINITE then -- Renormalize denorm operands - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; else v.first := '1'; @@ -1372,7 +1507,7 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := SQRT_1; @@ -1404,7 +1539,7 @@ begin case r.b.class is when FINITE => v.result_exp := - r.b.exponent; - if r.b.mantissa(54) = '0' then + if r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; else v.state := FRE_1; @@ -1434,7 +1569,7 @@ begin if r.b.negative = '1' then v.fpscr(FPSCR_VXSQRT) := '1'; qnan_result := '1'; - elsif r.b.mantissa(54) = '0' then + elsif r.b.mantissa(UNIT_BIT) = '0' then v.state := RENORM_B; elsif r.b.exponent(0) = '0' then v.state := RSQRT_1; @@ -1476,9 +1611,9 @@ begin mulexp := r.a.exponent + r.c.exponent; v.result_exp := mulexp; -- Make sure A and C are normalized - if r.a.mantissa(54) = '0' then + if r.a.mantissa(UNIT_BIT) = '0' then v.state := RENORM_A; - elsif r.c.mantissa(54) = '0' then + elsif r.c.mantissa(UNIT_BIT) = '0' then v.state := RENORM_C; elsif r.b.class = ZERO then -- no addend, degenerates to multiply @@ -1547,7 +1682,7 @@ begin set_a := '1'; v.result_exp := new_exp; if r.insn(4) = '1' then - if r.c.mantissa(54) = '1' then + if r.c.mantissa(UNIT_BIT) = '1' then if r.insn(3) = '0' or r.b.class = ZERO then v.first := '1'; v.state := MULT_1; @@ -1563,7 +1698,7 @@ begin v.state := RENORM_C; end if; else - if r.b.mantissa(54) = '1' then + if r.b.mantissa(UNIT_BIT) = '1' then v.first := '1'; v.state := DIV_2; else @@ -1642,7 +1777,7 @@ begin opsel_ainv <= '1'; 
carry_in <= '1'; v.state := FINISH; - elsif r.r(55) = '1' then + elsif r.r(UNIT_BIT + 1) = '1' then -- sum overflowed, shift right opsel_r <= RES_SHIFT; set_x := '1'; @@ -1651,10 +1786,10 @@ begin else v.state := ROUNDING; end if; - elsif r.r(54) = '1' then + elsif r.r(UNIT_BIT) = '1' then set_x := '1'; v.state := ROUNDING; - elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + elsif (r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then -- r.x must be zero at this point v.result_class := ZERO; if r.is_subtract = '1' then @@ -1685,7 +1820,6 @@ begin end if; v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result; v.instr_done := '1'; - v.state := IDLE; when MULT_1 => f_to_multiply.valid <= r.first; @@ -1742,12 +1876,12 @@ begin opsel_s <= S_NEG; set_s := '1'; end if; - v.shift := to_signed(56, EXP_BITS); + v.shift := to_signed(UNIT_BIT, EXP_BITS); v.state := FMADD_6; when FMADD_6 => - -- r.shift = 56 (or 0, but only if r is now nonzero) - if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then + -- r.shift = UNIT_BIT (or 0, but only if r is now nonzero) + if (r.r(UNIT_BIT + 2) or r_hi_nz or r_lo_nz or (or (r.r(DP_LSB - 1 downto 0)))) = '0' then if s_nz = '0' then -- must be a subtraction, and r.x must be zero v.result_class := ZERO; @@ -1760,7 +1894,7 @@ begin set_s := '1'; -- stay in state FMADD_6 end if; - elsif r.r(56 downto 54) = "001" then + elsif r.r(UNIT_BIT + 2 downto UNIT_BIT) = "001" then v.state := FINISH; else renormalize := '1'; @@ -1824,6 +1958,7 @@ begin set_y := r.first; f_to_multiply.valid <= r.first; pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then opsel_r <= RES_MULT; v.first := '1'; @@ -1842,13 +1977,15 @@ begin end if; when DIV_6 => + -- r.opsel_a = AIN_R -- test if remainder is 0 or >= B if pcmpb_lt = '1' then -- quotient is correct, set X if remainder non-zero - v.x := r.p(58) or px_nz; + v.x := r.p(UNIT_BIT + 2) or px_nz; else - -- quotient needs to be incremented by 1 - carry_in <= '1'; + -- 
quotient needs to be incremented by 1 in R-bit position + rbit_inc := '1'; + opsel_b <= BIN_RND; v.x := not pcmpb_eq; end if; v.state := FINISH; @@ -1863,7 +2000,6 @@ begin v.cr_result(1) := exp_tiny or exp_huge; if exp_tiny = '1' or exp_huge = '1' or r.a.class = ZERO or r.first = '0' then v.instr_done := '1'; - v.state := IDLE; else v.shift := r.a.exponent; v.doing_ftdiv := "10"; @@ -1903,6 +2039,7 @@ begin msel_2 <= MUL2_R; set_y := r.first; pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then -- put result into R opsel_r <= RES_MULT; @@ -1947,6 +2084,7 @@ begin set_y := r.first; -- wait for second multiply (should be here already) pshift := '1'; + mult_mask := '1'; if multiply_to_f.valid = '1' then -- put result into R opsel_r <= RES_MULT; @@ -1991,11 +2129,8 @@ begin end if; when SQRT_10 => - -- Add the bottom 8 bits of P, sign-extended, - -- divided by 4, onto R. - -- The division by 4 is because R is 10.54 format - -- whereas P is 8.56 format. - opsel_b <= BIN_PS6; + -- Add the bottom 8 bits of P, sign-extended, onto R. 
+ opsel_b <= BIN_PS8; sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1); v.result_exp := sqrt_exp; v.shift := to_signed(1, EXP_BITS); @@ -2020,7 +2155,7 @@ begin -- test if remainder is 0 or >= B = 2*R + 1 if pcmpb_lt = '1' then -- square root is correct, set X if remainder non-zero - v.x := r.p(58) or px_nz; + v.x := r.p(UNIT_BIT + 2) or px_nz; else -- square root needs to be incremented by 1 carry_in <= '1'; @@ -2033,10 +2168,10 @@ begin opsel_r <= RES_SHIFT; set_x := '1'; v.state := INT_ROUND; - v.shift := to_signed(-2, EXP_BITS); + v.shift := to_signed(52 - UNIT_BIT, EXP_BITS); when INT_ROUND => - -- r.shift = -2 + -- r.shift = -4 (== 52 - UNIT_BIT) opsel_r <= RES_SHIFT; round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign); v.fpscr(FPSCR_FR downto FPSCR_FI) := round; @@ -2049,7 +2184,7 @@ begin end if; when INT_ISHIFT => - -- r.shift = b.exponent - 54; + -- r.shift = b.exponent - UNIT_BIT; opsel_r <= RES_SHIFT; v.state := INT_FINAL; @@ -2068,6 +2203,7 @@ begin when others => -- fctidu[z] need_check := r.r(63); end case; + int_result := '1'; if need_check = '1' then v.state := INT_CHECK; else @@ -2094,6 +2230,7 @@ begin v.fpscr(FPSCR_XX) := '1'; end if; end if; + int_result := '1'; arith_done := '1'; when INT_OFLOW => @@ -2104,6 +2241,7 @@ begin end if; v.fpscr(FPSCR_VXCVI) := '1'; invalid := '1'; + int_result := '1'; arith_done := '1'; when FRI_1 => @@ -2116,7 +2254,7 @@ begin if r.is_multiply = '1' and px_nz = '1' then v.x := '1'; end if; - if r.r(63 downto 54) /= "0000000001" then + if r.r(63 downto UNIT_BIT) /= std_ulogic_vector(to_unsigned(1, 64 - UNIT_BIT)) then renormalize := '1'; v.state := NORMALIZE; else @@ -2159,7 +2297,7 @@ begin -- if denormalized, have to normalize before rounding v.fpscr(FPSCR_UX) := '1'; v.result_exp := r.result_exp + bias_exp; - if r.r(54) = '0' then + if r.r(UNIT_BIT) = '0' then renormalize := '1'; v.state := NORMALIZE; else @@ -2202,7 +2340,7 @@ begin v.shift := to_signed(-1, EXP_BITS); 
v.state := ROUNDING_2; else - if r.r(54) = '0' then + if r.r(UNIT_BIT) = '0' then -- result after masking could be zero, or could be a -- denormalized result that needs to be renormalized renormalize := '1'; @@ -2222,14 +2360,14 @@ begin -- Check for overflow during rounding -- r.shift = -1 v.x := '0'; - if r.r(55) = '1' then + if r.r(UNIT_BIT + 1) = '1' then opsel_r <= RES_SHIFT; if exp_huge = '1' then v.state := ROUND_OFLOW; else arith_done := '1'; end if; - elsif r.r(54) = '0' then + elsif r.r(UNIT_BIT) = '0' then -- Do CLZ so we can renormalize the result renormalize := '1'; v.state := ROUNDING_3; @@ -2265,9 +2403,9 @@ begin arith_done := '1'; when NAN_RESULT => - if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or - (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or - (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then + if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(QNAN_BIT) = '0') or + (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(QNAN_BIT) = '0') or + (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(QNAN_BIT) = '0') then -- Signalling NAN v.fpscr(FPSCR_VXSNAN) := '1'; invalid := '1'; @@ -2299,6 +2437,502 @@ begin end case; arith_done := '1'; + when DO_IDIVMOD => + -- r.opsel_a = AIN_B + v.result_sign := r.is_signed and (r.a.negative xor (r.b.negative and not r.divmod)); + if r.b.class = ZERO then + -- B is zero, signal overflow + v.int_ovf := '1'; + v.state := IDIV_ZERO; + elsif r.a.class = ZERO then + -- A is zero, result is zero (both for div and for mod) + v.state := IDIV_ZERO; + else + -- take absolute value for signed division, and + -- normalize and round up B to 8.56 format, like fcfid[u] + if r.is_signed = '1' and r.b.negative = '1' then + opsel_ainv <= '1'; + carry_in <= '1'; + end if; + v.result_class := FINITE; + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); + v.state := IDIV_NORMB; + end if; + when IDIV_NORMB => + -- do count-leading-zeroes on B (now in R) + renormalize := '1'; + 
-- save the original value of B or |B| in C + set_c := '1'; + v.state := IDIV_NORMB2; + when IDIV_NORMB2 => + -- get B into the range [1, 2) in 8.56 format + set_x := '1'; -- record if any 1 bits shifted out + opsel_r <= RES_SHIFT; + v.state := IDIV_NORMB3; + when IDIV_NORMB3 => + -- add the X bit onto R to round up B + carry_in <= r.x; + -- prepare to do count-leading-zeroes on A + v.opsel_a := AIN_A; + v.state := IDIV_CLZA; + when IDIV_CLZA => + set_b := '1'; -- put R back into B + -- r.opsel_a = AIN_A + if r.is_signed = '1' and r.a.negative = '1' then + opsel_ainv <= '1'; + carry_in <= '1'; + end if; + v.result_exp := to_signed(UNIT_BIT, EXP_BITS); + v.opsel_a := AIN_C; + v.state := IDIV_CLZA2; + when IDIV_CLZA2 => + -- r.opsel_a = AIN_C + renormalize := '1'; + -- write the dividend back into A in case we negated it + set_a_mant := '1'; + -- while doing the count-leading-zeroes on A, + -- also compute A - B to tell us whether A >= B + -- (using the original value of B, which is now in C) + opsel_b <= BIN_R; + opsel_ainv <= '1'; + carry_in <= '1'; + v.state := IDIV_CLZA3; + when IDIV_CLZA3 => + -- save the exponent of A (but don't overwrite the mantissa) + v.a.exponent := new_exp; + v.div_close := '0'; + if new_exp = r.b.exponent then + v.div_close := '1'; + end if; + v.state := IDIV_NR0; + if new_exp > r.b.exponent or (v.div_close = '1' and r.r(63) = '0') then + -- A >= B, overflow if extended division + if r.divext = '1' then + v.int_ovf := '1'; + -- return 0 in overflow cases + v.state := IDIV_ZERO; + end if; + else + -- A < B, result is zero for normal division + if r.divmod = '0' and r.divext = '0' then + v.state := IDIV_ZERO; + end if; + end if; + when IDIV_NR0 => + -- reduce number of Newton-Raphson iterations for small A + if r.divext = '1' or new_exp >= to_signed(32, EXP_BITS) then + v.count := "00"; + elsif new_exp >= to_signed(16, EXP_BITS) then + v.count := "01"; + else + v.count := "10"; + end if; + -- first NR iteration does Y = LUT; P = 2 - B * LUT 
+ msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + msel_2 <= MUL2_LUT; + set_y := '1'; + if r.b.mantissa(UNIT_BIT + 1) = '1' then + -- rounding up of the mantissa caused overflow, meaning the + -- normalized B is 2.0. Since this is outside the range + -- of the LUT, just use 0.5 as the estimated inverse. + v.state := IDIV_USE0_5; + else + -- start the first multiply now + f_to_multiply.valid <= '1'; + -- note we don't set v.first, thus the following IDIV_NR1 + -- state doesn't start a multiply (we already did that) + v.state := IDIV_NR1; + end if; + when IDIV_NR1 => + -- subsequent NR iterations do Y = P; P = 2 - B * P + msel_1 <= MUL1_B; + msel_add <= MULADD_CONST; + msel_inv <= '1'; + msel_2 <= MUL2_P; + set_y := r.first; + pshift := '1'; + f_to_multiply.valid <= r.first; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.count := r.count + 1; + v.state := IDIV_NR2; + end if; + when IDIV_NR2 => + -- compute P = Y * P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + f_to_multiply.valid <= r.first; + pshift := '1'; + v.opsel_a := AIN_A; + v.shift := to_signed(64, EXP_BITS); + -- Get 0.5 into R in case the inverse estimate turns out to be + -- less than 0.5, in which case we want to use 0.5, to avoid + -- infinite loops in some cases. 
+ opsel_r <= RES_MISC; + misc_sel <= "0001"; + if multiply_to_f.valid = '1' then + v.first := '1'; + if r.count = "11" then + v.state := IDIV_DODIV; + else + v.state := IDIV_NR1; + end if; + end if; + when IDIV_USE0_5 => + -- Get 0.5 into R; it turns out the generated + -- QNaN mantissa is actually what we want + opsel_r <= RES_MISC; + misc_sel <= "0001"; + v.opsel_a := AIN_A; + v.shift := to_signed(64, EXP_BITS); + v.state := IDIV_DODIV; + when IDIV_DODIV => + -- r.opsel_a = AIN_A + -- r.shift = 64 + -- inverse estimate is in P or in R; copy it to Y + if r.b.mantissa(UNIT_BIT + 1) = '1' or + (r.p(UNIT_BIT) = '0' and r.p(UNIT_BIT - 1) = '0') then + msel_2 <= MUL2_R; + else + msel_2 <= MUL2_P; + end if; + set_y := '1'; + -- shift_res is 0 because r.shift = 64; + -- put that into B, which now holds the quotient + set_b_mant := '1'; + if r.divext = '0' then + v.shift := to_signed(-UNIT_BIT, EXP_BITS); + v.first := '1'; + v.state := IDIV_DIV; + elsif r.single_prec = '1' then + -- divwe[u][o], shift A left 32 bits + v.shift := to_signed(32, EXP_BITS); + v.state := IDIV_SH32; + elsif r.div_close = '0' then + v.shift := to_signed(64 - UNIT_BIT, EXP_BITS); + v.state := IDIV_EXTDIV; + else + -- handle top bit of quotient specially + -- for this we need the divisor left-justified in B + v.opsel_a := AIN_C; + v.state := IDIV_EXT_TBH; + end if; + when IDIV_SH32 => + -- r.shift = 32, R contains the dividend + opsel_r <= RES_SHIFT; + v.shift := to_signed(-UNIT_BIT, EXP_BITS); + v.first := '1'; + v.state := IDIV_DIV; + when IDIV_DIV => + -- Dividing A by C, r.shift = -56; A is in R + -- Put A into the bottom 64 bits of Ahi/A/Alo + set_a_mant := r.first; + set_a_lo := r.first; + -- compute R = R * Y (quotient estimate) + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + f_to_multiply.valid <= r.first; + pshift := '1'; + opsel_r <= RES_MULT; + v.shift := - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV2; + end if; + when IDIV_DIV2 => + -- r.shift = - b.exponent + -- 
shift the quotient estimate right by b.exponent bits + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := IDIV_DIV3; + when IDIV_DIV3 => + -- quotient (so far) is in R; multiply by C and subtract from A + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + -- store the current quotient estimate in B + set_b_mant := r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV4; + end if; + when IDIV_DIV4 => + -- remainder is in R/S and P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + v.inc_quot := not pcmpc_lt and not r.divmod; + if r.divmod = '0' then + v.opsel_a := AIN_B; + end if; + v.shift := to_signed(UNIT_BIT, EXP_BITS); + if pcmpc_lt = '1' or pcmpc_eq = '1' then + if r.divmod = '0' then + v.state := IDIV_DIVADJ; + elsif pcmpc_eq = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_MODADJ; + end if; + else + -- need to do another iteration, compute P * Y + f_to_multiply.valid <= '1'; + v.state := IDIV_DIV5; + end if; + when IDIV_DIV5 => + pshift := '1'; + opsel_r <= RES_MULT; + v.shift := - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV6; + end if; + when IDIV_DIV6 => + -- r.shift = - b.exponent + -- shift the quotient estimate right by b.exponent bits + opsel_r <= RES_SHIFT; + v.opsel_a := AIN_B; + v.first := '1'; + v.state := IDIV_DIV7; + when IDIV_DIV7 => + -- r.opsel_a = AIN_B + -- add shifted quotient delta onto the total quotient + opsel_b <= BIN_R; + v.first := '1'; + v.state := IDIV_DIV8; + when IDIV_DIV8 => + -- quotient (so far) is in R; multiply by C and subtract from A + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + -- store the current quotient estimate in B + set_b_mant := r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + if multiply_to_f.valid = '1' then + v.state := IDIV_DIV9; + end if; + when 
IDIV_DIV9 => + -- remainder is in R/S and P + msel_1 <= MUL1_Y; + msel_2 <= MUL2_P; + v.inc_quot := not pcmpc_lt and not r.divmod; + if r.divmod = '0' then + v.opsel_a := AIN_B; + end if; + v.shift := to_signed(UNIT_BIT, EXP_BITS); + if r.divmod = '0' then + v.state := IDIV_DIVADJ; + elsif pcmpc_eq = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_MODADJ; + end if; + when IDIV_EXT_TBH => + -- r.opsel_a = AIN_C; get divisor into R and prepare to shift left + v.shift := to_signed(63, EXP_BITS) - r.b.exponent; + v.opsel_a := AIN_A; + v.state := IDIV_EXT_TBH2; + when IDIV_EXT_TBH2 => + -- r.opsel_a = AIN_A; divisor is in R + -- r.shift = 63 - b.exponent; shift and put into B + set_b_mant := '1'; + v.shift := to_signed(64 - UNIT_BIT, EXP_BITS); + v.state := IDIV_EXT_TBH3; + when IDIV_EXT_TBH3 => + -- Dividing (A << 64) by C + -- r.shift = 8 + -- Put A in the top 64 bits of Ahi/A/Alo + set_a_hi := '1'; + set_a_mant := '1'; + v.shift := to_signed(64, EXP_BITS) - r.b.exponent; + v.state := IDIV_EXT_TBH4; + when IDIV_EXT_TBH4 => + -- dividend (A) is in R + -- r.shift = 64 - B.exponent, so is at least 1 + opsel_r <= RES_SHIFT; + -- top bit of A gets lost in the shift, so handle it specially + v.opsel_a := AIN_B; + v.shift := to_signed(63, EXP_BITS); + v.state := IDIV_EXT_TBH5; + when IDIV_EXT_TBH5 => + -- r.opsel_a = AIN_B, r.shift = 63 + -- shifted dividend is in R, subtract left-justified divisor + opsel_b <= BIN_R; + opsel_ainv <= '1'; + carry_in <= '1'; + -- and put 1<<63 into B as the top bit of the quotient (S is still 0) + shiftin0 := '1'; + set_b_mant := '1'; + v.first := '1'; + v.state := IDIV_EXTDIV2; + when IDIV_EXTDIV => + -- Dividing (A << 64) by C + -- r.shift = 8 + -- Put A in the top 64 bits of Ahi/A/Alo + set_a_hi := '1'; + set_a_mant := '1'; + v.shift := to_signed(64, EXP_BITS) - r.b.exponent; + v.state := IDIV_EXTDIV1; + when IDIV_EXTDIV1 => + -- dividend is in R + -- r.shift = 64 - B.exponent + opsel_r <= RES_SHIFT; + v.first := '1'; + v.state := IDIV_EXTDIV2; 
+ when IDIV_EXTDIV2 => + -- shifted remainder is in R; compute R = R * Y (quotient estimate) + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + f_to_multiply.valid <= r.first; + pshift := '1'; + v.opsel_a := AIN_B; + opsel_r <= RES_MULT; + if multiply_to_f.valid = '1' then + v.first := '1'; + v.state := IDIV_EXTDIV3; + end if; + when IDIV_EXTDIV3 => + -- r.opsel_a = AIN_B + -- delta quotient is in R; add it to B + opsel_b <= BIN_R; + v.first := '1'; + v.state := IDIV_EXTDIV4; + when IDIV_EXTDIV4 => + -- quotient is in R; put it in B and compute remainder + set_b_mant := r.first; + msel_1 <= MUL1_R; + msel_2 <= MUL2_C; + msel_add <= MULADD_A; + msel_inv <= '1'; + f_to_multiply.valid <= r.first; + opsel_r <= RES_MULT; + opsel_s <= S_MULT; + set_s := '1'; + v.shift := to_signed(UNIT_BIT, EXP_BITS) - r.b.exponent; + if multiply_to_f.valid = '1' then + v.state := IDIV_EXTDIV5; + end if; + when IDIV_EXTDIV5 => + -- r.shift = r.b.exponent - 56 + -- remainder is in R/S; shift it right r.b.exponent bits + opsel_r <= RES_SHIFT; + -- test LS 64b of remainder in P against divisor in C + v.inc_quot := not pcmpc_lt; + v.opsel_a := AIN_B; + v.state := IDIV_EXTDIV6; + when IDIV_EXTDIV6 => + -- r.opsel_a = AIN_B + -- shifted remainder is in R, see if it is > 1 + -- and compute R = R * Y if so + msel_1 <= MUL1_Y; + msel_2 <= MUL2_R; + pshift := '1'; + if r_gt_1 = '1' then + f_to_multiply.valid <= '1'; + v.state := IDIV_EXTDIV2; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_MODADJ => + -- r.shift = 56 + -- result is in R/S + opsel_r <= RES_SHIFT; + if pcmpc_lt = '0' then + v.opsel_a := AIN_C; + v.state := IDIV_MODSUB; + elsif r.result_sign = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_MODSUB => + -- r.opsel_a = AIN_C + -- Subtract divisor from remainder + opsel_ainv <= '1'; + carry_in <= '1'; + opsel_b <= BIN_R; + if r.result_sign = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_DIVADJ; + end if; + when IDIV_DIVADJ => + -- result 
(so far) is on the A input of the adder + -- set carry to increment quotient if needed + -- and also negate R if the answer is negative + opsel_ainv <= r.result_sign; + carry_in <= r.inc_quot xor r.result_sign; + rnd_b32 := '1'; + if r.divmod = '0' then + opsel_b <= BIN_RND; + end if; + if r.is_signed = '0' then + v.state := IDIV_DONE; + else + v.state := IDIV_OVFCHK; + end if; + when IDIV_OVFCHK => + if r.single_prec = '0' then + sign_bit := r.r(63); + else + sign_bit := r.r(31); + end if; + v.int_ovf := sign_bit xor r.result_sign; + if v.int_ovf = '1' then + v.state := IDIV_ZERO; + else + v.state := IDIV_DONE; + end if; + when IDIV_DONE => + v.xerc_result := v.xerc; + if r.oe = '1' then + v.xerc_result.ov := '0'; + v.xerc_result.ov32 := '0'; + v.writing_xer := '1'; + end if; + if r.m32b = '0' then + v.cr_result(3) := r.r(63); + v.cr_result(2 downto 1) := "00"; + if r.r = 64x"0" then + v.cr_result(1) := '1'; + else + v.cr_result(2) := not r.r(63); + end if; + else + v.cr_result(3) := r.r(31); + v.cr_result(2 downto 1) := "00"; + if r.r(31 downto 0) = 32x"0" then + v.cr_result(1) := '1'; + else + v.cr_result(2) := not r.r(31); + end if; + end if; + v.cr_result(0) := v.xerc.so; + int_result := '1'; + v.writing_fpr := '1'; + v.instr_done := '1'; + when IDIV_ZERO => + opsel_r <= RES_MISC; + misc_sel <= "0101"; + v.xerc_result := v.xerc; + if r.oe = '1' then + v.xerc_result.ov := r.int_ovf; + v.xerc_result.ov32 := r.int_ovf; + v.xerc_result.so := r.xerc.so or r.int_ovf; + v.writing_xer := '1'; + end if; + v.cr_result := "001" & v.xerc_result.so; + int_result := '1'; + v.writing_fpr := '1'; + v.instr_done := '1'; + end case; if zero_divide = '1' then @@ -2320,50 +2954,53 @@ begin -- Neither does enabled zero-divide exception if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then - v.writing_back := '1'; + v.writing_fpr := '1'; v.update_fprf := '1'; end if; v.instr_done := '1'; - v.state := IDLE; update_fx := '1'; end if; -- 
Multiplier and divide/square root data path case msel_1 is when MUL1_A => - f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00"; + f_to_multiply.data1 <= r.a.mantissa; when MUL1_B => - f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00"; + f_to_multiply.data1 <= r.b.mantissa; when MUL1_Y => f_to_multiply.data1 <= r.y; when others => - f_to_multiply.data1 <= r.r(61 downto 0) & "00"; + f_to_multiply.data1 <= r.r; end case; case msel_2 is when MUL2_C => - f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00"; + f_to_multiply.data2 <= r.c.mantissa; when MUL2_LUT => - f_to_multiply.data2 <= x"00" & inverse_est & '0' & x"000000000"; + f_to_multiply.data2 <= std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), + UNIT_BIT - 19)); when MUL2_P => f_to_multiply.data2 <= r.p; when others => - f_to_multiply.data2 <= r.r(61 downto 0) & "00"; + f_to_multiply.data2 <= r.r; end case; maddend := (others => '0'); case msel_add is when MULADD_CONST => -- addend is 2.0 or 1.5 in 16.112 format if r.is_sqrt = '0' then - maddend(113) := '1'; -- 2.0 + maddend(2*UNIT_BIT + 1) := '1'; -- 2.0 else - maddend(112 downto 111) := "11"; -- 1.5 + maddend(2*UNIT_BIT downto 2*UNIT_BIT - 1) := "11"; -- 1.5 end if; when MULADD_A => -- addend is A in 16.112 format - maddend(121 downto 58) := r.a.mantissa; + maddend(127 downto UNIT_BIT + 64) := r.a_hi; + maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.a.mantissa; + maddend(UNIT_BIT - 1 downto 0) := r.a_lo; when MULADD_RS => -- addend is concatenation of R and S in 16.112 format - maddend := "000000" & r.r & r.s & "00"; + maddend(UNIT_BIT + 63 downto UNIT_BIT) := r.r; + maddend(UNIT_BIT - 1 downto 0) := r.s; when others => end case; if msel_inv = '1' then @@ -2379,7 +3016,7 @@ begin if pshift = '0' then v.p := multiply_to_f.result(63 downto 0); else - v.p := multiply_to_f.result(119 downto 56); + v.p := multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT); end if; end if; @@ -2421,27 +3058,34 @@ begin when BIN_R => in_b0 := r.r; when 
BIN_RND => - round_inc := (31 => r.single_prec, 2 => not r.single_prec, others => '0'); + if rnd_b32 = '1' then + round_inc := (32 => r.result_sign and r.single_prec, others => '0'); + elsif rbit_inc = '0' then + round_inc := (SP_LSB => r.single_prec, DP_LSB => not r.single_prec, others => '0'); + else + round_inc := (DP_RBIT => '1', others => '0'); + end if; in_b0 := round_inc; when others => - -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64 - in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64)); + -- BIN_PS8, 8 LSBs of P sign-extended to 64 + in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 0)), 64)); end case; if opsel_binv = '1' then in_b0 := not in_b0; end if; in_b <= in_b0; if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then - shift_res := shifter_64(r.r & (shiftin or r.s(55)) & r.s(54 downto 0), + shift_res := shifter_64(r.r(63 downto 1) & (shiftin0 or r.r(0)) & + (shiftin or r.s(55)) & r.s(54 downto 0), std_ulogic_vector(r.shift(6 downto 0))); else shift_res := (others => '0'); end if; sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in); if opsel_mask = '1' then - sum(1 downto 0) := "00"; + sum(DP_LSB - 1 downto 0) := "0000"; if r.single_prec = '1' then - sum(30 downto 2) := (others => '0'); + sum(SP_LSB - 1 downto DP_LSB) := (others => '0'); end if; end if; case opsel_r is @@ -2450,20 +3094,25 @@ begin when RES_SHIFT => result <= shift_res; when RES_MULT => - result <= multiply_to_f.result(121 downto 58); + result <= multiply_to_f.result(UNIT_BIT + 63 downto UNIT_BIT); + if mult_mask = '1' then + -- trim to 54 fraction bits if mult_mask = 1, for quotient when dividing + result(UNIT_BIT - 55 downto 0) <= (others => '0'); + end if; when others => + misc := (others => '0'); case misc_sel is when "0000" => misc := x"00000000" & (r.fpscr and fpscr_mask); when "0001" => -- generated QNaN mantissa - misc := x"0020000000000000"; + misc(QNAN_BIT) := '1'; when "0010" => -- mantissa of max representable DP 
number - misc := x"007ffffffffffffc"; + misc(UNIT_BIT downto DP_LSB) := (others => '1'); when "0011" => -- mantissa of max representable SP number - misc := x"007fffff80000000"; + misc(UNIT_BIT downto SP_LSB) := (others => '1'); when "0100" => -- fmrgow result misc := r.a.mantissa(31 downto 0) & r.b.mantissa(31 downto 0); @@ -2471,7 +3120,8 @@ begin -- fmrgew result misc := r.a.mantissa(63 downto 32) & r.b.mantissa(63 downto 32); when "0111" => - misc := 10x"000" & inverse_est & 35x"000000000"; + misc := std_ulogic_vector(shift_left(resize(unsigned(inverse_est), 64), + UNIT_BIT - 19)); when "1000" => -- max positive result for fctiw[z] misc := x"000000007fffffff"; @@ -2497,7 +3147,6 @@ begin -- max negative result for fctidu[z] misc := x"0000000000000000"; when others => - misc := x"0000000000000000"; end case; result <= misc; end case; @@ -2507,7 +3156,7 @@ begin when S_NEG => v.s := std_ulogic_vector(unsigned(not r.s) + (not r.x)); when S_MULT => - v.s := multiply_to_f.result(57 downto 2); + v.s := multiply_to_f.result(55 downto 0); when S_SHIFT => v.s := shift_res(63 downto 8); if shift_res(7 downto 0) /= x"00" then @@ -2518,12 +3167,27 @@ begin end case; end if; - if set_a = '1' then + if set_a = '1' or set_a_exp = '1' then v.a.exponent := new_exp; + end if; + if set_a = '1' or set_a_mant = '1' then v.a.mantissa := shift_res; end if; + if e_in.valid = '1' then + v.a_hi := (others => '0'); + v.a_lo := (others => '0'); + else + if set_a_hi = '1' then + v.a_hi := r.r(63 downto 56); + end if; + if set_a_lo = '1' then + v.a_lo := r.r(55 downto 0); + end if; + end if; if set_b = '1' then v.b.exponent := new_exp; + end if; + if set_b = '1' or set_b_mant = '1' then v.b.mantissa := shift_res; end if; if set_c = '1' then @@ -2541,18 +3205,12 @@ begin -- make denormalized value end up with even exponent clz(0) := '1'; end if; - v.shift := resize(signed('0' & clz) - 9, EXP_BITS); + v.shift := resize(signed('0' & clz) - (63 - UNIT_BIT), EXP_BITS); end if; - if r.int_result 
= '1' then - fp_result <= r.r; - else - fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, - r.single_prec, r.quieten_nan); - end if; if r.update_fprf = '1' then v.fpscr(FPSCR_C downto FPSCR_FU) := result_flags(r.result_sign, r.result_class, - r.r(54) and not r.denorm); + r.r(UNIT_BIT) and not r.denorm); end if; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or @@ -2563,24 +3221,51 @@ begin (v.fpscr(FPSCR_VX downto FPSCR_XX) and not r.old_exc) /= "00000" then v.fpscr(FPSCR_FX) := '1'; end if; - if r.rc = '1' then - v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); - end if; - v.illegal := illegal; - if illegal = '1' then - v.instr_done := '0'; - v.do_intr := '1'; - v.writing_back := '0'; - v.busy := '0'; - v.state := IDLE; + if v.instr_done = '1' then + if r.state /= IDLE then + v.state := IDLE; + v.busy := '0'; + v.f2stall := '0'; + if r.rc = '1' and (r.op = OP_FPOP or r.op = OP_FPOP_I) then + v.cr_result := v.fpscr(FPSCR_FX downto FPSCR_OX); + end if; + v.sp_result := r.single_prec; + v.int_result := int_result; + v.illegal := illegal; + v.nsnan_result := v.quieten_nan; + if r.integer_op = '1' then + v.cr_mask := num_to_fxm(0); + elsif r.is_cmp = '0' then + v.cr_mask := num_to_fxm(1); + else + v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(r.insn)))); + end if; + v.writing_cr := r.is_cmp or r.rc; + v.write_reg := r.dest_fpr; + v.complete_tag := r.instr_tag; + end if; + if e_in.stall = '0' then + v.complete := not v.illegal; + v.do_intr := (v.fpscr(FPSCR_FEX) and r.fe_mode) or v.illegal; + end if; + -- N.B. We rely on execute1 to prevent any new instruction + -- coming in while e_in.stall = 1, without us needing to + -- have busy asserted. 
else - v.do_intr := v.instr_done and v.fpscr(FPSCR_FEX) and r.fe_mode; - if v.state /= IDLE or v.do_intr = '1' then - v.busy := '1'; + if r.state /= IDLE and e_in.stall = '0' then + v.f2stall := '1'; end if; end if; + -- This mustn't depend on any fields of r that are modified in IDLE state. + if r.int_result = '1' then + fp_result <= r.r; + else + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r, + r.sp_result, r.nsnan_result); + end if; + rin <= v; end process; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index f0c4532ca..439f124ba 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -35,6 +35,12 @@ entity loadstore1 is events : out Loadstore1EventType; + -- Access to SPRs from core_debug module + dbg_spr_req : in std_ulogic; + dbg_spr_ack : out std_ulogic; + dbg_spr_addr : in std_ulogic_vector(1 downto 0); + dbg_spr_data : out std_ulogic_vector(63 downto 0); + log_out : out std_ulogic_vector(9 downto 0) ); end loadstore1; @@ -43,9 +49,7 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction - MMU_LOOKUP, -- waiting for MMU to look up translation - TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie - FINISH_LFS -- write back converted SP data for lfs* + MMU_WAIT -- waiting for MMU to finish doing something ); type byte_index_t is array(0 to 7) of unsigned(2 downto 0); @@ -63,9 +67,7 @@ architecture behave of loadstore1 is write_spr : std_ulogic; mmu_op : std_ulogic; instr_fault : std_ulogic; - load_zero : std_ulogic; do_update : std_ulogic; - noop : std_ulogic; mode_32bit : std_ulogic; addr : std_ulogic_vector(63 downto 0); byte_sel : std_ulogic_vector(7 downto 0); @@ -88,31 +90,32 @@ architecture behave of loadstore1 is virt_mode : std_ulogic; priv_mode : std_ulogic; load_sp : std_ulogic; - sprn : std_ulogic_vector(9 downto 0); + sprsel : std_ulogic_vector(1 downto 0); + ric : std_ulogic_vector(1 downto 0); is_slbia : std_ulogic; align_intr : std_ulogic; 
dword_index : std_ulogic; two_dwords : std_ulogic; - nia : std_ulogic_vector(63 downto 0); + incomplete : std_ulogic; end record; constant request_init : request_t := (valid => '0', dc_req => '0', load => '0', store => '0', tlbie => '0', dcbz => '0', read_spr => '0', write_spr => '0', mmu_op => '0', - instr_fault => '0', load_zero => '0', do_update => '0', noop => '0', + instr_fault => '0', do_update => '0', mode_32bit => '0', addr => (others => '0'), byte_sel => x"00", second_bytes => x"00", store_data => (others => '0'), instr_tag => instr_tag_init, - write_reg => 7x"00", length => x"0", + write_reg => 6x"00", length => x"0", elt_length => x"0", byte_reverse => '0', brev_mask => "000", sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', atomic => '0', atomic_last => '0', rc => '0', nc => '0', virt_mode => '0', priv_mode => '0', load_sp => '0', - sprn => 10x"0", is_slbia => '0', align_intr => '0', - dword_index => '0', two_dwords => '0', - nia => (others => '0')); + sprsel => "00", ric => "00", is_slbia => '0', align_intr => '0', + dword_index => '0', two_dwords => '0', incomplete => '0'); type reg_stage1_t is record req : request_t; + busy : std_ulogic; issued : std_ulogic; addr0 : std_ulogic_vector(63 downto 0); end record; @@ -121,15 +124,20 @@ architecture behave of loadstore1 is req : request_t; byte_index : byte_index_t; use_second : std_ulogic_vector(7 downto 0); + busy : std_ulogic; wait_dc : std_ulogic; wait_mmu : std_ulogic; one_cycle : std_ulogic; wr_sel : std_ulogic_vector(1 downto 0); addr0 : std_ulogic_vector(63 downto 0); + sprsel : std_ulogic_vector(1 downto 0); + dbg_spr : std_ulogic_vector(63 downto 0); + dbg_spr_ack: std_ulogic; end record; type reg_stage3_t is record state : state_t; + complete : std_ulogic; instr_tag : instr_tag_t; write_enable : std_ulogic; write_reg : gspr_index_t; @@ -137,7 +145,6 @@ architecture behave of loadstore1 is rc : std_ulogic; xerc : xer_common_t; store_done : std_ulogic; - convert_lfs : 
std_ulogic; load_data : std_ulogic_vector(63 downto 0); dar : std_ulogic_vector(63 downto 0); dsisr : std_ulogic_vector(31 downto 0); @@ -147,7 +154,6 @@ architecture behave of loadstore1 is stage1_en : std_ulogic; interrupt : std_ulogic; intr_vec : integer range 0 to 16#fff#; - nia : std_ulogic_vector(63 downto 0); srr1 : std_ulogic_vector(15 downto 0); events : Loadstore1EventType; end record; @@ -157,21 +163,18 @@ architecture behave of loadstore1 is signal r2, r2in : reg_stage2_t; signal r3, r3in : reg_stage3_t; + signal flush : std_ulogic; signal busy : std_ulogic; signal complete : std_ulogic; - signal in_progress : std_ulogic; signal flushing : std_ulogic; signal store_sp_data : std_ulogic_vector(31 downto 0); signal load_dp_data : std_ulogic_vector(63 downto 0); signal store_data : std_ulogic_vector(63 downto 0); - signal stage1_issue_enable : std_ulogic; signal stage1_req : request_t; signal stage1_dcreq : std_ulogic; signal stage1_dreq : std_ulogic; - signal stage2_busy_next : std_ulogic; - signal stage3_busy_next : std_ulogic; -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is @@ -274,22 +277,29 @@ begin begin if rising_edge(clk) then if rst = '1' then + r1.busy <= '0'; + r1.issued <= '0'; r1.req.valid <= '0'; + r1.req.dc_req <= '0'; + r1.req.incomplete <= '0'; r1.req.tlbie <= '0'; r1.req.is_slbia <= '0'; r1.req.instr_fault <= '0'; r1.req.load <= '0'; r1.req.priv_mode <= '0'; - r1.req.sprn <= (others => '0'); + r1.req.sprsel <= "00"; + r1.req.ric <= "00"; r1.req.xerc <= xerc_init; r2.req.valid <= '0'; + r2.busy <= '0'; r2.req.tlbie <= '0'; r2.req.is_slbia <= '0'; r2.req.instr_fault <= '0'; r2.req.load <= '0'; r2.req.priv_mode <= '0'; - r2.req.sprn <= (others => '0'); + r2.req.sprsel <= "00"; + r2.req.ric <= "00"; r2.req.xerc <= xerc_init; r2.wait_dc <= '0'; @@ -301,8 +311,8 @@ begin r3.state <= IDLE; r3.write_enable <= '0'; r3.interrupt <= '0'; + r3.complete <= '0'; 
r3.stage1_en <= '1'; - r3.convert_lfs <= '0'; r3.events.load_complete <= '0'; r3.events.store_complete <= '0'; flushing <= '0'; @@ -311,7 +321,7 @@ begin r2 <= r2in; r3 <= r3in; flushing <= (flushing or (r1in.req.valid and r1in.req.align_intr)) and - not r3in.interrupt; + not flush; end if; stage1_dreq <= stage1_dcreq; if d_in.valid = '1' then @@ -321,7 +331,7 @@ begin assert r2.req.valid = '1' and r2.req.dc_req = '1' and r3.state = IDLE severity failure; end if; if m_in.done = '1' or m_in.err = '1' then - assert r2.req.valid = '1' and (r3.state = MMU_LOOKUP or r3.state = TLBIE_WAIT) severity failure; + assert r2.req.valid = '1' and r3.state = MMU_WAIT severity failure; end if; end if; end process; @@ -410,8 +420,14 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; - v.sprn := sprn; - v.nia := l_in.nia; + v.ric := l_in.insn(19 downto 18); + if sprn(1) = '1' then + -- DSISR and DAR + v.sprsel := '1' & sprn(0); + else + -- PID and PTCR + v.sprsel := '0' & sprn(8); + end if; lsu_sum := std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)); @@ -457,17 +473,6 @@ begin -- check alignment for larx/stcx misaligned := or (addr_mask and addr(2 downto 0)); v.align_intr := l_in.reserve and misaligned; - if l_in.repeat = '1' and l_in.second = '0' and l_in.update = '0' and addr(3) = '1' then - -- length is really 16 not 8 - -- Make misaligned lq cause an alignment interrupt in LE mode, - -- in order to avoid the case with RA = RT + 1 where the second half - -- faults but the first doesn't (and updates RT+1, destroying RA). - -- The equivalent BE case doesn't occur because RA = RT is illegal. 
- misaligned := '1'; - if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then - v.align_intr := '1'; - end if; - end if; v.atomic := not misaligned; v.atomic_last := not misaligned and (l_in.second or not l_in.repeat); @@ -498,7 +503,7 @@ begin v.read_spr := '1'; when OP_MTSPR => v.write_spr := '1'; - v.mmu_op := sprn(8) or sprn(5); + v.mmu_op := not sprn(1); when OP_FETCH_FAILED => -- send it to the MMU to do the radix walk v.instr_fault := '1'; @@ -507,6 +512,7 @@ begin when others => end case; v.dc_req := l_in.valid and (v.load or v.store or v.dcbz) and not v.align_intr; + v.incomplete := v.dc_req and v.two_dwords; -- Work out controls for load and store formatting brev_lenm1 := "000"; @@ -518,16 +524,8 @@ begin req_in <= v; end process; - busy <= r1.req.valid and ((r1.req.dc_req and not r1.issued) or - (r1.issued and d_in.error) or - stage2_busy_next or - (r1.req.dc_req and r1.req.two_dwords and not r1.req.dword_index)); - complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or - (r2.wait_mmu and m_in.done) or r3.convert_lfs; - in_progress <= r1.req.valid or (r2.req.valid and not complete); - - stage1_issue_enable <= r3.stage1_en and not (r1.req.valid and r1.req.mmu_op) and - not (r2.req.valid and r2.req.mmu_op); + busy <= dc_stall or d_in.error or r1.busy or r2.busy; + complete <= r2.one_cycle or (r2.wait_dc and d_in.valid) or r3.complete; -- Processing done in the first cycle of a load/store instruction loadstore1_1: process(all) @@ -538,10 +536,11 @@ begin begin v := r1; issue := '0'; + dcreq := '0'; - if busy = '0' then + if r1.busy = '0' then req := req_in; - v.issued := '0'; + req.valid := l_in.valid; if flushing = '1' then -- Make this a no-op request rather than simply invalid. 
-- It will never get to stage 3 since there is a request ahead of @@ -554,37 +553,49 @@ begin end if; else req := r1.req; - end if; - - if r1.req.valid = '1' then if r1.req.dc_req = '1' and r1.issued = '0' then issue := '1'; - elsif r1.issued = '1' and d_in.error = '1' then - v.issued := '0'; - elsif stage2_busy_next = '0' then - -- we can change what's in r1 next cycle because the current thing - -- in r1 will go into r2 - if r1.req.dc_req = '1' and r1.req.two_dwords = '1' and r1.req.dword_index = '0' then - -- construct the second request for a misaligned access - req.dword_index := '1'; - req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; - if r1.req.mode_32bit = '1' then - req.addr(32) := '0'; - end if; - req.byte_sel := r1.req.second_bytes; - issue := '1'; + elsif r1.req.incomplete = '1' then + -- construct the second request for a misaligned access + req.dword_index := '1'; + req.incomplete := '0'; + req.addr := std_ulogic_vector(unsigned(r1.req.addr(63 downto 3)) + 1) & "000"; + if r1.req.mode_32bit = '1' then + req.addr(32) := '0'; end if; + req.byte_sel := r1.req.second_bytes; + issue := '1'; + else + -- For the lfs conversion cycle, leave the request valid + -- for another cycle but with req.dc_req = 0. + -- For an MMU request last cycle, we have nothing + -- to do in this cycle, so make it invalid. 
+ if r1.req.load_sp = '0' then + req.valid := '0'; + end if; + req.dc_req := '0'; end if; end if; - if r3in.interrupt = '1' then - req.valid := '0'; - issue := '0'; - end if; - v.req := req; - dcreq := issue and stage1_issue_enable and not d_in.error and not dc_stall; - if issue = '1' then - v.issued := dcreq; + if flush = '1' then + v.req.valid := '0'; + v.req.dc_req := '0'; + v.req.incomplete := '0'; + v.issued := '0'; + v.busy := '0'; + elsif (dc_stall or d_in.error or r2.busy) = '0' then + -- we can change what's in r1 next cycle because the current thing + -- in r1 will go into r2 + v.req := req; + dcreq := issue; + v.issued := issue; + v.busy := (issue and (req.incomplete or req.load_sp)) or (req.valid and req.mmu_op); + else + -- pipeline is stalled + if r1.issued = '1' and d_in.error = '1' then + v.issued := '0'; + v.busy := '1'; + end if; end if; stage1_req <= req; @@ -602,6 +613,10 @@ begin variable kk : unsigned(3 downto 0); variable idx : unsigned(2 downto 0); variable byte_offset : unsigned(2 downto 0); + variable interrupt : std_ulogic; + variable dbg_spr_rd : std_ulogic; + variable sprsel : std_ulogic_vector(1 downto 0); + variable sprval : std_ulogic_vector(63 downto 0); begin v := r2; @@ -618,47 +633,91 @@ begin end if; end loop; - if stage3_busy_next = '0' and - (r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0') then - v.req := r1.req; - v.addr0 := r1.addr0; - v.req.store_data := store_data; - v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and - not (r1.req.two_dwords and not r1.req.dword_index); - v.wait_mmu := r1.req.valid and r1.req.mmu_op; - v.one_cycle := r1.req.valid and (r1.req.noop or r1.req.read_spr or - (r1.req.write_spr and not r1.req.mmu_op) or - r1.req.load_zero or r1.req.do_update); - if r1.req.read_spr = '1' then - v.wr_sel := "00"; - elsif r1.req.do_update = '1' or r1.req.store = '1' then - v.wr_sel := "01"; - elsif r1.req.load_sp = '1' then - v.wr_sel := "10"; + dbg_spr_rd := dbg_spr_req and not 
(r1.req.valid and r1.req.read_spr); + if dbg_spr_rd = '0' then + sprsel := r1.req.sprsel; + else + sprsel := dbg_spr_addr; + end if; + if sprsel(1) = '1' then + if sprsel(0) = '0' then + sprval := x"00000000" & r3.dsisr; else - v.wr_sel := "11"; + sprval := r3.dar; end if; + else + sprval := m_in.sprval; + end if; + if dbg_spr_req = '0' then + v.dbg_spr_ack := '0'; + elsif dbg_spr_rd = '1' and r2.dbg_spr_ack = '0' then + v.dbg_spr := sprval; + v.dbg_spr_ack := '1'; + end if; - -- Work out load formatter controls for next cycle - for i in 0 to 7 loop - idx := to_unsigned(i, 3) xor r1.req.brev_mask; - kk := ('0' & idx) + ('0' & byte_offset); - v.use_second(i) := kk(3); - v.byte_index(i) := kk(2 downto 0); - end loop; - elsif stage3_busy_next = '0' then - v.req.valid := '0'; - v.wait_dc := '0'; + if (dc_stall or d_in.error or r2.busy or l_in.e2stall) = '0' then + if r1.req.valid = '0' or r1.issued = '1' or r1.req.dc_req = '0' then + v.req := r1.req; + v.addr0 := r1.addr0; + v.req.store_data := store_data; + v.wait_dc := r1.req.valid and r1.req.dc_req and not r1.req.load_sp and + not r1.req.incomplete; + v.wait_mmu := r1.req.valid and r1.req.mmu_op; + v.busy := r1.req.valid and r1.req.mmu_op; + v.one_cycle := r1.req.valid and not (r1.req.dc_req or r1.req.mmu_op); + if r1.req.do_update = '1' or r1.req.store = '1' or r1.req.read_spr = '1' then + v.wr_sel := "00"; + elsif r1.req.load_sp = '1' then + v.wr_sel := "01"; + else + v.wr_sel := "10"; + end if; + if r1.req.read_spr = '1' then + v.addr0 := sprval; + end if; + + -- Work out load formatter controls for next cycle + for i in 0 to 7 loop + idx := to_unsigned(i, 3) xor r1.req.brev_mask; + kk := ('0' & idx) + ('0' & byte_offset); + v.use_second(i) := kk(3); + v.byte_index(i) := kk(2 downto 0); + end loop; + else + v.req.valid := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + v.one_cycle := '0'; + end if; + end if; + if r2.wait_mmu = '1' and m_in.done = '1' then + if r2.req.mmu_op = '1' then + v.req.valid := '0'; + 
v.busy := '0'; + end if; v.wait_mmu := '0'; end if; + if r2.busy = '1' and r2.wait_mmu = '0' then + v.busy := '0'; + end if; - stage2_busy_next <= r1.req.valid and stage3_busy_next; - - if r3in.interrupt = '1' then + interrupt := (r2.req.valid and r2.req.align_intr) or + (d_in.error and d_in.cache_paradox) or m_in.err; + if interrupt = '1' then v.req.valid := '0'; + v.busy := '0'; + v.wait_dc := '0'; + v.wait_mmu := '0'; + elsif d_in.error = '1' then + v.wait_mmu := '1'; + v.busy := '1'; end if; r2in <= v; + + -- SPR values for core_debug + dbg_spr_data <= r2.dbg_spr; + dbg_spr_ack <= r2.dbg_spr_ack; end process; -- Processing done in the third cycle of a load/store instruction. @@ -675,7 +734,6 @@ begin variable write_data : std_ulogic_vector(63 downto 0); variable do_update : std_ulogic; variable done : std_ulogic; - variable part_done : std_ulogic; variable exception : std_ulogic; variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); @@ -691,13 +749,12 @@ begin mmureq := '0'; mmu_mtspr := '0'; done := '0'; - part_done := '0'; exception := '0'; dsisr := (others => '0'); write_enable := '0'; sprval := (others => '0'); do_update := '0'; - v.convert_lfs := '0'; + v.complete := '0'; v.srr1 := (others => '0'); v.events := (others => '0'); @@ -765,114 +822,83 @@ begin v.load_data := data_permuted; end if; + if r2.req.valid = '1' then if r2.req.read_spr = '1' then write_enable := '1'; - -- partial decode on SPR number should be adequate given - -- the restricted set that get sent down this path - if r2.req.sprn(8) = '0' and r2.req.sprn(5) = '0' then - if r2.req.sprn(0) = '0' then - sprval := x"00000000" & r3.dsisr; - else - sprval := r3.dar; - end if; - else - -- reading one of the SPRs in the MMU - sprval := m_in.sprval; - end if; end if; if r2.req.align_intr = '1' then -- generate alignment interrupt exception := '1'; end if; - if r2.req.load_zero = '1' then - write_enable := '1'; - end if; if r2.req.do_update 
= '1' then do_update := '1'; end if; - end if; - - case r3.state is - when IDLE => - if d_in.valid = '1' then - if r2.req.two_dwords = '0' or r2.req.dword_index = '1' then - write_enable := r2.req.load and not r2.req.load_sp; - if HAS_FPU and r2.req.load_sp = '1' then - -- SP to DP conversion takes a cycle - v.state := FINISH_LFS; - v.convert_lfs := '1'; - else - -- stores write back rA update - do_update := r2.req.update and r2.req.store; - end if; - else - part_done := '1'; - end if; + if r2.req.load_sp = '1' and r2.req.dc_req = '0' then + write_enable := '1'; end if; - if d_in.error = '1' then - if d_in.cache_paradox = '1' then - -- signal an interrupt straight away - exception := '1'; - dsisr(63 - 38) := not r2.req.load; - -- XXX there is no architected bit for this - -- (probably should be a machine check in fact) - dsisr(63 - 35) := d_in.cache_paradox; + if r2.req.write_spr = '1' and r2.req.mmu_op = '0' then + if r2.req.sprsel(0) = '0' then + v.dsisr := r2.req.store_data(31 downto 0); else - -- Look up the translation for TLB miss - -- and also for permission error and RC error - -- in case the PTE has been updated. 
- mmureq := '1'; - v.state := MMU_LOOKUP; - v.stage1_en := '0'; + v.dar := r2.req.store_data; end if; end if; - if r2.req.valid = '1' then - if r2.req.mmu_op = '1' then - -- send request (tlbie, mtspr, itlb miss) to MMU - mmureq := not r2.req.write_spr; - mmu_mtspr := r2.req.write_spr; - if r2.req.instr_fault = '1' then - v.state := MMU_LOOKUP; - v.events.itlb_miss := '1'; - else - v.state := TLBIE_WAIT; - end if; - elsif r2.req.write_spr = '1' then - if r2.req.sprn(0) = '0' then - v.dsisr := r2.req.store_data(31 downto 0); - else - v.dar := r2.req.store_data; - end if; - end if; + end if; + + if r3.state = IDLE and r2.req.valid = '1' and r2.req.mmu_op = '1' then + -- send request (tlbie, mtspr, itlb miss) to MMU + mmureq := not r2.req.write_spr; + mmu_mtspr := r2.req.write_spr; + if r2.req.instr_fault = '1' then + v.events.itlb_miss := '1'; end if; + v.state := MMU_WAIT; + end if; - when MMU_LOOKUP => - if m_in.done = '1' then - if r2.req.instr_fault = '0' then - -- retry the request now that the MMU has installed a TLB entry - req := '1'; - v.stage1_en := '1'; - v.state := IDLE; - end if; + if d_in.valid = '1' then + if r2.req.incomplete = '0' then + write_enable := r2.req.load and not r2.req.load_sp; + -- stores write back rA update + do_update := r2.req.update and r2.req.store; end if; - if m_in.err = '1' then + end if; + if d_in.error = '1' then + if d_in.cache_paradox = '1' then + -- signal an interrupt straight away exception := '1'; - dsisr(63 - 33) := m_in.invalid; - dsisr(63 - 36) := m_in.perm_error; - dsisr(63 - 38) := r2.req.store or r2.req.dcbz; - dsisr(63 - 44) := m_in.badtree; - dsisr(63 - 45) := m_in.rc_error; + dsisr(63 - 38) := not r2.req.load; + -- XXX there is no architected bit for this + -- (probably should be a machine check in fact) + dsisr(63 - 35) := d_in.cache_paradox; + else + -- Look up the translation for TLB miss + -- and also for permission error and RC error + -- in case the PTE has been updated. 
+ mmureq := '1'; + v.state := MMU_WAIT; + v.stage1_en := '0'; end if; + end if; - when TLBIE_WAIT => - - when FINISH_LFS => - write_enable := '1'; - - end case; + if m_in.done = '1' then + if r2.req.dc_req = '1' then + -- retry the request now that the MMU has installed a TLB entry + req := '1'; + else + v.complete := '1'; + end if; + end if; + if m_in.err = '1' then + exception := '1'; + dsisr(63 - 33) := m_in.invalid; + dsisr(63 - 36) := m_in.perm_error; + dsisr(63 - 38) := r2.req.store or r2.req.dcbz; + dsisr(63 - 44) := m_in.badtree; + dsisr(63 - 45) := m_in.rc_error; + end if; - if complete = '1' or exception = '1' then + if (m_in.done or m_in.err) = '1' then v.stage1_en := '1'; v.state := IDLE; end if; @@ -884,7 +910,6 @@ begin -- or ISI or ISegI for instruction fetch exceptions v.interrupt := exception; if exception = '1' then - v.nia := r2.req.nia; if r2.req.align_intr = '1' then v.intr_vec := 16#600#; v.dar := r2.req.addr; @@ -911,12 +936,9 @@ begin case r2.wr_sel is when "00" => - -- mfspr result - write_data := sprval; - when "01" => -- update reg write_data := r2.addr0; - when "10" => + when "01" => -- lfs result write_data := load_dp_data; when others => @@ -925,7 +947,7 @@ begin end case; -- Update outputs to dcache - if stage1_issue_enable = '1' then + if r3.stage1_en = '1' then d_out.valid <= stage1_dcreq; d_out.load <= stage1_req.load; d_out.dcbz <= stage1_req.dcbz; @@ -955,7 +977,7 @@ begin else d_out.data <= r2.req.store_data; end if; - d_out.hold <= r2.req.valid and r2.req.load_sp and d_in.valid; + d_out.hold <= l_in.e2stall; -- Update outputs to MMU m_out.valid <= mmureq; @@ -963,8 +985,10 @@ begin m_out.load <= r2.req.load; m_out.priv <= r2.req.priv_mode; m_out.tlbie <= r2.req.tlbie; + m_out.ric <= r2.req.ric; m_out.mtspr <= mmu_mtspr; - m_out.sprn <= r2.req.sprn; + m_out.sprnf <= r1.req.sprsel(0); + m_out.sprnt <= r2.req.sprsel(0); m_out.addr <= r2.req.addr; m_out.slbia <= r2.req.is_slbia; m_out.rs <= r2.req.store_data; @@ -980,18 +1004,15 @@ 
begin l_out.store_done <= d_in.store_done; l_out.interrupt <= r3.interrupt; l_out.intr_vec <= r3.intr_vec; - l_out.srr0 <= r3.nia; l_out.srr1 <= r3.srr1; -- update busy signal back to execute1 e_out.busy <= busy; - e_out.in_progress <= in_progress; - e_out.interrupt <= r3.interrupt; + e_out.l2stall <= dc_stall or d_in.error or r2.busy; events <= r3.events; - -- Busy calculation. - stage3_busy_next <= r2.req.valid and not (complete or part_done or exception); + flush <= exception; -- Update registers r3in <= v; @@ -1011,7 +1032,9 @@ begin d_out.valid & m_in.done & r2.req.dword_index & - std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 3)); + r2.req.valid & + r2.wait_dc & + std_ulogic_vector(to_unsigned(state_t'pos(r3.state), 1)); end if; end process; log_out <= log_data; diff --git a/logical.vhdl b/logical.vhdl index 60309ac35..77ef29c10 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -167,7 +167,7 @@ begin end if; tmp(7 downto 0) := rs(7 downto 0); when others => - -- e.g. OP_MTSPR + -- e.g. OP_MFSPR tmp := rs; end case; diff --git a/mmu.vhdl b/mmu.vhdl index 40c00feaf..7ab304039 100644 --- a/mmu.vhdl +++ b/mmu.vhdl @@ -81,8 +81,8 @@ architecture behave of mmu is begin -- Multiplex internal SPR values back to loadstore1, selected - -- by l_in.sprn. - l_out.sprval <= r.ptcr when l_in.sprn(8) = '1' else x"00000000" & r.pid; + -- by l_in.sprnf. + l_out.sprval <= r.ptcr when l_in.sprnf = '1' else x"00000000" & r.pid; mmu_0: process(clk) begin @@ -261,9 +261,8 @@ begin -- RB[IS] != 0 or RB[AP] != 0, or for slbia v.inval_all := l_in.slbia or l_in.addr(11) or l_in.addr(10) or l_in.addr(7) or l_in.addr(6) or l_in.addr(5); - -- The RIC field of the tlbie instruction comes across on the - -- sprn bus as bits 2--3. RIC=2 flushes process table caches. - if l_in.sprn(3) = '1' then + -- RIC=2 or 3 flushes process table caches. 
+ if l_in.ric(1) = '1' then v.pt0_valid := '0'; v.pt3_valid := '0'; v.ptb_valid := '0'; @@ -293,7 +292,7 @@ begin -- Move to PID needs to invalidate L1 TLBs and cached -- pgtbl0 value. Move to PTCR does that plus -- invalidating the cached pgtbl3 and prtbl values as well. - if l_in.sprn(8) = '0' then + if l_in.sprnt = '0' then v.pid := l_in.rs(31 downto 0); else v.ptcr := l_in.rs; diff --git a/register_file.vhdl b/register_file.vhdl index b5e7246ac..753ce80ab 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -14,7 +14,9 @@ entity register_file is ); port( clk : in std_logic; + stall : in std_ulogic; + d1_in : in Decode1ToRegisterFileType; d_in : in Decode2ToRegisterFileType; d_out : out RegisterFileToDecode2Type; @@ -34,84 +36,130 @@ entity register_file is end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); - signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); signal dbg_ack : std_ulogic; + signal dbg_gpr_done : std_ulogic; + signal addr_1_reg : gspr_index_t; + signal addr_2_reg : gspr_index_t; + signal addr_3_reg : gspr_index_t; + signal rd_2 : std_ulogic; + signal fwd_1 : std_ulogic; + signal fwd_2 : std_ulogic; + signal fwd_3 : std_ulogic; + signal data_1 : std_ulogic_vector(63 downto 0); + signal data_2 : std_ulogic_vector(63 downto 0); + signal data_3 : std_ulogic_vector(63 downto 0); + signal prev_write_data : std_ulogic_vector(63 downto 0); + begin - -- synchronous writes + -- synchronous reads and writes register_write_0: process(clk) + variable a_addr, b_addr, c_addr : gspr_index_t; variable w_addr : gspr_index_t; + variable b_enable : std_ulogic; begin if rising_edge(clk) then if w_in.write_enable = '1' then w_addr := w_in.write_reg; - if HAS_FPU and w_addr(6) = '1' then + if HAS_FPU and 
w_addr(5) = '1' then report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data); else - w_addr(6) := '0'; - if w_addr(5) = '0' then - report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); - else - report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); - end if; + w_addr(5) := '0'; + report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); end if; assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; registers(to_integer(unsigned(w_addr))) <= w_in.write_data; end if; + + a_addr := d1_in.reg_1_addr; + b_addr := d1_in.reg_2_addr; + c_addr := d1_in.reg_3_addr; + b_enable := d1_in.read_2_enable; + if stall = '1' then + a_addr := addr_1_reg; + b_addr := addr_2_reg; + c_addr := addr_3_reg; + b_enable := rd_2; + else + addr_1_reg <= a_addr; + addr_2_reg <= b_addr; + addr_3_reg <= c_addr; + rd_2 <= b_enable; + end if; + + fwd_1 <= '0'; + fwd_2 <= '0'; + fwd_3 <= '0'; + if w_in.write_enable = '1' then + if w_addr = a_addr then + fwd_1 <= '1'; + end if; + if w_addr = b_addr then + fwd_2 <= '1'; + end if; + if w_addr = c_addr then + fwd_3 <= '1'; + end if; + end if; + + -- Do debug reads to GPRs and FPRs using the B port when it is not in use + if dbg_gpr_req = '1' then + if b_enable = '0' then + b_addr := dbg_gpr_addr(5 downto 0); + dbg_gpr_done <= '1'; + end if; + else + dbg_gpr_done <= '0'; + end if; + + if not HAS_FPU then + -- Make it obvious that we only want 32 GSPRs for a no-FPU implementation + a_addr(5) := '0'; + b_addr(5) := '0'; + c_addr(5) := '0'; + end if; + data_1 <= registers(to_integer(unsigned(a_addr))); + data_2 <= registers(to_integer(unsigned(b_addr))); + data_3 <= registers(to_integer(unsigned(c_addr))); + + prev_write_data <= w_in.write_data; end if; end process register_write_0; - -- asynchronous reads + -- asynchronous forwarding of write data register_read_0: process(all) - variable a_addr, b_addr, c_addr : 
gspr_index_t; - variable w_addr : gspr_index_t; + variable out_data_1 : std_ulogic_vector(63 downto 0); + variable out_data_2 : std_ulogic_vector(63 downto 0); + variable out_data_3 : std_ulogic_vector(63 downto 0); begin - a_addr := d_in.read1_reg; - b_addr := d_in.read2_reg; - c_addr := d_in.read3_reg; - w_addr := w_in.write_reg; - if not HAS_FPU then - -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation - a_addr(6) := '0'; - b_addr(6) := '0'; - c_addr(6) := '0'; - w_addr(6) := '0'; + out_data_1 := data_1; + out_data_2 := data_2; + out_data_3 := data_3; + if fwd_1 = '1' then + out_data_1 := prev_write_data; end if; + if fwd_2 = '1' then + out_data_2 := prev_write_data; + end if; + if fwd_3 = '1' then + out_data_3 := prev_write_data; + end if; + if d_in.read1_enable = '1' then - report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); + report "Reading GPR " & to_hstring(addr_1_reg) & " " & to_hstring(out_data_1); end if; if d_in.read2_enable = '1' then - report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr)))); + report "Reading GPR " & to_hstring(addr_2_reg) & " " & to_hstring(out_data_2); end if; if d_in.read3_enable = '1' then - report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr)))); - end if; - d_out.read1_data <= registers(to_integer(unsigned(a_addr))); - -- B read port is multiplexed with reads from the debug circuitry - if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then - b_addr := dbg_gpr_addr; - if not HAS_FPU then - b_addr(6) := '0'; - end if; - end if; - rd_port_b <= registers(to_integer(unsigned(b_addr))); - d_out.read2_data <= rd_port_b; - d_out.read3_data <= registers(to_integer(unsigned(c_addr))); - - -- Forward any written data - if w_in.write_enable = '1' then - if a_addr = w_addr then - d_out.read1_data <= w_in.write_data; - end if; - if b_addr = w_addr then - 
d_out.read2_data <= w_in.write_data; - end if; - if c_addr = w_addr then - d_out.read3_data <= w_in.write_data; - end if; + report "Reading GPR " & to_hstring(addr_3_reg) & " " & to_hstring(out_data_3); end if; + + d_out.read1_data <= out_data_1; + d_out.read2_data <= out_data_2; + d_out.read3_data <= out_data_3; end process register_read_0; -- Latch read data and ack if dbg read requested and B port not busy @@ -119,8 +167,8 @@ begin begin if rising_edge(clk) then if dbg_gpr_req = '1' then - if d_in.read2_enable = '0' and dbg_ack = '0' then - dbg_data <= rd_port_b; + if dbg_ack = '0' and dbg_gpr_done = '1' then + dbg_data <= data_2; dbg_ack <= '1'; end if; else @@ -140,10 +188,6 @@ begin loop_0: for i in 0 to 31 loop report "GPR" & integer'image(i) & " " & to_hstring(registers(i)); end loop loop_0; - - report "LR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_LR))))); - report "CTR " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_CTR))))); - report "XER " & to_hstring(registers(to_integer(unsigned(fast_spr_num(SPR_XER))))); sim_dump_done <= '1'; else sim_dump_done <= '0'; @@ -164,7 +208,7 @@ begin if rising_edge(clk) then log_data <= w_in.write_data & w_in.write_enable & - w_in.write_reg; + '0' & w_in.write_reg; end if; end process; log_out <= log_data; diff --git a/scripts/mw_debug/mw_debug.c b/scripts/mw_debug/mw_debug.c index 62717601a..81e809416 100644 --- a/scripts/mw_debug/mw_debug.c +++ b/scripts/mw_debug/mw_debug.c @@ -548,7 +548,11 @@ static const char *fast_spr_names[] = { "lr", "ctr", "srr0", "srr1", "hsrr0", "hsrr1", "sprg0", "sprg1", "sprg2", "sprg3", - "hsprg0", "hsprg1", "xer" + "hsprg0", "hsprg1", "xer", "tar", +}; + +static const char *ldst_spr_names[] = { + "pidr", "ptcr", "dsisr", "dar" }; static void gpr_read(uint64_t reg, uint64_t count) @@ -566,8 +570,10 @@ static void gpr_read(uint64_t reg, uint64_t count) printf("r%"PRId64, reg); else if ((reg - 32) < sizeof(fast_spr_names) / sizeof(fast_spr_names[0])) 
printf("%s", fast_spr_names[reg - 32]); - else if (reg < 64) + else if (reg < 60) printf("gspr%"PRId64, reg); + else if (reg < 64) + printf("%s", ldst_spr_names[reg - 60]); else printf("FPR%"PRId64, reg - 64); printf(":\t%016"PRIx64"\n", data); diff --git a/scripts/run_test.sh b/scripts/run_test.sh index 9fcb7ce3c..185c3a634 100755 --- a/scripts/run_test.sh +++ b/scripts/run_test.sh @@ -21,9 +21,9 @@ cd $TMPDIR cp ${MICROWATT_DIR}/tests/${TEST}.bin main_ram.bin -${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 | grep -v XER > test.out || true +${MICROWATT_DIR}/core_tb | sed 's/.*: //' | egrep '^(GPR[0-9]|LR |CTR |XER |CR [0-9])' | sort | grep -v GPR31 > test.out || true -grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 | grep -v XER > exp.out +grep -v "^$" ${MICROWATT_DIR}/tests/${TEST}.out | sort | grep -v GPR31 > exp.out cp test.out /tmp cp exp.out /tmp diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index 500e92df0..773c05d59 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -1410,6 +1410,110 @@ int fpu_test_23(void) return trapit(0, test23); } +struct idiv_tests { + unsigned long denom; + unsigned long divisor; + unsigned long divd; + unsigned long divdu; + unsigned long divde; + unsigned long divdeu; + unsigned long modsd; + unsigned long modud; +} idiv_tests[] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0x56789a, 0x1234, 0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a }, + { 2, 3, 0, 0, 0, 0xaaaaaaaaaaaaaaaa, 2, 2 }, + { 31, 157, 0, 0, 0x328c3ab35cf15328, 0x328c3ab35cf15328, 31, 31 }, + { -4329874, 43879, -98, 0x17e5a119b9170, 0, 0, -29732, 39518 }, + { -4329874, -43879, 98, 0, 0, 0xffffffffffbe99d4, -29732, -4329874 }, + { 0x8000000000000000ul, -1, 0, 0, 0, 0x8000000000000000ul, 0, 0x8000000000000000ul }, +}; + +int fpu_test_24(void) +{ + long i; + unsigned long a, b, results[6]; + + for (i = 0; i < sizeof(idiv_tests) / sizeof(idiv_tests[0]); ++i) { + a = idiv_tests[i].denom; + b = 
idiv_tests[i].divisor; + asm("divd %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b)); + asm("divdu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b)); + asm("divde %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b)); + asm("divdeu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b)); + asm("modsd %0,%1,%2" : "=r" (results[4]) : "r" (a), "r" (b)); + asm("modud %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b)); + if (results[0] != idiv_tests[i].divd || + results[1] != idiv_tests[i].divdu || + results[2] != idiv_tests[i].divde || + results[3] != idiv_tests[i].divdeu || + results[4] != idiv_tests[i].modsd || + results[5] != idiv_tests[i].modud) { + print_hex(i, 2, " "); + print_hex(results[0], 16, " "); + print_hex(results[1], 16, " "); + print_hex(results[2], 16, " "); + print_hex(results[3], 16, " "); + print_hex(results[4], 16, " "); + print_hex(results[5], 16, "\r\n"); + return i + 1; + } + } + return 0; +} + +struct wdiv_tests { + unsigned int denom; + unsigned int divisor; + unsigned int divw; + unsigned int divwu; + unsigned int divwe; + unsigned int divweu; + unsigned int modsw; + unsigned int moduw; +} wdiv_tests[] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0x56789a, 0x1234, 0x4c0, 0x4c0, 0, 0, 0x19a, 0x19a }, + { 2, 3, 0, 0, 0, 0xaaaaaaaa, 2, 2 }, + { 31, 157, 0, 0, 0x328c3ab3, 0x328c3ab3, 31, 31 }, + { -4329874, 43879, -98, 0x17df7, 0, 0, -29732, 17165 }, + { -4329874, -43879, 98, 0, 0, 0xffbe99a9, -29732, -4329874 }, + { 0x80000000u, -1, 0, 0, 0, 0x80000000u, 0, 0x80000000u }, +}; + +int fpu_test_25(void) +{ + long i; + unsigned int a, b, results[6]; + + for (i = 0; i < sizeof(wdiv_tests) / sizeof(wdiv_tests[0]); ++i) { + a = wdiv_tests[i].denom; + b = wdiv_tests[i].divisor; + asm("divw %0,%1,%2" : "=r" (results[0]) : "r" (a), "r" (b)); + asm("divwu %0,%1,%2" : "=r" (results[1]) : "r" (a), "r" (b)); + asm("divwe %0,%1,%2" : "=r" (results[2]) : "r" (a), "r" (b)); + asm("divweu %0,%1,%2" : "=r" (results[3]) : "r" (a), "r" (b)); + asm("modsw %0,%1,%2" : "=r" (results[4]) 
: "r" (a), "r" (b)); + asm("moduw %0,%1,%2" : "=r" (results[5]) : "r" (a), "r" (b)); + if (results[0] != wdiv_tests[i].divw || + results[1] != wdiv_tests[i].divwu || + results[2] != wdiv_tests[i].divwe || + results[3] != wdiv_tests[i].divweu || + results[4] != wdiv_tests[i].modsw || + results[5] != wdiv_tests[i].moduw) { + print_hex(i, 2, " "); + print_hex(results[0], 8, " "); + print_hex(results[1], 8, " "); + print_hex(results[2], 8, " "); + print_hex(results[3], 8, " "); + print_hex(results[4], 8, " "); + print_hex(results[5], 8, "\r\n"); + return i + 1; + } + } + return 0; +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -1458,6 +1562,8 @@ int main(void) do_test(21, fpu_test_21); do_test(22, fpu_test_22); do_test(23, fpu_test_23); + do_test(24, fpu_test_24); + do_test(25, fpu_test_25); return fail; } diff --git a/tests/modes/head.S b/tests/modes/head.S index 8b00bdd45..d9e69dc87 100644 --- a/tests/modes/head.S +++ b/tests/modes/head.S @@ -230,63 +230,3 @@ restore: ld %r0,16(%r1) mtlr %r0 blr - - .global do_lq -do_lq: - lq %r6,0(%r3) - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_lq_np /* "non-preferred" form of lq */ -do_lq_np: - mr %r7,%r3 - lq %r6,0(%r7) - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_lq_bad /* illegal form of lq */ -do_lq_bad: - mr %r6,%r3 - .long 0xe0c60000 /* lq %r6,0(%r6) */ - std %r6,0(%r4) - std %r7,8(%r4) - li %r3,0 - blr - - .global do_stq -do_stq: - ld %r8,0(%r4) - ld %r9,8(%r4) - stq %r8,0(%r3) - li %r3,0 - blr - - /* big-endian versions of the above */ - .global do_lq_be -do_lq_be: - .long 0x0000c3e0 - .long 0x0000c4f8 - .long 0x0800e4f8 - .long 0x00006038 - .long 0x2000804e - - .global do_lq_np_be /* "non-preferred" form of lq */ -do_lq_np_be: - .long 0x781b677c - .long 0x0000c7e0 - .long 0x0000c4f8 - .long 0x0800e4f8 - .long 0x00006038 - .long 0x2000804e - - .global do_stq_be -do_stq_be: - .long 0x000004e9 - .long 0x080024e9 - .long 0x020003f9 - .long 0x00006038 - .long 
0x2000804e diff --git a/tests/modes/modes.c b/tests/modes/modes.c index b94bb47a2..fa4872cb8 100644 --- a/tests/modes/modes.c +++ b/tests/modes/modes.c @@ -12,14 +12,6 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long fn, unsigned long msr); -extern void do_lq(void *src, unsigned long *regs); -extern void do_lq_np(void *src, unsigned long *regs); -extern void do_lq_bad(void *src, unsigned long *regs); -extern void do_stq(void *dst, unsigned long *regs); -extern void do_lq_be(void *src, unsigned long *regs); -extern void do_lq_np_be(void *src, unsigned long *regs); -extern void do_stq_be(void *dst, unsigned long *regs); - static inline void do_tlbie(unsigned long rb, unsigned long rs) { __asm__ volatile("tlbie %0,%1" : : "r" (rb), "r" (rs) : "memory"); @@ -302,167 +294,6 @@ int mode_test_6(void) return 0; } -int mode_test_7(void) -{ - unsigned long quad[4] __attribute__((__aligned__(16))); - unsigned long regs[2]; - unsigned long ret, msr; - - /* - * Test lq/stq in LE mode - */ - msr = MSR_SF | MSR_LE; - quad[0] = 0x123456789abcdef0ul; - quad[1] = 0xfafa5959bcbc3434ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq, msr); - if (ret) - return ret | 1; - if (regs[0] != quad[1] || regs[1] != quad[0]) - return 2; - /* unaligned may give alignment interrupt */ - quad[2] = 0x0011223344556677ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq, msr); - if (ret == 0) { - if (regs[0] != quad[2] || regs[1] != quad[1]) - return 3; - } else if (ret == 0x600) { - if (mfspr(SPRG0) != (unsigned long) &do_lq || - mfspr(DAR) != (unsigned long) &quad[1]) - return ret | 4; - } else - return ret | 5; - - /* try stq */ - regs[0] = 0x5238523852385238ul; - regs[1] = 0x5239523952395239ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_stq, msr); - if (ret) - return ret | 5; - if (quad[0] != regs[1] || quad[1] != regs[0]) - return 6; - regs[0] = 
0x0172686966746564ul; - regs[1] = 0xfe8d0badd00dabcdul; - ret = callit((unsigned long)quad + 1, (unsigned long)regs, - (unsigned long)&do_stq, msr); - if (ret) - return ret | 7; - if (((quad[0] >> 8) | (quad[1] << 56)) != regs[1] || - ((quad[1] >> 8) | (quad[2] << 56)) != regs[0]) - return 8; - - /* try lq non-preferred form */ - quad[0] = 0x56789abcdef01234ul; - quad[1] = 0x5959bcbc3434fafaul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_np, msr); - if (ret) - return ret | 9; - if (regs[0] != quad[1] || regs[1] != quad[0]) - return 10; - /* unaligned should give alignment interrupt in uW implementation */ - quad[2] = 0x6677001122334455ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_np, msr); - if (ret == 0x600) { - if (mfspr(SPRG0) != (unsigned long) &do_lq_np + 4 || - mfspr(DAR) != (unsigned long) &quad[1]) - return ret | 11; - } else - return 12; - - /* make sure lq with rt = ra causes an illegal instruction interrupt */ - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_bad, msr); - if (ret != 0x700) - return 13; - if (mfspr(SPRG0) != (unsigned long)&do_lq_bad + 4 || - !(mfspr(SPRG3) & 0x80000)) - return 14; - return 0; -} - -int mode_test_8(void) -{ - unsigned long quad[4] __attribute__((__aligned__(16))); - unsigned long regs[2]; - unsigned long ret, msr; - - /* - * Test lq/stq in BE mode - */ - msr = MSR_SF; - quad[0] = 0x123456789abcdef0ul; - quad[1] = 0xfafa5959bcbc3434ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_be, msr); - if (ret) - return ret | 1; - if (regs[0] != quad[0] || regs[1] != quad[1]) { - print_hex(regs[0], 16); - print_string(" "); - print_hex(regs[1], 16); - print_string(" "); - return 2; - } - /* don't expect alignment interrupt */ - quad[2] = 0x0011223344556677ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_be, msr); - if (ret == 0) { - if (regs[0] != 
quad[1] || regs[1] != quad[2]) - return 3; - } else - return ret | 5; - - /* try stq */ - regs[0] = 0x5238523852385238ul; - regs[1] = 0x5239523952395239ul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_stq_be, msr); - if (ret) - return ret | 5; - if (quad[0] != regs[0] || quad[1] != regs[1]) - return 6; - regs[0] = 0x0172686966746564ul; - regs[1] = 0xfe8d0badd00dabcdul; - ret = callit((unsigned long)quad + 1, (unsigned long)regs, - (unsigned long)&do_stq_be, msr); - if (ret) - return ret | 7; - if (((quad[0] >> 8) | (quad[1] << 56)) != regs[0] || - ((quad[1] >> 8) | (quad[2] << 56)) != regs[1]) { - print_hex(quad[0], 16); - print_string(" "); - print_hex(quad[1], 16); - print_string(" "); - print_hex(quad[2], 16); - print_string(" "); - return 8; - } - - /* try lq non-preferred form */ - quad[0] = 0x56789abcdef01234ul; - quad[1] = 0x5959bcbc3434fafaul; - ret = callit((unsigned long)quad, (unsigned long)regs, - (unsigned long)&do_lq_np_be, msr); - if (ret) - return ret | 9; - if (regs[0] != quad[0] || regs[1] != quad[1]) - return 10; - /* unaligned should not give alignment interrupt in uW implementation */ - quad[2] = 0x6677001122334455ul; - ret = callit((unsigned long)&quad[1], (unsigned long)regs, - (unsigned long)&do_lq_np_be, msr); - if (ret) - return ret | 11; - if (regs[0] != quad[1] || regs[1] != quad[2]) - return 12; - return 0; -} - int fail = 0; void do_test(int num, int (*test)(void)) @@ -507,8 +338,6 @@ int main(void) do_test(4, mode_test_4); do_test(5, mode_test_5); do_test(6, mode_test_6); - do_test(7, mode_test_7); - do_test(8, mode_test_8); return fail; } diff --git a/tests/pmu/Makefile b/tests/pmu/Makefile new file mode 100644 index 000000000..2fd6c280e --- /dev/null +++ b/tests/pmu/Makefile @@ -0,0 +1,3 @@ +TEST=pmu + +include ../Makefile.test diff --git a/tests/pmu/head.S b/tests/pmu/head.S new file mode 100644 index 000000000..03cffe4fa --- /dev/null +++ b/tests/pmu/head.S @@ -0,0 +1,46 @@ +/* Copyright 2013-2014 
IBM Corp. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#define STACK_TOP 0x4000 + +/* Load an immediate 64-bit value into a register */ +#define LOAD_IMM64(r, e) \ + lis r,(e)@highest; \ + ori r,r,(e)@higher; \ + rldicr r,r, 32, 31; \ + oris r,r, (e)@h; \ + ori r,r, (e)@l; + + .section ".head","ax" + + /* + * Microwatt currently enters in LE mode at 0x0, so we don't need to + * do any endian fix ups. + */ + . = 0 +.global _start +_start: + b boot_entry + +.global boot_entry +boot_entry: + /* setup stack */ + LOAD_IMM64(%r1, STACK_TOP - 0x100) + LOAD_IMM64(%r12, main) + mtctr %r12, + bctrl + attn // terminate on exit + b . 
diff --git a/tests/pmu/pmu.c b/tests/pmu/pmu.c new file mode 100644 index 000000000..b3def50e3 --- /dev/null +++ b/tests/pmu/pmu.c @@ -0,0 +1,173 @@ +#include +#include + +#include "console.h" + + +#define asm __asm__ volatile + +#define MMCR0 795 +#define MMCR1 798 +#define MMCR2 785 +#define MMCRA 786 +#define PMC1 771 +#define PMC2 772 +#define PMC3 773 +#define PMC4 774 +#define PMC5 775 +#define PMC6 776 + +#define MMCR0_FC 0x80000000 // Freeze Counters +#define PMC1SEL_FC 0xFC000000 // Load Completed +#define PMC2SEL_F0 0x00F00000 // Store Completed + +#define TEST "Test " +#define PASS "PASS\n" +#define FAIL "FAIL\n" + +static inline unsigned long mfspr(int sprnum) +{ + unsigned long val; + + asm("mfspr %0,%1" : "=r" ((unsigned long) val) : "i" (sprnum)); + return val; +} + +static inline void mtspr(int sprnum, unsigned long val) +{ + asm("mtspr %0,%1" : : "i" (sprnum), "r" ((unsigned long) val)); +} + +void print_test_number(int i) +{ + puts(TEST); + putchar(48 + i/10); + putchar(48 + i%10); + putchar(':'); +} + +void reset_pmu() { + mtspr(MMCR0, MMCR0_FC); + mtspr(MMCR1, 0); + mtspr(PMC1, 0); + mtspr(PMC2, 0); + mtspr(PMC3, 0); + mtspr(PMC4, 0); + mtspr(PMC5, 0); + mtspr(PMC6, 0); +} + +/* + Sets PMC1 to count finished load instructions + Runs 50 load instructions + Expects PMC1 to be 50 at the end +*/ +int test_load_complete() +{ + reset_pmu(); + unsigned long volatile b = 0; + mtspr(MMCR1, PMC1SEL_FC); + mtspr(MMCR0, 0); + + for(int i = 0; i < 50; i++) + ++b; + + mtspr(MMCR0, MMCR0_FC); + + return mfspr(PMC1) == 50; +} + +/* + Sets PMC2 to count finished store instructions + Runs 50 store instructions + Expects PMC2 to be 50 at the end +*/ +int test_store_complete() +{ + reset_pmu(); + unsigned long volatile b = 0; + mtspr(MMCR1, PMC2SEL_F0); + mtspr(MMCR0, 0); + + for(int i = 0; i < 50; i++) + ++b; + + mtspr(MMCR0, MMCR0_FC); + + return mfspr(PMC2) == 50; +} + +/* + Allow PMC5 to count finished instructions + Runs a loop 50 times + Expects PMC5 to be 
more than zero at the end +*/ +int test_instruction_complete() +{ + reset_pmu(); + unsigned long volatile b = 0; + mtspr(MMCR0, 0); + + for(int i = 0; i < 50; i++) + ++b; + + mtspr(MMCR0, MMCR0_FC); + + return mfspr(PMC5) > 0; +} + +/* + Allow PMC6 to count cycles + Runs a loop 50 times + Expects PMC6 to be more than zero at the end +*/ +int test_count_cycles() +{ + reset_pmu(); + unsigned long volatile b = 0; + mtspr(MMCR0, 0); + + for(int i = 0; i < 50; i++) + ++b; + + mtspr(MMCR0, MMCR0_FC); + + return mfspr(PMC6) > 0; +} + +int main(void) +{ + int fail = 0; + + console_init(); + + print_test_number(1); + if (test_load_complete() != 1) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + + print_test_number(2); + if (test_store_complete() != 1) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + + print_test_number(3); + if (test_instruction_complete() == 0) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + + print_test_number(4); + if (test_count_cycles() == 0) { + fail = 1; + puts(FAIL); + } else + puts(PASS); + + return fail; +} diff --git a/tests/pmu/powerpc.lds b/tests/pmu/powerpc.lds new file mode 100644 index 000000000..99611ab41 --- /dev/null +++ b/tests/pmu/powerpc.lds @@ -0,0 +1,27 @@ +SECTIONS +{ + . = 0; + _start = .; + .head : { + KEEP(*(.head)) + } + . = ALIGN(0x1000); + .text : { *(.text) *(.text.*) *(.rodata) *(.rodata.*) } + . = ALIGN(0x1000); + .data : { *(.data) *(.data.*) *(.got) *(.toc) } + . = ALIGN(0x80); + __bss_start = .; + .bss : { + *(.dynsbss) + *(.sbss) + *(.scommon) + *(.dynbss) + *(.bss) + *(.common) + *(.bss.*) + } + . = ALIGN(0x80); + __bss_end = .; + . = . 
+ 0x4000; + __stack_top = .; +} diff --git a/tests/reservation/head.S b/tests/reservation/head.S index 4ff85ceaa..ce258b5d8 100644 --- a/tests/reservation/head.S +++ b/tests/reservation/head.S @@ -155,31 +155,3 @@ call_ret: ld %r31,248(%r1) addi %r1,%r1,256 blr - - .global do_lqarx -do_lqarx: - /* r3 = src, r4 = regs */ - lqarx %r10,0,%r3 - std %r10,0(%r4) - std %r11,8(%r4) - li %r3,0 - blr - - .global do_lqarx_bad -do_lqarx_bad: - /* r3 = src, r4 = regs */ - .long 0x7d405228 /* lqarx %r10,0,%r10 */ - std %r10,0(%r4) - std %r11,8(%r4) - li %r3,0 - blr - - .global do_stqcx -do_stqcx: - /* r3 = dest, r4 = regs, return CR */ - ld %r10,0(%r4) - ld %r11,8(%r4) - stqcx. %r10,0,%r3 - mfcr %r3 - oris %r3,%r3,1 /* to distinguish from trap number */ - blr diff --git a/tests/reservation/reservation.c b/tests/reservation/reservation.c index a3d5a7a17..79bbc1f72 100644 --- a/tests/reservation/reservation.c +++ b/tests/reservation/reservation.c @@ -7,10 +7,6 @@ extern unsigned long callit(unsigned long arg1, unsigned long arg2, unsigned long (*fn)(unsigned long, unsigned long)); -extern unsigned long do_lqarx(unsigned long src, unsigned long regs); -extern unsigned long do_lqarx_bad(unsigned long src, unsigned long regs); -extern unsigned long do_stqcx(unsigned long dst, unsigned long regs); - #define DSISR 18 #define DAR 19 #define SRR0 26 @@ -184,63 +180,6 @@ int resv_test_2(void) return 0; } -/* test lqarx/stqcx */ -int resv_test_3(void) -{ - unsigned long x[4] __attribute__((__aligned__(16))); - unsigned long y[2], regs[2]; - unsigned long ret, offset; - int count; - - x[0] = 0x7766554433221100ul; - x[1] = 0xffeeddccbbaa9988ul; - y[0] = 0x0badcafef00dd00dul; - y[1] = 0xdeadbeef07070707ul; - for (count = 0; count < 1000; ++count) { - ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx); - if (ret) - return ret | 1; - ret = callit((unsigned long)x, (unsigned long)y, do_stqcx); - if (ret < 0x10000) - return ret | 2; - if (ret & 0x20000000) - break; - } - if (count == 
1000) - return 3; - if (x[0] != y[1] || x[1] != y[0]) - return 4; - if (regs[1] != 0x7766554433221100ul || regs[0] != 0xffeeddccbbaa9988ul) - return 5; - ret = callit((unsigned long)x, (unsigned long)regs, do_stqcx); - if (ret < 0x10000 || (ret & 0x20000000)) - return ret | 12; - /* test alignment interrupts */ - for (offset = 0; offset < 16; ++offset) { - ret = callit((unsigned long)x + offset, (unsigned long)regs, do_lqarx); - if (ret == 0 && (offset & 15) != 0) - return 6; - if (ret == 0x600) { - if ((offset & 15) == 0) - return ret + 7; - } else if (ret) - return ret; - ret = callit((unsigned long)x + offset, (unsigned long)y, do_stqcx); - if (ret >= 0x10000 && (offset & 15) != 0) - return 8; - if (ret == 0x600) { - if ((offset & 15) == 0) - return ret + 9; - } else if (ret < 0x10000) - return ret; - } - /* test illegal interrupt for bad lqarx case */ - ret = callit((unsigned long)x, (unsigned long)regs, do_lqarx_bad); - if (ret != 0x700 || !(mfspr(SRR1) & 0x80000)) - return ret + 10; - return 0; -} - int fail = 0; void do_test(int num, int (*test)(void)) @@ -265,7 +204,6 @@ int main(void) do_test(1, resv_test_1); do_test(2, resv_test_2); - do_test(3, resv_test_3); return fail; } diff --git a/tests/test_modes.bin b/tests/test_modes.bin index 7e6b8f5d0..24e39813f 100755 Binary files a/tests/test_modes.bin and b/tests/test_modes.bin differ diff --git a/tests/test_modes.console_out b/tests/test_modes.console_out index 25e791c78..a49bb9b00 100644 --- a/tests/test_modes.console_out +++ b/tests/test_modes.console_out @@ -4,5 +4,3 @@ test 03:PASS test 04:PASS test 05:PASS test 06:PASS -test 07:PASS -test 08:PASS diff --git a/tests/test_pmu.bin b/tests/test_pmu.bin new file mode 100755 index 000000000..0791139a6 Binary files /dev/null and b/tests/test_pmu.bin differ diff --git a/tests/test_pmu.console_out b/tests/test_pmu.console_out new file mode 100644 index 000000000..2ff5a99ff --- /dev/null +++ b/tests/test_pmu.console_out @@ -0,0 +1,4 @@ +Test 01:PASS +Test 
02:PASS +Test 03:PASS +Test 04:PASS diff --git a/tests/test_reservation.bin b/tests/test_reservation.bin index 1e305f43a..1cb625055 100755 Binary files a/tests/test_reservation.bin and b/tests/test_reservation.bin differ diff --git a/tests/test_reservation.console_out b/tests/test_reservation.console_out index 623335dd3..0c39ae380 100644 --- a/tests/test_reservation.console_out +++ b/tests/test_reservation.console_out @@ -1,3 +1,2 @@ test 01:PASS test 02:PASS -test 03:PASS diff --git a/tests/update_console_tests b/tests/update_console_tests index 4e013aaa9..b168e8d3c 100755 --- a/tests/update_console_tests +++ b/tests/update_console_tests @@ -3,7 +3,7 @@ # Script to update console related tests from source # -for i in sc illegal decrementer xics privileged mmu misc modes reservation trace fpu spr_read ; do +for i in sc illegal decrementer xics privileged mmu misc modes pmu reservation trace fpu spr_read ; do cd $i make cd - diff --git a/writeback.vhdl b/writeback.vhdl index a99d4d233..2f6af2cc1 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -19,24 +19,18 @@ entity writeback is c_out : out WritebackToCrFileType; f_out : out WritebackToFetch1Type; + wb_bypass : out bypass_data_t; + -- PMU event bus events : out WritebackEventType; flush_out : out std_ulogic; - interrupt_out: out std_ulogic; + interrupt_out: out WritebackToExecute1Type; complete_out : out instr_tag_t ); end entity writeback; architecture behaviour of writeback is - type irq_state_t is (WRITE_SRR0, WRITE_SRR1); - - type reg_type is record - state : irq_state_t; - srr1 : std_ulogic_vector(63 downto 0); - end record; - - signal r, rin : reg_type; begin writeback_0: process(clk) @@ -45,13 +39,6 @@ begin variable w : std_ulogic_vector(0 downto 0); begin if rising_edge(clk) then - if rst = '1' then - r.state <= WRITE_SRR0; - r.srr1 <= (others => '0'); - else - r <= rin; - end if; - -- Do consistency checks only on the clock edge x(0) := e_in.valid; y(0) := l_in.valid; @@ -66,11 +53,13 @@ begin 
to_integer(unsigned(w))) <= 1 severity failure; w(0) := e_in.write_cr_enable; - x(0) := (e_in.write_enable and e_in.rc); + x(0) := l_in.rc; y(0) := fp_in.write_cr_enable; assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y))) <= 1 severity failure; + assert (e_in.write_xerc_enable and fp_in.write_xerc) /= '1' severity failure; + assert not (e_in.valid = '1' and e_in.instr_tag.valid = '0') severity failure; assert not (l_in.valid = '1' and l_in.instr_tag.valid = '0') severity failure; assert not (fp_in.valid = '1' and fp_in.instr_tag.valid = '0') severity failure; @@ -78,11 +67,7 @@ begin end process; writeback_1: process(all) - variable v : reg_type; variable f : WritebackToFetch1Type; - variable cf: std_ulogic_vector(3 downto 0); - variable zero : std_ulogic; - variable sign : std_ulogic; variable scf : std_ulogic_vector(3 downto 0); variable vec : integer range 0 to 16#fff#; variable srr1 : std_ulogic_vector(15 downto 0); @@ -91,9 +76,7 @@ begin w_out <= WritebackToRegisterFileInit; c_out <= WritebackToCrFileInit; f := WritebackToFetch1Init; - interrupt_out <= '0'; vec := 0; - v := r; complete_out <= instr_tag_init; if e_in.valid = '1' then @@ -107,37 +90,21 @@ begin events.fp_complete <= fp_in.valid; intr := e_in.interrupt or l_in.interrupt or fp_in.interrupt; + interrupt_out.intr <= intr; - if r.state = WRITE_SRR1 then - w_out.write_reg <= fast_spr_num(SPR_SRR1); - w_out.write_data <= r.srr1; - w_out.write_enable <= '1'; - interrupt_out <= '1'; - v.state := WRITE_SRR0; - - elsif intr = '1' then - w_out.write_reg <= fast_spr_num(SPR_SRR0); - w_out.write_enable <= '1'; - v.state := WRITE_SRR1; + if intr = '1' then srr1 := (others => '0'); if e_in.interrupt = '1' then vec := e_in.intr_vec; - w_out.write_data <= e_in.last_nia; srr1 := e_in.srr1; elsif l_in.interrupt = '1' then vec := l_in.intr_vec; - w_out.write_data <= l_in.srr0; srr1 := l_in.srr1; elsif fp_in.interrupt = '1' then vec := fp_in.intr_vec; - w_out.write_data <= 
fp_in.srr0; srr1 := fp_in.srr1; end if; - v.srr1(63 downto 31) := e_in.msr(63 downto 31); - v.srr1(30 downto 27) := srr1(14 downto 11); - v.srr1(26 downto 22) := e_in.msr(26 downto 22); - v.srr1(21 downto 16) := srr1(5 downto 0); - v.srr1(15 downto 0) := e_in.msr(15 downto 0); + interrupt_out.srr1 <= srr1; else if e_in.write_enable = '1' then @@ -169,6 +136,11 @@ begin c_out.write_cr_data <= fp_in.write_cr_data; end if; + if fp_in.write_xerc = '1' then + c_out.write_xerc_enable <= '1'; + c_out.write_xerc_data <= fp_in.xerc; + end if; + if l_in.write_enable = '1' then w_out.write_reg <= l_in.write_reg; w_out.write_data <= l_in.write_data; @@ -186,24 +158,6 @@ begin c_out.write_cr_data(31 downto 28) <= scf; end if; - -- Perform CR0 update for RC forms - -- Note that loads never have a form with an RC bit, therefore this can test e_in.write_data - if e_in.rc = '1' and e_in.write_enable = '1' then - zero := not (or e_in.write_data(31 downto 0)); - if e_in.mode_32bit = '0' then - sign := e_in.write_data(63); - zero := zero and not (or e_in.write_data(63 downto 32)); - else - sign := e_in.write_data(31); - end if; - c_out.write_cr_enable <= '1'; - c_out.write_cr_mask <= num_to_fxm(0); - cf(3) := sign; - cf(2) := not sign and not zero; - cf(1) := zero; - cf(0) := e_in.xerc.so; - c_out.write_cr_data(31 downto 28) <= cf; - end if; end if; -- Outputs to fetch1 @@ -236,6 +190,10 @@ begin f_out <= f; flush_out <= f_out.redirect; - rin <= v; + -- Register write data bypass to decode2 + wb_bypass.tag.tag <= complete_out.tag; + wb_bypass.tag.valid <= complete_out.valid and w_out.write_enable; + wb_bypass.data <= w_out.write_data; + end process; end;