// Copyright lowRISC contributors.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

/**
 * Instruction cache
 *
 * Provides an instruction cache along with cache management, instruction buffering and prefetching
 */

`include "prim_assert.sv"

// Module header: configuration parameters and port list of the instruction cache.
// BUS_SIZE, IC_TAG_SIZE, IC_LINE_SIZE, IC_NUM_WAYS and IC_INDEX_W are not declared in this file;
// presumably they are provided by the ibex_pkg import - confirm against the package.
module ibex_icache import ibex_pkg::*; #(
  // Selects the ECC variants of the write-data generation and read checking logic
  parameter bit          ICacheECC       = 1'b0,
  // Selects the asynchronously reset variants of the address/index registers
  parameter bit          ResetAll        = 1'b0,
  // Width of one data beat as stored in the data RAM
  parameter int unsigned BusSizeECC      = BUS_SIZE,
  // Width of one tag RAM word
  parameter int unsigned TagSizeECC      = IC_TAG_SIZE,
  // Width of one data RAM word (a full cache line)
  parameter int unsigned LineSizeECC     = IC_LINE_SIZE,
  // Only cache branch targets
  parameter bit          BranchCache     = 1'b0
) (
  // Clock and reset
  input  logic                           clk_i,
  input  logic                           rst_ni,

  // Signal that the core would like instructions
  input  logic                           req_i,

  // Set the cache's address counter
  input  logic                           branch_i,
  input  logic [31:0]                    addr_i,

  // IF stage interface: Pass fetched instructions to the core
  input  logic                           ready_i,
  output logic                           valid_o,
  output logic [31:0]                    rdata_o,
  output logic [31:0]                    addr_o,
  output logic                           err_o,
  output logic                           err_plus2_o,

  // Instruction memory / interconnect interface: Fetch instruction data from memory
  output logic                           instr_req_o,
  input  logic                           instr_gnt_i,
  output logic [31:0]                    instr_addr_o,
  input  logic [BUS_SIZE-1:0]            instr_rdata_i,
  input  logic                           instr_err_i,
  input  logic                           instr_rvalid_i,

  // RAM IO
  // Tag RAM: per-way request, shared write enable / index / write data, per-way read data
  output logic [IC_NUM_WAYS-1:0]         ic_tag_req_o,
  output logic                           ic_tag_write_o,
  output logic [IC_INDEX_W-1:0]          ic_tag_addr_o,
  output logic [TagSizeECC-1:0]          ic_tag_wdata_o,
  input  logic [TagSizeECC-1:0]          ic_tag_rdata_i [IC_NUM_WAYS],
  // Data RAM: per-way request, shared write enable / index / write data, per-way read data
  output logic [IC_NUM_WAYS-1:0]         ic_data_req_o,
  output logic                           ic_data_write_o,
  output logic [IC_INDEX_W-1:0]          ic_data_addr_o,
  output logic [LineSizeECC-1:0]         ic_data_wdata_o,
  input  logic [LineSizeECC-1:0]         ic_data_rdata_i [IC_NUM_WAYS],
  input  logic                           ic_scr_key_valid_i,
  output logic                           ic_scr_key_req_o,

  // Cache status
  input  logic                           icache_enable_i,
  input  logic                           icache_inval_i,
  output logic                           busy_o,
  output logic                           ecc_error_o
);

  // Number of fill buffers (must be >= 2)
  localparam int unsigned NUM_FB        = 4;
  // Request throttling threshold
  localparam int unsigned FB_THRESHOLD  = NUM_FB - 2;

  // Prefetch signals
  logic [ADDR_W-1:0]                      lookup_addr_aligned;
  logic [ADDR_W-1:0]                      prefetch_addr_d, prefetch_addr_q;
  logic                                   prefetch_addr_en;
  // Cache pipeline IC0 signals
  logic                                   lookup_throttle;
  logic                                   lookup_req_ic0;
  logic [ADDR_W-1:0]                      lookup_addr_ic0;
  logic [IC_INDEX_W-1:0]                  lookup_index_ic0;
  logic                                   fill_req_ic0;
  logic [IC_INDEX_W-1:0]                  fill_index_ic0;
  logic [IC_TAG_SIZE-1:0]                 fill_tag_ic0;
  logic [IC_LINE_SIZE-1:0]                fill_wdata_ic0;
  logic                                   lookup_grant_ic0;
  logic                                   lookup_actual_ic0;
  logic                                   fill_grant_ic0;
  logic                                   tag_req_ic0;
  logic [IC_INDEX_W-1:0]                  tag_index_ic0;
  logic [IC_NUM_WAYS-1:0]                 tag_banks_ic0;
  logic                                   tag_write_ic0;
  logic [TagSizeECC-1:0]                  tag_wdata_ic0;
  logic                                   data_req_ic0;
  logic [IC_INDEX_W-1:0]                  data_index_ic0;
  logic [IC_NUM_WAYS-1:0]                 data_banks_ic0;
  logic                                   data_write_ic0;
  logic [LineSizeECC-1:0]                 data_wdata_ic0;
  // Cache pipeline IC1 signals
  logic [TagSizeECC-1:0]                  tag_rdata_ic1  [IC_NUM_WAYS];
  logic [LineSizeECC-1:0]                 data_rdata_ic1 [IC_NUM_WAYS];
  logic [LineSizeECC-1:0]                 hit_data_ecc_ic1;
  logic [IC_LINE_SIZE-1:0]                hit_data_ic1;
  logic                                   lookup_valid_ic1;
  logic [ADDR_W-1:IC_INDEX_HI+1]          lookup_addr_ic1;
  logic [IC_NUM_WAYS-1:0]                 tag_match_ic1;
  logic                                   tag_hit_ic1;
  logic [IC_NUM_WAYS-1:0]                 tag_invalid_ic1;
  logic [IC_NUM_WAYS-1:0]                 lowest_invalid_way_ic1;
  logic [IC_NUM_WAYS-1:0]                 round_robin_way_ic1, round_robin_way_q;
  logic [IC_NUM_WAYS-1:0]                 sel_way_ic1;
  logic                                   ecc_err_ic1;
  logic                                   ecc_write_req;
  logic [IC_NUM_WAYS-1:0]                 ecc_write_ways;
  logic [IC_INDEX_W-1:0]                  ecc_write_index;
  // Fill buffer signals
  logic [$clog2(NUM_FB)-1:0]              fb_fill_level;
  logic                                   fill_cache_new;
  logic                                   fill_new_alloc;
  logic                                   fill_spec_req, fill_spec_done, fill_spec_hold;
  logic [NUM_FB-1:0][NUM_FB-1:0]          fill_older_d, fill_older_q;
  logic [NUM_FB-1:0]                      fill_alloc_sel, fill_alloc;
  logic [NUM_FB-1:0]                      fill_busy_d, fill_busy_q;
  logic [NUM_FB-1:0]                      fill_done;
  logic [NUM_FB-1:0]                      fill_in_ic1;
  logic [NUM_FB-1:0]                      fill_stale_d, fill_stale_q;
  logic [NUM_FB-1:0]                      fill_cache_d, fill_cache_q;
  logic [NUM_FB-1:0]                      fill_hit_ic1, fill_hit_d, fill_hit_q;
  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_ext_cnt_d, fill_ext_cnt_q;
  logic [NUM_FB-1:0]                      fill_ext_hold_d, fill_ext_hold_q;
  logic [NUM_FB-1:0]                      fill_ext_done_d, fill_ext_done_q;
  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_rvd_cnt_d, fill_rvd_cnt_q;
  logic [NUM_FB-1:0]                      fill_rvd_done;
  logic [NUM_FB-1:0]                      fill_ram_done_d, fill_ram_done_q;
  logic [NUM_FB-1:0]                      fill_out_grant;
  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_out_cnt_d, fill_out_cnt_q;
  logic [NUM_FB-1:0]                      fill_out_done;
  logic [NUM_FB-1:0]                      fill_ext_req, fill_rvd_exp, fill_ram_req, fill_out_req;
  logic [NUM_FB-1:0]                      fill_data_sel, fill_data_reg;
  logic [NUM_FB-1:0]                      fill_data_hit, fill_data_rvd;
  logic [NUM_FB-1:0][IC_LINE_BEATS_W-1:0] fill_ext_off, fill_rvd_off;
  logic [NUM_FB-1:0][IC_LINE_BEATS_W:0]   fill_ext_beat, fill_rvd_beat;
  logic [NUM_FB-1:0]                      fill_ext_arb, fill_ram_arb, fill_out_arb;
  logic [NUM_FB-1:0]                      fill_rvd_arb;
  logic [NUM_FB-1:0]                      fill_entry_en;
  logic [NUM_FB-1:0]                      fill_addr_en;
  logic [NUM_FB-1:0]                      fill_way_en;
  logic [NUM_FB-1:0][IC_LINE_BEATS-1:0]   fill_data_en;
  logic [NUM_FB-1:0][IC_LINE_BEATS-1:0]   fill_err_d, fill_err_q;
  logic [ADDR_W-1:0]                      fill_addr_q [NUM_FB];
  logic [IC_NUM_WAYS-1:0]                 fill_way_q  [NUM_FB];
  logic [IC_LINE_SIZE-1:0]                fill_data_d [NUM_FB];
  logic [IC_LINE_SIZE-1:0]                fill_data_q [NUM_FB];
  logic [ADDR_W-1:BUS_W]                  fill_ext_req_addr;
  logic [ADDR_W-1:0]                      fill_ram_req_addr;
  logic [IC_NUM_WAYS-1:0]                 fill_ram_req_way;
  logic [IC_LINE_SIZE-1:0]                fill_ram_req_data;
  logic [IC_LINE_SIZE-1:0]                fill_out_data;
  logic [IC_LINE_BEATS-1:0]               fill_out_err;
  // External req signals
  logic                                   instr_req;
  logic [ADDR_W-1:BUS_W]                  instr_addr;
  // Data output signals
  logic                                   skid_complete_instr;
  logic                                   skid_ready;
  logic                                   output_compressed;
  logic                                   skid_valid_d, skid_valid_q, skid_en;
  logic [15:0]                            skid_data_d, skid_data_q;
  logic                                   skid_err_q;
  logic                                   output_valid;
  logic                                   addr_incr_two;
  logic                                   output_addr_en;
  logic [ADDR_W-1:1]                      output_addr_incr;
  logic [ADDR_W-1:1]                      output_addr_d, output_addr_q;
  logic [15:0]                            output_data_lo, output_data_hi;
  logic                                   data_valid, output_ready;
  logic [IC_LINE_SIZE-1:0]                line_data;
  logic [IC_LINE_BEATS-1:0]               line_err;
  logic [31:0]                            line_data_muxed;
  logic                                   line_err_muxed;
  logic [31:0]                            output_data;
  logic                                   output_err;
  // Invalidations
  // Two-bit state type used by the invalidation state registers (inval_state_q / inval_state_d).
  // NOTE(review): the meaning of each state is only suggested by its name here - confirm against
  // the state machine that drives inval_state_d.
  typedef enum logic [1:0] {
    OUT_OF_RESET,
    AWAIT_SCRAMBLE_KEY,
    INVAL_CACHE,
    IDLE
  } inval_state_e;

  inval_state_e          inval_state_q, inval_state_d;
  logic                  inval_write_req;
  logic                  inval_block_cache;
  logic [IC_INDEX_W-1:0] inval_index_d, inval_index_q;
  logic                  inval_index_en;
  logic                  inval_active;

  //////////////////////////
  // Instruction prefetch //
  //////////////////////////

  // Current lookup address with the low IC_LINE_W bits (offset within the line) forced to zero
  assign lookup_addr_aligned = {lookup_addr_ic0[ADDR_W-1:IC_LINE_W], {IC_LINE_W{1'b0}}};

  // The prefetch address increments by one cache line for each granted request.
  // This address is also updated if there is a branch that is not granted, since the target
  // address (addr_i) is only valid for one cycle while branch_i is high.

  // The captured branch target address is not forced to be aligned since the offset in the cache
  // line must also be recorded for later use by the fill buffers.
  // The added constant has a single 1 at bit position IC_LINE_W, i.e. the size of one line.
  assign prefetch_addr_d     =
      lookup_grant_ic0 ? (lookup_addr_aligned +
                          {{ADDR_W-IC_LINE_W-1{1'b0}}, 1'b1, {IC_LINE_W{1'b0}}}) :
                         addr_i;

  // Update the prefetch address on any branch or granted lookup
  assign prefetch_addr_en    = branch_i | lookup_grant_ic0;

  // Prefetch address register, loaded with prefetch_addr_d whenever prefetch_addr_en is high.
  // With ResetAll set the register is cleared asynchronously by rst_ni; otherwise it has no reset
  // and holds an undefined value until the first enabled load.
  if (ResetAll) begin : g_prefetch_addr_ra
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        prefetch_addr_q <= '0;
      end else if (prefetch_addr_en) begin
        prefetch_addr_q <= prefetch_addr_d;
      end
    end
  end else begin : g_prefetch_addr_nr
    always_ff @(posedge clk_i) begin
      if (prefetch_addr_en) begin
        prefetch_addr_q <= prefetch_addr_d;
      end
    end
  end

  ////////////////////////
  // Pipeline stage IC0 //
  ////////////////////////

  // Cache lookup
  // Throttle new lookups once the count of busy, non-stale fill buffers exceeds FB_THRESHOLD
  assign lookup_throttle  = (fb_fill_level > FB_THRESHOLD[$clog2(NUM_FB)-1:0]);

  // A lookup is requested when the core wants instructions, at least one fill buffer is free and
  // no ECC correction write is pending. Branches bypass the throttle.
  assign lookup_req_ic0   = req_i & ~&fill_busy_q & (branch_i | ~lookup_throttle) &
                            ~ecc_write_req;
  // On a branch the target address is used directly, otherwise the stored prefetch address
  assign lookup_addr_ic0  = branch_i ? addr_i : prefetch_addr_q;
  assign lookup_index_ic0 = lookup_addr_ic0[IC_INDEX_HI:IC_LINE_W];

  // Cache write
  assign fill_req_ic0   = (|fill_ram_req);
  assign fill_index_ic0 = fill_ram_req_addr[IC_INDEX_HI:IC_LINE_W];
  // The top tag bit is the valid bit: it is written as zero for invalidation and ECC correction
  // writes, and as one for normal fill writes
  assign fill_tag_ic0   = {(~inval_write_req & ~ecc_write_req),
                           fill_ram_req_addr[ADDR_W-1:IC_INDEX_HI+1]};
  assign fill_wdata_ic0 = fill_ram_req_data;

  // Arbitrated signals - lookups have highest priority
  assign lookup_grant_ic0  = lookup_req_ic0;
  // Fill writes are granted only when no lookup, invalidation write or ECC write wants the RAMs
  assign fill_grant_ic0    = fill_req_ic0 & ~lookup_req_ic0 & ~inval_write_req &
                             ~ecc_write_req;
  // Qualified lookup grant to mask ram signals in IC1 if access was not made
  assign lookup_actual_ic0 = lookup_grant_ic0 & icache_enable_i & ~inval_block_cache;

  // Tagram
  assign tag_req_ic0   = lookup_req_ic0 | fill_req_ic0 | inval_write_req | ecc_write_req;
  // Index selection priority: invalidation write, ECC correction write, granted fill, lookup
  assign tag_index_ic0 = inval_write_req ? inval_index_q :
                         ecc_write_req   ? ecc_write_index :
                         fill_grant_ic0  ? fill_index_ic0 :
                                           lookup_index_ic0;
  // ECC correction and fill writes target specific ways; every other access addresses all ways
  assign tag_banks_ic0 = ecc_write_req  ? ecc_write_ways :
                         fill_grant_ic0 ? fill_ram_req_way :
                                          {IC_NUM_WAYS{1'b1}};
  assign tag_write_ic0 = fill_grant_ic0 | inval_write_req | ecc_write_req;

  // Dataram
  // The data RAM is only requested for lookups and fills (not for invalidation or ECC writes);
  // index, way select and write enable are shared with the tag RAM
  assign data_req_ic0   = lookup_req_ic0 | fill_req_ic0;
  assign data_index_ic0 = tag_index_ic0;
  assign data_banks_ic0 = tag_banks_ic0;
  assign data_write_ic0 = tag_write_ic0;

  // Append ECC checkbits to write data if required
  // Build the tag and data RAM write words. With ICacheECC set, check bits are generated by the
  // prim_secded_inv_* encoders; otherwise the raw tag and line data are written unchanged.
  if (ICacheECC) begin : gen_ecc_wdata
    // SEC_CM: ICACHE.MEM.INTEGRITY
    // Tagram ECC
    // Reuse the same ecc encoding module for larger cache sizes by padding with zeros
    logic [21:0]             tag_ecc_input_padded;
    logic [27:0]             tag_ecc_output_padded;
    logic [22-IC_TAG_SIZE:0] unused_tag_ecc_output;

    // Zero-extend the tag to the 22-bit encoder input width
    assign tag_ecc_input_padded  = {{22-IC_TAG_SIZE{1'b0}},fill_tag_ic0};
    // NOTE(review): this slice starts at IC_TAG_SIZE-1, so it also includes the top tag bit that
    // is used in tag_wdata_ic0 below - harmless for an unused sink, but confirm the intent.
    assign unused_tag_ecc_output = tag_ecc_output_padded[21:IC_TAG_SIZE-1];

    prim_secded_inv_28_22_enc tag_ecc_enc (
      .data_i (tag_ecc_input_padded),
      .data_o (tag_ecc_output_padded)
    );

    // Stored tag word: encoder output bits [27:22] above the IC_TAG_SIZE tag bits
    assign tag_wdata_ic0 = {tag_ecc_output_padded[27:22],tag_ecc_output_padded[IC_TAG_SIZE-1:0]};

    // Dataram ECC
    // One encoder per bus-sized beat of the line; each beat grows from BUS_SIZE to BusSizeECC bits
    for (genvar bank = 0; bank < IC_LINE_BEATS; bank++) begin : gen_ecc_banks
      prim_secded_inv_39_32_enc data_ecc_enc (
        .data_i (fill_wdata_ic0[bank*BUS_SIZE+:BUS_SIZE]),
        .data_o (data_wdata_ic0[bank*BusSizeECC+:BusSizeECC])
      );
    end

  end else begin : gen_noecc_wdata
    assign tag_wdata_ic0  = fill_tag_ic0;
    assign data_wdata_ic0 = fill_wdata_ic0;
  end

  ////////////////
  // IC0 -> IC1 //
  ////////////////

  // Tag RAMs outputs
  // Per-way tag request: the common request gated by the selected ways
  assign ic_tag_req_o    = {IC_NUM_WAYS{tag_req_ic0}} & tag_banks_ic0;
  assign ic_tag_write_o  = tag_write_ic0;
  assign ic_tag_addr_o   = tag_index_ic0;
  assign ic_tag_wdata_o  = tag_wdata_ic0;

  // Tag RAMs inputs
  assign tag_rdata_ic1   = ic_tag_rdata_i;

  // Data RAMs outputs
  // Per-way data request: the common request gated by the selected ways
  assign ic_data_req_o   = {IC_NUM_WAYS{data_req_ic0}} & data_banks_ic0;
  assign ic_data_write_o = data_write_ic0;
  assign ic_data_addr_o  = data_index_ic0;
  assign ic_data_wdata_o = data_wdata_ic0;

  // Data RAMs inputs
  assign data_rdata_ic1  = ic_data_rdata_i;

  // IC1 valid flag: the qualified lookup grant (lookup_actual_ic0) delayed by one cycle.
  // Cleared on reset and updated every cycle.
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      lookup_valid_ic1 <= 1'b0;
    end else begin
      lookup_valid_ic1 <= lookup_actual_ic0;
    end
  end

  // On every granted lookup, capture the tag portion of the lookup address and the one-hot
  // selection of the fill buffer being allocated, for use in IC1.
  // With ResetAll set both registers are cleared by rst_ni; otherwise they have no reset.
  if (ResetAll) begin : g_lookup_addr_ra
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        lookup_addr_ic1 <= '0;
        fill_in_ic1     <= '0;
      end else if (lookup_grant_ic0) begin
        lookup_addr_ic1 <= lookup_addr_ic0[ADDR_W-1:IC_INDEX_HI+1];
        fill_in_ic1     <= fill_alloc_sel;
      end
    end
  end else begin : g_lookup_addr_nr
    always_ff @(posedge clk_i) begin
      if (lookup_grant_ic0) begin
        lookup_addr_ic1 <= lookup_addr_ic0[ADDR_W-1:IC_INDEX_HI+1];
        fill_in_ic1     <= fill_alloc_sel;
      end
    end
  end

  ////////////////////////
  // Pipeline stage IC1 //
  ////////////////////////

  // Tag matching
  // Per-way comparison of the stored tag (check bits excluded) against the lookup address tag
  // with the valid bit set, so only valid entries can match. A way is invalid when the top bit
  // of its stored tag is zero.
  for (genvar way = 0; way < IC_NUM_WAYS; way++) begin : gen_tag_match
    assign tag_match_ic1[way]   = (tag_rdata_ic1[way][IC_TAG_SIZE-1:0] ==
                                   {1'b1,lookup_addr_ic1[ADDR_W-1:IC_INDEX_HI+1]});
    assign tag_invalid_ic1[way] = ~tag_rdata_ic1[way][IC_TAG_SIZE-1];
  end

  // A hit is a match in any way
  assign tag_hit_ic1 = |tag_match_ic1;

  // Hit data mux
  // AND-OR mux: OR together the data RAM output of every way whose tag matched. When exactly one
  // way matches this selects that way's line; when no way matches the result is all zeros.
  always_comb begin
    hit_data_ecc_ic1 = 'b0;
    for (int way = 0; way < IC_NUM_WAYS; way++) begin
      if (tag_match_ic1[way]) begin
        hit_data_ecc_ic1 |= data_rdata_ic1[way];
      end
    end
  end

  // Way selection for allocations to the cache (onehot signals)
  // 1 first invalid way
  // 2 global round-robin (pseudorandom) way
  // Way 0 is the lowest invalid way whenever it is invalid; the next round-robin value wraps the
  // top bit of the current one-hot value around to bit 0
  assign lowest_invalid_way_ic1[0] = tag_invalid_ic1[0];
  assign round_robin_way_ic1[0]    = round_robin_way_q[IC_NUM_WAYS-1];
  for (genvar way = 1; way < IC_NUM_WAYS; way++) begin : gen_lowest_way
    // One-hot: this way is invalid and no lower-numbered way is invalid
    assign lowest_invalid_way_ic1[way] = tag_invalid_ic1[way] & ~|tag_invalid_ic1[way-1:0];
    // Rotate the round-robin value left by one position
    assign round_robin_way_ic1[way]    = round_robin_way_q[way-1];
  end

  // Round-robin way pointer: reset to way 0 (one-hot) and rotated on every valid IC1 lookup,
  // regardless of whether that lookup hit or allocates
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      round_robin_way_q <= {{IC_NUM_WAYS-1{1'b0}}, 1'b1};
    end else if (lookup_valid_ic1) begin
      round_robin_way_q <= round_robin_way_ic1;
    end
  end

  // Prefer the lowest invalid way; if every way is valid use the current round-robin pointer
  assign sel_way_ic1 = |tag_invalid_ic1 ? lowest_invalid_way_ic1 :
                                          round_robin_way_q;

  // ECC checking logic
  // With ICacheECC set: check tag and hit-data ECC in IC1, report errors on ecc_error_o and
  // schedule a tag write in the following cycle that invalidates the affected ways.
  // Without it: all ECC signals are tied off and the hit data is used directly.
  if (ICacheECC) begin : gen_data_ecc_checking
    // SEC_CM: ICACHE.MEM.INTEGRITY
    logic [IC_NUM_WAYS-1:0]     tag_err_ic1;
    logic [IC_LINE_BEATS*2-1:0] data_err_ic1;
    logic                       ecc_correction_write_d, ecc_correction_write_q;
    logic [IC_NUM_WAYS-1:0]     ecc_correction_ways_d, ecc_correction_ways_q;
    logic [IC_INDEX_W-1:0]      lookup_index_ic1, ecc_correction_index_q;

    // Tag ECC checking
    for (genvar way = 0; way < IC_NUM_WAYS; way++) begin : gen_tag_ecc
      logic [1:0]  tag_err_bank_ic1;
      logic [27:0] tag_rdata_padded_ic1;

      // Expand the tag rdata with extra padding if the tag size is less than the maximum
      // (top 6 bits of the stored word, zero padding, then the IC_TAG_SIZE tag bits)
      assign tag_rdata_padded_ic1 = {tag_rdata_ic1[way][TagSizeECC-1-:6],
                                     {22-IC_TAG_SIZE{1'b0}},
                                     tag_rdata_ic1[way][IC_TAG_SIZE-1:0]};

      // Only the error flags are used; the data and syndrome outputs are left unconnected
      prim_secded_inv_28_22_dec data_ecc_dec (
        .data_i     (tag_rdata_padded_ic1),
        .data_o     (),
        .syndrome_o (),
        .err_o      (tag_err_bank_ic1)
      );
      // Any flagged error on this way's tag counts as a tag error
      assign tag_err_ic1[way] = |tag_err_bank_ic1;
    end

    // Data ECC checking
    // Note - could generate for all ways and mux after
    for (genvar bank = 0; bank < IC_LINE_BEATS; bank++) begin : gen_ecc_banks
      // Only the error flags are used; two error bits are produced per beat
      prim_secded_inv_39_32_dec data_ecc_dec (
        .data_i     (hit_data_ecc_ic1[bank*BusSizeECC+:BusSizeECC]),
        .data_o     (),
        .syndrome_o (),
        .err_o      (data_err_ic1[bank*2+:2])
      );

      // Strip the check bits: keep the low BUS_SIZE bits of each BusSizeECC-wide beat
      assign hit_data_ic1[bank*BUS_SIZE+:BUS_SIZE] =
          hit_data_ecc_ic1[bank*BusSizeECC+:BUS_SIZE];

    end

    // Tag ECC across all ways is always expected to be correct so the check does not need to be
    // qualified by hit or tag valid. Initial (invalid with correct ECC) tags are written on reset
    // and all further tag writes produce correct ECC. For data ECC no initialisation is done on
    // reset so unused data (in particular those ways that don't have a valid tag) may have
    // incorrect ECC. We only check data ECC where tags indicate it is valid and we have hit on it.
    assign ecc_err_ic1 = lookup_valid_ic1 & (((|data_err_ic1) & tag_hit_ic1) | (|tag_err_ic1));

    // Error correction
    // All ways will be invalidated on a tag error to prevent X-propagation from data_err_ic1 on
    // spurious hits. Also prevents the same line being allocated twice when there was a true
    // hit and a spurious hit.
    assign ecc_correction_ways_d  = {IC_NUM_WAYS{|tag_err_ic1}} |
                                    (tag_match_ic1 & {IC_NUM_WAYS{|data_err_ic1}});
    assign ecc_correction_write_d = ecc_err_ic1;

    // The correction write request is the detected error delayed by one cycle
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        ecc_correction_write_q <= 1'b0;
      end else begin
        ecc_correction_write_q <= ecc_correction_write_d;
      end
    end

    // The index is required in IC1 only when ECC is configured so is registered here
    if (ResetAll) begin : g_lookup_ind_ra
      always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
          lookup_index_ic1 <= '0;
        end else if (lookup_grant_ic0) begin
          lookup_index_ic1 <= lookup_addr_ic0[IC_INDEX_HI-:IC_INDEX_W];
        end
      end
    end else begin : g_lookup_ind_nr
      always_ff @(posedge clk_i) begin
        if (lookup_grant_ic0) begin
          lookup_index_ic1 <= lookup_addr_ic0[IC_INDEX_HI-:IC_INDEX_W];
        end
      end
    end

    // Store the ways with errors to be invalidated
    // (captured together with the index of the failing lookup when an error is detected)
    if (ResetAll) begin : g_ecc_correction_ra
      always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
          ecc_correction_ways_q  <= '0;
          ecc_correction_index_q <= '0;
        end else if (ecc_err_ic1) begin
          ecc_correction_ways_q  <= ecc_correction_ways_d;
          ecc_correction_index_q <= lookup_index_ic1;
        end
      end
    end else begin : g_ecc_correction_nr
      always_ff @(posedge clk_i) begin
        if (ecc_err_ic1) begin
          ecc_correction_ways_q  <= ecc_correction_ways_d;
          ecc_correction_index_q <= lookup_index_ic1;
        end
      end
    end

    // Drive the IC0 write arbitration with the registered correction request
    assign ecc_write_req   = ecc_correction_write_q;
    assign ecc_write_ways  = ecc_correction_ways_q;
    assign ecc_write_index = ecc_correction_index_q;

    assign ecc_error_o = ecc_err_ic1;
  end else begin : gen_no_data_ecc
    // No ECC: never report an error or request a correction write
    assign ecc_err_ic1     = 1'b0;
    assign ecc_write_req   = 1'b0;
    assign ecc_write_ways  = '0;
    assign ecc_write_index = '0;
    assign hit_data_ic1    = hit_data_ecc_ic1;

    assign ecc_error_o = 1'b0;
  end

  ///////////////////////////////
  // Cache allocation decision //
  ///////////////////////////////

  // Decide whether a newly allocated fill buffer should write its line into the cache
  // (fill_cache_new). Allocation always requires the cache to be enabled and not blocked by an
  // invalidation; with BranchCache set it is further limited to lines fetched around a branch.
  if (BranchCache) begin : gen_caching_logic

    // Cache branch target + a number of subsequent lines
    localparam int unsigned CACHE_AHEAD = 2;
    localparam int unsigned CACHE_CNT_W = (CACHE_AHEAD == 1) ? 1 : $clog2(CACHE_AHEAD) + 1;
    logic                   cache_cnt_dec;
    logic [CACHE_CNT_W-1:0] cache_cnt_d, cache_cnt_q;

    // Decrement on each granted lookup while the counter is non-zero; reload on every branch
    assign cache_cnt_dec = lookup_grant_ic0 & (|cache_cnt_q);
    assign cache_cnt_d   = branch_i ? CACHE_AHEAD[CACHE_CNT_W-1:0] :
                                      (cache_cnt_q - {{CACHE_CNT_W-1{1'b0}},cache_cnt_dec});

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        cache_cnt_q <= '0;
      end else begin
        cache_cnt_q <= cache_cnt_d;
      end
    end

    // Allocate for the branch itself and while the look-ahead counter is still non-zero
    assign fill_cache_new = (branch_i | (|cache_cnt_q)) & icache_enable_i & ~inval_block_cache;
  end else begin : gen_cache_all

    // Cache all missing fetches
    assign fill_cache_new = icache_enable_i & ~inval_block_cache;
  end

  //////////////////////////
  // Fill buffer tracking //
  //////////////////////////

  // Count the fill buffers that are busy and not stale; the result is compared against
  // FB_THRESHOLD to throttle new lookups.
  // The counter is $clog2(NUM_FB) bits wide, so a count of NUM_FB wraps to zero. In that case
  // every fill buffer is busy and lookup_req_ic0 is already blocked by its ~&fill_busy_q term.
  always_comb begin
    fb_fill_level = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_busy_q[i] & ~fill_stale_q[i]) begin
        fb_fill_level += {{$clog2(NUM_FB) - 1{1'b0}}, 1'b1};
      end
    end
  end

  // Allocate a new buffer for every granted lookup
  assign fill_new_alloc = lookup_grant_ic0;
  // Track whether a speculative external request was made from IC0, and whether it was granted
  // Speculative requests are only made for branches, or if the cache is disabled
  // They are also suppressed while any fill buffer has its own external request pending
  assign fill_spec_req  = (~icache_enable_i | branch_i) & ~|fill_ext_req;
  // Granted this cycle: the first beat of the newly allocated buffer is already requested
  assign fill_spec_done = fill_spec_req & instr_gnt_i;
  // Not granted this cycle: the newly allocated buffer must hold the request until it is
  assign fill_spec_hold = fill_spec_req & ~instr_gnt_i;

  for (genvar fb = 0; fb < NUM_FB; fb++) begin : gen_fbs

    /////////////////////////////
    // Fill buffer allocations //
    /////////////////////////////

    // Allocate the lowest available buffer
    if (fb == 0) begin : gen_fb_zero
      assign fill_alloc_sel[fb] = ~fill_busy_q[fb];
    end else begin : gen_fb_rest
      assign fill_alloc_sel[fb] = ~fill_busy_q[fb] & (&fill_busy_q[fb-1:0]);
    end

    assign fill_alloc[fb]      = fill_alloc_sel[fb] & fill_new_alloc;
    assign fill_busy_d[fb]     = fill_alloc[fb] | (fill_busy_q[fb] & ~fill_done[fb]);

    // Track which other fill buffers are older than this one (for age-based arbitration)
    // TODO sparsify
    assign fill_older_d[fb]    = (fill_alloc[fb] ? fill_busy_q : fill_older_q[fb]) & ~fill_done;

    // A fill buffer can release once all its actions are completed
                                 // all data written to the cache (unless hit or error)
    assign fill_done[fb]       = (fill_ram_done_q[fb] | fill_hit_q[fb] | ~fill_cache_q[fb] |
                                  (|fill_err_q[fb])) &
                                 // all data output unless stale due to intervening branch
                                 (fill_out_done[fb] | fill_stale_q[fb] | branch_i) &
                                 // all external requests completed
                                 fill_rvd_done[fb];

    /////////////////////////////////
    // Fill buffer status tracking //
    /////////////////////////////////

    // Track staleness (requests become stale when a branch intervenes)
    assign fill_stale_d[fb]    = fill_busy_q[fb] & (branch_i | fill_stale_q[fb]);
    // Track whether or not this request should allocate to the cache
    // Any invalidation or disabling of the cache while the buffer is busy will stop allocation
    assign fill_cache_d[fb]    = (fill_alloc[fb] & fill_cache_new) |
                                 (fill_cache_q[fb] & fill_busy_q[fb] &
                                  icache_enable_i & ~icache_inval_i);
    // Record whether the request hit in the cache
    assign fill_hit_ic1[fb]    = lookup_valid_ic1 & fill_in_ic1[fb] & tag_hit_ic1 & ~ecc_err_ic1;
    assign fill_hit_d[fb]      = fill_hit_ic1[fb] | (fill_hit_q[fb] & fill_busy_q[fb]);

    ///////////////////////////////////////////
    // Fill buffer external request tracking //
    ///////////////////////////////////////////

    // Make an external request
    assign fill_ext_req[fb]    = fill_busy_q[fb] & ~fill_ext_done_d[fb];

    // Count the number of completed external requests (each line requires IC_LINE_BEATS requests)
    assign fill_ext_cnt_d[fb]  = fill_alloc[fb] ?
                                   {{IC_LINE_BEATS_W{1'b0}},fill_spec_done} :
                                   (fill_ext_cnt_q[fb] + {{IC_LINE_BEATS_W{1'b0}},
                                                          fill_ext_arb[fb] & instr_gnt_i});
    // External request must be held until granted
    assign fill_ext_hold_d[fb] = (fill_alloc[fb] & fill_spec_hold) |
                                 (fill_ext_arb[fb] & ~instr_gnt_i);
    // External requests are completed when the counter is filled or when the request is cancelled
    assign fill_ext_done_d[fb] = (fill_ext_cnt_q[fb][IC_LINE_BEATS_W] |
                                  // external requests are considered complete if the request hit
                                  fill_hit_ic1[fb] | fill_hit_q[fb] |
                                  // cancel if the line won't be cached and, it is stale
                                  (~fill_cache_q[fb] & (branch_i | fill_stale_q[fb] |
                                   // or we're already at the end of the line
                                                        fill_ext_beat[fb][IC_LINE_BEATS_W]))) &
                                 // can't cancel while we are waiting for a grant on the bus
                                 ~fill_ext_hold_q[fb] & fill_busy_q[fb];
    // Track whether this fill buffer expects to receive beats of data
    assign fill_rvd_exp[fb]    = fill_busy_q[fb] & ~fill_rvd_done[fb];
    // Count the number of rvalid beats received
    assign fill_rvd_cnt_d[fb]  = fill_alloc[fb] ? '0 :
                                                  (fill_rvd_cnt_q[fb] +
                                                   {{IC_LINE_BEATS_W{1'b0}},fill_rvd_arb[fb]});
    // External data is complete when all issued external requests have received their data
    assign fill_rvd_done[fb]   = (fill_ext_done_q[fb] & ~fill_ext_hold_q[fb]) &
                                 (fill_rvd_cnt_q[fb] == fill_ext_cnt_q[fb]);

    //////////////////////////////////////
    // Fill buffer data output tracking //
    //////////////////////////////////////

    // Send data to the IF stage for requests that are not stale, have not completed their
    // data output, and have data available to send.
    // Data is available if:
    // - The request hit in the cache
    // - Buffered data is available (fill_rvd_cnt_q is ahead of fill_out_cnt_q)
    // - Data is available from the bus this cycle (fill_rvd_arb)
    assign fill_out_req[fb]    = fill_busy_q[fb] & ~fill_stale_q[fb] & ~fill_out_done[fb] &
                                 (fill_hit_ic1[fb] | fill_hit_q[fb] |
                                  (fill_rvd_beat[fb] > fill_out_cnt_q[fb]) | fill_rvd_arb[fb]);

    // Calculate when a beat of data is output. Any ECC error squashes the output that cycle.
    assign fill_out_grant[fb]  = fill_out_arb[fb] & output_ready;

    // Count the beats of data output to the IF stage
    assign fill_out_cnt_d[fb]  = fill_alloc[fb] ? {1'b0,lookup_addr_ic0[IC_LINE_W-1:BUS_W]} :
                                                  (fill_out_cnt_q[fb] +
                                                   {{IC_LINE_BEATS_W{1'b0}},fill_out_grant[fb]});
    // Data output complete when the counter fills
    assign fill_out_done[fb]   = fill_out_cnt_q[fb][IC_LINE_BEATS_W];

    //////////////////////////////////////
    // Fill buffer ram request tracking //
    //////////////////////////////////////

                                 // make a fill request once all data beats received
    assign fill_ram_req[fb]    = fill_busy_q[fb] & fill_rvd_cnt_q[fb][IC_LINE_BEATS_W] &
                                 // unless the request hit, was non-allocating or got an error
                                 ~fill_hit_q[fb] & fill_cache_q[fb] & ~|fill_err_q[fb] &
                                 // or the request was already completed
                                 ~fill_ram_done_q[fb];

    // Record when a cache allocation request has been completed
    assign fill_ram_done_d[fb] = fill_ram_arb[fb] | (fill_ram_done_q[fb] & fill_busy_q[fb]);

    //////////////////////////////
    // Fill buffer line offsets //
    //////////////////////////////

    // When we branch into the middle of a line, the output count will not start from zero. This
    // beat count is used to know which incoming rdata beats are relevant.
    assign fill_ext_beat[fb]   = {1'b0,fill_addr_q[fb][IC_LINE_W-1:BUS_W]} +
                                 fill_ext_cnt_q[fb][IC_LINE_BEATS_W:0];
    assign fill_ext_off[fb]    = fill_ext_beat[fb][IC_LINE_BEATS_W-1:0];
    assign fill_rvd_beat[fb]   = {1'b0,fill_addr_q[fb][IC_LINE_W-1:BUS_W]} +
                                 fill_rvd_cnt_q[fb][IC_LINE_BEATS_W:0];
    assign fill_rvd_off[fb]    = fill_rvd_beat[fb][IC_LINE_BEATS_W-1:0];

    /////////////////////////////
    // Fill buffer arbitration //
    /////////////////////////////

    // Age based arbitration - all these signals are one-hot. A buffer only wins when none of the
    // buffers flagged as older in fill_older_q[fb] is raising the same request.
    assign fill_ext_arb[fb]    = (~|(fill_older_q[fb] & fill_ext_req)) & fill_ext_req[fb];
    assign fill_ram_arb[fb]    = (~|(fill_older_q[fb] & fill_ram_req)) &
                                 fill_grant_ic0 & fill_ram_req[fb];
    // Incoming rvalid data goes to the oldest fill buffer that is expecting it
    assign fill_rvd_arb[fb]    = (~|(fill_older_q[fb] & fill_rvd_exp)) &
                                 fill_rvd_exp[fb] & instr_rvalid_i;

    // This buffer is the output candidate when no older buffer is busy, not stale and still
    // has data left to send to IF
    assign fill_data_sel[fb]   = ~|(fill_older_q[fb] & fill_busy_q &
                                    ~fill_stale_q & ~fill_out_done);
    // Grant the output to the oldest outstanding buffer, provided it has data available
    assign fill_out_arb[fb]    = fill_data_sel[fb] & fill_out_req[fb];

    /////////////////////////////
    // Fill buffer data muxing //
    /////////////////////////////

    // Per-buffer select lines for the three possible sources of output data
    // 1. The fill buffer data register
    assign fill_data_reg[fb]   = fill_data_sel[fb] & fill_busy_q[fb] &
                                 ~fill_out_done[fb] & ~fill_stale_q[fb] &
    //                           Stored data is usable: hit data, a recorded error, or the
    //                           received beats are already ahead of the output count
                                 (fill_hit_q[fb] | (|fill_err_q[fb]) |
                                  (fill_out_cnt_q[fb] < fill_rvd_beat[fb]));
    // 2. Hit data coming from IC1
    assign fill_data_hit[fb]   = fill_data_sel[fb] & fill_hit_ic1[fb] & fill_busy_q[fb];
    // 3. The instr_rdata_i beat arriving from the bus
    assign fill_data_rvd[fb]   = fill_data_sel[fb] & fill_busy_q[fb] & fill_rvd_arb[fb] &
                                 ~fill_out_done[fb] & ~fill_stale_q[fb] &
                                 ~fill_hit_ic1[fb] & ~fill_hit_q[fb] &
    //                           The incoming beat is exactly the one the output is waiting for
                                 (fill_out_cnt_q[fb] == fill_rvd_beat[fb]);


    ///////////////////////////
    // Fill buffer registers //
    ///////////////////////////

    // The control state is only clocked while the entry is being allocated or is in use
    assign fill_entry_en[fb]   = fill_busy_q[fb] | fill_alloc[fb];

    // Fill buffer control/status flops: cleared by the asynchronous reset and loaded from their
    // *_d counterparts whenever fill_entry_en[fb] is set
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        // Entry status flags
        fill_busy_q[fb]     <= 1'b0;
        fill_stale_q[fb]    <= 1'b0;
        fill_hit_q[fb]      <= 1'b0;
        fill_cache_q[fb]    <= 1'b0;
        fill_ram_done_q[fb] <= 1'b0;
        // Relative age versus the other fill buffers
        fill_older_q[fb]    <= '0;
        // External request tracking
        fill_ext_hold_q[fb] <= 1'b0;
        fill_ext_done_q[fb] <= 1'b0;
        // Beat counters
        fill_ext_cnt_q[fb]  <= '0;
        fill_rvd_cnt_q[fb]  <= '0;
        fill_out_cnt_q[fb]  <= '0;
      end else if (fill_entry_en[fb]) begin
        fill_busy_q[fb]     <= fill_busy_d[fb];
        fill_stale_q[fb]    <= fill_stale_d[fb];
        fill_hit_q[fb]      <= fill_hit_d[fb];
        fill_cache_q[fb]    <= fill_cache_d[fb];
        fill_ram_done_q[fb] <= fill_ram_done_d[fb];
        fill_older_q[fb]    <= fill_older_d[fb];
        fill_ext_hold_q[fb] <= fill_ext_hold_d[fb];
        fill_ext_done_q[fb] <= fill_ext_done_d[fb];
        fill_ext_cnt_q[fb]  <= fill_ext_cnt_d[fb];
        fill_rvd_cnt_q[fb]  <= fill_rvd_cnt_d[fb];
        fill_out_cnt_q[fb]  <= fill_out_cnt_d[fb];
      end
    end

    ////////////////////////////////////////
    // Fill buffer address / data storage //
    ////////////////////////////////////////

    // The selected way is captured when the lookup belonging to this buffer is valid in IC1
    assign fill_way_en[fb]     = (fill_in_ic1[fb] & lookup_valid_ic1);

    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        fill_way_q[fb]  <= '0;
      end else if (fill_way_en[fb]) begin
        fill_way_q[fb]  <= sel_way_ic1;
      end
    end

    // The IC0 lookup address is captured when the buffer is allocated
    assign fill_addr_en[fb]    = fill_alloc[fb];

    // The address storage only gets a reset when ResetAll is set
    if (ResetAll) begin : g_fill_addr_ra
      always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
          fill_addr_q[fb] <= '0;
        end else if (fill_addr_en[fb]) begin
          fill_addr_q[fb] <= lookup_addr_ic0;
        end
      end
    end else begin : g_fill_addr_nr
      always_ff @(posedge clk_i) begin
        if (fill_addr_en[fb]) begin
          fill_addr_q[fb] <= lookup_addr_ic0;
        end
      end
    end

    // Next value for the line data register: the full line of IC1 hit data on a cache hit,
    // otherwise the incoming bus beat replicated into every beat position (fill_data_en then
    // picks the beat that is actually written). If there was an ECC error, we must take the
    // incoming bus data since the cache hit data is corrupted.
    assign fill_data_d[fb] = fill_hit_ic1[fb] ? hit_data_ic1 :
                                                {IC_LINE_BEATS{instr_rdata_i}};

    for (genvar b = 0; b < IC_LINE_BEATS; b++) begin : gen_data_buf
      // Write beat b on a cache hit (all beats at once), or when the arbitrated incoming beat
      // lands on this offset. Incoming rvalid data is ignored once we already have hit data.
      assign fill_data_en[fb][b] = fill_hit_ic1[fb] |
                                   ((fill_rvd_off[fb] == b[IC_LINE_BEATS_W-1:0]) &
                                    ~fill_hit_q[fb] & fill_rvd_arb[fb]);

      // The data storage only gets a reset when ResetAll is set
      if (ResetAll) begin : g_fill_data_ra
        always_ff @(posedge clk_i or negedge rst_ni) begin
          if (!rst_ni) begin
            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= '0;
          end else if (fill_data_en[fb][b]) begin
            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= fill_data_d[fb][b*BUS_SIZE+:BUS_SIZE];
          end
        end
      end else begin : g_fill_data_nr
        always_ff @(posedge clk_i) begin
          if (fill_data_en[fb][b]) begin
            fill_data_q[fb][b*BUS_SIZE+:BUS_SIZE] <= fill_data_d[fb][b*BUS_SIZE+:BUS_SIZE];
          end
        end
      end

      // Per-beat error tracking: keep an already recorded error while the buffer is busy, or
      // record a new one when the arbitrated incoming beat for this offset reports instr_err_i
      assign fill_err_d[fb][b]   = (fill_busy_q[fb] & fill_err_q[fb][b]) |
                                   ((fill_rvd_off[fb] == b[IC_LINE_BEATS_W-1:0]) &
                                    instr_err_i & fill_rvd_arb[fb]);

      always_ff @(posedge clk_i or negedge rst_ni) begin
        if (!rst_ni) begin
          fill_err_q[fb][b] <= '0;
        end else if (fill_entry_en[fb]) begin
          fill_err_q[fb][b] <= fill_err_d[fb][b];
        end
      end

    end
  end

  ////////////////////////////////
  // Fill buffer one-hot muxing //
  ////////////////////////////////

  // External req info
  // OR-mux of the per-buffer request addresses, gated by the fill_ext_arb grant. Each address is
  // the line-aligned part of the stored fill address with the current external beat offset
  // appended. The result is all zeros when no buffer is granted.
  always_comb begin
    fill_ext_req_addr = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_ext_arb[i]) begin
        fill_ext_req_addr |= {fill_addr_q[i][ADDR_W-1:IC_LINE_W], fill_ext_off[i]};
      end
    end
  end

  // Cache req info
  // OR-mux of the address, way and line data of the fill buffer that won the RAM arbitration
  // (fill_ram_arb). All three outputs are zero when no buffer is granted.
  always_comb begin
    fill_ram_req_addr = '0;
    fill_ram_req_way  = '0;
    fill_ram_req_data = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_ram_arb[i]) begin
        fill_ram_req_addr |= fill_addr_q[i];
        fill_ram_req_way  |= fill_way_q[i];
        fill_ram_req_data |= fill_data_q[i];
      end
    end
  end

  // IF stage output data
  // OR-mux of the stored line data and per-beat error flags of every fill buffer whose data
  // register is selected as the output source (fill_data_reg).
  always_comb begin
    fill_out_data = '0;
    fill_out_err  = '0;
    for (int i = 0; i < NUM_FB; i++) begin
      if (fill_data_reg[i]) begin
        fill_out_data |= fill_data_q[i];
        // Ignore any speculative errors accumulated on cache hits
        fill_out_err  |= (fill_err_q[i] & ~{IC_LINE_BEATS{fill_hit_q[i]}});
      end
    end
  end

  ///////////////////////
  // External requests //
  ///////////////////////

  // A bus request is raised for any pending fill buffer request, or directly alongside a
  // granted IC0 lookup when the cache is disabled or the lookup is for a branch
  assign instr_req  = (|fill_ext_req) |
                      (lookup_grant_ic0 & (branch_i | ~icache_enable_i));

  // Fill buffer requests supply their own address, otherwise use the IC0 lookup address
  assign instr_addr = (|fill_ext_req) ? fill_ext_req_addr :
                                        lookup_addr_ic0[ADDR_W-1:BUS_W];

  // The address driven on the bus always has its low BUS_W bits zeroed
  assign instr_addr_o = {instr_addr[ADDR_W-1:BUS_W],{BUS_W{1'b0}}};
  assign instr_req_o  = instr_req;

  ////////////////////////
  // Output data muxing //
  ////////////////////////

  // Mux between line-width data sources: IC1 hit data (which never carries a bus error) takes
  // priority over the data / error flags read from the fill buffer registers
  assign line_data = |fill_data_hit ? hit_data_ic1 : fill_out_data;
  assign line_err  = |fill_data_hit ? {IC_LINE_BEATS{1'b0}} : fill_out_err;

  // Mux the relevant beat of line data, based on the output address
  always_comb begin
    line_data_muxed = '0;
    line_err_muxed  = 1'b0;
    for (int unsigned i = 0; i < IC_LINE_BEATS; i++) begin
      // When data has been skidded, the output address is behind by one
      if ((output_addr_q[IC_LINE_W-1:BUS_W] + {{IC_LINE_BEATS_W-1{1'b0}},skid_valid_q}) ==
          i[IC_LINE_BEATS_W-1:0]) begin
        // Beats are BUS_SIZE wide, matching the beat indexing used for the fill data registers
        line_data_muxed |= line_data[i*BUS_SIZE+:BUS_SIZE];
        line_err_muxed  |= line_err[i];
      end
    end
  end

  // Final data / error mux: when a fill buffer selects the incoming bus beat, instr_rdata_i and
  // instr_err_i are used in place of the muxed line data
  assign output_data = (|fill_data_rvd) ? instr_rdata_i : line_data_muxed;
  assign output_err  = (|fill_data_rvd) ? instr_err_i   : line_err_muxed;

  // Output data is valid (from any of the three possible sources). Note that fill_out_arb
  // must be used here rather than fill_out_req because data can become valid out of order
  // (e.g. cache hit data can become available ahead of an older outstanding miss).
  assign data_valid = |fill_out_arb;

  // The skid buffer stores the upper halfword of the output data, and is loaded whenever valid
  // data is accepted by the IF stage or by the skid buffer itself
  assign skid_data_d = output_data[31:16];
  assign skid_en     = (skid_ready | ready_i) & data_valid;

  // Skid buffer data and error storage, loaded when skid_en is set. The flops only get a
  // reset when ResetAll is set.
  if (ResetAll) begin : g_skid_data_ra
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        skid_err_q  <= '0;
        skid_data_q <= '0;
      end else if (skid_en) begin
        skid_err_q  <= output_err;
        skid_data_q <= skid_data_d;
      end
    end
  end else begin : g_skid_data_nr
    always_ff @(posedge clk_i) begin
      if (skid_en) begin
        skid_err_q  <= output_err;
        skid_data_q <= skid_data_d;
      end
    end
  end

  // The data in the skid buffer is ready if it's a complete compressed instruction or if there's
  // an error (no need to wait for the second half)
  assign skid_complete_instr = skid_valid_q & ((skid_data_q[1:0] != 2'b11) | skid_err_q);

  // Data can be loaded into the skid buffer for an unaligned uncompressed instruction
  // (the output address is halfword-misaligned, the skid buffer is empty, and the current
  // output is either not compressed or is flagged with an error)
  assign skid_ready = output_addr_q[1] & ~skid_valid_q & (~output_compressed | output_err);

  // New output data can be consumed when the IF stage or the skid buffer accepts it, unless the
  // skid buffer on its own already holds the complete instruction being presented
  assign output_ready = (ready_i | skid_ready) & ~skid_complete_instr;

  // The instruction currently driven on rdata_o is compressed when its two LSBs are not 2'b11
  assign output_compressed = (rdata_o[1:0] != 2'b11);

  // Next state of the skid buffer valid flag, evaluated in priority order:
  // branch (clear) > already valid (hold or release) > currently empty (fill condition)
  assign skid_valid_d =
      // Branches invalidate the skid buffer
      branch_i ? 1'b0 :
      // Once valid, the skid buffer stays valid until a compressed instruction realigns the stream
      (skid_valid_q ? ~(ready_i & ((skid_data_q[1:0] != 2'b11) | skid_err_q)) :
      // The skid buffer becomes valid when:
                        // - we branch to an unaligned uncompressed instruction
                      (data_valid &
                       (((output_addr_q[1] & (~output_compressed | output_err)) |
                        // - a compressed instruction misaligns the stream
                        (~output_addr_q[1] & output_compressed & ~output_err & ready_i)))));

  // The valid flag is always reset and is updated every cycle (no enable)
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      skid_valid_q <= 1'b0;
    end else begin
      skid_valid_q <= skid_valid_d;
    end
  end

  // Signal that valid data is available to the IF stage
  // Note that if the first half of an unaligned instruction reports an error, we do not need
  // to wait for the second half
                        // Output data is available and either the skid buffer holds the first
                        // half, the output stream is aligned,
  assign output_valid = (data_valid & (skid_valid_q | ~output_addr_q[1] |
                                       // or this is an error or an unaligned compressed instruction
                                       output_err | (output_data[17:16] != 2'b11))) |
                        // Alternatively a compressed instruction completely satisfied by the
                        // skid buffer
                        skid_complete_instr;

  // The output address register is rewritten on a branch and every time the IF stage accepts
  // an instruction
  assign output_addr_en = (valid_o & ready_i) | branch_i;

  // A compressed instruction popped without an error advances the address by two
  assign addr_incr_two = ~err_o & output_compressed;

  // Next sequential IF stage PC. The address is kept in halfword units (bit 0 is dropped), so
  // the addend is 1 for an increment of two bytes and 2 for an increment of four bytes.
  assign output_addr_incr = (output_addr_q[31:1] +
                             {29'd0, ~addr_incr_two, addr_incr_two});

  // A branch redirects the address to the branch target
  assign output_addr_d = branch_i ? addr_i[31:1] : output_addr_incr;

  // Output address storage; the flops only get a reset when ResetAll is set
  if (ResetAll) begin : g_output_addr_ra
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        output_addr_q <= '0;
      end else if (output_addr_en) begin
        output_addr_q <= output_addr_d;
      end
    end
  end else begin : g_output_addr_nr
    always_ff @(posedge clk_i) begin
      if (output_addr_en) begin
        output_addr_q <= output_addr_d;
      end
    end
  end

  // Mux the data from BUS_SIZE to halfword
  // This muxing realigns data when instruction words are split across BUS_W e.g.
  // word 1 |----|*h1*|
  // word 0 |*h0*|----| --> |*h1*|*h0*|
  //        31   15   0     31   15   0
  // The lower output halfword is the halfword of output_data addressed by output_addr_q
  always_comb begin
    output_data_lo = '0;
    for (int unsigned i = 0; i < IC_OUTPUT_BEATS; i++) begin
      if (output_addr_q[BUS_W-1:1] == i[BUS_W-2:0]) begin
        output_data_lo |= output_data[i*16+:16];
      end
    end
  end

  // The upper output halfword is the halfword of output_data that follows the one addressed
  // by output_addr_q, wrapping round to halfword zero from the last position
  always_comb begin
    output_data_hi = '0;
    // Last halfword position: the upper half is found at the bottom of output_data
    if (&output_addr_q[BUS_W-1:1]) begin
      output_data_hi |= output_data[15:0];
    end
    // Every other position: take the next halfword up
    for (int unsigned hw = 0; hw < IC_OUTPUT_BEATS - 1; hw++) begin
      if (output_addr_q[BUS_W-1:1] == hw[BUS_W-2:0]) begin
        output_data_hi |= output_data[(hw+1)*16+:16];
      end
    end
  end

  // Drive the IF stage interface
  assign valid_o     = output_valid;
  // Bit 0 of the address is not stored and is always driven as zero
  assign addr_o      = {output_addr_q, 1'b0};
  // The lower halfword comes from the skid buffer whenever it holds valid data
  assign rdata_o     = {output_data_hi, (skid_valid_q ? skid_data_q : output_data_lo)};
  // Report an error held in the skid buffer, or one on the current output data unless the
  // skid buffer alone supplies the complete instruction
  assign err_o       = (output_err & ~skid_complete_instr) | (skid_err_q & skid_valid_q);
  // Error caused by the second half of a misaligned uncompressed instruction
  // (only relevant when err_o is set)
  assign err_plus2_o = ~skid_err_q & skid_valid_q;

  ///////////////////
  // Invalidations //
  ///////////////////

  // Invalidation (writing all entries in the tag RAM with an invalid tag) occurs straight out of
  // reset and after any invalidation request (signalled via icache_inval_i). An invalidation
  // request coming whilst another is writing tags causes the invalidation to start again. This
  // ensures a new scramble key is requested where a previous one is in use.
  // TODO: Ditch this behaviour for non-secure ibex?
  //
  // Combinational next-state and output logic of the invalidation state machine:
  //   OUT_OF_RESET -> AWAIT_SCRAMBLE_KEY -> INVAL_CACHE -> IDLE
  // with IDLE and INVAL_CACHE returning to AWAIT_SCRAMBLE_KEY on icache_inval_i.
  always_comb begin
    // Defaults: hold state and index, no tag write, no scramble key request
    inval_state_d     = inval_state_q;
    inval_index_d     = inval_index_q;
    inval_index_en    = 1'b0;
    inval_write_req   = 1'b0;
    ic_scr_key_req_o  = 1'b0;

    // Prevent other cache activity (cache lookups and cache allocations) whilst an invalidation is
    // in progress. Set to 1 by default as the only time we don't block is when the state machine is
    // IDLE.
    inval_block_cache = 1'b1;

    unique case (inval_state_q)
      OUT_OF_RESET: begin
        // Initial state, this initialises the tag RAMs out of reset before the icache can be used
        inval_state_d = AWAIT_SCRAMBLE_KEY;

        // Only request a key if a valid one is not already being presented
        if (~ic_scr_key_valid_i) begin
          ic_scr_key_req_o = 1'b1;
        end
      end
      AWAIT_SCRAMBLE_KEY: begin
        // When invalidating a new scrambling key is requested on all invalidation requests. Wait
        // for that new key to be available before beginning with the actual invalidation (cannot
        // write to the tag RAM until we have the new scrambling key that will be used). Ignore any
        // requests in this phase (once a scramble key request has started we cannot request a new
        // one until the on-going request is done).
        if (ic_scr_key_valid_i) begin
          inval_state_d  = INVAL_CACHE;
          // Start the tag RAM sweep from index zero
          inval_index_d  = '0;
          inval_index_en = 1'b1;
        end
      end
      INVAL_CACHE: begin
        // Actually invalidate the cache. Write every entry in the tag RAM with an invalid tag. Once
        // all are written we're done.
        inval_write_req = 1'b1;
        inval_index_d   = (inval_index_q + {{IC_INDEX_W-1{1'b0}},1'b1});
        inval_index_en  = 1'b1;

        if (icache_inval_i) begin
          // If a new invalidation request comes in go back to the beginning with a new scramble
          // key
          ic_scr_key_req_o = 1'b1;
          inval_state_d = AWAIT_SCRAMBLE_KEY;
        end else if (&inval_index_q) begin
          // When the final index is written we're done
          inval_state_d = IDLE;
        end
      end
      IDLE: begin
        // Usual running state
        if (icache_inval_i) begin
          ic_scr_key_req_o = 1'b1;
          inval_state_d = AWAIT_SCRAMBLE_KEY;
        end else begin
          // Allow other cache activities whilst in IDLE and no invalidation has been requested
          inval_block_cache = 1'b0;
        end
      end
      default: ;
    endcase
  end

  // An invalidation is in progress in every state other than IDLE
  assign inval_active = inval_state_q != IDLE;

  // Invalidation state register: always reset, starting in OUT_OF_RESET, and updated every cycle
  always_ff @(posedge clk_i or negedge rst_ni) begin
    if (!rst_ni) begin
      inval_state_q <= OUT_OF_RESET;
    end else begin
      inval_state_q <= inval_state_d;
    end
  end

  // Invalidation index counter, loaded when inval_index_en is set. The flops only get a reset
  // when ResetAll is set.
  if (ResetAll) begin : g_inval_index_ra
    always_ff @(posedge clk_i or negedge rst_ni) begin
      if (!rst_ni) begin
        inval_index_q <= '0;
      end else if (inval_index_en) begin
        inval_index_q <= inval_index_d;
      end
    end
  end else begin : g_inval_index_nr
    always_ff @(posedge clk_i) begin
      if (inval_index_en) begin
        inval_index_q <= inval_index_d;
      end
    end
  end

  /////////////////
  // Busy status //
  /////////////////

  // For WFI purposes the cache only counts as busy while some busy fill buffer has not yet
  // received all of its data (external requests outstanding), or an invalidation is in progress.
  assign busy_o = (|(fill_busy_q & ~fill_rvd_done)) | inval_active;

  ////////////////
  // Assertions //
  ////////////////

  // The cache line must be wider than 32 bits
  `ASSERT_INIT(size_param_legal, (IC_LINE_SIZE > 32))

  // ECC primitives will need to be changed for different sizes
  // (tag of at most 27 bits; with ICacheECC enabled the bus must be 32 bits wide)
  `ASSERT_INIT(ecc_tag_param_legal, (IC_TAG_SIZE <= 27))
  `ASSERT_INIT(ecc_data_param_legal, !ICacheECC || (BUS_SIZE == 32))

  // Lookups in the tag ram should always give a known result
  // (checked only on cycles where lookup_valid_ic1 is set, since the operands are ANDed with it)
  `ASSERT_KNOWN(TagHitKnown,     lookup_valid_ic1 & tag_hit_ic1)
  `ASSERT_KNOWN(TagInvalidKnown, lookup_valid_ic1 & tag_invalid_ic1)

  // This is only used for the Yosys-based formal flow. Once we have working bind support, we can
  // get rid of it.
  // Everything below is compiled only when both FORMAL and YOSYS are defined.
`ifdef FORMAL
 `ifdef YOSYS
  // Unfortunately, Yosys doesn't support passing unpacked arrays as ports. Explicitly pack up the
  // signals we need.
  logic [NUM_FB-1:0][ADDR_W-1:0] packed_fill_addr_q;
  always_comb begin
    // Copy each fill buffer address into the matching element of the packed array
    for (int i = 0; i < NUM_FB; i++) begin
      packed_fill_addr_q[i][ADDR_W-1:0] = fill_addr_q[i];
    end
  end

  // NOTE(review): presumably supplies the formal testbench logic that uses the packed signal
  // above; its contents are not visible here - confirm.
  `include "formal_tb_frag.svh"
 `endif
`endif


endmodule