-- EMACS settings: -*-  tab-width: 2; indent-tabs-mode: t -*-
-- vim: tabstop=2:shiftwidth=2:noexpandtab
-- kate: tab-width 2; replace-tabs off; indent-width 2;
-- =============================================================================
-- Authors:         Martin Zabel
--
-- Entity:          Cache with cache controller to be used within a CPU
--
-- Description:
-- Cache with a cache controller which reads and writes cache lines from and
-- to memory. For the public interface, see the documentation below.
--
-- License:
-- =============================================================================
-- Copyright 2016 Technische Universitaet Dresden - Germany
--										 Chair of VLSI-Design, Diagnostics and Architecture
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--		http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- =============================================================================

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library poc;
use poc.utils.all;


-- This unit provides a cache (:ref:`IP:cache_par2`) together
-- with a cache controller which reads and writes cache lines from and to
-- memory. The memory is accessed through a :ref:`INT:PoC.Mem` interface; the
-- related ports and parameters are prefixed with ``mem_``.
--
-- The CPU side (prefix ``cpu_``) has a modified PoC.Mem interface, so that
-- this unit can be easily integrated into processor pipelines. For example,
-- consider a pipeline where a load/store instruction is executed in 3
-- stages (after fetching, decoding, ...):
--
-- 1. Execute (EX) for address calculation,
-- 2. Load/Store 1 (LS1) for the cache access,
-- 3. Load/Store 2 (LS2) where the cache returns the read data.
--
-- The read data is always returned one cycle after the cache access completes,
-- so there is conceptually a pipeline register within this unit. The stage LS2
-- can be merged with a write-back stage if the clock period allows it.
--
-- The stage LS1, and thus EX and LS2, must stall until the cache access is
-- completed, i.e., the EX/LS1 pipeline register must hold the cache request
-- until it is acknowledged by the cache. This is signaled by ``cpu_got`` as
-- described in the Operation section below. The pipeline moves forward (is
-- enabled) when::
--
--   pipeline_enable <= (not cpu_req) or cpu_got;
--
-- If the pipeline can stall for other reasons, care must be taken not to
-- execute the cache access twice and not to miss the read data.
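--
-- For illustration, a minimal sketch of the EX/LS1 pipeline register driven
-- by this enable; the ``ex_*`` input names are hypothetical and depend on
-- the surrounding pipeline::
--
--   process(clk)
--   begin
--     if rising_edge(clk) then
--       if pipeline_enable = '1' then
--         cpu_req   <= ex_req;    -- a new request enters LS1
--         cpu_write <= ex_write;
--         cpu_addr  <= ex_addr;
--         cpu_wdata <= ex_wdata;
--         cpu_wmask <= ex_wmask;
--       end if;                   -- otherwise hold the request until cpu_got
--     end if;
--   end process;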
--
-- Of course, the EX/LS1 pipeline register can be omitted and the CPU side
-- directly fed by the address calculator. But be aware of the high setup time
-- of this unit and of the high propagation delay of ``cpu_got``.
--
-- This unit supports only one outstanding CPU request. Multiple outstanding
-- requests are supported by :ref:`IP:cache_mem`.
--
--
-- Configuration
-- *************
--
-- +--------------------+-----------------------------------------------------+
-- | Parameter          | Description                                         |
-- +====================+=====================================================+
-- | REPLACEMENT_POLICY | Replacement policy of embedded cache. For supported |
-- |                    | values see PoC.cache_replacement_policy.            |
-- +--------------------+-----------------------------------------------------+
-- | CACHE_LINES        | Number of cache lines.                              |
-- +--------------------+-----------------------------------------------------+
-- | ASSOCIATIVITY      | Associativity of embedded cache.                    |
-- +--------------------+-----------------------------------------------------+
-- | CPU_ADDR_BITS      | Number of address bits on the CPU side. Each address|
-- |                    | identifies one memory word as seen from the CPU.    |
-- |                    | Calculated from other parameters as described below.|
-- +--------------------+-----------------------------------------------------+
-- | CPU_DATA_BITS      | Width of the data bus (in bits) on the CPU side.    |
-- |                    | CPU_DATA_BITS must be divisible by 8.               |
-- +--------------------+-----------------------------------------------------+
-- | MEM_ADDR_BITS      | Number of address bits on the memory side. Each     |
-- |                    | address identifies one word in the memory.          |
-- +--------------------+-----------------------------------------------------+
-- | MEM_DATA_BITS      | Width of a memory word and of a cache line in bits. |
-- |                    | MEM_DATA_BITS must be divisible by CPU_DATA_BITS.   |
-- +--------------------+-----------------------------------------------------+
--
-- If the CPU data-bus width is smaller than the memory data-bus width, then
-- the CPU needs additional address bits to identify one CPU data word inside a
-- memory word. Thus, the CPU address-bus width is calculated from::
--
--   CPU_ADDR_BITS = log2ceil(MEM_DATA_BITS/CPU_DATA_BITS) + MEM_ADDR_BITS
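--
-- For example, with MEM_DATA_BITS=128, CPU_DATA_BITS=32 and a hypothetical
-- MEM_ADDR_BITS=28, this yields::
--
--   CPU_ADDR_BITS = log2ceil(128/32) + 28 = 2 + 28 = 30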
--
-- The write policy is: write-through, no-write-allocate.
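--
-- As an illustration, a hypothetical instantiation with the example widths
-- from above; the replacement policy, line count and associativity values
-- are assumptions, and the port-map signals mirror the port names::
--
--   cache : entity work.cache_cpu
--     generic map (
--       REPLACEMENT_POLICY => "LRU",
--       CACHE_LINES        => 64,
--       ASSOCIATIVITY      => 2,
--       CPU_DATA_BITS      => 32,
--       MEM_ADDR_BITS      => 28,
--       MEM_DATA_BITS      => 128)
--     port map (
--       clk       => clk,       rst       => rst,
--       cpu_req   => cpu_req,   cpu_write => cpu_write,
--       cpu_addr  => cpu_addr,  cpu_wdata => cpu_wdata,
--       cpu_wmask => cpu_wmask, cpu_got   => cpu_got,
--       cpu_rdata => cpu_rdata,
--       mem_req   => mem_req,   mem_write => mem_write,
--       mem_addr  => mem_addr,  mem_wdata => mem_wdata,
--       mem_wmask => mem_wmask, mem_rdy   => mem_rdy,
--       mem_rstb  => mem_rstb,  mem_rdata => mem_rdata);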
--
--
-- Operation
-- *********
--
-- Alignment of Cache / Memory Accesses
-- ++++++++++++++++++++++++++++++++++++
--
-- Memory accesses are always aligned to a word boundary. Each memory word
-- (and each cache line) consists of MEM_DATA_BITS bits.
-- For example if MEM_DATA_BITS=128:
--
-- * memory address 0 selects the bits   0..127 in memory,
-- * memory address 1 selects the bits 128..255 in memory, and so on.
--
-- Cache accesses are always aligned to a CPU word boundary. Each CPU word
-- consists of CPU_DATA_BITS bits. For example if CPU_DATA_BITS=32:
--
-- * CPU address 0 selects the bits   0.. 31 in memory word 0,
-- * CPU address 1 selects the bits  32.. 63 in memory word 0,
-- * CPU address 2 selects the bits  64.. 95 in memory word 0,
-- * CPU address 3 selects the bits  96..127 in memory word 0,
-- * CPU address 4 selects the bits   0.. 31 in memory word 1,
-- * CPU address 5 selects the bits  32.. 63 in memory word 1, and so on.
--
--
-- Shared and Memory Side Interface
-- ++++++++++++++++++++++++++++++++
--
-- A synchronous reset must be applied even on an FPGA.
--
-- The memory side interface is documented in detail :ref:`here <INT:PoC.Mem>`.
--
--
-- CPU Side Interface
-- ++++++++++++++++++
--
-- The CPU (pipeline stage LS1, see above) issues a request by setting
-- ``cpu_req``, ``cpu_write``, ``cpu_addr``, ``cpu_wdata`` and ``cpu_wmask`` as
-- in the :ref:`INT:PoC.Mem` interface. The cache acknowledges the request by
-- setting ``cpu_got`` to '1'. If the request is not acknowledged
-- (``cpu_got = '0'``) in the current clock cycle, then the request must be
-- repeated in the following clock cycle(s) until it is acknowledged, i.e.,
-- the pipeline must stall.
--
-- A cache access is completed when it is acknowledged. A new request can be
-- issued in the following clock cycle.
--
-- Of course, ``cpu_got`` may be asserted in the same clock cycle in which the
-- request was issued if a read hit occurs. This allows a throughput of one
-- (read) request per clock cycle, but the drawback is that ``cpu_got`` has a
-- high propagation delay. Thus, this output should only control simple
-- pipeline-enable logic.
--
-- When ``cpu_got`` is asserted for a read access, then the read data will be
-- available in the following clock cycle.
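--
-- As a minimal sketch, a consumer can register ``cpu_got`` to know when
-- ``cpu_rdata`` is valid; ``rdata_valid`` and ``result`` are hypothetical
-- names in the surrounding pipeline::
--
--   process(clk)
--   begin
--     if rising_edge(clk) then
--       rdata_valid <= cpu_got and not cpu_write;
--       if rdata_valid = '1' then
--         result <= cpu_rdata; -- valid one cycle after the acknowledge
--       end if;
--     end if;
--   end process;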
--
-- Due to the write-through policy, a write always takes several clock
-- cycles and is acknowledged when the data has been issued to the memory.
--
-- .. WARNING::
--
--    If the design is synthesized with Xilinx ISE / XST, then the synthesis
--    option "Keep Hierarchy" must be set to SOFT or TRUE.
--
-- SeeAlso:
--   :ref:`IP:cache_mem`
entity cache_cpu is
	generic (
		REPLACEMENT_POLICY : string		:= "LRU";
		CACHE_LINES        : positive;
		ASSOCIATIVITY      : positive;
		CPU_DATA_BITS      : positive;
		MEM_ADDR_BITS      : positive;
		MEM_DATA_BITS      : positive
	);
	port (
		clk : in std_logic; -- clock
		rst : in std_logic; -- reset

		-- "CPU" side
		cpu_req   : in  std_logic;
		cpu_write : in  std_logic;
		cpu_addr  : in  unsigned(log2ceil(MEM_DATA_BITS/CPU_DATA_BITS)+MEM_ADDR_BITS-1 downto 0);
		cpu_wdata : in  std_logic_vector(CPU_DATA_BITS-1 downto 0);
		cpu_wmask : in  std_logic_vector(CPU_DATA_BITS/8-1 downto 0);
		cpu_got   : out std_logic;
		cpu_rdata : out std_logic_vector(CPU_DATA_BITS-1 downto 0);

		-- Memory side
		mem_req		: out std_logic;
		mem_write : out std_logic;
		mem_addr	: out unsigned(MEM_ADDR_BITS-1 downto 0);
		mem_wdata : out std_logic_vector(MEM_DATA_BITS-1 downto 0);
		mem_wmask : out std_logic_vector(MEM_DATA_BITS/8-1 downto 0);
		mem_rdy		: in	std_logic;
		mem_rstb	: in	std_logic;
		mem_rdata : in	std_logic_vector(MEM_DATA_BITS-1 downto 0)
	);
end entity;

architecture rtl of cache_cpu is
	-- Ratio 1:n between CPU data bus and cache-line size (memory data bus)
	constant RATIO : positive := MEM_DATA_BITS/CPU_DATA_BITS;

	-- Number of address bits identifying the CPU data word within a cache line (memory word)
	constant LOWER_ADDR_BITS : natural := log2ceil(RATIO);
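
	-- Example: MEM_DATA_BITS=128 and CPU_DATA_BITS=32 yield RATIO=4 and
	-- LOWER_ADDR_BITS=2, i.e., two extra low-order CPU address bits select
	-- one of the four CPU words within a cache line.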

	-- Widened CPU data path
	signal cpu_wdata_wide : std_logic_vector(MEM_DATA_BITS-1 downto 0);
	signal cpu_wmask_wide : std_logic_vector(MEM_DATA_BITS/8-1 downto 0);

	-- Interface to Cache instance.
	signal cache_Request		: std_logic;
	signal cache_ReadWrite	: std_logic;
	signal cache_WriteMask	: std_logic_vector(MEM_DATA_BITS/8-1 downto 0);
	signal cache_Invalidate : std_logic;
	signal cache_Replace		: std_logic;
	signal cache_Address		: std_logic_vector(MEM_ADDR_BITS-1 downto 0);
	signal cache_LineIn			: std_logic_vector(MEM_DATA_BITS-1 downto 0);
	signal cache_LineOut		: std_logic_vector(MEM_DATA_BITS-1 downto 0);
	signal cache_Hit				: std_logic;
	signal cache_Miss				: std_logic;

	-- FSM and other state registers
	type T_FSM is (READY, ACCESS_MEM, READING_MEM, UNKNOWN);
	signal fsm_cs : T_FSM -- current state
		-- synthesis translate_off
		:= UNKNOWN
		-- synthesis translate_on
		;
	signal fsm_ns : T_FSM; -- next state

begin  -- architecture rtl
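
	-- The embedded cache. HIT_MISS_REG is false, so cache_Hit / cache_Miss are
	-- valid in the same clock cycle as the request; the FSM relies on this for
	-- the single-cycle read acknowledge in state READY.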

	cache_inst: entity work.cache_par2
		generic map (
			REPLACEMENT_POLICY => REPLACEMENT_POLICY,
			CACHE_LINES        => CACHE_LINES,
			ASSOCIATIVITY      => ASSOCIATIVITY,
			ADDR_BITS          => MEM_ADDR_BITS,
			DATA_BITS          => MEM_DATA_BITS,
			HIT_MISS_REG       => false)
		port map (
			Clock        => clk,
			Reset        => rst,
			Request      => cache_Request,
			ReadWrite    => cache_ReadWrite,
			WriteMask    => cache_WriteMask,
			Invalidate   => cache_Invalidate,
			Replace      => cache_Replace,
			Address      => cache_Address,
			CacheLineIn  => cache_LineIn,
			CacheLineOut => cache_LineOut,
			CacheHit     => cache_Hit,
			CacheMiss    => cache_Miss,
			OldAddress   => open);

	-- Address and Data path
	-- ===========================================================================
	gEqual: if RATIO = 1 generate -- Cache line size equals CPU data bus size
		cpu_wdata_wide <= cpu_wdata;
		cpu_wmask_wide <= cpu_wmask;
		cpu_rdata      <= cache_LineOut;
	end generate gEqual;

	gWider: if RATIO > 1 generate -- Cache line size is greater than CPU data bus size
		signal lower_addr   : unsigned(LOWER_ADDR_BITS-1 downto 0);
		signal lower_addr_r : unsigned(LOWER_ADDR_BITS-1 downto 0);
		type T_ARRAY is array(0 to RATIO-1) of std_logic_vector(CPU_DATA_BITS-1 downto 0);
		signal cache_LineOut_array : T_ARRAY;
	begin
		-- CPU Request Data Path
		lower_addr <= cpu_addr(LOWER_ADDR_BITS-1 downto 0);

		l0: for i in 0 to RATIO-1 generate
			cpu_wdata_wide((i+1)*CPU_DATA_BITS-1 downto i*CPU_DATA_BITS) <= cpu_wdata;

			cpu_wmask_wide((i+1)*CPU_DATA_BITS/8-1 downto i*CPU_DATA_BITS/8) <=
				-- synthesis translate_off
				(others => 'X') when is_x(lower_addr) else
				-- synthesis translate_on
				cpu_wmask when to_integer(lower_addr) = i else
				(others => '1');
		end generate l0;

		-- CPU Reply Data Path
		lower_addr_r <= lower_addr when rising_edge(clk); -- pipeline register

		l1: for i in 0 to RATIO-1 generate
			cache_LineOut_array(i) <= cache_LineOut((i+1)*CPU_DATA_BITS-1 downto i*CPU_DATA_BITS);
		end generate l1;

		cpu_rdata <=
			-- synthesis translate_off
			(others => 'X') when is_x(lower_addr_r) else
			-- synthesis translate_on
			cache_LineOut_array(to_integer(lower_addr_r));
	end generate gWider;

	-- Cache Request Data Path
	cache_Address   <= std_logic_vector(cpu_addr(cpu_addr'left downto LOWER_ADDR_BITS));
	cache_LineIn    <= mem_rdata       when fsm_cs = READING_MEM else cpu_wdata_wide;
	cache_WriteMask <= (others => '0') when fsm_cs = READING_MEM else cpu_wmask_wide;
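	-- Note: the write masks are active-low, i.e., a '1' bit masks the
	-- corresponding byte from being written (see gWider above). An all-zero
	-- mask during READING_MEM therefore writes the complete cache line
	-- fetched from memory.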

	-- These outputs can be fed from buffer registers, but this is not
	-- necessary because the cpu_* signals will typically be connected to a
	-- pipeline register. And even if this pipeline register is omitted, then the
	-- cache tag comparison will dominate the critical path.
	mem_write <= cpu_write;
	mem_addr  <= cpu_addr(cpu_addr'left downto LOWER_ADDR_BITS);
	mem_wdata <= cpu_wdata_wide;
	mem_wmask <= cpu_wmask_wide;

	-- FSM
	-- ===========================================================================
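	-- State transitions:
	--   READY       -> ACCESS_MEM  on any write (write-through) or a read miss
	--   ACCESS_MEM  -> READY       when mem_rdy is asserted and it is a write
	--   ACCESS_MEM  -> READING_MEM when mem_rdy is asserted and it is a read
	--   READING_MEM -> READY       when mem_rstb signals the read data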
	process(fsm_cs, cpu_req, cpu_write, cache_Hit, cache_Miss, mem_rdy, mem_rstb)
	begin
		-- Update state registers
		fsm_ns <= fsm_cs;

		-- Control signals for cache access
		cache_Request		 <= '0';
		cache_ReadWrite	 <= '-';
		cache_Invalidate <= '-';
		cache_Replace		 <= '0';

		-- Control / status signals for CPU and MEM side
		cpu_got <= '0';
		mem_req <= '0';

		case fsm_cs is
			when READY =>
				-- Ready for a new cache access.
				-- -----------------------------
				cache_Request		 <= to_x01(cpu_req);
				cache_ReadWrite	 <= to_x01(cpu_write); -- don't-care if there is no request
				cache_Invalidate <= '0';

				case ((cache_Hit and cpu_write) or cache_Miss) is
					when '1' =>	-- write successful but write-through, or cache miss
						fsm_ns  <= ACCESS_MEM;
					when '0' => -- read successful, or no request
						cpu_got <= to_x01(cpu_req);
					when others => -- invalid input
						fsm_ns  <= UNKNOWN;
						cpu_got <= 'X';
				end case;


			when ACCESS_MEM =>
				-- Access memory.
				-- --------------
				mem_req <= '1';
				case to_x01(mem_rdy) is
					when '1' => -- access granted
						case to_x01(cpu_write) is
							when '1'    => fsm_ns <= READY;   cpu_got <= '1'; -- write
							when '0'    => fsm_ns <= READING_MEM; -- read
							when others => fsm_ns <= UNKNOWN; cpu_got <= 'X'; -- invalid input
						end case;

					when '0' =>	null; -- still waiting
					when others => fsm_ns <= UNKNOWN; -- invalid input
				end case;


			when READING_MEM =>
				-- Wait for incoming read data and write it to cache.
				-- --------------------------------------------------
				cache_ReadWrite <= '1';

				case to_x01(mem_rstb) is
					when '1' => -- read data available
						fsm_ns        <= READY;
						cpu_got       <= '1'; -- cache access is complete now
						cache_Replace <= '1'; -- replace cache line
						-- The new data will be available on cache_LineOut in the following
						-- clock cycle.

					when '0' => null; -- still waiting
					when others => -- invalid input
						fsm_ns        <= UNKNOWN;
						cpu_got       <= 'X';
						cache_Replace <= 'X';
				end case;


			when UNKNOWN =>
				-- Catches invalid state transitions.
				-- ----------------------------------
				fsm_ns           <= UNKNOWN;
				cpu_got          <= 'X';
				cache_Request    <= 'X';
				cache_ReadWrite	 <= 'X';
				cache_Invalidate <= 'X';
				cache_Replace    <= 'X';
		end case;
	end process;

	process(clk)
	begin
		if rising_edge(clk) then
			case to_x01(rst) is
				when '1' =>
					fsm_cs <= READY;
				when '0' =>
					fsm_cs <= fsm_ns;
				when others =>
					fsm_cs <= UNKNOWN;
			end case;
		end if;
	end process;

end architecture rtl;