-- EMACS settings: -*-  tab-width: 2; indent-tabs-mode: t -*-
-- vim: tabstop=2:shiftwidth=2:noexpandtab
-- kate: tab-width 2; replace-tabs off; indent-width 2;
-- =============================================================================
-- Authors:         Martin Zabel
--
-- Entity:          Cache with :ref:`INT:PoC.Mem` interface on the "CPU" side
--
-- Description:
-- 
-- This unit provides a cache (:ref:`IP:cache_par2`) together
-- with a cache controller which reads / writes cache lines from / to memory.
-- It has two :ref:`INT:PoC.Mem` interfaces:
--
-- * one for the "CPU" side  (ports with prefix ``cpu_``), and
-- * one for the memory side (ports with prefix ``mem_``).
--
-- Thus, this unit can be placed into an already available memory path between
-- the CPU and the memory (controller). If you want to plug a cache into a
-- CPU pipeline, see :ref:`IP:cache_cpu`.
--
--
-- Configuration
-- *************
--
-- +--------------------+-----------------------------------------------------+
-- | Parameter          | Description                                         |
-- +====================+=====================================================+
-- | REPLACEMENT_POLICY | Replacement policy of embedded cache. For supported |
-- |                    | values see PoC.cache_replacement_policy.            |
-- +--------------------+-----------------------------------------------------+
-- | CACHE_LINES        | Number of cache lines.                              |
-- +--------------------+-----------------------------------------------------+
-- | ASSOCIATIVITY      | Associativity of embedded cache.                    |
-- +--------------------+-----------------------------------------------------+
-- | CPU_ADDR_BITS      | Number of address bits on the CPU side. Each address|
-- |                    | identifies one memory word as seen from the CPU.    |
-- |                    | Calculated from other parameters as described below.|
-- +--------------------+-----------------------------------------------------+
-- | CPU_DATA_BITS      | Width of the data bus (in bits) on the CPU side.    |
-- |                    | CPU_DATA_BITS must be divisible by 8.               |
-- +--------------------+-----------------------------------------------------+
-- | MEM_ADDR_BITS      | Number of address bits on the memory side. Each     |
-- |                    | address identifies one word in the memory.          |
-- +--------------------+-----------------------------------------------------+
-- | MEM_DATA_BITS      | Width of a memory word and of a cache line in bits. |
-- |                    | MEM_DATA_BITS must be divisible by CPU_DATA_BITS.   |
-- +--------------------+-----------------------------------------------------+
-- | OUTSTANDING_REQ    | Number of outstanding requests, see notes below.    |
-- +--------------------+-----------------------------------------------------+
--
-- If the CPU data-bus width is smaller than the memory data-bus width, then
-- the CPU needs additional address bits to identify one CPU data word inside a
-- memory word. Thus, the CPU address-bus width is calculated from::
--
--   CPU_ADDR_BITS=log2ceil(MEM_DATA_BITS/CPU_DATA_BITS)+MEM_ADDR_BITS
--
-- The write policy is: write-through, no-write-allocate.
--
-- The maximum throughput is one request per clock cycle, except for
-- ``OUTSTANDING_REQ = 1``.
--
-- If ``OUTSTANDING_REQ`` is:
--
-- * 1: then 1 request is buffered by a single register. To give a short
--   critical path (clock-to-output delay) for ``cpu_rdy``, the throughput is
--   degraded to one request per 2 clock cycles at maximum.
--
-- * 2: then 2 requests are buffered by :ref:`IP:fifo_glue`. This setting has
--   the lowest area requirements without degrading the performance.
--
-- * >2: then the requests are buffered by :ref:`IP:fifo_cc_got`. The number of
--   outstanding requests is rounded up to the next suitable value. This setting
--   is useful in applications with out-of-order execution (of other
--   operations). The CPU requests to the cache are always processed in-order.
--
--
-- Operation
-- *********
--
-- Memory accesses are always aligned to a word boundary. Each memory word
-- (and each cache line) consists of MEM_DATA_BITS bits.
-- For example if MEM_DATA_BITS=128:
--
-- * memory address 0 selects the bits   0..127 in memory,
-- * memory address 1 selects the bits 128..255 in memory, and so on.
--
-- Cache accesses are always aligned to a CPU word boundary. Each CPU word
-- consists of CPU_DATA_BITS bits. For example if CPU_DATA_BITS=32:
--
-- * CPU address 0 selects the bits   0.. 31 in memory word 0,
-- * CPU address 1 selects the bits  32.. 63 in memory word 0,
-- * CPU address 2 selects the bits  64.. 95 in memory word 0,
-- * CPU address 3 selects the bits  96..127 in memory word 0,
-- * CPU address 4 selects the bits   0.. 31 in memory word 1,
-- * CPU address 5 selects the bits  32.. 63 in memory word 1, and so on.
--
-- A synchronous reset must be applied even on an FPGA.
--
-- The interface is documented in detail :ref:`here <INT:PoC.Mem>`.
--
-- .. WARNING::
--
--    If the design is synthesized with Xilinx ISE / XST, then the synthesis
--    option "Keep Hierarchy" must be set to SOFT or TRUE.
--
-- SeeAlso:
--   :ref:`IP:cache_cpu`
--
-- License:
-- =============================================================================
-- Copyright 2016-2016 Technische Universitaet Dresden - Germany
--										 Chair of VLSI-Design, Diagnostics and Architecture
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--		http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- =============================================================================

library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

library poc;
use poc.utils.all;

-- Cache with a PoC.Mem interface on both the "CPU" side (cpu_*) and the
-- memory side (mem_*). The CPU address width is derived from the generics:
--   log2ceil(MEM_DATA_BITS/CPU_DATA_BITS) + MEM_ADDR_BITS
entity cache_mem is
	generic (
		REPLACEMENT_POLICY : string   := "LRU"; -- replacement policy of the embedded cache
		CACHE_LINES        : positive;          -- number of cache lines
		ASSOCIATIVITY      : positive;          -- associativity of the embedded cache
		CPU_DATA_BITS      : positive;          -- CPU data-bus width in bits, must be divisible by 8
		MEM_ADDR_BITS      : positive;          -- number of address bits on the memory side
		MEM_DATA_BITS      : positive;          -- memory word / cache line width in bits,
		                                        -- must be divisible by CPU_DATA_BITS
		OUTSTANDING_REQ    : positive := 2      -- number of buffered CPU requests:
		                                        -- 1 = single register, 2 = fifo_glue, >2 = fifo_cc_got
	);
	port (
		clk : in std_logic; -- clock
		rst : in std_logic; -- reset

		-- "CPU" side
		cpu_req   : in  std_logic; -- request strobe, stored in the request buffer
		cpu_write : in  std_logic; -- write flag of the request
		cpu_addr  : in  unsigned(log2ceil(MEM_DATA_BITS/CPU_DATA_BITS)+MEM_ADDR_BITS-1 downto 0); -- CPU word address
		cpu_wdata : in  std_logic_vector(CPU_DATA_BITS-1 downto 0); -- write data
		cpu_wmask : in  std_logic_vector(CPU_DATA_BITS/8-1 downto 0) := (others => '0'); -- write mask, one bit per byte
		cpu_rdy   : out std_logic; -- request buffer is not full
		cpu_rstb  : out std_logic; -- read strobe, qualifies cpu_rdata
		cpu_rdata : out std_logic_vector(CPU_DATA_BITS-1 downto 0); -- read data

		-- Memory side (connected directly to the embedded cache_cpu instance)
		mem_req   : out std_logic;
		mem_write : out std_logic;
		mem_addr  : out unsigned(MEM_ADDR_BITS-1 downto 0);
		mem_wdata : out std_logic_vector(MEM_DATA_BITS-1 downto 0);
		mem_wmask : out std_logic_vector(MEM_DATA_BITS/8-1 downto 0);
		mem_rdy   : in  std_logic;
		mem_rstb  : in  std_logic;
		mem_rdata : in  std_logic_vector(MEM_DATA_BITS-1 downto 0)
	);
end entity;

-- RTL implementation: a request buffer (selected by OUTSTANDING_REQ) in front
-- of an embedded cache_cpu instance. The buffer decouples cpu_rdy from the
-- acknowledge signal (int_got) of cache_cpu.
architecture rtl of cache_mem is
	constant CPU_ADDR_BITS : positive := log2ceil(MEM_DATA_BITS/CPU_DATA_BITS)+MEM_ADDR_BITS;

	-- signals to internal cache_cpu
	signal int_req   : std_logic;
	signal int_write : std_logic;
	signal int_addr  : unsigned(CPU_ADDR_BITS-1 downto 0);
	signal int_wdata : std_logic_vector(CPU_DATA_BITS-1 downto 0);
	signal int_wmask : std_logic_vector(CPU_DATA_BITS/8-1 downto 0);
	signal int_got   : std_logic;
	signal int_rdata : std_logic_vector(CPU_DATA_BITS-1 downto 0);

begin

	cache_cpu_inst: entity work.cache_cpu
		generic map (
			REPLACEMENT_POLICY => REPLACEMENT_POLICY,
			CACHE_LINES        => CACHE_LINES,
			ASSOCIATIVITY      => ASSOCIATIVITY,
			CPU_DATA_BITS      => CPU_DATA_BITS,
			MEM_ADDR_BITS      => MEM_ADDR_BITS,
			MEM_DATA_BITS      => MEM_DATA_BITS)
		port map (
			clk       => clk,
			rst       => rst,
			cpu_req   => int_req,
			cpu_write => int_write,
			cpu_addr  => int_addr,
			cpu_wdata => int_wdata,
			cpu_wmask => int_wmask,
			cpu_got   => int_got,
			cpu_rdata => int_rdata,
			mem_req   => mem_req,
			mem_write => mem_write,
			mem_addr  => mem_addr,
			mem_wdata => mem_wdata,
			mem_wmask => mem_wmask,
			mem_rdy   => mem_rdy,
			mem_rstb  => mem_rstb,
			mem_rdata => mem_rdata);

	-- read data is valid one clock cycle after int_got is asserted
	-- (registered; only read requests produce a strobe, cleared during reset)
	cpu_rstb  <= (not rst) and (not int_write) and int_got when rising_edge(clk);
	cpu_rdata <= int_rdata; -- already delayed by one clock cycle


	------------------------------------------------------------------------------
	-- One outstanding request: single-register buffer.
	g1: if OUTSTANDING_REQ = 1 generate
		signal cpu_req_r   : std_logic;
		signal cpu_write_r : std_logic;
		signal cpu_addr_r  : unsigned(CPU_ADDR_BITS-1 downto 0);
		signal cpu_wdata_r : std_logic_vector(CPU_DATA_BITS-1 downto 0);
		signal cpu_wmask_r : std_logic_vector(CPU_DATA_BITS/8-1 downto 0);
	begin
		-- cpu_rdy should have a short clock-to-output delay, but int_got has a large
		-- propagation delay. Thus, cpu_rdy must not depend on int_got.
		-- This single entry FIFO stores a valid request if cpu_req_r = '1',
		-- otherwise it is empty.
		process(clk)
		begin
			if rising_edge(clk) then
				-- store new request only if FIFO is empty
				case to_x01(cpu_req_r) is
					when '1' => null; -- FIFO is full
					when '0' =>
						-- payload is captured every cycle while empty; it only becomes
						-- meaningful once cpu_req_r is set by the state logic below
						cpu_write_r <= cpu_write;
						cpu_addr_r  <= cpu_addr;
						cpu_wdata_r <= cpu_wdata;
						cpu_wmask_r <= cpu_wmask;

					when others => -- just for simulation
						cpu_write_r <= 'X';
						cpu_addr_r  <= (others => 'X');
						cpu_wdata_r <= (others => 'X');
						cpu_wmask_r <= (others => 'X');
				end case;

				-- FIFO state logic
				case to_x01(rst) is
					when '1' =>
						cpu_req_r <= '0';
					when '0' =>
						cpu_req_r <= (cpu_req_r and not int_got) or -- keep if not yet acknowledged
						             (not cpu_req_r and cpu_req);   -- or new request when empty
					when others =>
						cpu_req_r <= 'X';
				end case;
			end if;
		end process;

		cpu_rdy   <= not cpu_req_r; -- ready when empty

		int_req   <= cpu_req_r;
		int_write <= cpu_write_r;
		int_addr  <= cpu_addr_r;
		int_wdata <= cpu_wdata_r;
		int_wmask <= cpu_wmask_r;
	end generate g1;


	------------------------------------------------------------------------------
	-- Two outstanding requests: buffered by fifo_glue.
	g2: if OUTSTANDING_REQ = 2 generate
		-- FIFO word layout (MSB to LSB): write mask & write data & address & write flag
		constant D_BITS : positive := CPU_DATA_BITS/8+CPU_DATA_BITS+CPU_ADDR_BITS+1;

		signal put   : std_logic;
		signal din   : std_logic_vector(D_BITS-1 downto 0);
		signal full  : std_logic;
		signal valid : std_logic;
		signal dout  : std_logic_vector(D_BITS-1 downto 0);
	begin
		req_fifo: entity work.fifo_glue
			generic map (
				D_BITS => D_BITS)
			port map (
				clk => clk,
				rst => rst,
				put => put,
				di  => din,
				ful => full,
				vld => valid,
				do  => dout,
				got => int_got);

		-- pack the CPU request into one FIFO word
		din <= cpu_wmask & cpu_wdata & std_logic_vector(cpu_addr) & (0 downto 0 => cpu_write);
		put <= cpu_req;

		-- unpack the FIFO output towards cache_cpu
		int_req   <= valid;
		int_write <= dout(0);
		int_addr  <= unsigned(dout(CPU_ADDR_BITS+1-1 downto 1));
		int_wdata <= dout(CPU_DATA_BITS+CPU_ADDR_BITS+1-1 downto CPU_ADDR_BITS+1);
		int_wmask <= dout(CPU_DATA_BITS/8+CPU_DATA_BITS+CPU_ADDR_BITS+1-1 downto CPU_DATA_BITS+CPU_ADDR_BITS+1);

		cpu_rdy <= not full;
	end generate g2;


	------------------------------------------------------------------------------
	-- More than two outstanding requests: buffered by fifo_cc_got.
	gt2: if OUTSTANDING_REQ > 2 generate
		-- FIFO word layout (MSB to LSB): write mask & write data & address & write flag
		constant D_BITS : positive := CPU_DATA_BITS/8+CPU_DATA_BITS+CPU_ADDR_BITS+1;

		signal put   : std_logic;
		signal din   : std_logic_vector(D_BITS-1 downto 0);
		signal full  : std_logic;
		signal valid : std_logic;
		signal dout  : std_logic_vector(D_BITS-1 downto 0);
	begin
		req_fifo: entity work.fifo_cc_got
			generic map (
				D_BITS         => D_BITS,
				MIN_DEPTH      => OUTSTANDING_REQ,
				DATA_REG       => false, -- otherwise OUTPUT_REG has no effect
				STATE_REG      => true,
				OUTPUT_REG     => true) -- extra register required for optimal synthesis on Altera FPGAs
			port map (
				rst       => rst,
				clk       => clk,
				put       => put,
				din       => din,
				full      => full,
				estate_wr => open,
				got       => int_got,
				dout      => dout,
				valid     => valid,
				fstate_rd => open);

		-- pack the CPU request into one FIFO word
		din <= cpu_wmask & cpu_wdata & std_logic_vector(cpu_addr) & (0 downto 0 => cpu_write);
		put <= cpu_req;

		-- unpack the FIFO output towards cache_cpu
		int_req   <= valid;
		int_write <= dout(0);
		int_addr  <= unsigned(dout(CPU_ADDR_BITS+1-1 downto 1));
		int_wdata <= dout(CPU_DATA_BITS+CPU_ADDR_BITS+1-1 downto CPU_ADDR_BITS+1);
		int_wmask <= dout(CPU_DATA_BITS/8+CPU_DATA_BITS+CPU_ADDR_BITS+1-1 downto CPU_DATA_BITS+CPU_ADDR_BITS+1);

		cpu_rdy <= not full;
	end generate gt2;
end architecture rtl;