-- EMACS settings: -*-  tab-width: 2; indent-tabs-mode: t -*-
-- vim: tabstop=2:shiftwidth=2:noexpandtab
-- kate: tab-width 2; replace-tabs off; indent-width 2;
-- =============================================================================
-- Authors:				 	Martin Zabel
--
-- Entity:				 	Simple dual-port memory with write-first behavior.
--
-- Description:
-- 
-- Inferring / instantiating simple dual-port memory, with:
--
-- * single clock, clock enable,
-- * 1 read port plus 1 write port.
--
-- Command truth table:
--
-- == == ===============================
-- ce we Command
-- == == ===============================
-- 0  X  No operation
-- 1  0  Read only from memory
-- 1  1  Read from and Write to memory
-- == == ===============================
--
-- Both reading and writing are synchronous to the rising-edge of the clock.
-- Thus, when reading, the memory data will be output after the
-- clock edge, i.e., in the following clock cycle.
--
-- Mixed-Port Read-During-Write
--   When reading at the write address, the read value will be the new data,
--   a.k.a. "write-first behavior". Of course, the read is still synchronous,
--   i.e., the latency is still one clock cycle.
--
-- License:
-- =============================================================================
-- Copyright 2008-2015 Technische Universitaet Dresden - Germany
--										 Chair of VLSI-Design, Diagnostics and Architecture
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
--		http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- =============================================================================

library	ieee;
use			ieee.std_logic_1164.all;
use			ieee.numeric_std.all;

-- Interface of the simple dual-port RAM with write-first behavior:
-- one clock, one clock enable shared by both ports, one read port (ra -> q)
-- and one write port (wa, d, we).
entity ocram_sdp_wf is
	generic (
		A_BITS		: positive;                           -- number of address bits
		D_BITS		: positive;                           -- number of data bits
		FILENAME	: string		:= ""                     -- file-name for RAM initialization
	);
	port (
		clk : in  std_logic;                            -- clock
		ce  : in  std_logic;                            -- clock-enable
		we  : in  std_logic;                            -- write enable
		ra  : in  unsigned(A_BITS-1 downto 0);          -- read address
		wa  : in  unsigned(A_BITS-1 downto 0);          -- write address
		d   : in  std_logic_vector(D_BITS-1 downto 0);  -- data in
		q   : out std_logic_vector(D_BITS-1 downto 0)   -- data out
	);
end entity ocram_sdp_wf;


architecture rtl of ocram_sdp_wf is
	-- Implementation Notes:
	-- ---------------------
	--
	-- I have also checked a modified version of the unit `ocram_sp` with just a
	-- single clock and an asynchronous read like::
	--
	--   process(clk)
	--   begin
	--     if rising_edge(clk) then
	--       if ce = '1' then
	--         ra_r <= ra;
	--       end if;
	--     end if;
	--   end process;
	--
	--   q <= ram(to_integer(ra_r));
	--
	-- But the result from various FPGA synthesis tools was as follows:
	--
	-- * Altera Quartus 13.0: adds proper bypass-logic as expected.
	--
	-- * Lattice Synthesis Engine: adds proper bypass-logic, but there was an
	--   unnecessary multiplexer for the read address to mimic the read enable.
	--
	-- * XST 14.7: RAM is mapped to Block-RAM which has not the desired
	--   read-during-write behavior and also no bypass logic is added. XST adds
	--   also an unnecessary multiplexer for the read address to mimic the read
	--   enable.
	--
	--   Enforcing distributed RAM gives the desired behavior when synthesizing
	--   just this unit. But synthesis has failed in complex projects when
	--   KEEP_HIERARCHY was set to NO.
	--
	-- * Vivado 2016.2: RAM is mapped to Block-RAM which has not the desired
	--   read-during-write behavior and also no bypass logic is added. Vivado
	--   adds also an unnecessary multiplexer for the read address to mimic the
	--   read enable.
	--
	--   Enforcing distributed RAM gives the desired behavior when synthesizing
	--   just this unit. Synthesis results have not yet been checked for larger
	--   designs.
	--
	-- Thus, the solution below is to explicitly implement the bypass logic.


	signal wd_r  : std_logic_vector(d'range); -- write data
	signal fwd_r : std_logic;                 -- forward write data
	signal ram_q : std_logic_vector(q'range); -- RAM output

	-- Compares two addresses, returns 'X' if either ``a1`` or ``a2`` contains
	-- meta-values, otherwise returns '1' if ``a1 == a2`` is true else
	-- '0'. Returns 'X' even when the addresses contain '-' values, to signal an
	-- undefined outcome.
	--
	-- The meta-value check is simulation-only (excluded from synthesis by the
	-- translate_off/on pragmas). The operands are converted to
	-- std_logic_vector before calling is_x, so that the std_logic_1164
	-- overload is selected and no is_x overload for ``unsigned`` is required.
	function addr_equal(a1 : unsigned; a2 : unsigned) return X01 is
	begin
		-- synthesis translate_off
		if is_x(std_logic_vector(a1)) or is_x(std_logic_vector(a2)) then
			return 'X';
		end if;
		-- synthesis translate_on
		-- Weak levels ('L'/'H') are normalized to '0'/'1' before comparing.
		if to_x01(std_logic_vector(a1)) = to_x01(std_logic_vector(a2)) then
			return '1';
		end if;
		return '0';
	end function addr_equal;

begin
	-- Bypass registers: on every enabled rising clock edge, capture the write
	-- data and whether this cycle writes to the address being read. Both
	-- registers hold their value while ce = '0'.
	process(clk)
	begin
		if rising_edge(clk) then
			case to_x01(ce) is
				when '1' =>
					wd_r  <= to_x01(d);
					-- Forward only if the addresses match AND a write takes place.
					fwd_r <= addr_equal(ra, wa) and we;

				when '0' =>	null; -- keep previous state

				when others => -- X propagation in simulation
					wd_r  <= (others => 'X');
					fwd_r <= 'X';
			end case;
		end if;
	end process;

	-- Underlying simple dual-port RAM. Both of its ports are driven by the
	-- same clock and the same clock enable. Its own read-during-write result
	-- is not used when forwarding, because the multiplexer below replaces it
	-- with wd_r whenever fwd_r = '1'.
	ram_sdp: entity work.ocram_sdp
		generic map (
			A_BITS   => A_BITS,
			D_BITS   => D_BITS,
			FILENAME => FILENAME)
		port map (
			rclk => clk,
			rce  => ce,
			wclk => clk,
			wce  => ce,
			we   => we,
			ra   => ra,
			wa   => wa,
			d    => d,
			q    => ram_q);

	-- Output multiplexer implementing the write-first behavior: select the
	-- registered write data when forwarding, otherwise the RAM output.
	with fwd_r select q <=
		wd_r            when '1',
		ram_q           when '0',
		(others => 'X') when others; -- X propagation in simulation

end architecture rtl;