-- EMACS settings: -*- tab-width: 2; indent-tabs-mode: t -*-
-- vim: tabstop=2:shiftwidth=2:noexpandtab
-- kate: tab-width 2; replace-tabs off; indent-width 2;
-- =============================================================================
-- Authors: Thomas B. Preusser
--
-- Entity: arith_addw
--
-- Description:
--
-- Implements wide addition providing several options all based
-- on an adaptation of a carry-select approach.
--
-- References:
--
-- * Hong Diep Nguyen and Bogdan Pasca and Thomas B. Preusser:
-- FPGA-Specific Arithmetic Optimizations of Short-Latency Adders,
-- FPL 2011.
-- -> ARCH: AAM, CAI, CCA
-- -> SKIPPING: CCC
--
-- * Marcin Rogawski, Kris Gaj and Ekawat Homsirikamol:
-- A Novel Modular Adder for One Thousand Bits and More
-- Using Fast Carry Chains of Modern FPGAs, FPL 2014.
-- -> ARCH: PAI
-- -> SKIPPING: PPN_KS, PPN_BK
--
-- License:
-- =============================================================================
-- Copyright 2007-2015 Technische Universitaet Dresden - Germany
-- Chair of VLSI-Design, Diagnostics and Architecture
--
-- Licensed under the Apache License, Version 2.0 (the "License");
-- you may not use this file except in compliance with the License.
-- You may obtain a copy of the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS,
-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-- See the License for the specific language governing permissions and
-- limitations under the License.
-- =============================================================================
library IEEE;
use IEEE.std_logic_1164.all;
library PoC;
use PoC.utils.all;
use PoC.arith.all;
-- Wide adder: {cout, s} = a + b + cin.
--
-- The N-bit operands are partitioned into K blocks. The generics select how
-- the upper blocks are implemented (ARCH), how the operand bits are
-- distributed over the blocks (BLOCKING), and how the block carries are
-- forwarded (SKIPPING).
entity arith_addw is
	generic (
		N : positive;                       -- Operand Width
		K : positive;                       -- Block Count

		ARCH        : tArch     := AAM;     -- Architecture
		BLOCKING    : tBlocking := DFLT;    -- Blocking Scheme
		SKIPPING    : tSkipping := CCC;     -- Carry Skip Scheme
		P_INCLUSIVE : boolean   := false    -- Use Inclusive Propagate, i.e. c^1
	);
	port (
		a, b : in  std_logic_vector(N-1 downto 0);  -- Operands
		cin  : in  std_logic;                       -- Carry Input
		s    : out std_logic_vector(N-1 downto 0);  -- Sum
		cout : out std_logic                        -- Carry Output
	);
end entity;
use std.textio.all;
library IEEE;
use IEEE.numeric_std.all;
architecture rtl of arith_addw is

	-----------------------------------------------------------------------------
	-- Determine Block Boundaries

	-- Blocking scheme used for each architecture when BLOCKING = DFLT.
	type tBlocking_vector is array(tArch) of tBlocking;
	constant DEFAULT_BLOCKING : tBlocking_vector := (AAM => ASC, CAI => DESC, PAI => DESC, CCA => DESC);

	type integer_vector is array(natural range<>) of integer;

	-- Computes the block partitioning of the N operand bits into K blocks.
	--
	-- Returns a vector res(K-1 downto 0) of cumulative bit counts: block i
	-- covers the bit indices res(i-1) to res(i)-1 (block 0 starts at bit 0),
	-- and res(K-1) equals N.
	--
	--   FIX  - blocks of (nearly) equal width,
	--   ASC  - block widths growing towards the more significant blocks,
	--   DESC - block widths shrinking towards the more significant blocks.
	--
	-- Fails with severity failure if N is too small for the requested scheme
	-- or if the scheme is unknown. The function is impure because it prints
	-- the chosen partitioning to the standard output during simulation.
	impure function compute_blocks return integer_vector is
		variable bs  : tBlocking := BLOCKING;
		variable res : integer_vector(K-1 downto 0);
		variable l   : line;
	begin
		-- Resolve the architecture-specific default
		if bs = DFLT then
			bs := DEFAULT_BLOCKING(ARCH);
		end if;

		case bs is
			when FIX =>
				assert N >= K
					report "Cannot have more blocks than input bits."
					severity failure;
				-- Evenly spaced boundaries, rounded to the nearest bit
				for i in res'range loop
					res(i) := ((i+1)*N+K/2)/K;
				end loop;

			when ASC =>
				assert N-K*(K-1)/2 >= K
					report "Too few input bits to implement growing block sizes."
					severity failure;
				-- Even share of the bits remaining after reserving 0+1+...+(K-1)
				-- bits, plus i extra bits for block i (triangular sum (i+1)*i/2).
				for i in res'range loop
					res(i) := ((i+1)*(N-K*(K-1)/2)+K/2)/K + (i+1)*i/2;
				end loop;

			when DESC =>
				assert N-K*(K-1)/2 >= K
					report "Too few input bits to implement growing block sizes."
					severity failure;
				-- Mirror image of ASC: block i is i bits narrower than the even share.
				for i in res'range loop
					res(i) := ((i+1)*(N+K*(K-1)/2)+K/2)/K - (i+1)*i/2;
				end loop;

			when others =>
				report "Unknown blocking scheme: "&tBlocking'image(bs) severity failure;

		end case;

		-- Simulation-only report of the block widths, most significant block first
		--synthesis translate_off
		write(l, "Implementing "&integer'image(N)&"-bit wide adder: ARCH="&tArch'image(ARCH)&
						 ", BLOCKING="&tBlocking'image(bs)&'[');
		for i in K-1 downto 1 loop
			write(l, res(i)-res(i-1));
			write(l, ',');
		end loop;
		write(l, res(0));
		write(l, "], SKIPPING="&tSkipping'image(SKIPPING));
		writeline(output, l);
		--synthesis translate_on

		return res;
	end compute_blocks;

	constant BLOCKS : integer_vector(K-1 downto 0) := compute_blocks;

	-- Per-block carry interface of blocks 1 to K-1 (block 0 is the rightmost
	-- block and is handled within blkCore).
	signal g : std_logic_vector(K-1 downto 1);  -- Block Generate
	signal p : std_logic_vector(K-1 downto 1);  -- Block Propagate
	signal c : std_logic_vector(K-1 downto 1);  -- Block Carry-in

begin

	-----------------------------------------------------------------------------
	-- Rightmost Block + Carry Computation Core
	--
	-- Adds the rightmost block including cin and derives the carry inputs c of
	-- all other blocks as well as cout from their generate/propagate outputs.
	blkCore: block
		constant M : positive := BLOCKS(0);  -- Rightmost Block Width
	begin

		-- Carry Computation with Carry Chain
		--
		-- The block generate/propagate bits are stacked on top of the operand
		-- bits of the rightmost block so that a single addition computes the
		-- rightmost sum and ripples the carry through all block positions.
		genCCC: if SKIPPING = CCC generate
			signal x, y : unsigned(K+M-2 downto 0);
			signal z    : unsigned(K+M-1 downto 0);
		begin
			x <= unsigned(g & a(M-1 downto 0));
			genExcl: if not P_INCLUSIVE generate
				y <= unsigned((g or p) & b(M-1 downto 0));

				-- carry recovery for other blocks
				-- (sum bit = x xor y xor carry, where x xor y = p as an exclusive
				--  propagate is never set together with the generate)
				c <= std_logic_vector(z(K+M-2 downto M)) xor p;
			end generate genExcl;
			genIncl: if P_INCLUSIVE generate
				y <= unsigned(p & b(M-1 downto 0));

				-- carry recovery for other blocks
				-- (sum bit = x xor y xor carry, where x xor y = g xor p)
				c <= std_logic_vector(z(K+M-2 downto M)) xor (p xor g);
			end generate genIncl;
			z <= ('0' & x) + y + (0 to 0 => cin);

			-- output of rightmost block
			s(M-1 downto 0) <= std_logic_vector(z(M-1 downto 0));

			-- carry output
			cout <= z(z'left);
		end generate genCCC;

		-- LUT-based Carry Computations
		genLUT: if SKIPPING /= CCC generate
			signal z : unsigned(M downto 0);
		begin
			-- rightmost block; z(M) is its carry output
			z <= unsigned('0' & a(M-1 downto 0)) + unsigned(b(M-1 downto 0)) + (0 to 0 => cin);
			s(M-1 downto 0) <= std_logic_vector(z(M-1 downto 0));

			-- Plain linear LUT-based Carry Forwarding
			genPlain: if SKIPPING = PLAIN generate
				signal t : std_logic_vector(K downto 1);  -- t(i): carry out of block i-1
			begin
				-- carry forwarding
				t(1)          <= z(M);
				t(K downto 2) <= g or (p and c);
				c    <= t(K-1 downto 1);
				cout <= t(K);
			end generate genPlain;

			-- Kogge-Stone Parallel Prefix Network
			genPPN_KS: if SKIPPING = PPN_KS generate
				subtype tLevel is std_logic_vector(K-1 downto 0);
				type tLevels is array(natural range<>) of tLevel;
				-- NOTE(review): LEVELS is constrained to positive; if log2ceil(1)
				-- yields 0, K = 1 cannot be elaborated with this scheme - confirm
				-- against the definition of log2ceil.
				constant LEVELS : positive := log2ceil(K);
				signal pp, gg : tLevels(0 to LEVELS);
			begin
				-- carry forwarding
				-- Position 0 represents the rightmost block: its generate is the
				-- carry output z(M), its propagate is a don't-care.
				pp(0) <= p & 'X';
				gg(0) <= g & z(M);
				genLevels: for i in 1 to LEVELS generate
					constant D : positive := 2**(i-1);  -- Combination distance of this level
				begin
					-- Positions >= D are combined with the position D below,
					-- positions < D are passed through unchanged.
					pp(i) <= (pp(i-1)(K-1 downto D) and pp(i-1)(K-D-1 downto 0)) & pp(i-1)(D-1 downto 0);
					gg(i) <= (gg(i-1)(K-1 downto D) or (pp(i-1)(K-1 downto D) and gg(i-1)(K-D-1 downto 0))) & gg(i-1)(D-1 downto 0);
				end generate genLevels;
				c    <= gg(LEVELS)(K-2 downto 0);
				cout <= gg(LEVELS)(K-1);
			end generate genPPN_KS;

			-- Brent-Kung Parallel Prefix Network
			genPPN_BK: if SKIPPING = PPN_BK generate
				subtype tLevel is std_logic_vector(K-1 downto 0);
				type tLevels is array(natural range<>) of tLevel;
				-- NOTE(review): LEVELS is constrained to positive; if log2ceil(1)
				-- yields 0, K = 1 cannot be elaborated with this scheme - confirm
				-- against the definition of log2ceil.
				constant LEVELS : positive := log2ceil(K);
				signal pp, gg : tLevels(0 to 2*LEVELS-1);
			begin
				-- carry forwarding
				-- Position 0 represents the rightmost block: its generate is the
				-- carry output z(M), its propagate is a don't-care.
				pp(0) <= p & 'X';
				gg(0) <= g & z(M);

				-- Merge phase: combine pairs at doubling distances D = 1, 2, 4, ...
				genMerge: for i in 1 to LEVELS generate
					constant D : positive := 2**(i-1);
				begin
					genBits: for j in 0 to K-1 generate
						-- topmost position of each aligned group of 2*D positions
						genOp: if j mod (2*D) = 2*D-1 generate
							gg(i)(j) <= (pp(i-1)(j) and gg(i-1)(j-D)) or gg(i-1)(j);
							pp(i)(j) <= pp(i-1)(j) and pp(i-1)(j-D);
						end generate;
						-- all other positions are copied
						genCp: if j mod (2*D) /= 2*D-1 generate
							gg(i)(j) <= gg(i-1)(j);
							pp(i)(j) <= pp(i-1)(j);
						end generate;
					end generate;
				end generate genMerge;

				-- Spread phase: distribute the prefixes at halving distances
				-- D = 2**(LEVELS-2), ..., 2, 1
				genSpread: for i in LEVELS+1 to 2*LEVELS-1 generate
					constant D : positive := 2**(2*LEVELS-i-1);
				begin
					genBits: for j in 0 to K-1 generate
						genOp: if j > D and (j+1) mod (2*D) = D generate
							gg(i)(j) <= (pp(i-1)(j) and gg(i-1)(j-D)) or gg(i-1)(j);
							pp(i)(j) <= pp(i-1)(j) and pp(i-1)(j-D);
						end generate;
						genCp: if j <= D or (j+1) mod (2*D) /= D generate
							gg(i)(j) <= gg(i-1)(j);
							pp(i)(j) <= pp(i-1)(j);
						end generate;
					end generate;
				end generate genSpread;
				c    <= gg(gg'high)(K-2 downto 0);
				cout <= gg(gg'high)(K-1);
			end generate genPPN_BK;

		end generate genLUT;

	end block blkCore;

	-----------------------------------------------------------------------------
	-- Implement Carry-Select Variant
	--
	-- all but rightmost block, implementation architecture selected by ARCH
	--
	-- Each block i drives its generate g(i) and propagate p(i) from its operand
	-- slices and computes its sum slice from the carry input c(i).
	genBlocks: for i in 1 to K-1 generate
		-- Covered Index Range
		constant LO : positive := BLOCKS(i-1);   -- Low Bit Index
		constant HI : positive := BLOCKS(i)-1;   -- High Bit Index

		-- Internal Block Interface
		signal aa : unsigned(HI downto LO);
		signal bb : unsigned(HI downto LO);
		signal ss : unsigned(HI downto LO);

	begin

		-- Connect common block interface
		aa <= unsigned(a(HI downto LO));
		bb <= unsigned(b(HI downto LO));
		s(HI downto LO) <= std_logic_vector(ss);

		-- ARCH-specific Implementations

		--Add-Add-Multiplex
		-- Both candidate sums are computed; c(i) selects between them.
		genAAM: if ARCH = AAM generate
			signal s0 : unsigned(HI+1 downto LO);  -- Block Sum (cin=0)
			signal s1 : unsigned(HI+1 downto LO);  -- Block Sum (cin=1)
		begin
			s0 <= ('0' & aa) + bb;
			s1 <= ('0' & aa) + bb + 1;
			g(i) <= s0(HI+1);
			genExcl: if not P_INCLUSIVE generate
				-- carry out only if there is a carry in
				p(i) <= s1(HI+1) xor s0(HI+1);
			end generate genExcl;
			genIncl: if P_INCLUSIVE generate
				-- carry out under a carry in
				p(i) <= s1(HI+1);
			end generate genIncl;
			ss <= s0(HI downto LO) when c(i) = '0' else s1(HI downto LO);
		end generate genAAM;

		-- Compare-Add-Increment
		-- Propagate by operand comparison, sum for cin=0 incremented on c(i).
		genCAI: if ARCH = CAI generate
			signal s0 : unsigned(HI+1 downto LO);  -- Block Sum (cin=0)
		begin
			s0 <= ('0' & aa) + bb;
			g(i) <= s0(HI+1);
			genExcl: if not P_INCLUSIVE generate
				-- every bit position has exactly one operand bit set
				p(i) <= 'X' when Is_X(std_logic_vector(aa&bb)) else
								'1' when (aa xor bb) = (aa'range => '1') else '0';
			end generate genExcl;
			genIncl: if P_INCLUSIVE generate
				-- aa + bb reaches at least the all-ones value
				p(i) <= 'X' when Is_X(std_logic_vector(aa&bb)) else
								'1' when aa >= not bb else '0';
			end generate genIncl;
			ss <= s0(HI downto LO) when c(i) = '0' else s0(HI downto LO)+1;
		end generate genCAI;

		-- Propagate-Add-Increment
		-- Propagate derived from the sum for cin=0, which is incremented on c(i).
		genPAI: if ARCH = PAI generate
			signal s0 : unsigned(HI+1 downto LO);  -- Block Sum (cin=0)
		begin
			s0 <= ('0' & aa) + bb;
			g(i) <= s0(HI+1);
			genExcl: if not P_INCLUSIVE generate
				-- block sum is all ones
				p(i) <= 'X' when Is_X(std_logic_vector(s0)) else
								'1' when s0(HI downto LO) = (HI downto LO => '1') else '0';
			end generate genExcl;
			genIncl: if P_INCLUSIVE generate
				-- block sum is all ones or the block generates a carry anyway
				p(i) <= 'X' when Is_X(std_logic_vector(s0)) else
								'1' when s0(HI downto LO) = (HI downto LO => '1') else g(i);
			end generate genIncl;
			ss <= s0(HI downto LO) when c(i) = '0' else s0(HI downto LO)+1;
		end generate genPAI;

		-- Compare-Compare-Add
		-- Generate and propagate by operand comparison, one addition with c(i).
		genCCA: if ARCH = CCA generate
			-- aa + bb exceeds the all-ones value
			g(i) <= 'X' when Is_X(std_logic_vector(aa&bb)) else
							'1' when aa > not bb else '0';
			genExcl: if not P_INCLUSIVE generate
				-- every bit position has exactly one operand bit set
				p(i) <= 'X' when Is_X(std_logic_vector(aa&bb)) else
								'1' when (aa xor bb) = (aa'range => '1') else '0';
			end generate genExcl;
			genIncl: if P_INCLUSIVE generate
				-- aa + bb reaches at least the all-ones value
				p(i) <= 'X' when Is_X(std_logic_vector(aa&bb)) else
								'1' when aa >= not bb else '0';
			end generate genIncl;
			ss <= aa + bb + (0 to 0 => c(i));
		end generate genCCA;

	end generate genBlocks;

end architecture;