// File: Q_srl.v (11.21 KiB)
// Commit note (authored by Yaman Umuroglu): this is essential for the
// streaming weights to work correctly due to the depth of the mem addr gen
// pipeline.
// original source:
// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
// Copyright (c) 1999 The Regents of the University of California
// Copyright (c) 2010 The Regents of the University of Pennsylvania
// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
// Copyright (c) 2020 Xilinx
//
// Permission to use, copy, modify, and distribute this software and
// its documentation for any purpose, without fee, and without a
// written agreement is hereby granted, provided that the above copyright
// notice and this paragraph and the following two paragraphs appear in
// all copies.
//
// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
// SUCH DAMAGE.
//
// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
//
// Q_srl_oreg3_prefull_SIMPLE.v
//
// - In-page queue with parameterizable depth, bit width
// - Stream I/O is triple (data, valid, back-pressure),
// with EOS concatenated into the data
// - Flow control for input & output is combinationally decoupled
// - 2 <= depth <= 256
// * (depth >= 2) is required to decouple I/O flow control,
// where empty => no produce, full => no consume,
// and depth 1 would ping-pong between the two at half rate
// * (depth <= 256) can be modified
// by changing ''synthesis loop_limit X'' below
// and changing ''addrwidth'' or its log computation
// - 1 <= width
// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
// plus output register (for fast output)
// - Queue addressing is done by ''addr'' up-down counter
// - Queue fullness is checked by comparator (addr==depth)
// - Queue fullness is pre-computed for next cycle
// - Queue input back-pressure is pre-computed for next cycle
// - Queue output valid (state!=state_empty) is pre-computed for next cycle
// (necessary since SRL data output reg requires non-boolean state)
// - FSM has 3 states (empty, one, more)
// - When empty, continue to emit most recently emitted value (for debugging)
//
// - Queue slots used = / (state==state_empty) ? 0
// | (state==state_one) ? 1
// \ (state==state_more) ? addr+2
// - Queue slots used <= depth
// - Queue slots remaining = depth - used
// = / (state==state_empty) ? depth
// | (state==state_one) ? depth-1
// \ (state==state_more) ? depth-2-addr
//
// - Synplify 7.1 / 8.0
// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05
`ifdef Q_srl
`else
`define Q_srl
// Q_srl
//
// Synchronous queue (FIFO) with valid / back-pressure stream handshaking.
// Storage is a shift-register array (srl) feeding a dedicated output
// register (srlo).  The occupancy is tracked by a 3-state FSM
// (empty / one / more) plus an up-down read address (addr) into srl.
//
// Parameters:
//   depth     - greatest number of items in the queue (2 <= depth <= 256)
//   width     - width in bits of i_d / o_d
//   addrwidth - ceil(log2(depth)); derived from depth, not meant to be set
//
// Ports:
//   clock, reset - rising-edge clock; synchronous, active-high reset
//   i_d, i_v     - input data and input valid
//   i_b          - input back-pressure (registered); high while the queue
//                  is full, and also forced high while reset is asserted
//   o_d, o_v     - output data (srlo) and output valid (registered)
//   o_b          - output back-pressure from the consumer
//   count        - number of items currently held (0 .. depth)
//
// Handshake as implemented by the FSM below: an input item is taken in a
// cycle where i_v is high and the queue is not full; the current output
// item is retired in a cycle where the queue is non-empty and o_b is low.
module Q_srl (clock, reset, i_d, i_v, i_b, o_d, o_v, o_b, count);

   parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
   parameter width = 16;   // - width of data (i_d, o_d)

   // - Legacy macro form of the log2 computation.  It is not referenced in
   //   this module (addrwidth below repeats the expression inline), but the
   //   definition is kept so the set of defined macros is unchanged.
   `define LOG2 (  (((depth))     ==0) ? 0    /* - depth==0   LOG2=0 */ \
                 : (((depth-1)>>0)==0) ? 0    /* - depth<=1   LOG2=0 */ \
                 : (((depth-1)>>1)==0) ? 1    /* - depth<=2   LOG2=1 */ \
                 : (((depth-1)>>2)==0) ? 2    /* - depth<=4   LOG2=2 */ \
                 : (((depth-1)>>3)==0) ? 3    /* - depth<=8   LOG2=3 */ \
                 : (((depth-1)>>4)==0) ? 4    /* - depth<=16  LOG2=4 */ \
                 : (((depth-1)>>5)==0) ? 5    /* - depth<=32  LOG2=5 */ \
                 : (((depth-1)>>6)==0) ? 6    /* - depth<=64  LOG2=6 */ \
                 : (((depth-1)>>7)==0) ? 7    /* - depth<=128 LOG2=7 */ \
                 :                       8)   /* - depth<=256 LOG2=8 */

   // parameter addrwidth = LOG2;             // - width of queue addr

   parameter addrwidth =
                (  (((depth))     ==0) ? 0    // - depth==0   LOG2=0
                 : (((depth-1)>>0)==0) ? 0    // - depth<=1   LOG2=0
                 : (((depth-1)>>1)==0) ? 1    // - depth<=2   LOG2=1
                 : (((depth-1)>>2)==0) ? 2    // - depth<=4   LOG2=2
                 : (((depth-1)>>3)==0) ? 3    // - depth<=8   LOG2=3
                 : (((depth-1)>>4)==0) ? 4    // - depth<=16  LOG2=4
                 : (((depth-1)>>5)==0) ? 5    // - depth<=32  LOG2=5
                 : (((depth-1)>>6)==0) ? 6    // - depth<=64  LOG2=6
                 : (((depth-1)>>7)==0) ? 7    // - depth<=128 LOG2=7
                 :                       8)   // - depth<=256 LOG2=8
                 ;

   input                  clock;
   input                  reset;

   input  [width-1:0]     i_d;     // - input  stream data (concat data + eos)
   input                  i_v;     // - input  stream valid
   output                 i_b;     // - input  stream back-pressure

   output [width-1:0]     o_d;     // - output stream data (concat data + eos)
   output                 o_v;     // - output stream valid
   input                  o_b;     // - output stream back-pressure

   output [addrwidth:0]   count;   // - output number of elems in queue

   // - Naming convention: a trailing underscore marks the combinational
   //   "next" value that is registered into the same-named signal.

   reg    [addrwidth-1:0] addr, addr_, a_;  // - SRL16 read address (addr),
                                            //   its next value (addr_), and
                                            //   the shift-loop index (a_)
   reg                    shift_en_;        // - SRL16 shift enable
   reg    [width-1:0]     srl [depth-2:0];  // - SRL16 memory
   reg                    shift_en_o_;      // - SRLO  load enable
   reg    [width-1:0]     srlo_, srlo       // - SRLO  output reg
                          /* synthesis syn_allow_retiming=0 */ ;

   parameter state_empty = 2'd0;   // - state empty : o_v=0 o_d=UNDEFINED
   parameter state_one   = 2'd1;   // - state one   : o_v=1 o_d=srlo
   parameter state_more  = 2'd2;   // - state more  : o_v=1 o_d=srlo
                                   //   #items held (srlo + srl) = addr+2

   reg    [1:0]           state, state_;    // - state register

   wire                   addr_full_;  // - true iff NEXT cycle is state_more
                                       //   with addr==depth-2 (queue full)
   reg                    addr_full;   // - registered addr_full_
   wire                   addr_zero_;  // - true iff addr==0 (current cycle)
   wire                   o_v_reg_;    // - true iff NOT state_empty on NEXT cycle
   reg                    o_v_reg      // - true iff NOT state_empty
                          /* synthesis syn_allow_retiming=0 */ ;
   wire                   i_b_reg_;    // - true iff full on NEXT cycle
   reg                    i_b_reg      // - true iff full (also 1 during reset)
                          /* synthesis syn_allow_retiming=0 */ ;

   assign addr_full_ = (state_==state_more) && (addr_==depth-2);
                                             // - queue full
   assign addr_zero_ = (addr==0);            // - queue contains 2 (or 1,0)
   assign o_v_reg_   = (state_!=state_empty); // - output valid if non-empty
   assign i_b_reg_   = addr_full_;           // - input bp if full
   assign o_d        = srlo;                 // - output data from queue
   assign o_v        = o_v_reg;              // - output valid if non-empty
   assign i_b        = i_b_reg;              // - input bp if full

   // - occupancy: 0 when empty, 1 when only srlo holds data, else addr+2
   assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));

   // - ''always'' block with both FFs and SRL16 does not work,
   //   since FFs need reset but SRL16 does not

   always @(posedge clock) begin   // - seq always: FFs
      if (reset) begin
         state     <= state_empty;
         addr      <= 0;
         addr_full <= 0;
         o_v_reg   <= 0;
         i_b_reg   <= 1;           // - back-pressure held high during reset
      end
      else begin
         state     <= state_;
         addr      <= addr_;
         addr_full <= addr_full_;
         o_v_reg   <= o_v_reg_;
         i_b_reg   <= i_b_reg_;
      end
   end // always @ (posedge clock)

   always @(posedge clock) begin   // - seq always: srlo
      // - infer enabled output reg at end of shift chain
      // - input first element from i_d, all subsequent elements from SRL16
      if (reset) begin
         srlo <= 0;
      end
      else begin
         if (shift_en_o_) begin
            srlo <= srlo_;
         end
      end
   end // always @ (posedge clock)

   always @(posedge clock) begin   // - seq always: srl
      // - infer enabled SRL16E from shifting srl array
      // - no reset capability;  srl[] contents undefined on reset
      if (shift_en_) begin
         // synthesis loop_limit 256
         for (a_=depth-2; a_>0; a_=a_-1) begin
            srl[a_] <= srl[a_-1];
         end
         srl[0] <= i_d;
      end
   end // always @ (posedge clock)

   // - Next-state / control logic.
   // - Blocking assignments are used because this block is purely
   //   combinational; every output gets a default first so no latch is
   //   inferred, and 'bx marks values that are don't-care in a branch.
   always @* begin                 // - combi always
      srlo_       = 'bx;
      shift_en_o_ = 1'bx;
      shift_en_   = 1'bx;
      addr_       = 'bx;
      state_      = 2'bx;
      case (state)

        state_empty: begin         // - (empty, will not produce)
           if (i_v) begin          // - empty & i_v => consume
              srlo_       = i_d;   // - first item goes straight to srlo
              shift_en_o_ = 1;
              shift_en_   = 1'bx;
              addr_       = 0;
              state_      = state_one;
           end
           else begin              // - empty & !i_v => idle
              srlo_       = 'bx;
              shift_en_o_ = 0;
              shift_en_   = 1'bx;
              addr_       = 0;
              state_      = state_empty;
           end
        end

        state_one: begin           // - (contains one)
           if (i_v && o_b) begin   // - one & i_v & o_b => consume
              srlo_       = 'bx;
              shift_en_o_ = 0;
              shift_en_   = 1;     // - second item goes into srl[0]
              addr_       = 0;
              state_      = state_more;
           end
           else if (i_v && !o_b) begin   // - one & i_v & !o_b => cons+prod
              srlo_       = i_d;   // - new item replaces the retired one
              shift_en_o_ = 1;
              shift_en_   = 1;
              addr_       = 0;
              state_      = state_one;
           end
           else if (!i_v && o_b) begin   // - one & !i_v & o_b => idle
              srlo_       = 'bx;
              shift_en_o_ = 0;
              shift_en_   = 1'bx;
              addr_       = 0;
              state_      = state_one;
           end
           else if (!i_v && !o_b) begin  // - one & !i_v & !o_b => produce
              srlo_       = 'bx;
              shift_en_o_ = 0;
              shift_en_   = 1'bx;
              addr_       = 0;
              state_      = state_empty;
           end
        end // case: state_one

        state_more: begin          // - (contains more than one)
           if (addr_full || (depth==2)) begin
                                   // - (full, will not consume)
                                   // - (full here if depth==2)
              if (o_b) begin       // - full & o_b => idle
                 srlo_       = 'bx;
                 shift_en_o_ = 0;
                 shift_en_   = 0;
                 addr_       = addr;
                 state_      = state_more;
              end
              else begin           // - full & !o_b => produce
                 srlo_       = srl[addr];
                 shift_en_o_ = 1;
                 shift_en_   = 0;
//               addr_       = addr-1;
//               state_      = state_more;
                 // - addr==0 means srl held one item, so drop to state_one
                 addr_       = addr_zero_ ? 0         : addr-1;
                 state_      = addr_zero_ ? state_one : state_more;
              end
           end
           else begin              // - (mid: neither empty nor full)
              if (i_v && o_b) begin     // - mid & i_v & o_b => consume
                 srlo_       = 'bx;
                 shift_en_o_ = 0;
                 shift_en_   = 1;
                 addr_       = addr+1;
                 state_      = state_more;
              end
              else if (i_v && !o_b) begin    // - mid & i_v & !o_b => cons+prod
                 srlo_       = srl[addr];
                 shift_en_o_ = 1;
                 shift_en_   = 1;
                 addr_       = addr;   // - one in, one out: occupancy same
                 state_      = state_more;
              end
              else if (!i_v && o_b) begin    // - mid & !i_v & o_b => idle
                 srlo_       = 'bx;
                 shift_en_o_ = 0;
                 shift_en_   = 0;
                 addr_       = addr;
                 state_      = state_more;
              end
              else if (!i_v && !o_b) begin   // - mid & !i_v & !o_b => produce
                 srlo_       = srl[addr];
                 shift_en_o_ = 1;
                 shift_en_   = 0;
                 addr_       = addr_zero_ ? 0         : addr-1;
                 state_      = addr_zero_ ? state_one : state_more;
              end
           end // else: !if(addr_full)
        end // case: state_more

        default: begin
           srlo_       = 'bx;
           shift_en_o_ = 1'bx;
           shift_en_   = 1'bx;
           addr_       = 'bx;
           state_      = 2'bx;
        end // case: default

      endcase // case(state)
   end // always @ *

endmodule // Q_srl
`endif // `ifdef Q_srl