diff --git a/finn-rtllib/memstream/hdl/Q_srl.v b/finn-rtllib/memstream/hdl/Q_srl.v
new file mode 100644
index 0000000000000000000000000000000000000000..80b015f6d4eb69df36831b25262cda3539ac8ae9
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/Q_srl.v
@@ -0,0 +1,325 @@
+// original source:
+// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
+
+
+// Copyright (c) 1999 The Regents of the University of California
+// Copyright (c) 2010 The Regents of the University of Pennsylvania
+// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
+// Copyright (c) 2020 Xilinx
+//
+// Permission to use, copy, modify, and distribute this software and
+// its documentation for any purpose, without fee, and without a
+// written agreement is hereby granted, provided that the above copyright
+// notice and this paragraph and the following two paragraphs appear in
+// all copies.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
+// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
+// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+//
+
+// Q_srl_oreg3_prefull_SIMPLE.v
+//
+//  - In-page queue with parameterizable depth, bit width
+//  - Stream I/O is triple (data, valid, back-pressure),
+//      with EOS concatenated into the data
+//  - Flow control for input & output is combinationally decoupled
+//  - 2 <= depth <= 256
+//      * (depth >= 2)  is required to decouple I/O flow control,
+//          where empty => no produce,  full => no consume,
+//          and depth 1 would ping-pong between the two at half rate
+//      * (depth <= 256) can be modified
+//           by changing ''synthesis loop_limit X'' below
+//          and changing ''addrwidth'' or its log computation
+//  - 1 <= width
+//  - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
+//      plus output register (for fast output)
+//  - Queue addressing is done by ''addr'' up-down counter
+//  - Queue fullness is checked by comparator (addr==depth-2)
+//  - Queue fullness                           is pre-computed for next cycle
+//  - Queue input back-pressure                is pre-computed for next cycle
+//  - Queue output valid (state!=state_empty)  is pre-computed for next cycle
+//      (necessary since SRL data output reg requires non-boolean state)
+//  - FSM has 3 states (empty, one, more)
+//  - When empty, continue to emit most recently emitted value (for debugging)
+//
+//  - Queue slots used      = / (state==state_empty) ? 0
+//                            | (state==state_one)   ? 1
+//                            \ (state==state_more)  ? addr+2
+//  - Queue slots used     <=  depth
+//  - Queue slots remaining =  depth - used
+//                          = / (state==state_empty) ? depth
+//                            | (state==state_one)   ? depth-1
+//                            \ (state==state_more)  ? depth-2-addr
+//
+//  - Synplify 7.1 / 8.0
+//  - Eylon Caspi,  9/11/03, 8/18/04, 3/29/05
+
+
+`ifdef  Q_srl
+`else
+`define Q_srl
+
+
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count);
+
+   parameter depth = 16;   // - greatest #items in queue  (2 <= depth <= 256)
+   parameter width = 16;   // - width of data (i_d, o_d)
+
+   `define LOG2 (  (((depth))     ==0) ? 0	/* - depth==0   LOG2=0 */ \
+		 : (((depth-1)>>0)==0) ? 0	/* - depth<=1   LOG2=0 */ \
+		 : (((depth-1)>>1)==0) ? 1	/* - depth<=2   LOG2=1 */ \
+		 : (((depth-1)>>2)==0) ? 2	/* - depth<=4   LOG2=2 */ \
+		 : (((depth-1)>>3)==0) ? 3	/* - depth<=8   LOG2=3 */ \
+		 : (((depth-1)>>4)==0) ? 4	/* - depth<=16  LOG2=4 */ \
+		 : (((depth-1)>>5)==0) ? 5	/* - depth<=32  LOG2=5 */ \
+		 : (((depth-1)>>6)==0) ? 6	/* - depth<=64  LOG2=6 */ \
+		 : (((depth-1)>>7)==0) ? 7	/* - depth<=128 LOG2=7 */ \
+		 :                       8)	/* - depth<=256 LOG2=8 */
+
+// parameter addrwidth = LOG2;			// - width of queue addr
+
+   parameter addrwidth =
+		(  (((depth))     ==0) ? 0	// - depth==0   LOG2=0
+		 : (((depth-1)>>0)==0) ? 0	// - depth<=1   LOG2=0
+		 : (((depth-1)>>1)==0) ? 1	// - depth<=2   LOG2=1
+		 : (((depth-1)>>2)==0) ? 2	// - depth<=4   LOG2=2
+		 : (((depth-1)>>3)==0) ? 3	// - depth<=8   LOG2=3
+		 : (((depth-1)>>4)==0) ? 4	// - depth<=16  LOG2=4
+		 : (((depth-1)>>5)==0) ? 5	// - depth<=32  LOG2=5
+		 : (((depth-1)>>6)==0) ? 6	// - depth<=64  LOG2=6
+		 : (((depth-1)>>7)==0) ? 7	// - depth<=128 LOG2=7
+		 :                       8)	// - depth<=256 LOG2=8
+		 ;
+
+   input     clock;
+   input     reset;
+
+   input  [width-1:0] i_d;	// - input  stream data (concat data + eos)
+   input              i_v;	// - input  stream valid
+   output             i_r;	// - input  stream ready
+   wire               i_b;  // - input  stream back-pressure
+
+   output [width-1:0] o_d;	// - output stream data (concat data + eos)
+   output             o_v;	// - output stream valid
+   input              o_r;	// - output stream ready
+   wire               o_b;	// - output stream back-pressure
+
+   output [addrwidth:0] count;  // - output number of elems in queue
+
+   reg    [addrwidth-1:0] addr, addr_, a_;		// - SRL16 address
+							//     for data output
+   reg 			  shift_en_;			// - SRL16 shift enable
+   reg    [width-1:0] 	  srl [depth-2:0];		// - SRL16 memory
+   reg 			  shift_en_o_;			// - SRLO  shift enable
+   reg    [width-1:0] 	  srlo_, srlo			// - SRLO  output reg
+			  /* synthesis syn_allow_retiming=0 */ ;
+
+   parameter state_empty = 2'd0;    // - state empty : o_v=0 o_d=UNDEFINED
+   parameter state_one   = 2'd1;    // - state one   : o_v=1 o_d=srlo
+   parameter state_more  = 2'd2;    // - state more  : o_v=1 o_d=srlo
+				    //     #items in srl = addr+2
+
+   reg [1:0] state, state_;	    // - state register
+
+   wire      addr_full_;	    // - true iff state_more && addr==depth-2 on NEXT cycle
+   reg       addr_full; 	    // - true iff state_more && addr==depth-2 (queue full)
+   wire      addr_zero_;	    // - true iff addr==0
+   wire      o_v_reg_;		    // - true iff !state_empty  on NEXT cycle
+   reg       o_v_reg  		    // - true iff !state_empty  (drives o_v)
+	     /* synthesis syn_allow_retiming=0 */ ;
+   wire      i_b_reg_;		    // - true iff full          on NEXT cycle
+   reg       i_b_reg  		    // - true iff full (forced to 1 by reset; drives i_b)
+	     /* synthesis syn_allow_retiming=0 */ ;
+
+   assign addr_full_ = (state_==state_more) && (addr_==depth-2);
+						// - queue full
+   assign addr_zero_ = (addr==0);		// - queue contains 2 (or 1,0)
+   assign o_v_reg_   = (state_!=state_empty);	// - output valid if non-empty
+   assign i_b_reg_   = addr_full_;		// - input bp if full
+   assign o_d = srlo;				// - output data from queue
+   assign o_v = o_v_reg;			// - output valid if non-empty
+   assign i_b = i_b_reg;			// - input bp if full
+
+   assign i_r = !i_b;
+   assign o_b = !o_r;
+
+   assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
+
+   // - ''always'' block with both FFs and SRL16 does not work,
+   //      since FFs need reset but SRL16 does not
+
+   always @(posedge clock) begin	// - seq always: FFs (synchronous reset)
+      if (reset) begin
+	 state     <= state_empty;	// - queue starts out empty
+	 addr      <= 0;
+         addr_full <= 0;
+	 o_v_reg   <= 0;		// - o_v=0: nothing to output after reset
+	 i_b_reg   <= 1;		// - i_b=1, so i_r=0: refuse input after reset
+      end
+      else begin			// - latch the values pre-computed for next cycle
+	 state     <= state_;
+	 addr      <= addr_;
+         addr_full <= addr_full_;
+	 o_v_reg   <= o_v_reg_;
+	 i_b_reg   <= i_b_reg_;
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin	// - seq always: srlo
+      // - infer enabled output reg at end of shift chain
+      // - input first element from i_d, all subsequent elements from SRL16
+      if (reset) begin
+	 srlo <= 0;
+      end
+      else begin
+	 if (shift_en_o_) begin
+	    srlo <= srlo_;
+	 end
+      end
+   end // always @ (posedge clock)
+
+   always @(posedge clock) begin			// - seq always: srl
+      // - infer enabled SRL16E from shifting srl array
+      // - no reset capability;  srl[] contents undefined on reset
+      if (shift_en_) begin
+	 // synthesis loop_limit 256
+	 for (a_=depth-2; a_>0; a_=a_-1) begin
+	    srl[a_] <= srl[a_-1];
+	 end
+	 srl[0] <= i_d;
+      end
+   end // always @ (posedge clock)
+
+   always @* begin					// - combi always
+        srlo_       <=  'bx;
+        shift_en_o_ <= 1'bx;
+        shift_en_   <= 1'bx;
+        addr_       <=  'bx;
+        state_      <= 2'bx;
+      case (state)
+
+	state_empty: begin		    // - (empty, will not produce)
+	      if (i_v) begin		    // - empty & i_v => consume
+		 srlo_       <= i_d;
+		 shift_en_o_ <= 1;
+		 shift_en_   <= 1'bx;
+		 addr_       <= 0;
+		 state_      <= state_one;
+	      end
+	      else	begin		    // - empty & !i_v => idle
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 1'bx;
+		 addr_       <= 0;
+		 state_      <= state_empty;
+	      end
+	end
+
+	state_one: begin		    // - (contains one)
+	      if (i_v && o_b) begin	    // - one & i_v & o_b => consume
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 1;
+		 addr_       <= 0;
+		 state_      <= state_more;
+	      end
+	      else if (i_v && !o_b) begin   // - one & i_v & !o_b => cons+prod
+		 srlo_       <= i_d;
+		 shift_en_o_ <= 1;
+		 shift_en_   <= 1;
+		 addr_       <= 0;
+		 state_      <= state_one;
+	      end
+	      else if (!i_v && o_b) begin   // - one & !i_v & o_b => idle
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 1'bx;
+		 addr_       <= 0;
+		 state_      <= state_one;
+	      end
+	      else if (!i_v && !o_b) begin  // - one & !i_v & !o_b => produce
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 1'bx;
+		 addr_       <= 0;
+		 state_      <= state_empty;
+	      end
+	end // case: state_one
+
+	state_more: begin		    // - (contains more than one)
+	   if (addr_full || (depth==2)) begin
+					    // - (full, will not consume)
+					    // - (full here if depth==2)
+	      if (o_b) begin		    // - full & o_b => idle
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 0;
+		 addr_       <= addr;
+		 state_      <= state_more;
+	      end
+	      else begin		    // - full & !o_b => produce
+		 srlo_       <= srl[addr];
+		 shift_en_o_ <= 1;
+		 shift_en_   <= 0;
+//		 addr_       <= addr-1;
+//		 state_      <= state_more;
+		 addr_       <= addr_zero_ ? 0         : addr-1;
+		 state_      <= addr_zero_ ? state_one : state_more;
+	      end
+	   end
+	   else begin			    // - (mid: neither empty nor full)
+	      if (i_v && o_b) begin	    // - mid & i_v & o_b => consume
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 1;
+		 addr_       <= addr+1;
+		 state_      <= state_more;
+	      end
+	      else if (i_v && !o_b) begin   // - mid & i_v & !o_b => cons+prod
+		 srlo_       <= srl[addr];
+		 shift_en_o_ <= 1;
+		 shift_en_   <= 1;
+		 addr_       <= addr;
+		 state_      <= state_more;
+	      end
+	      else if (!i_v && o_b) begin   // - mid & !i_v & o_b => idle
+		 srlo_       <= 'bx;
+		 shift_en_o_ <= 0;
+		 shift_en_   <= 0;
+		 addr_       <= addr;
+		 state_      <= state_more;
+	      end
+	      else if (!i_v && !o_b) begin  // - mid & !i_v & !o_b => produce
+		 srlo_       <= srl[addr];
+		 shift_en_o_ <= 1;
+		 shift_en_   <= 0;
+		 addr_       <= addr_zero_ ? 0         : addr-1;
+		 state_      <= addr_zero_ ? state_one : state_more;
+	      end
+	   end // else: !if(addr_full)
+	end // case: state_more
+
+	default: begin
+		 srlo_       <=  'bx;
+		 shift_en_o_ <= 1'bx;
+		 shift_en_   <= 1'bx;
+		 addr_       <=  'bx;
+		 state_      <= 2'bx;
+	end // case: default
+
+      endcase // case(state)
+   end // always @ *
+
+endmodule // Q_srl
+
+
+`endif  // `ifdef  Q_srl
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 5a846b12c1c91b89174d337d14a188a15ebd9a44..10f8b7feedf7584afb66a7fad8f1ee20745bf67d 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -68,6 +68,8 @@ class HLSCustomOp(CustomOp):
             "code_gen_dir_ipgen": ("s", False, ""),
             "executable_path": ("s", False, ""),
             "ipgen_path": ("s", False, ""),
+            "ip_path": ("s", False, ""),
+            "ip_vlnv": ("s", False, ""),
             "exec_mode": ("s", False, ""),
             "sim_cycles": ("i", False, 0),
             "rtlsim_trace": ("s", False, ""),
@@ -146,6 +148,9 @@ class HLSCustomOp(CustomOp):
         builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name))
         builder.build(code_gen_dir)
         self.set_nodeattr("ipgen_path", builder.ipgen_path)
+        self.set_nodeattr("ip_path", builder.ipgen_path + "/sol1/impl/ip")
+        vlnv = "xilinx.com:hls:%s:1.0" % node.name
+        self.set_nodeattr("ip_vlnv", vlnv)
 
     def code_generation_npysim(self, model):
         """Generates c++ code for simulation (npysim)."""
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 606c02778d8e44e93d621880bd769450da577ec5..00b8287a312fc82425b508ffef66f5187d074617 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -28,6 +28,7 @@
 
 import math
 import os
+import subprocess
 from shutil import copy
 
 import numpy as np
@@ -130,9 +131,13 @@ class StreamingFCLayer_Batch(HLSCustomOp):
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
-        # data type stays the same
-        dtype = model.get_tensor_datatype(node.input[0])
-        model.set_tensor_datatype(node.output[0], dtype)
+        # check input datatype against property
+        idt_name = self.get_input_datatype().name
+        exp_idt_name = self.get_nodeattr("inputDataType")
+        assert exp_idt_name == idt_name, "Bad input DataType for StreamingFCLayer"
+        # set output datatype from property
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(node.output[0], odt)
 
     def verify_node(self):
         info_messages = []
@@ -493,22 +498,26 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped)
 
             """Saves weights into .dat file"""
-            # convert weight value sinto hexstring
+            # convert weight values into hexstring
             weight_width = self.get_weightstream_width()
             weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_unflipped, export_wdt, weight_width
+                weight_tensor_unflipped, export_wdt, weight_width, prefix=""
             )
-            weight_pad = np.zeros((1024), int).astype(str)
-            weight_tensor_unflipped = weight_tensor_unflipped.flatten()
-            # delete "0x" in the beginning of the hexstring
-            for i in range(len(weight_tensor_unflipped)):
-                weight_tensor_unflipped[i] = weight_tensor_unflipped[i][2:]
-            weight_pad[: weight_tensor_unflipped.shape[0]] = weight_tensor_unflipped
-            weight_pad = weight_pad.copy()
-            f = open("{}/memblock_0.dat".format(code_gen_dir), "w+")
-            for val in weight_pad:
-                f.write(val + "\n")
-            f.close()
+            weight_stream_len = np.prod(weight_tensor_unflipped.shape)
+            assert (
+                weight_stream_len <= 1024
+            ), """Decoupled mem mode needs
+            weight stream length <= 1024 for now"""
+            # add zeroes to pad out file to 1024 entries
+            weight_stream = weight_tensor_unflipped.flatten()
+            pad_amt = 1024 - weight_stream_len
+            weight_stream = np.pad(
+                weight_stream, (0, pad_amt), mode="constant", constant_values="0"
+            )
+            weight_stream = weight_stream.copy()
+            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
+                for val in weight_stream:
+                    f.write(val + "\n")
 
         else:
             raise Exception(
@@ -920,7 +929,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 "#pragma HLS INTERFACE axis port=weights"
             )
             self.code_gen_dict["$PRAGMAS$"].append(
-                "#pragma HLS stream depth=8 variable=8"
+                "#pragma HLS stream depth=8 variable=weights"
             )
 
         else:
@@ -1023,3 +1032,35 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 code_gen_dir, self.onnx_node.name
             )
             copy(verilog_wrapper, verilog_folder)
+            # prepare the IP packaging tcl template
+            template = templates.ip_package_tcl
+            self.code_gen_dict["$TOPNAME$"] = [
+                "{}_memstream".format(self.onnx_node.name)
+            ]
+            self.code_gen_dict["$VERILOG_DIR$"] = [verilog_folder]
+            for key in self.code_gen_dict:
+                # transform list into long string separated by '\n'
+                code_gen_line = "\n".join(self.code_gen_dict[key])
+                template = template.replace(key, code_gen_line)
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+            f = open(os.path.join(verilog_folder, "package_ip.tcl"), "w")
+            f.write(template)
+            f.close()
+            # create a shell script and call Vivado to invoke the IP pkg script
+            make_project_sh = verilog_folder + "/make_ip.sh"
+            working_dir = os.environ["PWD"]
+            with open(make_project_sh, "w") as f:
+                f.write("#!/bin/bash \n")
+                f.write("cd {}\n".format(verilog_folder))
+                f.write("vivado -mode batch -source package_ip.tcl\n")
+                f.write("cd {}\n".format(working_dir))
+            bash_command = ["bash", make_project_sh]
+            process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
+            process_compile.communicate()
+            # re-set ip_path to point to the new packaged IP
+            self.set_nodeattr("ip_path", verilog_folder)
+            vlnv = "xilinx.com:hls:%s:1.0" % (
+                "{}_memstream".format(self.onnx_node.name)
+            )
+            self.set_nodeattr("ip_vlnv", vlnv)
+            self.code_gen_dict.clear()
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index c8191d714777b05b9cbb548eaa351b3e20d84a4b..90a54b019b090ea47e77c8efa841c86a1802edb5 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -1,3 +1,4 @@
+# flake8: noqa
 # Copyright (c) 2020, Xilinx
 # All rights reserved.
 #
@@ -137,6 +138,14 @@ reg m_axis_0_tready;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
 
+reg m_axis_0_tready_q;
+wire m_axis_0_tvalid_q;
+wire $WEIGHT_RANGE$ m_axis_0_tdata_q;
+
+reg m_axis_0_tready_q2;
+wire m_axis_0_tvalid_q2;
+wire $WEIGHT_RANGE$ m_axis_0_tdata_q2;
+
 reg m_axis_1_afull = 0;
 reg m_axis_1_tready = 1;
 wire m_axis_1_tvalid;
@@ -244,6 +253,43 @@ mem
 
 );
 
+// two consecutive weight streamer FIFOs to provide the same functionality
+// as "programmable full"
+
+// weight streamer FIFO 1
+Q_srl #(
+.depth(16),
+.width($WEIGHT_WIDTH$)
+)
+$LAYER_NAME$_w_fifo_1
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(m_axis_0_tdata),
+ .i_v(m_axis_0_tvalid),
+ .i_r(m_axis_0_tready),
+ .o_d(m_axis_0_tdata_q),
+ .o_v(m_axis_0_tvalid_q),
+ .o_r(m_axis_0_tready_q)
+);
+
+// weight streamer FIFO 2
+Q_srl #(
+.depth(16),
+.width($WEIGHT_WIDTH$)
+)
+$LAYER_NAME$_w_fifo_2
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n),
+ .i_d(m_axis_0_tdata_q),
+ .i_v(m_axis_0_tvalid_q),
+ .i_r(m_axis_0_tready_q),
+ .o_d(m_axis_0_tdata_q2),
+ .o_v(m_axis_0_tvalid_q2),
+ .o_r(m_axis_0_tready_q2)
+);
+
 //MVA_Stream_Unit
 
 $LAYER_NAME$
@@ -254,9 +300,9 @@ MVA_Stream_U
 .in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
 .in0_V_V_TVALID(in0_V_V_TVALID),  	//input
 .in0_V_V_TREADY(in0_V_V_TREADY),	//output
-.weights_V_V_TDATA(m_axis_0_tdata),	//$WEIGHT_RANGE$ input
-.weights_V_V_TVALID(m_axis_0_tvalid),	//input
-.weights_V_V_TREADY(m_axis_0_tready),	//output
+.weights_V_V_TDATA(m_axis_0_tdata_q2),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid_q2),	//input
+.weights_V_V_TREADY(m_axis_0_tready_q2),	//output
 .out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
 .out_V_V_TVALID(out_V_V_TVALID),	//output
 .out_V_V_TREADY(out_V_V_TREADY)		//input
@@ -265,3 +311,103 @@ MVA_Stream_U
 
 endmodule
 """
+
+ip_package_tcl = """
+## IP Info
+set Vendor      "xilinx.com"
+set Library     "hls"
+set IPName      "$TOPNAME$"
+set Version     "1.0"
+set DisplayName "$TOPNAME$"
+set Description "An IP generated by Xilinx FINN"
+set Device      "zynq"
+set Catalog     "/UserIP"
+set RootDir     "$VERILOG_DIR$"
+
+## Variables
+set Top "$TOPNAME$"
+set VerilogFiles [glob -nocomplain $RootDir/*]
+
+
+## Enter IP directory
+cd [file dir [info script]]
+
+## Generate sub cores
+set IPs ""
+set IPFiles ""
+
+## Basic info
+set core [ipx::create_core $Vendor $Library $IPName $Version]
+set_property display_name $DisplayName $core
+set_property description $Description $core
+set_property taxonomy $Catalog $core
+set_property supported_families { \
+  artix7 Production \
+  artix7l Production \
+  kintex7 Production \
+  kintex7l Production \
+  kintexu Production \
+  kintexuplus Production \
+  virtex7 Production \
+  virtexu Production \
+  virtexuplus Production \
+  zynq Production \
+  zynquplus Production \
+  aartix7 Production \
+  azynq Production \
+  qartix7 Production \
+  qkintex7 Production \
+  qkintex7l Production \
+  qvirtex7 Production \
+  qzynq Production \
+} $core;
+
+## Add verilog files
+if {[llength $VerilogFiles] > 0} {
+    # synthesis
+    set group [ipx::add_file_group xilinx_verilogsynthesis $core]
+    foreach f [concat $VerilogFiles $IPFiles] {
+        set current_file [ipx::add_file $f $group]
+        if {[file ext $f] == ".dat"} {
+            set_property type "mif" $current_file
+        }
+    }
+    set_property model_name $Top $group
+    if {$IPs != ""} {
+        set_property component_subcores $IPs $group
+    }
+
+    # simulation
+    set group [ipx::add_file_group xilinx_verilogbehavioralsimulation $core]
+    foreach f [concat $VerilogFiles $IPFiles] {
+        set current_file [ipx::add_file $f $group]
+        if {[file ext $f] == ".dat"} {
+            set_property type "mif" $current_file
+        }
+    }
+    set_property model_name $Top $group
+    if {$IPs != ""} {
+        set_property component_subcores $IPs $group
+    }
+}
+
+## Import ports
+ipx::add_ports_from_hdl \
+    -top_level_hdl_file $RootDir/$Top.v \
+    -top_module_name $Top \
+    $core
+
+## Infer interfaces
+ipx::infer_bus_interface ap_clk xilinx.com:signal:clock_rtl:1.0 [ipx::current_core]
+ipx::infer_bus_interface ap_rst_n xilinx.com:signal:reset_rtl:1.0 [ipx::current_core]
+ipx::infer_bus_interface {in0_V_V_TDATA in0_V_V_TVALID in0_V_V_TREADY} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core]
+ipx::infer_bus_interface {out_V_V_TREADY out_V_V_TDATA out_V_V_TVALID} xilinx.com:interface:axis_rtl:1.0 [ipx::current_core]
+ipx::associate_bus_interfaces -busif in0_V_V -clock ap_clk [ipx::current_core]
+ipx::associate_bus_interfaces -busif out_V_V -clock ap_clk [ipx::current_core]
+
+## Finalize
+set_property core_revision 2 [ipx::current_core]
+ipx::create_xgui_files [ipx::current_core]
+ipx::update_checksums [ipx::current_core]
+ipx::save_core [ipx::current_core]
+"""
diff --git a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
index fcb4af37c951de3869b731e755ef48ba4fdb579f..0fbd83199d88ec68cbf11c6ded5af33fdd4d91a3 100644
--- a/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
+++ b/src/finn/transformation/fpgadataflow/codegen_ipstitch.py
@@ -31,6 +31,7 @@ import subprocess
 
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name, make_build_dir
+from finn.custom_op.registry import getCustomOp
 
 
 class CodeGen_ipstitch(Transformation):
@@ -65,16 +66,11 @@ class CodeGen_ipstitch(Transformation):
                 backend_value == "fpgadataflow"
             ), """Backend node attribute is not
             set to "fpgadataflow"."""
-            ip_dir_attribute = get_by_name(node.attribute, "ipgen_path")
-            assert (
-                ip_dir_attribute is not None
-            ), """Node attribute "ipgen_path" is not set.
-            Please run transformation CodeGen_ipgen first."""
-            ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
-            ip_dir_value += "/sol1/impl/ip"
+            node_inst = getCustomOp(node)
+            ip_dir_value = node_inst.get_nodeattr("ip_path")
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
-            vlnv = "xilinx.com:hls:%s:1.0" % node.name
+            vlnv = node_inst.get_nodeattr("ip_vlnv")
             inst_name = node.name
             create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
             create_cmds += [create_cmd]
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 113ea8ea02dc64aacf92b3fc3f5dda6417e25517..dbd98623c4cdf5baca9fa9c137debf8be0f70981 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -139,6 +139,10 @@ class InferBinaryStreamingFCLayer(Transformation):
     StreamingFCLayer_Batch layers. Any immediately following MultiThreshold
     layers will also be absorbed into the MVTU."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode  # passed as mem_mode= for each inserted new_node
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -219,6 +223,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         binaryXnorMode=1,
                         noActivation=0,
                         numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old nodes
@@ -249,6 +254,7 @@ class InferBinaryStreamingFCLayer(Transformation):
                         binaryXnorMode=1,
                         noActivation=1,
                         numInputVectors=list(mm_in_shape[:-1]),
+                        mem_mode=self.mem_mode,
                     )
                     graph.node.insert(node_ind, new_node)
                     # remove old node
@@ -265,6 +271,10 @@ class InferQuantizedStreamingFCLayer(Transformation):
     StreamingFCLayer_Batch layers. Any immediately following MultiThreshold
     layers will also be absorbed into the MVTU."""
 
+    def __init__(self, mem_mode="const"):
+        super().__init__()
+        self.mem_mode = mem_mode  # passed as mem_mode= for each inserted new_node
+
     def apply(self, model):
         graph = model.graph
         node_ind = 0
@@ -347,6 +357,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             binaryXnorMode=0,
                             noActivation=0,
                             numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old nodes
@@ -377,6 +388,7 @@ class InferQuantizedStreamingFCLayer(Transformation):
                             binaryXnorMode=0,
                             noActivation=1,
                             numInputVectors=list(mm_in_shape[:-1]),
+                            mem_mode=self.mem_mode,
                         )
                         graph.node.insert(node_ind, new_node)
                         # remove old node
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 41498edc078506b0d6db87f28dce558fdf5a1aa4..c2c3802635ba8b1be9bf7f0c71e48ad13b79771f 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -70,13 +70,12 @@ class MakePYNQProject(Transformation):
         # collect list of all IP dirs
         ip_dirs = ["list"]
         for node in model.graph.node:
-            ip_dir_attribute = get_by_name(node.attribute, "ipgen_path")
+            ip_dir_attribute = get_by_name(node.attribute, "ip_path")
             assert (
                 ip_dir_attribute is not None
-            ), """Node attribute "ipgen_path" is
+            ), """Node attribute "ip_path" is
             empty. Please run transformation HLSSynth_ipgen first."""
             ip_dir_value = ip_dir_attribute.s.decode("UTF-8")
-            ip_dir_value += "/sol1/impl/ip"
             assert os.path.isdir(
                 ip_dir_value
             ), """The directory that should
diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
index dc0a17893d9d9aa8f25fa7ca67242fca94810e3d..dce62c20b99097feee7208cbf57aa8921ddb3566 100644
--- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
+++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
@@ -63,6 +63,9 @@ class ReplaceVerilogRelPaths(Transformation):
                                         old = '$readmemh(".'
                                         new = '$readmemh("%s' % dname
                                         s = s.replace(old, new)
+                                        old = '"./'
+                                        new = '"%s/' % dname
+                                        s = s.replace(old, new)
                                         with open(fpath, "w") as f:
                                             f.write(s)
                     except KeyError:
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 3788b2b6acb8bcb3c5919d3e1f0185dcc82aa4af..1d919de5d55363bbe71f0dfc44ca6fe3025f5a4a 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -119,7 +119,9 @@ def npbytearray2hexstring(npbytearray, prefix="0x"):
     return prefix + binascii.hexlify(bytearray(npbytearray)).decode("utf-8")
 
 
-def pack_innermost_dim_as_hex_string(ndarray, dtype, pad_to_nbits, reverse_inner=False):
+def pack_innermost_dim_as_hex_string(
+    ndarray, dtype, pad_to_nbits, reverse_inner=False, prefix="0x"
+):
     """Pack the innermost dimension of the given numpy ndarray into hex
     strings using array2hexstring.
 
@@ -143,7 +145,9 @@ def pack_innermost_dim_as_hex_string(ndarray, dtype, pad_to_nbits, reverse_inner
         ndarray = np.asarray(ndarray, dtype=np.float32)
 
     def fun(x):
-        return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner)
+        return array2hexstring(
+            x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix
+        )
 
     return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray)
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 62d45ea8ba6f9398ff070d28168dc48eda37de42..80c9e84ba92c93e8a5d57ffaceb22b5abf188963 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -132,19 +132,19 @@ def prepare_inputs(input_tensor, idt, wdt):
 # mem_mode: const or decoupled
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
 @pytest.mark.parametrize("sf", [-1, 2, 1])
 # HLS matrix width (input features)
-@pytest.mark.parametrize("mw", [4])
+@pytest.mark.parametrize("mw", [16])
 # HLS matrix height (output features)
-@pytest.mark.parametrize("mh", [4])
+@pytest.mark.parametrize("mh", [16])
 def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
@@ -217,19 +217,19 @@ def test_fpgadataflow_fclayer_npysim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 # mem_mode: const or decoupled
 @pytest.mark.parametrize("mem_mode", ["const", "decoupled"])
 # activation: None or DataType
-@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("act", [None, DataType.BIPOLAR, DataType.INT4])
 # weight datatype
-@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("wdt", [DataType.BIPOLAR, DataType.INT4])
 # input datatype
-@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
+@pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT4])
 # neuron folding, -1 is maximum possible
 @pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
 @pytest.mark.parametrize("sf", [-1, 2, 1])
 # HLS matrix width (input features)
-@pytest.mark.parametrize("mw", [4])
+@pytest.mark.parametrize("mw", [16])
 # HLS matrix height (output features)
-@pytest.mark.parametrize("mh", [4])
+@pytest.mark.parametrize("mh", [16])
 def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
     if nf == -1:
         nf = mh
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index eeff14c4d7c4aa8213f8673d9dd6a4745ececb1a..4a81977d49d174f66e1a02140a7643bd352db7a2 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -48,12 +48,9 @@ from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+import finn.transformation.fpgadataflow.replace_verilog_relpaths as rvp
 from finn.transformation.general import GiveUniqueNodeNames
-from finn.util.basic import (
-    calculate_signed_dot_prod_range,
-    gen_finn_dt_tensor,
-    pynq_part_map,
-)
+from finn.util.basic import gen_finn_dt_tensor, pynq_part_map
 from finn.util.fpgadataflow import pyverilate_stitched_ip
 
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -66,7 +63,7 @@ def create_one_fc_model():
     # create a model with a StreamingFCLayer instance with no activation
     # the wider range of the full accumulator makes debugging a bit easier
     wdt = DataType.INT2
-    idt = DataType.INT2
+    idt = DataType.INT32
     odt = DataType.INT32
     m = 4
     no_act = 1
@@ -119,13 +116,11 @@ def create_one_fc_model():
 def create_two_fc_model():
     # create a model with two StreamingFCLayer instances
     wdt = DataType.INT2
-    idt = DataType.INT2
-    odt = DataType.INT2
-    act = DataType.INT2
+    idt = DataType.INT32
+    odt = DataType.INT32
     m = 4
-    tdt = DataType.INT32
-    actval = odt.min()
-    no_act = 0
+    actval = 0
+    no_act = 1
     binary_xnor_mode = 0
     pe = 2
     simd = 2
@@ -136,7 +131,7 @@ def create_two_fc_model():
 
     fc0 = helper.make_node(
         "StreamingFCLayer_Batch",
-        ["inp", "w0", "t0"],
+        ["inp", "w0"],
         ["mid"],
         domain="finn",
         backend="fpgadataflow",
@@ -151,11 +146,12 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
+        mem_mode="decoupled",
     )
 
     fc1 = helper.make_node(
         "StreamingFCLayer_Batch",
-        ["mid", "w1", "t1"],
+        ["mid", "w1"],
         ["outp"],
         domain="finn",
         backend="fpgadataflow",
@@ -170,6 +166,7 @@ def create_two_fc_model():
         ActVal=actval,
         binaryXnorMode=binary_xnor_mode,
         noActivation=no_act,
+        mem_mode="decoupled",
     )
 
     graph = helper.make_graph(
@@ -190,31 +187,19 @@ def create_two_fc_model():
     model.set_tensor_datatype("w1", wdt)
 
     # generate weights
-    w0 = gen_finn_dt_tensor(wdt, (m, m))
-    w1 = gen_finn_dt_tensor(wdt, (m, m))
+    w0 = np.eye(m, dtype=np.float32)
+    w1 = np.eye(m, dtype=np.float32)
     model.set_initializer("w0", w0)
     model.set_initializer("w1", w1)
 
-    # generate thresholds
-    (min, max) = calculate_signed_dot_prod_range(idt, wdt, m)
-    n_steps = act.get_num_possible_values() - 1
-    t0 = np.random.randint(min, max - 1, (m, n_steps)).astype(np.float32)
-    t1 = np.random.randint(min, max - 1, (m, n_steps)).astype(np.float32)
-    # provide non-decreasing thresholds
-    t0 = np.sort(t0, axis=1)
-    t1 = np.sort(t1, axis=1)
-
-    model.set_initializer("t0", t0)
-    model.set_initializer("t1", t1)
-    model.set_tensor_datatype("t0", tdt)
-    model.set_tensor_datatype("t1", tdt)
+    model = model.transform(CreateDataflowPartition())
     return model
 
 
 # exec_mode of StreamingDataflowPartition
 # @pytest.mark.parametrize("exec_mode", ["remote_pynq"]) #, "rtlsim"])
 def test_fpgadataflow_ipstitch_gen_model():  # exec_mode):
-    model = create_one_fc_model()
+    model = create_two_fc_model()
     if model.graph.node[0].op_type == "StreamingDataflowPartition":
         sdp_node = getCustomOp(model.graph.node[0])
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
@@ -234,6 +219,7 @@ def test_fpgadataflow_ipstitch_do_stitch():
     model = ModelWrapper(
         ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx"
     )
+    model = model.transform(rvp.ReplaceVerilogRelPaths())
     model = model.transform(CodeGen_ipstitch(test_fpga_part))
     vivado_stitch_proj_dir = model.get_metadata_prop("vivado_stitch_proj")
     assert vivado_stitch_proj_dir is not None
@@ -247,6 +233,7 @@ def test_fpgadataflow_ipstitch_do_stitch():
 
 def test_fpgadataflow_ipstitch_rtlsim():
     model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch.onnx")
+    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
     sim = pyverilate_stitched_ip(model)
     exp_io = [
         "ap_clk_0",
@@ -265,6 +252,8 @@ def test_fpgadataflow_ipstitch_rtlsim():
     idt = model.get_tensor_datatype("inp")
     ishape = model.get_tensor_shape("inp")
     x = gen_finn_dt_tensor(idt, ishape)
+    # x = np.zeros(ishape, dtype=np.float32)
+    # x = np.asarray([[-2, -1, 0, 1]], dtype=np.float32)
     rtlsim_res = execute_onnx(model, {"inp": x})["outp"]
     assert (rtlsim_res == x).all()