diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..f12dafa857b8a99493d7266ad029bec3f725d9ec
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,34 @@
+We welcome contributions to FINN.
+
+Please follow the steps below and be sure that your contribution complies with our guidelines.
+
+1. Share your proposal via <a href="https://github.com/Xilinx/finn/issues" target="_blank">Github issues</a>. If you are looking for some issues to get started with, we have a list of <a href="https://github.com/Xilinx/finn/labels/good%20first%20issue">good first issues</a> in the issue tracker. Feel free to ask questions on the <a href="https://gitter.im/xilinx-finn/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge">FINN gitter channel as well</a>.
+
+	We welcome submissions to:
+
+	1. The FINN flow, such as additional custom ONNX nodes, transformation and analysis passes.
+	2. Contributions to the documentation and Jupyter notebooks
+
+	To ensure clean separation of toolflow and examples, we do not keep example networks in this repo. If you want to add example networks, we ask you to make them into a separate repo and use FINN as a dependency -- we'll be happy to add it to the list of <a href="https://xilinx.github.io/finn/community">FINN community projects</a>.
+
+2. Submitting your pull request:
+
+	1. Fork this repository to your own GitHub account using the *fork* button above.
+
+	2. Clone the fork to your local computer using *git clone*. Checkout the branch you want to work on.
+
+	3. Please install <a href="https://pre-commit.com/" target="_blank">pre-commit</a> to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in <a href="https://github.com/Xilinx/finn/blob/master/.pre-commit-config.yaml" target="_blank">this file</a>.
+
+	4. Modify the Python source code, Jupyter notebooks and Sphinx documentation etc. as needed.
+
+	5. Use *git add*, *git commit*, *git push* to add changes to your fork.
+
+	6. If you are introducing new functionality, add at least one unit test under the `test/` folder and make sure it passes before you submit the pull request.
+
+	7. Submit a pull request by clicking the *pull request* button on your GitHub repo:
+		1. The <a href="https://github.com/Xilinx/finn" target="_blank">master branch</a> should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break.
+		2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the <a href="https://github.com/Xilinx/finn/tree/dev" target="_blank">development branch</a>.
+
+3. We will review your contribution and, if any additional fixes or modifications are
+necessary, may provide feedback to guide you. When accepted, your pull request will
+be merged to the repository. If you have more questions please contact us via the <a href="https://gitter.im/xilinx-finn/community" target="_blank">FINN gitter channel</a>.
diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile
index c4c50a0434a9f037f71cb293cf0e1e6feb300b39..e64280222a6d2e558f00d20a25a4a79d55526a97 100644
--- a/docker/Jenkinsfile
+++ b/docker/Jenkinsfile
@@ -8,12 +8,13 @@ pipeline {
         string(name: 'PYNQ_USERNAME', defaultValue: 'xilinx', description: 'PYNQ board username')
         string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password')
         string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory')
+        string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations')
+        string(name: 'DOCKER_CMD', defaultValue: """python setup.py test""", description: 'Command to run')
     }
     environment {
         DOCKER_TAG='finn_ci:$BUILD_ID'
         DOCKER_INST_NAME='finn_ci_$BUILD_ID'
         BUILD_PATH='/tmp/finn_ci_$BUILD_ID'
-        DOCKER_CMD="python setup.py test"
     }
     stages {
         stage("Clone") {
@@ -37,6 +38,7 @@ pipeline {
                 docker run --name $DOCKER_INST_NAME \
                 --hostname $DOCKER_INST_NAME \
                 -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \
+                -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \
                 -e FINN_INST_NAME=$DOCKER_INST_NAME \
                 -e VIVADO_PATH=${params.VIVADO_PATH} \
                 -e PYNQ_BOARD=${params.PYNQ_BOARD} \
@@ -44,7 +46,7 @@ pipeline {
                 -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \
                 -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \
                 -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \
-                $DOCKER_TAG bash -c "$DOCKER_CMD"
+                $DOCKER_TAG ${params.DOCKER_CMD}
                 """
             }
         }
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
index 7b207fbd6db7c9d985ba3ed50d7fcd97612e07f5..e107c400565f9f500e7902117a1eb2223b7dbc4c 100644
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
@@ -1,86 +1,99 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
-   contributors may be used to endorse or promote products derived from
-   this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module ramb18_wf_dualport
-#(
-    parameter ID = 0,
-	parameter DWIDTH = 18,
-	parameter AWIDTH = 10,
-	parameter MEM_INIT = ""
-)
-(
-	input clk,
-	
-	input wea,
-	input [AWIDTH-1:0] addra,
-	input [DWIDTH-1:0] wdataa,
-	output reg [DWIDTH-1:0] rdqa,
-
-	input web,
-	input [AWIDTH-1:0] addrb,
-	input [DWIDTH-1:0] wdatab,
-	output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = "block" *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
-reg [DWIDTH-1:0] rdataa;
-reg [DWIDTH-1:0] rdatab;
-
-reg [7:0] idx = ID;
-//initialize memory
-initial begin
-    //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
-	//ID can go up to 99
-	if (ID < 0 && ID > 99) begin
-	    $display("ID out of range [0-99]");
-	    $finish();
-    end
-	//MEM_INIT path must be terminated by /
-	if (ID < 10)
-		$readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
-	else
-		$readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
-    if(wea)
-        mem[addra] <= wdataa;
-    rdataa <= mem[addra];
-    rdqa <= rdataa;
-end
-always @(posedge clk) begin
-    if(web)
-        mem[addrb] <= wdatab;
-    rdatab <= mem[addrb];
-    rdqb <= rdatab;
-end
-
-endmodule
\ No newline at end of file
+/*
+ Copyright (c) 2020, Xilinx
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+ * Neither the name of FINN nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+module ramb18_wf_dualport
+#(
+    parameter ID = 0,
+	parameter DWIDTH = 18,
+	parameter AWIDTH = 10,
+	parameter MEM_INIT = ""
+)
+(
+	input clk,
+
+	input wea,
+	input [AWIDTH-1:0] addra,
+	input [DWIDTH-1:0] wdataa,
+	output reg [DWIDTH-1:0] rdqa,
+
+	input web,
+	input [AWIDTH-1:0] addrb,
+	input [DWIDTH-1:0] wdatab,
+	output reg [DWIDTH-1:0] rdqb
+);
+
+(* ram_style = "block" *) reg [DWIDTH-1:0] mem[0:2**AWIDTH-1];
+reg [DWIDTH-1:0] rdataa;
+reg [DWIDTH-1:0] rdatab;
+
+`ifdef SYNTHESIS
+reg [7:0] idx = ID;
+`else
+reg [15:0] idx;
+`endif
+
+//initialize memory
+initial begin
+  //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
+  //ID can go up to 99
+  if (ID < 0 || ID > 99) begin
+    $display("ID out of range [0-99]");
+    $finish();
+  end
+	//MEM_INIT path must be terminated by /
+  `ifdef SYNTHESIS
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, 1023);
+  else
+    $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, 1023);
+  `else
+  $sformat(idx,"%0d",ID);
+  if (ID < 10)
+    $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, 1023);
+  else
+    $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, 1023);
+  `endif
+end
+
+//memory ports, with output pipeline register
+always @(posedge clk) begin
+    if(wea)
+        mem[addra] <= wdataa;
+    rdataa <= mem[addra];
+    rdqa <= rdataa;
+end
+always @(posedge clk) begin
+    if(web)
+        mem[addrb] <= wdatab;
+    rdatab <= mem[addrb];
+    rdqb <= rdatab;
+end
+
+endmodule
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 30e9ca279f8f2ded312e92b484171a6c69a966fc..43f85bb56f44591ecefd79ffe82bc2bd88030780 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -538,20 +538,23 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 weight_tensor_unflipped, export_wdt, weight_width, prefix=""
             )
             weight_stream_len = np.prod(weight_tensor_unflipped.shape)
-            assert (
-                weight_stream_len <= 1024
-            ), """Decoupled mem mode needs
-            weight stream length <= 1024 for now"""
+            factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
             weight_stream = weight_tensor_unflipped.flatten()
-            pad_amt = 1024 - weight_stream_len
+            pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
             )
             weight_stream = weight_stream.copy()
-            with open("{}/memblock_0.dat".format(code_gen_dir), "w+") as f:
-                for val in weight_stream:
+            i = 0
+            j = 0
+            for val in weight_stream:
+                if i == 1024:
+                    i = 0
+                    j += 1
+                with open("{}/memblock_{}.dat".format(code_gen_dir, j), "a+") as f:
                     f.write(val + "\n")
+                i += 1
 
         else:
             raise Exception(
@@ -973,17 +976,20 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             self.code_gen_dict["$LAYER_NAME$"] = [
                 "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
             ]
-            in_width = self.get_instream_width(axi_strm_padding=True)
+            # make instream width a multiple of 8 for AXI stream interface
+            in_width = roundup_to_integer_multiple(self.get_instream_width(), 8)
             self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
             self.code_gen_dict["$OUT_RANGE$"] = [
                 "[{}:0]".format(self.get_outstream_width(axi_strm_padding=True) - 1)
             ]
-            weight_width = self.get_weightstream_width(axi_strm_padding=True)
+            # make weight stream width a multiple of 8 for AXI stream interface
+            weight_width = roundup_to_integer_multiple(self.get_weightstream_width(), 8)
             self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
             self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
-            mw = self.get_nodeattr("MW")
-            mh = self.get_nodeattr("MH")
-            self.code_gen_dict["$WEIGHT_DEPTH$"] = [str(int(mw * mh))]
+            self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
+            self.code_gen_dict["$MEM_DEPTH$"] = [
+                str(roundup_to_integer_multiple(self.calc_wmem(), 1024))
+            ]
 
             template = self.decoupled_wrapper
 
@@ -1020,9 +1026,11 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 if file.endswith(".v"):
                     verilog_file = os.path.join(memstream_dir, file)
                     copy(verilog_file, verilog_folder)
-            # copy .dat file of weights
-            dat_file = "{}/memblock_0.dat".format(code_gen_dir)
-            copy(dat_file, verilog_folder)
+            # copy .dat files of weights
+            for file in os.listdir(code_gen_dir):
+                if file.endswith(".dat"):
+                    dat_file = os.path.join(code_gen_dir, file)
+                    copy(dat_file, verilog_folder)
             # copy verilog wrapper
             verilog_wrapper = "{}/{}_memstream.v".format(
                 code_gen_dir, self.onnx_node.name
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index c53a17aafc496a2ffb6dd8009f8bbf7358b90737..ee0c0d487e6a666c2ff22e9c6f7f533ab00742a0 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -137,7 +137,9 @@ reg [31:0] config_d0 = 0;
 wire [31:0] config_q0;
 
 //multiple wire AXI Streams
-reg m_axis_0_afull = 0;
+wire m_axis_0_afull;
+// FIFO count to generate programmable full
+wire [5:0] fifo_0_count;
 wire m_axis_0_tready;
 wire m_axis_0_tvalid;
 wire $WEIGHT_RANGE$ m_axis_0_tdata;
@@ -183,7 +185,7 @@ memstream
 // memory, set per-stream offsets in memory, set per-stream widths
 .CONFIG_EN(1),
 .NSTREAMS(1),
-.MEM_DEPTH(1024),
+.MEM_DEPTH($MEM_DEPTH$),
 .MEM_WIDTH($WEIGHT_WIDTH$),
 .MEM_INIT("./"),
 
@@ -196,7 +198,7 @@ memstream
 .STRM5_WIDTH($WEIGHT_WIDTH$),
 
 //depths per stream
-.STRM0_DEPTH($WEIGHT_DEPTH$),
+.STRM0_DEPTH($WSTREAM_DEPTH$),
 .STRM1_DEPTH(1),
 .STRM2_DEPTH(1),
 .STRM3_DEPTH(1),
@@ -257,12 +259,9 @@ mem
 
 );
 
-// two consecutive weight streamer FIFOs to provide the same functionality
-// as "programmable full"
 
-// weight streamer FIFO 1
 Q_srl #(
-.depth(16),
+.depth(32),
 .width($WEIGHT_WIDTH$)
 )
 $LAYER_NAME$_w_fifo_1
@@ -274,25 +273,10 @@ $LAYER_NAME$_w_fifo_1
  .i_r(m_axis_0_tready),
  .o_d(m_axis_0_tdata_q),
  .o_v(m_axis_0_tvalid_q),
- .o_r(m_axis_0_tready_q)
+ .o_r(m_axis_0_tready_q),
+ .count(fifo_0_count)
 );
 
-// weight streamer FIFO 2
-Q_srl #(
-.depth(16),
-.width($WEIGHT_WIDTH$)
-)
-$LAYER_NAME$_w_fifo_2
-(
- .clock(ap_clk),
- .reset(!ap_rst_n),
- .i_d(m_axis_0_tdata_q),
- .i_v(m_axis_0_tvalid_q),
- .i_r(m_axis_0_tready_q),
- .o_d(m_axis_0_tdata_q2),
- .o_v(m_axis_0_tvalid_q2),
- .o_r(m_axis_0_tready_q2)
-);
 
 //MVA_Stream_Unit
 
@@ -304,14 +288,16 @@ MVA_Stream_U
 .in0_V_V_TDATA(in0_V_V_TDATA),		//$IN_RANGE$ input
 .in0_V_V_TVALID(in0_V_V_TVALID),  	//input
 .in0_V_V_TREADY(in0_V_V_TREADY),	//output
-.weights_V_V_TDATA(m_axis_0_tdata_q2),	//$WEIGHT_RANGE$ input
-.weights_V_V_TVALID(m_axis_0_tvalid_q2),	//input
-.weights_V_V_TREADY(m_axis_0_tready_q2),	//output
+.weights_V_V_TDATA(m_axis_0_tdata_q),	//$WEIGHT_RANGE$ input
+.weights_V_V_TVALID(m_axis_0_tvalid_q),	//input
+.weights_V_V_TREADY(m_axis_0_tready_q),	//output
 .out_V_V_TDATA(out_V_V_TDATA),		//$OUT_RANGE$ output
 .out_V_V_TVALID(out_V_V_TVALID),	//output
 .out_V_V_TREADY(out_V_V_TREADY)		//input
 );
 
+// programmable full: asserted when the FIFO holds more than 16 elements
+assign m_axis_0_afull = (fifo_0_count > 16);
 
 endmodule
 """
diff --git a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
index 5a41d3fb70ee04526ba6e0fcbc5d7a448fb2ecf0..de3ed973af1b13cc7368a738046bec35e0eb8669 100644
--- a/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
+++ b/tests/end2end/test_end2end_tfc_w1a1_throughput_test.py
@@ -79,7 +79,7 @@ build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 5
-mem_mode = "const"
+mem_mode = "decoupled"
 
 
 def test_end2end_tfc_w1a1_export():
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index b81ae6da2fc48b061b78189c44d9421b218f9dd4..7552fecd85ee0e36216f6c934d454f057a2a41ce 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -306,3 +306,95 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh):
 
     hls_synt_res_est = model.analysis(hls_synth_res_estimation)
     assert "StreamingFCLayer_Batch_0" in hls_synt_res_est
+
+
+# mem_mode: const or decoupled
+@pytest.mark.parametrize("mem_mode", ["decoupled"])
+# activation: None or DataType
+@pytest.mark.parametrize("act", [DataType.INT4])
+# weight datatype
+@pytest.mark.parametrize("wdt", [DataType.INT4])
+# input datatype
+@pytest.mark.parametrize("idt", [DataType.INT4])
+# neuron folding, -1 is maximum possible
+@pytest.mark.parametrize("nf", [-1])
+# synapse folding, -1 is maximum possible
+@pytest.mark.parametrize("sf", [-1])
+# HLS matrix width (input features)
+@pytest.mark.parametrize("mw", [128])
+# HLS matrix height (output features)
+@pytest.mark.parametrize("mh", [128])
+def test_fpgadataflow_fclayer_large_depth_decoupled_mode(
+    mem_mode, idt, wdt, act, nf, sf, mw, mh
+):
+    if nf == -1:
+        nf = mh
+    if sf == -1:
+        sf = mw
+    pe = mh // nf
+    simd = mw // sf
+    assert mh % pe == 0
+    assert mw % sf == 0
+    # generate weights
+    W = gen_finn_dt_tensor(wdt, (mw, mh))
+    # generate input data
+    x = gen_finn_dt_tensor(idt, (1, mw))
+    if act is None:
+        # no activation, produce accumulators
+        T = None
+        tdt = None
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            odt = DataType.UINT32
+        else:
+            odt = DataType.INT32
+    else:
+        odt = act
+        (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw)
+        n_steps = act.get_num_possible_values() - 1
+        T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32)
+        # provide non-decreasing thresholds
+        T = np.sort(T, axis=1)
+        # generate thresholds for activation
+        if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+            tdt = DataType.UINT32
+            # bias thresholds to be positive
+            T = np.ceil((T + mw) / 2)
+            assert (T >= 0).all()
+        else:
+            tdt = DataType.INT32
+    model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt)
+    for node in model.graph.node:
+        # lookup op_type in registry of CustomOps
+        inst = getCustomOp(node)
+        inst.set_nodeattr("mem_mode", mem_mode)
+
+    # prepare input data
+    input_dict = prepare_inputs(x, idt, wdt)
+    if wdt == DataType.BIPOLAR and idt == DataType.BIPOLAR:
+        # convert inputs to binary and use xnorpopcountmatmul
+        y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2)
+    else:
+        y = np.matmul(x, W)
+    if T is not None:
+        y = multithreshold(y, T)
+        if act == DataType.BIPOLAR:
+            # binary to bipolar
+            y = 2 * y - 1
+        else:
+            # signed offset
+            y += act.min()
+    oshape = model.get_tensor_shape("outp")
+    y_expected = y.reshape(oshape)
+    # TODO split up into several dependent tests -- need to check how this
+    # works for parametrized tests...
+    model = model.transform(SetExecMode("rtlsim"))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5))
+    model = model.transform(HLSSynth_IPGen())
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(PrepareRTLSim())
+    y_produced = oxe.execute_onnx(model, input_dict)["outp"]
+    assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed"
+
+    hls_synt_res_est = model.analysis(hls_synth_res_estimation)
+    assert "StreamingFCLayer_Batch_0" in hls_synt_res_est