diff --git a/notebooks/9-FINN-EndToEndFlow.ipynb b/notebooks/9-FINN-EndToEndFlow.ipynb
index 2478d276eb3885b98b0ec18eaa49ccb2ca81cd19..1796faab6166caefc880ab2ec4e29d6abab29dec 100644
--- a/notebooks/9-FINN-EndToEndFlow.ipynb
+++ b/notebooks/9-FINN-EndToEndFlow.ipynb
@@ -86,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [
     {
@@ -117,13 +117,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "\n",
+      "Stopping http://0.0.0.0:8081\n",
       "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     }
@@ -134,7 +136,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
@@ -164,7 +166,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -233,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 44,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,7 +264,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 45,
    "metadata": {},
    "outputs": [
     {
@@ -282,7 +284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 46,
    "metadata": {},
    "outputs": [
     {
@@ -315,7 +317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 47,
    "metadata": {},
    "outputs": [
     {
@@ -367,7 +369,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 48,
    "metadata": {},
    "outputs": [
     {
@@ -376,19 +378,19 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1_streamlined.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
    "source": [
     "model = model.transform(Streamline())\n",
-    "model.save(build_dir+\"/tfc_w1_a1.onnx\")\n",
-    "netron.start(build_dir+\"/tfc_w1_a1.onnx\", port=8081, host=\"0.0.0.0\")"
+    "model.save(build_dir+\"/tfc_w1_a1_streamlined.onnx\")\n",
+    "netron.start(build_dir+\"/tfc_w1_a1_streamlined.onnx\", port=8081, host=\"0.0.0.0\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 49,
    "metadata": {},
    "outputs": [
     {
@@ -421,7 +423,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 50,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -447,7 +449,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 51,
    "metadata": {},
    "outputs": [
     {
@@ -456,20 +458,20 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1_hls_layers.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
    "source": [
     "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n",
     "model = model.transform(to_hls.InferBinaryStreamingFCLayer())\n",
-    "model.save(build_dir+\"/tfc_w1_a1.onnx\")\n",
-    "netron.start(build_dir+\"/tfc_w1_a1.onnx\", port=8081, host=\"0.0.0.0\")"
+    "model.save(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n",
+    "netron.start(build_dir+\"/tfc_w1_a1_hls_layers.onnx\", port=8081, host=\"0.0.0.0\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 52,
    "metadata": {
     "scrolled": true
    },
@@ -512,7 +514,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 53,
    "metadata": {},
    "outputs": [
     {
@@ -535,7 +537,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 54,
    "metadata": {
     "scrolled": false
    },
@@ -567,7 +569,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 55,
    "metadata": {},
    "outputs": [
     {
@@ -576,7 +578,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar_22115/dataflow_partition_9vof1ltc/df_model.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/tmp/finn_maltanar/dataflow_partition_n7ae7i0t/df_model.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
@@ -589,7 +591,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 56,
    "metadata": {},
    "outputs": [
     {
@@ -619,7 +621,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -636,7 +638,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -670,7 +672,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -694,16 +696,18 @@
        " 'outputDataType': ('s', True, ''),\n",
        " 'binaryXnorMode': ('i', False, 0),\n",
        " 'noActivation': ('i', False, 0),\n",
+       " 'inFIFODepth': ('i', False, 0),\n",
+       " 'outFIFODepth': ('i', False, 0),\n",
        " 'backend': ('s', True, 'fpgadataflow'),\n",
        " 'code_gen_dir_npysim': ('s', False, ''),\n",
        " 'code_gen_dir_ipgen': ('s', False, ''),\n",
        " 'executable_path': ('s', False, ''),\n",
        " 'ipgen_path': ('s', False, ''),\n",
-       " 'sim_mode': ('s', False, ''),\n",
+       " 'exec_mode': ('s', False, ''),\n",
        " 'sim_cycles': ('i', False, 0)}"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -721,24 +725,29 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [],
    "source": [
     "# SIMD controls the folding over the input vector\n",
     "# PE controls the folding over the output vector\n",
     "\n",
+    "fc0w.set_nodeattr(\"inFIFODepth\", 50)\n",
     "fc0w.set_nodeattr(\"SIMD\", 16)\n",
     "fc0w.set_nodeattr(\"PE\", 16)\n",
+    "fc0w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
     "fc1w.set_nodeattr(\"SIMD\", 16)\n",
     "fc1w.set_nodeattr(\"PE\", 16)\n",
+    "fc1w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
     "fc2w.set_nodeattr(\"SIMD\", 16)\n",
     "fc2w.set_nodeattr(\"PE\", 16)\n",
+    "fc2w.set_nodeattr(\"outFIFODepth\", 4)\n",
     "\n",
     "fc3w.set_nodeattr(\"SIMD\", 16)\n",
-    "fc3w.set_nodeattr(\"PE\", 10)"
+    "fc3w.set_nodeattr(\"PE\", 10)\n",
+    "fc3w.set_nodeattr(\"outFIFODepth\", 50)"
    ]
   },
   {
@@ -750,7 +759,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 24,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -782,7 +791,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -812,7 +821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -833,7 +842,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -851,7 +860,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [
     {
@@ -871,7 +880,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 29,
    "metadata": {
     "scrolled": true
    },
@@ -907,7 +916,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [
     {
@@ -936,7 +945,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [
     {
@@ -944,8 +953,8 @@
      "output_type": "stream",
      "text": [
       "#!/bin/bash \r\n",
-      "cd /tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g\r\n",
-      "vivado_hls /tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
+      "cd /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_y_fxb2eb\r\n",
+      "vivado_hls /tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_y_fxb2eb/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
       "cd /workspace/finn\r\n"
      ]
     }
@@ -966,7 +975,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
@@ -976,7 +985,7 @@
       "\r\n",
       "set config_proj_name project_StreamingFCLayer_Batch_0\r\n",
       "puts \"HLS project: $config_proj_name\"\r\n",
-      "set config_hwsrcdir \"/tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g\"\r\n",
+      "set config_hwsrcdir \"/tmp/finn_maltanar/code_gen_ipgen_StreamingFCLayer_Batch_y_fxb2eb\"\r\n",
       "puts \"HW source dir: $config_hwsrcdir\"\r\n",
       "set config_proj_part \"xczu3eg-sbva484-1-e\"\r\n",
       "\r\n",
@@ -993,6 +1002,7 @@
       "set_part $config_proj_part\r\n",
       "\r\n",
       "config_interface -m_axi_addr64\r\n",
+      "config_rtl -auto_prefix\r\n",
       "\r\n",
       "create_clock -period $config_clkperiod -name default\r\n",
       "csynth_design\r\n",
@@ -1015,33 +1025,25 @@
     "Now that all IP blocks are in place, they can be stitched together to create an IP design that matches the ONNX model. This is covered in the next section."
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# save model with other name for section \"Emulation using PyVerilator\"\"\n",
-    "model.save(build_dir+\"/tfc_w1_a1_after_hls_ip_per_layer.onnx\")"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### IP Stitching <a id='ip_stitching'></a>\n",
     "\n",
-    "We now have IP blocks for each of our layers, and will stitch them together into a larger IP that implements the whole network using the `CodeGen_ipstitch` transformation. Bear in mind that this transformation can only be applied on a graph that only contains HLS nodes that already have been through the `HLSSynth_IPGen` transformation, which is the last step we performed. **This invokes Vivado and may take a few minutes to run.**"
+    "We now have IP blocks for each of our layers, and will stitch them together into a larger IP that implements the whole network using the `CodeGen_ipstitch` transformation. Bear in mind that this transformation can only be applied on a graph that only contains HLS nodes that already have been through the `HLSSynth_IPGen` transformation, which is the last step we performed. Prior to calling IP stitching, we'll also use the `ReplaceVerilogRelPaths` transformation to convert any relative `$readmemh` paths in the generated IP blocks to absolute ones, which prevents errors later on. **This step invokes Vivado and may take a few minutes to run.**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
     "from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch\n",
-    "\n",
+    "from finn.transformation.fpgadataflow.replace_verilog_relpaths import ReplaceVerilogRelPaths\n",
+    "model = ModelWrapper(build_dir+\"/tfc_w1_a1_ipgen.onnx\")\n",
+    "model = model.transform(ReplaceVerilogRelPaths())\n",
     "model = model.transform(CodeGen_ipstitch(fpga_part))"
    ]
   },
@@ -1054,20 +1056,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 39,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_gvhcdxah\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"wrapper_filename\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_gvhcdxah/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        "]"
       ]
      },
-     "execution_count": 39,
+     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1078,16 +1082,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 40,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0'"
+       "'/tmp/finn_maltanar/vivado_stitch_proj_gvhcdxah'"
       ]
      },
-     "execution_count": 40,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1110,6 +1114,15 @@
     "![](stitched_ip.png)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(build_dir+\"/tfc_w1_a1_ipstitch.onnx\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1121,45 +1134,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_ud9yxuzi\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"wrapper_filename\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_ud9yxuzi/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_6rhrsy8m\"\n",
        "]"
       ]
      },
-     "execution_count": 42,
+     "execution_count": 39,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject\n",
-    "\n",
+    "model = ModelWrapper(build_dir+\"/tfc_w1_a1_ipstitch.onnx\")\n",
     "model = model.transform(MakePYNQProject(pynq_board))\n",
     "model.model.metadata_props"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "ip_config.tcl\t resizer.hw\t\tresizer.srcs\t  vivado.jou\r\n",
-      "make_project.sh  resizer.ip_user_files\tresizer.xpr\t  vivado.log\r\n",
-      "resizer.cache\t resizer.sim\t\tsynth_project.sh  vivado_pid24853.str\r\n"
+      "ip_config.tcl\t resizer.cache\tresizer.ip_user_files  resizer.xpr\r\n",
+      "make_project.sh  resizer.hw\tresizer.srcs\t       synth_project.sh\r\n"
      ]
     }
    ],
@@ -1175,6 +1189,15 @@
     "![](pynq_shell_project.png)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(build_dir + \"/tfc_w1_a1_pynq_project.onnx\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1191,33 +1214,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
+   "execution_count": 42,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "[key: \"vivado_stitch_proj\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_ud9yxuzi\"\n",
        ", key: \"vivado_stitch_vlnv\"\n",
        "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"wrapper_filename\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_stitch_proj_ud9yxuzi/finn_vivado_stitch_proj.srcs/sources_1/bd/finn_design/hdl/finn_design_wrapper.v\"\n",
        ", key: \"vivado_pynq_proj\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_6rhrsy8m\"\n",
        ", key: \"vivado_pynq_bitfile\"\n",
-       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0/resizer.bit\"\n",
+       "value: \"/tmp/finn_maltanar/vivado_pynq_proj_6rhrsy8m/resizer.bit\"\n",
        "]"
       ]
      },
-     "execution_count": 44,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "model.save(build_dir + \"/tfc_w1_a1_pre_synthesis.onnx\")\n",
-    "\n",
     "from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject\n",
-    "\n",
+    "model = ModelWrapper(build_dir + \"/tfc_w1_a1_pynq_project.onnx\")\n",
     "model = model.transform(SynthPYNQProject())\n",
     "model.model.metadata_props"
    ]
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 0102d236c3a9a6eafe87f59d01e23e0d1307b6f4..5ed45339cc2fe5f77339e33c4e7a8f6c556b704f 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -45,7 +45,8 @@ def execute_node(node, context, graph):
     if node.op_type == "StreamingDataflowPartition":
         sdp_node = getCustomOp(node)
         model = ModelWrapper(sdp_node.get_nodeattr("model"))
-        execute_onnx(model, context)
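+        # run the partition's child model with the parent context and merge its
+        # full execution context back into the parent's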
+        ret = execute_onnx(model, context, True)
+        context.update(ret)
     else:
         if node.domain == "finn":
 
@@ -124,8 +125,8 @@ def execute_onnx(model, input_dict, return_full_exec_context=False):
                         str(input_dict[inp_name].shape),
                     )
                 )
-        else:
-            raise Exception("Provided input not found in graph context: %s" % inp_name)
+        # else:
+        # raise Exception("Provided input not found in graph context: %s" % inp_name)
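+        # (check disabled: the provided context may contain tensors that are
+        # not inputs of this graph, e.g. when a dataflow partition is executed
+        # with the full parent context)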
 
     # check if model has an execution mode set
     # if None, execute model node by node using execute_node()
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 51cd5745733c8fe1d9d437feef2d91d3dbd35bfb..bc8d74b3e3bc585f58bfca79eb593542e8733e9a 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -2,7 +2,10 @@ import os
 
 from finn.custom_op.registry import getCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-from finn.util.fpgadataflow import pyverilate_stitched_ip
+from finn.util.fpgadataflow import (
+    pyverilate_get_liveness_threshold_cycles,
+    pyverilate_stitched_ip,
+)
 
 
 def rtlsim_exec(model, execution_context):
@@ -11,6 +14,7 @@ def rtlsim_exec(model, execution_context):
     # ensure stitched ip project already exists
     assert os.path.isfile(model.get_metadata_prop("wrapper_filename"))
     assert os.path.isdir(model.get_metadata_prop("vivado_stitch_proj"))
+    trace_file = model.get_metadata_prop("rtlsim_trace")
     # extract input shape
     # TODO extend for multiple inputs
     i_name = model.graph.input[0].name
@@ -38,7 +42,7 @@ def rtlsim_exec(model, execution_context):
     sim = pyverilate_stitched_ip(model)
     _reset_rtlsim(sim)
     _toggle_clk(sim)
-    ret = _run_rtlsim(sim, packed_input, num_out_values)
+    ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
     packed_output = ret[0]
     model.set_metadata_prop("sim_cycles", str(ret[1]))
     # unpack output and put into context
@@ -61,7 +65,7 @@ def _toggle_clk(sim):
     sim.io.ap_clk_0 = 0
 
 
-def _run_rtlsim(sim, inp, num_out_values):
+def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
     # import pdb; pdb.set_trace()
     inputs = inp
     outputs = []
@@ -73,9 +77,13 @@ def _run_rtlsim(sim, inp, num_out_values):
     observation_count = 0
 
     # avoid infinite looping of simulation by aborting when there is no change in
-    # output values after 100 cycles
+    # output values after LIVENESS_THRESHOLD cycles
     no_change_count = 0
     old_outputs = outputs
+    liveness_threshold = pyverilate_get_liveness_threshold_cycles()
+
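+    # optionally record a VCD waveform of the run if rtlsim_trace is set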
+    if trace_file is not None:
+        sim.start_vcd_trace(trace_file)
 
     while not (output_observed):
         sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
@@ -94,13 +102,21 @@ def _run_rtlsim(sim, inp, num_out_values):
             sim_cycles = observation_count
             output_observed = True
 
-        if no_change_count == 100:
+        if no_change_count == liveness_threshold:
             if old_outputs == outputs:
+                if trace_file is not None:
+                    sim.flush_vcd_trace()
+                    sim.stop_vcd_trace()
                 raise Exception(
                     "Error in simulation! Takes too long to produce output."
+                    " Consider setting the LIVENESS_THRESHOLD env.var. to a "
+                    "larger value."
                 )
             else:
                 no_change_count = 0
                 old_outputs = outputs
+    if trace_file is not None:
+        sim.flush_vcd_trace()
+        sim.stop_vcd_trace()
 
     return (outputs, sim_cycles)
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index b786944edfc75041efc21b40b881c8ba7ffc0736..672831fb07e85c544cf8d46d66d9917169ea07b9 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -4,7 +4,10 @@ import os
 import subprocess
 from finn.custom_op import CustomOp
 from finn.util.basic import CppBuilder
-from finn.util.fpgadataflow import IPGenBuilder
+from finn.util.fpgadataflow import (
+    IPGenBuilder,
+    pyverilate_get_liveness_threshold_cycles,
+)
 from . import templates
 
 
@@ -34,6 +37,7 @@ class HLSCustomOp(CustomOp):
             "ipgen_path": ("s", False, ""),
             "exec_mode": ("s", False, ""),
             "sim_cycles": ("i", False, 0),
+            "rtlsim_trace": ("s", False, ""),
         }
 
     def node_res_estimation(self):
@@ -192,6 +196,11 @@ compilation transformations?
 
     def rtlsim(self, sim, inp):
         # import pdb; pdb.set_trace()
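+        # optionally record a VCD waveform; setting rtlsim_trace to "default"
+        # names the trace file after the node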
+        trace_file = self.get_nodeattr("rtlsim_trace")
+        if trace_file != "":
+            if trace_file == "default":
+                trace_file = self.onnx_node.name + ".vcd"
+            sim.start_vcd_trace(trace_file)
         inputs = inp
         outputs = []
         sim.io.out_V_V_TREADY = 1
@@ -206,6 +215,7 @@ compilation transformations?
         # output values after 100 cycles
         no_change_count = 0
         old_outputs = outputs
+        liveness_threshold = pyverilate_get_liveness_threshold_cycles()
 
         while not (output_observed):
             sim.io.in0_V_V_TVALID = 1 if len(inputs) > 0 else 0
@@ -224,15 +234,22 @@ compilation transformations?
                 self.set_nodeattr("sim_cycles", observation_count)
                 output_observed = True
 
-            if no_change_count == 100:
+            if no_change_count == liveness_threshold:
                 if old_outputs == outputs:
+                    if trace_file != "":
+                        sim.flush_vcd_trace()
+                        sim.stop_vcd_trace()
                     raise Exception(
-                        "Error in simulation! Takes too long to produce output."
+                        "Error in simulation! Takes too long to produce output. "
+                        "Consider setting the LIVENESS_THRESHOLD env.var. to a "
+                        "larger value."
                     )
                 else:
                     no_change_count = 0
                     old_outputs = outputs
-
+        if trace_file != "":
+            sim.flush_vcd_trace()
+            sim.stop_vcd_trace()
         return outputs
 
     def execute_node(self, context, graph):
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 8ae626f14febc72fc6c81f07732de2229af9cc33..1da7618718fc32a53835ec8021da93936dc03380 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -98,9 +98,10 @@ class ConvolutionInputGenerator(HLSCustomOp):
             )
         elif mode == "rtlsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
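+            # with config_rtl -auto_prefix, the generated top-level Verilog
+            # file is named <node.name>_<node.name>.v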
+            prefixed_top_name = "%s_%s" % (node.name, node.name)
             # check if needed file exists
             verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, node.name
+                code_gen_dir, node.name, prefixed_top_name
             )
             if os.path.isfile(verilog_file):
                 inp = context[node.input[0]]
@@ -209,7 +210,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
                 OFMDim1, SIMD1, Stride1> (in0, out, numReps);""".format(
-                node.op_type,
+                node.op_type
             )
         ]
 
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 643a2062dc50d712f0e0b08ff281638354813fe8..915a498248222d802b398aa02518fbfc0fa5d482 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -297,6 +297,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
         # create SIMD as innermost dimension and add a dummy outer dim
         ret = ret.reshape(1, pe, wmem, simd)
+        # reverse the SIMD dimension
+        ret = np.flip(ret, axis=-1)
         return ret
 
     def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
@@ -479,9 +481,10 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             # reshape output to have expected shape
             context[node.output[0]] = context[node.output[0]].reshape(1, mh)
         elif mode == "rtlsim":
+            prefixed_top_name = "%s_%s" % (node.name, node.name)
             # check if needed file exists
             verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format(
-                code_gen_dir, node.name, node.name
+                code_gen_dir, node.name, prefixed_top_name
             )
             if os.path.isfile(verilog_file):
                 nbits = self.get_instream_width()
@@ -568,7 +571,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         self.code_gen_dict["$READNPYDATA$"] = []
         # note: the innermost dim is reversed for the input
         self.code_gen_dict["$READNPYDATA$"].append(
-            'npy2apintstream<%s, %s, %d, %s>("%s", in0, true);'
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
             % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
         )
 
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 07a356e5ee7e10a6b51859cb7fb2c4bdb5deeda7..e2f43f4edf206b32974adaf02fd479f0af522702 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -65,6 +65,7 @@ open_solution sol1
 set_part $config_proj_part
 
 config_interface -m_axi_addr64
+config_rtl -auto_prefix
 
 create_clock -period $config_clkperiod -name default
 csynth_design
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index 3f3bf6e79c8ab6d485c5356a4c3bba3623220df2..31a0347f3dfa571a95f354fb2ffbc74caab5ca2e 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -54,13 +54,8 @@ class TLastMarker(HLSCustomOp):
         ]
 
     def read_npy_data(self):
-        # TLastMarker does not support npysim
         self.code_gen_dict["$READNPYDATA$"] = []
 
-    def strm_decl(self):
-        # TLastMarker does not support npysim
-        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
-
     def docompute(self):
         self.code_gen_dict["$DOCOMPUTE$"] = [
             "for(int i=0; i<NumIters; i++) {",
@@ -74,7 +69,6 @@ class TLastMarker(HLSCustomOp):
         ]
 
     def dataoutstrm(self):
-        # TLastMarker does not support npysim
         self.code_gen_dict["$DATAOUTSTREAM$"] = []
 
     def save_as_npy(self):
@@ -114,3 +108,12 @@ class TLastMarker(HLSCustomOp):
     def get_outstream_width(self):
         stream_width = self.get_nodeattr("StreamWidth")
         return stream_width
+
+    def strm_decl(self):
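+        # declare the input/output HLS streams for the generated simulation code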
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+        )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<OutDType> out ("out");'
+        )
diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
new file mode 100644
index 0000000000000000000000000000000000000000..597aa36eb80dd5e19b19764f027ebeb515a73bdd
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py
@@ -0,0 +1,42 @@
+import os
+
+import finn.custom_op.registry as registry
+import finn.util.basic as util
+from finn.transformation import Transformation
+
+
+class ReplaceVerilogRelPaths(Transformation):
+    """Convert ./ relative file paths to absolute ones for generated Verilog"""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        for node in model.graph.node:
+            op_type = node.op_type
+            if node.domain == "finn":
+                backend_attribute = util.get_by_name(node.attribute, "backend")
+                if backend_attribute is None:
+                    continue
+                backend_value = backend_attribute.s.decode("UTF-8")
+                if backend_value == "fpgadataflow":
+                    try:
+                        # lookup op_type in registry of CustomOps
+                        inst = registry.custom_op[op_type](node)
+                        # find the IP gen dir
+                        ipgen_path = inst.get_nodeattr("ipgen_path")
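+                        # walk the generated IP dir and make relative $readmemh
+                        # paths absolute so they resolve outside the IP dir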
+                        if ipgen_path is not None and os.path.isdir(ipgen_path):
+                            for dname, dirs, files in os.walk(ipgen_path):
+                                for fname in files:
+                                    if fname.endswith(".v"):
+                                        fpath = os.path.join(dname, fname)
+                                        with open(fpath, "r") as f:
+                                            s = f.read()
+                                        old = '$readmemh(".'
+                                        new = '$readmemh("%s' % dname
+                                        s = s.replace(old, new)
+                                        with open(fpath, "w") as f:
+                                            f.write(s)
+                    except KeyError:
+                        pass
+        return (model, False)
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index b5db59046b80bb4ccd8750e3aa91f7298377f2a8..639fa6d3fb44694c347d984524c175aee8575713 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -106,7 +106,7 @@ def pack_innermost_dim_as_hex_string(ndarray, dtype, pad_to_nbits, reverse_inner
 
 
 def unpack_innermost_dim_from_hex_string(
-    ndarray, dtype, out_shape, reverse_inner=False
+    ndarray, dtype, out_shape, packedBits, reverse_inner=False
 ):
     """Convert a NumPy array of hex strings into a FINN NumPy array by unpacking
     the hex strings into the specified data type. out_shape can be specified
@@ -125,7 +125,6 @@ def unpack_innermost_dim_from_hex_string(
         )
     # convert ndarray into flattened list
     data = ndarray.flatten().tolist()
-    packedBits = len(data[0]) * 8
     targetBits = dtype.bitwidth()
     # calculate outer and inner dim shapes
     outer_dim_elems = 1
@@ -221,7 +220,7 @@ def numpy_to_hls_code(
     return ret
 
 
-def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=False):
+def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=True):
     """Convert the multidimensional NumPy array of integers (stored as floats)
     from input_file into a flattened sequence of Python arbitrary-precision
     integers, packing the innermost dimension. See
@@ -253,7 +252,7 @@ def rtlsim_output_to_npy(
     # TODO should have its own testbench?
     output = np.asarray([hex(int(x)) for x in output])
     out_array = unpack_innermost_dim_from_hex_string(
-        output, dtype, shape, reverse_inner=reverse_inner
+        output, dtype, shape, packedBits=packedBits, reverse_inner=reverse_inner
     )
     np.save(path, out_array)
     return out_array
@@ -326,7 +325,7 @@ def packed_bytearray_to_finnpy(
         npbytearray2hexstring, packed_dim, packed_bytearray
     )
     ret = unpack_innermost_dim_from_hex_string(
-        packed_hexstring, dtype, output_shape, reverse_inner
+        packed_hexstring, dtype, output_shape, packed_bits, reverse_inner
     )
 
     return ret
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index f11192a121a4b4a8a0524629fe6b3988923a363b..a6887e0fd6329e1c0ca5ad8e187e6ee1fabb1679 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -45,3 +45,10 @@ def pyverilate_stitched_ip(model):
     top_verilog = model.get_metadata_prop("wrapper_filename")
     sim = PyVerilator.build(top_verilog, verilog_path=all_verilog_dirs)
     return sim
+
+
+def pyverilate_get_liveness_threshold_cycles():
+    """Return the number of no-output cycles rtlsim will wait before assuming
+    the simulation is not finishing and throwing an exception."""
+
+    return int(os.getenv("LIVENESS_THRESHOLD", 10000))
diff --git a/tests/end2end/test_end2end_tfc.py b/tests/end2end/test_end2end_tfc.py
new file mode 100644
index 0000000000000000000000000000000000000000..598bf16758310388aef4ecc621a16afab0ad8062
--- /dev/null
+++ b/tests/end2end/test_end2end_tfc.py
@@ -0,0 +1,217 @@
+import os
+
+import numpy as np
+# as of Feb'20 there is a bug that segfaults ONNX shape inference if we
+# import pytorch before onnx, so we make sure to import onnx first
+import onnx  # NOQA
+
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.streamline.absorb as absorb
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.onnx_exec import execute_onnx
+from finn.custom_op.registry import getCustomOp
+from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
+from finn.transformation.fold_constants import FoldConstants
+from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
+from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
+from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
+from finn.transformation.fpgadataflow.compile import Compile
+from finn.transformation.fpgadataflow.create_dataflow_partition import (
+    CreateDataflowPartition,
+)
+from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
+from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
+from finn.transformation.fpgadataflow.replace_verilog_relpaths import (
+    ReplaceVerilogRelPaths,
+)
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.streamline import Streamline
+from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
+from finn.util.basic import pynq_part_map
+from finn.util.test import get_test_model_trained
+
+build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
+target_clk_ns = 5
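+
+# NOTE: these tests form an ordered pipeline; each stage loads the .onnx model
+# saved to build_dir by the previous stage, so they are meant to run in order.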
+
+
+def test_end2end_tfc_export():
+    import brevitas.onnx as bo
+
+    tfc = get_test_model_trained("TFC", 1, 1)
+    bo.export_finn_onnx(
+        tfc, (1, 1, 28, 28), build_dir + "/end2end_tfc_w1_a1_export.onnx"
+    )
+
+
+def test_end2end_tfc_import_and_tidy():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_export.onnx")
+    model = model.transform(InferShapes())
+    model = model.transform(FoldConstants())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+    model.save(build_dir + "/end2end_tfc_w1_a1_tidy.onnx")
+
+
+def test_end2end_tfc_streamline():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_tidy.onnx")
+    model = model.transform(Streamline())
+    model.save(build_dir + "/end2end_tfc_w1_a1_streamlined.onnx")
+
+
+def test_end2end_tfc_convert_to_hls_layers():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_streamlined.onnx")
+    model = model.transform(ConvertBipolarMatMulToXnorPopcount())
+    model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
+    model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
+    model = model.transform(RoundAndClipThresholds())
+    model = model.transform(to_hls.InferBinaryStreamingFCLayer())
+    model.save(build_dir + "/end2end_tfc_w1_a1_hls_layers.onnx")
+
+
+def test_end2end_tfc_create_dataflow_partition():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_hls_layers.onnx")
+    parent_model = model.transform(CreateDataflowPartition())
+    parent_model.save(build_dir + "/end2end_tfc_w1_a1_dataflow_parent.onnx")
+    sdp_node = getCustomOp(parent_model.graph.node[2])
+    dataflow_model_filename = sdp_node.get_nodeattr("model")
+    dataflow_model = ModelWrapper(dataflow_model_filename)
+    dataflow_model.save(build_dir + "/end2end_tfc_w1_a1_dataflow_model.onnx")
+
+
+def test_end2end_tfc_fold_and_tlastmarker():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_dataflow_model.onnx")
+    fc0 = model.graph.node[0]
+    fc1 = model.graph.node[1]
+    fc2 = model.graph.node[2]
+    fc3 = model.graph.node[3]
+    fc0w = getCustomOp(fc0)
+    fc1w = getCustomOp(fc1)
+    fc2w = getCustomOp(fc2)
+    fc3w = getCustomOp(fc3)
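+    # SIMD controls folding over the input vector, PE over the output vector;
+    # inFIFODepth/outFIFODepth configure the layers' stream FIFO depths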
+    fc0w.set_nodeattr("inFIFODepth", 50)
+    fc0w.set_nodeattr("SIMD", 16)
+    fc0w.set_nodeattr("PE", 16)
+    fc0w.set_nodeattr("outFIFODepth", 4)
+    fc1w.set_nodeattr("SIMD", 16)
+    fc1w.set_nodeattr("PE", 16)
+    fc1w.set_nodeattr("outFIFODepth", 4)
+    fc2w.set_nodeattr("SIMD", 16)
+    fc2w.set_nodeattr("PE", 16)
+    fc2w.set_nodeattr("outFIFODepth", 4)
+    fc3w.set_nodeattr("SIMD", 16)
+    fc3w.set_nodeattr("PE", 10)
+    fc3w.set_nodeattr("outFIFODepth", 50)
+    model = model.transform(InsertTLastMarker())
+    model.save(build_dir + "/end2end_tfc_w1_a1_folded.onnx")
+
+
+def test_end2end_tfc_gen_hls_ip():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_folded.onnx")
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(CodeGen_ipgen(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynth_IPGen())
+    model.save(build_dir + "/end2end_tfc_w1_a1_ipgen.onnx")
+
+
+def test_end2end_tfc_ip_stitch():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_ipgen.onnx")
+    model = model.transform(ReplaceVerilogRelPaths())
+    model = model.transform(CodeGen_ipstitch(test_fpga_part))
+    model.save(build_dir + "/end2end_tfc_w1_a1_ipstitch.onnx")
+
+
+def test_end2end_tfc_verify_dataflow_part():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_ipstitch.onnx")
+    x = np.zeros((1, 784), dtype=np.float32)
+    inp_name = model.graph.input[0].name
+    out_name = model.graph.output[0].name
+    inp_dict = {inp_name: x}
+    # npysim
+    model = model.transform(CodeGen_npysim())
+    model = model.transform(Compile())
+    model = model.transform(SetExecMode("npysim"))
+    model.save(build_dir + "/end2end_tfc_w1_a1_ipstitch_npysim.onnx")
+    ret_npysim = execute_onnx(model, inp_dict, True)
+    res_npysim = ret_npysim[out_name]
+    # node-by-node rtlsim
+    model = model.transform(SetExecMode("rtlsim"))
+    getCustomOp(model.graph.node[0]).set_nodeattr("rtlsim_trace", "default")
+    getCustomOp(model.graph.node[1]).set_nodeattr("rtlsim_trace", "default")
+    getCustomOp(model.graph.node[2]).set_nodeattr("rtlsim_trace", "default")
+    getCustomOp(model.graph.node[3]).set_nodeattr("rtlsim_trace", "default")
+    model.save(build_dir + "/end2end_tfc_w1_a1_ipstitch_nodebynode_rtlsim.onnx")
+    ret_rtlsim_nodebynode = execute_onnx(model, inp_dict, True)
+    res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
+    # whole-network (ip-stitched) rtlsim
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", "whole_trace.vcd")
+    model.save(build_dir + "/end2end_tfc_w1_a1_ipstitch_whole_rtlsim.onnx")
+    ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
+    res_rtlsim_whole = ret_rtlsim_whole[out_name]
+    assert np.isclose(res_npysim, res_rtlsim_nodebynode).all()
+    assert np.isclose(res_npysim, res_rtlsim_whole).all()
+
+
+def test_end2end_tfc_verify_all():
+    # use the streamlined model as the "golden" model for right answers
+    golden = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_streamlined.onnx")
+    iname = golden.graph.input[0].name
+    oname = golden.graph.output[0].name
+    ishape = golden.get_tensor_shape(iname)
+    x = np.zeros(ishape, dtype=np.float32)
+    ret_golden = execute_onnx(golden, {iname: x}, True)
+    y_golden = ret_golden[oname]
+    # set up parent+child graph to test
+    # we'll use models from the previous step as the child model
+    parent_model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_dataflow_parent.onnx")
+    iname = parent_model.graph.input[0].name
+    oname = parent_model.graph.output[0].name
+    # produce results with npysim
+    sdp_node = getCustomOp(parent_model.graph.node[2])
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_tfc_w1_a1_ipstitch_npysim.onnx"
+    )
+    ret_npysim = execute_onnx(parent_model, {iname: x}, True)
+    y_npysim = ret_npysim[oname]
+    # produce results with node-by-node rtlsim
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_tfc_w1_a1_ipstitch_nodebynode_rtlsim.onnx"
+    )
+    ret_nodebynode_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_nodebynode_rtlsim = ret_nodebynode_rtlsim[oname]
+    # produce results with whole-network (stitched ip) rtlsim
+    sdp_node.set_nodeattr(
+        "model", build_dir + "/end2end_tfc_w1_a1_ipstitch_whole_rtlsim.onnx"
+    )
+    ret_whole_rtlsim = execute_onnx(parent_model, {iname: x}, True)
+    y_whole_rtlsim = ret_whole_rtlsim[oname]
+    assert np.isclose(y_golden, y_npysim).all()
+    assert np.isclose(y_golden, y_nodebynode_rtlsim).all()
+    assert np.isclose(y_golden, y_whole_rtlsim).all()
+
+
+def test_end2end_tfc_make_pynq_proj():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_ipstitch.onnx")
+    model = model.transform(MakePYNQProject(test_pynq_board))
+    model.save(build_dir + "/end2end_tfc_w1_a1_pynq_project.onnx")
+
+
+def test_end2end_tfc_synth_pynq_project():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_pynq_project.onnx")
+    model = model.transform(SynthPYNQProject())
+    model.save(build_dir + "/end2end_tfc_w1_a1_synth.onnx")
+
+
+def test_end2end_tfc_make_driver():
+    model = ModelWrapper(build_dir + "/end2end_tfc_w1_a1_synth.onnx")
+    model = model.transform(MakePYNQDriver())
+    model.save(build_dir + "/end2end_tfc_w1_a1_pynq_driver.onnx")
diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
index 5c1e3908c94a38fb008ca400f7058d8928807eb7..9d3b36f2204abbe4277c7164d0259175fce5e085 100644
--- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py
@@ -183,9 +183,9 @@ def test_fpgadataflow_fclayer_npysim(idt, wdt, act, nf, sf, mw, mh):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # neuron folding, -1 is maximum possible
-@pytest.mark.parametrize("nf", [-1, 1])
+@pytest.mark.parametrize("nf", [-1, 2, 1])
 # synapse folding, -1 is maximum possible
-@pytest.mark.parametrize("sf", [-1, 1])
+@pytest.mark.parametrize("sf", [-1, 2, 1])
 # HLS matrix width (input features)
 @pytest.mark.parametrize("mw", [4])
 # HLS matrix height (output features)
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 6c4ce2d235d138227026e7ec082d1e13e4ea3673..775251b13bfe0d35100b79c84cdf2611ba94f99c 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -200,7 +200,6 @@ def test_fpgadataflow_ipstitch_gen_model():  # exec_mode):
     model = model.transform(CodeGen_ipgen(test_fpga_part, 5))
     model = model.transform(HLSSynth_IPGen())
     assert model.graph.node[0].op_type == "StreamingFCLayer_Batch"
-    # assert model.graph.node[1].op_type == "StreamingFCLayer_Batch"
     assert model.graph.node[-1].op_type == "TLastMarker"
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx")
 
diff --git a/tests/util/test_rtlsim2npy.py b/tests/util/test_rtlsim2npy.py
index c1a10fe49a7956c83a7bfcd26f15a8c4238ebdb3..9ddf34bad46376f618f4be401ca6aaeace0e9d72 100644
--- a/tests/util/test_rtlsim2npy.py
+++ b/tests/util/test_rtlsim2npy.py
@@ -10,13 +10,13 @@ def test_unpack_innermost_dim_from_hex_string():
     dtype = DataType.BINARY
     shape = (1, 2, 4)
     eA = [[1, 1, 1, 0], [0, 1, 1, 0]]
-    A_unpacked = unpack_innermost_dim_from_hex_string(A, dtype, shape)
+    A_unpacked = unpack_innermost_dim_from_hex_string(A, dtype, shape, 8)
     assert (A_unpacked == eA).all()
 
     A = np.asarray(["0x0e", "0x06"])
     eA_flipped = [[0, 1, 1, 1], [0, 1, 1, 0]]
     A_unpacked_flipped = unpack_innermost_dim_from_hex_string(
-        A, dtype, shape, reverse_inner=True
+        A, dtype, shape, 8, reverse_inner=True
     )
     assert (A_unpacked_flipped == eA_flipped).all()
 
@@ -25,13 +25,13 @@ def test_unpack_innermost_dim_from_hex_string():
     dtype = DataType.UINT2
     shape = (1, 2, 2, 2)
     eB = [[[3, 3], [3, 3]], [[1, 3], [3, 1]]]
-    B_unpacked = unpack_innermost_dim_from_hex_string(B, dtype, shape)
+    B_unpacked = unpack_innermost_dim_from_hex_string(B, dtype, shape, 8)
     assert (B_unpacked == eB).all()
 
     B = np.asarray([["0x0f", "0x0f"], ["0x07", "0x0d"]])
     eB_flipped = [[[3, 3], [3, 3]], [[3, 1], [1, 3]]]
     B_unpacked_flipped = unpack_innermost_dim_from_hex_string(
-        B, dtype, shape, reverse_inner=True
+        B, dtype, shape, 8, reverse_inner=True
     )
     assert (B_unpacked_flipped == eB_flipped).all()
 
@@ -40,7 +40,7 @@ def test_unpack_innermost_dim_from_hex_string():
     dtype = DataType.INT2
     shape = (1, 2, 2, 2)
     eC = [[[-1, -1], [-1, -1]], [[1, -1], [-1, 1]]]
-    C_unpacked = unpack_innermost_dim_from_hex_string(C, dtype, shape)
+    C_unpacked = unpack_innermost_dim_from_hex_string(C, dtype, shape, 8)
     assert (C_unpacked == eC).all()
 
     C = np.asarray([["0x0f", "0x0f"], ["0x07", "0x0d"]])
@@ -48,7 +48,7 @@ def test_unpack_innermost_dim_from_hex_string():
     shape = (1, 2, 2, 2)
     eC = [[[-1, -1], [-1, -1]], [[-1, 1], [1, -1]]]
     C_unpacked = unpack_innermost_dim_from_hex_string(
-        C, dtype, shape, reverse_inner=True
+        C, dtype, shape, 8, reverse_inner=True
     )
     assert (C_unpacked == eC).all()
 
@@ -57,11 +57,11 @@ def test_unpack_innermost_dim_from_hex_string():
     dtype = DataType.INT4
     shape = (2, 1)
     eD = [[-2], [6]]
-    D_unpacked = unpack_innermost_dim_from_hex_string(D, dtype, shape)
+    D_unpacked = unpack_innermost_dim_from_hex_string(D, dtype, shape, 8)
     assert (D_unpacked == eD).all()
 
     D_unpacked = unpack_innermost_dim_from_hex_string(
-        D, dtype, shape, reverse_inner=True
+        D, dtype, shape, 8, reverse_inner=True
     )
     assert (D_unpacked == eD).all()
 
@@ -70,10 +70,10 @@ def test_unpack_innermost_dim_from_hex_string():
     dtype = DataType.INT32
     shape = (1, 4, 1)
     eE = [[[-1], [-2], [2], [-17]]]
-    E_unpacked = unpack_innermost_dim_from_hex_string(E, dtype, shape)
+    E_unpacked = unpack_innermost_dim_from_hex_string(E, dtype, shape, 32)
     assert (E_unpacked == eE).all()
 
     E_unpacked = unpack_innermost_dim_from_hex_string(
-        E, dtype, shape, reverse_inner=True
+        E, dtype, shape, 32, reverse_inner=True
     )
     assert (E_unpacked == eE).all()