diff --git a/Dockerfile b/Dockerfile
index 734a8fd3f4b6493246f1c7e522da2934d09e849b..7780d3fd4e630af7a6395b84858211fb93c2b834 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,6 +25,7 @@ ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
+ENV PYNQ_BOARD "Pynq-Z1"
 
 ARG GID
 ARG GNAME
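
For context, the new `PYNQ_BOARD` variable is consumed by the test suite via `os.getenv`; a minimal sketch, mirroring the change to `tests/fpgadataflow/test_fpgadataflow_ip_stitch.py` further down in this diff:

```python
import os

from finn.util.basic import pynq_part_map

# board selection defaults to Pynq-Z1 when PYNQ_BOARD is unset
test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
test_fpga_part = pynq_part_map[test_pynq_board]  # e.g. "xc7z020clg400-1"
```
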
diff --git a/notebooks/9-FINN-EndToEndFlow.ipynb b/notebooks/9-FINN-EndToEndFlow.ipynb
index 37cbd2256ed1d19fc5e29859f64735e44035b49a..144c26bc5cbaa89314fe90c9b2b07990db0f1c4b 100644
--- a/notebooks/9-FINN-EndToEndFlow.ipynb
+++ b/notebooks/9-FINN-EndToEndFlow.ipynb
@@ -13,7 +13,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -24,8 +24,7 @@
     "def showSrc(what):\n",
     "    print(\"\".join(inspect.getsourcelines(what)[0]))\n",
     "    \n",
-    "# create a host-Docker shared folder for the build\n",
-    "build_dir = make_build_dir(\"end2end_\")"
+    "build_dir = \"/workspace/finn\""
    ]
   },
   {
@@ -87,7 +86,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -117,14 +116,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Serving '/tmp/finn_maltanar_9257/end2end_f4ebazeo/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
@@ -134,7 +133,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -164,7 +163,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -233,7 +232,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -262,16 +261,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "\n",
-      "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar_9257/end2end_f4ebazeo/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
@@ -282,7 +279,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -315,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -367,7 +364,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -376,7 +373,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar_9257/end2end_f4ebazeo/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
@@ -388,7 +385,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -421,7 +418,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -447,7 +444,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -456,7 +453,7 @@
      "text": [
       "\n",
       "Stopping http://0.0.0.0:8081\n",
-      "Serving '/tmp/finn_maltanar_9257/end2end_f4ebazeo/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
@@ -469,7 +466,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 12,
    "metadata": {
     "scrolled": true
    },
@@ -505,13 +502,138 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Folding <a id='folding'></a>\n",
-    "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we have to extract the nodes which are StreamingFCLayer_Batch operations. This is where netron helps us, in the above diagram we can see that the third to sixth nodes are StreamingFCLayer_Batch. Through the `print`s we can check if the extracted nodes all have the op_type \"StreamingFCLayer_Batch\". For more details on how to use ONNX model, see Jupyter notebook [1-FINN-HowToWorkWithONNX](1-FINN-HowToWorkWithONNX.ipynb)."
+    "### Creating a Dataflow Partition <a id='dataflow_partition'></a>\n",
+    "\n",
+    "In the graph above, you can see that there is a mixture of FINN HLS layers (StreamingFCLayer_Batch) with regular ONNX layers (Reshape, Mul, Add). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Stopping http://0.0.0.0:8081\n",
+      "Serving '/workspace/finn/tfc_w1_a1_dataflow_parent.onnx' at http://0.0.0.0:8081\n"
+     ]
+    }
+   ],
+   "source": [
+    "from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n",
+    "\n",
+    "parent_model = model.transform(CreateDataflowPartition())\n",
+    "parent_model.save(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n",
+    "netron.start(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\", port=8081, host=\"0.0.0.0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%%html\n",
+    "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see that the StreamingFCLayer instances have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Stopping http://0.0.0.0:8081\n",
+      "Serving '/tmp/finn_maltanar_22115/dataflow_partition_9vof1ltc/df_model.onnx' at http://0.0.0.0:8081\n"
+     ]
+    }
+   ],
+   "source": [
+    "from finn.custom_op.registry import getCustomOp\n",
+    "sdp_node = getCustomOp(parent_model.graph.node[2])\n",
+    "dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n",
+    "netron.start(dataflow_model_filename, port=8081, host=\"0.0.0.0\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>\n"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "%%html\n",
+    "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see all the extracted `StreamingFCLayer` instances have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = ModelWrapper(dataflow_model_filename)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Folding and TLastMarker Insertion <a id='folding'></a>\n",
+    "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we have to extract the nodes which are StreamingFCLayer_Batch operations. This is where netron helps us, in the above diagram we can see that the first four nodes are StreamingFCLayer_Batch. Through the `print`s we can check if the extracted nodes all have the op_type \"StreamingFCLayer_Batch\". For more details on how to use ONNX model, see Jupyter notebook [1-FINN-HowToWorkWithONNX](1-FINN-HowToWorkWithONNX.ipynb)."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [
     {
@@ -526,10 +648,10 @@
     }
    ],
    "source": [
-    "fc0 = model.graph.node[2]\n",
-    "fc1 = model.graph.node[3]\n",
-    "fc2 = model.graph.node[4]\n",
-    "fc3 = model.graph.node[5]\n",
+    "fc0 = model.graph.node[0]\n",
+    "fc1 = model.graph.node[1]\n",
+    "fc2 = model.graph.node[2]\n",
+    "fc3 = model.graph.node[3]\n",
     "print(\"fc0 has the op_type: \" + str(fc0.op_type))\n",
     "print(\"fc1 has the op_type: \" + str(fc1.op_type))\n",
     "print(\"fc2 has the op_type: \" + str(fc2.op_type))\n",
@@ -545,7 +667,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -578,14 +700,12 @@
        " 'sim_cycles': ('i', False, 0)}"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "from finn.custom_op.registry import getCustomOp\n",
-    "\n",
     "fc0w = getCustomOp(fc0)\n",
     "fc1w = getCustomOp(fc1)\n",
     "fc2w = getCustomOp(fc2)\n",
@@ -598,7 +718,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 27,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -618,14 +738,22 @@
     "fc3w.set_nodeattr(\"PE\", 10)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Finally, we will run the `InsertTLastMarker` transformation to get a `TLastMarker` node at the output of this graph, which is necessary to run the DMA engines correctly. "
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 28,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# save model with other name for section \"Simulation using C++\"\"\n",
-    "model.save(build_dir+\"/tfc_w1_a1_after_conv_to_hls.onnx\")"
+    "from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker\n",
+    "model = model.transform(InsertTLastMarker())\n",
+    "model.save(build_dir+\"/tfc_w1_a1_set_folding_factors.onnx\")"
    ]
   },
   {
@@ -642,8 +770,8 @@
     "## 3. Vivado HLS and Vivado synthesis <a id='vivado'></a>\n",
     "* [Generating HLS Code](#hls_per_layer)\n",
     "* [Synthesizing HLS to IP Blocks](#hls_synth)\n",
-    "* [Creation of stitched design](#stitched_design)\n",
-    "* [PYNQ shell project](#pynq_shell)\n",
+    "* [IP Stitching](#ip_stitching)\n",
+    "* [Inserting the IP into a PYNQ Shell](#pynq_shell)\n",
     "* [Synthesis, place and route](#synth_pl_ro)\n",
     "\n",
     "As we will be performing FPGA synthesis in these tasks, we'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting."
@@ -651,7 +779,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 29,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -681,7 +809,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -691,102 +819,18 @@
     "model = model.transform(CodeGen_ipgen(fpga_part, target_clk_ns))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Each `fpgadataflow` node will have its own code generation directory, which we can examine as follows:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "hls_syn_StreamingFCLayer_Batch_0.tcl  thresh.h\r\n",
-      "params.h\t\t\t      top_StreamingFCLayer_Batch_0.cpp\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "fc0w = getCustomOp(model.graph.node[2])\n",
-    "codegen_dir = fc0w.get_nodeattr(\"code_gen_dir_ipgen\")\n",
-    "! ls {codegen_dir}"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "We can see the various generated files. In particular, the `top*.cpp` will contain the Vivado HLS function call that instantiates the correct `finn-hlslib` library component for this node:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "\r\n",
-      "#include \"bnn-library.h\"\r\n",
-      "// includes for network parameters\r\n",
-      "#include \"weights.hpp\"\r\n",
-      "#include \"activations.hpp\"\r\n",
-      "#include \"params.h\"\r\n",
-      "#include \"thresh.h\"\r\n",
-      "\r\n",
-      "// defines for network parameters\r\n",
-      "#define MW1 784\r\n",
-      " #define MH1 64\r\n",
-      " #define SIMD1 16\r\n",
-      "\r\n",
-      "            #define PE1 16\r\n",
-      " #define WMEM1 196\r\n",
-      " #define TMEM1 4\r\n",
-      "\r\n",
-      "            #define numReps 1\r\n",
-      "#define PRAGMA_SUB(x) _Pragma (#x)\r\n",
-      "#define DO_PRAGMA(x) PRAGMA_SUB(x)\r\n",
-      "\r\n",
-      "void StreamingFCLayer_Batch_0(hls::stream<ap_uint<16>> &in0,\r\n",
-      "                hls::stream<ap_uint<16>> &out\r\n",
-      "                )\r\n",
-      "{\r\n",
-      "#pragma HLS INTERFACE axis port=in0\r\n",
-      "#pragma HLS INTERFACE axis port=out\r\n",
-      "#pragma HLS INTERFACE ap_ctrl_none port=return\r\n",
-      "DO_PRAGMA(HLS ARRAY_PARTITION variable=weights.m_weights complete dim=1)\r\n",
-      "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds complete dim=1)\r\n",
-      "DO_PRAGMA(HLS ARRAY_PARTITION variable=threshs.m_thresholds complete dim=3)\r\n",
-      "StreamingFCLayer_Batch<MW1, MH1, SIMD1, PE1, Recast<XnorMul>, Identity, Identity>\r\n",
-      "            (in0, out, weights, threshs, numReps, ap_resource_lut());\r\n",
-      "}\r\n"
-     ]
-    }
-   ],
-   "source": [
-    "! cat {codegen_dir}/top_StreamingFCLayer_Batch_0.cpp"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "### Synthesizing HLS to IP Blocks <a id='hls_synth'></a>\n",
     "\n",
-    "Now that we have generated the HLS code for each layer, we can call the `HLSSynth_IPGen` transformation to convert the generated HLS into Vivado IP blocks. As this involves calling HLS synthesis, this transformation will run for some time."
+    "Now that we have generated the HLS code for each layer, we can call the `HLSSynth_IPGen` transformation to convert the generated HLS into Vivado IP blocks. **As this involves calling HLS synthesis, this transformation will run for some time (several minutes).**"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 31,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -799,30 +843,32 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Each StreamingFCLayer_Batch node now has new attributes which can be examined more closely with netron."
+    "Each `StreamingFCLayer_Batch` node now has new attributes which can be examined more closely with netron."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 32,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Serving 'lfc_w1_a1.onnx' at http://0.0.0.0:8081\n"
+      "\n",
+      "Stopping http://0.0.0.0:8081\n",
+      "Serving '/workspace/finn/tfc_w1_a1_ipgen.onnx' at http://0.0.0.0:8081\n"
      ]
     }
    ],
    "source": [
-    "model.save(build_dir+\"/tfc_w1_a1.onnx\")\n",
-    "netron.start(build_dir+\"/tfc_w1_a1.onnx\", port=8081, host=\"0.0.0.0\")"
+    "model.save(build_dir+\"/tfc_w1_a1_ipgen.onnx\")\n",
+    "netron.start(build_dir+\"/tfc_w1_a1_ipgen.onnx\", port=8081, host=\"0.0.0.0\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 33,
    "metadata": {
     "scrolled": true
    },
@@ -830,7 +876,7 @@
     {
      "data": {
       "text/html": [
-       "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>\n"
+       "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"800\"></iframe>\n"
       ],
       "text/plain": [
        "<IPython.core.display.HTML object>"
@@ -842,7 +888,7 @@
    ],
    "source": [
     "%%html\n",
-    "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"400\"></iframe>"
+    "<iframe src=\"http://0.0.0.0:8081/\" style=\"position: relative; width: 100%;\" height=\"800\"></iframe>"
    ]
   },
   {
@@ -858,7 +904,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
+   "execution_count": 34,
    "metadata": {},
    "outputs": [
     {
@@ -873,7 +919,7 @@
     }
    ],
    "source": [
-    "fc0w = getCustomOp(model.graph.node[2])\n",
+    "fc0w = getCustomOp(model.graph.node[0])\n",
     "code_gen_dir = fc0w.get_nodeattr(\"code_gen_dir_ipgen\")\n",
     "!ls {code_gen_dir}"
    ]
@@ -887,7 +933,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 35,
    "metadata": {},
    "outputs": [
     {
@@ -895,8 +941,8 @@
      "output_type": "stream",
      "text": [
       "#!/bin/bash \r\n",
-      "cd /tmp/code_gen_ipgen_StreamingFCLayer_Batch_a00d34e6\r\n",
-      "vivado_hls /tmp/code_gen_ipgen_StreamingFCLayer_Batch_a00d34e6/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
+      "cd /tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g\r\n",
+      "vivado_hls /tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g/hls_syn_StreamingFCLayer_Batch_0.tcl\r\n",
       "cd /workspace/finn\r\n"
      ]
     }
@@ -917,7 +963,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [
     {
@@ -927,9 +973,9 @@
       "\r\n",
       "set config_proj_name project_StreamingFCLayer_Batch_0\r\n",
       "puts \"HLS project: $config_proj_name\"\r\n",
-      "set config_hwsrcdir \"/tmp/code_gen_ipgen_StreamingFCLayer_Batch_a00d34e6\"\r\n",
+      "set config_hwsrcdir \"/tmp/finn_maltanar_22115/code_gen_ipgen_StreamingFCLayer_Batch_bwxffr0g\"\r\n",
       "puts \"HW source dir: $config_hwsrcdir\"\r\n",
-      "set config_proj_part \"xc7z020clg400-1\"\r\n",
+      "set config_proj_part \"xczu3eg-sbva484-1-e\"\r\n",
       "\r\n",
       "set config_bnnlibdir \"/workspace/finn-hlslib\"\r\n",
       "\r\n",
@@ -968,7 +1014,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -980,15 +1026,150 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### Creation of stitched design <a id='stitched_design'></a>\n",
-    "The goal of this transformation is the creation of a Vivado IP Block Design project from all the generated IPs of a model. All nodes in the model must have the fpgadataflow backend attribute, and the CodeGen_ipgen transformation must have been previously run on the model. The resulting block design is also packaged as IP."
+    "### IP Stitching <a id='ip_stitching'></a>\n",
+    "\n",
+    "We now have IP blocks for each of our layers, and will stitch them together into a larger IP that implements the whole network using the `CodeGen_ipstitch` transformation. Bear in mind that this transformation can only be applied on a graph that only contains HLS nodes that already have been through the `HLSSynth_IPGen` transformation, which is the last step we performed. **This invokes Vivado and may take a few minutes to run.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch\n",
+    "\n",
+    "model = model.transform(CodeGen_ipstitch(fpga_part))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you examine the nodes themselves on the transformed model you won't see a difference, because the IP stitching adds model-level metadata to the graph. This can be accessed using the `.model.metadata_props`, the `get_metadata_prop` function in `ModelWrapper`, or by clicking on the global input/output tensors in Netron."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[key: \"vivado_stitch_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       ", key: \"vivado_stitch_vlnv\"\n",
+       "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       "]"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.model.metadata_props"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0'"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.get_metadata_prop(\"vivado_stitch_proj\")"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### PYNQ shell project <a id='pynq_shell'></a>"
+    "If you navigate to the folder above (remember the /tmp/finn_xxx folder is mounted on the host as well as inside Docker) you can open the Vivado project (.xpr) file there using Vivado, and view the following stitched IP block design:"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "![](stitched_ip.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Inserting the IP into a PYNQ Shell <a id='pynq_shell'></a>\n",
+    "\n",
+    "We are almost done preparing our hardware design. To deploy our accelerator on a PYNQ platform, it needs to be put inside an appropriate *shell* that bridges it with the interfaces that the underlying system exposes. FINN makes it easy to create a PYNQ-compatible overlay by inserting the stitched IP into an appropriate PYNQ shell with the `MakePYNQProject` transformation, and view the created PYNQ shell project directory using the `metadata_props`. **This invokes Vivado and may take a few minutes to run.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[key: \"vivado_stitch_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       ", key: \"vivado_stitch_vlnv\"\n",
+       "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"vivado_pynq_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0\"\n",
+       "]"
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject\n",
+    "\n",
+    "model = model.transform(MakePYNQProject(pynq_board))\n",
+    "model.model.metadata_props"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ip_config.tcl\t resizer.hw\t\tresizer.srcs\t  vivado.jou\r\n",
+      "make_project.sh  resizer.ip_user_files\tresizer.xpr\t  vivado.log\r\n",
+      "resizer.cache\t resizer.sim\t\tsynth_project.sh  vivado_pid24853.str\r\n"
+     ]
+    }
+   ],
+   "source": [
+    "! ls {model.get_metadata_prop(\"vivado_pynq_proj\")}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If we open the created Vivado project (.xpr) under the `vivado_pynq_proj` directory above, we can see the system-level block design as below, with the FINN-generated part of the design highlighted. Various other components, such as the DMA engine and data width converters, have also been instantiated.\n",
+    "![](pynq_shell_project.png)"
    ]
   },
   {
@@ -998,6 +1179,55 @@
     "### Synthesis, place and route <a id='synth_pl_ro'></a>"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We are now ready for the final hardware generation step, which is synthesis, place and route to generate an FPGA bitfile. This can be done by either running the `synth_project.sh` script in the generated Vivado PYNQ project directory inside Docker, or by executing the `SynthPYNQProject` transformation. **This step involves launching Vivado for synthesis and may take a few hours.**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[key: \"vivado_stitch_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       ", key: \"vivado_stitch_vlnv\"\n",
+       "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"vivado_pynq_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0\"\n",
+       ", key: \"vivado_pynq_bitfile\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0/resizer.bit\"\n",
+       "]"
+      ]
+     },
+     "execution_count": 44,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.save(build_dir + \"/tfc_w1_a1_pre_synthesis.onnx\")\n",
+    "\n",
+    "from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject\n",
+    "\n",
+    "model = model.transform(SynthPYNQProject())\n",
+    "model.model.metadata_props"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model.save(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1005,6 +1235,38 @@
     "## 4. Hardware test <a id='hw_test'></a>"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[key: \"vivado_stitch_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_stitch_proj_nfte0nh0\"\n",
+       ", key: \"vivado_stitch_vlnv\"\n",
+       "value: \"xilinx_finn:finn:finn_design:1.0\"\n",
+       ", key: \"vivado_pynq_proj\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0\"\n",
+       ", key: \"vivado_pynq_bitfile\"\n",
+       "value: \"/tmp/finn_maltanar_22115/vivado_pynq_proj_bj_z4tm0/resizer.bit\"\n",
+       ", key: \"pynq_driver_dir\"\n",
+       "value: \"/tmp/finn_maltanar_22115/pynq_driver_63xiuej8\"\n",
+       "]"
+      ]
+     },
+     "execution_count": 49,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver\n",
+    "model = ModelWrapper(build_dir + \"/tfc_w1_a1_post_synthesis.onnx\")\n",
+    "model = model.transform(MakePYNQDriver())"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
diff --git a/notebooks/pynq_shell_project.png b/notebooks/pynq_shell_project.png
new file mode 100644
index 0000000000000000000000000000000000000000..b1f3c5e4f6231ca692fddc2e6a1e14cdea49dc20
Binary files /dev/null and b/notebooks/pynq_shell_project.png differ
diff --git a/notebooks/stitched_ip.png b/notebooks/stitched_ip.png
new file mode 100644
index 0000000000000000000000000000000000000000..64b96d3451f33ebad99befcf903aba9c57052f79
Binary files /dev/null and b/notebooks/stitched_ip.png differ
diff --git a/run-docker.sh b/run-docker.sh
index d774f0871d7b52e1015f4817a0e8aa49bef5a427..77441ed955c8a055ac57a7328f2998f8855c20e9 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -14,7 +14,9 @@ DOCKER_PASSWD="finn"
 # containers from the same user
 DOCKER_RND=$(shuf -i0-32768 -n1)
 DOCKER_TAG="finn_${DOCKER_UNAME}"
-DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
+# uncomment to run multiple instances with different names
+# DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
+DOCKER_INST_NAME="finn_${DOCKER_UNAME}"
 : ${JUPYTER_PORT=8888}
 : ${NETRON_PORT=8081}
 
@@ -38,6 +40,7 @@ PYVERILATOR_LOCAL=$SCRIPTPATH/pyverilator
 PYNQSHELL_LOCAL=$SCRIPTPATH/PYNQ-HelloWorld
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
+: ${VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache}
 
 # clone dependency repos
 git clone --branch feature/finn_onnx_export $BREVITAS_REPO $BREVITAS_LOCAL ||  git -C "$BREVITAS_LOCAL" pull
@@ -49,6 +52,7 @@ git clone $PYNQSHELL_REPO $PYNQSHELL_LOCAL ||  git -C "$PYNQSHELL_LOCAL" pull
 
 # ensure build dir exists locally
 mkdir -p $BUILD_LOCAL
+mkdir -p $VIVADO_IP_CACHE
 
 echo "Instance is named as $DOCKER_INST_NAME"
 echo "Mounting $SCRIPTPATH into /workspace/finn"
@@ -62,6 +66,7 @@ echo "Mounting $BUILD_LOCAL into $BUILD_LOCAL"
 echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
 echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 echo "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
+echo "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 
 if [ "$1" = "test" ]; then
         echo "Running test suite"
@@ -101,6 +106,7 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e VIVADO_PATH=$VIVADO_PATH \
 -e FINN_INST_NAME=$DOCKER_INST_NAME \
 -e FINN_ROOT="/workspace/finn" \
+-e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
 -p $JUPYTER_PORT:$JUPYTER_PORT \
 -p $NETRON_PORT:$NETRON_PORT \
 $DOCKER_TAG bash -c "$DOCKER_CMD"
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 75a475acbc2c58ddddc1e8ff50dbc34cecf69f27..eb705c724b99a4897197addad4561cb9b3d12940 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -299,3 +299,15 @@ compilation transformations?
     @abstractmethod
     def pragmas(self):
         pass
+
+    def get_folded_input_shape(self):
+        raise Exception("get_folded_input_shape not implemented for this op")
+
+    def get_folded_output_shape(self):
+        raise Exception("get_folded_output_shape not implemented for this op")
+
+    def get_instream_width(self):
+        raise Exception("get_instream_width not implemented for this op")
+
+    def get_outstream_width(self):
+        raise Exception("get_outstream_width not implemented for this op")
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index ce21ad38c842bf967e96c06ad39525d4b7690297..be9b51e6a7b1b3e255cd2ee8baf10937b95f8665 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -208,10 +208,21 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         o_bits = self.get_output_datatype().bitwidth()
         return o_bits * self.get_nodeattr("PE")
 
-    def get_number_output_values(self):
+    def get_folded_input_shape(self):
+        mw = self.get_nodeattr("MW")
+        simd = self.get_nodeattr("SIMD")
+        sf = mw // simd
+        return (1, sf, simd)
+
+    def get_folded_output_shape(self):
         mh = self.get_nodeattr("MH")
         pe = self.get_nodeattr("PE")
-        return mh // pe
+        nf = mh // pe
+        return (1, nf, pe)
+
+    def get_number_output_values(self):
+        nf = self.get_folded_output_shape()[1]
+        return nf
 
     def get_template_param_values(self):
         ret = dict()
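
A worked example of the folding arithmetic above, using the first-layer parameters from the tutorial notebook (MW=784, SIMD=16, MH=64, PE=16):

```python
mw, simd = 784, 16
mh, pe = 64, 16
folded_ishape = (1, mw // simd, simd)  # (1, 49, 16): 49 stream words of 16 elems
folded_oshape = (1, mh // pe, pe)      # (1, 4, 16)
n_output_values = folded_oshape[1]     # 4, as returned by get_number_output_values()
```
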
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/tlastmarker.py
index d1bbca7a8d1c2db91e047fed505faa19024fbf0e..9a372179042a7236c15a02cf4191ec608836256c 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/tlastmarker.py
@@ -8,8 +8,10 @@ class TLastMarker(HLSCustomOp):
     def get_nodeattr_types(self):
         my_attrs = {
             "NumIters": ("i", True, 0),
-            # width of input-output data streams
+            # width of input-output data streams, in bits
             "StreamWidth": ("i", True, 0),
+            # width of individual element in stream, in bits
+            "ElemWidth": ("i", True, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -87,3 +89,21 @@ class TLastMarker(HLSCustomOp):
 
     def get_number_output_values(self):
         return self.get_nodeattr("NumIters")
+
+    def get_folded_input_shape(self):
+        stream_width = self.get_nodeattr("StreamWidth")
+        elem_width = self.get_nodeattr("ElemWidth")
+        n_packed_elems = stream_width // elem_width
+        n_iters = self.get_nodeattr("NumIters")
+        return (1, n_iters, n_packed_elems)
+
+    def get_folded_output_shape(self):
+        return self.get_folded_input_shape()
+
+    def get_instream_width(self):
+        stream_width = self.get_nodeattr("StreamWidth")
+        return stream_width
+
+    def get_outstream_width(self):
+        stream_width = self.get_nodeattr("StreamWidth")
+        return stream_width
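
The new `ElemWidth` attribute lets `TLastMarker` derive how many elements are packed per stream word; with the values asserted by the updated test further down (StreamWidth=320, ElemWidth=32, NumIters=1):

```python
stream_width, elem_width, num_iters = 320, 32, 1
n_packed_elems = stream_width // elem_width    # 10 elements per stream word
folded_shape = (1, num_iters, n_packed_elems)  # (1, 1, 10) for input and output
```
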
diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
new file mode 100644
index 0000000000000000000000000000000000000000..051c207511a11fbd03245857a6378836037084b7
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py
@@ -0,0 +1,48 @@
+from onnx import TensorProto
+from onnx import helper as oh
+
+from finn.custom_op.registry import getCustomOp
+from finn.transformation import Transformation
+
+
+class InsertTLastMarker(Transformation):
+    """Ensure that the graph is terminated with a TLastMarker node, inserting
+    one if necessary."""
+
+    def __init__(self):
+        super().__init__()
+
+    def apply(self, model):
+        # TODO only makes sense for a pure fpgadataflow graph -- check!
+        graph_out_name = model.graph.output[0].name
+        final_node = model.find_producer(graph_out_name)
+        if final_node.op_type == "TLastMarker":
+            # TODO maybe check the correctness of properties
+            return (model, False)
+        else:
+            custom_op = getCustomOp(final_node)
+            num_iters = int(custom_op.get_number_output_values())
+            stream_width = int(custom_op.get_outstream_width())
+            out_shape = model.get_tensor_shape(graph_out_name)
+            out_dtype = model.get_tensor_datatype(graph_out_name)
+            elem_width = out_dtype.bitwidth()
+            # make new buffer
+            final_node_out = oh.make_tensor_value_info(
+                model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
+            )
+            model.graph.value_info.append(final_node_out)
+            model.set_tensor_datatype(final_node_out.name, out_dtype)
+            # reroute final node output to final_node_out_name
+            final_node.output[0] = final_node_out.name
+            tlast_node = oh.make_node(
+                "TLastMarker",
+                [final_node_out.name],
+                [graph_out_name],
+                NumIters=num_iters,
+                StreamWidth=stream_width,
+                ElemWidth=elem_width,
+                domain="finn",
+                backend="fpgadataflow",
+            )
+            model.graph.node.append(tlast_node)
+            return (model, True)
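
A usage sketch for the new transformation, mirroring the notebook cell above (the .onnx path is illustrative):

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker

model = ModelWrapper("dataflow_model.onnx")  # hypothetical HLS-only model
model = model.transform(InsertTLastMarker())
# idempotent: a second application detects the existing TLastMarker
model = model.transform(InsertTLastMarker())
```
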
diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a252a43189996085829e6ba623cbcaddf714cb
--- /dev/null
+++ b/src/finn/transformation/fpgadataflow/make_deployment.py
@@ -0,0 +1,53 @@
+import os
+from distutils.dir_util import copy_tree
+from shutil import copy
+
+from finn.transformation import Transformation
+from finn.util.basic import make_build_dir
+
+
+class DeployToPYNQ(Transformation):
+    """Collects all necessary files for deployment and copies them to the PYNQ board.
+    Expects information about PYNQ board to make scp possible:
+    * ip address of board
+    * username and password for board
+    * target directory where the files are stored on the board"""
+
+    def __init__(self, ip, username, password, target_dir):
+        super().__init__()
+        self.ip = ip
+        self.username = username
+        self.password = password
+        self.target_dir = target_dir
+
+    def apply(self, model):
+        # set metadata properties accordingly to user input specifications
+        model.set_metadata_prop("pynq_ip", self.ip)
+        model.set_metadata_prop("pynq_username", self.username)
+        model.set_metadata_prop("pynq_password", self.password)
+        model.set_metadata_prop("pynq_target_dir", self.target_dir)
+
+        # create directory for deployment files
+        deployment_dir = make_build_dir(prefix="pynq_deployment_")
+        model.set_metadata_prop("pynq_deployment_dir", deployment_dir)
+
+        # get and copy necessary files
+        # .bit and .hwh file
+        bitfile = None
+        hwhfile = None
+        vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj")
+        for file in os.listdir(vivado_pynq_proj):
+            if file.endswith(".bit"):
+                bitfile = os.path.join(vivado_pynq_proj, file)
+            elif file.endswith(".hwh"):
+                hwhfile = os.path.join(vivado_pynq_proj, file)
+        assert bitfile is not None and hwhfile is not None, "Missing .bit or .hwh"
+        copy(bitfile, deployment_dir)
+        copy(hwhfile, deployment_dir)
+
+        # driver.py and python libraries
+        pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir")
+        copy_tree(pynq_driver_dir, deployment_dir)
+        model.set_metadata_prop("pynq_deploy_dir", deployment_dir)
+
+        return (model, False)
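
A usage sketch, mirroring the new test at the bottom of this diff; the board address, credentials and paths are placeholders:

```python
from finn.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ

model = ModelWrapper("tfc_w1_a1_post_synthesis.onnx")  # hypothetical model file
model = model.transform(
    DeployToPYNQ("192.168.2.99", "xilinx", "xilinx", "/home/xilinx/finn")
)
print(model.get_metadata_prop("pynq_deploy_dir"))
```
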
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
index a845188727cde28aa3eed89877542e21f9862f55..d218e2a1cbedb396067aae009ed7dc52f446491d 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 
+from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
 from finn.util.basic import gen_finn_dt_tensor, get_finn_root, make_build_dir
 from finn.util.data_packing import finnpy_to_packed_bytearray
@@ -18,9 +19,8 @@ class MakePYNQDriver(Transformation):
     value.
     """
 
-    def __init__(self, platform):
+    def __init__(self):
         super().__init__()
-        self.platform = platform
 
     def apply(self, model):
         vivado_pynq_proj = model.get_metadata_prop("vivado_pynq_proj")
@@ -35,15 +35,24 @@ class MakePYNQDriver(Transformation):
         # TODO convert this to an analysis pass
         i_tensor_name = model.graph.input[0].name
         o_tensor_name = model.graph.output[0].name
-        i_tensor_shape = tuple(model.get_tensor_shape(i_tensor_name))
-        o_tensor_shape = tuple(model.get_tensor_shape(o_tensor_name))
+        i_tensor_shape_normal = tuple(model.get_tensor_shape(i_tensor_name))
+        o_tensor_shape_normal = tuple(model.get_tensor_shape(o_tensor_name))
         i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
         o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
-        # generate dummy i/o tensors and their packed versions
-        i_tensor_dummy = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape)
-        o_tensor_dummy = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape)
-        i_tensor_dummy_packed = finnpy_to_packed_bytearray(i_tensor_dummy, i_tensor_dt)
-        o_tensor_dummy_packed = finnpy_to_packed_bytearray(o_tensor_dummy, o_tensor_dt)
+        # extract HLSCustomOp instances to get folded i/o shapes
+        first_node = getCustomOp(model.find_consumer(i_tensor_name))
+        last_node = getCustomOp(model.find_producer(o_tensor_name))
+        i_tensor_shape_folded = first_node.get_folded_input_shape()
+        o_tensor_shape_folded = last_node.get_folded_output_shape()
+        # generate dummy folded i/o tensors and their packed versions
+        i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded)
+        o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded)
+        i_tensor_dummy_packed = finnpy_to_packed_bytearray(
+            i_tensor_dummy_folded, i_tensor_dt
+        )
+        o_tensor_dummy_packed = finnpy_to_packed_bytearray(
+            o_tensor_dummy_folded, o_tensor_dt
+        )
         i_tensor_shape_packed = i_tensor_dummy_packed.shape
         o_tensor_shape_packed = o_tensor_dummy_packed.shape
 
@@ -51,11 +60,13 @@ class MakePYNQDriver(Transformation):
         driver_py = pynq_driver_dir + "/driver.py"
         driver = templates.pynq_driver_template
         driver = driver.replace("$INPUT_FINN_DATATYPE$", str(i_tensor_dt))
-        driver = driver.replace("$INPUT_SHAPE_UNPACKED$", str(i_tensor_shape))
+        driver = driver.replace("$INPUT_SHAPE_NORMAL$", str(i_tensor_shape_normal))
+        driver = driver.replace("$INPUT_SHAPE_FOLDED$", str(i_tensor_shape_folded))
         driver = driver.replace("$INPUT_SHAPE_PACKED$", str(i_tensor_shape_packed))
         driver = driver.replace("$OUTPUT_FINN_DATATYPE$", str(o_tensor_dt))
+        driver = driver.replace("$OUTPUT_SHAPE_NORMAL$", str(o_tensor_shape_normal))
+        driver = driver.replace("$OUTPUT_SHAPE_FOLDED$", str(o_tensor_shape_folded))
         driver = driver.replace("$OUTPUT_SHAPE_PACKED$", str(o_tensor_shape_packed))
-        driver = driver.replace("$OUTPUT_SHAPE_UNPACKED$", str(o_tensor_shape))
 
         with open(driver_py, "w") as f:
             f.write(driver)
diff --git a/src/finn/transformation/fpgadataflow/make_pynq_proj.py b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
index 010c53a45058d665822b88c3bbf4e350d7eb57c6..3a63f23336957dcc95ebedc2f8a45c144f4e2035 100644
--- a/src/finn/transformation/fpgadataflow/make_pynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_pynq_proj.py
@@ -1,6 +1,7 @@
 import os
 import subprocess
 
+from finn.custom_op.registry import getCustomOp
 from finn.transformation import Transformation
 from finn.util.basic import get_by_name, make_build_dir, roundup_to_integer_multiple
 
@@ -50,16 +51,13 @@ class MakePYNQProject(Transformation):
         ip_dirs += [ipstitch_path + "/ip"]
         ip_dirs_str = "[%s]" % (" ".join(ip_dirs))
 
-        # extract the actual in-out bytes from graph
-        # TODO convert this to an analysis pass
+        # extract HLSCustomOp instances to get i/o stream widths
         i_tensor_name = model.graph.input[0].name
         o_tensor_name = model.graph.output[0].name
-        i_tensor_shape = model.get_tensor_shape(i_tensor_name)
-        o_tensor_shape = model.get_tensor_shape(o_tensor_name)
-        i_tensor_dt = model.get_tensor_datatype(i_tensor_name)
-        o_tensor_dt = model.get_tensor_datatype(o_tensor_name)
-        i_bits_per_cycle = i_tensor_dt.bitwidth() * i_tensor_shape[-1]
-        o_bits_per_cycle = o_tensor_dt.bitwidth() * o_tensor_shape[-1]
+        first_node = getCustomOp(model.find_consumer(i_tensor_name))
+        last_node = getCustomOp(model.find_producer(o_tensor_name))
+        i_bits_per_cycle = first_node.get_instream_width()
+        o_bits_per_cycle = last_node.get_outstream_width()
         # ensure i/o is padded to bytes
         i_bits_per_cycle_padded = roundup_to_integer_multiple(i_bits_per_cycle, 8)
         o_bits_per_cycle_padded = roundup_to_integer_multiple(o_bits_per_cycle, 8)
@@ -71,6 +69,7 @@ class MakePYNQProject(Transformation):
         out_if_name = "out_r_0"
         clk_name = "ap_clk_0"
         nrst_name = "ap_rst_n_0"
+        vivado_ip_cache = os.getenv("VIVADO_IP_CACHE", default="")
 
         # create a temporary folder for the project
         vivado_pynq_proj_dir = make_build_dir(prefix="vivado_pynq_proj_")
@@ -87,6 +86,7 @@ class MakePYNQProject(Transformation):
             out_if_name,
             clk_name,
             nrst_name,
+            vivado_ip_cache,
         )
 
         with open(vivado_pynq_proj_dir + "/ip_config.tcl", "w") as f:
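
The stream widths now come from the HLS ops themselves rather than from tensor shapes; a small sketch of the byte-padding step, assuming a 16-bit input stream and a 10-bit output stream:

```python
from finn.util.basic import roundup_to_integer_multiple

i_bits_per_cycle = 16  # e.g. first_node.get_instream_width() for SIMD=16, 1-bit
i_bits_per_cycle_padded = roundup_to_integer_multiple(i_bits_per_cycle, 8)  # 16
o_bits_per_cycle_padded = roundup_to_integer_multiple(10, 8)  # 10 bits -> 16
```
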
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index c1a42e64cec0549c2b105e02039032433b8189b2..a613d0622ee95e7f1ca848142e2930cf6d3c91bd 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -9,6 +9,7 @@ variable config_ip_axis_name_out
 variable config_ip_use_axilite
 variable config_ip_project_dir
 variable config_output_products_dir
+variable config_remote_cache
 
 # for arguments involving paths below: use absolute paths or relative to the
 # platform/overlay/bitstream folder
@@ -36,6 +37,8 @@ set config_ip_clk_name %s
 set config_ip_nrst_name %s
 # whether the IP needs an AXI Lite interface for control
 set config_ip_use_axilite 0
+# Vivado OOC IP cache
+set config_remote_cache "%s"
 """
 
 call_pynqshell_makefile_template = """
@@ -64,22 +67,26 @@ dma=ol.axi_dma_0
 # declare input/output types and shapes for the accelerator
 # input FINN DataType
 idt = $INPUT_FINN_DATATYPE$
-# unpacked and packed input shapes
-ishape_unpacked = $INPUT_SHAPE_UNPACKED$
+# normal, folded and packed input shapes
+ishape_normal = $INPUT_SHAPE_NORMAL$
+ishape_folded = $INPUT_SHAPE_FOLDED$
 ishape_packed = $INPUT_SHAPE_PACKED$
 # output FINN DataType
 odt = $OUTPUT_FINN_DATATYPE$
-# unpacked and packed output shapes
+# normal, folded and packed output shapes
+oshape_normal = $OUTPUT_SHAPE_NORMAL$
+oshape_folded = $OUTPUT_SHAPE_FOLDED$
 oshape_packed = $OUTPUT_SHAPE_PACKED$
-oshape_unpacked = $OUTPUT_SHAPE_UNPACKED$
 
 # load desired input .npy file
-ibuf_unpacked = np.load("input.npy")
+ibuf_normal = np.load("input.npy")
 # ensure that shape is as expected
-assert ibuf_unpacked.shape == ishape_unpacked
+assert ibuf_normal.shape == ishape_normal
+# convert to folded form
+ibuf_folded = ibuf_normal.reshape(ishape_folded)
 
 # pack the input buffer
-ibuf_packed = finnpy_to_packed_bytearray(ibuf_unpacked, idt)
+ibuf_packed = finnpy_to_packed_bytearray(ibuf_folded, idt)
 # allocate a PYNQ buffer for the packed input buffer
 ibuf_packed_device = allocate(shape=ishape_packed, dtype=np.uint8)
 # copy the packed data into the PYNQ buffer
@@ -96,6 +103,8 @@ dma.sendchannel.wait()
 dma.recvchannel.wait()
 
 # unpack the packed output buffer from accelerator
-obuf_unpacked = packed_bytearray_to_finnpy(obuf_packed, odt, oshape_unpacked)
-np.save("output.npy", obuf_unpacked)
+obuf_folded = packed_bytearray_to_finnpy(obuf_packed, odt, oshape_folded)
+# reshape to normal form and save
+obuf_normal = obuf_folded.reshape(oshape_normal)
+np.save("output.npy", obuf_normal)
 """
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 6fc357dcc1a2e4de3b133eec2fa09b2a2fc0426e..63dc9227c4aef95f445fdd4b006b500500ffd658 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -8,6 +8,11 @@ import numpy as np
 
 from finn.core.datatype import DataType
 
+# mapping from PYNQ board names to FPGA part names
+pynq_part_map = dict()
+pynq_part_map["Ultra96"] = "xczu3eg-sbva484-1-e"
+pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1"
+
 
 def get_finn_root():
     "Return the root directory that FINN is cloned into."
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 3363a4fb9d07ef478e34377f1882a6644e331f96..58c62219287940eb6533d2513e66d2c9c33cfb01 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -277,7 +277,7 @@ def finnpy_to_packed_bytearray(ndarray, dtype):
         return np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring)
 
 
-def packed_bytearray_to_finnpy(packed_bytearray, dtype, output_shape=None):
+def packed_bytearray_to_finnpy(packed_bytearray, dtype, output_shape=None, reverse_inner=False):
     """Given a packed numpy uint8 ndarray, unpack it into a FINN array of
     given DataType. output_shape can be specified to remove padding from the
     packed dimension, or set to None to be inferred from the input."""
@@ -300,6 +300,6 @@ def packed_bytearray_to_finnpy(packed_bytearray, dtype, output_shape=None):
     packed_hexstring = np.apply_along_axis(
         npbytearray2hexstring, packed_dim, packed_bytearray
     )
-    ret = unpack_innermost_dim_from_hex_string(packed_hexstring, dtype, output_shape)
+    ret = unpack_innermost_dim_from_hex_string(packed_hexstring, dtype, output_shape, reverse_inner)
 
     return ret
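
A usage sketch for the new `reverse_inner` flag, assuming it flips the element order within the innermost (packed) dimension, as its pass-through to `unpack_innermost_dim_from_hex_string` suggests:

```python
import numpy as np

from finn.core.datatype import DataType
from finn.util.data_packing import packed_bytearray_to_finnpy

buf = np.array([[0b10000001]], dtype=np.uint8)  # 8 BINARY values in one byte
a = packed_bytearray_to_finnpy(buf, DataType.BINARY, (1, 8))
b = packed_bytearray_to_finnpy(buf, DataType.BINARY, (1, 8), reverse_inner=True)
# a and b should be element-reversed versions of each other along axis 1
```
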
diff --git a/tests/fpgadataflow/test_create_dataflow_partition.py b/tests/fpgadataflow/test_create_dataflow_partition.py
index e0b8b491b0ed926a56331a8e125f3de4ecd91615..5b78a2000e4327440cadfa81392e66742e07634f 100644
--- a/tests/fpgadataflow/test_create_dataflow_partition.py
+++ b/tests/fpgadataflow/test_create_dataflow_partition.py
@@ -1,14 +1,21 @@
 import os.path
 from pkgutil import get_data
 
+import pytest
+
 from finn.core.modelwrapper import ModelWrapper
 from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.create_dataflow_partition import (
     CreateDataflowPartition,
 )
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
+from finn.util.basic import make_build_dir
+
+build_dir = make_build_dir("test_dataflow_partition_")
 
 
-def test_create_dataflow_partition():
+@pytest.mark.dependency()
+def test_dataflow_partition_create():
     # load the onnx model
     raw_m = get_data(
         "finn", "data/onnx/finn-hls-model/tfc_w1_a1_after_conv_to_hls.onnx"
@@ -19,3 +26,21 @@ def test_create_dataflow_partition():
     sdp_node = getCustomOp(model.graph.node[2])
     assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
     assert os.path.isfile(sdp_node.get_nodeattr("model"))
+    model.save(build_dir + "/test_dataflow_partition_create.onnx")
+
+
+@pytest.mark.dependency(depends=["test_dataflow_partition_create"])
+def test_dataflow_partition_tlastmarker():
+    model = ModelWrapper(build_dir + "/test_dataflow_partition_create.onnx")
+    model_path = getCustomOp(model.graph.node[2]).get_nodeattr("model")
+    model = ModelWrapper(model_path)
+    model = model.transform(InsertTLastMarker())
+    assert model.graph.node[-1].op_type == "TLastMarker"
+    assert model.graph.node[-1].domain == "finn"
+    tl_node = getCustomOp(model.graph.node[-1])
+    assert tl_node.get_nodeattr("NumIters") == 1
+    assert tl_node.get_nodeattr("StreamWidth") == 320
+    assert tl_node.get_nodeattr("ElemWidth") == 32
+    model.save(build_dir + "/test_dataflow_partition_tlastmarker.onnx")
+    model = model.transform(InsertTLastMarker())
+    model.save(build_dir + "/test_dataflow_partition_tlastmarker2.onnx")
diff --git a/tests/fpgadataflow/test_data_packing.py b/tests/fpgadataflow/test_data_packing.py
index c9d0cd064bbc487a35ade1a2bffeaf40d32a458e..3616219ef0e1046e7ef1a6daf3c1bfb6528a21cc 100644
--- a/tests/fpgadataflow/test_data_packing.py
+++ b/tests/fpgadataflow/test_data_packing.py
@@ -1,4 +1,5 @@
 import shutil
+import os
 import subprocess
 
 import numpy as np
@@ -59,8 +60,8 @@ def make_npy2apintstream_testcase(ndarray, dtype):
         f.write("\n".join(test_app_string))
     cmd_compile = """
 g++ -o test_npy2apintstream test.cpp /workspace/cnpy/cnpy.cpp \
--I/workspace/cnpy/ -I/workspace/vivado-hlslib -I/workspace/finn/src/finn/data/cpp \
---std=c++11 -lz"""
+-I/workspace/cnpy/ -I{}/include -I/workspace/finn/src/finn/data/cpp \
+--std=c++11 -lz""".format(os.environ["VIVADO_PATH"])
     with open(test_dir + "/compile.sh", "w") as f:
         f.write(cmd_compile)
     compile = subprocess.Popen(
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 5d7d41a23908aabbab2f21ae5387bf2e46cba201..40b272efb9d6b00126312a9934f28c2a899bd942 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -1,4 +1,5 @@
-import os.path
+# import os.path
+import os
 
 import pytest
 
@@ -10,6 +11,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
 from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
 from finn.transformation.fpgadataflow.synth_pynq_proj import SynthPYNQProject
@@ -18,16 +20,11 @@ from finn.util.basic import (
     calculate_signed_dot_prod_range,
     gen_finn_dt_tensor,
     make_build_dir,
+    pynq_part_map,
 )
 
-# TODO control board/part for tests from a global place
-# settings for Ultra96
-test_fpga_part = "xczu3eg-sbva484-1-e"
-test_pynq_board = "Ultra96"
-
-# settings for PYNQ-Z1
-# test_fpga_part = "xc7z020clg400-1"
-# test_pynq_board = "Pynq-Z1"
+test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
+test_fpga_part = pynq_part_map[test_pynq_board]
 
 ip_stitch_model_dir = make_build_dir("test_fpgadataflow_ipstitch")
 
@@ -57,7 +54,7 @@ def create_one_fc_model():
         MW=m,
         MH=m,
         SIMD=m,
-        PE=m,
+        PE=m // 2,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -72,7 +69,8 @@ def create_one_fc_model():
         ["outp_tlast"],
         domain="finn",
         backend="fpgadataflow",
-        NumIters=1,
+        NumIters=2,
+        ElemWidth=odt.bitwidth(),
         StreamWidth=odt.bitwidth() * m,
     )
 
@@ -162,6 +160,7 @@ def create_two_fc_model():
         backend="fpgadataflow",
         NumIters=m,
         StreamWidth=2,
+        ElemWidth=odt.bitwidth(),
     )
 
     graph = helper.make_graph(
@@ -255,8 +254,35 @@ def test_fpgadataflow_ipstitch_pynq_synth():
 @pytest.mark.dependency(depends=["test_fpgadataflow_ipstitch_pynq_projgen"])
 def test_fpgadataflow_ipstitch_pynq_driver():
     model = ModelWrapper(ip_stitch_model_dir + "/test_fpgadataflow_pynq_projgen.onnx")
-    model = model.transform(MakePYNQDriver(test_pynq_board))
+    model = model.transform(MakePYNQDriver())
     driver_dir = model.get_metadata_prop("pynq_driver_dir")
     assert driver_dir is not None
     assert os.path.isdir(driver_dir)
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx")
+
+
+@pytest.mark.dependency(depends=["test_fpgadataflow_ipstitch_pynq_driver"])
+def test_fpgadataflow_ipstitch_pynq_deployment_folder():
+    model = ModelWrapper(
+        ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_driver.onnx"
+    )
+    ip = "172.21.165.113"
+    username = "xilinx"
+    password = "xilinx"
+    target_dir = "/home/xilinx/" + os.environ["FINN_INST_NAME"]
+    model = model.transform(DeployToPYNQ(ip, username, password, target_dir))
+    pynq_ip = model.get_metadata_prop("pynq_ip")
+    pynq_username = model.get_metadata_prop("pynq_username")
+    pynq_password = model.get_metadata_prop("pynq_password")
+    pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
+
+    assert pynq_ip == ip
+    assert pynq_username == username
+    assert pynq_password == password
+    assert pynq_target_dir == target_dir
+
+    deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
+    assert deployment_dir is not None
+    assert os.path.isdir(deployment_dir)
+
+    model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_pynq_deployment.onnx")