%% Cell type:markdown id: tags:
# FINN - End-to-End Flow
-----------------------------------------------------------------
In this notebook, we will show how to take a simple, binarized, fully-connected network trained on the MNIST dataset all the way down to a customized bitfile running on a PYNQ board.
This notebook is quite lengthy, and some of the cells (involving Vivado synthesis) may take up to an hour to finish running. To let you save and resume your progress, we will save the intermediate ONNX models generated in the various steps to disk, so that you can jump back directly to where you left off.
%% Cell type:markdown id: tags:
## Overview
The FINN compiler comes with many *transformations* that modify the ONNX representation of the network according to certain patterns. This notebook will demonstrate a *possible* sequence of such transformations to take a particular trained network all the way down to hardware, as shown in the figure below.
%% Cell type:markdown id: tags:
![](finn-design-flow-example.svg)
%% Cell type:markdown id: tags:
The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections, each in a different color and each comprising several flow steps. The flow starts in the top-left corner with the Brevitas export (green section), followed by the preparation of the network (blue section) for Vivado HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).
There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details, please take a look at the verification notebook, which you can find [here](tfc_end2end_verification.ipynb).
This Jupyter notebook is organized based on the sections described above. We will use the following helper functions: `showSrc` to show the source code of FINN library calls, and `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively, not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares).
%% Cell type:code id: tags:
``` python
from finn.util.visualization import showSrc, showInNetron
from finn.util.basic import make_build_dir
build_dir = "/workspace/finn"
```
%% Cell type:markdown id: tags:
## Outline
-------------
1. [Brevitas export](#brev_exp)
2. [Network preparation](#nw_prep)
3. [Vivado HLS and IPI](#vivado)
4. [PYNQ hardware generation and deployment](#hw_test)
%% Cell type:markdown id: tags:
## 1. Brevitas export <a id='brev_exp'></a>
FINN expects an ONNX model as input. This can be a model trained with [Brevitas](https://github.com/Xilinx/brevitas). Brevitas is a PyTorch library for quantization-aware training, and the FINN Docker image comes with several [example Brevitas networks](https://github.com/Xilinx/brevitas/tree/master/brevitas_examples/bnn_pynq). To show the FINN end-to-end flow, we'll use the TFC-w1a1 model as the example network.
First, a few things have to be imported. Then the model can be loaded with the pretrained weights.
%% Cell type:code id: tags:
``` python
import onnx
from finn.util.test import get_test_model_trained
import brevitas.onnx as bo
tfc = get_test_model_trained("TFC", 1, 1)
bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+"/tfc_w1_a1.onnx")
```
%% Output
Downloading: "https://github.com/Xilinx/brevitas/releases/download/bnn_pynq-r0/tfc_1w1a-ff8140dc.pth" to /home/maltanar/.cache/torch/checkpoints/tfc_1w1a-ff8140dc.pth
100%|██████████| 249052/249052 [00:00<00:00, 759439.97it/s]
/workspace/brevitas/brevitas_examples/bnn_pynq/models/FC.py:83: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.
x = 2.0 * x - torch.tensor([1.0], device=x.device)
%% Cell type:markdown id: tags:
The model has now been exported, loaded with the pretrained weights and saved under the name "tfc_w1_a1.onnx".
To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties.
%% Cell type:code id: tags:
``` python
showInNetron(build_dir+"/tfc_w1_a1.onnx")
```
%% Output
Serving '/workspace/finn/tfc_w1_a1.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1b8e6db128>
%% Cell type:markdown id: tags:
Now that we have the model in .onnx format, we can work with it using FINN. For this, FINN's `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model.
%% Cell type:code id: tags:
``` python
from finn.core.modelwrapper import ModelWrapper
model = ModelWrapper(build_dir+"/tfc_w1_a1.onnx")
```
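%% Cell type:markdown id: tags:
As a small illustrative sketch (not part of the original flow), here are two of the helper calls that `ModelWrapper` provides on top of the raw ONNX protobuf:
%% Cell type:code id: tags:
``` python
# Illustrative ModelWrapper usage: the underlying ONNX graph is exposed directly,
# and helpers such as get_tensor_shape() (also used later in this tutorial) query tensor properties.
print("Number of nodes:", len(model.graph.node))
print("Input tensor shape:", model.get_tensor_shape(model.graph.input[0].name))
```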
%% Cell type:markdown id: tags:
The model is now prepared and can be simulated using Python. How this works is described in the Jupyter notebook about verification, which can be found [here](tfc_end2end_verification.ipynb#simpy).
The model can also be processed in different ways. FINN is built around analysis and transformation passes, which can be applied to the model. An analysis pass extracts specific information about the model and returns it to the user in the form of a dictionary. A transformation pass changes the model and returns the changed model back to the FINN flow.
Since the goal in this notebook is to process the model to the point where a bitstream can be generated from it, the focus is on the transformations that are necessary for this. These are discussed in more detail in the next section.
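To make the distinction concrete, below is a small hand-written function in the spirit of an analysis pass (an illustrative sketch, not one of FINN's built-in passes): it takes the model and returns a dictionary, here simply counting how often each op type occurs in the graph.
%% Cell type:code id: tags:
``` python
# Illustrative, hand-written "analysis": model in, dictionary out --
# the same convention that FINN's built-in analysis passes follow.
def count_op_types(model):
    counts = {}
    for node in model.graph.node:
        counts[node.op_type] = counts.get(node.op_type, 0) + 1
    return counts

count_op_types(model)
```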
%% Cell type:markdown id: tags:
## 2. Network preparation <a id='nw_prep'></a>
* [FINN-style Dataflow Architectures](#dataflow_arch)
* [Tidy-up transformations](#basic_trafo)
* [Streamlining](#streamline)
* [Conversion to HLS layers](#hls_layers)
* [Creating a Dataflow Partition](#dataflow_partition)
* [Folding and Datawidth Converter, FIFO and TLastMarker Insertion](#folding)
In this section, we will put the network through a series of transformations that puts it in a form that can be stitched together to form a FINN-style dataflow architecture, yielding a high-performance, high-efficiency FPGA accelerator.
%% Cell type:markdown id: tags:
### FINN-style Dataflow Architectures <a id='dataflow_arch'></a>
We start with a quick recap of FINN-style dataflow architectures. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure below taken from the [FINN-R paper](https://arxiv.org/pdf/1809.04570.pdf):
![](finn-hw-arch.png)
In practice, the compute arrays are instantiated by function calls to optimized Vivado HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process.
%% Cell type:markdown id: tags:
### Tidy-up transformations <a id='basic_trafo'></a>
This section deals with some basic transformations, which act as a kind of "tidy-up" to make the model easier to process. They do not appear in the diagram above, but they are applied at many points in the FINN flow to postprocess the model after a transformation and/or to prepare it for the next one.
%% Cell type:markdown id: tags:
These transformations are:
* GiveUniqueNodeNames
* GiveReadableTensorNames
* InferShapes
* InferDataTypes
* FoldConstants
%% Cell type:markdown id: tags:
In the first two transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames`) the nodes in the graph are first given unique (enumerated) names, then the tensors are given human-readable names (based on the node names). The following two transformations (`InferShapes`, `InferDataTypes`) derive the shapes and data types of the tensors from the model properties and set them in the `ValueInfo` of the model. These transformations can almost always be applied without negative effects and do not change the structure of the graph; they ensure that all the information needed later is available.
The last listed transformation is `FoldConstants`, which performs constant folding. It identifies a node with constant inputs and computes its output. The result is then set as a constant-only input of the following node and the old node is removed. Although this transformation changes the structure of the model, it is almost always desirable and can be applied to any model.
%% Cell type:markdown id: tags:
These transformations can be imported and applied as follows.
%% Cell type:code id: tags:
``` python
from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
from finn.transformation.infer_shapes import InferShapes
from finn.transformation.infer_datatypes import InferDataTypes
from finn.transformation.fold_constants import FoldConstants
model = model.transform(InferShapes())
model = model.transform(FoldConstants())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
model = model.transform(InferDataTypes())
model.save(build_dir+"/tfc_w1_a1_tidy.onnx")
```
%% Cell type:markdown id: tags:
The result of these transformations can be viewed with Netron after the model has been saved again. By clicking on the individual nodes, you can now see, for example, that each node has been given a name. Also, the whole upper area of the graph has been folded away by `FoldConstants`, so that the first node is now "Reshape".
%% Cell type:code id: tags:
``` python
showInNetron(build_dir+"/tfc_w1_a1_tidy.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1_a1_tidy.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1add273cf8>
%% Cell type:markdown id: tags:
### Streamlining <a id='streamline'></a>
Streamlining is a transformation containing several sub-transformations. The goal of streamlining is to eliminate floating-point operations by moving them around, then collapsing them into one operation and finally transforming them into multi-threshold nodes. For more information on the theoretical background of this, see [this paper](https://arxiv.org/pdf/1709.04060).
Let's have a look at which sub-transformations `Streamline` consists of:
%% Cell type:code id: tags:
``` python
from finn.transformation.streamline import Streamline
showSrc(Streamline)
```
%% Output
class Streamline(Transformation):
    """Apply the streamlining transform, see arXiv:1709.04060."""

    def apply(self, model):
        streamline_transformations = [
            ConvertSubToAdd(),
            ConvertDivToMul(),
            BatchNormToAffine(),
            ConvertSignToThres(),
            AbsorbSignBiasIntoMultiThreshold(),
            MoveAddPastMul(),
            MoveScalarAddPastMatMul(),
            MoveAddPastConv(),
            MoveScalarMulPastMatMul(),
            MoveScalarMulPastConv(),
            MoveAddPastMul(),
            CollapseRepeatedAdd(),
            CollapseRepeatedMul(),
            AbsorbAddIntoMultiThreshold(),
            FactorOutMulSignMagnitude(),
            AbsorbMulIntoMultiThreshold(),
            Absorb1BitMulIntoMatMul(),
            Absorb1BitMulIntoConv(),
            RoundAndClipThresholds(),
        ]
        for trn in streamline_transformations:
            model = model.transform(trn)
            model = model.transform(RemoveIdentityOps())
            model = model.transform(GiveUniqueNodeNames())
            model = model.transform(GiveReadableTensorNames())
            model = model.transform(InferDataTypes())
        return (model, False)
%% Cell type:markdown id: tags:
As can be seen, several transformations are involved in streamlining: there are move and collapse transformations, and in the last step the remaining operations are absorbed into multi-threshold nodes. The involved transformations can be viewed in detail [here](https://github.com/Xilinx/finn/tree/master/src/finn/transformation/streamline). After each transformation, three of the tidy-up transformations (`GiveUniqueNodeNames`, `GiveReadableTensorNames` and `InferDataTypes`) are applied to the model.
After streamlining the network looks as follows:
%% Cell type:code id: tags:
``` python
model = ModelWrapper(build_dir+"/tfc_w1_a1_tidy.onnx")
model = model.transform(Streamline())
model.save(build_dir+"/tfc_w1_a1_streamlined.onnx")
showInNetron(build_dir+"/tfc_w1_a1_streamlined.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1_a1_streamlined.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1adc25df60>
%% Cell type:markdown id: tags:
You can see that the network has been simplified considerably compared to the previous step -- a lot of nodes have disappeared between the `MatMul` layers, and the `Sign` nodes have been replaced with `MultiThreshold` nodes.
**The current implementation of streamlining is highly network-specific and may not work for your network if its topology is very different from the example network here. We hope to rectify this in future releases.**
Our example network is a quantized network with 1-bit bipolar (-1, +1) precision, and we want FINN to implement its matrix multiplications as XNOR-popcount operations, [as described in the original FINN paper](https://arxiv.org/pdf/1612.07119). For this reason, after streamlining, the resulting bipolar matrix multiplications are converted into XNOR-popcount operations. This transformation produces additional operations that are again collapsed and absorbed into thresholds. This procedure is shown below.
%% Cell type:code id: tags:
``` python
from finn.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount
import finn.transformation.streamline.absorb as absorb
from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds
model = model.transform(ConvertBipolarMatMulToXnorPopcount())
model = model.transform(absorb.AbsorbAddIntoMultiThreshold())
model = model.transform(absorb.AbsorbMulIntoMultiThreshold())
model = model.transform(RoundAndClipThresholds())
model.save(build_dir+"/tfc_w1a1_ready_for_hls_conversion.onnx")
showInNetron(build_dir+"/tfc_w1a1_ready_for_hls_conversion.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1a1_ready_for_hls_conversion.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1adc2548d0>
%% Cell type:markdown id: tags:
Observe the pairs of `XnorPopcountMatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to HLS layers.
%% Cell type:markdown id: tags:
### Conversion to HLS layers <a id='hls_layers'></a>
This transformation converts nodes to HLS layers that correspond to functions in the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/) library. In our case, it converts pairs of binary `XnorPopcountMatMul` layers to `StreamingFCLayer_Batch` layers. Any immediately following `MultiThreshold` layers will also be absorbed into the MVTU (Matrix-Vector-Threshold Unit).
Below, the transformation is applied and the network is visualized using Netron to show the new structure with `StreamingFCLayer_Batch` nodes, each of which corresponds to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/fclayer.html#_CPPv4I_j_j_j_j000_i_i000E22StreamingFCLayer_BatchvRN3hls6streamI7ap_uintI9InStreamWEEERN3hls6streamI7ap_uintI10OutStreamWEEERK2TWRK2TAKjRK1R) library.
%% Cell type:markdown id: tags:
**Note:** The transformation `to_hls.InferBinaryStreamingFCLayer` is given the string "decoupled" as an argument; this specifies the `mem_mode` for the weights. In FINN there are different options for how the weights are stored and accessed. For details, please have a look at the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals.
%% Cell type:code id: tags:
``` python
import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
model = ModelWrapper(build_dir+"/tfc_w1a1_ready_for_hls_conversion.onnx")
model = model.transform(to_hls.InferBinaryStreamingFCLayer("decoupled"))
model.save(build_dir+"/tfc_w1_a1_hls_layers.onnx")
showInNetron(build_dir+"/tfc_w1_a1_hls_layers.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1_a1_hls_layers.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1adc254630>
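%% Cell type:markdown id: tags:
As a quick check (an illustrative sketch, not part of the original flow), we can confirm the `mem_mode` that the conversion has set on the new nodes:
%% Cell type:code id: tags:
``` python
# Illustrative check: print the mem_mode attribute of every StreamingFCLayer_Batch node
from finn.custom_op.registry import getCustomOp
for node in model.graph.node:
    if node.op_type == "StreamingFCLayer_Batch":
        print(node.name, "mem_mode =", getCustomOp(node).get_nodeattr("mem_mode"))
```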
%% Cell type:markdown id: tags:
Each `StreamingFCLayer_Batch` node has two attributes that specify the degree of folding, PE and SIMD. In all nodes these attributes default to 1, which corresponds to maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network.
%% Cell type:markdown id: tags:
### Creating a Dataflow Partition <a id='dataflow_partition'></a>
In the graph above, you can see that there is a mixture of FINN HLS layers (`StreamingFCLayer_Batch`) and regular ONNX layers (`Reshape`, `Mul`, `Add`). To create a bitstream, FINN needs a model with only HLS layers. To achieve this, we will use the `CreateDataflowPartition` transformation to create a "dataflow partition" in this graph, separating out the HLS layers into another model and replacing them with a placeholder layer called `StreamingDataflowPartition`:
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition
model = ModelWrapper(build_dir+"/tfc_w1_a1_hls_layers.onnx")
parent_model = model.transform(CreateDataflowPartition())
parent_model.save(build_dir+"/tfc_w1_a1_dataflow_parent.onnx")
showInNetron(build_dir+"/tfc_w1_a1_dataflow_parent.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1_a1_dataflow_parent.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1add27eba8>
%% Cell type:markdown id: tags:
We can see that the StreamingFCLayer instances have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:
%% Cell type:code id: tags:
``` python
from finn.custom_op.registry import getCustomOp
sdp_node = getCustomOp(parent_model.graph.node[2])
dataflow_model_filename = sdp_node.get_nodeattr("model")
showInNetron(dataflow_model_filename)
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/tmp/finn_dev_maltanar/dataflow_partition0_8y5bzo4x/df_model.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1b7af8d240>
%% Cell type:markdown id: tags:
We can see all the extracted `StreamingFCLayer` instances have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it.
%% Cell type:code id: tags:
``` python
model = ModelWrapper(dataflow_model_filename)
```
%% Cell type:markdown id: tags:
### Folding: Adjusting the Parallelism <a id='folding'></a>
*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters described in the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values, the faster the generated accelerator will run, and the more FPGA resources it will consume.
Since the folding parameters are node attributes, they can easily be accessed and changed using the helper functions of `ModelWrapper`. But first, let's take a closer look at one of the nodes that implement a `StreamingFCLayer_Batch` operation. This is where the Netron visualization helps us: in the diagram above we can see that the first four nodes are `StreamingFCLayer_Batch`, so as an example we extract the first one.
%% Cell type:markdown id: tags:
We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/master/src/finn/custom_op/fpgadataflow/__init__.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes.
%% Cell type:code id: tags:
``` python
fc0 = model.graph.node[0]
fc0w = getCustomOp(fc0)
print("CustomOp wrapper is of class " + fc0w.__class__.__name__)
fc0w.get_nodeattr_types()
```
%% Output
CustomOp wrapper is of class StreamingFCLayer_Batch
{'PE': ('i', True, 0),
'SIMD': ('i', True, 0),
'MW': ('i', True, 0),
'MH': ('i', True, 0),
'resType': ('s', True, ''),
'ActVal': ('i', False, 0),
'inputDataType': ('s', True, ''),
'weightDataType': ('s', True, ''),
'outputDataType': ('s', True, ''),
'accDataType': ('s', False, 'INT32'),
'binaryXnorMode': ('i', False, 0),
'noActivation': ('i', False, 0),
'numInputVectors': ('ints', False, [1]),
'mem_mode': ('s', False, 'const'),
'ram_style': ('s', False, 'auto'),
'backend': ('s', True, 'fpgadataflow'),
'code_gen_dir_cppsim': ('s', False, ''),
'code_gen_dir_ipgen': ('s', False, ''),
'executable_path': ('s', False, ''),
'ipgen_path': ('s', False, ''),
'ip_path': ('s', False, ''),
'ip_vlnv': ('s', False, ''),
'exec_mode': ('s', False, ''),
'cycles_rtlsim': ('i', False, 0),
'cycles_estimate': ('i', False, 0),
'rtlsim_trace': ('s', False, ''),
'res_estimate': ('s', False, ''),
'res_hls': ('s', False, ''),
'res_synth': ('s', False, ''),
'rtlsim_so': ('s', False, ''),
'partition_id': ('i', False, 0),
'inFIFODepth': ('i', False, 2),
'outFIFODepth': ('i', False, 2)}
%% Cell type:markdown id: tags:
We can see that the PE and SIMD are listed as node attributes, as well as the depths of the FIFOs that will be inserted between consecutive layers; all of these can be adjusted using `set_nodeattr`, subject to certain constraints (for instance, SIMD and PE must evenly divide the layer's matrix dimensions MW and MH, respectively).
**In this notebook we are setting the folding factors and FIFO depths manually, but in a future version we will support determining the folding factors given an FPGA resource budget according to the analytical model from the [FINN-R paper](https://arxiv.org/pdf/1809.04570).**
%% Cell type:code id: tags:
``` python
fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch")
# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer
config = [
(16, 49, 16, 64, "block"),
(8, 8, 64, 64, "auto"),
(8, 8, 64, 64, "auto"),
(10, 8, 64, 10, "distributed"),
]
for fcl, (pe, simd, ififo, ofifo, ramstyle) in zip(fc_layers, config):
    fcl_inst = getCustomOp(fcl)
    fcl_inst.set_nodeattr("PE", pe)
    fcl_inst.set_nodeattr("SIMD", simd)
    fcl_inst.set_nodeattr("inFIFODepth", ififo)
    fcl_inst.set_nodeattr("outFIFODepth", ofifo)
    fcl_inst.set_nodeattr("ram_style", ramstyle)
```
%% Cell type:markdown id: tags:
We are setting PE and SIMD so that the first three layers each have a total folding factor of (MW/SIMD) * (MH/PE) = 64.
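If you want to double-check this, the short sketch below (an illustrative check, not part of the original flow) recomputes the folding of each layer from its node attributes and verifies the divisibility constraints:
%% Cell type:code id: tags:
``` python
# Illustrative check: total folding per layer is (MW/SIMD) * (MH/PE).
# MW and MH were set when the layers were converted to StreamingFCLayer_Batch.
for fcl in fc_layers:
    inst = getCustomOp(fcl)
    mw, mh = inst.get_nodeattr("MW"), inst.get_nodeattr("MH")
    simd, pe = inst.get_nodeattr("SIMD"), inst.get_nodeattr("PE")
    assert mw % simd == 0 and mh % pe == 0, "SIMD/PE must evenly divide MW/MH"
    print("%s: folding = %d" % (fcl.name, (mw // simd) * (mh // pe)))
```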
%% Cell type:markdown id: tags:
Besides PE and SIMD, three other node attributes are set. `ram_style` specifies how the weights are to be stored (BRAM, LUTRAM, and so on); it can be selected explicitly, or you can let Vivado decide by using the option `auto`.
`inFIFODepth` and `outFIFODepth` specify the FIFO depths that the node needs from the surrounding FIFOs. These attributes are used by the `InsertFIFO` transformation to insert the appropriate FIFOs between the nodes, which is called automatically as part of the hardware build process.
In previous versions of FINN we had to call transformations to insert data width converters, FIFOs and a `TLastMarker` manually at this step. This is no longer needed, as all of this is taken care of by the `ZynqBuild` or `VitisBuild` transformations.
%% Cell type:code id: tags:
``` python
model.save(build_dir+"/tfc_w1_a1_set_folding_factors.onnx")
showInNetron(build_dir+"/tfc_w1_a1_set_folding_factors.onnx")
```
%% Output
Stopping http://0.0.0.0:8081
Serving '/workspace/finn/tfc_w1_a1_set_folding_factors.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f1adc266ba8>
%% Cell type:markdown id: tags:
This completes the network preparation and the network can be passed on to the next block *Vivado HLS and IPI*, which is described below.
%% Cell type:markdown id: tags:
## 3. Hardware Build <a id='vivado'></a>
We're finally ready to start generating hardware from our network. Depending on whether you want to target a Zynq or Alveo platform, FINN offers two transformations to build the accelerator, integrate it into an appropriate shell and generate a bitfile: `ZynqBuild` for Zynq and `VitisBuild` for Alveo. In this notebook we'll demonstrate `ZynqBuild`, as these boards are more common and bitfile generation completes much faster for the smaller FPGAs found on them.
As we will be dealing with FPGA synthesis tools in these tasks, we'll define two helper variables that describe the Xilinx FPGA part name and the PYNQ board name that we are targeting.
%% Cell type:code id: tags:
``` python
# print the names of the supported PYNQ boards
from finn.util.basic import pynq_part_map
print(pynq_part_map.keys())
```
%% Output
dict_keys(['Ultra96', 'Pynq-Z1', 'Pynq-Z2', 'ZCU102', 'ZCU104'])
%% Cell type:code id: tags:
``` python
# change this if you have a different PYNQ board, see list above
pynq_board = "Pynq-Z1"
fpga_part = pynq_part_map[pynq_board]
target_clk_ns = 10
```
%% Cell type:markdown id: tags:
In previous versions of FINN, we had to manually go through several steps to generate HLS code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
model = ModelWrapper(build_dir+"/tfc_w1_a1_set_folding_factors.onnx")
model = model.transform(ZynqBuild(platform = pynq_board, period_ns = target_clk_ns))
```
%% Output
/workspace/finn/src/finn/transformation/infer_data_layouts.py:107: UserWarning: Assuming 2D input is NC
warnings.warn("Assuming 2D input is NC")
%% Cell type:code id: tags:
``` python
model.save(build_dir + "/tfc_w1_a1_post_synthesis.onnx")
```
%% Cell type:markdown id: tags:
### Examining the generated outputs <a id='gen_outputs'></a>
TODO
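In the meantime, one way to peek at what `ZynqBuild` produced is to inspect the model metadata. The sketch below is illustrative only; the property names ("bitfile", "vivado_synth_rpt") are taken from the metadata shown later in this notebook and may differ between FINN versions.
%% Cell type:code id: tags:
``` python
# Hedged sketch: ZynqBuild records its outputs as model metadata properties
post_synth = ModelWrapper(build_dir + "/tfc_w1_a1_post_synthesis.onnx")
print("bitfile:", post_synth.get_metadata_prop("bitfile"))
print("synthesis report:", post_synth.get_metadata_prop("vivado_synth_rpt"))
```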
%% Cell type:markdown id: tags:
## 4. PYNQ deployment <a id='hw_test'></a>
* [Deployment and Remote Execution](#deploy)
* [Throughput Test on PYNQ Board](#throughput)
We are almost done preparing our hardware design. We'll now put it in a form suitable for use as a PYNQ overlay, synthesize and deploy it.
%% Cell type:markdown id: tags:
### Deployment and Remote Execution <a id='deploy'></a>
We'll now use the `DeployToPYNQ` transformation to create a deployment folder with the bitfile and driver file(s), and copy it to the PYNQ board. You can change the default IP address, username, password and target folder for the PYNQ board below.
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
ip = "192.168.2.99"
port = "22"
username = "xilinx"
password = "xilinx"
target_dir = "/home/xilinx/finn_tfc_end2end_example"
model = model.transform(DeployToPYNQ(ip, port, username, password, target_dir))
model.save(build_dir + "/tfc_w1_a1_pynq_deploy.onnx")
```
%% Cell type:markdown id: tags:
Let's verify that the remote access credentials are saved in the model metadata, and that the deployment folder has been successfully copied to the board:
%% Cell type:code id: tags:
``` python
model.model.metadata_props
```
%% Output
[key: "pynq_driver_dir"
value: "/tmp/finn_dev_maltanar/pynq_driver_f266zjp6"
value: "/tmp/finn_dev_maltanar/pynq_driver_ilfzbags"
, key: "vivado_pynq_proj"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj__qx46zlk"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj_erwcr5nk"
, key: "bitfile"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj__qx46zlk/resizer.bit"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj_erwcr5nk/resizer.bit"
, key: "hw_handoff"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj__qx46zlk/resizer.hwh"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj_erwcr5nk/resizer.hwh"
, key: "vivado_synth_rpt"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj__qx46zlk/synth_report.xml"
value: "/tmp/finn_dev_maltanar/vivado_zynq_proj_erwcr5nk/synth_report.xml"
, key: "platform"
value: "zynq-iodma"
, key: "pynq_ip"
value: "192.168.2.99"
, key: "pynq_port"
value: "22"
, key: "pynq_username"
value: "xilinx"
, key: "pynq_password"
value: "xilinx"
, key: "pynq_target_dir"
value: "/home/xilinx/finn_tfc_end2end_example"
, key: "pynq_deployment_dir"
value: "/tmp/finn_dev_maltanar/pynq_deployment_624bd_nc"
value: "/tmp/finn_dev_maltanar/pynq_deployment_eyiu4sxk"
, key: "pynq_deploy_dir"
value: "/tmp/finn_dev_maltanar/pynq_deployment_624bd_nc"
value: "/tmp/finn_dev_maltanar/pynq_deployment_eyiu4sxk"
, key: "exec_mode"
value: "remote_pynq"
]
%% Cell type:code id: tags:
``` python
! sshpass -p {password} ssh {username}@{ip} -p {port} 'ls -l {target_dir}/*'
```
%% Output
total 4212
/home/xilinx/finn_tfc_end2end_example/pynq_deployment_624bd_nc:
total 4228
-rw-r--r-- 1 xilinx xilinx 9391 Sep 4 10:37 driver.py
drwxr-xr-x 4 xilinx xilinx 4096 Sep 4 10:37 finn
-rw-r--r-- 1 xilinx xilinx 3264 Sep 4 10:38 input.npy
-rw-r--r-- 1 root root 205 Sep 4 10:39 nw_metrics.txt
-rw-r--r-- 1 root root 120 Sep 4 10:38 output.npy
-rw-r--r-- 1 xilinx xilinx 4045671 Sep 4 10:37 resizer.bit
-rw-r--r-- 1 xilinx xilinx 246211 Sep 4 10:37 resizer.hwh
-rw-r--r-- 1 root root 32 Sep 4 10:39 sds_trace_data.dat
/home/xilinx/finn_tfc_end2end_example/pynq_deployment_eyiu4sxk:
total 4212
-rw-r--r-- 1 xilinx xilinx 8493 Sep 5 01:24 driver.py
drwxr-xr-x 4 xilinx xilinx 4096 Sep 5 01:24 finn
-rw-r--r-- 1 xilinx xilinx 4045671 Sep 5 01:24 resizer.bit
-rw-r--r-- 1 xilinx xilinx 246211 Sep 5 01:24 resizer.hwh
%% Cell type:markdown id: tags:
We only have two more steps to be able to remotely execute the deployed bitfile with some test data from the MNIST dataset. Let's load up some test data that comes bundled with FINN.
%% Cell type:code id: tags:
``` python
from pkgutil import get_data
import onnx.numpy_helper as nph
import matplotlib.pyplot as plt
raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
x = nph.to_array(onnx.load_tensor_from_string(raw_i))
plt.imshow(x.reshape(28,28), cmap='gray')
```
%% Output
<matplotlib.image.AxesImage at 0x7f1b5b7cddd8>
%% Cell type:markdown id: tags:
Recall that we partitioned our original network into a parent graph that contained the non-synthesizable nodes and a child graph that contained the bulk of the network, which we turned into a bitfile. We'll load up the parent graph and modify the `StreamingDataflowPartition` node so that it points to the deployed ONNX graph.
%% Cell type:code id: tags:
``` python
parent_model = ModelWrapper(build_dir+"/tfc_w1_a1_dataflow_parent.onnx")
sdp_node = parent_model.graph.node[2]
remote_exec_model = build_dir + "/tfc_w1_a1_pynq_deploy.onnx"
getCustomOp(sdp_node).set_nodeattr("model", remote_exec_model)
parent_model.save(build_dir+"/tfc_w1_a1_dataflow_parent_with_remote_bitfile_exec.onnx")
```
%% Cell type:markdown id: tags:
Finally, we can call `execute_onnx` on the parent graph, which will internally call remote execution with the bitfile once the `StreamingDataflowPartition` node is reached, grab the results, then continue executing the last portion of the network.
%% Cell type:code id: tags:
``` python
import numpy as np
from finn.core.onnx_exec import execute_onnx
iname = parent_model.graph.input[0].name
oname = parent_model.graph.output[0].name
ishape = parent_model.get_tensor_shape(iname)
input_dict = {iname: x.reshape(ishape)}
ret = execute_onnx(parent_model, input_dict, True)
```
%% Cell type:markdown id: tags:
We'll pass the output of the network through a softmax function to interpret it as probabilities, and plot the per-class probabilities as a bar chart.
%% Cell type:code id: tags:
``` python
def softmax(x):
    """Compute softmax values for each set of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

logits = ret[oname].flatten()
prob = softmax(logits)
plt.bar(np.arange(10), prob)
```
%% Output
<BarContainer object of 10 artists>
%% Cell type:markdown id: tags:
We see that the network correctly predicts this as a digit 2 with high probability. This concludes our tutorial on how to take a simple fully-connected BNN all the way down to hardware with FINN, and execute it remotely on a PYNQ board.
%% Cell type:markdown id: tags:
### Throughput Test on PYNQ Board <a id='throughput'></a>
In addition to the functional verification, FINN also offers the possibility to measure the network performance directly on the PYNQ board. This can be done using the function `throughput_test_remote`, which we import and execute in the next section.
First we load the deployed child model (referenced by the `model` attribute of the `StreamingDataflowPartition` node) again and pass it to the function. The function returns the network metrics as a dictionary.
%% Cell type:code id: tags:
``` python
from finn.core.throughput_test import throughput_test_remote
child_model = ModelWrapper(getCustomOp(sdp_node).get_nodeattr("model"))
res = throughput_test_remote(child_model, 10000)
print("Network metrics:")
for key in res:
    print(str(key) + ": " + str(res[key]))
```
%% Output
Network metrics:
runtime[ms]: 7.472753524780273
throughput[images/s]: 1338194.8122387773
DRAM_in_bandwidth[Mb/s]: 131.14309159940018
DRAM_out_bandwidth[Mb/s]: 13.381948122387772
fclk[mhz]: 100.0
N: 10000
%% Cell type:markdown id: tags:
Together with the values chosen for folding, we can evaluate the performance of our accelerator. The first three layers each have a total folding factor of 64 and, because the network is fully pipelined, it follows that `II = 64`. II is the initiation interval and indicates how many clock cycles are needed to process one input.
%% Cell type:code id: tags:
``` python
II = 64
# frequency in MHz
f_MHz = 100
# expected throughput in MFPS
expected_throughput = f_MHz / II
# measured throughput (FPS) from throughput test, converted to MFPS
measured_throughput = res["throughput[images/s]"] * 0.000001
# performance
print("We reach approximately " + str(round((measured_throughput / expected_throughput)*100)) + "% of the ideal performance.")
```
%% Output
We reach approximately 86% of the ideal performance.
%% Cell type:markdown id: tags:
The measured values were recorded with a batch size of 10000 and at a frequency of 100 MHz. We will be improving the efficiency of the generated accelerator examples in the coming FINN releases.
%% Cell type:code id: tags:
``` python
```
%% Cell type:markdown id: tags:
# FINN - Functional Verification of End-to-End Flow
-----------------------------------------------------------------
**Important: This notebook depends on the tfc_end2end_example notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files have been generated before running this notebook.**
In this notebook, we will show how to take the intermediate results of the end-to-end TFC example and verify their functionality with different methods. In the following picture you can see the section of the end-to-end flow that covers the *Simulation & Emulation Flows*. Besides the methods in this notebook, there is one more that is covered in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb): remote execution. Remote execution allows functional verification directly on the PYNQ board; for details, please have a look at that notebook.
%% Cell type:markdown id: tags:
<img src="verification.png" alt="Drawing" style="width: 500px;"/>
%% Cell type:markdown id: tags:
We will use the following helper functions: `showSrc` to show the source code of FINN library calls, and `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively, not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares).
%% Cell type:code id: tags:
``` python
from finn.util.basic import make_build_dir
from finn.util.visualization import showSrc, showInNetron
build_dir = "/workspace/finn"
```
%% Cell type:markdown id: tags:
To verify the simulations, a "golden" output is calculated as a reference. This is calculated directly from the Brevitas model using PyTorch, by running some example data from the MNIST dataset through the trained model.
%% Cell type:code id: tags:
``` python
from pkgutil import get_data
import onnx
import onnx.numpy_helper as nph
import torch
from finn.util.test import get_test_model_trained
fc = get_test_model_trained("TFC", 1, 1)
raw_i = get_data("finn", "data/onnx/mnist-conv/test_data_set_0/input_0.pb")
input_tensor = onnx.load_tensor_from_string(raw_i)
input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()
output_golden = fc.forward(input_brevitas).detach().numpy()
output_golden
```
%% Output
array([[-1.119972 , -1.7596636, 0.8423852, -1.0705007, -1.3218282,
-1.5030646, -1.4598225, -1.2803943, -1.0334575, -1.7878995]],
dtype=float32)
%% Cell type:markdown id: tags:
## Simulation using Python <a id='simpy'></a>
If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/master/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow backend (backend $\neq$ "fpgadataflow"), the model can be checked for functionality using Python.
To simulate a standard ONNX node, [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open-source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes, execution functions are defined in Python. The following is an example: the execution function of an XNOR-popcount node.
%% Cell type:code id: tags:
``` python
from finn.custom_op.xnorpopcount import xnorpopcountmatmul
showSrc(xnorpopcountmatmul)
```
%% Output
def xnorpopcountmatmul(inp0, inp1):
    """Simulates XNOR-popcount matrix multiplication as a regular bipolar
    matrix multiplication followed by some post processing."""
    # extract the operand shapes
    # (M, K0) = inp0.shape
    # (K1, N) = inp1.shape
    K0 = inp0.shape[-1]
    K1 = inp1.shape[0]
    # make sure shapes are compatible with matmul
    assert K0 == K1, "Matrix shapes are not compatible with matmul."
    K = K0
    # convert binary inputs to bipolar
    inp0_bipolar = 2.0 * inp0 - 1.0
    inp1_bipolar = 2.0 * inp1 - 1.0
    # call regular numpy matrix multiplication
    out = np.matmul(inp0_bipolar, inp1_bipolar)
    # XNOR-popcount does not produce the regular dot product result --
    # it returns the number of +1s after XNOR. let P be the number of +1s
    # and N be the number of -1s. XNOR-popcount returns P, whereas the
    # regular dot product result from numpy is P-N, so we need to apply
    # some correction.
    # out = P-N
    # K = P+N
    # out + K = 2P, so P = (out + K)/2
    return (out + K) * 0.5
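%% Cell type:markdown id: tags:
As a small illustrative check (not part of the original notebook), we can call this function on a tiny binary example and verify the result by hand:
%% Cell type:code id: tags:
``` python
import numpy as np
from finn.custom_op.xnorpopcount import xnorpopcountmatmul

# binary (0/1) inputs; as bipolar values the dot product is 0, K = 4, so P = (0 + 4)/2 = 2
a = np.array([[1, 0, 1, 1]], dtype=np.float32)
b = np.array([[1], [1], [0], [1]], dtype=np.float32)
print(xnorpopcountmatmul(a, b))  # expect [[2.]]
```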
%% Cell type:markdown id: tags:
The function contains a description of the behaviour in Python and can thus calculate the result of the node.
This execution function and onnxruntime are used when `execute_onnx` from `onnx_exec` is applied to the model. The model is then simulated node by node and the results are stored in a context dictionary, which contains the values of each tensor at the end of the execution. To get the result, only the output tensor has to be extracted.
The procedure is shown below. We take the model right before the nodes are converted into HLS layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs.
%% Cell type:code id: tags:
``` python
import numpy as np
from finn.core.modelwrapper import ModelWrapper
input_dict = {"global_in": nph.to_array(input_tensor)}
model_for_sim = ModelWrapper(build_dir+"/tfc_w1a1_ready_for_hls_conversion.onnx")
```
%% Cell type:code id: tags:
``` python
import finn.core.onnx_exec as oxe
output_dict = oxe.execute_onnx(model_for_sim, input_dict)
output_pysim = output_dict[list(output_dict.keys())[0]]
if np.isclose(output_pysim, output_golden, atol=1e-3).all():
    print("Results are the same!")
else:
    print("The results are not the same!")
```
%% Output
Results are the same!
%% Cell type:markdown id: tags:
The result is compared with the theoretical "golden" value for verification.
%% Cell type:markdown id: tags:
## Simulation (cppsim) using C++
When dealing with HLS custom op nodes in FINN, simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, simulation using C++ can be used instead. To do this, the input tensor is stored in a .npy file, and C++ code is generated that reads the values from the .npy file, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read back in Python and processed in the FINN flow. For this example we use the model after setting the folding factors in the HLS layers. Please be aware that this is not the full model, but only the dataflow partition, so before executing at the end of this section we have to integrate it back into the parent model.
%% Cell type:code id: tags:
``` python
model_for_cppsim = ModelWrapper(build_dir+"/tfc_w1_a1_set_folding_factors.onnx")
```
%% Cell type:markdown id: tags:
To generate the code for this simulation and to produce the executable, two transformations are used:
* `PrepareCppSim`, which generates the C++ code for the corresponding HLS layer
* `CompileCppSim`, which compiles the C++ code and stores the path to the executable
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
from finn.transformation.general import GiveUniqueNodeNames
model_for_cppsim = model_for_cppsim.transform(GiveUniqueNodeNames())
model_for_cppsim = model_for_cppsim.transform(PrepareCppSim())
model_for_cppsim = model_for_cppsim.transform(CompileCppSim())
```
%% Cell type:markdown id: tags:
When we take a look at the model using netron, we can see that the transformations introduced new attributes.
%% Cell type:code id: tags:
``` python
model_for_cppsim.save(build_dir+"/tfc_w1_a1_for_cppsim.onnx")
showInNetron(build_dir+"/tfc_w1_a1_for_cppsim.onnx")
```
%% Output
Serving '/workspace/finn/tfc_w1_a1_for_cppsim.onnx' at http://0.0.0.0:8081
<IPython.lib.display.IFrame at 0x7f3cac09d978>
%% Cell type:markdown id: tags:
The following node attributes have been added:
* `code_gen_dir_cppsim` indicates the directory where the files for the C++ simulation are stored
* `executable_path` specifies the path to the executable
Let's now take a closer look at the files that were generated:
%% Cell type:code id: tags:
``` python
from finn.custom_op.registry import getCustomOp
fc0 = model_for_cppsim.graph.node[1]
fc0w = getCustomOp(fc0)
code_gen_dir = fc0w.get_nodeattr("code_gen_dir_cppsim")
!ls {code_gen_dir}
```
%% Output
compile.sh memblock_0.dat thresh.h
execute_StreamingFCLayer_Batch.cpp node_model weights.npy
%% Cell type:markdown id: tags:
Besides the .cpp file, the folder contains the weights and thresholds (`weights.npy`, `memblock_0.dat` and `thresh.h`). The shell script contains the compile command, and *node_model* is the executable produced by compilation. Comparing this with the `executable_path` node attribute shows that it points exactly to *node_model*.
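As a quick illustrative check (not in the original notebook), we can print the attribute directly:
%% Cell type:code id: tags:
``` python
# Hedged check: executable_path should point at the node_model binary listed above
print(fc0w.get_nodeattr("executable_path"))
```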
%% Cell type:markdown id: tags:
To simulate the model, the execution mode (`exec_mode`) must be set to "cppsim". This is done using the transformation `SetExecMode`.
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
model_for_cppsim = model_for_cppsim.transform(SetExecMode("cppsim"))
model_for_cppsim.save(build_dir+"/tfc_w1_a1_for_cppsim.onnx")
```
%% Cell type:markdown id: tags:
Before the model can be executed using `execute_onnx`, we integrate the child model into the parent model. The function then reads the `exec_mode` and writes the input into the correct directory as a .npy file. To be able to read this in C++, there is an additional .hpp file ([npy2apintstream.hpp](https://github.com/Xilinx/finn/blob/master/src/finn/data/cpp/npy2apintstream.hpp)) in FINN, which uses cnpy to read .npy files and convert them into streams, or to read a stream and write it into a .npy file. [cnpy](https://github.com/rogersce/cnpy) is a helper library to read and write .npy and .npz formats in C++.
The result is again compared to the "golden" output.
The result is again compared to the "golden" output.
%% Cell type:code id: tags:
``` python
parent_model = ModelWrapper(build_dir+"/tfc_w1_a1_dataflow_parent.onnx")
sdp_node = parent_model.graph.node[2]
child_model = build_dir + "/tfc_w1_a1_for_cppsim.onnx"
getCustomOp(sdp_node).set_nodeattr("model", child_model)
output_dict = oxe.execute_onnx(parent_model, input_dict)
output_cppsim = output_dict[list(output_dict.keys())[0]]
if np.isclose(output_cppsim, output_golden, atol=1e-3).all():
    print("Results are the same!")
else:
    print("The results are not the same!")
```
%% Output
Results are the same!
%% Cell type:markdown id: tags:
## Emulation (rtlsim) using PyVerilator
The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks have been generated from the corresponding HLS layers. PyVerilator is a tool that makes it possible to simulate Verilog files using Verilator via a Python interface.
We have two ways to use rtlsim: one is to run the model node by node as with the simulation methods above, but if the model is in the form of a dataflow partition, the part of the graph that consists only of HLS nodes can also be executed as a whole.
%% Cell type:markdown id: tags:
Because the model is already in split form at the point where we want to grab and verify it (a parent graph consisting of non-HLS layers and a child graph consisting only of HLS layers), we first have to reference the child graph within the parent graph. This is done using the node attribute `model` of the `StreamingDataflowPartition` node.
First we show the procedure when the child graph has IP blocks corresponding to the individual layers, and then the procedure when the child graph already has a stitched IP.
%% Cell type:markdown id: tags:
### Emulation of model node-by-node
The child model is loaded and the `exec_mode` is set for each node. To prepare the node-by-node emulation, the transformation `PrepareRTLSim` is applied to the child model. With this transformation the emulation files are created for each node and can be used directly when calling `execute_onnx()`. After the transformation, each node has a new attribute `rtlsim_so` that contains the path to the corresponding emulation files. The model is then saved to a new .onnx file so that the changed model can be referenced from the parent model.
%% Cell type:code id: tags:
``` python
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP

test_fpga_part = "xc7z020clg400-1"
target_clk_ns = 10

child_model = ModelWrapper(build_dir + "/tfc_w1_a1_set_folding_factors.onnx")
child_model = child_model.transform(GiveUniqueNodeNames())
child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))
child_model = child_model.transform(HLSSynthIP())
child_model = child_model.transform(SetExecMode("rtlsim"))
child_model = child_model.transform(PrepareRTLSim())
child_model.save(build_dir + "/tfc_w1_a1_dataflow_child.onnx")
```
%% Cell type:markdown id: tags:
The next step is to load the parent model and set the node attribute `model` of the StreamingDataflowPartition node (`sdp_node`). Afterwards the `exec_mode` is set to "rtlsim" for each node in the parent model.
%% Cell type:code id: tags:
``` python
# parent model
model_for_rtlsim = ModelWrapper(build_dir + "/tfc_w1_a1_dataflow_parent.onnx")
# reference child model
sdp_node = getCustomOp(model_for_rtlsim.graph.node[2])
sdp_node.set_nodeattr("model", build_dir + "/tfc_w1_a1_dataflow_child.onnx")
model_for_rtlsim = model_for_rtlsim.transform(SetExecMode("rtlsim"))
```
%% Cell type:markdown id: tags:
Because the necessary files for the emulation were already generated in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb), the model can be executed directly in the next step.
%% Cell type:code id: tags:
``` python
output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)
output_rtlsim = output_dict[list(output_dict.keys())[0]]
if np.isclose(output_rtlsim, output_golden, atol=1e-3).all():
    print("Results are the same!")
else:
    print("The results are not the same!")
```
%% Output
Results are the same!
%% Cell type:markdown id: tags:
### Emulation of stitched IP
Here we use the same procedure. First the child model is loaded, but in contrast to the node-by-node emulation, the metadata property `exec_mode` is set to "rtlsim" for the whole child model. When the model is integrated and executed in the last step, the Verilog files of the child model's stitched IP are used.
%% Cell type:code id: tags:
``` python
child_model = ModelWrapper(build_dir + "/tfc_w1_a1_ipstitch.onnx")
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
child_model = ModelWrapper(build_dir + "/tfc_w1_a1_dataflow_child.onnx")
child_model = child_model.transform(InsertDWC())
child_model = child_model.transform(InsertFIFO())
child_model = child_model.transform(GiveUniqueNodeNames())
child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))
child_model = child_model.transform(HLSSynthIP())
child_model = child_model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
child_model = child_model.transform(PrepareRTLSim())
child_model.set_metadata_prop("exec_mode","rtlsim")
child_model.save(build_dir + "/tfc_w1_a1_dataflow_child.onnx")
```
%% Output
/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_3
warnings.warn("Using pre-existing IP for %s" % node.name)
/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_1
warnings.warn("Using pre-existing IP for %s" % node.name)
/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_2
warnings.warn("Using pre-existing IP for %s" % node.name)
/workspace/finn/src/finn/transformation/fpgadataflow/hlssynth_ip.py:70: UserWarning: Using pre-existing IP for StreamingFCLayer_Batch_0
warnings.warn("Using pre-existing IP for %s" % node.name)
%% Cell type:code id: tags:
``` python
# parent model
model_for_rtlsim = ModelWrapper(build_dir + "/tfc_w1_a1_dataflow_parent.onnx")
# reference child model
sdp_node = getCustomOp(model_for_rtlsim.graph.node[2])
sdp_node.set_nodeattr("model", build_dir + "/tfc_w1_a1_dataflow_child.onnx")
```
%% Cell type:code id: tags:
``` python
output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)
output_rtlsim = output_dict[list(output_dict.keys())[0]]
if np.isclose(output_rtlsim, output_golden, atol=1e-3).all():
    print("Results are the same!")
else:
    print("The results are not the same!")
```
%% Output
Results are the same!
%% Cell type:code id: tags:
``` python
```