diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev index 1d6336e6fc821fd7ef5c2ae41ad215f453f09efe..fa1ba9f716aec389cc8e42576b7f70da0566c17c 100644 --- a/docker/Dockerfile.finn_dev +++ b/docker/Dockerfile.finn_dev @@ -54,7 +54,7 @@ RUN pip install sphinx==3.1.2 RUN pip install sphinx_rtd_theme==0.5.0 RUN pip install pytest-xdist==2.0.0 RUN pip install pytest-parallel==0.1.0 -RUN pip install netron +RUN pip install "netron>=4.7.9" RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading # switch user diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index 1ed8875e886ea78511f1992d95be4417b3af80df..a8e05114c312028d18a006d10d5b210b44afb9d3 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -12,7 +12,7 @@ gecho () { # checkout the correct dependency repo commits # the repos themselves are cloned in the Dockerfile -FINN_BASE_COMMIT=1363981654009067790d5f2d0c3dd303b5fa05cb +FINN_BASE_COMMIT=91fb6066927d965471e66e103fd5201ac217c755 BREVITAS_COMMIT=aff49758ec445d77c75721c7de3091a2a1797ca8 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4 HLSLIB_COMMIT=2e49322d1bbc4969ca293843bda1f3f9c05456fc diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..093344e70331572a425a09d34c5a68d7313bc521 --- /dev/null +++ b/docs/finn/faq.rst @@ -0,0 +1,71 @@ +.. _faq: + +************************** +Frequently Asked Questions +************************** + +.. note:: **This page is under construction.** + +Can I install FINN out of the Docker container? +=============================================== + +We do not support installations outside of the Docker container at the moment. This is due +to the high complexity of the FINN project dependencies. + +Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator? +============================================================================================= + +The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer +types and quantization annotations. Networks must first be quantized using Brevitas and exported +to FINN-ONNX before they can be converted to FPGA accelerators. + + +Can I deploy custom NNs with arbitrary precisions and layers using FINN? +========================================================================= + +Yes, though the effort required and quality of results will vary. +Although we do support arbitrary +precision, the way we create the hardware isn't typically practical for more than +4 bits, or for very large networks, on a single FPGA. +In terms of layers, only a subset of the quantized layers covered by the various FINN examples +is currently supported. +It is possible to add support for new layers, though we don't have tutorials for this in place +just yet. + +Does FINN only work with the example networks? +============================================== + +FINN isn't restricted to the example networks; +rather, it's restricted to certain patterns (e.g. certain layer types and their combinations). +The current best practice for custom networks is to take a working network and gradually modify it. + +What is the expected background for using FINN? +=============================================== + +Some general knowledge of Python, Docker, machine learning with neural networks and Jupyter notebooks +is expected.
+Our goal is to make the tool accessible enough that no hardware/FPGA background +is necessary, although having some knowledge will generally give better results. + +What operating systems are supported by FINN? +============================================= + +FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long +as you install Docker (``docker-ce``) on your machine. + + +I am getting DocNav and Model_Composer errors when launching the Docker image. +============================================================================== + +We do not mount those particular directories into the Docker container because they are not +used. The errors are Vivado related but you can safely ignore them. + +What board do you recommend to start working with FINN? +======================================================= + +Our preferred target platforms are those supported by `PYNQ <http://www.pynq.io/board.html>`_. +For those boards we can offer end-to-end (DNN-to-bitstream) deployment; +see the `finn-examples <https://github.com/Xilinx/finn-examples>`_ repository for some examples. +However, FINN also supports Vivado IP Integrator designs. The IPs connect using AXI Stream (FIFO) +input and output interfaces. This means that the generated accelerator can be integrated onto any Xilinx FPGA board, +though you will have to do the system integration manually. diff --git a/docs/finn/index.rst b/docs/finn/index.rst index fa7ed30205da5b9c63c469ca600211e7865a9730..320cd88fe91af857c5a3948ef36a587ea305040f 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -48,5 +48,6 @@ More FINN Resources example_networks internals developers + faq source_code/finn genindex diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 4e3e8d24b2984b4473504030dc0f6a4001b0e0c8..4c260ecfb1b25448b4b8e1fe71d8c257cd7e31ff 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -37,8 +37,12 @@ The notebooks in this folder are more developer oriented. They should help you t * 0_custom_analysis_pass - * This notebook explains what an analysis pass is and how to write one for FINN. + * Explains what an analysis pass is and how to write one for FINN. * 1_custom_transformation_pass - * This notebook explains what a transformation pass is and how to write one for FINN. + * Explains what a transformation pass is and how to write one for FINN. + +* 2_custom_op + + * Explains the basics of FINN custom ops and how to define a new one. diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..7d7bc5c50b25e5b270fadc641bc2fa40d6dadddd --- /dev/null +++ b/notebooks/advanced/2_custom_op.ipynb @@ -0,0 +1,923 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to custom ops in FINN\n", + "\n", + "Suppose that you want to introduce a new (custom) operation type into FINN. Custom operations in FINN are useful for a variety of things ranging from code generation to functional verification. This is achieved by creating a new Python module for your custom operation that fulfills certain interface specifications.\n", + "\n", + "One thing to point out before we start is that **these custom operations are generic** and not really tied to e.g. Vivado HLS or few-bit quantization. As you will see in this notebook, it's possible to provide arbitrary Python/C/C++/... 
execution and code generation paths for custom nodes.\n", + "\n", + "## The CustomOp base class\n", + "\n", + "Subclasses of `CustomOp` provide a way of providing custom functionality for ONNX nodes in FINN.\n", + "This is the base class for every custom op node used in the framework, so you must create subclasses of `CustomOp` to provide execution, code generation and other functionalities in FINN.\n", + "\n", + "Let's start by looking at the `CustomOp` base class itself, which lives in the `finn-base` repository. You can view it [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/base.py). Note that the `finn` Docker container already has `finn-base` set up as a dependency.\n", + "\n", + "Some points of importance:\n", + "\n", + "1. `CustomOp` instances (in Python) are not meant to store any data, only provide functionality on top of data stored in ONNX. Each `CustomOp` instance has a member `self.onnx_node` which gives access to the ONNX `NodeProto` with attributes. There is also a custom attribute setter/getter system in `CustomOp` to make this process easier.\n", + "\n", + "2. `CustomOp` subclasses need to implement the methods below (those not starting with underscore).\n", + "\n", + "3. To be discoverable in the custom op register, `CustomOp` subclasses must set the `domain` field to the name of the Python module they appear in. For instance, to use the custom `Im2Col` op type from [here](https://github.com/Xilinx/finn-base/blob/dev/src/finn/custom_op/general/im2col.py), the ONNX node must use `domain=finn.custom_op.general` since its module is located at `finn/custom_op/general/im2col.py`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['__abstractmethods__',\n", + " '__class__',\n", + " '__delattr__',\n", + " '__dict__',\n", + " '__dir__',\n", + " '__doc__',\n", + " '__eq__',\n", + " '__format__',\n", + " '__ge__',\n", + " '__getattribute__',\n", + " '__gt__',\n", + " '__hash__',\n", + " '__init__',\n", + " '__init_subclass__',\n", + " '__le__',\n", + " '__lt__',\n", + " '__module__',\n", + " '__ne__',\n", + " '__new__',\n", + " '__reduce__',\n", + " '__reduce_ex__',\n", + " '__repr__',\n", + " '__setattr__',\n", + " '__sizeof__',\n", + " '__str__',\n", + " '__subclasshook__',\n", + " '__weakref__',\n", + " '_abc_cache',\n", + " '_abc_negative_cache',\n", + " '_abc_negative_cache_version',\n", + " '_abc_registry',\n", + " 'execute_node',\n", + " 'get_nodeattr',\n", + " 'get_nodeattr_allowed_values',\n", + " 'get_nodeattr_def',\n", + " 'get_nodeattr_types',\n", + " 'infer_node_datatype',\n", + " 'make_shape_compatible_op',\n", + " 'set_nodeattr',\n", + " 'verify_node']" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from finn.custom_op.base import CustomOp\n", + "dir(CustomOp)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A Simple CustomOp Example\n", + "\n", + "Let's make a simple CustomOp that raises its input to a given exponent (specified as attribute). For now it'll only work in Python, but later we'll add C++ execution capability too." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from onnx import helper\n", + "import numpy as np\n", + "\n", + "class MyPythonPowerOp(CustomOp):\n", + " \n", + " # here we use the CustomOp attribute system to make it easier\n", + " # to set/get custom attributes on this node\n", + " def get_nodeattr_types(self):\n", + " return {\n", + " # each entry is:\n", + " # name of attribute : (dtype, required, default value)\n", + " # dtype follows the ONNX attribute protobuf so\n", + " # \"i\" is int, \"s\" is string, \"f\" is float,\n", + " # \"ints\" is a list of integers...\n", + " # also good practice to document what each attribute does here:\n", + " \n", + " # which integer power to raise the input to\n", + " \"exponent\" : (\"i\", True, 0),\n", + " # execution mode : currently only python\n", + " \"exec_mode\" : (\"s\", True, \"python\"),\n", + " }\n", + " \n", + " # return an ONNX node that has the same shape inference behavior\n", + " # here we want in shape = out shape, so we use the ONNX ReLU\n", + " # node to mimic its shape inference behavior\n", + " # we have access to the entire ModelWrapper to help make this decision\n", + " # (the parameter called model)\n", + " def make_shape_compatible_op(self, model):\n", + " node = self.onnx_node\n", + " # make a Relu node connected to the same in-out tensors to get\n", + " # shape inference\n", + " # a general-purpose alternative is to use a Constant node that \n", + " # produces the desired shape\n", + " return helper.make_node(\"Relu\", [node.input[0]], [node.output[0]])\n", + "\n", + " # used for FINN DataType inference: set the output tensors' datatypes\n", + " # accordingly for this node\n", + " # here we assume input datatype = output datatype\n", + " # we have access to the entire ModelWrapper to help make this decision\n", + " # (the parameter called model)\n", + " def infer_node_datatype(self, model):\n", + " node = self.onnx_node\n", + " # data type stays the same\n", + " dtype = model.get_tensor_datatype(node.input[0])\n", + " model.set_tensor_datatype(node.output[0], dtype)\n", + " \n", + " # execute this node\n", + " # context: used for both input and output, dictionary of named\n", + " # tensors\n", + " # graph: the ONNX GraphProto (ModelWrapper.graph), generally \n", + " # not needed to execute a single node\n", + " def execute_node(self, context, graph):\n", + " exec_mode = self.get_nodeattr(\"exec_mode\")\n", + " if exec_mode == \"python\":\n", + " # get names of node input and output tensors\n", + " i_name = self.onnx_node.input[0]\n", + " o_name = self.onnx_node.output[0]\n", + " # grab input tensor from context\n", + " i_tensor = context[i_name]\n", + " # get which power to raise to from attribute\n", + " expnt = self.get_nodeattr(\"exponent\")\n", + " # compute and put output into context\n", + " o_tensor = np.power(i_tensor, expnt)\n", + " context[o_name] = o_tensor\n", + " else:\n", + " raise Exception(\"Only python exec_mode is supported\")\n", + " \n", + " # can use to do a sanity check of all the node's properties\n", + " # optional, not implemented here\n", + " def verify_node(self):\n", + " pass\n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To make sure our custom op is available, it needs to be registered. The best practice for this is to create a submodule under `finn.custom_op` which includes a `custom_op` dictionary that maps strings (op names) to classes (op implementations). 
Since we're in a Jupyter notebook we'll just hijack it at runtime like this:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import finn.custom_op.general as general\n", + "general.custom_op[\"MyPythonPowerOp\"] = MyPythonPowerOp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see which custom ops are registered under this submodule by looking at the dictionary:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'DebugMarker': finn.custom_op.general.debugmarker.DebugMarker,\n", + " 'QuantAvgPool2d': finn.custom_op.general.quantavgpool2d.QuantAvgPool2d,\n", + " 'MaxPoolNHWC': finn.custom_op.general.maxpoolnhwc.MaxPoolNHWC,\n", + " 'StreamingDataflowPartition': finn.custom_op.general.streamingdataflowpartition.StreamingDataflowPartition,\n", + " 'MultiThreshold': finn.custom_op.general.multithreshold.MultiThreshold,\n", + " 'XnorPopcountMatMul': finn.custom_op.general.xnorpopcount.XnorPopcountMatMul,\n", + " 'Im2Col': finn.custom_op.general.im2col.Im2Col,\n", + " 'MyPythonPowerOp': __main__.MyPythonPowerOp}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "general.custom_op" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Let's Try Out our CustomOp\n", + "\n", + "We'll manually build a small ONNX graph containing our node in order to try out some of the functionality. This would normally go into the unit test for this CustomOp. \n", + "\n", + "The graph is built by first specifying the input/output tensor information (name, type, shape). Then,the custom node is generated; which is later used to generate the graph along the input/output tensor information. The model is built using the graph. Finally, the model is wrapped around using the ModelWrapper function from FINN." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.core.modelwrapper import ModelWrapper\n", + "from onnx import TensorProto\n", + "\n", + "def make_graph(ishape, exp, op_type = \"MyPythonPowerOp\"):\n", + " inp = helper.make_tensor_value_info(\n", + " \"inp\", TensorProto.FLOAT, ishape\n", + " )\n", + " outp = helper.make_tensor_value_info(\n", + " \"outp\", TensorProto.FLOAT, ishape\n", + " )\n", + "\n", + " custom_node = helper.make_node(\n", + " # op type string in ONNX, what we used to register the custom op\n", + " op_type,\n", + " # name of input tensor\n", + " [\"inp\"],\n", + " # name of output tensor\n", + " [\"outp\"],\n", + " # specify domain s.t. FINN can find our op under this submodule\n", + " domain=\"finn.custom_op.general\",\n", + " # set up attributes\n", + " exponent = int(exp),\n", + " exec_mode = \"python\"\n", + " )\n", + "\n", + " graph = helper.make_graph(\n", + " nodes=[custom_node], name=\"custom_graph\", inputs=[inp], outputs=[outp]\n", + " )\n", + " model = helper.make_model(graph, producer_name=\"custom-model\")\n", + " return ModelWrapper(model)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we specify the input tensor shape and we generate the graph using the function we have just created. The input tensor shape and the exponent value are passed as parameters. These parameters are used to generate our model, graph and custom node using the `MyPythonPowerOp` operation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[input: \"inp\"\n", + "output: \"outp\"\n", + "op_type: \"MyPythonPowerOp\"\n", + "attribute {\n", + " name: \"exec_mode\"\n", + " s: \"python\"\n", + " type: STRING\n", + "}\n", + "attribute {\n", + " name: \"exponent\"\n", + " i: 2\n", + " type: INT\n", + "}\n", + "domain: \"finn.custom_op.general\"\n", + "]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# generate a small graph with our custom op\n", + "input_shape = (1, 2, 4)\n", + "ret_model = make_graph(input_shape, 2)\n", + "ret_model.model.graph.node" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We generate a random tensor based on the `input_shape` defined before. See the shape and values of the `random_input` below and the datatype. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[-6., 2., -3., -6.],\n", + " [-6., 0., 1., -2.]]], dtype=float32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from finn.core.datatype import DataType\n", + "from finn.util.basic import gen_finn_dt_tensor\n", + "\n", + "# generate a random input of e.g signed 4-bit values\n", + "random_input = gen_finn_dt_tensor(DataType.INT4, input_shape)\n", + "random_input\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just generate an Input Dictionary with the random values just generated. Then we execute the model using the model and random values just generated. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'outp': array([[[36., 4., 9., 36.],\n", + " [36., 0., 1., 4.]]], dtype=float32)}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from finn.core.onnx_exec import execute_onnx\n", + "\n", + "# run with FINN's execute_onnx\n", + "inp_dict = {\"inp\" : random_input}\n", + "ret = execute_onnx(ret_model, inp_dict)\n", + "ret" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Done! We have just executed the model that uses our custom operation. The result should be the input number to the power of 2." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A CustomOp with C++ Generation\n", + "\n", + "We can write our CustomOps in C++ for instance and generate a model the same way we have done it previously. This can be done through python bindings that let us call C++ code from python. In fact, we will compile the C++ code and execute it from python. \n", + "\n", + "The following class is based on the `MyPythonPowerOp` class previously written. We are adding a new attribute `codegen_dir` into the `get_nodeattr_types` function that specifies the directory for the generated C++ code, building script and executable application.\n", + "\n", + "We define a new function that `my_custom_cpp_gen` that writes the C++ code into a file and builds it. Finally the `execute_node` function is modified to support the C++ execution of the CustomOp. The `c++` branch of the if-else statements first flattens the input tensor and writes it into the \"input.txt\" file. Then the C++ compiled application is executed using bash commands. 
The application reads the \".txt\" file, calculates the power value based on the exponent, and writes the result back into the \"output.txt\" file. Then the result of the ouput file is read and reshaped back into the original shape. Finally, the result is written into the `context` dictionary" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.basic import make_build_dir, CppBuilder\n", + "import subprocess\n", + "\n", + "# derive from our previous example\n", + "class MyMixedPowerOp(MyPythonPowerOp):\n", + " \n", + " # here we use the CustomOp attribute system to make it easier\n", + " # to set/get custom attributes on this node\n", + " def get_nodeattr_types(self):\n", + " return {\n", + " # each entry is:\n", + " # name of attribute : (dtype, required, default value)\n", + " # dtype follows the ONNX attribute protobuf so\n", + " # \"i\" is int, \"s\" is string, \"f\" is float,\n", + " # \"ints\" is a list of integers...\n", + " # also good practice to document what each attribute does here:\n", + " \n", + " # which integer power to raise the input to\n", + " \"exponent\" : (\"i\", True, 0),\n", + " # execution mode : python or c++\n", + " \"exec_mode\" : (\"s\", True, \"python\"),\n", + " # code generation directory\n", + " \"codegen_dir\" : (\"s\", False, \"\"),\n", + " }\n", + " \n", + " def my_custom_cpp_gen(self):\n", + " codegen_dir = make_build_dir(prefix=\"my_custom_op\")\n", + " # set attribute for codegen dir\n", + " self.set_nodeattr(\"codegen_dir\", codegen_dir)\n", + " # generate some C++ code\n", + " cpp_code = \"\"\"\n", + "#include <iostream>\n", + "#include <fstream>\n", + "using namespace std;\n", + "#define EXPONENT %d\n", + "\n", + "int main(int argc, char **argv) {\n", + " ifstream infile(\"input.txt\");\n", + " ofstream outfile(\"output.txt\");\n", + " \n", + " float elem;\n", + " while (infile >> elem)\n", + " {\n", + " float res = 1.0;\n", + " for(int i=0; i < EXPONENT; i++) {\n", + " res *= elem;\n", + " }\n", + " outfile << res << \"\\\\n\";\n", + " }\n", + "\n", + " return 0;\n", + "}\n", + " \"\"\" % (self.get_nodeattr(\"exponent\"))\n", + " with open(codegen_dir+\"/top.cpp\", \"w\") as f:\n", + " f.write(cpp_code)\n", + " builder = CppBuilder()\n", + " # to enable additional debug features please uncommand the next line\n", + " builder.append_includes(\"--std=c++11\")\n", + " builder.append_includes(\"-O3\")\n", + " builder.append_sources(codegen_dir + \"/*.cpp\")\n", + " builder.set_executable_path(codegen_dir + \"/node_model\")\n", + " builder.build(codegen_dir)\n", + " \n", + " # execute this node\n", + " # context: used for both input and output, dictionary of named\n", + " # tensors\n", + " # graph: the ONNX GraphProto (ModelWrapper.graph), generally \n", + " # not needed to execute a single node\n", + " def execute_node(self, context, graph):\n", + " exec_mode = self.get_nodeattr(\"exec_mode\")\n", + " # get names of node input and output tensors\n", + " i_name = self.onnx_node.input[0]\n", + " o_name = self.onnx_node.output[0]\n", + " # grab input tensor from context\n", + " i_tensor = context[i_name]\n", + " # get which power to raise to from attribute\n", + " expnt = self.get_nodeattr(\"exponent\")\n", + " if exec_mode == \"python\":\n", + " # compute and put output into context\n", + " o_tensor = np.power(i_tensor, expnt)\n", + " context[o_name] = o_tensor\n", + " elif exec_mode == \"c++\":\n", + " build_dir = self.get_nodeattr(\"codegen_dir\")\n", + " # save input as txt, 
could preprocess, change layout etc..\n", + " np.savetxt(build_dir+\"/input.txt\", i_tensor.flatten())\n", + " bash_command = [\"./node_model\"]\n", + " proc_run = subprocess.Popen(bash_command, cwd=build_dir, stdout=subprocess.PIPE)\n", + " proc_run.communicate()\n", + " o_tensor = np.loadtxt(build_dir+\"/output.txt\")\n", + " o_tensor = o_tensor.reshape(i_tensor.shape)\n", + " context[o_name] = o_tensor\n", + " else:\n", + " raise Exception(\"Only python and c++ exec_mode is supported\")\n", + " \n", + " # can use to do a sanity check of all the node's properties\n", + " # optional, not implemented here\n", + " def verify_node(self):\n", + " pass\n", + " \n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We just register the new CustomOp the same way as we did before. Then, we create another graph using the same function `make_graph` as before. We can see the node containing the custom operation printed below." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[input: \"inp\"\n", + "output: \"outp\"\n", + "op_type: \"MyMixedPowerOp\"\n", + "attribute {\n", + " name: \"exec_mode\"\n", + " s: \"python\"\n", + " type: STRING\n", + "}\n", + "attribute {\n", + " name: \"exponent\"\n", + " i: 2\n", + " type: INT\n", + "}\n", + "domain: \"finn.custom_op.general\"\n", + "]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# register our new op\n", + "general.custom_op[\"MyMixedPowerOp\"] = MyMixedPowerOp\n", + "\n", + "# make graph with new op\n", + "mixedop_graph = make_graph(input_shape, 2, op_type = \"MyMixedPowerOp\")\n", + "mixedop_graph.graph.node" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We just print all the functions inside the CustomOp, the default C++ code directory and the `exec_mode` attribute." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Available functions: ['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_cache', '_abc_negative_cache', '_abc_negative_cache_version', '_abc_registry', 'execute_node', 'get_nodeattr', 'get_nodeattr_allowed_values', 'get_nodeattr_def', 'get_nodeattr_types', 'infer_node_datatype', 'make_shape_compatible_op', 'my_custom_cpp_gen', 'onnx_node', 'set_nodeattr', 'verify_node']\n", + "codegen_dir: \n", + "exec_mode: python\n" + ] + } + ], + "source": [ + "from finn.custom_op.registry import getCustomOp\n", + "\n", + "# get FINN wrapper for this node, with all the functionality\n", + "op_inst = getCustomOp(mixedop_graph.model.graph.node[0])\n", + "print(\"Available functions: \" + str(dir(op_inst)))\n", + "# query some attributes\n", + "print(\"codegen_dir: \" + op_inst.get_nodeattr(\"codegen_dir\"))\n", + "print(\"exec_mode: \" + op_inst.get_nodeattr(\"exec_mode\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Implement a code generation transformation\n", + "\n", + "We define a local transformation function that transforms a specific model by accessing and modifying the attributes of the specified node. It will execute the `my_custom_cpp_gen` function from the node \"MyMixedPowerOp\" if the \"codegen_dir\" is not present." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#from finn.transformation.base import Transformation\n", + "# can derive from NodeLocalTransformation for faster (parallel) execution\n", + "from finn.transformation.base import NodeLocalTransformation\n", + "import os\n", + "\n", + "class MyNodeLocalCodeGen(NodeLocalTransformation):\n", + " \n", + " # will get called (possibly in parallel) for each node\n", + " def applyNodeLocal(self, node):\n", + " # keep track whether we changed anything\n", + " modified_graph = False\n", + " # check node type before we do anything\n", + " if node.op_type == \"MyMixedPowerOp\":\n", + " # get FINN wrapper for this node, with all the functions\n", + " op_inst = getCustomOp(node)\n", + " if not os.path.isdir(op_inst.get_nodeattr(\"codegen_dir\")):\n", + " # call the codegen function we defined\n", + " # this will modify the underlying node by setting attribute\n", + " op_inst.my_custom_cpp_gen()\n", + " # codegen function modifies attribute\n", + " modified_graph = True\n", + " # important: must return modified_graph = False at some point\n", + " # otherwise transformation will run in infinite loop!\n", + " return (node, modified_graph)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Apply the transformation into the model we had before. The returned model is the same input model after applying the specified transformation. " + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "mixedop_graph_new = mixedop_graph.transform(MyNodeLocalCodeGen())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Print the \"codegen_dir\" attribute from CustomOp node." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/tmp/finn_dev_jalezeta/my_custom_oppaxpincq\n" + ] + } + ], + "source": [ + "new_op_inst = getCustomOp(mixedop_graph_new.graph.node[0])\n", + "codegen_dir = new_op_inst.get_nodeattr(\"codegen_dir\")\n", + "print(codegen_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the `codegen_dir` folder contains the compile script, compiled application and the C++ source file:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "compile.sh node_model\ttop.cpp\r\n" + ] + } + ], + "source": [ + "! ls {codegen_dir}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's view the content of the C++ source file:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\r\n", + "#include <iostream>\r\n", + "#include <fstream>\r\n", + "using namespace std;\r\n", + "#define EXPONENT 2\r\n", + "\r\n", + "int main(int argc, char **argv) {\r\n", + " ifstream infile(\"input.txt\");\r\n", + " ofstream outfile(\"output.txt\");\r\n", + " \r\n", + " float elem;\r\n", + " while (infile >> elem)\r\n", + " {\r\n", + " float res = 1.0;\r\n", + " for(int i=0; i < EXPONENT; i++) {\r\n", + " res *= elem;\r\n", + " }\r\n", + " outfile << res << \"\\n\";\r\n", + " }\r\n", + "\r\n", + " return 0;\r\n", + "}\r\n", + " " + ] + } + ], + "source": [ + "! cat {codegen_dir}/top.cpp" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manually generate input and run C++ node model\n", + "\n", + "We will now manually generate the input data and write it into the `input.txt` file. Then, we manually execute the compiled application and finally see the result in the `output.txt` file. \n", + "\n", + "The purpose of this is mostly to show that there is no \"magic\" happening when FINN is executing our custom op; it's just launching a program. When debugging the execution of your custom op, it's a good idea to keep this in mind -- for instance, you can use `gdb` to debug the internals of the C++ node model here." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "! echo \"7.0 8.0 9.0\" > {codegen_dir}/input.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "! cd {codegen_dir}; ./node_model" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "49\r\n", + "64\r\n", + "81\r\n" + ] + } + ], + "source": [ + "! cat {codegen_dir}/output.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "! rm {codegen_dir}/*.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use FINN execution flow\n", + "\n", + "We'll now trigger the custom node execution from inside FINN, via the custom ONNX execution capabilities which will automatically launch the appropriate handler when a custom node is encountered inside the ONNX graph, in this case launching the compiled C++ program. 
To do this, we will first generate a random tensor with a pre-specified tensor shape and print it. " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[[-8., 4., 7., 2.],\n", + " [-5., -1., 2., 0.]]], dtype=float32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# generate a random input of e.g signed 4-bit values\n", + "random_input = gen_finn_dt_tensor(DataType.INT4, input_shape)\n", + "random_input" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We set the CustomOp node attribute to execute in \"Python\" mode. Then, generate an input dictionay with the random input tensor and execute the transformed model using the `execute_onnx`. We print the output to see the results." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'outp': array([[[64., 16., 49., 4.],\n", + " [25., 1., 4., 0.]]], dtype=float32)}" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# run with FINN's execute_onnx, custom node will use Python execution\n", + "new_op_inst.set_nodeattr(\"exec_mode\", \"python\")\n", + "inp_dict = {\"inp\" : random_input}\n", + "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", + "ret" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We repeat the previous process in \"c++\" execution mode. " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'outp': array([[[64., 16., 49., 4.],\n", + " [25., 1., 4., 0.]]])}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# run with FINN's execute_onnx, custom node will use c++ execution\n", + "new_op_inst.set_nodeattr(\"exec_mode\", \"c++\")\n", + "ret = execute_onnx(mixedop_graph_new, inp_dict)\n", + "ret" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import.ipynb index 3c9cad615e168e19c7f5dfef45e7c7c60965d1e3..ad2b3db8ffcf3ae99e2a3ca13a2c002685e2df92 100644 --- a/notebooks/basics/1_brevitas_network_import.ipynb +++ b/notebooks/basics/1_brevitas_network_import.ipynb @@ -356,7 +356,7 @@ "from pkgutil import get_data\n", "import onnx\n", "import onnx.numpy_helper as nph\n", - "raw_i = get_data(\"finn\", \"data/onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_tensor_npy = nph.to_array(input_tensor)\n", "input_tensor_pyt = torch.from_numpy(input_tensor_npy).float()\n", diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 795f7f22fef033381aed00375e6bd1bd45affce8..4130f35d7a371711fe1f6bf494358e3c93d8c136 100644 --- 
a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -701,16 +701,16 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<matplotlib.image.AxesImage at 0x7f25af026da0>" + "<matplotlib.image.AxesImage at 0x7f89a07e6eb8>" ] }, - "execution_count": 40, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" }, @@ -732,7 +732,7 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", - "fn = pk.resource_filename(\"finn\", \"data/cifar10/cifar10-test-data-class3.npz\")\n", + "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n", "x = np.load(fn)[\"arr_0\"]\n", "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n", "plt.imshow(x)" diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index a067c6f6f8af1ef9e26384e1b2d92458c93b97fb..8cbff4fcea58d452b1e35c0dab647a8f922dc2c0 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -1468,16 +1468,16 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "<matplotlib.image.AxesImage at 0x7fe2dd62bf98>" + "<matplotlib.image.AxesImage at 0x7fcb96004cc0>" ] }, - "execution_count": 53, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" }, @@ -1499,7 +1499,7 @@ "import onnx.numpy_helper as nph\n", "import matplotlib.pyplot as plt\n", "\n", - "raw_i = get_data(\"finn\", \"data/onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "x = nph.to_array(onnx.load_tensor_from_string(raw_i))\n", "plt.imshow(x.reshape(28,28), cmap='gray')" ] diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 54738c3725c0141fddc3497dee024ca90db3f3ce..4a5d3dd07a2f6719b51e75d672790ed44883138f 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -71,7 +71,7 @@ "from finn.util.test import get_test_model_trained\n", "\n", "fc = get_test_model_trained(\"TFC\", 1, 1)\n", - "raw_i = get_data(\"finn\", \"data/onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", + "raw_i = get_data(\"finn.data\", \"onnx/mnist-conv/test_data_set_0/input_0.pb\")\n", "input_tensor = onnx.load_tensor_from_string(raw_i)\n", "input_brevitas = torch.from_numpy(nph.to_array(input_tensor)).float()\n", "output_golden = fc.forward(input_brevitas).detach().numpy()\n", diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py index 94305b861cbe0c5e6b641c9dccee7976c73c236f..a221b510ab8d22f4daca1c32e717a9b482246712 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py @@ -65,8 +65,17 @@ class InferConvInpGen(Transformation): continue i2c_inst = getCustomOp(n) stride = i2c_inst.get_nodeattr("stride") - k = i2c_inst.get_nodeattr("kernel_size") - pad = i2c_inst.get_nodeattr("pad_amount") + k_attr = i2c_inst.get_nodeattr("kernel_size") + k_h = k_attr[0] + k_w = k_attr[1] + pad_attr = 
i2c_inst.get_nodeattr("pad_amount") + pad_h = pad_attr[0] + pad_attr[2] + pad_w = pad_attr[1] + pad_attr[3] + # temporary checks until non-square conv support is finalized + assert pad_h == pad_w, "Non-square images not yet supported." + assert k_h == k_w, "Non-square kernels not yet supported." + k = k_h + pad = pad_attr[0] pad_val = i2c_inst.get_nodeattr("pad_value") depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] @@ -330,8 +339,8 @@ class InferPool_Batch(Transformation): [im2col_out], domain="finn.custom_op.general", stride=stride, - kernel_size=k, - pad_amount=pad, + kernel_size=[k, k], + pad_amount=[pad, pad, pad, pad], pad_value=pad_value, depthwise=1, input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch), @@ -557,7 +566,7 @@ class InferQuantizedStreamingFCLayer(Transformation): wmem = mw * mh // (pe * simd) assert ( mw * mh == wmem * pe * simd - ), """Requirement (MW * MH) divisiable by + ), """Requirement (MW * MH) divisible by (WMEM * PE * SIMD) is violated.""" # see if we have any following thresholds consumer = model.find_consumer(mm_output) @@ -574,20 +583,27 @@ class InferQuantizedStreamingFCLayer(Transformation): thresholds neither 1 nor MH.""" odt = model.get_tensor_datatype(mt_output) scale = getCustomOp(consumer).get_nodeattr("out_scale") - bipolar_ok = odt == DataType.BIPOLAR and scale == 2.0 - assert ( - scale == 1.0 or bipolar_ok - ), "out_scale must be equal to 1.0 for HLS conversion." actval = getCustomOp(consumer).get_nodeattr("out_bias") assert ( int(actval) == actval ), "out_bias must be integer for HLS conversion." actval = int(actval) + odt_is_bipolar = odt == DataType.BIPOLAR + bipolar_ok = ( + odt_is_bipolar and (scale == 2.0) and (actval == -1) + ) + assert ( + scale == 1.0 or bipolar_ok + ), "out_scale = 1.0 or bipolar output needed for conversion." 
assert (not odt.signed()) or ( actval < 0 ), "Signed output requres actval < 0" model.set_tensor_shape(mm_input, mm_in_shape) model.set_tensor_shape(mt_output, mt_out_shape) + if bipolar_ok: + # remove bias for bipolar, since + # binary->bipolar is achieved by reinterpretation + actval = 0 # create and insert new StreamingFCLayer node new_node = helper.make_node( "StreamingFCLayer_Batch", diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 3f08a672d2b3c00bfb3764202a7c9fc84448f586..ebd7cbe0276d3e9b4128275b0a65b1a9a40d1f80 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py @@ -161,7 +161,7 @@ class InsertIODMA(Transformation): padded_instream_width = first_node_inst.get_instream_width_padded() padded_instream_bytes = padded_instream_width // 8 # determine the feasible interface width - transfer_bits = padded_instream_width * np.prod(out_folded_shape[:-1]) + transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) assert ( intfwidth % 8 == 0 diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index 2d1c680338eec199908c305a42988403cb3645aa..73beb62f06a6b625a992bd2a7401a91ed09789f3 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -111,6 +111,9 @@ set_param board.repoPaths $paths_param if {$BOARD == "ZCU104"} { set_property board_part xilinx.com:zcu104:part0:1.1 [current_project] set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "ZCU102"} { + set_property board_part xilinx.com:zcu102:part0:3.3 [current_project] + set ZYNQ_TYPE "zynq_us+" } elseif {$BOARD == "Ultra96"} { set_property board_part em.avnet.com:ultra96v1:part0:1.2 [current_project] set ZYNQ_TYPE "zynq_us+" diff --git a/src/finn/transformation/streamline/__init__.py b/src/finn/transformation/streamline/__init__.py index e78b798ff6f31ff705b733d47dcfe7bcdc6aa127..876f8892dbc9c42189ee8dc06ff5eb407f7a0946 100644 --- a/src/finn/transformation/streamline/__init__.py +++ b/src/finn/transformation/streamline/__init__.py @@ -59,6 +59,7 @@ from finn.transformation.streamline.reorder import ( MoveScalarAddPastMatMul, MoveAddPastConv, MoveScalarMulPastConv, + MoveMulPastMaxPool, ) from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds @@ -76,6 +77,7 @@ class Streamline(Transformation): ConvertDivToMul(), BatchNormToAffine(), ConvertSignToThres(), + MoveMulPastMaxPool(), AbsorbSignBiasIntoMultiThreshold(), MoveAddPastMul(), MoveScalarAddPastMatMul(), @@ -85,6 +87,7 @@ class Streamline(Transformation): MoveAddPastMul(), CollapseRepeatedAdd(), CollapseRepeatedMul(), + MoveMulPastMaxPool(), AbsorbAddIntoMultiThreshold(), FactorOutMulSignMagnitude(), AbsorbMulIntoMultiThreshold(), diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 08a01171364c6f9c1ecc36b9f12f7447ad24e56c..b23f9f14909a5bd93ae24b34ef65304dafc7e0c1 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -425,12 +425,86 @@ class MoveMulPastDWConv(Transformation): return (model, graph_modified) +class MoveMulPastMaxPool(Transformation): + """Move non-negative scalar or channelwise mul operations past max pool operations. 
+ We want to have muls next to each other such that they can be collapsed into a + single mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if ( + n.op_type == "Mul" + and not model.is_fork_node(n) + and not model.is_join_node(n) + ): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "MaxPool" + and not model.is_join_node(consumer) + ): + mul_weight_name = n.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn( + """Mul weight tensor is not set. If it is a constant, + please use set_initializer to set the tensor.""" + ) + continue + maxpool_node = consumer + mul_node = n + start_name = mul_node.input[0] + maxpool_in_name = maxpool_node.input[0] + maxpool_in_shape = model.get_tensor_shape(maxpool_in_name) + ifm_ch = maxpool_in_shape[1] + maxpool_out_name = maxpool_node.output[0] + maxpool_out_shape = model.get_tensor_shape(maxpool_out_name) + + # do not support non-2D MaxPool + kernel_shape = list( + get_by_name(maxpool_node.attribute, "kernel_shape").ints + ) + if len(kernel_shape) != 2: + continue + + # do not move negative multiplication factor(s) + if (A < 0).any(): + continue + + if all(x == 1 for x in A.shape) or A.shape == (1, ifm_ch, 1, 1): + # if the mul is scalar or channelwise, + # we can simply swap the order of ops + # rewire mul input to be maxpool input + maxpool_node.input[0] = start_name + model.set_tensor_shape(start_name, maxpool_in_shape) + model.set_tensor_datatype(start_name, DataType.FLOAT32) + # use old maxpool input tensor as maxpool output + maxpool_node.output[0] = maxpool_in_name + model.set_tensor_shape(maxpool_in_name, maxpool_out_shape) + model.set_tensor_datatype(maxpool_in_name, DataType.FLOAT32) + # use new maxpool output as new mul node input + mul_node.input[0] = maxpool_in_name + # use old maxpool output as new mul node output + mul_node.output[0] = maxpool_out_name + model.set_tensor_datatype(maxpool_out_name, DataType.FLOAT32) + # move mul node past maxpool node + graph.node.remove(mul_node) + graph.node.insert(node_ind, mul_node) + graph_modified = True + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MoveLinearPastEltwiseAdd(Transformation): """Move linear operations (mul, add) past elementwise add operations where possible. - Specifically,matches and transforms the following patterns: - (x*C) + (y*C) -> (x + y) * C - (x+A) + (y+B) -> (x + y) + (A + B) - where x and y are dynamic inputs, A, B, C are constant tensors (in general). + Specifically,matches and transforms the following patterns: + (x*C) + (y*C) -> (x + y) * C + (x+A) + (y+B) -> (x + y) + (A + B) + where x and y are dynamic inputs, A, B, C are constant tensors (in general). """ def move_node(self, graph, n, prod0, prod1, node_ind): @@ -504,12 +578,12 @@ class MoveLinearPastEltwiseAdd(Transformation): class MoveScalarLinearPastInvariants(Transformation): """Move scalar linear operations (mul, add) past functions which are invariant - to them. Specifically, matches and transforms the following patterns: - f(x*C) -> f(x) * C - f(x+C) -> f(x) + C - where x is a dynamic input, C is a constant tensor. - Known f which obey this property are: Reshape, Flatten, Transpose, - GlobalAveragePool + to them. Specifically, matches and transforms the following patterns: + f(x*C) -> f(x) * C + f(x+C) -> f(x) + C + where x is a dynamic input, C is a constant tensor. 
+ Known f which obey this property are: Reshape, Flatten, Transpose, + GlobalAveragePool """ def apply(self, model): @@ -604,7 +678,7 @@ class MakeMaxPoolNHWC(Transformation): class MoveOpPastFork(Transformation): """Move node operations past graph forks. Used when a node before a fork - can be merged with nodes in the branches + can be merged with nodes in the branches """ def __init__(self, op_name_list): diff --git a/src/finn/util/visualization.py b/src/finn/util/visualization.py index 9f0e9e8d773dd3cb4521d44db486a3f916b86ff7..3eb7e55e307c380ecc6712ff4d0c74577a9e7a43 100644 --- a/src/finn/util/visualization.py +++ b/src/finn/util/visualization.py @@ -34,7 +34,6 @@ from IPython.display import IFrame def showSrc(what): print("".join(inspect.getsourcelines(what)[0])) - def showInNetron(model_filename): - netron.start(model_filename, port=8081, host="0.0.0.0") - return IFrame(src="http://0.0.0.0:8081/", width="100%", height=400) + netron.start(model_filename, address=("0.0.0.0", 8081)) + return IFrame(src="http://0.0.0.0:8081/", width="100%", height=400) \ No newline at end of file diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py index 9d350a9342e3de56cbbb5b3fc4abec69bfc254dc..d88576583eaacb7579b02bc00e4e0f9b77b16f7e 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py @@ -77,7 +77,10 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, exec_mode): out_chn = 20 conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size] - out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + total_pad = 2 * pad + out_feature_dim = compute_conv_output_dim( + in_feature_dim, kernel_size, stride, total_pad + ) input_shape = [1, in_chn, in_feature_dim, in_feature_dim] output_shape = [1, out_chn, out_feature_dim, out_feature_dim] diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 7c608fc3863ab72d1097f49b793af73664b2be48..c406d78158c52226fea881c48bc178139653fea5 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -57,7 +57,8 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) ofm_ch = ifm_ch - ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad=padding) + total_pad = 2 * padding + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, total_pad=total_pad) if act is None: odt = DataType.INT32 @@ -96,9 +97,9 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): domain="finn.custom_op.general", inputs=["inp"], outputs=["im2col_out"], - kernel_size=k, + kernel_size=[k, k], stride=stride, - pad_amount=padding, + pad_amount=[padding, padding, padding, padding], input_shape="(1, {}, {}, {})".format(ifm_dim, ifm_dim, ifm_ch), depthwise=1, ) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 0e2e60534bcc871592128fdbbd5ca52b3cc0fe4f..4e0e8c7c35a8fc8a30e0ba4c27a7c0d637e24d1f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -63,9 +63,9 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i domain="finn.custom_op.general", backend="fpgadataflow", stride=stride, - 
kernel_size=k, + kernel_size=[k, k], input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)), - pad_amount=0, + pad_amount=[0, 0, 0, 0], pad_value=0, ) graph = helper.make_graph( diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py index 5e96d15867b087fbb5f4f1b467aea34cb33e3ff4..ce0cbcd0405f8a09efabbadd5555de1bd6b89e43 100644 --- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py +++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py @@ -32,8 +32,8 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): ofm_ch = ifm_ch + 2 groups = 1 W_shape = [ofm_ch, ifm_ch, k, k] - - ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad_amt) + total_pad = 2 * pad_amt + ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, total_pad) # set up onnx model inp = helper.make_tensor_value_info( diff --git a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py new file mode 100755 index 0000000000000000000000000000000000000000..f612841020e373a3c6458ee3e9a6eb14fcea7eb5 --- /dev/null +++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest + +from onnx import helper, TensorProto +from finn.custom_op.general.maxpoolnhwc import compute_pool_output_dim +import finn.core.onnx_exec as oxe +from finn.core.datatype import DataType +from finn.core.modelwrapper import ModelWrapper +from finn.transformation.infer_datatypes import InferDataTypes +from finn.transformation.infer_shapes import InferShapes +from finn.util.basic import gen_finn_dt_tensor +from finn.transformation.streamline.reorder import MoveMulPastMaxPool + + +# input dimension +@pytest.mark.parametrize("ifm_dim", [4, 7]) +# input channels +@pytest.mark.parametrize("ifm_ch", [1, 3]) +# kernel size +@pytest.mark.parametrize("k", [2, 3]) +# stride +@pytest.mark.parametrize("stride", [1, 2]) +# padding +@pytest.mark.parametrize("pad", [0, 1]) +# channelwise or scalar mul +@pytest.mark.parametrize("cw", [0, 1]) +# negative mul +@pytest.mark.parametrize("negative", [0, 1]) +def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative): + if cw == 1: + mul_shape = [1, ifm_ch, 1, 1] + else: + mul_shape = [1, 1, 1, 1] + + ofm_ch = ifm_ch + ofm_dim = compute_pool_output_dim(ifm_dim, k, stride, pad) + + # set up onnx model + inp = helper.make_tensor_value_info( + "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] + ) + mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, mul_shape) + outp = helper.make_tensor_value_info( + "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] + ) + + Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) + + Maxpool_node = helper.make_node( + "MaxPool", + ["mul_out"], + ["outp"], + kernel_shape=[k, k], + pads=[pad, pad, pad, pad], + strides=[stride, stride], + ) + + graph = helper.make_graph( + nodes=[Mul_node, Maxpool_node], + name="mulpastmaxpool_graph", + inputs=[inp], + outputs=[outp], + value_info=[mul], + ) + + model = helper.make_model(graph, producer_name="mulpastmaxpool-model") + model = ModelWrapper(model) + inp_values = gen_finn_dt_tensor(DataType.INT2, [1, ifm_ch, ifm_dim, ifm_dim]) + mul_values = np.random.random_sample(mul_shape).astype(np.float32) + if negative == 1: + mul_values = mul_values * (-1) + model.set_initializer("mul", mul_values) + model = model.transform(InferShapes()) + model = 
model.transform(InferDataTypes()) + idict = {"inp": inp_values} + odict = oxe.execute_onnx(model, idict, True) + out_before = odict["outp"] + + # perform transformation + model_transformed = model.transform(MoveMulPastMaxPool()) + odict = oxe.execute_onnx(model_transformed, idict, True) + out_after = odict["outp"] + + assert (out_before == out_after).all() + + if negative == 1: + assert model.graph.node[0].op_type == model_transformed.graph.node[0].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[1].op_type + else: + assert model.graph.node[0].op_type == model_transformed.graph.node[1].op_type + assert model.graph.node[1].op_type == model_transformed.graph.node[0].op_type
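
The reordering that MoveMulPastMaxPool performs, and that the test above checks, rests on the identity max(a * x) = a * max(x), which only holds for non-negative scale factors a; for negative factors the max would turn into a min, which is why the transformation leaves those Mul nodes untouched. Below is a minimal sketch of that identity in plain numpy, independent of FINN; the shapes and values are arbitrary examples and maxpool2x2 is a hypothetical helper written just for this check.

import numpy as np

rng = np.random.default_rng(0)
# small NCHW test input with signed integer values
x = rng.integers(-4, 4, size=(1, 3, 4, 4)).astype(np.float32)

def maxpool2x2(t):
    # naive 2x2, stride-2 max pooling over the two spatial dimensions
    n, c, h, w = t.shape
    return t.reshape(n, c, h // 2, 2, w // 2, 2).max(axis=(3, 5))

for a in (0.5, -0.5):  # non-negative vs. negative channelwise scale
    scale = np.full((1, 3, 1, 1), a, dtype=np.float32)
    before = maxpool2x2(scale * x)  # Mul -> MaxPool (original order)
    after = scale * maxpool2x2(x)   # MaxPool -> Mul (reordered)
    # True for a = 0.5; generally False for a = -0.5
    print(a, np.allclose(before, after))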