diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 143514b36ba31cb2b292f3a1961187709798efbf..f8f12a0269fb124bed7efa93b1826a66cfca5982 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -32,7 +32,7 @@ default_language_version:
     python: python3
 
 repos:
-- repo: git://github.com/pre-commit/pre-commit-hooks
+- repo: https://github.com/pre-commit/pre-commit-hooks
   rev: v3.2.0
   hooks:
   - id: trailing-whitespace
@@ -50,12 +50,12 @@ repos:
   - id: mixed-line-ending
     args: ['--fix=no']
 
-- repo: git://github.com/PyCQA/isort
+- repo: https://github.com/PyCQA/isort
   rev: 5.5.3
   hooks:
   - id: isort
 
-- repo: git://github.com/psf/black
+- repo: https://github.com/psf/black
   rev: stable
   hooks:
   - id: black
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index 4d03e2fbb5c4cce7dbda6a757aea8dce3e15e569..2404faafbabe437fc9f36bf77722e8b3c641553f 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -86,13 +86,13 @@ RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg
 
 # git-based Python repo dependencies
 # these are installed in editable mode for easier co-development
-ARG FINN_BASE_COMMIT="e8facdd719b55839cca46da2cc4f4a4a372afb41"
+ARG FINN_BASE_COMMIT="7cd7e00ba6709a85073ba22beeb5827e684fe085"
 ARG QONNX_COMMIT="9f9eff95227cc57aadc6eafcbd44b7acda89f067"
 ARG FINN_EXP_COMMIT="af6102769226b82b639f243dc36f065340991513"
 ARG BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
 ARG PYVERILATOR_COMMIT="0c3eb9343500fc1352a02c020a736c8c2db47e8e"
 ARG CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-ARG HLSLIB_COMMIT="966d17d3fddd801927b2167627d23a9a15ed1461"
+ARG HLSLIB_COMMIT="bcca5d2b69c88e9ad7a86581ec062a9756966367"
 ARG OMX_COMMIT="1dfc4aa2f2895632742cd5751520c6b472feb74e"
 ARG AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 
diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
index 2c9f4a99ed3edd05a8e8d32db2fe6bcdad204716..69ac1f7717f281a43b7d6215eee91e8d3e1d9478 100644
--- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
+++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
@@ -103,27 +103,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "--2021-10-12 15:49:17--  https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1\n",
-      "Resolving zenodo.org (zenodo.org)... 137.138.76.77\n",
-      "Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.\n",
-      "HTTP request sent, awaiting response... 200 OK\n",
-      "Length: 13391907 (13M) [application/octet-stream]\n",
-      "Saving to: ‘unsw_nb15_binarized.npz’\n",
-      "\n",
-      "unsw_nb15_binarized 100%[===================>]  12.77M  3.56MB/s    in 3.7s    \n",
-      "\n",
-      "2021-10-12 15:49:22 (3.44 MB/s) - ‘unsw_nb15_binarized.npz’ saved [13391907/13391907]\n",
-      "\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "! wget -O unsw_nb15_binarized.npz https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1"
    ]
@@ -137,18 +119,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Samples in each set: train = 175341, test = 82332\n",
-      "Shape of one input sample: torch.Size([593])\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import numpy as np\n",
     "from torch.utils.data import TensorDataset\n",
@@ -220,6 +193,33 @@
     "        break"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Define a PyTorch Device <a id='define_pytorch_device'></a> \n",
+    "\n",
+    "GPUs can significantly speed up the training of deep neural networks. We check whether a GPU is available and, if so, define it as the target device."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Target device: cuda\n"
+     ]
+    }
+   ],
+   "source": [
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "print(\"Target device: \" + str(device))"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -236,7 +236,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -258,7 +258,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -282,7 +282,9 @@
     "      nn.Dropout(0.5),\n",
     "      QuantReLU(bit_width=act_bit_width),\n",
     "      QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width)\n",
-    ")\n"
+    ")\n",
+    "\n",
+    "model.to(device)"
    ]
   },
   {
@@ -302,7 +304,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -313,6 +315,7 @@
     "    \n",
     "    for i, data in enumerate(train_loader, 0):        \n",
     "        inputs, target = data\n",
+    "        inputs, target = inputs.to(device), target.to(device)\n",
     "        optimizer.zero_grad()   \n",
     "                \n",
     "        # forward pass\n",
@@ -324,14 +327,14 @@
     "        optimizer.step()\n",
     "        \n",
     "        # keep track of loss value\n",
-    "        losses.append(loss.data.numpy()) \n",
+    "        losses.append(loss.data.cpu().numpy()) \n",
     "           \n",
     "    return losses"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -347,12 +350,13 @@
     "    with torch.no_grad():\n",
     "        for data in test_loader:\n",
     "            inputs, target = data\n",
+    "            inputs, target = inputs.to(device), target.to(device)\n",
     "            output_orig = model(inputs.float())\n",
     "            # run the output through sigmoid\n",
     "            output = torch.sigmoid(output_orig)  \n",
     "            # compare against a threshold of 0.5 to generate 0/1\n",
-    "            pred = (output.detach().numpy() > 0.5) * 1\n",
-    "            target = target.float()\n",
+    "            pred = (output.detach().cpu().numpy() > 0.5) * 1\n",
+    "            target = target.cpu().float()\n",
     "            y_true.extend(target.tolist()) \n",
     "            y_pred.extend(pred.reshape(-1).tolist())\n",
     "        \n",
@@ -384,7 +388,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -402,18 +406,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
     "# loss criterion and optimizer\n",
-    "criterion = nn.BCEWithLogitsLoss()\n",
+    "criterion = nn.BCEWithLogitsLoss().to(device)\n",
     "optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.999))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "metadata": {
     "scrolled": true
    },
@@ -422,7 +426,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Training loss = 0.132918 test accuracy = 0.798341: 100%|██████████| 10/10 [00:44<00:00,  4.45s/it]\n"
+      "Training loss = 0.131165 test accuracy = 0.809102: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [02:24<00:00, 14.43s/it]\n"
      ]
     }
    ],
@@ -450,14 +454,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "metadata": {
     "scrolled": true
    },
    "outputs": [
     {
      "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/d3fzzAAAACXBIWXMAAAsTAAALEwEAmpwYAAAofElEQVR4nO3de3Rd5X3m8e+jo5slSzq2JRMsHWMbTIi5SLSGXEsTSlpIO8C0uUCbhLRpmXRKmpY2DWlmpR2mWSuFTpJ2SjowJSSZkFJCksaTQsiNQNKEBBOMb9yM8N1g+Spbsu6/+eNsiSMhyTq2js+R9HzW0tLe776c3z4herz3u/e7FRGYmZlNVVmxCzAzs5nFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXFwmJ0ASQ9Ium66182zhjdL2jnd+zU7nvJiF2B2qkg6mjNbA/QCg8n8f4mIu6e6r4i4ohDrms0EDg6bMyJi/vC0pK3A70fEd8euJ6k8IgZOZW1mM4kvVdmcN3zJR9JHJL0I3CVpgaRvSuqQdDCZbsnZ5geSfj+Zfp+kH0n6u2TdFyRdcYLrLpf0iKQjkr4r6TZJX5ricbwm+axDkjZJujJn2dskbU72u0vSnyftjcmxHZJ0QNIPJfnvgk3K/4GYZb0KWAicAVxP9v8bdyXzS4FjwD9Osv1rgWeARuAW4E5JOoF1vwz8DFgE/DXwnqkUL6kC+H/At4HFwAeBuyW9OlnlTrKX4+qA84DvJ+1/BuwEmoDTgL8EPA6RTcrBYZY1BPxVRPRGxLGI2B8RX42I7og4AnwC+OVJtt8WEf8nIgaBLwCnk/1DPOV1JS0FLgI+HhF9EfEjYM0U638dMB/4ZLLt94FvAtcmy/uBVZLqI+JgRPw8p/104IyI6I+IH4YHsLPjcHCYZXVERM/wjKQaSbdL2iapE3gESEtKTbD9i8MTEdGdTM7Pc90lwIGcNoAdU6x/CbAjIoZy2rYBzcn0bwFvA7ZJeljS65P2W4EtwLcltUu6aYqfZ3OYg8Msa+y/sv8MeDXw2oioBy5J2ie6/DQd9gALJdXktGWmuO1uIDOmf2IpsAsgIh6LiKvIXsb6N+DepP1IRPxZRKwArgRulPQrJ3cYNts5OMzGV0e2X+OQpIXAXxX6AyNiG7AW+GtJlclZwX+a4uY/BbqBv5BUIenNybb3JPv6HUkNEdEPdJK9NIek35B0VtLHcpjs7clD436CWcLBYTa+zwDzgH3Ao8C3TtHn/g7wemA/8DfAv5J93mRSEdFHNiiuIFvzZ4H3RsTTySrvAbYml90+kHwOwErgu8BR4CfAZyPioWk7GpuV5H4ws9Il6V+BpyOi4Gc8ZlPlMw6zEiLpIklnSiqTdDlwFdk+CbOS4SfHzUrLq4CvkX2OYyfwhxHxRHFLMhvNl6rMzCwvvlRlZmZ5mROXqhobG2PZsmXFLsPMbEZ5/PHH90VE09j2OREcy5YtY+3atcUuw8xsRpG0bbx2X6oyM7O8ODjMzCwvDg4zM8uLg8PMzPLi4DAzs7w4OMzMLC8ODjMzy4uDYxLfWLeLLz067m3MZmZzloNjEt/a+CJ3PNJe7DLMzEqKg2MSrZk02w90c6Crr9ilmJmVDAfHJFpb0gA8ueNQUeswMyslDo5JXNDSQJlgnYPDzGyEg2MStVXlrFxcx5M7DxW7FDOzkuHgOI62TJondxzCL7wyM8tycBxHaybNwe5+th/oLnYpZmYloaDBIelySc9I2iLppnGWf0DSBknrJP1I0qqcZR9NtntG0q9NdZ/TrTXTALifw8xsWMGCQ1IKuA24AlgFXJsbDIkvR8T5EdEG3AJ8Ktl2FXANcC5wOfBZSakp7nNavfq0OqoryhwcZmaJQp5xXAxsiYj2iOgD7gGuyl0hIjpzZmuB4Y6Eq4B7IqI3Il4AtiT7O+4+p1t5qozzmxt8S66ZWaKQwdEM7MiZ35m0jSLpjyQ
9T/aM44+Ps+2U9pns93pJayWt7ejoOOGDgOzzHBt3d9I/OHRS+zEzmw2K3jkeEbdFxJnAR4D/No37vSMiVkfE6qamV7xrPS9tS9P0DQzx9J4j01SdmdnMVcjg2AVkcuZbkraJ3ANcfZxt893ntBh+gnydn+cwMytocDwGrJS0XFIl2c7uNbkrSFqZM/vrwHPJ9BrgGklVkpYDK4GfTWWfhdCyYB6LaitZt/1QoT/KzKzklRdqxxExIOkG4EEgBXwuIjZJuhlYGxFrgBskXQb0AweB65JtN0m6F9gMDAB/FBGDAOPts1DHMExS9kFAn3GYmRUuOAAi4n7g/jFtH8+Z/tAk234C+MRU9nkqtGbSfP+ZvXT29FNfXXGqP97MrGQUvXN8pmjLpImADTsPF7sUM7OicnBM0QUtfoLczAwcHFOWrqlkeWOtHwQ0sznPwZGHtkyadR4p18zmOAdHHlpbGth7pJcXO3uKXYqZWdE4OPLQmkkDfpWsmc1tDo48rFpST0VKPOHgMLM5zMGRh6ryFKtOr/cZh5nNaQ6OPLVm0mzYeZjBIXeQm9nc5ODIU1smTVffIFv2Hi12KWZmReHgyJM7yM1srnNw5Gn5olrqq8s9xLqZzVkOjjyVlYnWTNpDrJvZnOXgOAGtLWmeeekIx/oGi12Kmdkp5+A4AW2ZNINDwcbdHinXzOYeB8cJuCCTHSnXHeRmNhc5OE7A4rpqmtPzPMS6mc1JDo4TNDxSrpnZXOPgOEGtmQZ2HjzGvqO9xS7FzOyUKmhwSLpc0jOStki6aZzlN0raLGm9pO9JOiNpf4ukdTk/PZKuTpZ9XtILOcvaCnkME2ltSQOw3s9zmNkcU7DgkJQCbgOuAFYB10paNWa1J4DVEXEBcB9wC0BEPBQRbRHRBlwKdAPfztnuw8PLI2JdoY5hMue3NFAm/DyHmc05hTzjuBjYEhHtEdEH3ANclbtCEhDdyeyjQMs4+3k78EDOeiWhprKcs0+rY91O35JrZnNLIYOjGdiRM78zaZvI+4EHxmm/BviXMW2fSC5vfVpS1Xg7k3S9pLWS1nZ0dORT95S1ZdI86VfJmtkcUxKd45LeDawGbh3TfjpwPvBgTvNHgXOAi4CFwEfG22dE3BERqyNidVNTU0HqbsukOXysn637S+pkyMysoAoZHLuATM58S9I2iqTLgI8BV0bE2FuU3gl8PSL6hxsiYk9k9QJ3kb0kVhQeKdfM5qJCBsdjwEpJyyVVkr3ktCZ3BUkXAreTDY294+zjWsZcpkrOQpAk4Gpg4/SXPjVnn1ZHTWXKz3OY2ZxSXqgdR8SApBvIXmZKAZ+LiE2SbgbWRsQaspem5gNfyeYA2yPiSgBJy8iesTw8Ztd3S2oCBKwDPlCoYzieVJk4r7nBwWFmc0rBggMgIu4H7h/T9vGc6csm2XYr43SmR8Sl01jiSWvLpPn8f2ylb2CIyvKS6DIyMyso/6U7SW2ZNH2DQzy1p7PYpZiZnRIOjpM00kHuJ8jNbI5wcJykJQ3VNM6vcj+Hmc0ZDo6TJMkj5ZrZnOLgmAZtmQbaO7o4fKz/+Cubmc1wDo5pMNzPscHjVpnZHODgmAYXJEOsr9txsLiFmJmdAg6OadAwr4IVTbWs2+EzDjOb/Rwc02S4g9wj5ZrZbOfgmCZtmTT7jvay+3BPsUsxMysoB8c0GX6VrEfKNbPZzsExTV5zej2VqTIHh5nNeg6OaVJZXsaqJfU84eAws1nOwTGN2jJpNuw8zMDgULFLMTMrGAfHNGrLpDnWP8hze48WuxQzs4JxcEwjv0rWzOYCB8c0WraohoZ5FR5i3cxmNQfHNJJEaybNE9sPFbsUM7OCcXBMs7aWBp596QjdfQPFLsXMrCAKGhySLpf0jKQtkm4aZ/mNkjZLWi/pe5LOyFk2KGld8rMmp325pJ8m+/xXSZWFPIZ8tS1NMxSwcZdfJWtms1PBgkNSCrgNuAJYBVwradWY1Z4AVkfEBcB9wC0
5y45FRFvyc2VO+98Cn46Is4CDwPsLdQwnwiPlmtlsV8gzjouBLRHRHhF9wD3AVbkrRMRDEdGdzD4KtEy2Q0kCLiUbMgBfAK6ezqJPVuP8KloWzONJj5RrZrNUIYOjGdiRM78zaZvI+4EHcuarJa2V9Kikq5O2RcChiBjuQJhwn5KuT7Zf29HRcUIHcKL8Klkzm81KonNc0ruB1cCtOc1nRMRq4LeBz0g6M599RsQdEbE6IlY3NTVNY7XH15ZJs+vQMTqO9J7SzzUzOxUKGRy7gEzOfEvSNoqky4CPAVdGxMhf2ojYlfxuB34AXAjsB9KSyifbZ7H5QUAzm80KGRyPASuTu6AqgWuANbkrSLoQuJ1saOzNaV8gqSqZbgTeCGyO7FuSHgLenqx6HfCNAh7DCTlvSQOpMvlBQDOblQoWHEk/xA3Ag8BTwL0RsUnSzZKG75K6FZgPfGXMbbevAdZKepJsUHwyIjYnyz4C3ChpC9k+jzsLdQwnal5lilefVud+DjOblcqPv8qJi4j7gfvHtH08Z/qyCbb7MXD+BMvayd6xVdJaM2n+ff1uhoaCsjIVuxwzs2lTEp3js9GFmTSdPQNs3d9V7FLMzKaVg6NAhjvIfbnKzGYbB0eBnLV4PrWVKd9ZZWazjoOjQFJl4vyWBp9xmNms4+AooNZMms17OukdGCx2KWZm08bBUUAXZtL0DwZP7TlS7FLMzKaNg6OARjrIt3ukXDObPRwcBfSq+moW11Xx5E6PlGtms4eDo4Ak0ZZJ+84qM5tVHBwF1ppJ076vi8Pd/cUuxcxsWjg4CqxteKRcD3hoZrOEg6PAzm9pQPIQ62Y2ezg4Cqy+uoIzm+b7QUAzmzUcHKdAa0uaJ3ceIvs6ETOzmc3BcQq0LU2z72gfuw4dK3YpZmYnzcFxCrS1pAGPlGtms4OD4xQ45/Q6KsvL3EFuZrOCg+MUqEiVcd6Sep9xmNms4OA4RVozaTbsOszA4FCxSzEzOylTCg5JtZLKkumzJV0pqWIK210u6RlJWyTdNM7yGyVtlrRe0vcknZG0t0n6iaRNybJ35WzzeUkvSFqX/LRN+WiLqC2Tpqd/iGdfOlrsUszMTspUzzgeAaolNQPfBt4DfH6yDSSlgNuAK4BVwLWSVo1Z7QlgdURcANwH3JK0dwPvjYhzgcuBz0hK52z34YhoS37WTfEYiqrNr5I1s1liqsGhiOgGfhP4bES8Azj3ONtcDGyJiPaI6APuAa7KXSEiHkr2C/Ao0JK0PxsRzyXTu4G9QNMUay1JSxfWsKCmwh3kZjbjTTk4JL0e+B3g35O21HG2aQZ25MzvTNom8n7ggXE++GKgEng+p/kTySWsT0uqmqDg6yWtlbS2o6PjOKUWniRaM2mPWWVmM95Ug+NPgI8CX4+ITZJWAA9NVxGS3g2sBm4d03468H+B342I4V7ljwLnABcBC4GPjLfPiLgjIlZHxOqmptI4WWltSfPsS0fo6h0odilmZidsSsEREQ9HxJUR8bdJJ/m+iPjj42y2C8jkzLckbaNIugz4GHBlRPTmtNeTPbv5WEQ8mlPLnsjqBe4ie0lsRmjLpBkK2LDLL3Yys5lrqndVfVlSvaRaYCOwWdKHj7PZY8BKScslVQLXAGvG7PdC4HayobE3p70S+DrwxYi4b8w2pye/BVyd1DMjDL9K1v0cZjaTTfVS1aqI6CT7h/oBYDnZO6smFBEDwA3Ag8BTwL3JZa6bJV2ZrHYrMB/4SnJr7XCwvBO4BHjfOLfd3i1pA7ABaAT+ZorHUHQLaytZurDGd1aZ2YxWPsX1KpLnNq4G/jEi+iUdd6jXiLgfuH9M28dzpi+bYLsvAV+aYNmlU6y5JLVl0qzdeqDYZZiZnbCpnnHcDmwFaoFHkgf1OgtV1GzWmkmz+3APezt7il2KmdkJmWrn+D9ERHNEvC3pmN4GvKXAtc1KbZkGwA8CmtnMNdXO8QZJnxp+LkLS/yR79mF5OndJA+Vl8vM
cZjZjTfVS1eeAI2Q7rd9J9jLVXYUqajarrkhxzul1PuMwsxlrqp3jZ0bEb+XM/3dJ6wpQz5zQ2pJmzbrdDA0FZWUqdjlmZnmZ6hnHMUlvGp6R9EbA70E9QW2ZNEd6B2jf11XsUszM8jbVM44PAF+U1JDMHwSuK0xJs1/uSLlnLZ5f3GLMzPI01buqnoyIVuAC4IKIuBCY0c9TFNOKpvnMryr3E+RmNiPl9QbAiOhMniAHuLEA9cwJqTJxQUuD76wysxnpZF4d617dk9CaSfPUnk56+geLXYqZWV5OJjiOO+SITay1JU3/YLB5jx/AN7OZZdLOcUlHGD8gBMwrSEVzxIVL00B2pNxfWLqguMWYmeVh0uCIiLpTVchcc1p9Na+qr/aDgGY245zMpSo7SW2ZtO+sMrMZx8FRRK2ZNFv3d3Oou6/YpZiZTZmDo4haPVKumc1ADo4iuqAljQRP7vA7yM1s5nBwFNH8qnJWLp7vBwHNbEZxcBRZa0uadTsOEeHHYsxsZihocEi6XNIzkrZIummc5TdK2ixpvaTvJa+kHV52naTnkp/rctp/UdKGZJ//IGlGP8HetjTNga4+dh70YMNmNjMULDgkpYDbgCuAVcC1klaNWe0JYHVEXADcB9ySbLsQ+CvgtcDFwF9JGn5K7p+APwBWJj+XF+oYToXWljQAT7iD3MxmiEKecVwMbImI9ojoA+4BrspdISIeiojuZPZRoCWZ/jXgOxFxICIOAt8BLpd0OlAfEY9G9trOF4GrC3gMBffqV9VRVV7m5znMbMYoZHA0Azty5ncmbRN5P/DAcbZtTqaPu09J1w+/I72joyPP0k+dilQZ5zc3ODjMbMYoic5xSe8GVgO3Ttc+I+KOiFgdEaubmpqma7cF0ZpJs2HXYfoHh4pdipnZcRUyOHYBmZz5lqRtFEmXAR8DroyI3uNsu4uXL2dNuM+Zpi2TpndgiGdePFLsUszMjquQwfEYsFLSckmVwDXAmtwVJF0I3E42NPbmLHoQ+FVJC5JO8V8FHoyIPUCnpNcld1O9F/hGAY/hlBh+layf5zCzmaBgwRERA8ANZEPgKeDeiNgk6WZJVyar3QrMB74iaZ2kNcm2B4D/QTZ8HgNuTtoA/ivwz8AW4Hle7heZsVoWzGNhbSXrth8qdilmZsc16bDqJysi7gfuH9P28ZzpyybZ9nPA58ZpXwucN41lFp2k7Ei5PuMwsxmgJDrHLfs8x3N7j3K0d6DYpZiZTcrBUSJaMw1EwHqfdZhZiXNwlIiRDnKPlGtmJc7BUSLSNZUsW1TjBwHNrOQ5OEpIaybtlzqZWclzcJSQtkyaFzt7ePFwT7FLMTObkIOjhLQm/Rw+6zCzUubgKCGrTq+nIiU/z2FmJc3BUUKqK1K85vR6d5CbWUlzcJSY1pY063ceZnDIr5I1s9Lk4CgxbZk0R3sHaO84WuxSzMzG5eAoMe4gN7NS5+AoMSsaa6mrLndwmFnJcnCUmLIy0drikXLNrHQ5OEpQa6aBp/ccoad/sNilmJm9goOjBLW2pBkYCjbt9oCHZlZ6HBwlqG2kg9zBYWalx8FRghbXV7OkodoPAppZSXJwlCiPlGtmpaqgwSHpcknPSNoi6aZxll8i6eeSBiS9Paf9LZLW5fz0SLo6WfZ5SS/kLGsr5DEUS1smzfYD3Rzo6it2KWZmoxQsOCSlgNuAK4BVwLWSVo1ZbTvwPuDLuY0R8VBEtEVEG3Ap0A18O2eVDw8vj4h1hTmC4modeSPgoaLWYWY2ViHPOC4GtkREe0T0AfcAV+WuEBFbI2I9MDTJft4OPBAR3YUrtfSc39xAmfwEuZmVnkIGRzOwI2d+Z9KWr2uAfxnT9glJ6yV9WlLViRZYymqryjn7tDo/CGhmJaekO8clnQ6cDzyY0/xR4BzgImAh8JEJtr1e0lpJazs6OgpeayG0tqR5cschIjxSrpmVjkIGxy4gkzPfkrTl453A1yOif7ghIvZEVi9wF9l
LYq8QEXdExOqIWN3U1JTnx5aGtqVpDnb3s/3AnLpKZ2YlrpDB8RiwUtJySZVkLzmtyXMf1zLmMlVyFoIkAVcDG0++1NLU2pIG3M9hZqWlYMEREQPADWQvMz0F3BsRmyTdLOlKAEkXSdoJvAO4XdKm4e0lLSN7xvLwmF3fLWkDsAFoBP6mUMdQbGefNp95FSkHh5mVlPJC7jwi7gfuH9P28Zzpx8hewhpv262M05keEZdOb5WlqzxVxvnNDb4l18xKSkl3jlt2pNyNuzvpH5zsjmUzs1PHwVHiWjNp+gaGeHrPkWKXYmYGODhK3shIuX6ew8xKhIOjxDWn59Gcnsct33qaf/jecxzp6T/+RmZmBeTgKHGS+MLvXcwbzlzEp77zLJfc8hC3P/w8x/r8dkAzKw7NhaeSV69eHWvXri12GSdt/c5DfOo7z/KDZzponF/FDW85k2tfu5Sq8lSxSzOzWUjS4xGx+hXtDo6ZZ+3WA/zdt5/h0fYDLGmo5oZLV/KO1S1UpHwCaWbTx8Exi4Jj2I+37OPWbz/DE9sPsXRhDX9y2UquamsmVaZil2Zms8BEweF/os5gbzirka/94Ru4630XUVddzo33Psmvfvphvrl+N0NDs/8fBGZWHA6OGU4SbzlnMd/84Jv43+/+BVJl4oYvP8Hb/uGHfGfzSx5Z18ymnYNjlpDE5eedzgMfuoS/v6aNnv5B/uCLa7n6tv/gkWc7HCBmNm0cHLNMqkxc1dbMd2/8ZW75rQvYd7SP937uZ7zr9kf5afv+YpdnZrOAO8dnud6BQe59bAf/6/tb2Hukl19a2ciNbz2bC5cuKHZpZlbifFfVHA2OYT39g3zp0W189gfPc6Crj8tes5g/fevZnLukodilmVmJcnDM8eAY1tU7wOd/vJXbH36ezp4Bfv380/nTt67krMV1xS7NzEqMg8PBMcrhY/3c+cN27vzRCxzrH+TqtmY+dNlKzlhUW+zSzKxEODgcHOM60NXH7Q8/zxd+spX+weAdv9jCB39lJc3pecUuzcyKzMHh4JjU3s4ePvuD5/nyT7cDcO3FGf7oLWexuL66yJWZWbE4OBwcU7Lr0DH+8fvP8ZW1O0mVievesIwP/PKZLKytLHZpZnaKOTgcHHnZtr+Lv//uc3x93S5qKlL83puW8/u/tIKGeRXFLs3MTpGiBIeky4G/B1LAP0fEJ8csvwT4DHABcE1E3JezbBDYkMxuj4grk/blwD3AIuBx4D0R0TdZHQ6OE/fcS0f4zHef49837KGuupxLVjZxbnM95zc3cN6SBhb4TMRs1jrlwSEpBTwLvBXYCTwGXBsRm3PWWQbUA38OrBkTHEcjYv44+70X+FpE3CPpfwNPRsQ/TVaLg+Pkbdp9mP/zSDuPbz/IjgPHRtqb0/M4LwmSc5MwaaqrKmKlZjZdJgqO8gJ+5sXAlohoTwq4B7gKGAmOiNiaLBuayg4lCbgU+O2k6QvAXwOTBoedvHOXNPCZay4E4FB3H5t2d7Jh12E27jrMpt2dPLjppZF1X1VfzXnN9Zy7pCF7ZtLcwGn1VWT/5zOzma6QwdEM7MiZ3wm8No/tqyWtBQaAT0bEv5G9PHUoIgZy9tk83saSrgeuB1i6dGl+lduk0jWVvPGsRt54VuNIW2dPP5t3d7IxCZONuzv53tN7GT6hbZxfxXnN9Zy3JBsk5zXX05ye5zAxm4EKGRwn64yI2CVpBfB9SRuAw1PdOCLuAO6A7KWqAtVoifrqCl63YhGvW7FopK2rd4Cn9mTDZMOuTjbtPswPn9vHYPKukAU1FZzX3JBzZlLP0oU1DhOzElfI4NgFZHLmW5K2KYmIXcnvdkk/AC4EvgqkJZUnZx157dNOrdqqclYvW8jqZQtH2nr6B7NhsruTjTsPs3H3Ye78UTv9g9kwqasuT85K6pMzkwaWL6qlzG81NCsZhQyOx4CVyV1Qu4BreLlvYlKSFgDdEdErqRF4I3BLRISkh4C3k72z6jr
gGwWp3gqiuiLFhUsXjBqdt3dgkGdfPMrG3YfZsOswm3Yd5gs/2UbfQLbrq7YyxblLGkbu5jp3SQPLGmuoKk8V6zDM5rRC3477NrK326aAz0XEJyTdDKyNiDWSLgK+DiwAeoAXI+JcSW8AbgeGyL4z5DMRcWeyzxVkQ2Mh8ATw7ojonawO31U18/QPDvHcS9kwGe432bynk57+bJiUCTILa1jeWMuKxvmsaKplRWMtK5rmuyPebJr4AUAHx4w3MDhE+74uNu0+zAsdXTy/r4v2ji5e2Hd0JFAAaipT2UBpms/yxlrObMqGy/KmWuZXlXK3nllpKcbtuGbTqjxVxtmn1XH2aaOHgB8aCl7s7BkJkec7umjf18W6HQf55vrd5P7baHFdFSuaalneOD8bKMl0ZsE8ylN+IabZVDg4bMYrKxNL0vNYkp7Hm1Y2jlrW0z/I9gPdtHdkA+WFfV20dxzlgY17ONTdP7JeRUosXVgzEijDZywrmmpZVFvpS19mORwcNqtVV6TGPUsBONjVR/u+o7QnZyjtHUd5YV8XjzzbQd/gy5e+6qrLWdE0nzMba0ddAlveWMu8SnfQ29zj4LA5a0FtJb9Yu5BfPGPhqPbBoWDXwWMjofLCvi7a9x3lJ+37+doTo+/+flV9NWcsqmHZolrOaEx+L6rhjEXuT7HZy/9lm42RKhNLF9WwdFENb3716GXdfQPJ5a5soGzb3822/V187+m97Ds6+ua+xvlVLEtCZFmyv2WLalm2qJaGGo8ybDOXg8MsDzWV5dlnSpY0vGLZ0d4Btu3PhsnW/V1s29fNtgNd/Pj5fXz15z2j1k3XVIwEytjfC92nYiXOwWE2TeZXTRwqw530W/flBMv+bh7fdpD/9+RuhnLu/KqrKueMxrGBkr0EtrjOz6hY8Tk4zE6ByTrpewcG2XnwGNv2d7F1X3c2YPZ3sXl3Jw9ufJGBnFSZV5FK+lCG+1NqWTS/kqGhYGAoGBz5PUT/4Oj5gaFgcPDl9fqHhkbNj1ovmR8YHMpZFgyM7DPb3p8zX1NZzuK6KhbXV3NafRWnJb8X11WzuL6KRbVVpDx0zKzg4DArsqryFGc2zefMple8foaBwSF2H+pJzlC62Jr0qTzf0cVDT4+++ysfqTJRnvykykR5qmykLfd3xSvas/NVFeU5+8i2He0dYPfhHtbtOMT+rle+Wy1VJprmV2XDpL6axXU54VJfzWlJwCysqfTYZCXOwWFWwspTZSMd9dA0atlg8uDjwa4+ylPDf9zLcsLg5T/qYwOh0Je7+gaG6Djay0udPezt7GXvkR5e6uzhpc5s244D3azdeoCDOc/SjBxzmUaduSyuywmXnLYFNRW+bFckDg6zGSpVJprT82hOzyt2Ka9QWV42pdp6+gfpOJINlr1JqLx05OXAeWFfF4+2H+DwsVcGTGWqjKa6qpHLYsNhU1ddTlV5GVXlqezvipenqytSLy+rKBuZriwv82W0PDg4zKxoqitSZBbWkFlYM+l6Pf2D2WAZFTAvTz+39yg/2rKPIz0Dk+5nMhUpvRw25WVUDYdMRU7bmMAZG0zD4VRZXkZFaswZYHJJcOzZ38hZYWr89orUmPXKVPRLeQ4OMyt51RWpnEt2EzvWN0hX3wC9A0P09g9mf4+dHhiktz873TPSnvzuz5ketd0gR3sH2H+0b2R5T866w68AOFUkXhk8EwTUnddddNzvLV8ODjObNeZVpooyDMzQUNA3ODqYcu9G6x97d9pQzvzgOO2D49/l9vKdba9sf+VnZNerqpj+wTsdHGZmJ6msTFSXpaiuSAGzf1QAjyNtZmZ5cXCYmVleHBxmZpYXB4eZmeWloMEh6XJJz0jaIummcZZfIunnkgYkvT2nvU3STyRtkrRe0rtyln1e0guS1iU/bYU8BjMzG61gd1VJSgG3AW8FdgKPSVoTEZtzVtsOvA/48zGbdwPvjYjnJC0BHpf0YEQcSpZ/OCLuK1TtZmY2sULejns
xsCUi2gEk3QNcBYwER0RsTZaNenomIp7Nmd4taS/ZgXoOFbBeMzObgkJeqmoGduTM70za8iLpYqASeD6n+RPJJaxPS6qaYLvrJa2VtLajoyPfjzUzswmU9AOAkk4H/i9wXUQMn5V8FHiRbJjcAXwEuHnsthFxR7IcSR2Stp1gGY3AvhPcdjby9/Eyfxej+fsYbTZ8H2eM11jI4NgFZHLmW5K2KZFUD/w78LGIeHS4PSL2JJO9ku7ilf0jrxARTcdbZ5I61kbE6hPdfrbx9/Eyfxej+fsYbTZ/H4W8VPUYsFLSckmVwDXAmqlsmKz/deCLYzvBk7MQlB2I/2pg43QWbWZmkytYcETEAHAD8CDwFHBvRGySdLOkKwEkXSRpJ/AO4HZJm5LN3wlcArxvnNtu75a0AdhA9lTwbwp1DGZm9kqKiOOvNYdJuj7pLzH8feTydzGav4/RZvP34eAwM7O8eMgRMzPLi4PDzMzy4uCYxPHG2porJGUkPSRpczJ+2IeKXVMpkJSS9ISkbxa7lmKTlJZ0n6SnJT0l6fXFrqlYJP1p8v+TjZL+RVJ1sWuabg6OCeSMtXUFsAq4VtKq4lZVNAPAn0XEKuB1wB/N4e8i14fI3jFo8PfAtyLiHKCVOfq9SGoG/hhYHRHnASmyjyLMKg6OiY2MtRURfcDwWFtzTkTsiYifJ9NHyP5RyHv4mNlEUgvw68A/F7uWYpPUQPb2+TsBIqIvZ0DSuagcmCepHKgBdhe5nmnn4JjYtIy1NdtIWgZcCPy0yKUU22eAvwCGjrPeXLAc6ADuSi7d/bOk2mIXVQwRsQv4O7Ijf+8BDkfEt4tb1fRzcNiUSZoPfBX4k4joLHY9xSLpN4C9EfF4sWspEeXALwD/FBEXAl3AnOwTlLSA7JWJ5cASoFbSu4tb1fRzcEzspMbamm0kVZANjbsj4mvFrqfI3ghcKWkr2UuYl0r6UnFLKqqdwM6IGD4LvY9skMxFlwEvRERHRPQDXwPeUOSapp2DY2InPNbWbJOMC3Yn8FREfKrY9RRbRHw0IloiYhnZ/y6+HxGz7l+VUxURLwI7JL06afoVct67M8dsB14nqSb5/82vMAtvFCjpYdWLKSIGJA2PtZUCPhcRm46z2Wz1RuA9wAZJ65K2v4yI+4tXkpWYD5IdR64SaAd+t8j1FEVE/FTSfcDPyd6N+ATJ6x1mEw85YmZmefGlKjMzy4uDw8zM8uLgMDOzvDg4zMwsLw4OMzPLi4PD7DgkHU1+L5P029O8778cM//j6dy/WSE4OMymbhmQV3AkA91NZlRwRMSse8rYZh8Hh9nUfRL4JUnrkncupCTdKukxSesl/RcASW+W9ENJa0ieoJb0b5IeT97TcH3S9kmyo6iuk3R30jZ8dqNk3xslbZD0rpx9/yDn3Rd3J08oI+mTyTtT1kv6u1P+7dic4SfHzabuJuDPI+I3AJIAOBwRF0mqAv5D0vBIqL8AnBcRLyTzvxcRByTNAx6T9NWIuEnSDRHRNs5n/SbQRvbdFo3JNo8kyy4EziU7XPd/AG+U9BTwn4FzIiIkpaf30M1e5jMOsxP3q8B7k2FYfgosAlYmy36WExoAfyzpSeBRsoNnrmRybwL+JSIGI+Il4GHgopx974yIIWAd2Utoh4Ee4E5Jvwl0n+SxmU3IwWF24gR8MCLakp/lOe9e6BpZSXoz2VFTXx8RrWTHLzqZ14n25kwPAuURMUD25WP3Ab8BfOsk9m82KQeH2dQdAepy5h8E/jAZch5JZ0/wAqMG4GBEdEs6h+zrd4f1D28/xg+BdyX9KE1k37D3s4kKS96V0pAMPPmnZC9xmRWE+zjMpm49MJhccvo82fdsLwN+nnRQdwBXj7Pdt4APJP0Qz5C9XDXsDmC9pJ9HxO/ktH8deD3wJBDAX0TEi0nwjKcO+IakarJnQjee0BGaTYFHxzUzs7z4UpWZmeXFwWFmZnlxcJiZWV4cHGZmlhcHh5mZ5cXBYWZmeXF
wmJlZXv4/QAgzW/yBXxUAAAAASUVORK5CYII=\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -478,12 +482,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "image/png": "\n",
+      "image/png": "\n",
       "text/plain": [
        "<Figure size 432x288 with 1 Axes>"
       ]
@@ -501,16 +505,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.798340863819657"
+       "0.8091021716950881"
       ]
      },
-     "execution_count": 15,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -521,7 +525,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -540,7 +544,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -549,7 +553,7 @@
        "<All keys matched successfully>"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -557,6 +561,10 @@
    "source": [
     "import torch\n",
     "\n",
+    "# Make sure the model is on CPU before loading a pretrained state_dict\n",
+    "model = model.cpu()\n",
+    "\n",
+    "# Load pretrained weights\n",
     "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n",
     "\n",
     "model.load_state_dict(trained_state_dict, strict=False)"
@@ -564,7 +572,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 19,
    "metadata": {
     "scrolled": true
    },
@@ -575,12 +583,16 @@
        "0.9188772287810328"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 19,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
+    "# Move the model back to its target device\n",
+    "model.to(device)\n",
+    "\n",
+    "# Test for accuracy\n",
     "test(model, test_quantized_loader)"
    ]
   },
@@ -600,6 +612,16 @@
     "Sometimes, it's desirable to make some changes to our trained network prior to export (this is known in general as \"network surgery\"). This depends on the model and is not generally necessary, but in this case we want to make a couple of changes to get better results with FINN."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Move the model to CPU before surgery\n",
+    "model = model.cpu()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -609,7 +631,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
@@ -618,7 +640,7 @@
        "(64, 593)"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -634,7 +656,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 22,
    "metadata": {},
    "outputs": [
     {
@@ -643,7 +665,7 @@
        "(64, 600)"
       ]
      },
-     "execution_count": 20,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -658,7 +680,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 23,
    "metadata": {},
    "outputs": [
     {
@@ -667,7 +689,7 @@
        "torch.Size([64, 600])"
       ]
      },
-     "execution_count": 21,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -690,11 +712,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
-    "from brevitas.core.quant import QuantType\n",
     "from brevitas.nn import QuantIdentity\n",
     "\n",
     "\n",
@@ -702,23 +723,27 @@
     "    def __init__(self, my_pretrained_model):\n",
     "        super(CybSecMLPForExport, self).__init__()\n",
     "        self.pretrained = my_pretrained_model\n",
-    "        self.qnt_output = QuantIdentity(quant_type=QuantType.BINARY, bit_width=1, min_val=-1.0, max_val=1.0)\n",
+    "        self.qnt_output = QuantIdentity(\n",
+    "            quant_type='binary', \n",
+    "            scaling_impl_type='const',\n",
+    "            bit_width=1, min_val=-1.0, max_val=1.0)\n",
     "    \n",
     "    def forward(self, x):\n",
     "        # assume x contains bipolar {-1,1} elems\n",
     "        # shift from {-1,1} -> {0,1} since that is the\n",
     "        # input range for the trained network\n",
-    "        x = (x + torch.tensor([1.0])) / 2.0  \n",
+    "        x = (x + torch.tensor([1.0]).to(x.device)) / 2.0  \n",
     "        out_original = self.pretrained(x)\n",
     "        out_final = self.qnt_output(out_original)   # output as {-1,1}     \n",
     "        return out_final\n",
     "\n",
-    "model_for_export = CybSecMLPForExport(modified_model)"
+    "model_for_export = CybSecMLPForExport(modified_model)\n",
+    "model_for_export.to(device)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 25,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -731,16 +756,17 @@
     "    with torch.no_grad():\n",
     "        for data in test_loader:\n",
     "            inputs, target = data\n",
+    "            inputs, target = inputs.to(device), target.to(device)\n",
     "            # pad inputs to 600 elements\n",
-    "            input_padded = np.pad(inputs, [(0,0), (0,7)])\n",
+    "            input_padded = torch.nn.functional.pad(inputs, (0,7,0,0))\n",
     "            # convert inputs to {-1,+1}\n",
-    "            input_scaled = 2*input_padded - 1\n",
+    "            input_scaled = 2 * input_padded - 1\n",
     "            # run the model\n",
-    "            output = model(torch.from_numpy(input_scaled).float())\n",
-    "            y_pred.extend(list(output.flatten()))\n",
+    "            output = model(input_scaled.float())\n",
+    "            y_pred.extend(list(output.flatten().cpu().numpy()))\n",
     "            # make targets bipolar {-1,+1}\n",
-    "            expected = 2*target.float() - 1\n",
-    "            expected = expected.detach().numpy()\n",
+    "            expected = 2 * target.float() - 1\n",
+    "            expected = expected.cpu().numpy()\n",
     "            y_true.extend(list(expected.flatten()))\n",
     "        \n",
     "    return accuracy_score(y_true, y_pred)"
@@ -748,7 +774,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 26,
    "metadata": {},
    "outputs": [
     {
@@ -757,7 +783,7 @@
        "0.9188772287810328"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -780,7 +806,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 27,
    "metadata": {
     "scrolled": true
    },
@@ -791,16 +817,6 @@
      "text": [
       "Model saved to cybsec-mlp-ready.onnx\n"
      ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "<ipython-input-22-78c27bb59095>:15: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
-      "  x = (x + torch.tensor([1.0])) / 2.0\n",
-      "/workspace/brevitas/src/brevitas/quant_tensor/__init__.py:74: TracerWarning: torch.tensor results are registered as constants in the trace. You can safely ignore this warning if you use this function to create tensors out of constant variables that would be the same every time you call this function. In any other case, this might cause the trace to be incorrect.\n",
-      "  training = torch.tensor(training, dtype=torch.bool)\n"
-     ]
     }
    ],
    "source": [
@@ -809,6 +825,7 @@
     "\n",
     "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n",
     "input_shape = (1, 600)\n",
+    "\n",
     "# create a QuantTensor instance to mark input as bipolar during export\n",
     "input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32)\n",
     "input_a = 2 * input_a - 1\n",
@@ -818,6 +835,10 @@
     "    input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True\n",
     ")\n",
     "\n",
+    "# Move to CPU before export\n",
+    "model_for_export.cpu()\n",
+    "\n",
+    "# Export to ONNX\n",
     "bo.export_finn_onnx(\n",
     "    model_for_export, export_path=ready_model_filename, input_t=input_qt\n",
     ")\n",
@@ -843,38 +864,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Serving 'cybsec-mlp-ready.onnx' at http://0.0.0.0:8081\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "\n",
-       "        <iframe\n",
-       "            width=\"100%\"\n",
-       "            height=\"400\"\n",
-       "            src=\"http://localhost:8081/\"\n",
-       "            frameborder=\"0\"\n",
-       "            allowfullscreen\n",
-       "        ></iframe>\n",
-       "        "
-      ],
-      "text/plain": [
-       "<IPython.lib.display.IFrame at 0x7fb36398c3a0>"
-      ]
-     },
-     "execution_count": 26,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "from finn.util.visualization import showInNetron\n",
     "\n",
@@ -888,18 +880,11 @@
     "## That's it! <a id=\"thats_it\" ></a>\n",
     "You created, trained and tested a quantized MLP that is ready to be loaded into FINN, congratulations! You can now proceed to the next notebook."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -913,7 +898,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.7.0"
   }
  },
  "nbformat": 4,
diff --git a/requirements.txt b/requirements.txt
index da0ec0b63092f0618bb7c9982b95fa90e8f91118..87386cfbbd03393c8b5936f5510b86c8cf25557f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,7 +8,7 @@ numpy==1.18.0
 onnx==1.7.0
 onnxoptimizer
 onnxruntime==1.4.0
-pre-commit==2.6.0
+pre-commit==2.9.2
 pyscaffold==3.2.1
 scipy==1.5.2
 setupext-janitor>=1.1.2
diff --git a/setup.cfg b/setup.cfg
index 96618e0ffcb8dcb217185c67948a71a132a7b45a..c1dff9bd9b44fc7ca7a02ad0891fd75f10009530 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -106,7 +106,6 @@ console_scripts =
 [test]
 # py.test options when running `python setup.py test`
 # addopts = --verbose
-extras = True
 
 [tool:pytest]
 # Options for py.test:
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index 807fd706860d7e4667107ddd2ed46ea2b123c3ec..cb7ad10761852fd0cc2f10a64fc16bd73a08e55e 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -34,7 +34,7 @@ from enum import Enum
 from typing import Any, List, Optional
 
 from finn.transformation.fpgadataflow.vitis_build import VitisOptStrategy
-from finn.util.basic import alveo_part_map, pynq_part_map
+from finn.util.basic import alveo_default_platform, alveo_part_map, pynq_part_map
 
 
 class ShellFlowType(str, Enum):
@@ -257,6 +257,8 @@ class DataflowBuildConfig:
     #: Which Vitis platform will be used.
     #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
     #: e.g. "xilinx_u250_xdma_201830_2"
+    #: If not specified but "board" is specified, will use the FINN
+    #: default (if any) for that Alveo board
     vitis_platform: Optional[str] = None
 
     #: Path to JSON config file assigning each layer to an SLR.
@@ -340,7 +342,7 @@ class DataflowBuildConfig:
         if self.target_fps is None:
             return None
         else:
-            n_clock_cycles_per_sec = 10 ** 9 / self.synth_clk_period_ns
+            n_clock_cycles_per_sec = 10**9 / self.synth_clk_period_ns
             n_cycles_per_frame = n_clock_cycles_per_sec / self.target_fps
             return int(n_cycles_per_frame)
 
@@ -356,6 +358,17 @@ class DataflowBuildConfig:
         }
         return name_to_strategy[self.vitis_opt_strategy]
 
+    def _resolve_vitis_platform(self):
+        """Return the explicit Vitis platform, else the board's FINN default."""
+        if self.vitis_platform is not None:
+            return self.vitis_platform
+        if self.board in alveo_default_platform:
+            return alveo_default_platform[self.board]
+        raise Exception(
+            "Could not resolve Vitis platform: need vitis_platform or a board"
+            " with a FINN default platform, got board=%s" % str(self.board)
+        )
+
     def _resolve_verification_steps(self):
         if self.verify_steps is None:
             return []
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index c977f15e7090f5cae633a013f5eb9e6b3dd34dd2..7748626f0794a283262ea5283a550416b7489f26 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -397,7 +397,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
         model = model.transform(AnnotateCycles())
         estimate_network_performance = model.analysis(dataflow_performance)
         # add some more metrics to estimated performance
-        n_clock_cycles_per_sec = (10 ** 9) / cfg.synth_clk_period_ns
+        n_clock_cycles_per_sec = (10**9) / cfg.synth_clk_period_ns
         est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"]
         estimate_network_performance["estimated_throughput_fps"] = est_fps
         est_latency_ns = (
@@ -451,7 +451,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
             InsertAndSetFIFODepths(
                 cfg._resolve_fpga_part(),
                 cfg._resolve_hls_clk_period(),
-                vivado_ram_style=cfg.large_fifo_mem_style.value,
+                vivado_ram_style=cfg.large_fifo_mem_style,
             )
         )
     else:
@@ -599,7 +599,7 @@ def step_out_of_context_synthesis(model: ModelWrapper, cfg: DataflowBuildConfig)
 
         estimate_network_performance = model.analysis(dataflow_performance)
         # add some more metrics to estimated performance
-        n_clock_cycles_per_sec = float(ooc_res_dict["fmax_mhz"]) * (10 ** 6)
+        n_clock_cycles_per_sec = float(ooc_res_dict["fmax_mhz"]) * (10**6)
         est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"]
         ooc_res_dict["estimated_throughput_fps"] = est_fps
         with open(report_dir + "/ooc_synth_and_timing.json", "w") as f:
@@ -644,7 +644,7 @@ def step_synthesize_bitfile(model: ModelWrapper, cfg: DataflowBuildConfig):
                 VitisBuild(
                     cfg._resolve_fpga_part(),
                     cfg.synth_clk_period_ns,
-                    cfg.vitis_platform,
+                    cfg._resolve_vitis_platform(),
                     strategy=cfg._resolve_vitis_opt_strategy(),
                     enable_debug=cfg.enable_hw_debug,
                     floorplan_file=cfg.vitis_floorplan_file,
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 417a505898fb1aba751e4b44db336b8cf313cb6a..a5c88307ca6057444e223153e0e7f5b4eba76a91 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -28,6 +28,7 @@
 
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
+from finn.custom_op.fpgadataflow.concat import StreamingConcat
 from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
     ConvolutionInputGenerator,
 )
@@ -83,3 +84,4 @@ custom_op["IODMA"] = IODMA
 custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
 custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
 custom_op["Lookup"] = Lookup
+custom_op["StreamingConcat"] = StreamingConcat
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..3d61d3abc2b0411e107271586fba7a2c29b5fce5
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -0,0 +1,370 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.core.datatype import DataType
+from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.util.basic import roundup_to_integer_multiple
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingConcat(HLSCustomOp):
+    """Streaming concatenation node with dynamically generated HLS.
+    Only supports concatenating along the last axis."""
+
+    def __init__(self, onnx_node):
+        super().__init__(onnx_node)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # number of elements from each stream to concat
+            "ElemsPerStream": ("ints", True, []),
+            # FINN DataTypes for inputs; output datatype inferred from input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors for non-concat axes, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_n_inputs(self):
+        return len(self.get_nodeattr("ElemsPerStream"))
+
+    def get_total_elems(self):
+        """Sum of ElemsPerStream, i.e. the element count after concatenation."""
+        return int(np.sum(self.get_nodeattr("ElemsPerStream")))
+
+    def get_normal_input_shape(self, ind=0):
+        """Shape of input `ind`: numInputVectors followed by its element count."""
+        n_elems = self.get_nodeattr("ElemsPerStream")[ind]
+        outer_dims = list(self.get_nodeattr("numInputVectors"))
+        outer_dims.append(n_elems)
+        return tuple(outer_dims)
+
+    def get_folded_input_shape(self, ind=0):
+        return self.get_normal_input_shape(ind)
+
+    def get_normal_output_shape(self):
+        """Shape of the concatenated output: numInputVectors + [total elems]."""
+        n_total = self.get_total_elems()
+        return tuple(list(self.get_nodeattr("numInputVectors")) + [n_total])
+
+    def get_folded_output_shape(self):
+        return self.get_normal_output_shape()
+
+    def make_shape_compatible_op(self, model):
+        """Validate each input's shape, then return a const op of output shape."""
+        for idx, inp in enumerate(self.onnx_node.input):
+            expected_shape = self.get_normal_input_shape(idx)
+            actual_shape = tuple(model.get_tensor_shape(inp))
+            # the graph must agree with ElemsPerStream / numInputVectors
+            assert actual_shape == expected_shape, "Unexpected shape for " + inp
+        return super().make_const_shape_op(self.get_normal_output_shape())
+
+    def infer_node_datatype(self, model):
+        """Check every input against inputDataType, then annotate the output."""
+        expected_dt = self.get_input_datatype()
+        for inp in self.onnx_node.input:
+            idt = model.get_tensor_datatype(inp)
+            assert idt == expected_dt, "Unexpected datatype for " + inp
+        model.set_tensor_datatype(self.onnx_node.output[0], self.get_output_datatype())
+
+    def verify_node(self):
+        pass
+
+    def get_input_datatype(self, ind=0):
+        # input dt identical for all inputs
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self):
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        """Bit width of input stream `ind`: element count times bits per element."""
+        n_elems = self.get_nodeattr("ElemsPerStream")[ind]
+        bits_per_elem = self.get_input_datatype().bitwidth()
+        return n_elems * bits_per_elem
+
+    def get_outstream_width(self):
+        """Bit width of the output stream (all elements side by side)."""
+        bits_per_elem = self.get_output_datatype().bitwidth()
+        # one output word carries every element of every input stream
+        return self.get_total_elems() * bits_per_elem
+
+    def get_number_output_values(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def get_exp_cycles(self):
+        return np.prod(self.get_folded_output_shape()[:-1])
+
+    def generate_params(self, model, path):
+        """Write the generated HLS concat function to <path>/concat_impl.hpp."""
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        inp_streams = []
+        commands = []
+        idt = self.get_input_datatype()
+        total_elems = self.get_total_elems()
+        total_bw = idt.bitwidth() * total_elems
+        for (i, elems) in enumerate(elems_per_stream):
+            bw = idt.bitwidth() * elems
+            inp_stream = "hls::stream<ap_uint<%d> > &in%d" % (bw, i)
+            inp_streams.append(inp_stream)
+            cmd = "in%d.read()" % i
+            commands.append(cmd)
+        out_stream = "hls::stream<ap_uint<%d> > &out" % (total_bw)
+        inp_streams.append(out_stream)
+
+        impl_hls_code = []
+        impl_hls_code.append("void StreamingConcat(")
+        impl_hls_code.append(",".join(inp_streams))
+        impl_hls_code.append(", unsigned int numReps) {")
+        impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {")
+        impl_hls_code.append("#pragma HLS PIPELINE II=1")
+        impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw)
+        # FIXME: the order of streams for concatenation works out differently
+        # for cppsim vs rtlsim, addressed via reversing the order of commands
+        # for now
+        impl_hls_code.append("#ifdef __SYNTHESIS__")
+        impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");")
+        impl_hls_code.append("#else")
+        impl_hls_code.append("out_elem = (" + ",".join(commands) + ");")
+        impl_hls_code.append("#endif")
+        impl_hls_code.append("out.write(out_elem);")
+        impl_hls_code.append("}")
+        impl_hls_code.append("}")
+        impl_hls_code = "\n".join(impl_hls_code)
+
+        impl_filename = "{}/concat_impl.hpp".format(path)
+        with open(impl_filename, "w") as f_impl:
+            f_impl.write(impl_hls_code)
+
+    def execute_node(self, context, graph):
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        n_inps = len(self.onnx_node.input)
+        ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
+        folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+        export_idt = self.get_input_datatype()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        for i in range(n_inps):
+            inp = context[node.input[i]]
+            assert str(inp.dtype) == "float32", "Input datatype is not float32"
+            assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i]
+            # reshape input into folded form
+            inp = inp.reshape(folded_ishapes[i])
+            # make copy before saving array
+            reshaped_input = inp.copy()
+            np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input)
+
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            io_dict = {"inputs": {}, "outputs": {"out": []}}
+            for i in range(n_inps):
+                nbits = self.get_instream_width(i)
+                rtlsim_inp = npy_to_rtlsim_input(
+                    "%s/input_%d.npy" % (code_gen_dir, i),
+                    export_idt,
+                    nbits,
+                    reverse_inner=True,
+                )
+                io_dict["inputs"]["in%d" % i] = rtlsim_inp
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+
+            self.rtlsim_multi_io(sim, io_dict)
+            rtlsim_output = io_dict["outputs"]["out"]
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output,
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+                reverse_inner=True,
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+    def global_includes(self):
+        self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']
+
+    def defines(self, var):
+        """Define NumReps as the product of numInputVectors."""
+        n_reps = np.prod(self.get_nodeattr("numInputVectors"))
+        self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % n_reps]
+
+    def read_npy_data(self):
+        n_inputs = self.get_n_inputs()
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        npy_type = "float"
+        self.code_gen_dict["$READNPYDATA$"] = []
+        idt = self.get_input_datatype()
+        idt_bw = idt.bitwidth()
+        elem_hls_type = idt.get_hls_datatype_str()
+        elem_bits = idt_bw
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            npy_in = "%s/input_%d.npy" % (code_gen_dir, i)
+            self.code_gen_dict["$READNPYDATA$"].append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", in%d);'
+                % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, i)
+            )
+
+    def strm_decl(self):
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        n_inputs = self.get_n_inputs()
+        for i in range(n_inputs):
+            packed_bits = self.get_instream_width(i)
+            packed_hls_type = "ap_uint<%d>" % packed_bits
+            stream_name = "in%d" % i
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<%s> %s ("%s");'
+                % (packed_hls_type, stream_name, stream_name)
+            )
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+        )
+
+    def docompute(self):
+        """Emit the cppsim call to the generated StreamingConcat function."""
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        # argument list is in0,in1,... in input order
+        stream_args = ",".join("in%d" % idx for idx in range(self.get_n_inputs()))
+        call_stmt = "StreamingConcat(%s, out, NumReps);" % (stream_args)
+        self.code_gen_dict["$DOCOMPUTE$"] = [call_stmt]
+
+    def dataoutstrm(self):
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        dtype = self.get_output_datatype()
+        elem_bits = dtype.bitwidth()
+        packed_bits = self.get_outstream_width()
+        packed_hls_type = "ap_uint<%d>" % packed_bits
+        elem_hls_type = dtype.get_hls_datatype_str()
+        npy_type = "float"
+        npy_out = "%s/output.npy" % code_gen_dir
+        oshape = self.get_folded_output_shape()
+        oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [
+            'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
+            % (
+                packed_hls_type,
+                elem_hls_type,
+                elem_bits,
+                npy_type,
+                oshape_cpp_str,
+                npy_out,
+            )
+        ]
+
+    def save_as_npy(self):
+        self.code_gen_dict["$SAVEASCNPY$"] = []
+
+    def blackboxfunction(self):
+        """Build the top-level HLS signature: all input streams, then out."""
+        in_streams = ",".join(
+            "hls::stream<ap_uint<%d>> &in%d" % (self.get_instream_width(idx), idx)
+            for idx in range(self.get_n_inputs())
+        )
+        # output width covers every element of every input stream
+        total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
+        out_stream = "hls::stream<ap_uint<%d>> &out" % (total_width)
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        """Mark all streams as AXI-Stream ports and drop block-level control."""
+        directives = [
+            "#pragma HLS INTERFACE axis port=in%d" % idx
+            for idx in range(self.get_n_inputs())
+        ]
+        directives.append("#pragma HLS INTERFACE axis port=out")
+        # ap_ctrl_none: no block-level control protocol on the top function
+        directives.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+        self.code_gen_dict["$PRAGMAS$"] = directives
+
+    def get_instream_width_padded(self, ind=0):
+        in_width = self.get_instream_width(ind)
+        return roundup_to_integer_multiple(in_width, 8)
+
+    def get_verilog_top_module_intf_names(self):
+        """Replace the inherited s_axis list with one entry per input stream."""
+        intf_names = super().get_verilog_top_module_intf_names()
+        # each entry is (port name, stream width rounded up to a multiple of 8)
+        intf_names["s_axis"] = [
+            ("in%d_V_V" % idx, self.get_instream_width_padded(idx))
+            for idx in range(self.get_n_inputs())
+        ]
+        return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
index e43d73b1cd3ec7902fc743bfdf4d2fcad1c01dfe..6347c9e9e6923cff6c1d02d272030fbdb100604a 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
@@ -217,12 +217,13 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         dilation = self.get_nodeattr("Dilation")
         stride_h, stride_w = stride
         dilation_h, dilation_w = dilation
+        ram_style = self.get_nodeattr("ram_style")
 
         if self.get_nodeattr("SIMD") == self.get_nodeattr("IFMChannels"):
             if self.get_nodeattr("depthwise") == 0:
                 if stride_h == 1 and stride_w == 1:
                     if dilation_h == 1 and dilation_w == 1:
-                        return True
+                        return ram_style in ["auto", "distributed"]
 
         return False
 
@@ -265,6 +266,8 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         k = np.prod(self.get_nodeattr("ConvKernelDim"))
         stride = np.prod(self.get_nodeattr("Stride"))
         ram_style = self.get_nodeattr("ram_style")
+        if self.use_parallel_window_output():
+            return 0
         if ram_style == "block" or ram_style == "auto":
             ram_depth = ifm_dim * ifm_ch / simd
             if ram_depth <= 512:
@@ -297,7 +300,11 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
         k = np.prod(self.get_nodeattr("ConvKernelDim"))
         stride = np.prod(self.get_nodeattr("Stride"))
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "distributed":
+        if self.use_parallel_window_output():
+            ram_luts = math.ceil(
+                (simd * self.get_input_datatype().bitwidth() * (k + 1)) / 64
+            )
+        elif ram_style == "distributed":
             ram_luts = int(
                 (k + stride)
                 * (
@@ -312,20 +319,26 @@ class ConvolutionInputGenerator1D(HLSCustomOp):
 
     def uram_estimation(self):
         # NOTE: not tested for correctness
-        simd = self.get_nodeattr("SIMD")
-        ifm_ch = self.get_nodeattr("IFMChannels")
-        ifm_dim = np.prod(self.get_nodeattr("IFMDim"))
-        k = np.prod(self.get_nodeattr("ConvKernelDim"))
-        stride = np.prod(self.get_nodeattr("Stride"))
+        (
+            ifm_ch,
+            ifm_dim,
+            ofm_dim,
+            k,
+            stride,
+            dilation,
+        ) = self.get_1d_conv_attrs_normalized()
+        ifm_dim_y, ifm_dim_x = ifm_dim
+        k_y, k_x = k
+        stride_y, stride_x = stride
         ram_style = self.get_nodeattr("ram_style")
-        if ram_style == "ultra":
-            return int(
-                (k + stride)
-                * (
-                    math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
-                    * math.ceil(ifm_dim * ifm_ch / simd / 4096)
-                )
-            )
+        simd = self.get_nodeattr("SIMD")
+        if self.use_parallel_window_output():
+            return 0
+        elif ram_style == "ultra":
+            block_mul = 2
+            width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
+            depth_mul = math.ceil(stride_x * ifm_dim_x * (ifm_ch // simd) / 4096)
+            return block_mul * width_mul * depth_mul
         else:
             return 0
 
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
index 3b0fa55b0065e6ceeb8ad2eb7282a413adf443d7..51c8e4aea34b22a6b509eea65e9ebacaa640f234 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
@@ -29,7 +29,6 @@
 import numpy as np
 import os
 import warnings
-from onnx import TensorProto, helper
 
 from finn.core.datatype import DataType
 from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
@@ -46,6 +45,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
         my_attrs = {
             "NumChannels": ("i", True, 0),
             "PE": ("i", True, 0),
+            # how many duplicated output streams to create
+            "NumOutputStreams": ("i", True, 0),
             # FINN DataTypes for input
             "inputDataType": ("s", True, ""),
             # number of input vectors, examples:
@@ -57,6 +58,9 @@ class DuplicateStreams_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def get_num_output_streams(self):
+        return self.get_nodeattr("NumOutputStreams")
+
     def get_normal_input_shape(self):
         ch = self.get_nodeattr("NumChannels")
         vecs = list(self.get_nodeattr("numInputVectors"))
@@ -82,26 +86,13 @@ class DuplicateStreams_Batch(HLSCustomOp):
         exp_ishape = self.get_normal_input_shape()
         ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
         assert ishape == exp_ishape, "Unexpected input shape."
+        num_out = self.get_num_output_streams()
+        assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs"
 
         oshape = self.get_normal_output_shape()
-        values = np.zeros(oshape).astype(np.float32)
-        split_input = np.concatenate((values, values), axis=0)
-
-        split_in = helper.make_tensor_value_info(
-            model.make_new_valueinfo_name(), TensorProto.FLOAT, oshape
-        )
-
-        model.graph.value_info.append(split_in)  # requires clean up
-        model.set_initializer(split_in.name, split_input)
-
-        shape_comp_node = helper.make_node(
-            "Split",
-            inputs=[split_in.name],
-            outputs=[self.onnx_node.output[0], self.onnx_node.output[1]],
-            axis=0,
-        )
-
-        return shape_comp_node
+        ret = super().make_const_shape_op(oshape)
+        ret.output[:] = self.onnx_node.output
+        return ret
 
     def infer_node_datatype(self, model):
         node = self.onnx_node
@@ -115,8 +106,8 @@ class DuplicateStreams_Batch(HLSCustomOp):
             warnings.warn(warn_str)
         self.set_nodeattr("inputDataType", idt.name)
         odt = self.get_output_datatype()
-        model.set_tensor_datatype(self.onnx_node.output[0], odt)
-        model.set_tensor_datatype(self.onnx_node.output[1], odt)
+        for my_out in self.onnx_node.output:
+            model.set_tensor_datatype(my_out, odt)
 
     def verify_node(self):
         info_messages = []
@@ -133,6 +124,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
             self.get_nodeattr("executable_path")
             self.get_nodeattr("NumChannels")
             self.get_nodeattr("PE")
+            self.get_nodeattr("NumOutputStreams")
             self.get_nodeattr("inputDataType")
             info_messages.append("All necessary attributes exist")
         except Exception:
@@ -165,12 +157,46 @@ class DuplicateStreams_Batch(HLSCustomOp):
         return out_width
 
     def get_number_output_values(self):
-        return 2 * np.prod(self.get_folded_output_shape()[1:-1])
+        return self.get_num_output_streams() * np.prod(
+            self.get_folded_output_shape()[1:-1]
+        )
 
     def get_exp_cycles(self):
         # Channels/PE * batch size * fmdim * fmdim
         return np.prod(self.get_folded_output_shape()[:-1])
 
+    def generate_params(self, model, path):
+        """Write the generated HLS function to <path>/duplicate_impl.hpp."""
+        n_outputs = self.get_num_output_streams()
+        inp_streams = []
+        commands = []
+        o_stream_w = self.get_outstream_width()
+        i_stream_w = self.get_instream_width()
+        in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w)
+        inp_streams.append(in_stream)
+        commands.append("ap_uint<%d> e = in0.read();" % i_stream_w)
+        iters = self.get_number_output_values() // self.get_num_output_streams()
+        for i in range(n_outputs):
+            out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i)
+            inp_streams.append(out_stream)
+            cmd = "out%d.write(e);" % i
+            commands.append(cmd)
+
+        impl_hls_code = []
+        impl_hls_code.append("void DuplicateStreamsCustom(")
+        impl_hls_code.append(",".join(inp_streams))
+        impl_hls_code.append(") {")
+        impl_hls_code.append("for(unsigned int i = 0; i < %d; i++) {" % iters)
+        impl_hls_code.append("#pragma HLS PIPELINE II=1")
+        impl_hls_code.append("\n".join(commands))
+        impl_hls_code.append("}")
+        impl_hls_code.append("}")
+        impl_hls_code = "\n".join(impl_hls_code)
+
+        impl_filename = "{}/duplicate_impl.hpp".format(path)
+        with open(impl_filename, "w") as f_impl:
+            f_impl.write(impl_hls_code)
+
     def execute_node(self, context, graph):
         mode = self.get_nodeattr("exec_mode")
         node = self.onnx_node
@@ -178,6 +204,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
         exp_oshape = self.get_normal_output_shape()
         folded_ishape = self.get_folded_input_shape()
         folded_oshape = self.get_folded_output_shape()
+        n_outputs = self.get_num_output_streams()
 
         if mode == "cppsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -205,17 +232,15 @@ class DuplicateStreams_Batch(HLSCustomOp):
             # execute the precompiled model
             super().exec_precompiled_singlenode_model()
             # load output npy file
-            super().npy_to_dynamic_outputs(context, ["output0.npy", "output1.npy"])
-            assert (
-                context[node.output[0]].shape == folded_oshape
-            ), "cppsim \
-            did not produce expected ofolded utput shape"
-            assert (
-                context[node.output[1]].shape == folded_oshape
-            ), "cppsim \
-            did not produce expected ofolded utput shape"
-            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
-            context[node.output[1]] = context[node.output[1]].reshape(*exp_oshape)
+            super().npy_to_dynamic_outputs(
+                context, ["output%d.npy" % i for i in range(n_outputs)]
+            )
+            for i in range(n_outputs):
+                assert (
+                    context[node.output[i]].shape == folded_oshape
+                ), "cppsim \
+                did not produce expected ofolded utput shape"
+                context[node.output[i]] = context[node.output[i]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
             nbits = self.get_instream_width()
@@ -226,41 +251,30 @@ class DuplicateStreams_Batch(HLSCustomOp):
             super().toggle_clk(sim)
             rtlsim_dict = {
                 "inputs": {"in0": rtlsim_inp},
-                "outputs": {"out0": [], "out1": []},
+                "outputs": {},
             }
+            for i in range(n_outputs):
+                rtlsim_dict["outputs"]["out%d" % i] = []
             self.rtlsim_multi_io(sim, rtlsim_dict)
             odt = self.get_output_datatype()
             target_bits = odt.bitwidth()
             packed_bits = self.get_outstream_width()
             out_shape = self.get_folded_output_shape()
+            for i in range(n_outputs):
+                out_npy_path = "%s/output%d.npy" % (code_gen_dir, i)
+                rtlsim_output_to_npy(
+                    rtlsim_dict["outputs"]["out%d" % i],
+                    out_npy_path,
+                    odt,
+                    out_shape,
+                    packed_bits,
+                    target_bits,
+                )
+                # load and reshape output i
+                output = np.load(out_npy_path)
+                output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+                context[node.output[i]] = output
 
-            out_npy_path = "{}/output0.npy".format(code_gen_dir)
-            rtlsim_output_to_npy(
-                rtlsim_dict["outputs"]["out0"],
-                out_npy_path,
-                odt,
-                out_shape,
-                packed_bits,
-                target_bits,
-            )
-            # load and reshape output 0
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-            context[node.output[0]] = output
-
-            out_npy_path = "{}/output1.npy".format(code_gen_dir)
-            rtlsim_output_to_npy(
-                rtlsim_dict["outputs"]["out1"],
-                out_npy_path,
-                odt,
-                out_shape,
-                packed_bits,
-                target_bits,
-            )
-            # load and reshape output 1
-            output = np.load(out_npy_path)
-            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
-            context[node.output[1]] = output
         else:
             raise Exception(
                 """Invalid value for attribute exec_mode! Is currently set to: {}
@@ -277,7 +291,7 @@ class DuplicateStreams_Batch(HLSCustomOp):
         ), """Output1 shape doesn't match expected shape."""
 
     def global_includes(self):
-        self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+        self.code_gen_dict["$GLOBALS$"] = ['#include "duplicate_impl.hpp"']
 
     def defines(self, var):
         self.code_gen_dict["$DEFINES$"] = []
@@ -298,24 +312,23 @@ class DuplicateStreams_Batch(HLSCustomOp):
         )
 
     def strm_decl(self):
+        n_outputs = self.get_num_output_streams()
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
             'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out0 ("out0");'.format(self.get_outstream_width())
-        )
-        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out1 ("out1");'.format(self.get_outstream_width())
-        )
+        for i in range(n_outputs):
+            out_name = "out%d" % i
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<ap_uint<%d>> %s ("%s");'
+                % (self.get_outstream_width(), out_name, out_name)
+            )
 
     def docompute(self):
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """DuplicateStreams_Batch<{}, {}> (in0, out0, out1, 1);""".format(
-                self.get_outstream_width(),
-                self.get_number_output_values() // 2,
-            )
-        ]
+        n_outputs = self.get_num_output_streams()
+        ostreams = ["out%d" % x for x in range(n_outputs)]
+        dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams))
+        self.code_gen_dict["$DOCOMPUTE$"] = [dc]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -325,62 +338,67 @@ class DuplicateStreams_Batch(HLSCustomOp):
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
-        npy_out = "%s/output0.npy" % code_gen_dir
-        npy_out1 = "%s/output1.npy" % code_gen_dir
+        n_outputs = self.get_num_output_streams()
         oshape = self.get_folded_output_shape()
         oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
-        self.code_gen_dict["$DATAOUTSTREAM$"] = [
-            'apintstream2npy<%s, %s, %d, %s>(out0, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                oshape_cpp_str,
-                npy_out,
+        outstrm_code = []
+
+        for i in range(n_outputs):
+            out_name = "out%d" % i
+            npy_out = "%s/output%d.npy" % (code_gen_dir, i)
+            outstrm_code.append(
+                'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");'
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    npy_type,
+                    out_name,
+                    oshape_cpp_str,
+                    npy_out,
+                )
             )
-        ]
 
-        self.code_gen_dict["$DATAOUTSTREAM$"] += [
-            'apintstream2npy<%s, %s, %d, %s>(out1, %s, "%s");'
-            % (
-                packed_hls_type,
-                elem_hls_type,
-                elem_bits,
-                npy_type,
-                oshape_cpp_str,
-                npy_out1,
-            )
-        ]
+        self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code
 
     def save_as_npy(self):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
+        n_outputs = self.get_num_output_streams()
+        inp_streams = []
+        o_stream_w = self.get_outstream_width()
+        i_stream_w = self.get_instream_width()
+        in_stream = "hls::stream<ap_uint<%d> > &in0" % (i_stream_w)
+        inp_streams.append(in_stream)
+        for i in range(n_outputs):
+            out_stream = "hls::stream<ap_uint<%d> > &out%d" % (o_stream_w, i)
+            inp_streams.append(out_stream)
+
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            """void {}(hls::stream<ap_uint<{}>> &in0,
-                hls::stream<ap_uint<{}>> &out0,
-                hls::stream<ap_uint<{}>> &out1)""".format(
+            """void {}({})""".format(
                 self.onnx_node.name,
-                self.get_instream_width(),
-                self.get_outstream_width(),
-                self.get_outstream_width(),
+                ",".join(inp_streams),
             )
         ]
 
     def pragmas(self):
+        n_outputs = self.get_num_output_streams()
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out0")
-        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out1")
+        for i in range(n_outputs):
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=out%d" % i
+            )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
-        intf_names["m_axis"] = [
-            ("out0_V_V", self.get_outstream_width_padded()),
-            ("out1_V_V", self.get_outstream_width_padded()),
-        ]
+        n_outputs = self.get_num_output_streams()
+        intf_names["m_axis"] = []
+        for i in range(n_outputs):
+            intf_names["m_axis"].append(
+                ("out%d_V_V" % i, self.get_outstream_width_padded())
+            )
         return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/pool_batch.py
index ba8a446f2cf7541c0bd2e1dff731afe2397942ef..708a3a149abe268d122d339a5c25648630a01ff6 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/pool_batch.py
@@ -38,7 +38,7 @@ class Pool_Batch(HLSCustomOp):
     """Class that corresponds to finn-hlslib Pool_batch function.
     Requires ConvolutionInputGenerator(depthwise == 1) to format its input
 
-    Input shape (BatchSize,OutImgDim,OutImgDim,KernelSize^2*Channels)
+    Input shape (BatchSize,OutImgDim,OutImgDim,TotalKernelSize*Channels)
     Output shape (BatchSize,OutImgDim,OutImgDim,Channels)
 
     Notes:
@@ -56,13 +56,13 @@ class Pool_Batch(HLSCustomOp):
         my_attrs = {
             "Channels": ("i", True, 0),
             "PE": ("i", True, 1),
-            "KernelSize": ("i", True, 0),
+            "KernelSize": ("ints", True, []),
             # Function:
             #  - MaxPool
             #  - QuantAvgPool
             # TODO add support for AvgPool and AccPool
             "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}),
-            "OutImgDim": ("i", True, 0),
+            "OutImgDims": ("ints", True, []),
             # FINN DataTypes for inputs/outputs
             "InputDataType": ("s", True, ""),
             "OutputDataType": ("s", True, ""),
@@ -100,10 +100,11 @@ class Pool_Batch(HLSCustomOp):
 
     def get_normal_input_shape(self):
         ifm_ch = self.get_nodeattr("Channels")
-        odim = self.get_nodeattr("OutImgDim")
+        odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
         k = self.get_nodeattr("KernelSize")
-        ishape = (batch_size, odim, odim, k * k * ifm_ch)
+        k_prod = int(np.prod(k))
+        ishape = (batch_size, *odims, k_prod * ifm_ch)
         return ishape
 
     def get_folded_input_shape(self):
@@ -117,9 +118,9 @@ class Pool_Batch(HLSCustomOp):
 
     def get_normal_output_shape(self):
         ofm_ch = self.get_nodeattr("Channels")
-        odim = self.get_nodeattr("OutImgDim")
+        odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
-        oshape = (batch_size, odim, odim, ofm_ch)
+        oshape = (batch_size, *odims, ofm_ch)
         return oshape
 
     def get_folded_output_shape(self):
@@ -140,9 +141,10 @@ class Pool_Batch(HLSCustomOp):
         ifm_ch = self.get_nodeattr("Channels")
         pe = self.get_nodeattr("PE")
         k = self.get_nodeattr("KernelSize")
-        odim = self.get_nodeattr("OutImgDim")
+        k_prod = int(np.prod(k))
+        odims = self.get_nodeattr("OutImgDims")
         batch_size = self.get_nodeattr("BatchSize")
-        exp_cycles = ((ifm_ch * k * k) / pe) * odim * odim * batch_size
+        exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size
         return int(exp_cycles)
 
     def get_instream_width(self):
@@ -211,10 +213,12 @@ class Pool_Batch(HLSCustomOp):
         self.code_gen_dict["$DEFINES$"] += ["#define PE {}".format(pe)]
 
         k = self.get_nodeattr("KernelSize")
-        self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k)]
+        k_prod = int(np.prod(k))
+        self.code_gen_dict["$DEFINES$"] += ["#define KernelSize {}".format(k_prod)]
 
-        odim = self.get_nodeattr("OutImgDim")
-        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+        odims = self.get_nodeattr("OutImgDims")
+        total_odim = np.prod(odims)
+        self.code_gen_dict["$DEFINES$"] += ["#define OFMDimTotal {}".format(total_odim)]
 
         numReps = self.get_nodeattr("BatchSize")
         self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(numReps)]
@@ -275,7 +279,7 @@ class Pool_Batch(HLSCustomOp):
 
         self.code_gen_dict["$DOCOMPUTE$"] += [
             """Pool_batch<Channels, PE, KernelSize,Slice<{} >, Slice< {} > >
-        (in0,out, pool_fxn, OFMDim*OFMDim*numReps);""".format(
+        (in0,out, pool_fxn, OFMDimTotal*numReps);""".format(
                 i_hls_dt, o_hls_dt
             )
         ]
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 68cd1ff9ea680e157f59353d0c9d05afc3d9d6d7..896e7c2925e340455f98344d1275d9368f701ed9 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -358,8 +358,8 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         if noact == 0:
             odt = self.get_output_datatype()
             B = odt.bitwidth()
-            thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2 ** B - 1) * acc_bits
+            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2**B - 1) * acc_bits
 
         return int(
             c0
diff --git a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
index e0f789a8883aad83ed8c8b37a16392308bc720cc..f50c5d1ef61d1677d2c2e394c43ebd6354a5331e 100644
--- a/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
+++ b/src/finn/custom_op/fpgadataflow/vector_vector_activate_batch.py
@@ -694,8 +694,8 @@ class Vector_Vector_Activate_Batch(HLSCustomOp):
         if noact == 0:
             odt = self.get_output_datatype()
             B = odt.bitwidth()
-            thr_luts = (2 ** B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
-            comp_luts = (2 ** B - 1) * acc_bits
+            thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+            comp_luts = (2**B - 1) * acc_bits
 
         return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2)
 
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 113ccb93b839d6a3bd67e3bf8f23e477e86822c6..b2f50b1a23f85bf782c553057148173b6f94dde4 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -385,62 +385,57 @@ class InferPool_Batch(Transformation):
         graph = model.graph
         node_ind = 0
         graph_modified = False
-        for n in graph.node:
+        for node in graph.node:
             node_ind += 1
-            if n.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]:
-                # extract pool parameters
+            if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]:
+                node_input = node.input[0]
+                ishape = model.get_tensor_shape(node_input)
+                node_output = node.output[0]
+                idt = model.get_tensor_datatype(node_input)
+                oshape = model.get_tensor_shape(node_output)
+                # only support 4D input tensors (1D convs need extra dummy dim)
+                if len(ishape) != 4:
+                    continue
 
-                if n.op_type == "MaxPool":
-                    k = get_by_name(n.attribute, "kernel_shape").ints[-1]
-                    stride = get_by_name(n.attribute, "strides").ints[-1]
-                    # assumed datalayout
+                # extract pool parameters
+                if node.op_type == "MaxPool":
+                    kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints)
+                    sh, sw = list(get_by_name(node.attribute, "strides").ints)
                     dlayout = "NCHW"
-                elif n.op_type == "QuantAvgPool2d":
-                    inst = getCustomOp(n)
-                    k = inst.get_nodeattr("kernel")
-                    stride = inst.get_nodeattr("stride")
+                elif node.op_type == "QuantAvgPool2d":
+                    inst = getCustomOp(node)
+                    # QuantAvgPool2d has a single scalar attribute
+                    # for kernel size and stride (implicit square)
+                    kh = kw = inst.get_nodeattr("kernel")
+                    sh = sw = inst.get_nodeattr("stride")
                     dlayout = inst.get_nodeattr("data_layout")
-                elif n.op_type == "MaxPoolNHWC":
-                    inst = getCustomOp(n)
-                    k_shape = inst.get_nodeattr("kernel_shape")
-                    strides = inst.get_nodeattr("strides")
-                    assert k_shape[0] == k_shape[1]
-                    assert strides[0] == strides[1]
-                    k = k_shape[0]
-                    stride = strides[0]
+                elif node.op_type == "MaxPoolNHWC":
+                    inst = getCustomOp(node)
+                    kh, kw = inst.get_nodeattr("kernel_shape")
+                    sh, sw = inst.get_nodeattr("strides")
                     dlayout = "NHWC"
                 try:
-                    pad = get_by_name(n.attribute, "pads").ints[-1]
+                    pad = list(get_by_name(node.attribute, "pads").ints)
                 except AttributeError:
-                    pad = 0
-
-                node_input = n.input[0]
-                node_output = n.output[0]
-                idt = model.get_tensor_datatype(node_input)
+                    pad = [0, 0, 0, 0]
 
                 if not idt.is_integer():
                     continue
 
-                if k < stride:
+                if (kh < sh) or (kw < sw):
+                    # TODO check/implement swg support
                     continue
-                elif k == stride:
-                    warnings.warn(
-                        n.name
-                        + """: Inferring Pool_Batch node for k == stride.
-                        This case can be optimized.
-                        For example, for MaxPool run InferStreamingMaxPool before
-                        InferPool_Batch """
-                    )
 
                 odt = model.get_tensor_datatype(node_output)
 
                 if dlayout == "NCHW":
-                    ifm_ch = model.get_tensor_shape(n.input[0])[1]
+                    _, ifm_ch, ifm_h, ifm_w = ishape
+                    _, ofm_ch, ofm_h, ofm_w = oshape
+                elif dlayout == "NHWC":
+                    _, ifm_h, ifm_w, ifm_ch = ishape
+                    _, ofm_h, ofm_w, ofm_ch = oshape
                 else:
-                    ifm_ch = model.get_tensor_shape(n.input[0])[-1]
-                ofm_ch = ifm_ch
-                ifm_dim = model.get_tensor_shape(n.input[0])[-2]
-                ofm_dim = model.get_tensor_shape(n.output[0])[-2]
+                    raise Exception("Unknown dlayout: " + str(dlayout))
 
                 # if data layout NCHW, we need transpose nodes surrounding
                 # the hls layer
@@ -449,7 +444,7 @@ class InferPool_Batch(Transformation):
                     inp_trans_out = helper.make_tensor_value_info(
                         model.make_new_valueinfo_name(),
                         TensorProto.FLOAT,
-                        (1, ifm_dim, ifm_dim, ifm_ch),  # NHWC
+                        (1, ifm_h, ifm_w, ifm_ch),  # NHWC
                     )
                     graph.value_info.append(inp_trans_out)
                     inp_trans_out = inp_trans_out.name
@@ -458,7 +453,7 @@ class InferPool_Batch(Transformation):
                     pool_output = helper.make_tensor_value_info(
                         model.make_new_valueinfo_name(),
                         TensorProto.FLOAT,
-                        (1, ofm_dim, ofm_dim, ofm_ch),
+                        (1, ofm_h, ofm_w, ofm_ch),
                     )
                     graph.value_info.append(pool_output)
                     pool_output = pool_output.name
@@ -467,7 +462,7 @@ class InferPool_Batch(Transformation):
                 im2col_out = helper.make_tensor_value_info(
                     model.make_new_valueinfo_name(),
                     TensorProto.FLOAT,
-                    (1, ofm_dim, ofm_dim, ifm_ch * k * k),
+                    (1, ofm_h, ofm_w, ifm_ch * kh * kw),
                 )
                 graph.value_info.append(im2col_out)
                 im2col_out = im2col_out.name
@@ -485,24 +480,28 @@ class InferPool_Batch(Transformation):
                     pool_output = node_output
 
                 accum_bits = 0
-                pool_size_param = k
+                pool_size_param = 0  # will be overridden if needed
                 pad_value = 0
-                if n.op_type in ["MaxPool", "MaxPoolNHWC"]:
+                if node.op_type in ["MaxPool", "MaxPoolNHWC"]:
                     pool_fxn = "MaxPool"
                     odt = idt
                     pad_value = idt.min()
-                elif n.op_type == "QuantAvgPool2d":
+                elif node.op_type == "QuantAvgPool2d":
                     assert odt.is_integer(), """Output data type for QuantAvgPool2d
                     needs to be integer"""
-                    assert pad == 0, "Padding is not supported for QuantAvgPool2d"
-                    inst = getCustomOp(n)
+                    assert all(
+                        x == 0 for x in pad
+                    ), "Padding is not supported for QuantAvgPool2d"
+                    inst = getCustomOp(node)
                     pool_fxn = "QuantAvgPool"
                     pool_size_param = inst.get_shifts()
                     accum_bits = inst.get_accum_size()
 
                 else:
                     raise Exception(
-                        "pad_value and pool_fxn not configured for {}".format(n.op_type)
+                        "pad_value and pool_fxn not configured for {}".format(
+                            node.op_type
+                        )
                     )
 
                 # format input tensor
@@ -511,13 +510,13 @@ class InferPool_Batch(Transformation):
                     [im2col_in],
                     [im2col_out],
                     domain="finn.custom_op.general",
-                    stride=[stride, stride],
-                    kernel_size=[k, k],
-                    pad_amount=[pad, pad, pad, pad],
+                    stride=[sh, sw],
+                    kernel_size=[kh, kw],
+                    pad_amount=pad,
                     pad_value=pad_value,
                     depthwise=1,
-                    input_shape="(1,{},{},{})".format(ifm_dim, ifm_dim, ifm_ch),
-                    name="Im2Col_" + n.name,
+                    input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch),
+                    name="Im2Col_" + node.name,
                 )
 
                 # Warning PE has to be equal to ifm_ch until Im2Col is replaced by
@@ -534,13 +533,13 @@ class InferPool_Batch(Transformation):
                     OutputDataType=odt.name,
                     Channels=ifm_ch,
                     PE=ifm_ch,
-                    KernelSize=k,
+                    KernelSize=[kh, kw],
                     Function=pool_fxn,
-                    OutImgDim=ofm_dim,
+                    OutImgDims=[ofm_h, ofm_w],
                     AccumBits=accum_bits,
                     Size=pool_size_param,
                     BatchSize=1,
-                    name="Pool_Batch_" + n.name,
+                    name="Pool_Batch_" + node.name,
                 )
 
                 if dlayout == "NCHW":
@@ -559,7 +558,7 @@ class InferPool_Batch(Transformation):
                     graph.node.insert(node_ind, im2col_node)
                     graph.node.insert(node_ind + 1, pool_node)
                 # remove old node
-                graph.node.remove(n)
+                graph.node.remove(node)
                 graph_modified = True
 
         if graph_modified:
@@ -1180,8 +1179,9 @@ class InferDuplicateStreamsLayer(Transformation):
         for node in graph.node:
             node_ind += 1
             successors = model.find_consumers(node.output[0])
-            if successors is not None and len(successors) == 2:
+            if successors is not None and len(successors) >= 2:
                 output_tensor = node.output[0]
+                n_outputs = len(successors)
 
                 dt = model.get_tensor_datatype(output_tensor)
 
@@ -1192,7 +1192,7 @@ class InferDuplicateStreamsLayer(Transformation):
                 # create clone tensors
                 out_shape = model.get_tensor_shape(output_tensor)
                 out_tensor_clones = []
-                for i in range(2):
+                for i in range(n_outputs):
                     clone = helper.make_tensor_value_info(
                         model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape
                     )
@@ -1215,6 +1215,7 @@ class InferDuplicateStreamsLayer(Transformation):
                     PE=pe,
                     inputDataType=dt.name,
                     numInputVectors=vecs,
+                    NumOutputStreams=n_outputs,
                     name="DuplicateStreams_Batch_" + node.name,
                 )
 
@@ -1593,3 +1594,60 @@ class InferLookupLayer(Transformation):
             model = model.transform(InferShapes())
             model = model.transform(InferDataTypes())
         return (model, graph_modified)
+
+
+class InferConcatLayer(Transformation):
+    """Replace integer-typed Concat nodes that join along the last axis with
+    StreamingConcat HLS layers; other Concat nodes are left untouched."""
+
+    def apply(self, model):
+        graph = model.graph
+        node_ind = 0
+        graph_modified = False
+        for node in graph.node:
+            node_ind += 1
+            if node.op_type == "Concat":
+                ishape = model.get_tensor_shape(node.input[0])
+                axis = get_by_name(node.attribute, "axis")
+                if (axis is None) or (ishape is None):
+                    continue
+                axis = axis.i
+                last_axis = len(ishape) - 1
+                # skip conversion unless axis is the last dim (given as -1 or rank-1)
+                if (axis != -1) and (axis != last_axis):
+                    continue
+                # all inputs must share the datatype of input 0
+                dt0 = model.get_tensor_datatype(node.input[0])
+                if dt0 is None:
+                    continue
+                dt_coherent = all(
+                    [model.get_tensor_datatype(x) == dt0 for x in node.input]
+                )
+                if not dt_coherent:
+                    continue
+                # skip conversion if inputs are not integers
+                if not dt0.is_integer():
+                    continue
+                # ready for conversion: last-dim size per input, outer dims of input 0
+                elems_per_stream = [model.get_tensor_shape(x)[-1] for x in node.input]
+                inp_vec = list(model.get_tensor_shape(node.input[0])[:-1])
+                new_node = helper.make_node(
+                    "StreamingConcat",
+                    node.input,
+                    node.output,
+                    domain="finn.custom_op.fpgadataflow",
+                    backend="fpgadataflow",
+                    name="Concat_" + node.name,
+                    ElemsPerStream=elems_per_stream,
+                    inputDataType=dt0.name,
+                    numInputVectors=inp_vec,
+                )
+                graph.node.insert(node_ind, new_node)
+                # remove old node; the new node already reuses its inputs and outputs
+                graph.node.remove(node)
+                graph_modified = True
+
+        if graph_modified:
+            model = model.transform(InferShapes())
+            model = model.transform(InferDataTypes())
+        return (model, graph_modified)
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 327c7867fe30485f6df51d5e98dcbbaceea04cd8..ecaf4f4d194e57f20a6af186dfaccdad5ab2a686 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -223,8 +223,8 @@ class CreateStitchedIP(Transformation):
                 behavior. It is strongly recommended to insert FIFOs prior to
                 calling CreateStitchedIP."""
             )
-        # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
+            # ensure that all nodes are fpgadataflow, and that IPs are generated
             assert is_fpgadataflow_node(
                 node
             ), "All nodes must be FINN fpgadataflow nodes."
@@ -236,9 +236,7 @@ class CreateStitchedIP(Transformation):
             self.connect_clk_rst(node)
             self.connect_axi(node)
             for i in range(len(node.input)):
-                if is_external_input(model, node, i):
-                    self.connect_s_axis_external(node, idx=i)
-                else:
+                if not is_external_input(model, node, i):
                     producer = model.find_producer(node.input[i])
                     if producer is None:
                         continue
@@ -254,8 +252,25 @@ class CreateStitchedIP(Transformation):
                         "[get_bd_intf_pins %s/%s]"
                         % (producer.name, src_intf_name, node.name, dst_intf_name)
                     )
+
+        # process external inputs and outputs in top-level graph input order
+        for input in model.graph.input:
+            inp_name = input.name
+            inp_cons = model.find_consumers(inp_name)
+            assert inp_cons is not None, "No consumer for input " + inp_name
+            assert len(inp_cons) == 1, "Multiple consumers for input " + inp_name
+            node = inp_cons[0]
+            node_inst = getCustomOp(node)
+            for i in range(len(node.input)):
+                if node.input[i] == inp_name:
+                    self.connect_s_axis_external(node, idx=i)
+        for output in model.graph.output:
+            out_name = output.name
+            node = model.find_producer(out_name)
+            assert node is not None, "No producer for output " + out_name
+            node_inst = getCustomOp(node)
             for i in range(len(node.output)):
-                if is_external_output(model, node, i):
+                if node.output[i] == out_name:
                     self.connect_m_axis_external(node, idx=i)
 
         # create a temporary folder for the project
@@ -316,7 +331,10 @@ class CreateStitchedIP(Transformation):
             tcl.append("write_verilog -force -mode synth_stub %s.v" % block_name)
             tcl.append("write_checkpoint %s.dcp" % block_name)
             tcl.append("write_xdc %s.xdc" % block_name)
-            tcl.append("report_utilization -file %s_partition_util.rpt" % block_name)
+            tcl.append(
+                "report_utilization -hierarchical -hierarchical_depth 5 "
+                "-file %s_partition_util.rpt" % block_name
+            )
         # export block design itself as an IP core
         block_vendor = "xilinx_finn"
         block_library = "finn"
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 58efe65eb5f9d96d74cdf40672703fabe76afb0d..4a0d0a89c4a6bb5809887ffcfffb2068ccebaa48 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -1,4 +1,3 @@
-import warnings
 from onnx import TensorProto
 from onnx import helper as oh
 
@@ -48,23 +47,23 @@ class InsertDWC(Transformation):
                     consumers = model.find_consumers(output_name)
                     if consumers is None:
                         continue
-                    if len(consumers) > 1:
-                        warnings.warn(
-                            n.name
-                            + ": HLS node with fan-out higher than 1 cannot be stitched"
-                        )
-
+                    assert len(consumers) == 1, (
+                        n.name
+                        + ": HLS node with fan-out higher than 1 cannot be stitched"
+                    )
                     consumer = consumers[0]
                     if _suitable_node(consumer) is True:
                         n0 = getCustomOp(n)
                         n1 = getCustomOp(consumer)
                         n0_out_shape = n0.get_folded_output_shape()
-
-                        # If FC and external mem, it could be connected to input 1
+                        # in some special cases, we need to get folded shapes of
+                        # non-default inputs for the consumer
+                        # - if FC and external mem, it could be connected to input 1
+                        # - if concat, could be connected to any input
                         if (
                             consumer.op_type == "StreamingFCLayer_Batch"
                             and n1.get_nodeattr("mem_mode") == "external"
-                        ):
+                        ) or (consumer.op_type == "StreamingConcat"):
                             # get input idx
                             in_idx = None
                             for idx, n_input in enumerate(consumer.input):
@@ -73,6 +72,7 @@ class InsertDWC(Transformation):
                             assert in_idx is not None, "Malformed model"
                             n1_in_shape = n1.get_folded_input_shape(in_idx)
                         else:
+                            # use default folded input shape
                             n1_in_shape = n1.get_folded_input_shape()
 
                         if n0_out_shape[-1] != n1_in_shape[-1]:
diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py
index c8bb716922823876f5f16ffe62f17c425d49aa74..b5ae2da47a19af5b6bbf44a2a65cbef4c3bbc4dd 100644
--- a/src/finn/transformation/fpgadataflow/insert_fifo.py
+++ b/src/finn/transformation/fpgadataflow/insert_fifo.py
@@ -57,21 +57,21 @@ class InsertFIFO(Transformation):
         graph = model.graph
         node_ind = -1
         graph_modified = False
-        for n in graph.node:
+        for first_node in graph.node:
             node_ind += 1
-            if _suitable_node(n):
-                for n_output in n.output:
+            if _suitable_node(first_node):
+                for n_output in first_node.output:
                     consumers = model.find_consumers(n_output)
                     if consumers is None:
                         continue
                     if len(consumers) > 1:
                         warnings.warn(
-                            n.name
+                            first_node.name
                             + ": HLS node with fan-out higher than 1 cannot be stitched"
                         )
                     consumer = consumers[0]
                     if _suitable_node(consumer) is True:
-                        n0 = getCustomOp(n)
+                        n0 = getCustomOp(first_node)
                         # determine fifo node attributes
                         fld_shape = n0.get_folded_output_shape()
                         dtype = n0.get_output_datatype()
@@ -137,47 +137,54 @@ class InsertFIFO(Transformation):
                             graph_modified = True
 
         if graph_modified is False:
-            # insert FIFO as first node, except when first node is DMA
-            if (
-                graph.node[0].op_type != "StreamingFIFO"
-                and graph.node[0].op_type != "IODMA"
-            ):
-                n = graph.node[0]
-                n_input = n.input[0]
-                n0 = getCustomOp(n)
-                # determine fifo node attributes
-                fld_shape = n0.get_folded_input_shape()
-                dtype = n0.get_input_datatype()
-                fifo_depth = n0.get_nodeattr("inFIFODepth")
-
-                if fifo_depth <= 2:
-                    warnings.warn("Overriding input FIFO depth to 32")
-                    fifo_depth = 32
-
-                # create fifo node
-                fifo_output_tensor = oh.make_tensor_value_info(
-                    model.make_new_valueinfo_name(),
-                    TensorProto.FLOAT,
-                    n0.get_normal_input_shape(),
-                )
-                graph.value_info.append(fifo_output_tensor)
-                model.set_tensor_datatype(fifo_output_tensor.name, dtype)
-
-                fifo_node = oh.make_node(
-                    "StreamingFIFO",
-                    [n_input],
-                    [fifo_output_tensor.name],
-                    domain="finn.custom_op.fpgadataflow",
-                    backend="fpgadataflow",
-                    depth=fifo_depth,
-                    folded_shape=fld_shape,
-                    dataType=str(dtype.name),
-                )
-                # insert fifo
-                graph.node.insert(0, fifo_node)
-
-                # set fifo output tensor as new input tensor of second node
-                n.input[0] = fifo_output_tensor.name
+            graph_in_names = [x.name for x in model.graph.input]
+            for graph_in_name in graph_in_names:
+                first_node = model.find_consumer(graph_in_name)
+                # insert FIFO as first node, except when first node is DMA
+                if (
+                    first_node.op_type != "StreamingFIFO"
+                    and first_node.op_type != "IODMA"
+                ):
+                    inp_ind = list(first_node.input).index(graph_in_name)
+                    n_input = first_node.input[inp_ind]
+                    n0 = getCustomOp(first_node)
+                    # determine fifo node attributes
+                    if inp_ind == 0:
+                        fld_shape = n0.get_folded_input_shape()
+                        dtype = n0.get_input_datatype()
+                    else:
+                        fld_shape = n0.get_folded_input_shape(inp_ind)
+                        dtype = n0.get_input_datatype(inp_ind)
+                    fifo_depth = n0.get_nodeattr("inFIFODepth")
+
+                    if fifo_depth <= 2:
+                        warnings.warn("Overriding input FIFO depth to 32")
+                        fifo_depth = 32
+
+                    # create fifo node
+                    fifo_output_tensor = oh.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        n0.get_normal_input_shape(),
+                    )
+                    graph.value_info.append(fifo_output_tensor)
+                    model.set_tensor_datatype(fifo_output_tensor.name, dtype)
+
+                    fifo_node = oh.make_node(
+                        "StreamingFIFO",
+                        [n_input],
+                        [fifo_output_tensor.name],
+                        domain="finn.custom_op.fpgadataflow",
+                        backend="fpgadataflow",
+                        depth=fifo_depth,
+                        folded_shape=fld_shape,
+                        dataType=str(dtype.name),
+                    )
+                    # insert fifo
+                    graph.node.insert(0, fifo_node)
+
+                    # rewire the consumer node to read from the fifo output tensor
+                    first_node.input[inp_ind] = fifo_output_tensor.name
 
             # insert FIFO as last node, except when last node is DMA
             graph_out_names = [x.name for x in model.graph.output]
diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
index 39eb049565475b462ea0df9d88b46e3598e6cdd9..ce7cf7bc589fae7fb6c8785b51cf45514f49c5a0 100644
--- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py
+++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py
@@ -222,7 +222,7 @@ class InsertAndSetFIFODepths(Transformation):
         fpgapart,
         clk_ns=10.0,
         max_qsrl_depth=256,
-        max_depth=2 ** 14,
+        max_depth=2**14,
         swg_exception=True,
         vivado_ram_style="auto",
     ):
diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py
index 64d7a080724820d58a026bafbe74a4d7567b2179..617c9f431bc8ecd08e429fd7c10b56d65d374e39 100644
--- a/src/finn/transformation/fpgadataflow/set_folding.py
+++ b/src/finn/transformation/fpgadataflow/set_folding.py
@@ -104,7 +104,12 @@ class SetFolding(Transformation):
         ]
         # these ops use SIMD parallelism, up to a max value of NumChannels
         # ConvolutionInputGenerator has a special case when depthwise=1
-        simd_ops = ["DownSampler", "FMPadding_Batch", "ConvolutionInputGenerator"]
+        simd_ops = [
+            "DownSampler",
+            "FMPadding_Batch",
+            "ConvolutionInputGenerator",
+            "ConvolutionInputGenerator1D",
+        ]
         # these ops are preceded by depthwise SWG and have special behavior,
         # as explained in the SetFolding docstring
         depthwise_op_exceptions = ["Vector_Vector_Activate_Batch", "Pool_Batch"]
@@ -150,7 +155,7 @@ class SetFolding(Transformation):
                 # also set the folding of the upsteam DW SWU
                 # which must be identical to this node
                 swu_node = model.find_producer(node.input[0])
-                if swu_node.op_type == "ConvolutionInputGenerator":
+                if swu_node.op_type.startswith("ConvolutionInputGenerator"):
                     swu_node_inst = getCustomOp(swu_node)
                     pe = node_inst.get_nodeattr("PE")
                     swu_node_inst.set_nodeattr("SIMD", pe)
@@ -166,7 +171,10 @@ class SetFolding(Transformation):
                             "Expected SWU on DW op input, found " + swu_node.op_type
                         )
             elif op_type in simd_ops:
-                if op_type == "ConvolutionInputGenerator":
+                if op_type in [
+                    "ConvolutionInputGenerator",
+                    "ConvolutionInputGenerator1D",
+                ]:
                     depthwise = node_inst.get_nodeattr("depthwise")
                     if depthwise == 0:
                         max_simd = node_inst.get_nodeattr("IFMChannels")
diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py
index a2865321418343efbfdae12c111ba4334ecfee28..365632cd5a02eae6e19e670e0b676c521e460507 100644
--- a/src/finn/transformation/fpgadataflow/vitis_build.py
+++ b/src/finn/transformation/fpgadataflow/vitis_build.py
@@ -49,7 +49,6 @@ from finn.transformation.general import (
     GiveUniqueNodeNames,
     RemoveUnusedTensors,
 )
-from finn.transformation.infer_data_layouts import InferDataLayouts
 from finn.util.basic import make_build_dir
 
 from . import templates
@@ -392,8 +391,6 @@ class VitisBuild(Transformation):
 
     def apply(self, model):
         _check_vitis_envvars()
-        # first infer layouts
-        model = model.transform(InferDataLayouts())
         # prepare at global level, then break up into kernels
         prep_transforms = [InsertIODMA(512), InsertDWC()]
         for trn in prep_transforms:
diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
index faad31fa06e76b245f25b6f0aa583fec5c0da29a..c234bd38d9679f72b6df73e81df57fba3e8d4554 100644
--- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
+++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py
@@ -230,7 +230,7 @@ class AvgPoolAndTruncToQuantAvgPool(Transformation):
                         # 7c2603a95e90e4de2575020e575c24eab6a15889/src/finn/custom_op/
                         # general/quantavgpool2d.py#L94
                         ibits = math.floor(
-                            math.log(2 ** trunc_in_bits / (k_s * k_s), 2)
+                            math.log(2**trunc_in_bits / (k_s * k_s), 2)
                         )
                         # Get sign
                         signed = _get_signed_from_upstream(model, t_node)
diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
index 3336b1eee7fa9d54092cd56b9ba0edaf9d0884b1..c8bde7fea8ae8195001a7eccfd48baa4c48997ae 100644
--- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py
+++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py
@@ -333,7 +333,7 @@ class QuantReluHandler(QuantActBaseHandler):
         # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/
         # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/
         # onnx/finn/handler/act.py#L21
-        num_distinct_values = 2 ** bit_width
+        num_distinct_values = 2**bit_width
         num_thresholds = int(num_distinct_values - 1)
         flat_scale = quant_scale.flatten().astype(np.float32)
         num_scale_channels = flat_scale.shape[0]
@@ -468,9 +468,9 @@ class QuantIdentityHandler(QuantActBaseHandler):
             return thresholds
         else:
             if narrow:
-                num_distinct_values = 2 ** bit_width - 1
+                num_distinct_values = 2**bit_width - 1
             else:
-                num_distinct_values = 2 ** bit_width
+                num_distinct_values = 2**bit_width
 
             num_thresholds = int(num_distinct_values - 1)
             flat_scale = quant_scale.flatten()
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 1fddc7c1c26a0ba04d5849809ccf59b0a926a509..1d7d5e3e9a939b62a938c6f23e347e3d15a64663 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -765,7 +765,7 @@ class TestEnd2End:
         ret = dict()
         # try a range of batch sizes, some may fail due to insufficient DMA
         # buffers
-        bsize_range_in = [8 ** i for i in range(5)]
+        bsize_range_in = [8**i for i in range(5)]
         bsize_range = []
         for bsize in bsize_range_in:
             res = throughput_test_remote(model, bsize)
diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
index 3efafc040df07a7d56638bf5ce0b1ce01887343c..0dd9991b2ff07a35c923afeda854352213f8ca09 100644
--- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
+++ b/tests/fpgadataflow/test_convert_to_hls_pool_batch.py
@@ -48,22 +48,31 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.util.basic import gen_finn_dt_tensor
 
 
-def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt):
+def make_single_maxpool_modelwrapper(
+    k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False
+):
     odt = idt
-    inp = helper.make_tensor_value_info(
-        "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]
-    )
-    outp = helper.make_tensor_value_info(
-        "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]
-    )
-
+    if use_1d:
+        ishape = [1, ifm_ch, 1, ifm_dim]
+        oshape = [1, ifm_ch, 1, ofm_dim]
+        kshape = [1, k]
+        pads = [0, pad, 0, pad]
+        strides = [1, stride]
+    else:
+        ishape = [1, ifm_ch, ifm_dim, ifm_dim]
+        oshape = [1, ifm_ch, ofm_dim, ofm_dim]
+        kshape = [k, k]
+        pads = [pad, pad, pad, pad]
+        strides = [stride, stride]
+    inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape)
+    outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, oshape)
     mp_node = helper.make_node(
         "MaxPool",
         ["inp"],
         ["outp"],
-        kernel_shape=[k, k],
-        pads=[pad, pad, pad, pad],
-        strides=[stride, stride],
+        kernel_shape=kshape,
+        pads=pads,
+        strides=strides,
     )
     graph = helper.make_graph(
         nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]
@@ -128,7 +137,7 @@ def prepare_inputs(input_tensor):
 # number of out channel computed in parallel
 @pytest.mark.parametrize("pe", [1, 2, 4])
 # pool type
-@pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool"])
+@pytest.mark.parametrize("op_type", ["QuantAvgPool2d", "MaxPool", "MaxPool1D"])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
@@ -147,7 +156,14 @@ def test_convert_to_hls_pool_batch(
     np.random.seed(0)
     ofm_dim = int(((ifm_dim + 2 * pad - k) / stride) + 1)
 
-    x = gen_finn_dt_tensor(idt, (1, ifm_ch, ifm_dim, ifm_dim))
+    ishape = (1, ifm_ch, ifm_dim, ifm_dim)
+    use_1d = False
+    if op_type == "MaxPool1D":
+        use_1d = True
+        ishape = (1, ifm_ch, 1, ifm_dim)
+        op_type = "MaxPool"
+
+    x = gen_finn_dt_tensor(idt, ishape)
     # prepare input data
     input_dict = prepare_inputs(x)
     if op_type == "MaxPool":
@@ -159,7 +175,7 @@ def test_convert_to_hls_pool_batch(
             pytest.skip("Skipping Maxpool with idt != odt")
 
         model = make_single_maxpool_modelwrapper(
-            k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt
+            k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d
         )
     elif op_type == "QuantAvgPool2d":
         if pad != 0:
@@ -178,16 +194,40 @@ def test_convert_to_hls_pool_batch(
     new_model = model.transform(to_hls.InferPool_Batch())
     new_model = new_model.transform(GiveUniqueNodeNames())
 
-    if ifm_ch != pe:
-        new_model = new_model.transform(to_hls.InferConvInpGen())
-        # Folding
-        for n in new_model.graph.node:
-            if n.op_type == "ConvolutionInputGenerator":
-                inst = getCustomOp(n)
-                inst.set_nodeattr("SIMD", pe)
-            elif n.op_type == "Pool_Batch":
-                inst = getCustomOp(n)
-                inst.set_nodeattr("PE", pe)
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+    # Folding
+    for n in new_model.graph.node:
+        if n.op_type.startswith("ConvolutionInputGenerator"):
+            inst = getCustomOp(n)
+            inst.set_nodeattr("SIMD", pe)
+        elif n.op_type == "Pool_Batch":
+            inst = getCustomOp(n)
+            inst.set_nodeattr("PE", pe)
+
+    if stride <= k:
+        if pad == 0:
+            assert len(new_model.graph.node) == 4
+            assert new_model.graph.node[0].op_type == "Transpose"
+            assert new_model.graph.node[1].op_type.startswith(
+                "ConvolutionInputGenerator"
+            )
+            assert new_model.graph.node[2].op_type == "Pool_Batch"
+            assert new_model.graph.node[3].op_type == "Transpose"
+        else:
+            assert len(new_model.graph.node) == 5
+            assert new_model.graph.node[0].op_type == "Transpose"
+            assert new_model.graph.node[1].op_type == "FMPadding_Batch"
+            assert new_model.graph.node[2].op_type.startswith(
+                "ConvolutionInputGenerator"
+            )
+            assert new_model.graph.node[3].op_type == "Pool_Batch"
+            assert new_model.graph.node[4].op_type == "Transpose"
+    else:
+        # not currently converted to HLS, node stays as-is
+        assert len(new_model.graph.node) == 1
+        assert new_model.graph.node[0].op_type in ["MaxPool", "QuantAvgPool2d"]
+        # no need to exec
+        return
 
     if exec_mode == "cppsim":
         new_model = new_model.transform(SetExecMode("cppsim"))
@@ -205,13 +245,6 @@ def test_convert_to_hls_pool_batch(
     # execute new_model
     y_produced = oxe.execute_onnx(new_model, input_dict)["outp"]
     assert (y_produced == y_expected).all()
-    if stride <= k:
-        if pad == 0 or ifm_ch == pe:
-            assert len(new_model.graph.node) == 4
-        else:
-            assert len(new_model.graph.node) == 5
-    else:
-        assert len(new_model.graph.node) == 1
 
     if exec_mode == "rtlsim":
         node = new_model.get_nodes_by_op_type("Pool_Batch")[0]
diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a7d78610132ff71ff92ee6a69ad7e089604463b
--- /dev/null
+++ b/tests/fpgadataflow/test_fpgadataflow_concat.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2021, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import pytest
+
+import numpy as np
+import onnx
+import torch
+from io import BytesIO
+from torch import nn
+
+from finn.core.datatype import DataType
+from finn.core.modelwrapper import ModelWrapper
+from finn.core.onnx_exec import execute_onnx
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.convert_to_hls_layers import InferConcatLayer
+from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
+from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
+from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
+from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+from finn.transformation.general import GiveUniqueNodeNames
+from finn.util.basic import gen_finn_dt_tensor
+
+
+def make_concat_model(i_shapes, idt):
+    """Export a torch module that concatenates its inputs along the last axis
+    to ONNX and return it as a ModelWrapper with graph input datatypes = idt."""
+
+    class ConcatModel(nn.Module):
+        def forward(self, *args):
+            return torch.cat(args, -1)
+
+    concat_module = ConcatModel()
+    concat_module.eval()
+    # all-zero float32 example inputs, one per requested shape, for the export
+    dummy_inputs = tuple(torch.zeros(s, dtype=torch.float32) for s in i_shapes)
+    onnx_buf = BytesIO()
+    torch.onnx.export(concat_module, dummy_inputs, onnx_buf, opset_version=11)
+    wrapped = ModelWrapper(onnx.ModelProto.FromString(onnx_buf.getvalue()))
+    for graph_inp in wrapped.graph.input:
+        wrapped.set_tensor_datatype(graph_inp.name, idt)
+    return wrapped
+
+
+@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
+@pytest.mark.parametrize("idt", [DataType["INT4"]])
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_concat(exec_mode, idt):
+    """Convert a 3-input concat to StreamingConcat and compare the cppsim or
+    rtlsim result against np.concatenate over the same input data."""
+    i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)]
+    i_data = [gen_finn_dt_tensor(idt, shp) for shp in i_shapes]
+    model = make_concat_model(i_shapes, idt)
+    assert len(i_shapes) == len(model.graph.input)
+    assert len(model.graph.output) == 1
+    exp_oshape = [*i_shapes[0][:-1], sum(shp[-1] for shp in i_shapes)]
+    oname = model.graph.output[0].name
+    assert model.get_tensor_shape(oname) == exp_oshape
+    exp_out = np.concatenate(i_data, axis=-1)
+    inp_dict = {gi.name: data for gi, data in zip(model.graph.input, i_data)}
+    ret = execute_onnx(model, inp_dict)
+    assert (ret[oname] == exp_out).all()
+    # convert to the HLS layer, then check the op type and domain of the result
+    model = model.transform(InferConcatLayer())
+    assert model.graph.node[0].op_type == "StreamingConcat"
+    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow"
+    if exec_mode == "cppsim":
+        model = model.transform(PrepareCppSim())
+        model = model.transform(CompileCppSim())
+        model = model.transform(SetExecMode("cppsim"))
+    elif exec_mode == "rtlsim":
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(PrepareIP("xc7z020clg400-1", 10))
+        model = model.transform(HLSSynthIP())
+        model = model.transform(SetExecMode("rtlsim"))
+        model = model.transform(PrepareRTLSim())
+    ret_sim = execute_onnx(model, inp_dict)
+    assert (ret_sim[oname] == exp_out).all()
+
+
+@pytest.mark.vivado
+@pytest.mark.slow
+def test_fpgadataflow_concat_stitchedip():
+    """Build a stitched IP around a 3-input StreamingConcat (after running
+    InsertFIFO) and check that rtlsim of the stitched design reproduces
+    np.concatenate over the same input data."""
+    idt = DataType["INT4"]
+    fpga_part = "xc7z020clg400-1"
+    clk_ns = 10
+    i_shapes = [(1, 2, 4), (1, 2, 6), (1, 2, 1)]
+    i_data = [gen_finn_dt_tensor(idt, x) for x in i_shapes]
+    model = make_concat_model(i_shapes, idt)
+    assert len(i_shapes) == len(model.graph.input)
+    assert len(model.graph.output) == 1
+    exp_oshape = list(i_shapes[0][:-1]) + [sum(x[-1] for x in i_shapes)]
+    oname = model.graph.output[0].name
+    assert model.get_tensor_shape(oname) == exp_oshape
+    exp_out = np.concatenate(i_data, axis=-1)
+    # feed each generated tensor to the graph input at the same position
+    inp_dict = {gi.name: data for gi, data in zip(model.graph.input, i_data)}
+    # sanity-check the plain ONNX model before any conversion
+    ret = execute_onnx(model, inp_dict)
+    assert (ret[oname] == exp_out).all()
+    # call transformation to convert to HLS and verify conversion
+    model = model.transform(InferConcatLayer())
+    assert model.graph.node[0].op_type == "StreamingConcat"
+    assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow"
+    # insert FIFOs, then run the IP preparation, synthesis and stitching steps
+    model = model.transform(InsertFIFO(create_shallow_fifos=True))
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(fpga_part, clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(fpga_part, clk_ns, vitis=False))
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    # NOTE(review): presumably requests a waveform dump named trace.vcd; this
+    # looks like a debugging aid -- confirm whether the test still needs it
+    model.set_metadata_prop("rtlsim_trace", "trace.vcd")
+    # simulate the stitched IP; the model is deliberately not saved to disk
+    ret_sim = execute_onnx(model, inp_dict)
+    assert (exp_out == ret_sim[oname]).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
index 73bf1165afa9418be0c89f77797de538275fd220..1faf647df225853cf026a49adbfc6bb9d8f1b670 100644
--- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
+++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py
@@ -48,25 +48,32 @@ from finn.transformation.infer_shapes import InferShapes
 from finn.util.basic import gen_finn_dt_tensor
 
 
-def make_dupstreams_modelwrapper(ch, pe, idim, idt):
+def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl):
     shape = [1, idim, idim, ch]
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape)
-    outp0 = helper.make_tensor_value_info("outp0", TensorProto.FLOAT, shape)
-    outp1 = helper.make_tensor_value_info("outp1", TensorProto.FLOAT, shape)
+    out_names = []
+    out_vi = []
+    for i in range(n_dupl):
+        outp_name = "outp%d" % i
+        out_names.append(outp_name)
+        out_vi.append(
+            helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape)
+        )
 
     dupstrm_node = helper.make_node(
         "DuplicateStreams_Batch",
         ["inp"],
-        ["outp0", "outp1"],
+        out_names,
         domain="finn.custom_op.fpgadataflow",
         backend="fpgadataflow",
         NumChannels=ch,
+        NumOutputStreams=n_dupl,
         PE=pe,
         inputDataType=idt.name,
         numInputVectors=[1, idim, idim],
     )
     graph = helper.make_graph(
-        nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=[outp0, outp1]
+        nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi
     )
 
     model = helper.make_model(graph, producer_name="addstreams-model")
@@ -92,10 +99,12 @@ def prepare_inputs(input_tensor, idt):
 @pytest.mark.parametrize("fold", [-1, 2, 1])
 # image dimension
 @pytest.mark.parametrize("imdim", [7])
+# amount of duplication
+@pytest.mark.parametrize("n_dupl", [2, 3])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 @pytest.mark.vivado
-def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
+def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode):
     if fold == -1:
         pe = 1
     else:
@@ -105,7 +114,7 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
     # generate input data
     x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch))
 
-    model = make_dupstreams_modelwrapper(ch, pe, imdim, idt)
+    model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl)
 
     if exec_mode == "cppsim":
         model = model.transform(PrepareCppSim())
@@ -123,12 +132,11 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, exec_mode):
     # prepare input data and execute
     input_dict = prepare_inputs(x, idt)
     output_dict = oxe.execute_onnx(model, input_dict)
-    y0 = output_dict["outp0"]
-    y1 = output_dict["outp1"]
-    expected_y = x
 
-    assert (y0 == expected_y).all(), exec_mode + " failed"
-    assert (y1 == expected_y).all(), exec_mode + " failed"
+    expected_y = x
+    for i in range(n_dupl):
+        y = output_dict["outp%d" % i]
+        assert (y == expected_y).all(), exec_mode + " failed"
 
     if exec_mode == "rtlsim":
         node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0]
diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py
index 6f39994bf27594a063a1e66c5bba7867eaabef6e..9eb3a7f4514e610d79bb83cc62a7561a33ced543 100644
--- a/tests/fpgadataflow/test_fpgadataflow_vvau.py
+++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py
@@ -62,8 +62,8 @@ def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels):
 def _calculate_dot_prod_range(dt_a, dt_b, len):
     """Returns the (min,max) values a dot product between two (un)signed vectors of
     types dt_a and dt_b of len elements can take."""
-    min_prod = 2 ** 30
-    max_prod = -(2 ** 30)
+    min_prod = 2**30
+    max_prod = -(2**30)
     for a_val in [dt_a.min(), dt_a.max()]:
         for b_val in [dt_b.min(), dt_b.max()]:
             prod = a_val * b_val * len
diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py
index 66fd5b43a1b8b8c8986bf9c9b9d0e9efd7a744a6..492f208671f4622d189c48ece874740a68b69072 100644
--- a/tests/fpgadataflow/test_set_folding.py
+++ b/tests/fpgadataflow/test_set_folding.py
@@ -109,7 +109,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes):
 
 
 # desired frames per second
-@pytest.mark.parametrize("target_fps", [30, 10 ** 5, 10 ** 7])
+@pytest.mark.parametrize("target_fps", [30, 10**5, 10**7])
 # target chip or board
 @pytest.mark.parametrize("platform", ["Pynq-Z1", "Ultra96", "U200"])
 def test_set_folding(target_fps, platform):
@@ -126,7 +126,7 @@ def test_set_folding(target_fps, platform):
     dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename)
 
     clk_ns = 5
-    target_cycles_per_frame = int((10 ** 9 / clk_ns) / target_fps)
+    target_cycles_per_frame = int((10**9 / clk_ns) / target_fps)
     dataflow_model = dataflow_model.transform(SetFolding(target_cycles_per_frame))
 
     exp_cycles_dict = dataflow_model.analysis(exp_cycles_per_layer)