diff --git a/.gitignore b/.gitignore index 91879240b36709b5c827ec951366cc55ad515cce..0411de3941d790fd1668fe2328b248cd3c09be08 100644 --- a/.gitignore +++ b/.gitignore @@ -83,3 +83,6 @@ MANIFEST /finn-hlslib/ /pyverilator/ /PYNQ-HelloWorld/ + +# Jenkins cfg dir +/docker/jenkins_home diff --git a/AUTHORS.rst b/AUTHORS.rst index a87cf170b065879f0c5e01e4726bc57608e5c4f2..e231e61d38991e11e2e43a7c9a3a78c50c878244 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,6 +2,7 @@ Contributors ============ -* Yaman Umuroglu -* Jakoba Petri-Koenig -* Andrea Rigoni +* Yaman Umuroglu (@maltanar) +* Jakoba Petri-Koenig (@auphelia) +* Andrea Rigoni (@AndreaRigoni) +* Hendrik Borras (@HenniOVP) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..f12dafa857b8a99493d7266ad029bec3f725d9ec --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,34 @@ +We welcome contributions to FINN. + +Please follow the steps below and be sure that your contribution complies with our guidelines. + +1. Share your proposal via <a href="https://github.com/Xilinx/finn/issues" target="_blank">Github issues</a>. If you are looking for some issues to get started with, we have a list of <a href="https://github.com/Xilinx/finn/labels/good%20first%20issue">good first issues</a> in the issue tracker. Feel free to ask questions on the <a href="https://gitter.im/xilinx-finn/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge">FINN gitter channel as well</a>. + + We welcome submissions to: + + 1. The FINN flow like additional custom ONNX nodes, transformation and analysis passes. + 2. Contributions to the documentation and Jupyter notebooks + + To ensure clean separation of toolflow and examples, we do not keep example networks in this repo. 
If you want to add example networks, we ask you to make them into a separate repo and use FINN as a dependency -- we'll be happy to add it to the list of <a href="https://xilinx.github.io/finn/community">FINN community projects</a>. + +2. Submitting your pull request: + + 1. Fork this repository to your own GitHub account using the *fork* button above. + + 2. Clone the fork to your local computer using *git clone*. Checkout the branch you want to work on. + + 3. Please install <a href="https://pre-commit.com/" target="_blank">pre-commit</a> to ensure your code is formatted to our style guidelines. The hooks we use for pre-commit can be found in <a href="https://github.com/Xilinx/finn/blob/master/.pre-commit-config.yaml" target="_blank">this file</a> + + 4. Modify the Python source code, Jupyter notebooks and Sphinx documentation etc. as needed. + + 5. Use *git add*, *git commit*, *git push* to add changes to your fork. + + 6. If you are introducing new functionality, add at least one unit test under the `test/` folder and make sure it passes before you submit the pull request. + + 7. Submit a pull request by clicking the *pull request* button on your GitHub repo: + 1. The <a href="https://github.com/Xilinx/finn" target="_blank">master branch</a> should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. + 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the <a href="https://github.com/Xilinx/finn/tree/dev" target="_blank">development branch</a>. + +3. We will review your contribution and, if any additional fixes or modifications are +necessary, may provide feedback to guide you. When accepted, your pull request will +be merged to the repository. 
If you have more questions please contact us via the <a href="https://gitter.im/xilinx-finn/community" target="_blank">FINN gitter channel</a>. diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci new file mode 100644 index 0000000000000000000000000000000000000000..dd0c28da759d31544a68f2a969783174c628c28b --- /dev/null +++ b/docker/Dockerfile.finn_ci @@ -0,0 +1,93 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel +MAINTAINER Yaman Umuroglu <yamanu@xilinx.com> +ARG PYTHON_VERSION=3.6 +ARG BUILD_PATH +ARG FINN_CI_BRANCH + +WORKDIR /workspace + +RUN apt-get update +RUN apt-get -y upgrade +RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev +RUN apt install verilator +RUN apt-get -y install sshpass +RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config + +# cloning dependency repos +# Brevitas +RUN git clone --branch feature/finn_onnx_export https://github.com/Xilinx/brevitas.git /workspace/brevitas +RUN git -C /workspace/brevitas checkout 215cf44c76d562339fca368c8c3afee3110033e8 + +# Brevitas examples +RUN git clone --branch feature/rework_scaling_clipping https://github.com/maltanar/brevitas_cnv_lfc.git /workspace/brevitas_cnv_lfc +RUN git -C /workspace/brevitas_cnv_lfc checkout 2059f96bd576bf71f32c757e7f92617a70190c90 + +# CNPY +RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy +RUN git -C /workspace/cnpy checkout 4e8810b1a8637695171ed346ce68f6984e585ef4 + +# FINN hlslib +RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib +RUN git -C /workspace/finn-hlslib checkout b139bf051ac8f8e0a3625509247f714127cf3317 + +# PyVerilator +RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator +RUN git -C /workspace/pyverilator checkout 307fc5c82db748620836307a2002fdc9fe170226 + +# PYNQ-HelloWorld +RUN git clone --branch feature/synth_rpt https://github.com/maltanar/PYNQ-HelloWorld.git /workspace/PYNQ-HelloWorld +RUN git -C /workspace/PYNQ-HelloWorld checkout db7e418767ce2a8e08fe732ddb3aa56ee79b7560 + +# FINN +RUN git clone --branch $FINN_CI_BRANCH https://github.com/Xilinx/finn /workspace/finn + +RUN pip install -r /workspace/finn/requirements.txt +RUN apt update; apt install nano +RUN pip install pytest-dependency + +ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src" +ENV PYTHONPATH 
"${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts" +ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas" +ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator" +ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards" +ENV VIVADO_IP_CACHE "$BUILD_PATH/vivado_ip_cache" + +# colorful terminal output +RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >> /root/.bashrc +RUN mkdir -p $BUILD_PATH +RUN mkdir -p $VIVADO_IP_CACHE + +WORKDIR /workspace/finn + +COPY finn_entrypoint.sh /usr/local/bin/ +RUN chmod 755 /usr/local/bin/finn_entrypoint.sh +ENTRYPOINT ["finn_entrypoint.sh"] +CMD ["bash"] diff --git a/Dockerfile b/docker/Dockerfile.finn_dev similarity index 93% rename from Dockerfile rename to docker/Dockerfile.finn_dev index eb0e746df429b6617432b23a9c77ec0b91732372..e28492bd31f3a2115ac566ed06a0125d348208f4 100644 --- a/Dockerfile +++ b/docker/Dockerfile.finn_dev @@ -29,27 +29,52 @@ FROM pytorch/pytorch:1.1.0-cuda10.0-cudnn7.5-devel MAINTAINER Yaman Umuroglu <yamanu@xilinx.com> ARG PYTHON_VERSION=3.6 +ARG GID +ARG GNAME +ARG UNAME +ARG UID +ARG PASSWD +ARG JUPYTER_PORT +ARG NETRON_PORT + +EXPOSE $JUPYTER_PORT +EXPOSE $NETRON_PORT WORKDIR /workspace +RUN apt-get update +RUN apt-get -y upgrade +RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev +RUN apt-get install verilator +RUN apt-get install nano +RUN apt-get -y install sshpass +RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config + COPY requirements.txt . 
RUN pip install -r requirements.txt RUN rm requirements.txt -RUN apt update; apt install nano RUN pip install jupyter RUN pip install netron RUN pip install matplotlib RUN pip install pytest-dependency -RUN apt-get update -RUN apt-get -y upgrade -RUN apt-get install -y build-essential libglib2.0-0 libsm6 libxext6 libxrender-dev -RUN apt install verilator -RUN apt-get -y install sshpass -RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN pip install sphinx RUN pip install sphinx_rtd_theme -# cloning dependency repos +# copy entrypoint script +COPY docker/finn_entrypoint.sh /usr/local/bin/ +RUN chmod 755 /usr/local/bin/finn_entrypoint.sh + +# switch user +RUN groupadd -g $GID $GNAME +RUN useradd -M -u $UID $UNAME -g $GNAME +RUN usermod -aG sudo $UNAME +RUN echo "$UNAME:$PASSWD" | chpasswd +RUN echo "root:$PASSWD" | chpasswd +RUN ln -s /workspace /home/$UNAME +RUN chown -R $UNAME:$GNAME /home/$UNAME +USER $UNAME + +# cloning dependency repos (as user) # Brevitas RUN git clone --branch feature/finn_onnx_export https://github.com/Xilinx/brevitas.git /workspace/brevitas RUN git -C /workspace/brevitas checkout 215cf44c76d562339fca368c8c3afee3110033e8 @@ -77,33 +102,14 @@ RUN git -C /workspace/PYNQ-HelloWorld checkout db7e418767ce2a8e08fe732ddb3aa56ee # Note that we expect the cloned finn directory on the host to be # mounted on /workspace/finn -- see run-docker.sh for an example # of how to do this. -# This branch assumes the same for brevitas and brevitas_cnv_lfc for easier -# co-development. 
ENV PYTHONPATH "${PYTHONPATH}:/workspace/finn/src" ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts" ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas" ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator" ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards" -ARG GID -ARG GNAME -ARG UNAME -ARG UID -ARG PASSWD -ARG JUPYTER_PORT -ARG NETRON_PORT - -RUN groupadd -g $GID $GNAME -RUN useradd -M -u $UID $UNAME -g $GNAME -RUN usermod -aG sudo $UNAME -RUN echo "$UNAME:$PASSWD" | chpasswd -RUN echo "root:$PASSWD" | chpasswd -RUN ln -s /workspace /home/$UNAME -RUN chown -R $UNAME:$GNAME /home/$UNAME -USER $UNAME - -RUN echo "source \$VIVADO_PATH/settings64.sh" >> /home/$UNAME/.bashrc -RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >> /home/$UNAME/.bashrc -EXPOSE $JUPYTER_PORT -EXPOSE $NETRON_PORT WORKDIR /home/$UNAME/finn +RUN echo "PS1='\[\033[1;36m\]\u\[\033[1;31m\]@\[\033[1;32m\]\h:\[\033[1;35m\]\w\[\033[1;31m\]\$\[\033[0m\] '" >> /home/$UNAME/.bashrc + +ENTRYPOINT ["finn_entrypoint.sh"] +CMD ["bash"] diff --git a/docker/Dockerfile.jenkins b/docker/Dockerfile.jenkins new file mode 100644 index 0000000000000000000000000000000000000000..e1939b642e1493ee97daf6472009649d3634632f --- /dev/null +++ b/docker/Dockerfile.jenkins @@ -0,0 +1,11 @@ +FROM jenkins/jenkins:lts +# if we want to install via apt +USER root +RUN apt-get update +RUN apt-get install -y gnupg-agent curl ca-certificates apt-transport-https software-properties-common +RUN curl -fsSL https://download.docker.com/linux/debian/gpg | apt-key add - +RUN add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" +RUN apt-get update +RUN apt-get install -y docker-ce-cli +# drop back to the regular jenkins user - good practice +USER jenkins diff --git a/docker/Jenkinsfile b/docker/Jenkinsfile new file mode 100644 index 
0000000000000000000000000000000000000000..bd8800ea58ef359f4b8308e555c3d6deda476443 --- /dev/null +++ b/docker/Jenkinsfile @@ -0,0 +1,54 @@ +pipeline { + agent any + parameters { + string(name: 'FINN_CI_BRANCH', defaultValue: '', description: 'FINN branch to build') + string(name: 'VIVADO_PATH', defaultValue: '', description: 'Path to Vivado installation') + string(name: 'PYNQ_BOARD', defaultValue: 'Pynq-Z1', description: 'PYNQ board type') + string(name: 'PYNQ_IP', defaultValue: '', description: 'PYNQ board IP address') + string(name: 'PYNQ_USERNAME', defaultValue: 'xilinx', description: 'PYNQ board username') + string(name: 'PYNQ_PASSWORD', defaultValue: 'xilinx', description: 'PYNQ board password') + string(name: 'PYNQ_TARGET_DIR', defaultValue: '/home/xilinx/finn', description: 'PYNQ board target deployment directory') + string(name: 'NUM_DEFAULT_WORKERS', defaultValue: '1', description: 'Number of cores for parallel transformations') + } + environment { + DOCKER_TAG='finn_ci:$BUILD_ID' + DOCKER_INST_NAME='finn_ci_$BUILD_ID' + BUILD_PATH='/tmp/finn_ci_$BUILD_ID' + DOCKER_CMD="python setup.py test" + } + stages { + stage("Clone") { + steps { + git branch: "${params.FINN_CI_BRANCH}", url: 'https://github.com/Xilinx/finn.git' + } + } + stage('Build') { + steps { + sh """ + docker build -t $DOCKER_TAG -f docker/Dockerfile.finn_ci \ + --build-arg BUILD_PATH=$BUILD_PATH \ + --build-arg FINN_CI_BRANCH=${params.FINN_CI_BRANCH} \ + docker/ + """ + } + } + stage('Test') { + steps { + sh """ + docker run --name $DOCKER_INST_NAME \ + --hostname $DOCKER_INST_NAME \ + -v ${params.VIVADO_PATH}:${params.VIVADO_PATH}:ro \ + -e NUM_DEFAULT_WORKERS=${params.NUM_DEFAULT_WORKERS} \ + -e FINN_INST_NAME=$DOCKER_INST_NAME \ + -e VIVADO_PATH=${params.VIVADO_PATH} \ + -e PYNQ_BOARD=${params.PYNQ_BOARD} \ + -e PYNQ_IP=${params.PYNQ_IP} \ + -e PYNQ_USERNAME=${params.PYNQ_USERNAME} \ + -e PYNQ_PASSWORD=${params.PYNQ_PASSWORD} \ + -e PYNQ_TARGET_DIR=${params.PYNQ_TARGET_DIR} \ + $DOCKER_TAG 
bash -c "$DOCKER_CMD" + """ + } + } + } +} diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh new file mode 100644 index 0000000000000000000000000000000000000000..930218e26eff0b7be541529f452efc2a038160c5 --- /dev/null +++ b/docker/finn_entrypoint.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +export XILINX_VIVADO=$VIVADO_PATH +export SHELL=/bin/bash +export FINN_ROOT=/workspace/finn + +# source Vivado env.vars +source $VIVADO_PATH/settings64.sh + +exec "$@" diff --git a/docker/launch-jenkins.sh b/docker/launch-jenkins.sh new file mode 100755 index 0000000000000000000000000000000000000000..64dc1ec73f68e621cdd737595983b6b9a217f6fe --- /dev/null +++ b/docker/launch-jenkins.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# defaults, can be overridden by environment variables +# user to run Jenkins as -- see NOTE below regarding Docker access permissions +: ${JENKINS_USER=jenkins} +# port for Jenkins on host machine +: ${JENKINS_PORT=8080} +# make Jenkins config persistent by mounting into this folder +: ${JENKINS_HOME=$(pwd)/jenkins_home} + +mkdir -p $JENKINS_HOME + +# build a Jenkins Docker image that also has the Docker CLI installed +docker build -t finn_jenkins -f Dockerfile.jenkins . + +# launch Docker container mounted to local Docker socket +# NOTE: we allow customizing the user (e.g.
as root) to work around permission +# issues, may not al +docker run -u $JENKINS_USER -p $JENKINS_PORT:8080 -v /var/run/docker.sock:/var/run/docker.sock -v $JENKINS_HOME:/var/jenkins_home finn_jenkins diff --git a/run-docker.sh b/run-docker.sh index 62ca70c2cbc80f28b12f0b0ff8f9139db0108271..018bd9aa8c39666a1b9c0ef7f426587f265769f7 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -44,10 +44,10 @@ DOCKER_PASSWD="finn" # generate a random number per-run to allow multiple # containers from the same user DOCKER_RND=$(shuf -i0-32768 -n1) -DOCKER_TAG="finn_${DOCKER_UNAME}" +DOCKER_TAG="finn_dev_${DOCKER_UNAME}" # uncomment to run multiple instances with different names # DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}" -DOCKER_INST_NAME="finn_${DOCKER_UNAME}" +DOCKER_INST_NAME="finn_dev_${DOCKER_UNAME}" # ensure Docker tag and inst. name are all lowercase DOCKER_TAG=$(echo "$DOCKER_TAG" | tr '[:upper:]' '[:lower:]') DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]') @@ -59,6 +59,7 @@ DOCKER_INST_NAME=$(echo "$DOCKER_INST_NAME" | tr '[:upper:]' '[:lower:]') : ${PYNQ_PASSWORD="xilinx"} : ${PYNQ_BOARD="Pynq-Z1"} : ${PYNQ_TARGET_DIR="/home/xilinx/$DOCKER_INST_NAME"} +: ${NUM_DEFAULT_WORKERS=1} # Absolute path to this script, e.g. 
/home/user/bin/foo.sh SCRIPT=$(readlink -f "$0") @@ -83,17 +84,17 @@ echo "Using default PYNQ board $PYNQ_BOARD" if [ "$1" = "test" ]; then echo "Running test suite" - DOCKER_CMD="source ~/.bashrc; python setup.py test" + DOCKER_CMD="python setup.py test" elif [ "$1" = "notebook" ]; then echo "Running Jupyter notebook server" - DOCKER_CMD="source ~/.bashrc; jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks" + DOCKER_CMD="jupyter notebook --ip=0.0.0.0 --port $JUPYTER_PORT notebooks" else echo "Running container only" DOCKER_CMD="bash" fi # Build the FINN Docker image -docker build --tag=$DOCKER_TAG \ +docker build -f docker/Dockerfile.finn_dev --tag=$DOCKER_TAG \ --build-arg GID=$DOCKER_GID \ --build-arg GNAME=$DOCKER_GNAME \ --build-arg UNAME=$DOCKER_UNAME \ @@ -119,6 +120,7 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \ -e PYNQ_USERNAME=$PYNQ_USERNAME \ -e PYNQ_PASSWORD=$PYNQ_PASSWORD \ -e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR \ +-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS \ -p $JUPYTER_PORT:$JUPYTER_PORT \ -p $NETRON_PORT:$NETRON_PORT \ -$DOCKER_TAG bash -c "$DOCKER_CMD" +$DOCKER_TAG $DOCKER_CMD diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py index 0841fedebcd473a488b2e62db4dc763f283789e1..af84a75e299d666c059df54211be42b691f5ccf2 100644 --- a/src/finn/core/rtlsim_exec.py +++ b/src/finn/core/rtlsim_exec.py @@ -35,11 +35,18 @@ from finn.util.fpgadataflow import ( pyverilate_stitched_ip, ) +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + def rtlsim_exec(model, execution_context): """Use PyVerilator to execute given model with stitched IP. 
The execution context contains the input values.""" + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") # ensure stitched ip project already exists assert os.path.isfile( model.get_metadata_prop("wrapper_filename") @@ -74,7 +81,12 @@ def rtlsim_exec(model, execution_context): packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w) num_out_values = last_node.get_number_output_values() # prepare pyverilator model - sim = pyverilate_stitched_ip(model) + rtlsim_so = model.get_metadata_prop("rtlsim_so") + if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)): + sim = pyverilate_stitched_ip(model) + model.set_metadata_prop("rtlsim_so", sim.lib._name) + else: + sim = PyVerilator(rtlsim_so) _reset_rtlsim(sim) _toggle_clk(sim) ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file) diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 4fc69985f7cdf09298f79055e159f63b2eabaf97..5de97fd976f8d57c7c389c04ad33e02340f13e56 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -38,6 +38,11 @@ from finn.util.fpgadataflow import ( ) from . import templates +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + class HLSCustomOp(CustomOp): """HLSCustomOp class all custom ops that correspond to a finn-hlslib @@ -76,8 +81,64 @@ class HLSCustomOp(CustomOp): "res_estimate": ("s", False, ""), "res_hls": ("s", False, ""), "res_synth": ("s", False, ""), + "rtlsim_so": ("s", False, ""), } + def get_verilog_top_module_name(self): + "Return the Verilog top module name for this node." + + node = self.onnx_node + prefixed_top_name = "%s_%s" % (node.name, node.name) + return prefixed_top_name + + def get_verilog_top_filename(self): + "Return the Verilog top module filename for this node." 
+ + verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( + self.get_nodeattr("code_gen_dir_ipgen"), + self.onnx_node.name, + self.get_verilog_top_module_name(), + ) + return verilog_file + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + # ensure that code is generated + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + assert ( + code_gen_dir != "" + ), """Node attribute "code_gen_dir_ipgen" is + not set. Please run HLSSynth_IPGen first.""" + verilog_file = self.get_verilog_top_filename() + assert os.path.isfile(verilog_file), "Cannot find top-level Verilog file." + # build the Verilator emu library + sim = PyVerilator.build( + verilog_file, + verilog_path=[ + "{}/project_{}/sol1/impl/verilog/".format( + code_gen_dir, self.onnx_node.name + ) + ], + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def get_rtlsim(self): + """Return a PyVerilator wrapper for the Verilator emulation library + for this node.""" + + rtlsim_so = self.get_nodeattr("rtlsim_so") + assert os.path.isfile(rtlsim_so), "Cannot find rtlsim library." 
+ # create PyVerilator wrapper + sim = PyVerilator(rtlsim_so) + return sim + def node_res_estimation(self): """Returns summarized resource estimation of BRAMs and LUTs of the node as a dictionary.""" diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 2ef5d350fb972e448b9a3745eb8c98197ab87d94..e05b2dcea7e17231617f9d3880b778d1978b4ead 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -30,11 +30,6 @@ import os import numpy as np -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None - from finn.core.datatype import DataType from finn.custom_op.fpgadataflow import HLSCustomOp from finn.custom_op.im2col import compute_conv_output_dim @@ -208,52 +203,26 @@ class ConvolutionInputGenerator(HLSCustomOp): did not produce expected ofolded utput shape" context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) elif mode == "rtlsim": - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - prefixed_top_name = "%s_%s" % (node.name, node.name) - # check if needed file exists - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - code_gen_dir, node.name, prefixed_top_name + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) - if os.path.isfile(verilog_file): - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - sim = PyVerilator.build( - verilog_file, - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, node.name - ) - ], - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = 
self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Found no verilog files for this node, - did you run the codegen_ipgen transformation?""" - ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py index 5e4c99aa41216b05f66da8341870269c620c6c40..6a4070528ee50d97e62881d00b57355d2a2baf2d 100644 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py @@ -29,10 +29,6 @@ import os import numpy as np -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None from finn.custom_op.fpgadataflow import HLSCustomOp from finn.core.datatype import DataType from onnx import TensorProto, helper @@ -351,52 +347,26 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp): context[node.output[0]] = output elif mode == "rtlsim": - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - prefixed_top_name = "%s_%s" % (node.name, node.name) - # check if needed file exists - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - code_gen_dir, node.name, prefixed_top_name + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) - if os.path.isfile(verilog_file): - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - sim = PyVerilator.build( - verilog_file, - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, node.name - ) - ], - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - ) - # load 
and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(exp_shape) - context[node.output[0]] = output - else: - raise Exception( - """Found no verilog files for this node, - did you run the codegen_ipgen transformation?""" - ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py index 7408b119cb6f694ba5dcd056e25a1ec49764f5df..ab7ad37033c625d3f5bb47adef197ebd469438bf 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py @@ -33,10 +33,6 @@ from shutil import copy import numpy as np -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None from onnx import TensorProto, helper from finn.core.datatype import DataType from finn.custom_op.fpgadataflow import HLSCustomOp @@ -100,6 +96,23 @@ class StreamingFCLayer_Batch(HLSCustomOp): my_attrs.update(super().get_nodeattr_types()) return my_attrs + def get_verilog_top_module_name(self): + "Return the Verilog top module name for this node." 
+ + node = self.onnx_node + # set top name depending on mem_mode + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "const": + prefixed_top_name = "%s_%s" % (node.name, node.name) + elif mem_mode == "decoupled": + prefixed_top_name = "%s_memstream" % (node.name) + else: + raise Exception( + """Please set mem_mode to "const" or "decoupled", currently no other + parameter value is supported!""" + ) + return prefixed_top_name + def calc_wmem(self): """Calculates and returns WMEM.""" mw = self.get_nodeattr("MW") @@ -646,61 +659,28 @@ class StreamingFCLayer_Batch(HLSCustomOp): oshape = self.get_normal_output_shape() context[node.output[0]] = context[node.output[0]].reshape(*oshape) elif mode == "rtlsim": - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - # set top name depending on mem_mode - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - prefixed_top_name = "%s_%s" % (node.name, node.name) - elif mem_mode == "decoupled": - prefixed_top_name = "%s_memstream" % (node.name) - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - # check if needed file exists - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - code_gen_dir, node.name, prefixed_top_name + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + output, out_npy_path, odt, out_shape, packed_bits, target_bits ) - if os.path.isfile(verilog_file): - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - 
"{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - sim = PyVerilator.build( - verilog_file, - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, node.name - ) - ], - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - - else: - raise Exception( - """Found no verilog files for this node, - did you run the codegen_ipgen transformation?""" - ) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py index a7c2d5166b6af41327abcfeaa5cb5ae25fd23856..f370d417aa0ac1ce5d62af878575332941e2c1d0 100644 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py @@ -29,10 +29,6 @@ import os import numpy as np -try: - from pyverilator import PyVerilator -except ModuleNotFoundError: - PyVerilator = None from finn.custom_op.fpgadataflow import HLSCustomOp from finn.custom_op.im2col import compute_conv_output_dim from finn.core.datatype import DataType @@ -302,52 +298,26 @@ class StreamingMaxPool_Batch(HLSCustomOp): did not produce expected ofolded utput shape" context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) elif mode == "rtlsim": - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - prefixed_top_name = "%s_%s" % (node.name, node.name) - # check if needed file exists - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - code_gen_dir, node.name, prefixed_top_name + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits ) - if os.path.isfile(verilog_file): - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - sim = PyVerilator.build( - verilog_file, - verilog_path=[ - "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, node.name - ) - ], - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - 
out_shape, - packed_bits, - target_bits, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Found no verilog files for this node, - did you run the codegen_ipgen transformation?""" - ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output else: raise Exception( """Invalid value for attribute exec_mode! Is currently set to: {} diff --git a/src/finn/transformation/__init__.py b/src/finn/transformation/__init__.py index a4e0bcf330a8ad1797eb76e61ba63511eb903dcf..e9f5fe15f6bdefe1e739394495f67a972ccff669 100644 --- a/src/finn/transformation/__init__.py +++ b/src/finn/transformation/__init__.py @@ -48,6 +48,8 @@ Guide to writing FINN transformations """ from abc import ABC, abstractmethod +from finn.util.basic import get_num_default_workers +import multiprocessing as mp class Transformation(ABC): @@ -60,3 +62,54 @@ class Transformation(ABC): @abstractmethod def apply(self, model): pass + + +class NodeLocalTransformation(Transformation): + """ + Parent class for transformations, which can be executed locally to one node + by accessing and modifying the attributes of only that node. + This class can then automatically parallelize the transformation. + Transformations subclassing NodeLocalTransformation must implement the + abstract method applyNodeLocal(). 
+ + To control the degree of parallelization, specify the num_workers argument + in the constructor, using one of the following values: + * None: use NUM_DEFAULT_WORKERS environment variable + * 0: use all available CPU cores + * (any other int>0): set number of parallel workers + """ + + def __init__(self, num_workers=None): + super().__init__() + if num_workers is None: + self._num_workers = get_num_default_workers() + else: + self._num_workers = num_workers + assert self._num_workers >= 0, "Number of workers must be nonnegative." + if self._num_workers == 0: + self._num_workers = mp.cpu_count() + + @abstractmethod + def applyNodeLocal(self, node): + pass + + def apply(self, model): + # Remove old nodes from the current model + old_nodes = [] + for i in range(len(model.graph.node)): + old_nodes.append(model.graph.node.pop()) + + # Execute transformation in parallel + with mp.Pool(self._num_workers) as p: + new_nodes_and_bool = p.map(self.applyNodeLocal, old_nodes, chunksize=1) + + # extract nodes and check if the transformation needs to run again + # Note: .pop() had initially reversed the node order + run_again = False + for node, run in reversed(new_nodes_and_bool): + # Reattach new nodes to old model + model.graph.node.append(node) + if run is True: + run_again = True + + return (model, run_again) diff --git a/src/finn/transformation/fpgadataflow/compile.py b/src/finn/transformation/fpgadataflow/compile.py index e577c3af6d2b92d8a2c63e89e3b1bca21d3d7c0a..a76ab683209bbb1219517075ff29a75540dc7bfc 100644 --- a/src/finn/transformation/fpgadataflow/compile.py +++ b/src/finn/transformation/fpgadataflow/compile.py @@ -28,28 +28,30 @@ import finn.custom_op.registry as registry import finn.util.basic as util -from finn.transformation import Transformation +from finn.transformation import NodeLocalTransformation -class Compile(Transformation): +class Compile(NodeLocalTransformation): """For every node: compile C++ code in node attribute "code_gen_dir_npysim" and save 
path to executables in node attribute "executable_path". All nodes in the graph must have the fpgadataflow backend attribute. To use these executables, exec_mode must be set to "npysim" (using transformation SetExecMode) and the model has to be executed using execute_onnx() from - finn.core.onnx_exec""" + finn.core.onnx_exec - def __init__(self): - super().__init__() + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ - def apply(self, model): - for node in model.graph.node: - op_type = node.op_type - if node.domain == "finn": - backend_attribute = util.get_by_name(node.attribute, "backend") - if backend_attribute is None: - continue + def __init__(self, num_workers=None): + super().__init__(num_workers=num_workers) + + def applyNodeLocal(self, node): + op_type = node.op_type + if node.domain == "finn": + backend_attribute = util.get_by_name(node.attribute, "backend") + if backend_attribute is not None: backend_value = backend_attribute.s.decode("UTF-8") if backend_value == "fpgadataflow": try: @@ -74,4 +76,4 @@ class Compile(Transformation): raise Exception( "Custom op_type %s is currently not supported." 
% op_type ) - return (model, False) + return (node, False) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py b/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py index 9fb7f8652d1fa5e624776a81ff6946d67882aa2a..2a40b3c2302a432937d45e807515e795f02e0365 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ipgen.py @@ -28,50 +28,54 @@ import finn.custom_op.registry as registry import finn.util.basic as util -from finn.transformation import Transformation +from finn.transformation import NodeLocalTransformation -class HLSSynth_IPGen(Transformation): +class HLSSynth_IPGen(NodeLocalTransformation): """For each node: generate IP block from code in folder that is referenced in node attribute "code_gen_dir_ipgen" and save path of generated project in node attribute "ipgen_path". All nodes in the graph must have the fpgadataflow backend attribute. This transformation calls Vivado HLS for synthesis, so it will run for - some time (several minutes)""" + some time (several minutes) - def __init__(self): - super().__init__() + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ - def apply(self, model): - for node in model.graph.node: - op_type = node.op_type - if node.domain == "finn": - backend_attribute = util.get_by_name(node.attribute, "backend") - if backend_attribute is None: - continue - backend_value = backend_attribute.s.decode("UTF-8") - if backend_value == "fpgadataflow": - try: - # lookup op_type in registry of CustomOps - inst = registry.custom_op[op_type](node) - # ensure that code is generated - assert ( - inst.get_nodeattr("code_gen_dir_ipgen") != "" - ), """Node - attribute "code_gen_dir_ipgen" is empty. 
Please run - transformation CodeGen_ipgen first.""" - # call the compilation function for this node - inst.ipgen_singlenode_code() - # ensure that executable path is now set - assert ( - inst.get_nodeattr("ipgen_path") != "" - ), """Transformation - HLSSynth_IPGen was not successful. Node attribute "ipgen_path" - is empty.""" - except KeyError: - # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) - return (model, False) + def __init__(self, num_workers=None): + super().__init__(num_workers=num_workers) + + def applyNodeLocal(self, node): + op_type = node.op_type + if node.domain == "finn": + backend_attribute = util.get_by_name(node.attribute, "backend") + if backend_attribute is None: + return (node, False) + backend_value = backend_attribute.s.decode("UTF-8") + if backend_value == "fpgadataflow": + try: + # lookup op_type in registry of CustomOps + inst = registry.custom_op[op_type](node) + # ensure that code is generated + assert ( + inst.get_nodeattr("code_gen_dir_ipgen") != "" + ), """Node + attribute "code_gen_dir_ipgen" is empty. Please run + transformation CodeGen_ipgen first.""" + # call the compilation function for this node + inst.ipgen_singlenode_code() + # ensure that executable path is now set + assert ( + inst.get_nodeattr("ipgen_path") != "" + ), """Transformation + HLSSynth_IPGen was not successful. Node attribute "ipgen_path" + is empty.""" + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." 
% op_type + ) + + return (node, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py new file mode 100644 index 0000000000000000000000000000000000000000..4474831381425268d2a59e7de835bba31c55a733 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020, Xilinx +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import finn.custom_op.registry as registry +import finn.util.basic as util +from finn.transformation import NodeLocalTransformation + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class PrepareRTLSim(NodeLocalTransformation): + """For a graph with generated RTL sources (after HLSSynth_IPGen), create a + Verilator emulation library for each node to prepare for rtlsim + execution and set the rtlsim_so property to the path to the generated + emulation library. + + To use these libraries, exec_mode must be set to "rtlsim" (using + SetExecMode) and the model has to be executed using execute_onnx() from + finn.core.onnx_exec + + * num_workers (int or None) number of parallel workers, see documentation in + NodeLocalTransformation for more details. + """ + + def __init__(self, num_workers=None): + super().__init__(num_workers=num_workers) + + def applyNodeLocal(self, node): + op_type = node.op_type + if node.domain == "finn": + backend_attribute = util.get_by_name(node.attribute, "backend") + if backend_attribute is not None: + backend_value = backend_attribute.s.decode("UTF-8") + if backend_value == "fpgadataflow": + try: + # lookup op_type in registry of CustomOps + inst = registry.custom_op[op_type](node) + inst.prepare_rtlsim() + # ensure that executable path is now set + assert ( + inst.get_nodeattr("rtlsim_so") != "" + ), "Failed to prepare RTLSim, no rtlsim_so attribute found." + except KeyError: + # exception if op_type is not supported + raise Exception( + "Custom op_type %s is currently not supported." 
% op_type + ) + return (node, False) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index f99a453d05d7cb3c824784e80103b6021f072a79..4eb0e6cb874f80620e3cb25017abcc29368b261b 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -44,6 +44,17 @@ pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1" pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" +def get_num_default_workers(): + """Return the number of workers for parallel transformations. Controllable + via the NUM_DEFAULT_WORKERS environment variable. If the env.var. is + undefined, the default value of 1 is returned. + """ + + try: + return int(os.environ["NUM_DEFAULT_WORKERS"]) + except KeyError: + return 1 + def get_finn_root(): "Return the root directory that FINN is cloned into." diff --git a/tests/end2end/test_end2end_cnv_w1a1.py b/tests/end2end/test_end2end_cnv_w1a1.py index 75dc6c84f28fb03197e36bed3588670b5d37d2db..53f34d4d772a458eed3d417cdeb8a962338b099c 100644 --- a/tests/end2end/test_end2end_cnv_w1a1.py +++ b/tests/end2end/test_end2end_cnv_w1a1.py @@ -71,7 +71,7 @@ from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources - +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -199,6 +199,7 @@ def test_end2end_cnv_w1a1_verify_dataflow_part(): res_npysim = ret_npysim[out_name] # node-by-node rtlsim model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") for fcl in fc_layers: getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default") diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py index 
03d6f92f1c148ce444f08fd65a867ad9390a18fd..020d49216f987ab17219817fdfa45f54d6a3154f 100644 --- a/tests/end2end/test_end2end_tfc_w1a1.py +++ b/tests/end2end/test_end2end_tfc_w1a1.py @@ -71,6 +71,7 @@ from finn.transformation.streamline.round_thresholds import RoundAndClipThreshol from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -182,6 +183,7 @@ def test_end2end_tfc_w1a1_verify_dataflow_part(): res_npysim = ret_npysim[out_name] # node-by-node rtlsim model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") for fcl in fc_layers: getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default") diff --git a/tests/end2end/test_end2end_tfc_w1a2.py b/tests/end2end/test_end2end_tfc_w1a2.py index 7fef331b99a78b43f8e808c8cdf978a5c8233f92..b55d985e07ac40fc875c49ba201c9552fd62c411 100644 --- a/tests/end2end/test_end2end_tfc_w1a2.py +++ b/tests/end2end/test_end2end_tfc_w1a2.py @@ -67,6 +67,7 @@ from finn.transformation.streamline import Streamline from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -173,6 +174,7 @@ def test_end2end_tfc_w1a2_verify_dataflow_part(): res_npysim = ret_npysim[out_name] # node-by-node rtlsim model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") for 
fcl in fc_layers: getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default") diff --git a/tests/end2end/test_end2end_tfc_w2a2.py b/tests/end2end/test_end2end_tfc_w2a2.py index c78be7b66fe2c2f84e6f9a1a520c3e22e769c82f..92b8b18bc0253a07eec988c2bace9a9178682147 100644 --- a/tests/end2end/test_end2end_tfc_w2a2.py +++ b/tests/end2end/test_end2end_tfc_w2a2.py @@ -67,6 +67,7 @@ from finn.transformation.streamline import Streamline from finn.util.basic import pynq_part_map from finn.util.test import get_test_model_trained from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim build_dir = "/tmp/" + os.environ["FINN_INST_NAME"] test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -173,6 +174,7 @@ def test_end2end_tfc_w2a2_verify_dataflow_part(): res_npysim = ret_npysim[out_name] # node-by-node rtlsim model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) fc_layers = model.get_nodes_by_op_type("StreamingFCLayer_Batch") for fcl in fc_layers: getCustomOp(fcl).set_nodeattr("rtlsim_trace", "default") diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 92d98f57c62aeb93fc17091c37214a62e78ebb8f..2ec47915b01c92c7b7c11d0cf160543fb71dd27d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -38,6 +38,7 @@ from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim from finn.transformation.fpgadataflow.compile import Compile from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor @@ -152,6 
+153,7 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5)) model = model.transform(HLSSynth_IPGen()) + model = model.transform(PrepareRTLSim()) else: raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 651df836ab4dfd320e4c67ff3dd49f31ec13c110..1465881830b4fec61d1b1aa6e8465a41766fd9de 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -7,6 +7,7 @@ from finn.core.modelwrapper import ModelWrapper from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor import finn.core.onnx_exec as oxe @@ -66,6 +67,7 @@ def test_fpgadataflow_dwc_rtlsim(Shape, INWidth, OUTWidth, finn_dtype): model = model.transform(GiveUniqueNodeNames()) model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5)) model = model.transform(HLSSynth_IPGen()) + model = model.transform(PrepareRTLSim()) y = oxe.execute_onnx(model, input_dict)["outp"] assert ( diff --git a/tests/fpgadataflow/test_fpgadataflow_fclayer.py b/tests/fpgadataflow/test_fpgadataflow_fclayer.py index 330d9a1aedcd6607ae0c150b0ea9ef439afcc1df..e73c450ae80f6a4fc3672224cc394a318ce8938a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fclayer.py +++ b/tests/fpgadataflow/test_fpgadataflow_fclayer.py @@ -44,7 +44,11 @@ from finn.transformation.fpgadataflow.compile import Compile from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen from 
finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.general import GiveUniqueNodeNames +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.util.basic import calculate_signed_dot_prod_range, gen_finn_dt_tensor +from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( + ReplaceVerilogRelPaths, +) def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -295,6 +299,8 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): model = model.transform(GiveUniqueNodeNames()) model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5)) model = model.transform(HLSSynth_IPGen()) + model = model.transform(ReplaceVerilogRelPaths()) + model = model.transform(PrepareRTLSim()) y_produced = oxe.execute_onnx(model, input_dict)["outp"] assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py index a9acebb3114059f988aa0f21cad70e617d3d6f77..a7a731aaa5593a9fd680061d2b8ad3fc47e9f490 100644 --- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py +++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py @@ -38,6 +38,7 @@ from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim from finn.transformation.fpgadataflow.compile import Compile from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.general import GiveUniqueNodeNames from finn.util.basic import gen_finn_dt_tensor @@ -144,6 +145,7 @@ def test_fpgadataflow_streamingmaxpool(idt, k, ifm_dim, ifm_ch, exec_mode): model = model.transform(GiveUniqueNodeNames()) model = model.transform(CodeGen_ipgen("xc7z020clg400-1", 5)) model = 
model.transform(HLSSynth_IPGen()) + model = model.transform(PrepareRTLSim()) else: raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow")