diff --git a/Dockerfile b/Dockerfile
index 7780d3fd4e630af7a6395b84858211fb93c2b834..734a8fd3f4b6493246f1c7e522da2934d09e849b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,6 @@ ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
-ENV PYNQ_BOARD "Pynq-Z1"
 
 ARG GID
 ARG GNAME
diff --git a/run-docker.sh b/run-docker.sh
index 77441ed955c8a055ac57a7328f2998f8855c20e9..aadc4c78717d85bde3bdf3dcedd48824f5ba483b 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -17,8 +17,11 @@ DOCKER_TAG="finn_${DOCKER_UNAME}"
 # uncomment to run multiple instances with different names
 # DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
 DOCKER_INST_NAME="finn_${DOCKER_UNAME}"
+# the settings below are taken from environment variables when available;
+# otherwise the defaults given here are used
 : ${JUPYTER_PORT=8888}
 : ${NETRON_PORT=8081}
+: ${PYNQ_BOARD="Pynq-Z1"}
 
 # Absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
@@ -40,7 +43,7 @@ PYVERILATOR_LOCAL=$SCRIPTPATH/pyverilator
 PYNQSHELL_LOCAL=$SCRIPTPATH/PYNQ-HelloWorld
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
-: ${VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache}
+VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 
 # clone dependency repos
 git clone --branch feature/finn_onnx_export $BREVITAS_REPO $BREVITAS_LOCAL ||  git -C "$BREVITAS_LOCAL" pull
@@ -67,6 +70,7 @@ echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
 echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 echo "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 echo "Vivado IP cache dir is at $VIVADO_IP_CACHE"
+echo "Using default PYNQ board $PYNQ_BOARD"
 
 if [ "$1" = "test" ]; then
         echo "Running test suite"
@@ -107,6 +111,7 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e FINN_INST_NAME=$DOCKER_INST_NAME \
 -e FINN_ROOT="/workspace/finn" \
 -e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
+-e PYNQ_BOARD=$PYNQ_BOARD \
 -p $JUPYTER_PORT:$JUPYTER_PORT \
 -p $NETRON_PORT:$NETRON_PORT \
 $DOCKER_TAG bash -c "$DOCKER_CMD"
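
For reference, the board selection now flows from the host environment into the container instead of being hardcoded in the Dockerfile: run-docker.sh falls back to "Pynq-Z1" only when PYNQ_BOARD is unset and forwards the result via -e. On the Python side the value is read back with a matching default, as the ip-stitch test below already does; a minimal sketch, assuming pynq_part_map is exported from finn.util.basic as in that test's imports:

    import os
    from finn.util.basic import pynq_part_map

    # pick up the board name exported by run-docker.sh, with the same fallback
    pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
    fpga_part = pynq_part_map[pynq_board]  # map the board name to its FPGA part string
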
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index be9b51e6a7b1b3e255cd2ee8baf10937b95f8665..6ed4df512725eebd14fd0df80040d82b802e48ad 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -41,6 +41,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "binaryXnorMode": ("i", False, 0),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0),
+            # input and output FIFO depths (0 = no depth pragma, i.e. HLS default)
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -99,6 +102,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             info_messages.append('Attribute backend should be set to "fpgadataflow"')
 
         # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
         try:
             self.get_nodeattr("code_gen_dir_npysim")
             self.get_nodeattr("executable_path")
@@ -161,6 +165,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Y. Umuroglu, M. Leeser and K. Vissers
         - 12. Sep 2018
         """
+        # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
@@ -178,6 +183,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Y. Umuroglu, M. Leeser and K. Vissers
         - 12. Sep 2018
         """
+        # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
@@ -642,6 +648,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # insert stream depth pragmas only when a non-zero FIFO depth is specified
+        if in_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+            )
+        if out_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+            )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
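
To make the new attributes concrete, the following is a hedged sketch of how a model-construction script could request explicit FIFO depths on a StreamingFCLayer_Batch node. getCustomOp and set_nodeattr are assumed to be the usual FINN node-attribute helpers, and the depth values are placeholders:

    from finn.custom_op.registry import getCustomOp

    # fc_node is an existing StreamingFCLayer_Batch node in the ONNX graph
    fc_inst = getCustomOp(fc_node)
    fc_inst.set_nodeattr("inFIFODepth", 32)   # -> #pragma HLS stream depth=32 variable=in0
    fc_inst.set_nodeattr("outFIFODepth", 32)  # -> #pragma HLS stream depth=32 variable=out
    # leaving either attribute at its default of 0 skips the pragma (HLS default depth)
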
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index 40b272efb9d6b00126312a9934f28c2a899bd942..11b6dc63065fb376f95f44dd92319439321c78f8 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -11,6 +11,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
 from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
@@ -26,7 +27,7 @@ from finn.util.basic import (
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 
-ip_stitch_model_dir = make_build_dir("test_fpgadataflow_ipstitch")
+ip_stitch_model_dir = make_build_dir("test_fpgadataflow_ipstitch_")
 
 
 def create_one_fc_model():
@@ -39,10 +40,11 @@ def create_one_fc_model():
     no_act = 1
     binary_xnor_mode = 0
     actval = 0
+    simd = 2
+    pe = 2
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m])
-    outp_tlast = helper.make_tensor_value_info("outp_tlast", TensorProto.FLOAT, [1, m])
 
     fc0 = helper.make_node(
         "StreamingFCLayer_Batch",
@@ -53,8 +55,8 @@ def create_one_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=m,
-        PE=m // 2,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -63,23 +65,8 @@ def create_one_fc_model():
         noActivation=no_act,
     )
 
-    tlastmarker = helper.make_node(
-        "TLastMarker",
-        ["outp"],
-        ["outp_tlast"],
-        domain="finn",
-        backend="fpgadataflow",
-        NumIters=2,
-        ElemWidth=odt.bitwidth(),
-        StreamWidth=odt.bitwidth() * m,
-    )
-
     graph = helper.make_graph(
-        nodes=[fc0, tlastmarker],
-        name="fclayer_graph",
-        inputs=[inp],
-        outputs=[outp_tlast],
-        value_info=[outp],
+        nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp],
     )
 
     model = helper.make_model(graph, producer_name="fclayer-model")
@@ -87,7 +74,6 @@ def create_one_fc_model():
 
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", odt)
-    model.set_tensor_datatype("outp_tlast", odt)
     model.set_tensor_datatype("w0", wdt)
 
     # generate weights
@@ -108,11 +94,12 @@ def create_two_fc_model():
     actval = odt.min()
     no_act = 0
     binary_xnor_mode = 0
+    pe = 2
+    simd = 2
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m])
     mid = helper.make_tensor_value_info("mid", TensorProto.FLOAT, [1, m])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m])
-    outp_tlast = helper.make_tensor_value_info("outp_tlast", TensorProto.FLOAT, [1, m])
 
     fc0 = helper.make_node(
         "StreamingFCLayer_Batch",
@@ -123,8 +110,8 @@ def create_two_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=1,
-        PE=1,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -142,8 +129,8 @@ def create_two_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=1,
-        PE=1,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -152,23 +139,12 @@ def create_two_fc_model():
         noActivation=no_act,
     )
 
-    tlastmarker = helper.make_node(
-        "TLastMarker",
-        ["outp"],
-        ["outp_tlast"],
-        domain="finn",
-        backend="fpgadataflow",
-        NumIters=m,
-        StreamWidth=2,
-        ElemWidth=odt.bitwidth(),
-    )
-
     graph = helper.make_graph(
-        nodes=[fc0, fc1, tlastmarker],
+        nodes=[fc0, fc1],
         name="fclayer_graph",
         inputs=[inp],
-        outputs=[outp_tlast],
-        value_info=[mid, outp],
+        outputs=[outp],
+        value_info=[mid],
     )
 
     model = helper.make_model(graph, producer_name="fclayer-model")
@@ -177,7 +153,6 @@ def create_two_fc_model():
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("mid", idt)
     model.set_tensor_datatype("outp", odt)
-    model.set_tensor_datatype("outp_tlast", odt)
     model.set_tensor_datatype("w0", wdt)
     model.set_tensor_datatype("w1", wdt)
 
@@ -206,12 +181,13 @@ def create_two_fc_model():
 @pytest.mark.dependency()
 def test_fpgadataflow_ipstitch_gen_model():
     model = create_one_fc_model()
+    model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen(test_fpga_part, 5))
     model = model.transform(HLSSynth_IPGen())
     assert model.graph.node[0].op_type == "StreamingFCLayer_Batch"
     # assert model.graph.node[1].op_type == "StreamingFCLayer_Batch"
-    assert model.graph.node[1].op_type == "TLastMarker"
+    assert model.graph.node[-1].op_type == "TLastMarker"
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx")
 
 
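
The same pattern applies to the two-layer model: instead of hand-building a TLastMarker node, the test relies on the InsertTLastMarker transform to append the marker after the final dataflow layer. A minimal usage sketch based on the code above:

    model = create_two_fc_model()
    model = model.transform(InsertTLastMarker())
    # per the assertion above, the marker ends up as the last node in the graph
    assert model.graph.node[-1].op_type == "TLastMarker"
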
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index 32b1c60fc714794e39fe1ade2d0252895bb33025..a7a096d6a69de743a318c0a514a2f24da5d7a29f 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -7,6 +7,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.cleanup import CleanUp
 from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
 from finn.transformation.fpgadataflow.compile import Compile
+from finn.transformation.fpgadataflow.set_sim_mode import SetSimMode
 
 
 def test_layer_streaming_maxpool_batch():
@@ -112,6 +113,7 @@ def test_layer_streaming_maxpool_batch():
     ).reshape(2, 2, 4, 4)
     print(input_tensor)
 
+    model = model.transform(SetSimMode("npysim"))
     model = model.transform(CodeGen_npysim())
     model = model.transform(Compile())
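
For context, the maxpool test now follows the npysim flow in three steps: tag the nodes for npysim execution, generate the C++ simulation sources, then compile them. A hedged sketch of that ordering; execute_onnx is assumed to be the usual FINN execution entry point and the input tensor name is a placeholder, since neither appears in this diff:

    from finn.core.onnx_exec import execute_onnx

    model = model.transform(SetSimMode("npysim"))  # tag fpgadataflow nodes for npysim execution
    model = model.transform(CodeGen_npysim())      # generate per-node C++ simulation code
    model = model.transform(Compile())             # compile the generated sources
    out_dict = execute_onnx(model, {"in": input_tensor})  # "in" is a placeholder input name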