diff --git a/Dockerfile b/Dockerfile
index 8c97a3ad9089bcc858134a51ac189e4105a98ed9..161ceb5ace3025e56b335064a8d1653c9ba26aba 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -27,7 +27,6 @@ ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas_cnv_lfc/training_scripts"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/brevitas"
 ENV PYTHONPATH "${PYTHONPATH}:/workspace/pyverilator"
 ENV PYNQSHELL_PATH "/workspace/PYNQ-HelloWorld/boards"
-ENV PYNQ_BOARD "Pynq-Z1"
 
 ARG GID
 ARG GNAME
diff --git a/run-docker.sh b/run-docker.sh
index 77441ed955c8a055ac57a7328f2998f8855c20e9..aadc4c78717d85bde3bdf3dcedd48824f5ba483b 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -17,8 +17,11 @@ DOCKER_TAG="finn_${DOCKER_UNAME}"
 # uncomment to run multiple instances with different names
 # DOCKER_INST_NAME="finn_${DOCKER_UNAME}_${DOCKER_RND}"
 DOCKER_INST_NAME="finn_${DOCKER_UNAME}"
+# the settings below will be taken from environment variables if available,
+# otherwise the defaults below will be used
 : ${JUPYTER_PORT=8888}
 : ${NETRON_PORT=8081}
+: ${PYNQ_BOARD="Pynq-Z1"}
 
 # Absolute path to this script, e.g. /home/user/bin/foo.sh
 SCRIPT=$(readlink -f "$0")
@@ -40,7 +43,7 @@ PYVERILATOR_LOCAL=$SCRIPTPATH/pyverilator
 PYNQSHELL_LOCAL=$SCRIPTPATH/PYNQ-HelloWorld
 BUILD_LOCAL=/tmp/$DOCKER_INST_NAME
 VIVADO_HLS_LOCAL=$VIVADO_PATH
-: ${VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache}
+VIVADO_IP_CACHE=$BUILD_LOCAL/vivado_ip_cache
 
 # clone dependency repos
 git clone --branch feature/finn_onnx_export $BREVITAS_REPO $BREVITAS_LOCAL ||  git -C "$BREVITAS_LOCAL" pull
@@ -67,6 +70,7 @@ echo "Mounting $VIVADO_PATH into $VIVADO_PATH"
 echo "Port-forwarding for Jupyter $JUPYTER_PORT:$JUPYTER_PORT"
 echo "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 echo "Vivado IP cache dir is at $VIVADO_IP_CACHE"
+echo "Using PYNQ board $PYNQ_BOARD"
 
 if [ "$1" = "test" ]; then
         echo "Running test suite"
@@ -107,6 +111,7 @@ docker run -t --rm --name $DOCKER_INST_NAME -it \
 -e FINN_INST_NAME=$DOCKER_INST_NAME \
 -e FINN_ROOT="/workspace/finn" \
 -e VIVADO_IP_CACHE="$VIVADO_IP_CACHE" \
+-e PYNQ_BOARD=$PYNQ_BOARD \
 -p $JUPYTER_PORT:$JUPYTER_PORT \
 -p $NETRON_PORT:$NETRON_PORT \
 $DOCKER_TAG bash -c "$DOCKER_CMD"
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index 8a273367023829fed1581261ac35dcea4b9d1738..fe710f0be1bd0d8783e13352812eeef5ebd2c332 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -41,6 +41,9 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             "binaryXnorMode": ("i", False, 0),
             # no-activation mode (produce accumulators)
             "noActivation": ("i", False, 0),
+            # input and output FIFO depths
+            "inFIFODepth": ("i", False, 0),
+            "outFIFODepth": ("i", False, 0),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -99,6 +102,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             info_messages.append('Attribute backend should be set to "fpgadataflow"')
 
         # verify that all necessary attributes exist
+        # TODO collect automatically from get_nodeattr_types
         try:
             self.get_nodeattr("code_gen_dir_npysim")
             self.get_nodeattr("executable_path")
@@ -161,6 +165,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Y. Umuroglu, M. Leeser and K. Vissers
         - 12. Sep 2018
         """
+        # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
@@ -178,6 +183,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         Y. Umuroglu, M. Leeser and K. Vissers
         - 12. Sep 2018
         """
+        # TODO add in/out FIFO contributions
         P = self.get_nodeattr("PE")
         Q = self.get_nodeattr("SIMD")
         wdt = self.get_weight_datatype()
@@ -642,6 +648,17 @@ class StreamingFCLayer_Batch(HLSCustomOp):
     def pragmas(self):
         self.code_gen_dict["$PRAGMAS$"] = ["#pragma HLS INTERFACE axis port=in0"]
         self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE axis port=out")
+        in_fifo_depth = self.get_nodeattr("inFIFODepth")
+        out_fifo_depth = self.get_nodeattr("outFIFODepth")
+        # insert depth pragmas only if specified
+        if in_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=in0" % in_fifo_depth
+            )
+        if out_fifo_depth != 0:
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS stream depth=%d variable=out" % out_fifo_depth
+            )
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index a613d0622ee95e7f1ca848142e2930cf6d3c91bd..ba45e01bf3ecee457e9788e5dbea4cd1c3ee0007 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -103,7 +103,9 @@ dma.sendchannel.wait()
 dma.recvchannel.wait()
 
 # unpack the packed output buffer from accelerator
-obuf_folded = packed_bytearray_to_finnpy(obuf_packed, odt, oshape_folded)
+obuf_folded = packed_bytearray_to_finnpy(
+    obuf_packed, odt, oshape_folded, reverse_endian=True
+)
 # convert to normal reshape and save
 obuf_normal = obuf_folded.reshape(oshape_normal)
 np.save("output.npy", obuf_normal)
diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py
index 58c62219287940eb6533d2513e66d2c9c33cfb01..2bdb992948a93cd75cc4b15f1ec0a8e8b4a3b372 100644
--- a/src/finn/util/data_packing.py
+++ b/src/finn/util/data_packing.py
@@ -152,7 +152,7 @@ def unpack_innermost_dim_from_hex_string(
         # interpret values as bipolar
         if dtype == DataType.BIPOLAR:
             ar_list = [2 * x - 1 for x in ar_list]
-        # interpret values as signed values 
+        # interpret values as signed values
         elif dtype.name.startswith("INT"):
             mask = 2 ** (dtype.bitwidth() - 1)
             ar_list = [-(x & mask) + (x & ~mask) for x in ar_list]
@@ -277,7 +277,13 @@ def finnpy_to_packed_bytearray(ndarray, dtype):
         return np.apply_along_axis(fn, packed_hexstring.ndim - 1, packed_hexstring)
 
 
-def packed_bytearray_to_finnpy(packed_bytearray, dtype, output_shape=None, reverse_inner=False):
+def packed_bytearray_to_finnpy(
+    packed_bytearray,
+    dtype,
+    output_shape=None,
+    reverse_inner=False,
+    reverse_endian=False,
+):
     """Given a packed numpy uint8 ndarray, unpack it into a FINN array of
     given DataType. output_shape can be specified to remove padding from the
     packed dimension, or set to None to be inferred from the input."""
@@ -296,10 +302,20 @@ def packed_bytearray_to_finnpy(packed_bytearray, dtype, output_shape=None, rever
         assert packed_bits % target_bits == 0
         n_target_elems = packed_bits // target_bits
         output_shape = packed_bytearray.shape[:-1] + (n_target_elems,)
+    if reverse_endian and target_bits > 8:
+        # reverse the endianness of each element
+        orig_shape = packed_bytearray.shape
+        assert target_bits % 8 == 0
+        target_bytes = target_bits // 8
+        new_shape = orig_shape[:-1] + (-1, target_bytes)
+        packed_bytearray = np.flip(packed_bytearray.reshape(new_shape), axis=-1)
+        packed_bytearray = packed_bytearray.reshape(orig_shape)
     # convert innermost dim of byte array to hex strings
     packed_hexstring = np.apply_along_axis(
         npbytearray2hexstring, packed_dim, packed_bytearray
     )
-    ret = unpack_innermost_dim_from_hex_string(packed_hexstring, dtype, output_shape, reverse_inner)
+    ret = unpack_innermost_dim_from_hex_string(
+        packed_hexstring, dtype, output_shape, reverse_inner
+    )
 
     return ret
diff --git a/tests/fpgadataflow/test_data_packing.py b/tests/fpgadataflow/test_data_packing.py
index 3616219ef0e1046e7ef1a6daf3c1bfb6528a21cc..2c175953e6e3d7e3fc3f89cf1249575a10ea1fc0 100644
--- a/tests/fpgadataflow/test_data_packing.py
+++ b/tests/fpgadataflow/test_data_packing.py
@@ -1,5 +1,5 @@
-import shutil
 import os
+import shutil
 import subprocess
 
 import numpy as np
@@ -61,7 +61,9 @@ def make_npy2apintstream_testcase(ndarray, dtype):
     cmd_compile = """
 g++ -o test_npy2apintstream test.cpp /workspace/cnpy/cnpy.cpp \
 -I/workspace/cnpy/ -I{}/include -I/workspace/finn/src/finn/data/cpp \
---std=c++11 -lz""".format(os.environ["VIVADO_PATH"])
+--std=c++11 -lz""".format(
+        os.environ["VIVADO_PATH"]
+    )
     with open(test_dir + "/compile.sh", "w") as f:
         f.write(cmd_compile)
     compile = subprocess.Popen(
@@ -191,3 +193,13 @@ def test_packed_bytearray_to_finnpy():
     eE = np.asarray(eE, dtype=np.float32)
     shapeE = eE.shape
     assert (packed_bytearray_to_finnpy(E, DataType.INT32, shapeE) == eE).all()
+    F = np.asarray(
+        [[252, 255, 255, 255, 0, 0, 0, 0, 252, 255, 255, 255, 252, 255, 255, 255]],
+        dtype=np.uint8,
+    )
+    eF = [[-4, 0, -4, -4]]
+    eF = np.asarray(eF, dtype=np.float32)
+    shapeF = eF.shape
+    assert (
+        packed_bytearray_to_finnpy(F, DataType.INT32, shapeF, reverse_endian=True) == eF
+    ).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
index f27d06b312981b810d4e1d7f9b6dcbe79ea56ccc..15b74b2b7e7a9bb32d5cb963cfddb6a02c4f5e3f 100644
--- a/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
+++ b/tests/fpgadataflow/test_fpgadataflow_ip_stitch.py
@@ -11,6 +11,7 @@ from finn.custom_op.registry import getCustomOp
 from finn.transformation.fpgadataflow.codegen_ipgen import CodeGen_ipgen
 from finn.transformation.fpgadataflow.codegen_ipstitch import CodeGen_ipstitch
 from finn.transformation.fpgadataflow.hlssynth_ipgen import HLSSynth_IPGen
+from finn.transformation.fpgadataflow.insert_tlastmarker import InsertTLastMarker
 from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ
 from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
 from finn.transformation.fpgadataflow.make_pynq_proj import MakePYNQProject
@@ -27,7 +28,7 @@ from finn.util.basic import (
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
 test_fpga_part = pynq_part_map[test_pynq_board]
 
-ip_stitch_model_dir = make_build_dir("test_fpgadataflow_ipstitch")
+ip_stitch_model_dir = make_build_dir("test_fpgadataflow_ipstitch_")
 
 
 def create_one_fc_model():
@@ -40,10 +41,11 @@ def create_one_fc_model():
     no_act = 1
     binary_xnor_mode = 0
     actval = 0
+    simd = 2
+    pe = 2
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m])
-    outp_tlast = helper.make_tensor_value_info("outp_tlast", TensorProto.FLOAT, [1, m])
 
     fc0 = helper.make_node(
         "StreamingFCLayer_Batch",
@@ -54,8 +56,8 @@ def create_one_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=m,
-        PE=m // 2,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -64,23 +66,8 @@ def create_one_fc_model():
         noActivation=no_act,
     )
 
-    tlastmarker = helper.make_node(
-        "TLastMarker",
-        ["outp"],
-        ["outp_tlast"],
-        domain="finn",
-        backend="fpgadataflow",
-        NumIters=2,
-        ElemWidth=odt.bitwidth(),
-        StreamWidth=odt.bitwidth() * m,
-    )
-
     graph = helper.make_graph(
-        nodes=[fc0, tlastmarker],
-        name="fclayer_graph",
-        inputs=[inp],
-        outputs=[outp_tlast],
-        value_info=[outp],
+        nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp],
     )
 
     model = helper.make_model(graph, producer_name="fclayer-model")
@@ -88,7 +75,6 @@ def create_one_fc_model():
 
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("outp", odt)
-    model.set_tensor_datatype("outp_tlast", odt)
     model.set_tensor_datatype("w0", wdt)
 
     # generate weights
@@ -110,11 +96,12 @@ def create_two_fc_model():
     actval = odt.min()
     no_act = 0
     binary_xnor_mode = 0
+    pe = 2
+    simd = 2
 
     inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, m])
     mid = helper.make_tensor_value_info("mid", TensorProto.FLOAT, [1, m])
     outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m])
-    outp_tlast = helper.make_tensor_value_info("outp_tlast", TensorProto.FLOAT, [1, m])
 
     fc0 = helper.make_node(
         "StreamingFCLayer_Batch",
@@ -125,8 +112,8 @@ def create_two_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=1,
-        PE=1,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -144,8 +131,8 @@ def create_two_fc_model():
         resType="ap_resource_lut()",
         MW=m,
         MH=m,
-        SIMD=1,
-        PE=1,
+        SIMD=simd,
+        PE=pe,
         inputDataType=idt.name,
         weightDataType=wdt.name,
         outputDataType=odt.name,
@@ -154,23 +141,12 @@ def create_two_fc_model():
         noActivation=no_act,
     )
 
-    tlastmarker = helper.make_node(
-        "TLastMarker",
-        ["outp"],
-        ["outp_tlast"],
-        domain="finn",
-        backend="fpgadataflow",
-        NumIters=m,
-        StreamWidth=2,
-        ElemWidth=odt.bitwidth(),
-    )
-
     graph = helper.make_graph(
-        nodes=[fc0, fc1, tlastmarker],
+        nodes=[fc0, fc1],
         name="fclayer_graph",
         inputs=[inp],
-        outputs=[outp_tlast],
-        value_info=[mid, outp],
+        outputs=[outp],
+        value_info=[mid],
     )
 
     model = helper.make_model(graph, producer_name="fclayer-model")
@@ -179,7 +155,6 @@ def create_two_fc_model():
     model.set_tensor_datatype("inp", idt)
     model.set_tensor_datatype("mid", idt)
     model.set_tensor_datatype("outp", odt)
-    model.set_tensor_datatype("outp_tlast", odt)
     model.set_tensor_datatype("w0", wdt)
     model.set_tensor_datatype("w1", wdt)
 
@@ -213,14 +188,13 @@ def test_fpgadataflow_ipstitch_gen_model():
         assert sdp_node.__class__.__name__ == "StreamingDataflowPartition"
         assert os.path.isfile(sdp_node.get_nodeattr("model"))
         model = ModelWrapper(sdp_node.get_nodeattr("model"))
-
+    model = model.transform(InsertTLastMarker())
     model = model.transform(GiveUniqueNodeNames())
     model = model.transform(CodeGen_ipgen(test_fpga_part, 5))
     model = model.transform(HLSSynth_IPGen())
     assert model.graph.node[0].op_type == "StreamingFCLayer_Batch"
     # assert model.graph.node[1].op_type == "StreamingFCLayer_Batch"
-    assert model.graph.node[1].op_type == "TLastMarker"
-
+    assert model.graph.node[-1].op_type == "TLastMarker"
     model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model.onnx")
 
 
diff --git a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
index 32b1c60fc714794e39fe1ade2d0252895bb33025..a7a096d6a69de743a318c0a514a2f24da5d7a29f 100644
--- a/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
+++ b/tests/fpgadataflow/test_layer_streaming_maxpool_batch.py
@@ -7,6 +7,7 @@ from finn.core.modelwrapper import ModelWrapper
 from finn.transformation.fpgadataflow.cleanup import CleanUp
 from finn.transformation.fpgadataflow.codegen_npysim import CodeGen_npysim
 from finn.transformation.fpgadataflow.compile import Compile
+from finn.transformation.fpgadataflow.set_sim_mode import SetSimMode
 
 
 def test_layer_streaming_maxpool_batch():
@@ -112,6 +113,7 @@ def test_layer_streaming_maxpool_batch():
     ).reshape(2, 2, 4, 4)
     print(input_tensor)
 
+    model = model.transform(SetSimMode("npysim"))
     model = model.transform(CodeGen_npysim())
     model = model.transform(Compile())