diff --git a/docker/Dockerfile.finn_ci b/docker/Dockerfile.finn_ci
index 0d610ec66a5f433d156f4e8da976767ce6458aef..2668927602ebb8de5fdc3d7c25b20a0c8c4a2e55 100644
--- a/docker/Dockerfile.finn_ci
+++ b/docker/Dockerfile.finn_ci
@@ -47,7 +47,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/Dockerfile.finn_dev b/docker/Dockerfile.finn_dev
index 1c2cb19d14137b866b55417522fdebb8e0d7ad90..1200c7d5d15bbd62e15f19f84e70d5fe0b8aca28 100644
--- a/docker/Dockerfile.finn_dev
+++ b/docker/Dockerfile.finn_dev
@@ -76,7 +76,7 @@ RUN git clone https://github.com/Xilinx/brevitas.git /workspace/brevitas
 # CNPY
 RUN git clone https://github.com/rogersce/cnpy.git /workspace/cnpy
 # FINN hlslib
-RUN git clone https://github.com/maltanar/finn-hlslib.git /workspace/finn-hlslib
+RUN git clone https://github.com/Xilinx/finn-hlslib.git /workspace/finn-hlslib
 # PyVerilator
 RUN git clone https://github.com/maltanar/pyverilator /workspace/pyverilator
 # PYNQ-HelloWorld
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index 9cc239319fe94f482a4a6564399943b4dfe6ff53..7e13e117859365531f459928b7c664edb3fbf4ce 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -15,8 +15,8 @@ gecho () {
 # the repos themselves are cloned in the Dockerfile
 BREVITAS_COMMIT=989cdfdba4700fdd900ba0b25a820591d561c21a
 CNPY_COMMIT=4e8810b1a8637695171ed346ce68f6984e585ef4
-HLSLIB_COMMIT=6b88db826bb023937506913a23d964775a7606af
-PYVERILATOR_COMMIT=1d89cb0d4e0c97469cc6352c611f876ec13edfa6
+HLSLIB_COMMIT=13e9b0772a27a3a1efc40c878d8e78ed09efb716
+PYVERILATOR_COMMIT=c97a5ba41bbc7c419d6f25c74cdf3bdc3393174f
 PYNQSHELL_COMMIT=0c82a61b0ec1a07fa275a14146233824ded7a13d
 
 
diff --git a/requirements.txt b/requirements.txt
index 2427f9490a3dd5a7ffe0e0a8cf2ad19af0934cdf..b15d86ed89f7b0e76b772ce42aba6481937310b0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,6 @@ pre-commit
 pyverilator
 scipy
 sphinx
+toposort
+vcdvcd
 wget
diff --git a/run-docker.sh b/run-docker.sh
index 186efc322a8f437be0371b5a142a9dd524d1abf3..e07556716db335421f57a390f1e6a17168ac058b 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -96,7 +96,7 @@ gecho "Port-forwarding for Netron $NETRON_PORT:$NETRON_PORT"
 gecho "Vivado IP cache dir is at $VIVADO_IP_CACHE"
 gecho "Using default PYNQ board $PYNQ_BOARD"
 
-DOCKER_INTERACTIVE = ""
+DOCKER_INTERACTIVE=""
 
 if [ "$1" = "test" ]; then
         gecho "Running test suite (all tests)"
diff --git a/src/finn/core/modelwrapper.py b/src/finn/core/modelwrapper.py
index ed32426abcc8ea71428a7f746a99454e8e4a2c17..646add188c5d475cf37ccd33cf24d29d61754ae1 100644
--- a/src/finn/core/modelwrapper.py
+++ b/src/finn/core/modelwrapper.py
@@ -259,11 +259,10 @@ class ModelWrapper:
 
     def find_producer(self, tensor_name):
         """Finds and returns the node that produces the tensor with given name."""
-        ret = None
         for x in self._model_proto.graph.node:
             if tensor_name in x.output:
-                ret = x
-        return ret
+                return x
+        return None
 
     def find_upstream(self, tensor_name, finder_fxn):
         """Follow the producer chain upstream, calling finder_fxn on each upstream
@@ -510,3 +509,41 @@ class ModelWrapper:
             qa.tensor_name = tensor_name
             qa.quant_parameter_tensor_names.append(dt)
             qnt_annotations.append(qa)
+
+    def get_tensor_sparsity(self, tensor_name):
+        """Returns the sparsity of a given tensor as dictionary."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret is not None:
+                return eval(ret.value)
+        return None
+
+    def set_tensor_sparsity(self, tensor_name, sparsity_dict):
+        """Sets the sparsity annotation of a tensor with given name."""
+        graph = self._model_proto.graph
+        qnt_annotations = graph.quantization_annotation
+        ret = util.get_by_name(qnt_annotations, tensor_name, "tensor_name")
+        if ret is not None:
+            ret_ts = util.get_by_name(
+                ret.quant_parameter_tensor_names, "tensor_sparsity", "key"
+            )
+            if ret_ts is not None:
+                ret_ts.value = str(sparsity_dict)
+            else:
+                ts = onnx.StringStringEntryProto()
+                ts.key = "tensor_sparsity"
+                ts.value = str(sparsity_dict)
+                ret.quant_parameter_tensor_names.append(ts)
+        else:
+            qa = onnx.TensorAnnotation()
+            dt = onnx.StringStringEntryProto()
+            dt.key = "tensor_sparsity"
+            dt.value = str(sparsity_dict)
+            qa.tensor_name = tensor_name
+            qa.quant_parameter_tensor_names.append(dt)
+            qnt_annotations.append(qa)
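A usage sketch of the new sparsity annotations, mirroring the added test in tests/core/test_modelwrapper.py; the {"dw": {"kernel_shape": ...}} dict is the convention used by the depthwise code paths, not a format the API enforces:

```python
from onnx import TensorProto, helper
from finn.core.modelwrapper import ModelWrapper

inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 4])
model = ModelWrapper(helper.make_model(helper.make_graph([], "g", [inp], [])))
# tensors without an annotation return None
assert model.get_tensor_sparsity("inp") is None
sparsity = {"dw": {"kernel_shape": 3}}
model.set_tensor_sparsity("inp", sparsity)
assert model.get_tensor_sparsity("inp") == sparsity
```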
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 44787e1d26049e6075e2222316b45ab3898acbc7..c2f68a35076418e0cf2edb578bdb8d548772fc78 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -103,7 +103,7 @@ def execute_node(node, context, graph):
                         """Output shapes disagree after node execution:
                         found %s vs expected %s"""
                         % (
-                            str(output_list[list_ind].shape.shape),
+                            str(output_list[list_ind].shape),
                             str(context[outp].shape),
                         )
                     )
diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index 91bd3d1198f997eaf96ef3883b2c25e32c5da050..1e1bee3aa7435d5cab6cbf5ea23dd37dcdfa4380 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -99,9 +99,7 @@ def rtlsim_exec(model, execution_context):
         sim = pyverilate_stitched_ip(model)
         model.set_metadata_prop("rtlsim_so", sim.lib._name)
     else:
-        sim = PyVerilator(rtlsim_so)
-    _reset_rtlsim(sim)
-    _toggle_clk(sim)
+        sim = PyVerilator(rtlsim_so, auto_eval=False)
     ret = _run_rtlsim(sim, packed_input, num_out_values, trace_file)
     packed_output = ret[0]
     model.set_metadata_prop("sim_cycles", str(ret[1]))
@@ -117,18 +115,22 @@ def _reset_rtlsim(sim):
     """Sets reset input in pyverilator to zero, toggles the clock and set it
     back to one"""
     sim.io.ap_rst_n_0 = 0
-    sim.io.ap_clk_0 = 1
-    sim.io.ap_clk_0 = 0
+    _toggle_clk(sim)
+    _toggle_clk(sim)
     sim.io.ap_rst_n_0 = 1
+    _toggle_clk(sim)
+    _toggle_clk(sim)
 
 
 def _toggle_clk(sim):
     """Toggles the clock input in pyverilator once."""
-    sim.io.ap_clk_0 = 1
     sim.io.ap_clk_0 = 0
+    sim.eval()
+    sim.io.ap_clk_0 = 1
+    sim.eval()
 
 
-def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
+def _run_rtlsim(sim, inp, num_out_values, trace_file=None, reset=True):
     """Runs the pyverilator simulation by passing the input values to the simulation,
-    toggle the clock and observing the execution time. Argument num_out_values contains
+    toggling the clock and observing the execution time. Argument num_out_values contains
     the number of expected output values, so the simulation is closed after all
@@ -153,6 +155,8 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
 
     if trace_file is not None:
         sim.start_vcd_trace(trace_file)
+    if reset:
+        _reset_rtlsim(sim)
 
     while not (output_observed):
         sim.io.in0_V_V_0_tvalid = 1 if len(inputs) > 0 else 0
@@ -161,8 +165,7 @@ def _run_rtlsim(sim, inp, num_out_values, trace_file=None):
             inputs = inputs[1:]
         if sim.io.out_r_0_tvalid == 1 and sim.io.out_r_0_tready == 1:
             outputs = outputs + [sim.io.out_r_0_tdata]
-        sim.io.ap_clk_0 = 1
-        sim.io.ap_clk_0 = 0
+        _toggle_clk(sim)
 
         observation_count = observation_count + 1
         no_change_count = no_change_count + 1
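For context, a minimal PyVerilator sketch of the new clocking discipline: with auto_eval=False (now passed at both call sites), combinational logic only settles at the explicit eval() calls, which is what _toggle_clk relies on. The .so path is hypothetical:

```python
from pyverilator import PyVerilator

sim = PyVerilator("rtlsim.so", auto_eval=False)  # hypothetical path
sim.io.ap_rst_n_0 = 0  # assert reset
sim.io.ap_clk_0 = 0
sim.eval()             # settle logic after the falling edge
sim.io.ap_clk_0 = 1
sim.eval()             # settle logic after the rising edge
sim.io.ap_rst_n_0 = 1  # release reset
```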
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index e4d106068d4d128c66b2ce5f3d6c925dfe414b90..3e40ad70208909551365c51324153859ccc79ceb 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -41,10 +41,19 @@ from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 # output 0 is the output tensor, shape NHWC:
 #     = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels)
 
+# note: the actual data layout produced by the hlslib kernels is different
+# for depthwise and non-depthwise ops.
+# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD)
+# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
+# see test_fpgadataflow_slidingwindow.py for an example of how to transform
+# between the two layouts
+
 
 class ConvolutionInputGenerator(HLSCustomOp):
-    """Class that corresponds to finn-hlslib ConvolutionInputGenerator
-    (sliding window) function."""
+    """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator
+    (sliding window) function variants. Depending on the combination of
+    attributes (e.g. depthwise or not, whether k % stride is 0) a different
+    variant will be picked for the actual HLS implementation."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
@@ -60,6 +69,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # FINN DataTypes for inputs, weights, outputs
             "inputDataType": ("s", True, ""),
             "outputDataType": ("s", True, ""),
+            "depthwise": ("i", False, 0),
             # FPGA resource type for ConvolutionInputGenerator input buffer
             # auto -- let Vivado HLS decide
             # block -- use BRAM
@@ -106,7 +116,6 @@ class ConvolutionInputGenerator(HLSCustomOp):
         pad = 0
         ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, pad)
         assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
-        assert k % stride == 0, "stride must divide kernel size k"
         wf = int((k * k * ifm_ch) // simd)
         folded_oshape = (1, ofm_dim, ofm_dim, wf, simd)
         return folded_oshape
@@ -305,12 +314,35 @@ class ConvolutionInputGenerator(HLSCustomOp):
 
     def docompute(self):
         node = self.onnx_node
-        self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
-                OFMDim1, SIMD1, Stride1> (in0, out, numReps);""".format(
-                node.op_type
-            )
-        ]
+        ram_style = self.get_nodeattr("ram_style")
+        map_to_hls_ram_style = {
+            "auto": "ap_resource_dflt()",
+            "block": "ap_resource_bram()",
+            "distributed": "ap_resource_lutram()",
+            "ultra": "ap_resource_uram()",
+        }
+        hls_ram_style = map_to_hls_ram_style[ram_style]
+        hls_call = node.op_type
+        # check if the non-optimized ConvolutionInputGenerator variant is needed
+        k = self.get_nodeattr("ConvKernelDim")
+        stride = self.get_nodeattr("Stride")
+        if k % stride != 0:
+            hls_call += "_kernel_stride"
+
+        if self.get_nodeattr("depthwise") == 1:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}_dws<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                    OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
+        else:
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<ConvKernelDim1, IFMChannels1, Input_precision1, IFMDim1,
+                    OFMDim1, SIMD1, Stride1> (in0, out, numReps, {});""".format(
+                    hls_call, hls_ram_style
+                )
+            ]
 
     def dataoutstrm(self):
         code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -356,17 +388,3 @@ class ConvolutionInputGenerator(HLSCustomOp):
         self.code_gen_dict["$PRAGMAS$"].append(
             "#pragma HLS INTERFACE ap_ctrl_none port=return"
         )
-
-    def ipgen_extra_directives(self):
-        # add directive to control input buffer memory resources
-        ram_style = self.get_nodeattr("ram_style")
-        map_to_hls_ram_style = {
-            "auto": "RAM_2P",
-            "block": "RAM_2P_BRAM",
-            "distributed": "RAM_2P_LUTRAM",
-            "ultra": "RAM_2P_URAM",
-        }
-        hls_ram_style = map_to_hls_ram_style[ram_style]
-        directive = "set_directive_resource -core %s " % hls_ram_style
-        directive += "ConvolutionInputGenerator inputBuf"
-        return [directive]
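A numpy sketch of the relationship between the two SWG output layouts described at the top of this file (sizes are arbitrary examples); the updated sliding-window test performs the equivalent reshape/transpose on its golden reference:

```python
import numpy as np

ofm_dim, k, ifm_ch, simd = 4, 3, 8, 2  # arbitrary example sizes
# non-depthwise SWG layout: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD)
x = np.arange(ofm_dim * ofm_dim * k * k * ifm_ch).reshape(
    1, ofm_dim, ofm_dim, k, k, ifm_ch // simd, simd
)
# depthwise SWG layout: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
x_dws = x.transpose(0, 1, 2, 5, 3, 4, 6)
assert x_dws.shape == (1, ofm_dim, ofm_dim, ifm_ch // simd, k, k, simd)
```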
diff --git a/src/finn/custom_op/fpgadataflow/sameresize_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding.py
similarity index 86%
rename from src/finn/custom_op/fpgadataflow/sameresize_batch.py
rename to src/finn/custom_op/fpgadataflow/fmpadding.py
index c459cac1e9c17336200a1fc85aad2af5e14e2c61..fa321dfa65d14b67fa218fb6a49f602ddab8d57e 100644
--- a/src/finn/custom_op/fpgadataflow/sameresize_batch.py
+++ b/src/finn/custom_op/fpgadataflow/fmpadding.py
@@ -6,27 +6,40 @@ from finn.custom_op.fpgadataflow import HLSCustomOp
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
-class SameResize_Batch(HLSCustomOp):
-    """Class that corresponds to finn-hlslib SameResize function.
-    Implements 'same' padding on a given input image."""
+class FMPadding_Batch(HLSCustomOp):
+    """Corresponds to finn-hlslib FMPadding_Batch function.
+    Pads input image by given amount."""
 
     def __init__(self, onnx_node):
         super().__init__(onnx_node)
 
     def get_nodeattr_types(self):
         my_attrs = {
+            # spatial size of input images
             "ImgDim": ("i", True, 0),
-            "KernelDim": ("i", True, 0),
-            "Stride": ("i", True, 0),
+            # total padding (per dimension) to apply
+            "Padding": ("i", True, 2),
+            # number of channels in input image
             "NumChannels": ("i", True, 0),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
-            # distribution of added values to achieve "same" padding
-            "PaddingStyle": ("i", True, 2),
+            # controls the distribution of padded pixels in case of
+            # uneven padding -- see the FMPadding fxn
+            # in hlslib
+            "PaddingStyle": ("i", False, 2),
+            # number of input vectors per execution (maps to numReps)
+            "numInputVectors": ("i", False, 1),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+
+        idim = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        return idim + pad
+
     def get_normal_input_shape(self):
         idim = self.get_nodeattr("ImgDim")
         num_ch = self.get_nodeattr("NumChannels")
@@ -35,14 +48,8 @@ class SameResize_Batch(HLSCustomOp):
         return ishape
 
     def get_normal_output_shape(self):
-        idim = self.get_nodeattr("ImgDim")
+        odim = self.get_padded_odim()
         num_ch = self.get_nodeattr("NumChannels")
-        kdim = self.get_nodeattr("KernelDim")
-        stride = self.get_nodeattr("Stride")
-        assert idim % stride == 0, "Stride must divide input dimension."
-        # number of "same" windows over the input data
-        same_windows = idim // stride
-        odim = kdim + stride * (same_windows - 1)
 
         oshape = (1, odim, odim, num_ch)
         return oshape
@@ -87,7 +94,7 @@ class SameResize_Batch(HLSCustomOp):
         # data type stays the same
         dtype = model.get_tensor_datatype(node.input[0])
         exp_idtype = self.get_input_datatype()
-        assert dtype == exp_idtype, "Unexpected datatype for SameResize_Batch"
+        assert dtype == exp_idtype, "Unexpected datatype for FMPadding_Batch"
         model.set_tensor_datatype(node.output[0], dtype)
 
     def verify_node(self):
@@ -96,9 +103,9 @@ class SameResize_Batch(HLSCustomOp):
     def get_input_datatype(self):
         """Returns FINN DataType of input."""
         ret = DataType[self.get_nodeattr("inputDataType")]
-        # the hlslib op always pads with zeroes, so ensure that the DataType
-        # is able to represent zeroes
-        assert ret.allowed(0), "SameResize_Batch DataType must support zero"
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
         return ret
 
     def get_output_datatype(self):
@@ -125,18 +132,16 @@ class SameResize_Batch(HLSCustomOp):
         self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
 
     def defines(self, var):
-        numReps = 1
-        assert self.get_nodeattr("PaddingStyle") == 2, "Only PaddingStyle=2 supported"
         self.code_gen_dict["$DEFINES$"] = [
-            """#define ImgDim1 {}\n #define KernelDim1 {}\n
-            #define Stride1 {}\n #define NumChannels1 {}\n
-            #define PaddingStyle1 {}\n #define numReps {}""".format(
+            """#define ImgDim1 {}\n#define OutputDim1 {}\n
+            #define Padding1 {}\n#define NumChannels1 {}\n
+            #define PaddingStyle1 {}\n#define numReps {}\n""".format(
                 self.get_nodeattr("ImgDim"),
-                self.get_nodeattr("KernelDim"),
-                self.get_nodeattr("Stride"),
+                self.get_padded_odim(),
+                self.get_nodeattr("Padding"),
                 self.get_nodeattr("NumChannels"),
                 self.get_nodeattr("PaddingStyle"),
-                numReps,
+                self.get_nodeattr("numInputVectors"),
             )
         ]
 
@@ -171,8 +176,8 @@ class SameResize_Batch(HLSCustomOp):
         in_t = self.get_input_datatype().get_hls_datatype_str()
         node = self.onnx_node
         self.code_gen_dict["$DOCOMPUTE$"] = [
-            """{}<ImgDim1, KernelDim1, Stride1, NumChannels1,
-                {}, PaddingStyle1> (in0, out, numReps);""".format(
+            """{}<ImgDim1, OutputDim1, Padding1, NumChannels1,
+            {}, PaddingStyle1> (in0, out, numReps);""".format(
                 node.op_type, in_t
             )
         ]
@@ -261,8 +266,7 @@ class SameResize_Batch(HLSCustomOp):
             super().npy_to_dynamic_output(context)
             assert (
                 context[node.output[0]].shape == folded_oshape
-            ), "cppsim \
-            did not produce expected ofolded utput shape"
+            ), "cppsim did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
             sim = self.get_rtlsim()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index f650442401b49f1ad0a602b6b2ad3e50fbb5e5c2..9b73ba1e100aa83fd19aa8799195c99891fca3fd 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -513,40 +513,44 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         elif mem_mode == "decoupled":
             """Saves weights in corresponding file format for cppsim or rtlsim"""
             # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD)
-            # and save as unflipped weight tensor to be able to differentiate between
-            # flipped an unflipped weight tensor (has to be flipped for cppsim)
-
             weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
 
-            # flip PE dimension and reverse SIMD flip for saving weights in .npy
-            weight_tensor_flipped = np.flip(weight_tensor_unflipped, axis=-2)
-            weight_tensor_flipped = np.flip(weight_tensor_flipped, axis=-1)
+            # reverse SIMD flip for saving weights in .npy
+            weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1)
+            # PE flip for saving weights in .dat
+            weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2)
 
-            # reshape weight tensor (flipped and unflipped) to desired shape
+            # reshape weight tensor (simd_flipped and pe_flipped) to desired shape
             pe = self.get_nodeattr("PE")
             simd = self.get_nodeattr("SIMD")
-            # unflipped
-            weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd)
-            weight_tensor_unflipped = weight_tensor_unflipped.copy()
+            # simd_flipped
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy()
             # flipped
-            weight_tensor_flipped = weight_tensor_flipped.reshape(1, -1, pe * simd)
-            weight_tensor_flipped = weight_tensor_flipped.copy()
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(
+                1, -1, pe * simd
+            )
+            weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy()
 
             """Saves weights into .npy file"""
-            np.save(os.path.join(code_gen_dir, "weights.npy"), weight_tensor_flipped)
+            np.save(
+                os.path.join(code_gen_dir, "weights.npy"), weight_tensor_simd_flipped
+            )
 
             """Saves weights into .dat file"""
             # convert weight values into hexstring
             weight_width = self.get_weightstream_width()
             # pad to nearest 4 bits to get hex strings
             weight_width_padded = roundup_to_integer_multiple(weight_width, 4)
-            weight_tensor_unflipped = pack_innermost_dim_as_hex_string(
-                weight_tensor_unflipped, export_wdt, weight_width_padded, prefix=""
+            weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string(
+                weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix=""
             )
-            weight_stream_len = np.prod(weight_tensor_unflipped.shape)
+            weight_stream_len = np.prod(weight_tensor_pe_flipped.shape)
             factor = math.ceil(weight_stream_len / 1024)
             # add zeroes to pad out file to 1024 entries
-            weight_stream = weight_tensor_unflipped.flatten()
+            weight_stream = weight_tensor_pe_flipped.flatten()
             pad_amt = (factor * 1024) - weight_stream_len
             weight_stream = np.pad(
                 weight_stream, (0, pad_amt), mode="constant", constant_values="0"
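A small numpy sketch of the two flip variants this hunk separates (folding factors are illustrative): from the transposed (1, WMEM, PE, SIMD) tensor, the .npy for cppsim needs the SIMD dimension reversed, while the .dat memory initialization needs the PE dimension reversed:

```python
import numpy as np

pe, wmem, simd = 2, 4, 3  # illustrative folding factors
weight_tensor = np.arange(pe * wmem * simd).reshape(1, pe, wmem, simd)
# (1, PE, WMEM, SIMD) -> (1, WMEM, PE, SIMD)
unflipped = np.transpose(weight_tensor, (0, 2, 1, 3))
simd_flipped = np.flip(unflipped, axis=-1)  # saved to weights.npy (cppsim)
pe_flipped = np.flip(unflipped, axis=-2)    # packed as hex into the .dat file
assert simd_flipped.shape == pe_flipped.shape == (1, wmem, pe, simd)
```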
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 66190333ce8d71dafba99aaeae4fb2c973d67410..1f734b548f923341687843c538d1887fcc069bee 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -110,6 +110,8 @@ class StreamingFIFO(HLSCustomOp):
         ]
         # make instream width a multiple of 8 for axi interface
         in_width = self.get_instream_width_padded()
+        count_width = int(self.get_nodeattr("depth") - 1).bit_length()
+        self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)]
         self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)]
         self.code_gen_dict["$WIDTH$"] = [str(in_width)]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 5f526aa2aa1917144c7a048c9d9314aa9288a2d8..1a8216f64bf71b7fb9f1f8becf4732970b5bf451 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -408,6 +408,7 @@ strm_fifo_wrapper = """
 module $TOPNAME$(
 ap_clk,
 ap_rst_n,
+count,
 in0_V_V_TDATA,
 in0_V_V_TVALID,
 in0_V_V_TREADY,
@@ -418,6 +419,7 @@ out_V_V_TREADY
 
 input   ap_clk;
 input   ap_rst_n;
+output $COUNT_RANGE$ count;
 input  $IN_RANGE$ in0_V_V_TDATA;
 input   in0_V_V_TVALID;
 output   in0_V_V_TREADY;
@@ -433,6 +435,7 @@ $LAYER_NAME$
 (
  .clock(ap_clk),
  .reset(!ap_rst_n),
+ .count(count),
  .i_d(in0_V_V_TDATA),
  .i_v(in0_V_V_TVALID),
  .i_r(in0_V_V_TREADY),
diff --git a/src/finn/custom_op/im2col.py b/src/finn/custom_op/im2col.py
index 16446c15d46ee7996162f864708f7fde6cfedaf3..82a6b140f7af1be4e5c0f429d077b99c7865383e 100644
--- a/src/finn/custom_op/im2col.py
+++ b/src/finn/custom_op/im2col.py
@@ -21,8 +21,6 @@ def get_im2col_indices_nchw(
     """Returns im2col indices."""
     # First figure out what the size of the output should be
     N, C, H, W = x_shape
-    assert (H + 2 * padding - field_height) % stride_y == 0
-    assert (W + 2 * padding - field_width) % stride_x == 0
     out_height = compute_conv_output_dim(H, field_height, stride_y, padding)
     out_width = compute_conv_output_dim(W, field_width, stride_x, padding)
 
@@ -70,6 +68,9 @@ def im2col_indices_nchw(
 # * ifm is the number of input channels
 # * k is the convolutional kernel size
 
+# note: for the innermost (dot product) dimension of k*k*ifm, we
+# assume an internal ordering (k, k, ifm)
+
 
 class Im2Col(CustomOp):
     def get_nodeattr_types(self):
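To make the new ordering note concrete, a tiny sketch enumerating the assumed (k, k, ifm) ordering of the innermost dot-product dimension (sizes are illustrative):

```python
import itertools

k, ifm = 2, 2  # illustrative sizes
inner_order = list(itertools.product(range(k), range(k), range(ifm)))
# [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1),
#  (1, 0, 0), (1, 0, 1), (1, 1, 0), (1, 1, 1)]
# i.e. the input channel index varies fastest, then kernel x, then kernel y
```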
diff --git a/src/finn/custom_op/registry.py b/src/finn/custom_op/registry.py
index 238829e03353d79fab7c51e7d1b9dca6e2a96a11..614a3d7ffd70d0b102bad2b76177a2d3b32765c7 100644
--- a/src/finn/custom_op/registry.py
+++ b/src/finn/custom_op/registry.py
@@ -44,7 +44,7 @@ from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
     StreamingDataWidthConverter_Batch,
 )
 from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.sameresize_batch import SameResize_Batch
+from finn.custom_op.fpgadataflow.fmpadding import FMPadding_Batch
 from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
 from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
 from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
@@ -65,7 +65,7 @@ custom_op["MaxPoolNHWC"] = MaxPoolNHWC
 custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
 custom_op["StreamingFIFO"] = StreamingFIFO
 custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
-custom_op["SameResize_Batch"] = SameResize_Batch
+custom_op["FMPadding_Batch"] = FMPadding_Batch
 custom_op["Thresholding_Batch"] = Thresholding_Batch
 custom_op["AddStreams_Batch"] = AddStreams_Batch
 custom_op["LabelSelect_Batch"] = LabelSelect_Batch
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
index 3ff86cab48d365c10e69bc2c764e8083c6a36880..d421a5f3ef8ca980b399087de1482b2ae913da1b 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py
@@ -26,7 +26,7 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-from onnx import helper
+from onnx import helper, TensorProto
 
 from finn.core.datatype import DataType
 from finn.transformation import Transformation
@@ -59,27 +59,61 @@ class InferConvInpGen(Transformation):
                 ifm_ch = i2c_in_shape[-1]
                 ifm_dim = i2c_in_shape[1]
                 ofm_dim = i2c_out_shape[1]
-                # if padding enabled, ensure pad_val supported by DataType
+
+                # default params for ConvolutionInputGenerator
+                ConvInpGen_node_idx = node_ind
+                ConvInpGen_input = i2c_input
+                ConvInpGen_idim = ifm_dim
+
                 if pad > 0:
+                    # if padding enabled, ensure pad_val supported by DataType
                     assert dt.allowed(pad_val), "Im2Col DataType must support pad_val"
+
+                    odim_padding = ifm_dim + 2 * pad
+
+                    padding_out = helper.make_tensor_value_info(
+                        model.make_new_valueinfo_name(),
+                        TensorProto.FLOAT,
+                        (1, odim_padding, odim_padding, ifm_ch),
+                    )
+                    graph.value_info.append(padding_out)
+                    padding_out = padding_out.name
+                    model.set_tensor_datatype(padding_out, dt)
+
+                    ConvInpGen_node_idx += 1
+                    ConvInpGen_input = padding_out
+                    ConvInpGen_idim = odim_padding
+
+                    padding_node = helper.make_node(
+                        "FMPadding_Batch",
+                        [i2c_input],
+                        [padding_out],
+                        domain="finn",
+                        backend="fpgadataflow",
+                        ImgDim=ifm_dim,
+                        Padding=2 * pad,
+                        NumChannels=ifm_ch,
+                        inputDataType=dt.name,
+                    )
+                    graph.node.insert(node_ind, padding_node)
+
                 # create equivalent ConvolutionInputGenerator node
-                # TODO support padding
-                new_node = helper.make_node(
+                ConvInpGen_node = helper.make_node(
                     "ConvolutionInputGenerator",
-                    [i2c_input],
+                    [ConvInpGen_input],
                     [i2c_output],
                     domain="finn",
                     backend="fpgadataflow",
                     ConvKernelDim=k,
                     IFMChannels=ifm_ch,
-                    IFMDim=ifm_dim,
+                    IFMDim=ConvInpGen_idim,
                     OFMDim=ofm_dim,
                     SIMD=ifm_ch,
                     Stride=stride,
                     inputDataType=dt.name,
                     outputDataType=dt.name,
                 )
-                graph.node.insert(node_ind, new_node)
+                graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node)
                 # remove old nodes
                 graph.node.remove(n)
                 graph_modified = True
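With this change, a padded Im2Col lowers to an FMPadding_Batch feeding a ConvolutionInputGenerator. A sketch of the two nodes the transform emits, with illustrative values (3x3 kernel, pad 1, 28x28 8-channel UINT4 input; tensor names made up):

```python
from onnx import helper

ifm_dim, pad, ifm_ch, k, stride = 28, 1, 8, 3, 1  # illustrative values
padding_node = helper.make_node(
    "FMPadding_Batch", ["im2col_inp"], ["padded"],
    domain="finn", backend="fpgadataflow",
    ImgDim=ifm_dim, Padding=2 * pad, NumChannels=ifm_ch, inputDataType="UINT4",
)
convinpgen_node = helper.make_node(
    "ConvolutionInputGenerator", ["padded"], ["im2col_out"],
    domain="finn", backend="fpgadataflow",
    ConvKernelDim=k, IFMChannels=ifm_ch,
    IFMDim=ifm_dim + 2 * pad,  # the SWG sees the padded image
    OFMDim=28, SIMD=ifm_ch, Stride=stride,
    inputDataType="UINT4", outputDataType="UINT4",
)
```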
diff --git a/src/finn/transformation/general.py b/src/finn/transformation/general.py
index f51ffbcfd9f62e06bf4942409fbb163e92ff6370..488391740fc25f1f7caa657adc9ed55bdc2f9722 100644
--- a/src/finn/transformation/general.py
+++ b/src/finn/transformation/general.py
@@ -28,6 +28,7 @@
 
 import finn.util.basic as util
 from finn.transformation import Transformation
+from toposort import toposort_flatten
 
 
 class GiveUniqueNodeNames(Transformation):
@@ -104,11 +105,13 @@ class GiveUniqueParameterTensors(Transformation):
                     # first occurance
                     seen_parameters += [node_input]
                     continue
-                    
+
                 new_param_name = model.make_new_valueinfo_name()
 
                 model.set_initializer(new_param_name, input_init)
-                model.set_tensor_datatype(new_param_name, model.get_tensor_datatype(node_input))
+                model.set_tensor_datatype(
+                    new_param_name, model.get_tensor_datatype(node_input)
+                )
 
                 # point node input to new tensor
                 n.input[input_idx] = new_param_name
@@ -116,6 +119,56 @@ class GiveUniqueParameterTensors(Transformation):
         return (model, graph_modified)
 
 
+class SortGraph(Transformation):
+    """Returns the model with its node list sorted topologically.
+    Any ONNX graph to be executed must have a topologically sorted node
+    list, as dictated by the ONNX standard.
+    """
+
+    # Notes on SortGraph performance:
+    # benchmarked in tests/transformation/test_sort_graph.py
+    #
+    # The algorithm doesn't move initializers, so its runtime should
+    # depend only on the number of nodes.
+    #
+    # Relative order of magnitude for time per step:
+    #   - gather graph structure:      base
+    #   - sort nodes:                  0.1 of base
+    #   - remove and insert in order:  0.001 of base
+    #
+    # Removing the nodes and re-inserting them in sorted order is likely
+    # faster than copying initializers around, and more robust in
+    # general.
+
+    def apply(self, model):
+        # Gather graph structure
+        graph_dependencies = {}
+        # keep a copy of the node list; it is needed again below
+        # when removing the nodes from the graph
+        node_list = [n for n in model.graph.node]
+        for node_idx, n in enumerate(node_list):
+            node_pred = model.find_direct_predecessors(n)
+            if node_pred is None:
+                # also drops nodes that are disconnected from the graph
+                continue
+
+            node_dependencies = [node_list.index(pred) for pred in node_pred]
+            graph_dependencies[node_idx] = set(node_dependencies)
+
+        # Sort nodes
+        sorted_node_indexes = toposort_flatten(graph_dependencies)
+
+        # Remove the nodes and insert them in sorted order. Nodes cannot be
+        # removed earlier: model.find_direct_predecessors() needs the graph.
+        for n in node_list:
+            model.graph.node.remove(n)
+
+        for new_idx, sorted_idx in enumerate(sorted_node_indexes):
+            model.graph.node.insert(new_idx, node_list[sorted_idx])
+
+        return model, False
+
+
 class ConvertSubToAdd(Transformation):
     """Convert subtract-a-constant nodes to add-a-constant nodes."""
 
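The dependency dict handed to toposort_flatten maps each node index to the set of node indices it depends on; a standalone sketch of that sorting step, plus the usual way to apply the transform:

```python
from toposort import toposort_flatten

# node 1 consumes node 0's output; node 2 consumes outputs of nodes 0 and 1
deps = {1: {0}, 2: {0, 1}}
assert toposort_flatten(deps) == [0, 1, 2]

# typical usage on a ModelWrapper instance:
# model = model.transform(SortGraph())
```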
diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py
index 3880bb9591e27af5fe9d063dba2485d304e4db54..d3bfb73fe239d7194fab3760555663895a209e84 100644
--- a/src/finn/util/basic.py
+++ b/src/finn/util/basic.py
@@ -56,6 +56,12 @@ def get_rtlsim_trace_depth():
     via the RTLSIM_TRACE_DEPTH environment variable. If the env.var. is
     undefined, the default value of 1 is returned. A trace depth of 1
     will only show top-level signals and yield smaller .vcd files.
+
+    The following depth values are of interest for whole-network stitched IP
+    rtlsim:
+    - level 1 shows top-level input/output streams
+    - level 2 shows per-layer input/output streams
+    - level 3 shows full per-layer I/O, including FIFO count signals
     """
 
     try:
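How the deeper trace depth is requested in practice, mirroring the updated end2end test; enable_deep_rtlsim_trace is a hypothetical helper name, and the env var must be set before the stitched-IP sim is built:

```python
import os

def enable_deep_rtlsim_trace(model, vcd_path):
    """Hypothetical helper: request per-layer I/O plus FIFO count signals
    (depth 3) in the .vcd written during stitched-IP rtlsim."""
    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
    model.set_metadata_prop("rtlsim_trace", vcd_path)
```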
diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py
index d1669444e55cb0fddb2690e51849c4603d47d32c..3fe747a84985b2702ffb1e5855d9071362efebda 100644
--- a/src/finn/util/fpgadataflow.py
+++ b/src/finn/util/fpgadataflow.py
@@ -104,6 +104,7 @@ def pyverilate_stitched_ip(model):
         build_dir=build_dir,
         trace_depth=get_rtlsim_trace_depth(),
         top_module_name=top_module_name,
+        auto_eval=False,
     )
     return sim
 
diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9e244422065314ceb790dc6719b57688ff76828
--- /dev/null
+++ b/src/finn/util/vcd.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2020, Xilinx
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from vcdvcd import VCDVCD
+from finn.util.basic import get_num_default_workers
+import multiprocessing as mp
+
+# string patterns to search for to find particular interfaces
+# streaming interfaces
+vname = "TVALID"
+rname = "TREADY"
+# FIFO count signals
+fifo_mod_name = "StreamingFIFO"
+fifo_cname = "count"
+
+
+def list_stream_if(vcd_file):
+    "Return a list of stream  interface names from given vcd trace."
+
+    sig_names = VCDVCD(vcd_file, print_dumps=False, only_sigs=True).get_signals()
+    stream_if_names = []
+    for cand_name in filter(lambda x: x.endswith(vname), sig_names):
+        base_name = cand_name.replace(vname, "")
+        if base_name + rname in sig_names:
+            stream_if_names.append(base_name)
+    return stream_if_names
+
+
+def list_fifo_count_signals(vcd_file):
+    "Return a list of FIFO count signal names from given vcd trace."
+
+    sig_names = VCDVCD(vcd_file, print_dumps=False, only_sigs=True).get_signals()
+    fifo_cnt_names = []
+    for cand_name in filter(lambda x: fifo_cname in x, sig_names):
+        if fifo_mod_name in cand_name:
+            fifo_cnt_names.append(cand_name)
+    return fifo_cnt_names
+
+
+def get_fifo_count_max(vcd_file, fifo_count_signal):
+    "Return the maximum value of the given FIFO count signal in vcd trace."
+
+    d = VCDVCD(vcd_file, signals=[fifo_count_signal], store_tvs=True).get_data()
+    assert len(d) != 0, "FIFO count signal not found"
+    events = list(d.values())[0]["tv"]
+    max_val = 0
+    for (time, val) in events:
+        current = int(val, base=2)
+        if current > max_val:
+            max_val = current
+    return max_val
+
+
+def _get_fifo_max(x):
+    return (x[0], get_fifo_count_max(x[1], x[0]))
+
+
+def get_all_fifo_count_max(vcd_file, fifo_count_signals=None):
+    """Return a list of max FIFO counts. If fifo_count_signals is None,
+    all FIFO count signals will be returned, otherwise treated as a list of
+    signal names to return the stats for."""
+    if fifo_count_signals is None:
+        fifo_count_signals = list_fifo_count_signals(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        fifo_count_signals = map(lambda x: (x, vcd_file), fifo_count_signals)
+        all_stats = p.map(_get_fifo_max, fifo_count_signals)
+
+    return all_stats
+
+
+def get_stream_if_stats(vcd_file, if_base_name):
+    """Return statistics for given streaming interface in vcd trace in the
+    following dict format:
+
+    <stream_state>: (<num_samples>, <fraction_of_time>),
+
+    where <stream_state> is the combination of (V)alid/(R)eady values,
+    <num_samples> is the approximate number of rising clock edges spent in
+    <state>, and <fraction_of_time> is the fraction of <num_samples> over the
+    total amount of time recorded by the trace.
+
+    Example:
+    {"{'V': 0, 'R': 0}": (5, 0.0006060606060606061),
+     "{'V': 1, 'R': 0}": (0, 0.0),
+     "{'V': 0, 'R': 1}": (7605, 0.9218181818181819),
+     "{'V': 1, 'R': 1}": (640, 0.07757575757575758)}
+
+    Here we can see the stream was transmitting values 7.7% of the time,
+    and there was no incoming data (valid 0, ready 1) 92.2% of the time.
+    """
+    if_valid = if_base_name + vname
+    if_ready = if_base_name + rname
+    v = VCDVCD(vcd_file, signals=[if_valid], store_tvs=True)
+    endtime = v.get_endtime()
+    v = v.get_data()
+    assert len(v) != 0, "Streaming interface not found"
+    v = list(v.values())[0]["tv"]
+    v = list(map(lambda x: ("V", x[0], x[1]), v))
+    v.append(("V", endtime, "0"))
+    r = VCDVCD(vcd_file, signals=[if_ready], store_tvs=True).get_data()
+    assert len(r) != 0, "Streaming interface not found"
+    r = list(r.values())[0]["tv"]
+    r = list(map(lambda x: ("R", x[0], x[1]), r))
+    r.append(("R", endtime, "0"))
+    events = sorted(v + r, key=lambda x: x[1])
+    ret = {
+        "{'V': 0, 'R': 0}": 0,
+        "{'V': 1, 'R': 0}": 0,
+        "{'V': 0, 'R': 1}": 0,
+        "{'V': 1, 'R': 1}": 0,
+    }
+    status = {"V": 0, "R": 0}
+    last_time = 0
+    total_rising_clock_edges = 0
+    for (sig, time, val) in events:
+        # pyverilator generates 5 time units per sample
+        time = time / 5
+        # pyverilator generates 4 samples per clock period
+        n_rising_clock_edges = int((time - last_time) / 4)
+        # note that the calculation of n_rising_clock_edges is approximate
+        # doing this exactly would require a cycle-by-cycle walkthrough of the
+        # trace, which can take very long
+        ret[str(status)] += n_rising_clock_edges
+        total_rising_clock_edges += n_rising_clock_edges
+        status[sig] = int(val)
+        last_time = time
+
+    for state in ret:
+        v = ret[state]
+        ret[state] = (v, v / total_rising_clock_edges)
+
+    return ret
+
+
+def _get_stats(x):
+    return (x[0], get_stream_if_stats(x[1], x[0]))
+
+
+def get_all_stream_if_stats(vcd_file, stream_ifs=None, sort_by="{'V': 1, 'R': 0}"):
+    """Return a list of streaming interface stats, sorted by the percentage
+    for the given sort_by key. If stream_ifs is None, all streaming interface
+    stats will be returned, otherwise treated as a list of interface names to
+    return the stats for."""
+
+    if stream_ifs is None:
+        stream_ifs = list_stream_if(vcd_file)
+
+    with mp.Pool(get_num_default_workers()) as p:
+        stream_ifs = map(lambda x: (x, vcd_file), stream_ifs)
+        all_stats = p.map(_get_stats, stream_ifs)
+
+    def sort_key(x):
+        stat = x[1]
+        (samples, percent) = stat[sort_by]
+        return percent
+
+    ret = sorted(all_stats, key=sort_key)
+    return ret
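Typical use of the new vcd helpers on a stitched-IP trace, mirroring the end2end test below (the path is an example):

```python
import finn.util.vcd as vcd

vcdf = "/tmp/tfc_w1a1.vcd"  # example path from an rtlsim run with tracing on
stream_ifs = vcd.list_stream_if(vcdf)        # all AXI stream interface names
fifo_max = vcd.get_all_fifo_count_max(vcdf)  # (signal, max occupancy) pairs
# interfaces sorted by time spent in (valid=1, ready=0), i.e. back-pressure
if_stats = vcd.get_all_stream_if_stats(vcdf, sort_by="{'V': 1, 'R': 0}")
```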
diff --git a/tests/core/test_modelwrapper.py b/tests/core/test_modelwrapper.py
index 4bd9385536bc6721c66726169dfa4c69e5f06772..5fa9b23bad5c5b67f65530c55f862f889c07b1ac 100644
--- a/tests/core/test_modelwrapper.py
+++ b/tests/core/test_modelwrapper.py
@@ -73,6 +73,11 @@ def test_modelwrapper():
     inp_layout = DataLayout.NCHW
     model.set_tensor_layout(inp_name, inp_layout)
     assert model.get_tensor_layout(inp_name) == inp_layout
+    inp_sparsity = model.get_tensor_sparsity(inp_name)
+    assert inp_sparsity is None
+    inp_sparsity = {"dw": {"kernel_shape": 3}}
+    model.set_tensor_sparsity(inp_name, inp_sparsity)
+    assert model.get_tensor_sparsity(inp_name) == inp_sparsity
     os.remove(export_onnx_path)
 
 
diff --git a/tests/end2end/test_end2end_tfc_w1a1.py b/tests/end2end/test_end2end_tfc_w1a1.py
index 354e8d88931f758a57231ebcf3564046cc0f3ab9..f9bd408ebac8a011eb0c461d7f8e48b5cc76be86 100644
--- a/tests/end2end/test_end2end_tfc_w1a1.py
+++ b/tests/end2end/test_end2end_tfc_w1a1.py
@@ -73,6 +73,7 @@ from finn.util.test import get_test_model_trained, load_test_checkpoint_or_skip
 from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources
 from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
 from finn.core.throughput_test import throughput_test_rtlsim
+import finn.util.vcd as vcd
 
 build_dir = "/tmp/" + os.environ["FINN_INST_NAME"]
 test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1")
@@ -198,6 +199,8 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     res_rtlsim_nodebynode = ret_rtlsim_nodebynode[out_name]
     # whole-network (ip-stitched) rtlsim
     model.set_metadata_prop("exec_mode", "rtlsim")
+    model.set_metadata_prop("rtlsim_trace", build_dir + "/tfc_w1a1.vcd")
+    os.environ["RTLSIM_TRACE_DEPTH"] = "3"
     model.save(build_dir + "/end2end_tfc_w1a1_ipstitch_whole_rtlsim.onnx")
     ret_rtlsim_whole = execute_onnx(model, inp_dict, True)
     res_rtlsim_whole = ret_rtlsim_whole[out_name]
@@ -205,6 +208,24 @@ def test_end2end_tfc_w1a1_verify_dataflow_part():
     assert np.isclose(res_cppsim, res_rtlsim_whole).all()
 
 
+def test_end2end_tfc_w1a1_verify_fifo_fullness():
+    vcdf = build_dir + "/tfc_w1a1.vcd"
+    if not os.path.isfile(vcdf):
+        pytest.skip("Cannot find %s, skipping" % vcdf)
+    stream_ifs = vcd.list_stream_if(vcdf)
+    fifos = vcd.list_fifo_count_signals(vcdf)
+    assert len(stream_ifs) == 37
+    assert len(fifos) == 6
+    fifo_max = vcd.get_all_fifo_count_max(vcdf)
+    assert fifo_max[0][0] == "TOP.v.finn_design_i.StreamingFIFO_0.count[3:0]"
+    assert fifo_max[0][1] == 3
+    stream_stat = vcd.get_all_stream_if_stats(vcdf)
+    assert (
+        stream_stat[0][0]
+        == "TOP.v.finn_design_i.StreamingDataWidthConverter_Batch_0_out_V_V_"
+    )
+
+
 @pytest.mark.vivado
 def test_end2end_tfc_w1a1_throughput_test_rtlsim():
     model = load_test_checkpoint_or_skip(
diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee65326ec57fb7fa7fa0490a8980dbabb8efc13c
--- /dev/null
+++ b/tests/fpgadataflow/test_convert_to_hls_conv_layer.py
@@ -0,0 +1,106 @@
+from onnx import TensorProto, helper
+import numpy as np
+import pytest
+
+from finn.core.datatype import DataType
+from finn.transformation.infer_shapes import InferShapes
+from finn.transformation.infer_datatypes import InferDataTypes
+from finn.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames
+from finn.transformation.infer_data_layouts import InferDataLayouts
+from finn.transformation.lower_convs_to_matmul import LowerConvsToMatMul
+
+import finn.core.onnx_exec as oxe
+from finn.core.modelwrapper import ModelWrapper
+from finn.util.basic import gen_finn_dt_tensor
+import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+
+from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
+from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim
+from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
+
+
+@pytest.mark.parametrize("padding", [True, False])
+@pytest.mark.parametrize("kernel_size", [3, 5])
+@pytest.mark.slow
+@pytest.mark.vivado
+def test_convert_to_hls_conv_layer(padding, kernel_size):
+
+    assert (
+        kernel_size % 2 != 0
+    ), """test_convert_to_hls_conv_layer test only
+    supports odd kernel_size"""
+
+    np.random.seed(0)
+    idt = DataType.UINT4
+
+    in_feature_dim = 7
+    in_chn = 3
+
+    stages = 1  # just one convolution
+
+    out_feature_dim = (
+        in_feature_dim if padding else in_feature_dim - (kernel_size // 2 * 2) * stages
+    )
+
+    input_shape = [1, in_chn, in_feature_dim, in_feature_dim]
+    output_shape = [1, in_chn, out_feature_dim, out_feature_dim]
+
+    conv_param_shape = [in_chn, in_chn, kernel_size, kernel_size]
+
+    conv_config = {}
+    conv_config["dilations"] = [1, 1]
+    conv_config["group"] = 1
+    conv_config["kernel_shape"] = [kernel_size, kernel_size]
+    if padding:
+        pad = kernel_size // 2
+        conv_config["pads"] = [pad, pad, pad, pad]
+    else:
+        conv_config["pads"] = [0, 0, 0, 0]
+    conv_config["strides"] = [1, 1]
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape)
+    value_info = [
+        helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)
+    ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="conv_test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)
+            ],
+        )
+    )
+
+    model = ModelWrapper(modelproto)
+    model.set_tensor_datatype("top_in", idt)
+    model.set_tensor_datatype("top_out", idt)
+    model.set_tensor_datatype("p1", DataType.UINT4)
+
+    model = model.transform(InferShapes())
+    model.set_initializer(
+        "p1", np.round(np.random.rand(*conv_param_shape).astype(np.float32) * 16)
+    )
+
+    model.set_tensor_datatype(model.graph.input[0].name, idt)
+    model = model.transform(InferShapes())
+    model = model.transform(InferDataLayouts())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(GiveReadableTensorNames())
+    model = model.transform(InferDataTypes())
+
+    new_model = model.transform(LowerConvsToMatMul())
+    new_model = new_model.transform(to_hls.InferConvInpGen())
+
+    new_model = new_model.transform(PrepareCppSim())
+    new_model = new_model.transform(CompileCppSim())
+    new_model = new_model.transform(SetExecMode("cppsim"))
+
+    x = gen_finn_dt_tensor(idt, input_shape)
+    inp_dict = {model.graph.input[0].name: x}
+    assert oxe.compare_execution(model, new_model, inp_dict)
diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
index 5051bf34dc690daf8b6186859d3717cc8e217eee..b5fc85caf274edc9e7afc52df962862fa8a99ba3 100644
--- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
+++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py
@@ -78,7 +78,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, simd, stride, i
 
 
 def make_single_slidingwindow_modelwrapper(
-    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+    k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw=0
 ):
     odt = idt
     inp = helper.make_tensor_value_info(
@@ -102,6 +102,7 @@ def make_single_slidingwindow_modelwrapper(
         Stride=stride,
         inputDataType=idt.name,
         outputDataType=odt.name,
+        depthwise=dw,
     )
     graph = helper.make_graph(
         nodes=[SlidingWindow_node],
@@ -126,25 +127,29 @@ def prepare_inputs(input_tensor):
 # input datatype
 @pytest.mark.parametrize("idt", [DataType.BIPOLAR, DataType.INT2])
 # kernel size
-@pytest.mark.parametrize("k", [2, 4])
+@pytest.mark.parametrize("k", [2, 3])
 # input dimension
-@pytest.mark.parametrize("ifm_dim", [4, 6, 8])
+@pytest.mark.parametrize("ifm_dim", [6, 8])
 # input channels
-@pytest.mark.parametrize("ifm_ch", [2, 4])  # , 2, 3, 4])
+@pytest.mark.parametrize("ifm_ch", [2, 4])
 # Stride
 @pytest.mark.parametrize("stride", [1, 2])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # input channel parallelism ("SIMD")
 @pytest.mark.parametrize("simd", [1, 2])
+# depthwise
+@pytest.mark.parametrize("dw", [0, 1])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd):
+def test_fpgadataflow_slidingwindow(
+    idt, k, ifm_dim, ifm_ch, stride, exec_mode, simd, dw
+):
     ofm_dim = int(((ifm_dim - k) / stride) + 1)
 
     x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch))
     model = make_single_slidingwindow_modelwrapper(
-        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
+        k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt, dw
     )
 
     if exec_mode == "cppsim":
@@ -168,6 +173,12 @@ def test_fpgadataflow_slidingwindow(idt, k, ifm_dim, ifm_ch, stride, exec_mode,
         k, ifm_ch, ifm_dim, ofm_dim, simd, stride, idt
     )
     y_expected = oxe.execute_onnx(golden, input_dict)["outp"]
-    # if idt == DataType.BIPOLAR:
-    #     y_expected = 2 * y_expected - 1
-    assert (y_produced == y_expected).all()
+    if dw == 0:
+        assert (y_produced == y_expected).all()
+    else:
+        y_expected = y_expected.reshape(
+            1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd
+        )
+        y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5)
+        y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k)
+        assert (y_produced == y_expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_sameresize.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
similarity index 75%
rename from tests/fpgadataflow/test_fpgadataflow_sameresize.py
rename to tests/fpgadataflow/test_fpgadataflow_fmpadding.py
index ea6130c3891443595b038460233ebb85799ac461..9d6390b2673e5d2c0e72748183ac04ed222d078e 100644
--- a/tests/fpgadataflow/test_fpgadataflow_sameresize.py
+++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py
@@ -23,9 +23,11 @@ test_fpga_part = pynq_part_map[test_pynq_board]
 target_clk_ns = 10
 
 
-def make_single_sameresize_modelwrapper(
-    idim, odim, kdim, stride, num_ch, idt, pad_style
-):
+def make_single_fmpadding_modelwrapper(idim, padding, num_ch, idt, pad_style):
+    assert pad_style == 2, "only pad_style == 2 supported in hlslib"
+    assert padding > 0, "Output dim should be greater than input dim"
+    odim = idim + padding
+
     inp = helper.make_tensor_value_info(
         "inp", TensorProto.FLOAT, [1, idim, idim, num_ch]
     )
@@ -33,25 +35,25 @@ def make_single_sameresize_modelwrapper(
         "outp", TensorProto.FLOAT, [1, odim, odim, num_ch]
     )
 
-    SameResize_node = helper.make_node(
-        "SameResize_Batch",
+    FMPadding = helper.make_node(
+        "FMPadding_Batch",
         ["inp"],
         ["outp"],
         domain="finn",
         backend="fpgadataflow",
         ImgDim=idim,
-        KernelDim=kdim,
-        Stride=stride,
+        Padding=padding,
         NumChannels=num_ch,
         inputDataType=str(idt.name),
         PaddingStyle=pad_style,
+        numInputVectors=1,
     )
 
     graph = helper.make_graph(
-        nodes=[SameResize_node], name="sameresize_graph", inputs=[inp], outputs=[outp]
+        nodes=[FMPadding], name="fmpadding_graph", inputs=[inp], outputs=[outp]
     )
 
-    model = helper.make_model(graph, producer_name="sameresize-model")
+    model = helper.make_model(graph, producer_name="fmpadding-model")
     model = ModelWrapper(model)
 
     model.set_tensor_datatype("inp", idt)
@@ -60,34 +62,28 @@ def make_single_sameresize_modelwrapper(
     return model
 
 
-# image dimension
+# input image dimension
 @pytest.mark.parametrize("idim", [8, 16])
-# kernel dimension
-@pytest.mark.parametrize("kdim", [2, 3])
-# stride
-@pytest.mark.parametrize("stride", [1, 2])
+# number of rows and number of cols to add
+@pytest.mark.parametrize("pad", [2, 3])
 # number of channels
 @pytest.mark.parametrize("num_ch", [1, 2])
+# PaddingStyle: selects behavior when (odim-idim)%2 != 0
+@pytest.mark.parametrize("pad_style", [2])
 # FINN input datatype
 @pytest.mark.parametrize("idt", [DataType.INT2, DataType.INT4])
 # execution mode
 @pytest.mark.parametrize("mode", ["cppsim", "rtlsim"])
 @pytest.mark.slow
 @pytest.mark.vivado
-def test_fpgadataflow_sameresize(idim, kdim, stride, num_ch, idt, mode):
-    pad_style = 2
-    assert idim % stride == 0, "Stride must divide input dimension."
-    # number of "same" windows over the input data
-    same_windows = idim // stride
-    odim = kdim + stride * (same_windows - 1)
+def test_fpgadataflow_fmpadding(idim, pad, num_ch, pad_style, idt, mode):
 
     # generate input data
     x = gen_finn_dt_tensor(idt, [1, idim, idim, num_ch])
     input_dict = {"inp": x}
+    odim = idim + pad
 
-    model = make_single_sameresize_modelwrapper(
-        idim, odim, kdim, stride, num_ch, idt, pad_style
-    )
+    model = make_single_fmpadding_modelwrapper(idim, pad, num_ch, idt, pad_style)
     model = model.transform(InferShapes())
     model = model.transform(SetExecMode(mode))
     model = model.transform(GiveUniqueNodeNames())
@@ -103,8 +99,7 @@ def test_fpgadataflow_sameresize(idim, kdim, stride, num_ch, idt, mode):
     assert y_produced.shape == expected_oshape
 
     # calculate reference
-    # calculate correct padding according to parameters
-    pad = odim - idim
+    # calculate correct pad according to parameters
     if pad_style == 2:
         if pad % 2 == 0:
             pad_up = pad // 2
@@ -115,6 +110,7 @@ def test_fpgadataflow_sameresize(idim, kdim, stride, num_ch, idt, mode):
     else:
         pad_up = pad // 2
         pad_left = pad // 2
+
     pad_down = pad - pad_up
     pad_right = pad - pad_left
 
diff --git a/tests/transformation/test_sort_graph.py b/tests/transformation/test_sort_graph.py
new file mode 100644
index 0000000000000000000000000000000000000000..05842504c13b144bb34e8084fb12b5086fa84115
--- /dev/null
+++ b/tests/transformation/test_sort_graph.py
@@ -0,0 +1,150 @@
+from onnx import TensorProto, helper
+import numpy as np
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.general import SortGraph
+from finn.transformation.infer_shapes import InferShapes
+import pytest
+import finn.analysis.topology as ta
+
+
+def make_randomly_sorted_linear_model(num_of_nodes, seed=None):
+    if seed is not None:
+        np.random.seed(seed)
+
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("t0", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info(
+        "t" + str(num_of_nodes), TensorProto.FLOAT, input_shape
+    )
+
+    value_info = []
+    nodes = []
+    for i in range(num_of_nodes):
+        nodes += [
+            helper.make_node("Add", ["t" + str(i), "p" + str(i)], ["t" + str(i + 1)])
+        ]
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    nodes = np.random.permutation(nodes)
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=nodes,
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    for i in range(num_of_nodes):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    return model
+
+
+@pytest.mark.parametrize("num_of_nodes", [64])
+def test_sort_linear_graph(num_of_nodes):
+    model = make_randomly_sorted_linear_model(num_of_nodes, seed=0)
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
+
+
+def test_sort_nonlinear_graph():
+    ch = 2
+    ifmdim = 16
+    input_shape = (1, ch, ifmdim, ifmdim)
+
+    top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape)
+    top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, input_shape)
+
+    num_of_params = 8
+    value_info = []
+    for i in range(num_of_params):
+        value_info += [
+            helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)
+        ]
+
+    modelproto = helper.make_model(
+        helper.make_graph(
+            name="test",
+            inputs=[top_in],
+            outputs=[top_out],
+            value_info=value_info,
+            nodes=[
+                # Not sorted nodes
+                helper.make_node("Mul", ["fork1", "p2"], ["t3"]),
+                helper.make_node("Add", ["t4", "p3"], ["t5"]),
+                helper.make_node("Add", ["t2", "t3"], ["t4"]),
+                helper.make_node("Add", ["t6", "t7"], ["t8"]),
+                helper.make_node("Add", ["fork3", "fork3"], ["top_out"]),
+                helper.make_node("Mul", ["t5", "p4"], ["fork2"]),
+                helper.make_node("Add", ["top_in", "p0"], ["fork1"]),
+                helper.make_node("Mul", ["fork1", "p1"], ["t2"]),
+                helper.make_node("Add", ["fork2", "p5"], ["t6"]),
+                helper.make_node("Add", ["fork2", "p6"], ["t7"]),
+                helper.make_node("Mul", ["t8", "p7"], ["fork3"]),
+            ],
+        )
+    )
+    model = ModelWrapper(modelproto)
+    model = model.transform(InferShapes())
+
+    np.random.seed(0)
+    for i in range(num_of_params):
+        model.set_initializer(
+            "p" + str(i), np.random.rand(*input_shape).astype(np.float32)
+        )
+
+    new_model = model.transform(SortGraph())
+
+    # Test
+    ret = new_model.analysis(ta.nodes_topologically_sorted)
+    assert ret["nodes_topologically_sorted"], "Nodes are not topologically sorted."
+
+
+if __name__ == "__main__":
+    import time
+
+    sizes = [10, 50, 100, 500, 1000]
+    times = []
+    reps = 10
+
+    print("SortGraph performance test:")
+    print("Test sizes", sizes)
+    print("Repetitions per size:", reps)
+    for sz in sizes:
+        acc_time = 0
+        print(" Testing size ", sz)
+        for i in range(reps):
+            # sorting an already-sorted model should take about the same time,
+            # but a fresh model per repetition is the more general approach
+            model = make_randomly_sorted_linear_model(sz)  # seed None: new model
+            bef = time.time()
+            new_model = model.transform(SortGraph(), make_deepcopy=False)
+            acc_time += time.time() - bef
+
+        times += [acc_time / reps]
+
+    # print csv
+    print("\nnum_of_nodes,  seconds")
+    for sz, tm in zip(sizes, times):
+        print("{:12d}, {:6.4e}".format(sz, tm))
+
+    # plot
+    # import matplotlib.pyplot as plt
+    # plt.plot(sizes,times,"--o")
+    # plt.grid(True)