diff --git a/docs/finn/source_code/finn.analysis.fpgadataflow.rst b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
index e31723ca3802e1ba523131ddff8078662c06d54b..9c1b0dd99a324929f5c8dd182f7208c27baa5ba8 100644
--- a/docs/finn/source_code/finn.analysis.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.analysis.fpgadataflow.rst
@@ -1,4 +1,3 @@
-***********************
 Analysis - fpgadataflow 
 ***********************
 
diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst
index dbfeea2a9658957b2525016c387b7bc6aeb77608..9ce1b99990ed2facc430eaf318bb0480f7fba5e0 100644
--- a/docs/finn/source_code/finn.analysis.rst
+++ b/docs/finn/source_code/finn.analysis.rst
@@ -31,4 +31,3 @@ finn.analysis.verify\_custom\_nodes
    :undoc-members:
    :show-inheritance:
 
-
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index 78b461ee69efb6cac59eb4e9c1dbd5abc521191d..f9d59fc0f68a37761f4c7fa8b25f5ee016092a0a 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -54,4 +54,3 @@ finn.core.rtlsim\_exec
    :members:
    :undoc-members:
    :show-inheritance:
-
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index f8c6274e40126a12f072b9c46c9e5748747f8121..7e9ebcbec1ebb958faa5b2cf1cdb87c02b17011b 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -13,7 +13,6 @@ Base Class
    :undoc-members:
    :show-inheritance:
 
-
 finn.custom\_op.fpgadataflow.convolutioninputgenerator
 ------------------------------------------------------
 
@@ -22,6 +21,14 @@ finn.custom\_op.fpgadataflow.convolutioninputgenerator
    :undoc-members:
    :show-inheritance:
 
+finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch
+---------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.fpgadataflow.streamingfclayer\_batch
 ----------------------------------------------------
 
diff --git a/docs/finn/source_code/finn.custom_op.rst b/docs/finn/source_code/finn.custom_op.rst
index e0a2c77213be8bd5eca4dce67d48c6b5950be9ba..cb436b1c1303020deeed1ca2f6dc2eb2f8678287 100644
--- a/docs/finn/source_code/finn.custom_op.rst
+++ b/docs/finn/source_code/finn.custom_op.rst
@@ -21,6 +21,23 @@ Base Class
    :undoc-members:
    :show-inheritance:
 
+
+finn.custom\_op.im2col
+----------------------
+
+.. automodule:: finn.custom_op.im2col
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
+finn.custom\_op.maxpoolnhwc
+---------------------------
+
+.. automodule:: finn.custom_op.maxpoolnhwc
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.custom\_op.multithreshold
 ------------------------------
 
@@ -53,3 +70,4 @@ finn.custom\_op.xnorpopcount
    :undoc-members:
    :show-inheritance:
 
+
diff --git a/docs/finn/source_code/finn.rst b/docs/finn/source_code/finn.rst
index 1197c50a035b77ec24f51f9e95f6208db162db8e..6f6b591e1e6a4b7b3900b834c8f5ff2765644f52 100644
--- a/docs/finn/source_code/finn.rst
+++ b/docs/finn/source_code/finn.rst
@@ -14,3 +14,4 @@ Modules
    finn.custom_op
    finn.transformation
    finn.util
+
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index ba850d8fd3834ca86769925ddcf9cc958cdb4980..e80ddbdd05595ab3ca1e6a81da95f96e92f5452a 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -69,6 +69,14 @@ finn.transformation.fpgadataflow.hlssynth\_ipgen
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.fpgadataflow.insert\_dwc
+--------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.insert_dwc
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.fpgadataflow.insert\_tlastmarker
 ----------------------------------------------------
 
@@ -132,3 +140,4 @@ finn.transformation.fpgadataflow.templates
    :members:
    :undoc-members:
    :show-inheritance:
+
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index c944cb478303719bfd3392567b2b07f65c40edd0..480c8030245ae5bc0add11ff629095b2d46ff225 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -32,6 +32,14 @@ finn.transformation.bipolar\_to\_xnor
    :undoc-members:
    :show-inheritance:
 
+finn.transformation.double\_to\_single\_float
+---------------------------------------------
+
+.. automodule:: finn.transformation.double_to_single_float
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
 finn.transformation.fold\_constants
 -----------------------------------
 
@@ -63,3 +71,12 @@ finn.transformation.infer\_shapes
    :members:
    :undoc-members:
    :show-inheritance:
+
+finn.transformation.lower\_convs\_to\_matmul
+--------------------------------------------
+
+.. automodule:: finn.transformation.lower_convs_to_matmul
+   :members:
+   :undoc-members:
+   :show-inheritance:
+
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 5a86918195cd41dfc83e7faeb7923c5703f4fe42..7bcfde705f35a0deb9bb82d2d24831f718802070 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -44,3 +44,4 @@ finn.util.test
    :members:
    :undoc-members:
    :show-inheritance:
+
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 79ed896fab73920e87e6f78537237a78f82f1462..e2a2f90b85a790a6d4fc7053d0e742329a7a1012 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -138,7 +138,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
         """Returns FINN DataType of output."""
         return DataType[self.get_nodeattr("outputDataType")]
 
-    def get_stream_width(self):
+    def get_instream_width(self):
         """Returns stream width, input and output stream width are equal for
         the sliding window function"""
         ibits = self.get_input_datatype().bitwidth()
@@ -147,6 +147,12 @@ class ConvolutionInputGenerator(HLSCustomOp):
         assert simd == ifm_ch, "SWG currently requires SIMD=IFM"
         return simd * ibits
 
+    def get_outstream_width(self):
+        """Returns output stream width. Input and output stream widths are
+        equal for the sliding window function, so the function that determines
+        the input stream width is reused."""
+        return self.get_instream_width()
+
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
         num_output_elems = np.prod(folded_oshape[:-1])
@@ -206,7 +212,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
                 code_gen_dir, node.name, prefixed_top_name
             )
             if os.path.isfile(verilog_file):
-                nbits = self.get_stream_width()
+                nbits = self.get_instream_width()
                 rtlsim_inp = npy_to_rtlsim_input(
                     "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
                 )
@@ -223,7 +229,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
                 rtlsim_output = self.rtlsim(sim, rtlsim_inp)
                 odt = export_idt
                 target_bits = odt.bitwidth()
-                packed_bits = self.get_stream_width()
+                packed_bits = self.get_outstream_width()
                 out_npy_path = "{}/output.npy".format(code_gen_dir)
                 out_shape = self.get_folded_output_shape()
                 rtlsim_output_to_npy(
@@ -287,7 +293,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_stream_width()
+        packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
@@ -301,10 +307,10 @@ class ConvolutionInputGenerator(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_stream_width())
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_stream_width())
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
     def docompute(self):
@@ -323,7 +329,7 @@ class ConvolutionInputGenerator(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_stream_width()
+        packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index a51399a29996f7f9f19e699935179253e020ebfc..ce135e91088d2bfabe0259e1cc6873bb54884198 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -150,10 +150,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         folded_ishape = self.get_folded_input_shape()
         return np.prod(folded_ishape[:-1])
 
-    def get_in_stream_width(self):
+    def get_instream_width(self):
         return self.get_nodeattr("inWidth")
 
-    def get_out_stream_width(self):
+    def get_outstream_width(self):
         return self.get_nodeattr("outWidth")
 
     def make_shape_compatible_op(self, model):
@@ -236,7 +236,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_in_stream_width()
+        packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
@@ -250,10 +250,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_in_stream_width())
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_out_stream_width())
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
     def docompute(self):
@@ -270,7 +270,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_out_stream_width()
+        packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
@@ -294,9 +294,9 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        in_packed_bits = self.get_in_stream_width()
+        in_packed_bits = self.get_instream_width()
         in_packed_hls_type = "ap_uint<%d>" % in_packed_bits
-        out_packed_bits = self.get_out_stream_width()
+        out_packed_bits = self.get_outstream_width()
         out_packed_hls_type = "ap_uint<%d>" % out_packed_bits
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
@@ -359,7 +359,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
                 code_gen_dir, node.name, prefixed_top_name
             )
             if os.path.isfile(verilog_file):
-                nbits = self.get_in_stream_width()
+                nbits = self.get_instream_width()
                 rtlsim_inp = npy_to_rtlsim_input(
                     "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
                 )
@@ -376,7 +376,7 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
                 rtlsim_output = self.rtlsim(sim, rtlsim_inp)
                 odt = export_idt
                 target_bits = odt.bitwidth()
-                packed_bits = self.get_out_stream_width()
+                packed_bits = self.get_outstream_width()
                 out_npy_path = "{}/output.npy".format(code_gen_dir)
                 out_shape = self.get_folded_output_shape()
                 rtlsim_output_to_npy(
diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
index 291d1264ca041432448d708a99d65965862afb24..804da50f5a2c2de7c920975de4e082851a627c4e 100644
--- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py
@@ -84,11 +84,15 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         folded_oshape = self.get_folded_output_shape()
         return np.prod(folded_oshape[:-1])
 
-    def get_stream_width(self):
+    def get_instream_width(self):
         dt_bits = self.get_input_datatype().bitwidth()
         ifm_ch = self.get_nodeattr("NumChannels")
         return int(dt_bits * ifm_ch)
 
+    def get_outstream_width(self):
+        """For streaming maxpool out stream width is the same as in stream width"""
+        return self.get_instream_width()
+
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
         oshape = self.get_normal_output_shape()
@@ -167,7 +171,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_stream_width()
+        packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
@@ -181,10 +185,10 @@ class StreamingMaxPool_Batch(HLSCustomOp):
     def strm_decl(self):
         self.code_gen_dict["$STREAMDECLARATIONS$"] = []
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_stream_width())
+            'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
         )
         self.code_gen_dict["$STREAMDECLARATIONS$"].append(
-            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_stream_width())
+            'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
         )
 
     def docompute(self):
@@ -211,7 +215,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
             # use binary for bipolar storage
             dtype = DataType.BINARY
         elem_bits = dtype.bitwidth()
-        packed_bits = self.get_stream_width()
+        packed_bits = self.get_outstream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         elem_hls_type = dtype.get_hls_datatype_str()
         npy_type = "float"
@@ -235,7 +239,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
         self.code_gen_dict["$SAVEASCNPY$"] = []
 
     def blackboxfunction(self):
-        packed_bits = self.get_stream_width()
+        packed_bits = self.get_instream_width()
         packed_hls_type = "ap_uint<%d>" % packed_bits
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
             "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
@@ -303,7 +307,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
                 code_gen_dir, node.name, prefixed_top_name
             )
             if os.path.isfile(verilog_file):
-                nbits = self.get_stream_width()
+                nbits = self.get_instream_width()
                 rtlsim_inp = npy_to_rtlsim_input(
                     "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
                 )
@@ -320,7 +324,7 @@ class StreamingMaxPool_Batch(HLSCustomOp):
                 rtlsim_output = self.rtlsim(sim, rtlsim_inp)
                 odt = export_idt
                 target_bits = odt.bitwidth()
-                packed_bits = self.get_stream_width()
+                packed_bits = self.get_outstream_width()
                 out_npy_path = "{}/output.npy".format(code_gen_dir)
                 out_shape = self.get_folded_output_shape()
                 rtlsim_output_to_npy(
diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py
index 2f54a09e49da1e1ea4257b9753b010bbc067aad6..7121434dedef428cbd20c324d39193469e9b1a04 100644
--- a/src/finn/transformation/fpgadataflow/insert_dwc.py
+++ b/src/finn/transformation/fpgadataflow/insert_dwc.py
@@ -62,15 +62,9 @@ class InsertDWC(Transformation):
                     if n0_out_shape[-1] != n1_in_shape[-1]:
                         graph_modified = True
                         # determine dwc inwidth
-                        if hasattr(n0, "get_outstream_width"):
-                            dwc_in_width = n0.get_outstream_width()
-                        else:
-                            dwc_in_width = n0.get_stream_width()
+                        dwc_in_width = n0.get_outstream_width()
                         # determine dwc outwidth
-                        if hasattr(n1, "get_instream_width"):
-                            dwc_out_width = n1.get_instream_width()
-                        else:
-                            dwc_out_width = n1.get_stream_width()
+                        dwc_out_width = n1.get_instream_width()
 
                         # determine shape for dwc
                         dwc_shape = n0.get_normal_output_shape()