diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 65c898a8c453420ed96ca22715ef2595c5840288..7de6cce936ee54d58d9a526e926ff79dcd35b90d 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -308,6 +308,12 @@ class HLSCustomOp(CustomOp):
         f.close()
         self.code_gen_dict.clear()
 
+    def code_generation_ipi(self):
+        """Constructs and returns the TCL for node instantiation in Vivado IPI."""
+        vlnv = self.get_nodeattr("ip_vlnv")
+        cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)]
+        return cmd
+
     def compile_singlenode_code(self):
         """Builds the bash script for compilation using the CppBuilder from
         finn.util.basic and executes the script to produce the executable."""
diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
index f666becdbcceca6ca202907610595f8c0069c5a0..5f1697f819d229d6d7c3b8907abcb541061ecbb3 100644
--- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py
@@ -51,6 +51,10 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             "outWidth": ("i", True, 0),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
+            # Toggle between hls or IPI implementation
+            # hls - use the hls generated IP during stitching
+            # vivado - use the AXI Infrastructure DWC
+            "impl_style": ("s", False, "hls"),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
@@ -381,3 +385,65 @@ class StreamingDataWidthConverter_Batch(HLSCustomOp):
             exp_shape
         ), """Output
         shape doesn't match expected shape, should be same as input shape"""
+
+    def code_generation_ipi(self):
+        impl_style = self.get_nodeattr("impl_style")
+        if impl_style == "hls":
+            return super().code_generation_ipi()
+        elif impl_style == "vivado":
+            cmd = []
+            node_name = self.onnx_node.name
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate and configure DWC
+            cmd.append(
+                "create_bd_cell -type ip "
+                "-vlnv xilinx.com:ip:axis_dwidth_converter:1.1 /%s/dwc" % node_name
+            )
+            cmd.append(
+                "set_property -dict "
+                "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC PROPAGATED] "
+                "[get_bd_cells /%s/dwc]" % node_name
+            )
+            cmd.append(
+                "set_property -dict "
+                "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]"
+                % (np.ceil(self.get_outstream_width() / 8), node_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/dwc/M_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/dwc/S_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aresetn]"
+                % (node_name, rst_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aclk]"
+                % (node_name, clk_name, node_name)
+            )
+            return cmd
+        else:
+            raise Exception(
+                "DWC implementation style %s not supported, please use hls or vivado"
+                % impl_style
+            )
diff --git a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
index de640f22ae87c89721f8b7ed2b3b270b54000ebb..87f52eeea591ba42bf5374df3c93bcc3e4f8e944 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfclayer_batch.py
@@ -28,9 +28,6 @@
 
 import math
 import os
-import subprocess
-from shutil import copy
-
 import numpy as np
 
 from onnx import TensorProto, helper
@@ -100,23 +97,6 @@ class StreamingFCLayer_Batch(HLSCustomOp):
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
-    def get_verilog_top_module_name(self):
-        "Return the Verilog top module name for this node."
-
-        node = self.onnx_node
-        # set top name depending on mem_mode
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "const" or mem_mode == "external":
-            prefixed_top_name = "%s_%s" % (node.name, node.name)
-        elif mem_mode == "decoupled":
-            prefixed_top_name = "%s_memstream" % (node.name)
-        else:
-            raise Exception(
-                """Please set mem_mode to "const", "decoupled", or "external",
-                currently no other parameter value is supported!"""
-            )
-        return prefixed_top_name
-
     def calc_wmem(self):
         """Calculates and returns WMEM."""
         mw = self.get_nodeattr("MW")
@@ -705,7 +685,7 @@ class StreamingFCLayer_Batch(HLSCustomOp):
             )
             super().reset_rtlsim(sim)
             super().toggle_clk(sim)
-            if mem_mode == "external":
+            if mem_mode == "external" or mem_mode == "decoupled":
                 wnbits = self.get_weightstream_width()
                 export_wdt = self.get_weight_datatype()
                 # we have converted bipolar weights to binary for export,
@@ -1008,112 +988,96 @@ class StreamingFCLayer_Batch(HLSCustomOp):
                 )
             )
 
-    def code_generation_ipgen(self, model, fpgapart, clk):
-        # generate code for all mem_mode of MVAU/FCLayer unit
-        super().code_generation_ipgen(model, fpgapart, clk)
-
-        # if mem_mode = "decoupled" generate code for verilog wrapper
+    def code_generation_ipi(self):
+        cmd = []
+        # add streamer if needed
         mem_mode = self.get_nodeattr("mem_mode")
         if mem_mode == "decoupled":
-            # empty code gen dictionary for new entries
-            self.code_gen_dict.clear()
-            self.code_gen_dict["$TOPNAME$"] = [
-                "{}_memstream".format(self.onnx_node.name)
-            ]
-            self.code_gen_dict["$LAYER_NAME$"] = [
-                "{}_{}".format(self.onnx_node.name, self.onnx_node.name)
-            ]
-            # make instream width a multiple of 8 for AXI stream interface
-            in_width = self.get_instream_width_padded()
-            self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)]
-            self.code_gen_dict["$OUT_RANGE$"] = [
-                "[{}:0]".format(self.get_outstream_width_padded() - 1)
-            ]
-            # make weight stream width a multiple of 8 for AXI stream interface
-            weight_width = self.get_weightstream_width_padded()
-            self.code_gen_dict["$WEIGHT_RANGE$"] = ["[{}:0]".format(weight_width - 1)]
-            self.code_gen_dict["$WEIGHT_WIDTH$"] = [str(weight_width)]
-            self.code_gen_dict["$WSTREAM_DEPTH$"] = [str(self.calc_wmem())]
-            self.code_gen_dict["$MEM_DEPTH$"] = [str(self.calc_wmem())]
-            self.code_gen_dict["$RAM_STYLE$"] = [self.get_nodeattr("ram_style")]
-
-            template = self.decoupled_wrapper
-
-            for key in self.code_gen_dict:
-                # transform list into long string separated by '\n'
-                code_gen_line = "\n".join(self.code_gen_dict[key])
-                template = template.replace(key, code_gen_line)
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            f = open(
-                os.path.join(
-                    code_gen_dir, "{}_memstream.v".format(self.onnx_node.name)
-                ),
-                "w",
+            node_name = self.onnx_node.name
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
             )
-            f.write(template)
-            f.close()
-            self.code_gen_dict.clear()
-
-    def ipgen_singlenode_code(self):
-        # generate ip block of MVAU/FCLayer unit for all mem modes
-        super().ipgen_singlenode_code()
-
-        mem_mode = self.get_nodeattr("mem_mode")
-        if mem_mode == "decoupled":
-            # copy necessary verilog and .dat files
-            # into verilog folder in code generation folder
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            verilog_folder = "{}/project_{}/sol1/impl/verilog/".format(
-                code_gen_dir, self.onnx_node.name
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
             )
-            # copy memstream components from finn-rtllib
-            memstream_dir = "/workspace/finn/finn-rtllib/memstream/hdl/"
-            for file in os.listdir(memstream_dir):
-                if file.endswith(".v"):
-                    verilog_file = os.path.join(memstream_dir, file)
-                    copy(verilog_file, verilog_folder)
-            # copy .dat files of weights
-            for file in os.listdir(code_gen_dir):
-                if file.endswith(".dat"):
-                    dat_file = os.path.join(code_gen_dir, file)
-                    copy(dat_file, verilog_folder)
-            # copy verilog wrapper
-            verilog_wrapper = "{}/{}_memstream.v".format(
-                code_gen_dir, self.onnx_node.name
+            # instantiate the hls ip
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (self.get_nodeattr("ip_vlnv"), node_name, node_name)
             )
-            copy(verilog_wrapper, verilog_folder)
-            # prepare the IP packaging tcl template
-            template = templates.ip_package_tcl
-            self.code_gen_dict["$TOPNAME$"] = [
-                "{}_memstream".format(self.onnx_node.name)
-            ]
-            self.code_gen_dict["$VERILOG_DIR$"] = [verilog_folder]
-            for key in self.code_gen_dict:
-                # transform list into long string separated by '\n'
-                code_gen_line = "\n".join(self.code_gen_dict[key])
-                template = template.replace(key, code_gen_line)
-            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
-            f = open(os.path.join(verilog_folder, "package_ip.tcl"), "w")
-            f.write(template)
-            f.close()
-            # create a shell script and call Vivado to invoke the IP pkg script
-            make_project_sh = verilog_folder + "/make_ip.sh"
-            working_dir = os.environ["PWD"]
-            with open(make_project_sh, "w") as f:
-                f.write("#!/bin/bash \n")
-                f.write("cd {}\n".format(verilog_folder))
-                f.write("vivado -mode batch -source package_ip.tcl\n")
-                f.write("cd {}\n".format(working_dir))
-            bash_command = ["bash", make_project_sh]
-            process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
-            process_compile.communicate()
-            # re-set ip_path to point to the new packaged IP
-            self.set_nodeattr("ip_path", verilog_folder)
-            vlnv = "xilinx.com:hls:%s:1.0" % (
-                "{}_memstream".format(self.onnx_node.name)
+            # instantiate a streamer and connect it to the HLS IP
+            strm_vlnv = "xilinx.com:user:memstream:1.0"
+            strm_inst = node_name + "_wstrm"
+            cmd.append(
+                "create_bd_cell -type ip -vlnv %s /%s/%s"
+                % (strm_vlnv, node_name, strm_inst)
+            )
+            cmd.append(
+                "set_property -dict [list "
+                "CONFIG.NSTREAMS {1} "
+                "CONFIG.MEM_DEPTH {%d} "
+                "CONFIG.MEM_WIDTH {%d} "
+                "CONFIG.MEM_INIT {%s} "
+                "CONFIG.RAM_STYLE {%s} "
+                "CONFIG.STRM0_DEPTH {%d} "
+                "CONFIG.STRM0_WIDTH {%d} "
+                "CONFIG.STRM0_OFFSET {0} "
+                "] [get_bd_cells /%s/%s]"
+                % (
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    self.get_nodeattr("code_gen_dir_ipgen") + "/",
+                    self.get_nodeattr("ram_style"),
+                    self.calc_wmem(),
+                    self.get_weightstream_width_padded(),
+                    node_name,
+                    strm_inst,
+                )
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s/m_axis_0] "
+                "[get_bd_intf_pins %s/%s/weights_V_V]"
+                % (node_name, strm_inst, node_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+                % (node_name, rst_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+                % (node_name, clk_name, node_name, strm_inst)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, rst_name, node_name, node_name, rst_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/%s]"
+                % (node_name, clk_name, node_name, node_name, clk_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, din_name, node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/%s] "
+                "[get_bd_intf_pins %s/%s/%s]"
+                % (node_name, dout_name, node_name, node_name, dout_name)
             )
-            self.set_nodeattr("ip_vlnv", vlnv)
-            self.code_gen_dict.clear()
+            cmd.append("save_bd_design")
+        return cmd
 
     def get_verilog_top_module_intf_names(self):
         intf_names = super().get_verilog_top_module_intf_names()
diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py
index 1f734b548f923341687843c538d1887fcc069bee..c9011b50d06a55c34bdd49c8ea374bdf81ea5f4f 100644
--- a/src/finn/custom_op/fpgadataflow/streamingfifo.py
+++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py
@@ -51,6 +51,16 @@ class StreamingFIFO(HLSCustomOp):
             "folded_shape": ("ints", True, []),
             # FINN DataTypes for inputs/outputs
             "dataType": ("s", True, ""),
+            # Toggle between hls or IPI implementation
+            # rtl - use the hls generated IP during stitching
+            # vivado - use the AXI Infrastructure FIFO
+            "impl_style": ("s", False, "rtl"),
+            # FPGA resource type for FIFOs when impl_style is vivado
+            # auto -- let Vivado decide
+            # block -- use BRAM
+            # distributed -- use LUTRAM
+            # ultra -- use URAM (on UltraScale+)
+            "ram_style": ("s", False, "auto"),
         }
         my_attrs.update(super().get_nodeattr_types())
 
@@ -306,3 +316,71 @@ class StreamingFIFO(HLSCustomOp):
 
     def pragmas(self):
         pass
+
+    def code_generation_ipi(self):
+        impl_style = self.get_nodeattr("impl_style")
+        if impl_style == "rtl":
+            return super().code_generation_ipi()
+        elif impl_style == "vivado":
+            cmd = []
+            node_name = self.onnx_node.name
+            depth = self.get_nodeattr("depth")
+            ram_style = self.get_nodeattr("ram_style")
+            # create a hierarchy for this layer, with the same port names
+            clk_name = self.get_verilog_top_module_intf_names()["clk"][0]
+            rst_name = self.get_verilog_top_module_intf_names()["rst"][0]
+            dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0]
+            din_name = self.get_verilog_top_module_intf_names()["s_axis"][0]
+            cmd.append("create_bd_cell -type hier %s" % node_name)
+            cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name))
+            cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
+            cmd.append(
+                "create_bd_intf_pin -mode Master "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
+                % (node_name, dout_name)
+            )
+            cmd.append(
+                "create_bd_intf_pin -mode Slave "
+                "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name)
+            )
+            # instantiate and configure DWC
+            cmd.append(
+                "create_bd_cell -type ip "
+                "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] "
+                "[get_bd_cells /%s/fifo]" % (depth, node_name)
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] "
+                "[get_bd_cells /%s/fifo]" % (ram_style, node_name)
+            )
+            cmd.append(
+                "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] "
+                "[get_bd_cells /%s/fifo]"
+                % (np.ceil(self.get_outstream_width() / 8), node_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name)
+            )
+            cmd.append(
+                "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] "
+                "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] "
+                "[get_bd_pins %s/fifo/s_axis_aresetn]"
+                % (node_name, rst_name, node_name)
+            )
+            cmd.append(
+                "connect_bd_net [get_bd_pins %s/%s] "
+                "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name)
+            )
+            return cmd
+        else:
+            raise Exception(
+                "FIFO implementation style %s not supported, please use rtl or vivado"
+                % impl_style
+            )
diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
index 90b4b6c47e6e353c1b606d6918eb271e9c0619c5..6e9ce35634760c06c7a409fb5befdb94a08e9c7d 100644
--- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py
+++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py
@@ -182,6 +182,8 @@ class CreateStitchedIP(Transformation):
 
     def apply(self, model):
         ip_dirs = ["list"]
+        # add RTL streamer IP
+        ip_dirs.append("/workspace/finn/finn-rtllib/memstream")
         # ensure that all nodes are fpgadataflow, and that IPs are generated
         for node in model.graph.node:
             assert node.domain == "finn", 'Node domain is not set to "finn"'
@@ -196,10 +198,7 @@ class CreateStitchedIP(Transformation):
             ip_dir_value = node_inst.get_nodeattr("ip_path")
             assert os.path.isdir(ip_dir_value), "IP generation directory doesn't exist."
             ip_dirs += [ip_dir_value]
-            vlnv = node_inst.get_nodeattr("ip_vlnv")
-            inst_name = node.name
-            create_cmd = "create_bd_cell -type ip -vlnv %s %s" % (vlnv, inst_name)
-            self.create_cmds += [create_cmd]
+            self.create_cmds += node_inst.code_generation_ipi()
             my_producer = model.find_producer(node.input[0])
             self.connect_clk_rst(node)
             self.connect_axi(node)
@@ -223,6 +222,7 @@ class CreateStitchedIP(Transformation):
                 #     find index of producer output connected to our target input
                 #     get names of hdl interfaces for input and producer output
                 #     issue a TCL directive to connect input to output
+                #     if FC layer with mode "decoupled", add a streamer on input 1
                 for i in range(len(node.input)):
                     producer = model.find_producer(node.input[i])
                     if producer is None: