diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
index 99c08dc0593a928c534f1dc2a0313e0c85680144..1ac7ee178531e745bf68405d1ae9df35c0c216fb 100644
--- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py
+++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py
@@ -173,6 +173,11 @@ class MakeZYNQProject(Transformation):
                     "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
                     % (instance_names[node.name], axilite_intf_name, axilite_idx)
                 )
+                # assign_bd_address with appropriate range/offset
+                config.append(
+                    "assign_axi_addr_proc %s/%s"
+                    % (instance_names[node.name], axilite_intf_name)
+                )
                 idma_idx += 1
                 aximm_idx += 1
                 axilite_idx += 1
@@ -188,6 +193,11 @@ class MakeZYNQProject(Transformation):
                         "[get_bd_intf_pins axi_interconnect_0/M%02d_AXI]"
                         % (instance_names[node.name], axilite_intf_name, axilite_idx)
                     )
+                    # assign_bd_address with appropriate range/offset
+                    config.append(
+                        "assign_axi_addr_proc %s/%s"
+                        % (instance_names[node.name], axilite_intf_name)
+                    )
                     axilite_idx += 1
 
             config.append(
diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py
index 73beb62f06a6b625a992bd2a7401a91ed09789f3..9c0169a98f515d0b32e10bdfc834eca5fb681ffd 100644
--- a/src/finn/transformation/fpgadataflow/templates.py
+++ b/src/finn/transformation/fpgadataflow/templates.py
@@ -153,6 +153,7 @@ set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] [get_bd_cells axi_interconn
 
 #create reset controller and connect interconnects to PS
 if {$ZYNQ_TYPE == "zynq_us+"} {
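+    # PL peripherals reached via M_AXI_HPM0_FPD start at 0xA0000000 in the ZynqMP address map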
+    set axi_peripheral_base 0xA0000000
     connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0_FPD]
     connect_bd_intf_net [get_bd_intf_pins zynq_ps/M_AXI_HPM0_FPD] -boundary_type upper [get_bd_intf_pins axi_interconnect_0/S00_AXI]
     #connect interconnect clocks and resets
@@ -160,6 +161,7 @@ if {$ZYNQ_TYPE == "zynq_us+"} {
     apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/S00_ACLK]
     apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/pl_clk0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins zynq_ps/saxihp0_fpd_aclk]
 } elseif {$ZYNQ_TYPE == "zynq_7000"} {
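+    # PL peripherals reached via M_AXI_GP0 start at 0x40000000 in the Zynq-7000 address map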
+    set axi_peripheral_base 0x40000000
     connect_bd_intf_net -boundary_type upper [get_bd_intf_pins zynq_ps/M_AXI_GP0] [get_bd_intf_pins axi_interconnect_0/S00_AXI]
     connect_bd_intf_net [get_bd_intf_pins smartconnect_0/M00_AXI] [get_bd_intf_pins zynq_ps/S_AXI_HP0]
     apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ps/FCLK_CLK0} Freq {} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins axi_interconnect_0/ACLK]
@@ -168,6 +170,21 @@ if {$ZYNQ_TYPE == "zynq_us+"} {
 }
 connect_bd_net [get_bd_pins axi_interconnect_0/ARESETN] [get_bd_pins smartconnect_0/aresetn]
 
+#procedure used by the IP instantiations below to map BD address segments based on the AXI interface aperture
+proc assign_axi_addr_proc {axi_intf_path} {
+    #global variable holds current base address
+    global axi_peripheral_base
+    #infer range
+    set range [expr 2**[get_property CONFIG.ADDR_WIDTH [get_bd_intf_pins $axi_intf_path]]]
+    set range [expr $range < 4096 ? 4096 : $range]
+    #align base address to range
+    set offset [expr ($axi_peripheral_base + ($range-1)) & ~($range-1)]
+    #perform assignment
+    assign_bd_address [get_bd_addr_segs $axi_intf_path/Reg] -offset $offset -range $range
+    #advance base address
+    set axi_peripheral_base [expr $offset + $range]
+}
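+# example invocation (hypothetical instance/interface names) as emitted by
+# MakeZYNQProject for each AXI-lite interface:
+#   assign_axi_addr_proc StreamingDataflowPartition_1/s_axi_control
+# e.g. a 12-bit ADDR_WIDTH aperture gets a 4K segment aligned to the running base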
+
 #custom IP instantiations/connections start here
 %s
 
diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py
index b23f9f14909a5bd93ae24b34ef65304dafc7e0c1..7163a95c4dbbe5c8bcee4ebeea87c5e9611c179e 100644
--- a/src/finn/transformation/streamline/reorder.py
+++ b/src/finn/transformation/streamline/reorder.py
@@ -40,6 +40,7 @@ from finn.core.datatype import DataType
 from finn.core.onnx_exec import execute_node
 from finn.util.basic import get_by_name
 from finn.custom_op.registry import getCustomOp
+from finn.transformation.general import SortGraph
 
 
 class MoveAddPastMul(Transformation):
@@ -1039,3 +1040,77 @@ class MoveTransposePastScalarMul(Transformation):
             model = model.transform(InferDataLayouts())
             model = model.transform(InferShapes())
         return (model, graph_modified)
+
+
+class MoveIdenticalOpPastJoinOp(Transformation):
+    """
+    Move identical operations on different branches past the common join node.
+    This transformation assumes that the identical operations only change the
+    data layout. For linear operations, see the transformation MoveLinearPastEltwiseAdd.
+    Specifically, this transformation matches and transforms the following pattern:
+    f(x) + f(y) -> f(x + y)
+    where f(.) currently only supports 'Transpose', and an 'Add' node is
+    the join node.
+    """
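+    # A minimal before/after sketch for the supported case (Transpose into Add):
+    #   before: Transpose(x) --\
+    #                           Add --> z
+    #           Transpose(y) --/
+    #   after:  x --\
+    #                Add --> Transpose --> z
+    #           y --/
+    # applied via model.transform(MoveTransposePastJoinAdd())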
+
+    def __init__(self, identical_op_list, join_node_list):
+        super().__init__()
+        self.ops_to_move = identical_op_list
+        self.join_node_op = join_node_list
+
+    def move_node(self, model, n, prod0, prod1):
+        # Pattern found: move one of the identical ops past the join node
+        # and remove the other one
+        identical_op0_in0 = prod0.input[0]
+        identical_op1_in0 = prod1.input[0]
+        add_in0 = n.input[0]
+        add_out = n.output[0]
+
+        # Rewire
+        n.input[0] = identical_op0_in0
+        n.input[1] = identical_op1_in0
+
+        # After rewiring, the join node operates on the un-transposed inputs,
+        # so its output tensor (the reused intermediate) takes the input shape
+        new_shape = model.get_tensor_shape(identical_op0_in0)
+
+        # Set new tensor shape
+        model.set_tensor_shape(tensor_name=add_in0, tensor_shape=new_shape)
+
+        n.output[0] = add_in0
+        prod0.input[0] = add_in0
+        prod0.output[0] = add_out
+
+        model.graph.node.remove(prod1)
+
+    def apply(self, model):
+        graph = model.graph
+        graph_modified = False
+        for n in graph.node:
+            if n.op_type in self.join_node_op and model.is_join_node(n):
+                in0 = n.input[0]
+                in1 = n.input[1]
+                if in0 is None or in1 is None:
+                    continue
+
+                prod0 = model.find_producer(in0)
+                prod1 = model.find_producer(in1)
+                # The join node must be preceded by two distinct producer
+                # nodes of the same op type
+                if prod0 == prod1:
+                    continue
+
+                identical_op = prod0.op_type == prod1.op_type
+
+                if identical_op and prod0.op_type in self.ops_to_move:
+                    self.move_node(model, n, prod0, prod1)
+                    graph_modified = True
+
+        if graph_modified:
+            model = model.transform(SortGraph(), make_deepcopy=False, cleanup=False)
+
+        return (model, graph_modified)
+
+
+class MoveTransposePastJoinAdd(MoveIdenticalOpPastJoinOp):
+    def __init__(self):
+        super().__init__(["Transpose"], ["Add"])
diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py
index 0fe64d1a46403cf0276654bca0c14f461d1c163b..2823dec1fbce9f2e6a5f5f681cf403c205ee0a2d 100644
--- a/tests/end2end/test_end2end_bnn_pynq.py
+++ b/tests/end2end/test_end2end_bnn_pynq.py
@@ -140,7 +140,6 @@ def fold_tfc(model):
         fcl_inst.set_nodeattr("PE", pe)
         fcl_inst.set_nodeattr("SIMD", simd)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
-        fcl_inst.set_nodeattr("runtime_writeable_weights", 1)
     # set parallelism for input quantizer to be same as first layer's SIMD
     inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
     inp_qnt = getCustomOp(inp_qnt_node)
@@ -164,6 +163,7 @@ def fold_lfc(model):
         fcl_inst.set_nodeattr("PE", pe)
         fcl_inst.set_nodeattr("SIMD", simd)
         fcl_inst.set_nodeattr("ram_style", ramstyle)
+        fcl_inst.set_nodeattr("runtime_writeable_weights", 1)
     # set parallelism for input quantizer to be same as first layer's SIMD
     inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0]
     inp_qnt = getCustomOp(inp_qnt_node)
diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..94eb52835b1800a839e5a9792e9cf1d7be1e681d
--- /dev/null
+++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py
@@ -0,0 +1,94 @@
+import pytest
+
+from onnx import helper as oh
+from onnx import TensorProto
+
+from finn.core.modelwrapper import ModelWrapper
+from finn.transformation.streamline.reorder import MoveTransposePastJoinAdd
+from finn.util.basic import gen_finn_dt_tensor
+import finn.core.onnx_exec as oxe
+
+
+def create_model(perm):
+    if perm == [0, 3, 1, 2]:
+        in_shape = [1, 128, 1, 256]
+        out_shape = [1, 256, 128, 1]
+    elif perm == [0, 2, 3, 1]:
+        in_shape = [1, 256, 128, 1]
+        out_shape = [1, 128, 1, 256]
+
+    Transpose1_node = oh.make_node(
+        "Transpose", inputs=["in_transpose1"], outputs=["out_transpose1"], perm=perm
+    )
+
+    Transpose2_node = oh.make_node(
+        "Transpose", inputs=["in_transpose2"], outputs=["out_transpose2"], perm=perm
+    )
+
+    Join1_node = oh.make_node(
+        "Add", inputs=["out_transpose1", "out_transpose2"], outputs=["out_join1"]
+    )
+
+    in_transpose1 = oh.make_tensor_value_info(
+        "in_transpose1", TensorProto.FLOAT, in_shape
+    )
+    in_transpose2 = oh.make_tensor_value_info(
+        "in_transpose2", TensorProto.FLOAT, in_shape
+    )
+    out_transpose1 = oh.make_tensor_value_info(
+        "out_transpose1", TensorProto.FLOAT, out_shape
+    )
+    out_transpose2 = oh.make_tensor_value_info(
+        "out_transpose2", TensorProto.FLOAT, out_shape
+    )
+    out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_shape)
+
+    graph = oh.make_graph(
+        nodes=[Transpose1_node, Transpose2_node, Join1_node],
+        name="test_graph",
+        inputs=[in_transpose1, in_transpose2],
+        outputs=[out_join1],
+        value_info=[
+            out_transpose1,
+            out_transpose2,
+        ],
+    )
+
+    onnx_model = oh.make_model(graph, producer_name="test_model")
+    model = ModelWrapper(onnx_model)
+
+    return model
+
+
+# Permutation applied by the Transpose nodes
+@pytest.mark.parametrize("perm", [[0, 3, 1, 2], [0, 2, 3, 1]])
+def test_move_identical_op_past_join_op(perm):
+    model = create_model(perm)
+
+    # Create input data
+    input0_tensor_name = model.graph.input[0].name
+    input1_tensor_name = model.graph.input[1].name
+
+    # Note: it is assumed that both tensors have the same shape and data type
+    input_shape = model.get_tensor_shape(input0_tensor_name)
+    input_dtype = model.get_tensor_datatype(input0_tensor_name)
+    input_val = gen_finn_dt_tensor(input_dtype, input_shape)
+    input_dict = {}
+    input_dict[input0_tensor_name] = input_val
+    input_dict[input1_tensor_name] = input_val
+
+    model_transformed = model.transform(MoveTransposePastJoinAdd())
+
+    assert oxe.compare_execution(model, model_transformed, input_dict)
+
+    # Check if order changed
+    node0_input0_model = model.find_consumers(model.graph.input[0].name)[0].op_type
+    node1_input1_model = model.find_consumers(model.graph.input[1].name)[0].op_type
+    node0_input0_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[0].name
+    )[0].op_type
+    node1_input1_model_transformed = model_transformed.find_consumers(
+        model_transformed.graph.input[1].name
+    )[0].op_type
+    assert node0_input0_model != node0_input0_model_transformed
+    assert node1_input1_model != node1_input1_model_transformed