diff --git a/src/finn/core/rtlsim_exec.py b/src/finn/core/rtlsim_exec.py
index e5e6d29bd8d8ed23f6a4958856ed1ddea3617175..91bd3d1198f997eaf96ef3883b2c25e32c5da050 100644
--- a/src/finn/core/rtlsim_exec.py
+++ b/src/finn/core/rtlsim_exec.py
@@ -66,6 +66,11 @@ def rtlsim_exec(model, execution_context):
     i_stream_w = first_node.get_instream_width()
     # convert input into time multiplexed shape
     i_folded_shape = first_node.get_folded_input_shape()
+    batchsize = i_tensor.shape[0]
+    # override batch size for input
+    i_folded_shape = list(i_folded_shape)
+    i_folded_shape[0] = batchsize
+    i_folded_shape = tuple(i_folded_shape)
     # TODO any other layout transformations need to happen here!
     i_tensor = i_tensor.reshape(i_folded_shape)
     # extract output shape
@@ -74,12 +79,20 @@ def rtlsim_exec(model, execution_context):
     o_dt = model.get_tensor_datatype(o_name)
     last_node = getCustomOp(model.find_producer(o_name))
     o_folded_shape = last_node.get_folded_output_shape()
+    # override batch size from actual input
+    o_shape = list(o_shape)
+    o_shape[0] = batchsize
+    o_shape = tuple(o_shape)
+    o_folded_shape = list(o_folded_shape)
+    o_folded_shape[0] = batchsize
+    o_folded_shape = tuple(o_folded_shape)
     o_stream_w = last_node.get_outstream_width()
     packedBits = o_stream_w
     targetBits = o_dt.bitwidth()
     # pack input
     packed_input = npy_to_rtlsim_input(i_tensor, i_dt, i_stream_w)
     num_out_values = last_node.get_number_output_values()
+    num_out_values *= batchsize
     # prepare pyverilator model
     rtlsim_so = model.get_metadata_prop("rtlsim_so")
     if (rtlsim_so is None) or (not os.path.isfile(rtlsim_so)):