
Commit 805f1463 authored by Lukas Wolf

fixed memory and runtime issues

parent b86e145c
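The diff below attacks the memory issue with three recurring PyTorch moves: wrapping evaluation in torch.no_grad() so autograd does not retain activations, reassigning tensors when moving them to the GPU (x = x.cuda() rather than the no-op x.cuda()), and dropping references to finished batches. A minimal sketch of that pattern, assuming a generic model and dataloader (the helper below is illustrative, not part of this repository):

import torch

def predict_without_grad(model, dataloader, device):
    """Hypothetical helper showing the memory pattern applied in this commit."""
    model.eval()
    preds = []
    with torch.no_grad():  # no autograd graph, so activations are freed immediately
        for x, _ in dataloader:
            x = x.to(device)  # .to()/.cuda() return a new tensor; reassign or the move is lost
            preds.append(model(x).cpu())  # keep results on the CPU so GPU memory stays flat
            del x  # release the batch reference so the caching allocator can reuse the block
    return preds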
......@@ -79,13 +79,10 @@ class Ensemble_tf:
pred_epoch = (pred_epoch/config['ensemble']).tolist()
# Compute the ensemble loss dependent on the task
if config['task'] == 'prosaccade-clf':
loss.append(bce(pred_ensemble.targets,pred_epoch).numpy())
elif config['task'] == 'angle-reg':
loss.append(angle_loss(pred_ensemble.targets,pred_epoch).numpy())
elif config['task'] == 'gaze-reg':
loss.append(mse(pred_ensemble.targets,pred_epoch).numpy())
else:
raise Exception("Choose valid task in config.py")
......
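For context, the branch above averages the summed ensemble predictions and then picks a loss by task. A hedged, self-contained PyTorch rendering of the same dispatch (the function name and the plain-L1 stand-in for the project's angle_loss are assumptions):

import torch.nn.functional as F

def ensemble_loss(pred_sum, targets, n_models, task):
    pred = pred_sum / n_models  # average the summed per-model predictions
    if task == 'prosaccade-clf':
        return F.binary_cross_entropy(pred, targets)
    elif task == 'angle-reg':
        return F.l1_loss(pred, targets)  # the project's angle_loss handles wrap-around; L1 shown as a stand-in
    elif task == 'gaze-reg':
        return F.mse_loss(pred, targets)
    raise ValueError("Choose valid task in config.py")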
......@@ -2,24 +2,41 @@ import torch
from torch import nn
import numpy as np
from config import config
import logging
#from torch.utils.tensorboard import SummaryWriter
from torch_models.torch_utils.training import train_loop, test_loop
from torch_models.torch_utils.utils import get_gpu_memory
import psutil
from torch_models.torch_utils.utils import timing_decorator
from memory_profiler import profile
class Prediction_history:
"""
Collects the model's predictions on the given validation set after each epoch.
predhis is a list of lists (one per epoch) of prediction tensors (one per batch).
"""
def __init__(self, dataloader, device, model) -> None:
self.dataloader = dataloader
self.predhis = []
self.device = device
self.model = model
#@timing_decorator
#@profile
def on_epoch_end(self):
with torch.no_grad():
y_pred = []
for x, y in self.dataloader:
# Move batch to GPU
if torch.cuda.is_available():
x = x.cuda()
y = y.cuda()
y_pred.append(self.model(x))
# Remove batch from GPU
del x
del y
#torch.cuda.empty_cache()
self.predhis.append(y_pred)
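A sketch of how a Prediction_history might later be consumed, given the list-of-lists layout documented above (the function below is hypothetical):

def average_epoch_predictions(histories, epoch):
    """Average the stored batch predictions of several models for one epoch."""
    n_models = len(histories)
    # zip aligns batch b of every model's history for the chosen epoch
    return [sum(batch_preds) / n_models
            for batch_preds in zip(*(h.predhis[epoch] for h in histories))]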
class BaseNet(nn.Module):
"""
......@@ -38,6 +55,7 @@ class BaseNet(nn.Module):
self.batch_size = batch_size
self.nb_channels = self.input_shape[1]
self.timesamples = self.input_shape[0]
self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Create output layer depending on the task
if config['task'] == 'prosaccade-clf':
......@@ -74,36 +92,43 @@ class BaseNet(nn.Module):
# abstract method
def _split_model(self):
pass
@profile
@timing_decorator
def fit(self, train_dataloader, test_dataloader, subjectID=None):
"""
Fit the model on the dataset defined by data x and labels y
"""
logging.info("------------------------------------------------------------------------------------")
logging.info(f"Fitting model number {self.model_number}")
# Move the model to GPU
if torch.cuda.is_available():
self.cuda()
logging.info(f"Model moved to cuda")
# Create the optimizer
optimizer = torch.optim.Adam(list(self.parameters()), lr=config['learning_rate'])
# Create a history to track ensemble performance
prediction_ensemble = Prediction_history(dataloader=test_dataloader, device=self.device, model=self)
# Train the model
epochs = config['epochs']
for t in range(epochs):
logging.info(f"Epoch {t+1}\n-------------------------------")
print(f"Start EPOCH: Free GPU memory: {get_gpu_memory()}")
print(f"memory {psutil.virtual_memory()}")
# Run through training and test set
print(self.device)
train_loop(train_dataloader, self.float(), self.loss_fn, optimizer, self.device)
print(f"Free GPU mem after train loop: {get_gpu_memory()}")
print(f"memory {psutil.virtual_memory()}")
test_loop(test_dataloader, self.float(), self.loss_fn, self.device)
print("Free GPU mem after test loop:")
print(f"memory {psutil.virtual_memory()}")
# Add the predictions on the validation set
prediction_ensemble.on_epoch_end()
print("Free GPU mem after prediction hist:")
print(f"memory {psutil.virtual_memory()}")
# Done with training this model
logging.info(f"Finished model number {self.model_number}")
if config['save_models'] and self.model_number==0:
ckpt_dir = config['model_dir'] + '/best_models/' + config['model'] + '_nb_{}_'.format(self.model_number) + 'best_model.pth'
......
......@@ -68,7 +68,7 @@ class ConvNet(ABC, BaseNet):
x = nn.functional.relu(x)
input_res = x
x = self.gap_layer_pad(x) # Pad for the avgpool1d gap_layer
x = self.gap_layer(x)
x = x.view(self.batch_size, -1)
......
......@@ -15,7 +15,7 @@ class EEGNet(BaseNet):
"""
def __init__(self, input_shape, epochs=50, model_number=0,
F1=16, F2=256, verbose=True, D=4, kernel_size=256,
dropout_rate=0.5, batch_size=64):
self.kernel_size = kernel_size
self.timesamples = input_shape[0]
......@@ -25,7 +25,7 @@ class EEGNet(BaseNet):
self.F2 = F2
self.kernel_size = kernel_size
self.dropout_rate = dropout_rate
super().__init__(input_shape=input_shape, epochs=epochs, model_number=model_number, batch_size=batch_size)
# Block 1: 2dconv and depthwise conv
#TODO: build only self.block1 and self.block2 for forward pass
......
......@@ -96,20 +96,20 @@ def create_model(model_type, model_number):
Returns the specified torch model as nn.Module built on BaseNet
"""
if model_type == 'cnn':
model = CNN(input_shape=config['cnn']['input_shape'], kernel_size=64, epochs=config['epochs'], nb_filters=16, batch_size=config['batch_size'],
verbose=True, use_residual=True, depth=12, model_number=model_number)
elif model_type == 'inception':
model = Inception(input_shape=config['inception']['input_shape'], use_residual=True, model_number=model_number, batch_size=config['batch_size'],
kernel_size=64, nb_filters=16, depth=12, bottleneck_size=16, epochs=config['epochs'])
elif model_type == 'xception':
model = XCEPTION(input_shape=config['inception']['input_shape'], use_residual=True, model_number=model_number,
kernel_size=40, nb_filters=64, depth=18, epochs=config['epochs'], batch_size=config['batch_size'])
elif model_type == 'eegnet':
model = EEGNet(input_shape=(config['eegnet']['samples'], config['eegnet']['channels']), batch_size=config['batch_size'],
model_number=model_number, epochs=config['epochs'])
elif model_type == 'pyramidal_cnn':
model = PyramidalCNN(input_shape=config['cnn']['input_shape'], epochs=config['epochs'],
model_number=model_number, batch_size=config['batch_size'])
elif model_type == 'gazenet':
model = gazeNET(input_shape=config['gazenet']['input_shape'], seed=42, batch_size=config['batch_size'])
return model
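Hypothetical usage of the factory above, assuming the dataloaders are built elsewhere:

model = create_model(model_type='inception', model_number=0)
model.fit(train_dataloader, test_dataloader)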
......@@ -65,16 +65,17 @@ class Inception_module(nn.Module):
kernel_size_s = [mother.kernel_size // (2 ** i) for i in range(3)]
# Define all the layers and modules we need in the forward pass: first the initial convolution and the parallel maxpooling
self.pad_conv_in = Pad_Conv(kernel_size=mother.kernel_size)
# This is the bottleneck convolution
self.conv_in = nn.Conv1d(in_channels=mother.nb_channels if depth==0 else mother.nb_features,
out_channels=mother.bottleneck_size, kernel_size=mother.kernel_size, bias=False)
self.pad_pool_in = Pad_Pool(left=1, right=1)
self.maxpool_in = nn.MaxPool1d(kernel_size=3, stride=1)
# 3 parallel convolutions taking the bottleneck as input
self.conv1 = nn.Conv1d(in_channels=mother.bottleneck_size, out_channels=mother.nb_filters, kernel_size=kernel_size_s[0], bias=False)
self.pad1 = Pad_Conv(kernel_size=kernel_size_s[0])
self.conv2 = nn.Conv1d(in_channels=mother.bottleneck_size, out_channels=mother.nb_filters, kernel_size=kernel_size_s[1], bias=False)
self.pad2 = Pad_Conv(kernel_size=kernel_size_s[1])
self.conv3 = nn.Conv1d(in_channels=mother.bottleneck_size, out_channels=mother.nb_filters, kernel_size=kernel_size_s[2], bias=False)
self.pad3 = Pad_Conv(kernel_size=kernel_size_s[2])
# and the 4th parallel convolution following the maxpooling, no padding needed since 1x1 convolution
self.conv4 = nn.Conv1d(in_channels=mother.nb_channels if depth==0 else mother.nb_features,
......
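The switch to mother.bottleneck_size is the classic Inception trick: squeeze the channel dimension before the wide kernels so their weight tensors shrink. A standalone illustration with made-up sizes (128 in-channels, bottleneck 16, 16 filters, kernel 64); note that the textbook bottleneck is a 1x1 convolution, whereas the module above keeps mother.kernel_size for its conv_in:

import torch.nn as nn

direct = nn.Conv1d(128, 16, kernel_size=64, bias=False)  # 128*16*64 = 131072 weights
squeezed = nn.Sequential(
    nn.Conv1d(128, 16, kernel_size=1, bias=False),   # 128*16*1 = 2048 weights
    nn.Conv1d(16, 16, kernel_size=64, bias=False),   # 16*16*64 = 16384 weights
)
print(sum(p.numel() for p in direct.parameters()))    # 131072
print(sum(p.numel() for p in squeezed.parameters()))  # 18432, roughly 7x fewer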
......@@ -8,7 +8,6 @@ class Pad_Pool(nn.Module):
"""
def __init__(self, left=0, right=1, value=0):
super().__init__()
self.left = left
self.right = right
self.value = value
......
......@@ -9,9 +9,9 @@ def create_dataloader(X, y, batch_size, mode):
Input: X, y of type np.array
Return: dataloader containing the dataset of X and y
"""
# Transform np.array to torch float tensor
tensor_x = torch.as_tensor(X).float()
tensor_y = torch.as_tensor(y).float()
# Unsqueeze channel direction for eegNet model
if config['model'] == 'eegnet' or config['model'] == 'gazenet':
logging.info(f"Unsqueeze data for eegnet")
......@@ -23,4 +23,4 @@ def create_dataloader(X, y, batch_size, mode):
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Create dataset and dataloader
dataset = TensorDataset(tensor_x, tensor_y)
return DataLoader(dataset, batch_size=batch_size, drop_last=True, num_workers=1)
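Hypothetical usage of create_dataloader with random arrays (shapes are assumptions, not the project's real dimensions; for the eegnet/gazenet configs the loader additionally unsqueezes a channel dimension):

import numpy as np

X = np.random.randn(256, 500, 129)  # (samples, timepoints, channels), assumed layout
y = np.random.randn(256, 2)
loader = create_dataloader(X, y, batch_size=64, mode='train')
xb, yb = next(iter(loader))
print(xb.shape, xb.dtype)  # torch.Size([64, 500, 129]) torch.float32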
......@@ -36,15 +36,17 @@ def plot_array(x, output_dir, metric, savefig=True):
epochs = np.arange(len(x))
plt.figure()
if config['pretrained']:
plt.title("Pretrained " + config['model'] + " " + metric)
else:
plt.title(config['model'] + " " + metric)
plt.plot(epochs, np.array(x), 'b-', label='validation')
plt.legend()
plt.xlabel('epochs')
if config['task'] == 'gaze-reg':
plt.ylabel("MSE")
elif metric == 'accuracy':
plt.ylabel("Accuracy")
elif config['task'] == 'angle-reg':
plt.ylabel("Mean Absolute Angle Error")
else:
......
import logging
from config import config
import torch
from torch_models.torch_utils.utils import get_gpu_memory
from torch_models.torch_utils.utils import timing_decorator
from memory_profiler import profile
#import torch.profiler
#@timing_decorator
#@profile
def train_loop(dataloader, model, loss_fn, optimizer, device):
"""
Performs one epoch of training of the model on the dataset in dataloader,
using the given loss_fn and optimizer.
Logs the training loss (and accuracy for classification) at the end of the epoch.
"""
logging.info("enter train loop")
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Move tensors to GPU
# print(f"device type {type(device)}")
# print(f"device {device}")
# X.to(device)
# y.to(device)
if torch.cuda.is_available():
X = X.cuda()
y = y.cuda()
# print(f"X type {X.type()}")
# print(f"X on cuda: {X.is_cuda}")
# Compute prediction and loss
logging.info("predict")
pred = model(X.float())
logging.info("compute loss")
pred = model(X)
loss = loss_fn(pred.float(), y.float())
correct = 0
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
# profiler.step()
# Log metrics once per epoch, on the last batch
if batch == len(dataloader) - 1:
loss, current = loss.item(), batch * len(X)
logging.info(f"Training loss (last batch): {loss:>7f} [{current:>5d}/{size:>5d}] samples passed")
if config['task'] == 'prosaccade-clf':
pred = (pred > 0.5).float()
correct += (pred == y).float().sum()
correct /= config['batch_size']
logging.info(f"Training accuracy (last batch): {correct:>8f}")
# Remove from GPU
#del X
#del y
#torch.cuda.empty_cache()
#@timing_decorator
#@profile
def test_loop(dataloader, model, loss_fn, device):
"""
Performs one prediction run through the test set stored in the dataloader.
Logs the average loss (and accuracy for classification) computed from the predictions and labels y.
"""
logging.info("enter testloop")
size = len(dataloader.dataset)
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
# Move tensors to GPU
if torch.cuda.is_available():
X = X.cuda()
y = y.cuda()
#X.to(device)
#y.to(device)
# Predict
pred = model(X)
# Compute metrics
test_loss += loss_fn(pred.float(), y.float()).item()
if config['task'] == 'prosaccade-clf':
pred = (pred > 0.5).float()
correct += (pred == y).float().sum()
# Remove from GPU
#del X
#del y
#torch.cuda.empty_cache()
test_loss /= size
logging.info(f"Avg test loss: {test_loss:>8f} \n")
logging.info(f"Avg test loss: {test_loss:>8f}")
logging.info("exit testloop")
#print(f"correct {correct}")
#print(f"test loss {test_loss}")
#print(f"size {size}")
#print(f"correct/size {correct / size}")
#print(f"test loss / size {test_loss / size}")
if config['task'] == 'prosaccade-clf':
correct /= size
logging.info(f"Avg test accuracy {correct:>8f}")
import torch
import subprocess as sp
import os
import time
def get_gpu_memory():
"""
Returns and prints the available amount of GPU memory
"""
_output_to_list = lambda x: x.decode('ascii').split('\n')[:-1]
COMMAND = "nvidia-smi --query-gpu=memory.free --format=csv"
memory_free_info = _output_to_list(sp.check_output(COMMAND.split()))[1:]
memory_free_values = [int(x.split()[0]) for x in memory_free_info]
print(memory_free_values)
return memory_free_values
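Assuming nvidia-smi is on PATH, a call looks like:

free_mb = get_gpu_memory()  # e.g. prints and returns [11019], MiB free per GPU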
def timing_decorator(func):
"""
Timing-Decorator for functions
"""
def wrapper(*args, **kwargs):
t1 = time.time()
result = func(*args, **kwargs)
t2 = time.time()
delta = (t2 - t1) * 1000 # convert seconds to milliseconds
print(f"{func.__name__}: {delta:.4f} ms")
return result
return wrapper
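Hypothetical usage of timing_decorator:

@timing_decorator
def slow_square(n):
    time.sleep(0.05)
    return n * n

slow_square(4)  # prints something like "slow_square: 50.1230 ms"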
def compute_loss(loss_fn, dataloader, pred_list, nb_models):
"""
......@@ -15,7 +43,9 @@ def compute_loss(loss_fn, dataloader, pred_list, nb_models):
"""
loss = []
for batch, (X, y) in enumerate(dataloader):
if torch.cuda.is_available():
y = y.cuda()
loss.append(loss_fn(y, torch.div(pred_list[batch], nb_models).float()))
return sum(loss) / len(loss)
def compute_accuracy(dataloader, pred_list, nb_models):
......@@ -32,8 +62,10 @@ def compute_accuracy(dataloader, pred_list, nb_models):
"""
correct = 0
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
# Average the summed per-model predictions, as compute_loss does above
pred = torch.div(pred_list[batch], nb_models)
if torch.cuda.is_available():
pred = pred.cuda()
y = y.cuda()
pred = torch.round(pred)
correct += (pred == y).float().sum().item()
return correct / size
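A hedged sketch of how these two helpers would be called after ensemble training, assuming pred_sums[b] holds the element-wise sum of all models' predictions for batch b (names are illustrative):

import torch.nn as nn

avg_loss = compute_loss(nn.BCELoss(), test_dataloader, pred_sums, nb_models=5)
avg_acc = compute_accuracy(test_dataloader, pred_sums, nb_models=5)
print(f"ensemble loss {avg_loss:.4f}, ensemble accuracy {avg_acc:.4f}")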