regression_preprocessing.py 34.4 KB
Newer Older
1
2
3
4
5
6
7
8
"""
Some functions to preprocess the variable length gaze regression data
"""

import numpy as np
import h5py
import logging
import scipy.io
Lukas's avatar
Lukas committed
9
10
from config import config 
from tqdm import tqdm 
Lukas's avatar
Lukas committed
11
from sklearn import preprocessing
Lukas's avatar
Lukas committed
12
13
import os
import pandas as pd 
14

15
16
from utils.calibration_task_preprocessing import get_block_targets

17
18
19
20
21
22
23
24

def max_fixation_duration():
    """
    Returns the minimum and maximum duration of gaze fixations in the currently specified dataset
    """

    logging.info("Computing the maximum gaze fixation duration")

Lukas's avatar
Lukas committed
25
26
    with h5py.File(config['data_dir'] + config['trainX_file'], 'r') as f:
        X = np.array(f[config['trainX_variable']]).T
27
28
29
30
31
32
33
34
35
36
37
38
        x_max = 0

        for i in range(len(X)):
            ref = X[i][0]
            datapoint = np.array(f[ref])
            x, y = datapoint.shape
            x_max = max(x_max, x)

    logging.info("Computed the maximum duration of gaze fixations: {} ms: ".format(x_max)) 

    return x_max

Lukas's avatar
Lukas committed
39
def load_regression_data(verbose=True):
40
    """
Lukas's avatar
Lukas committed
41
    Load the data for the regression task with padding as specified in config 
42
    This function only contains the logic to call the appropriate loader specified as data_mode in config.py
Lukas's avatar
Lukas committed
43
44
45
46

    Returns
    -------
    X, y, the data matrix and the labels 
47
48
    """

Lukas Wolf's avatar
Lukas Wolf committed
49
50
51
52
53
54
55
56
57
58
59
60
61
    if config['task'] == 'gaze-reg':
        if config['data_mode'] == 'sacc_only':
            logging.info("Using saccade only dataset")
            return get_sacc_data(verbose=verbose)
        elif config['data_mode'] == 'fix_only': 
            logging.info("Using fixation only dataset")
            return get_fix_data(verbose=verbose)
        elif config['data_mode'] == 'sacc_fix':
            logging.info("Using saccade-fixation dataset")
            return get_sacc_fix_data(verbose=verbose)
        else:
            raise Exception("Choose valid task and data_mode in config.py")
    elif config['task'] == 'angle-reg': 
62
63
        logging.info("Using fixation-saccade-fixation dataset")
        return get_fix_sacc_fix_data(verbose=verbose)
Lukas Wolf's avatar
Lukas Wolf committed
64
        # Choose the following to extract only data with precise fixations 
Lukas Wolf's avatar
Lukas Wolf committed
65
        #return get_calibration_task_fix_sacc_fix_data(verbose=verbose)
66
    else:
Lukas Wolf's avatar
Lukas Wolf committed
67
68
        raise Exception("Choose a valid task in config.py")
        
69
70
71
72
def get_fix_data(verbose=True):
    """
    Returns X, y for the gaze regression task with EEG data X only from fixations
    """
Lukas Wolf's avatar
Lukas Wolf committed
73
74
75
76
77
78
79

    # Define these variables that are needed to load the fixation data (old dataset, processing speed?)
    config['trainX_file'] = 'EEGdata-002.mat'
    config['trainY_file'] = 'label.mat'
    config['trainX_variable'] = 'EEGdata'
    config['trainY_variable'] = 'label'

Lukas's avatar
Lukas committed
80
81
82
    # Load the labels 
    y = scipy.io.loadmat(config['data_dir'] + config['trainY_variable'])
    labels = y['label'] # shape (85413, 1) for label.mat
83

Lukas's avatar
Lukas committed
84
85
86
    # Load the EEG data 
    f = h5py.File(config['data_dir'] + config['trainX_file'], 'r')
    X = np.array(f[config['trainX_variable']]).T
87

Lukas's avatar
Lukas committed
88
    # Compute how much of the data we want to preprocess and return 
Lukas Wolf's avatar
Lukas Wolf committed
89
    #testsize = int(config['data-fraction'] * len(X))
90

Lukas's avatar
Lukas committed
91
92
93
    x_list = []
    y_list = []
    # Run through the data
94
    #for i in tqdm(range(testsize), desc='Loading regression data'):
Lukas Wolf's avatar
Lukas Wolf committed
95
    for i in range(len(X)):
Lukas's avatar
Lukas committed
96
97
98
99
        # Read the datapoint from X and check how long the duration is 
        ref = X[i][0]
        x_datapoint = np.array(f[ref])
        x_len, y_len = x_datapoint.shape # x is number of time samples (2*x in ms is time), y_len=129 channels
100

Lukas's avatar
Lukas committed
101
        # Check whether the point fits the desired range 
102
        if x_len < config['min_fixation'] or x_len > config['max_fixation']:
Lukas's avatar
Lukas committed
103
            continue 
104

Lukas's avatar
Lukas committed
105
106
107
108
        # Create the 2D regression label
        label = labels[i]
        y_datapoint = np.array([label[0][1][0][0], label[0][2][0][0]])

Lukas Wolf's avatar
Lukas Wolf committed
109
        # Pad the data
110
        padding_size = config['max_fixation'] - x_len
Lukas's avatar
Lukas committed
111
        if config['padding'] == 'zero':
Lukas's avatar
Lukas committed
112
            x_datapoint = np.pad(x_datapoint, pad_width=((0,padding_size),(0,0)))#.flatten()
Lukas's avatar
Lukas committed
113
        elif config['padding'] == 'repeat':
Lukas's avatar
Lukas committed
114
            x_datapoint = np.pad(x_datapoint, pad_width=((0,padding_size),(0,0)), mode='reflect')#.flatten()
Lukas's avatar
Lukas committed
115
116
117
        else:
            raise Exception("Choose a valid padding scheme in config.py")

Lukas Wolf's avatar
Lukas Wolf committed
118
119
        x_list.append(x_datapoint)
        y_list.append(y_datapoint)
Lukas's avatar
Lukas committed
120
    X = np.asarray(x_list)
121
122
123
124
    # Reshape data and normalize it 
    norm = np.linalg.norm(X)
    X = X / norm 
    
Lukas's avatar
Lukas committed
125
    y = np.asarray(y_list)
Lukas's avatar
Lukas committed
126
127
128
129
130
131
132

    if verbose:
        logging.info("y training loaded.")
        logging.info(y.shape)
        logging.info("X training loaded.")
        logging.info(X.shape)

Lukas Wolf's avatar
Lukas Wolf committed
133
134
135
    # Save the precomputed data for future usage
    #np.save("./data/precomputed/fix_only_X", X)
    #np.save("./data/precomputed/fix_only_y", y)
Lukas's avatar
Lukas committed
136

137
    return X, y
138
139


Lukas's avatar
Lukas committed
140
def get_sacc_data(verbose=True):
141
142
143
    """
    Returns X, y for the gaze regression task with EEG data X only from saccades
    """
Lukas's avatar
Lukas committed
144
145
146
147
148
    L_saccade = 'L_saccade'
    R_saccade = 'R_saccade'
    event_names = [L_saccade, R_saccade]

    # Loop over all directories in /data/full_data and extract and concat the events from all people
Lukas Wolf's avatar
Lukas Wolf committed
149
    rootdir = './data/processing_speed_task_full_data' # modify it if necessary 
Lukas's avatar
Lukas committed
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174

    x_list = []
    y_list = []

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            # Get the correct path to the current file
            path = os.path.join(subdir, file)
            events = load_sEEG_events(path) # access event i via events[i]
            data = load_sEEG_data(path)
            
            # Now, depending on the mode, extract the data and create the EEG data matrix and labels
            for i in range(len(events)):
                event = events[i]
                if event[0][0] not in event_names: # dereference the event name, e.g. 'L_saccade'
                    continue
                
                start_time = int(event[1])
                end_time = int(event[4])

                # extract the EEG data from sEEG.data
                x_datapoint = np.array(data[start_time:end_time])
                x_len, y_len = x_datapoint.shape
                
                # Pad the saccade only data, currently pad all to length 100
175
                if x_len < config['min_saccade'] or x_len > config['max_saccade']:
Lukas's avatar
Lukas committed
176
177
                    continue 
                
178
                padding_size = config['max_saccade'] - x_len
Lukas's avatar
Lukas committed
179
180
181
182
183
184
185
                if config['padding'] == 'zero':
                    x_datapoint = np.pad(x_datapoint, pad_width=((0,padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    x_datapoint = np.pad(x_datapoint, pad_width=((0,padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

186
                # Extract the label as saccade endposition
187
188
                sac_x_end = float(event[6]) # dereference for the double value of the x coordinate
                sac_y_end = float(event[7])
Lukas's avatar
Lukas committed
189
190
191
                y_datapoint = np.array([sac_x_end, sac_y_end])

                # Append to X and y 
192
193
                x_list.append(x_datapoint)
                y_list.append(y_datapoint)
Lukas's avatar
Lukas committed
194
195

    X = np.asarray(x_list)
196
197
198
199
200
    X = X[:,:,:129]  # Cut off the last 4 columns (time, x, y, pupil size)
    # Normalize the data
    norm = np.linalg.norm(X)
    X = X / norm 

Lukas's avatar
Lukas committed
201
202
203
204
205
206
207
208
    y = np.asarray(y_list)

    if verbose:
        logging.info("y training loaded.")
        logging.info(y.shape)
        logging.info("X training loaded.")
        logging.info(X.shape)

Lukas Wolf's avatar
Lukas Wolf committed
209
210
211
    # Save the precomputed data for future usage
    #np.save("./data/precomputed/sacc_only_X", X)
    #np.save("./data/precomputed/sacc_only_y", y)
Lukas's avatar
Lukas committed
212

213
    return X, y
Lukas's avatar
Lukas committed
214

215
def get_sacc_fix_data(verbose=True):
216
217
218
    """
    Returns X, y for the gaze regression task with EEG data X from saccade-fixation combinations
    """
219
220
221
222
223
224
225
226
227
228
229
    L_saccade = 'L_saccade'
    L_fixation = 'L_fixation'
    #L_blink = 'L_blink'
    R_saccade = 'R_saccade'
    R_fixation = 'R_fixation'
    #R_blink = 'R_blink'

    fixations = [L_fixation, R_fixation]
    saccades = [L_saccade, R_saccade]

    # Loop over all directories in /data/full_data and extract and concat the events from all people
Lukas Wolf's avatar
Lukas Wolf committed
230
    rootdir = './data/processing_speed_task_full_data' # modify it if necessary 
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265

    x_list = []
    y_list = []

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            # Get the correct path to the current file
            path = os.path.join(subdir, file)
            events = load_sEEG_events(path) # access event i via events[i]
            data = load_sEEG_data(path)
            
            # Extract X and y from sEEG.data and sEEG.events
            for i in range(len(events)):
                event = events[i]
                event_name = event[0][0] # dereference the event name, e.g. 'L_saccade' 
                if event_name not in saccades:
                    continue # we first want a saccade 
                
                # We have a saccade, check if the next one is a fixation and no blink inbetween
                follow_event = events[i + 1]
                follow_event_name = follow_event[0][0]
                if follow_event_name not in fixations:
                    continue

                # Now we really have a saccade followed by a fixation 
                saccade = event
                fixation = follow_event

                # Pad the saccade
                saccade_start_time = int(saccade[1])
                saccade_end_time = int(saccade[4])  

                saccade_datapoint = np.array(data[saccade_start_time:saccade_end_time])
                x_len_sac, y_len_sac = saccade_datapoint.shape

266
                if x_len_sac < config['min_saccade'] or x_len_sac > config['max_saccade']:
267
268
                    continue

269
                saccade_padding_size = config['max_saccade'] - x_len_sac
270
271
272
273
274
275
276
277
278
279
280
281
282
283
                if config['padding'] == 'zero':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Pad the fixation as usual 
                fixation_start_time = int(fixation[1])
                fixation_end_time = int(fixation[4])
                
                fixation_datapoint = np.array(data[fixation_start_time:fixation_end_time])
                x_len_fix, y_len_fix = fixation_datapoint.shape

284
                if x_len_fix < config['min_fixation'] or x_len_fix > config['max_fixation']:
285
286
287
                    continue 

                # Pad the data
288
                fixation_padding_size = config['max_fixation'] - x_len_fix
289
290
291
292
293
294
295
296
297
298
299
                if config['padding'] == 'zero':
                    fixation_datapoint = np.pad(fixation_datapoint, pad_width=((0,fixation_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    fixation_datapoint = np.pad(fixation_datapoint, pad_width=((0,fixation_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Stack the saccade and fixation as one datapoint 
                x_datapoint = np.concatenate((saccade_datapoint, fixation_datapoint), axis=0)

                # Get the label as fixation average position 
300
301
                fix_avg_x = float(fixation[11])
                fix_avg_y = float(fixation[12])
302
303
304
                y_datapoint = np.array([fix_avg_x, fix_avg_y])

                # Append to X and y 
Lukas Wolf's avatar
Lukas Wolf committed
305
306
                x_list.append(x_datapoint)
                y_list.append(y_datapoint)
Lukas Wolf's avatar
bug fix    
Lukas Wolf committed
307
    
308
    X = np.asarray(x_list)
309
310
    X = X[:,:,:129]  # Cut off the last 4 columns (time, x, y, pupil size)
    # Normalize the data
Lukas Wolf's avatar
Lukas Wolf committed
311
312
    #norm = np.linalg.norm(X)
    #X = X / norm 
313

314
315
316
317
318
319
320
321
    y = np.asarray(y_list)

    if verbose:
        logging.info("y training loaded.")
        logging.info(y.shape)
        logging.info("X training loaded.")
        logging.info(X.shape)

Lukas Wolf's avatar
Lukas Wolf committed
322
323
324
    # Save the precomputed data for future usage
    #np.save("./data/precomputed/sacc_fix_X", X)
    #np.save("./data/precomputed/sacc_fix_y", y)
325

326
    return X, y
327
328


329
def get_fix_sacc_fix_data(verbose=True):
330
331
332
    """
    Returns X, y for the gaze regression task with EEG data X only from triplets of fixation-saccade-fixation
    """
333
334
335
336
337
338
339
340
341
342
343
    L_saccade = 'L_saccade'
    L_fixation = 'L_fixation'
    #L_blink = 'L_blink'
    R_saccade = 'R_saccade'
    R_fixation = 'R_fixation'
    #R_blink = 'R_blink'

    fixations = [L_fixation, R_fixation]
    saccades = [L_saccade, R_saccade]

    # Loop over all directories in /data/full_data and extract and concat the events from all people
Lukas Wolf's avatar
Lukas Wolf committed
344
    if config['data_mode'] == 'fix_sacc_fix':
Lukas Wolf's avatar
debug    
Lukas Wolf committed
345
        rootdir = './data/processing_speed_task' # modify it if necessary 
Lukas Wolf's avatar
Lukas Wolf committed
346
347
348
349
    elif config['data_mode'] == 'calib_task_fix_sacc_fix':
        rootdir = './data/calibration_task'
    else:
        raise Exception("Data mode not valid with current data loader")
Lukas Wolf's avatar
Lukas Wolf committed
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465

    x_list = []
    y_list = []

    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            # Get the correct path to the current file
            path = os.path.join(subdir, file)
            events = load_sEEG_events(path) # access event i via events[i]
            data = load_sEEG_data(path)

            # We now look for fixation-sacade-fixation triplets without blinks or other actions inbetween
            # Extract X and y from sEEG.data and sEEG.events
            for i in range(len(events)):
                first_event = events[i]
                first_event_name = first_event[0][0] # dereference the event name, e.g. 'L_saccade' 
                if first_event_name not in fixations:
                    continue # we first want a saccade 
                        
                # We found a fixation as first event, now look for a following saccade
                second_event = events[i + 1]
                second_event_name = second_event[0][0]
                if second_event_name not in saccades:
                    continue
                
                # We found a following saccade, now look for a following fixation again 
                third_event = events[i + 2]
                third_event_name = third_event[0][0]
                if third_event_name not in fixations:
                    continue

                # We finally found a triplet (segment) without interferences of blinks or pushing buttons
                fixation1 = first_event
                saccade = second_event
                fixation2 = third_event

                # Pad the first fixation
                fixation1_start_time = int(fixation1[1])
                fixation1_end_time = int(fixation1[4])
                        
                fixation1_datapoint = np.array(data[fixation1_start_time:fixation1_end_time])
                x_len_fix1, y_len_fix1 = fixation1_datapoint.shape

                if x_len_fix1 < config['min_fixation'] or x_len_fix1 > config['max_fixation']:
                    continue 

                fixation1_padding_size = config['max_fixation'] - x_len_fix1
                if config['padding'] == 'zero':
                    fixation1_datapoint = np.pad(fixation1_datapoint, pad_width=((0,fixation1_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    fixation1_datapoint = np.pad(fixation1_datapoint, pad_width=((0,fixation1_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Pad the saccade
                saccade_start_time = int(saccade[1])
                saccade_end_time = int(saccade[4])  

                saccade_datapoint = np.array(data[saccade_start_time:saccade_end_time])
                x_len_sac, y_len_sac = saccade_datapoint.shape

                if x_len_sac < config['min_saccade'] or x_len_sac > config['max_saccade']:
                    continue

                saccade_padding_size = config['max_saccade'] - x_len_sac
                if config['padding'] == 'zero':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Pad the second fixation 
                fixation2_start_time = int(fixation2[1])
                fixation2_end_time = int(fixation2[4])
                        
                fixation2_datapoint = np.array(data[fixation2_start_time:fixation2_end_time])
                x_len_fix2, y_len_fix2 = fixation2_datapoint.shape

                if x_len_fix2 < config['min_fixation'] or x_len_fix2 > config['max_fixation']:
                    continue 

                fixation2_padding_size = config['max_fixation'] - x_len_fix2
                if config['padding'] == 'zero':
                    fixation2_datapoint = np.pad(fixation2_datapoint, pad_width=((0,fixation2_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    fixation2_datapoint = np.pad(fixation2_datapoint, pad_width=((0,fixation2_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Stack the fixation, saccade and fixation as one datapoint 
                x_datapoint = np.concatenate((fixation1_datapoint, saccade_datapoint), axis=0)
                x_datapoint = np.concatenate((x_datapoint, fixation2_datapoint), axis=0)

                # Extract the difference vector of labels of the two fixations
                fix1_avg_x = float(fixation1[11])
                fix1_avg_y = float(fixation1[12])
                fix2_avg_x = float(fixation2[11])
                fix2_avg_y = float(fixation2[12])

                # Compute difference of the fixation coordinates
                dx = fix2_avg_x - fix1_avg_x
                dy = fix2_avg_y - fix1_avg_y

                # Compute polar coordinates of the difference vector
                rho, phi = cart2pol(dx, dy)
                #y_datapoint = np.array([rho, phi])
                #y_datapoint = np.array([dx, dy])

                y_datapoint = np.array([phi])

                # Append to X and y 
                x_list.append(x_datapoint)
                y_list.append(y_datapoint)

    X = np.asarray(x_list)
Lukas's avatar
debug    
Lukas committed
466
    #X = X[:,:,:129]  # Cut off the last 4 columns (time, x, y, pupil size)
Lukas Wolf's avatar
Lukas Wolf committed
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
    # Normalize the data
    norm = np.linalg.norm(X)
    X = X / norm 

    y = np.asarray(y_list)

    if verbose:
        logging.info("y training loaded.")
        logging.info(y.shape)
        logging.info("X training loaded.")
        logging.info(X.shape)

    # Save the precomputed data for future usage
    #np.save("./data/precomputed/fix_sacc_fix_X", X)
    #np.save("./data/precomputed/fix_sacc_fix_y", y)

    return X, y

def get_calibration_task_fix_sacc_fix_data(verbose=True):
    """
    Returns X, y for the gaze regression task, specifically the calibration task dataset, with EEG data X only from triplets of fixation-saccade-fixation
    """
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
    #import hashlib

    # somehow the string comparisons didn't work properly because of the mixed numbers and string as event names, therefore I hash the string and compare them 

    # Event names
    L_saccade = 'L_saccade' #hashlib.md5(str('L_saccade').encode()).hexdigest() 
    L_fixation = 'L_fixation' #hashlib.md5(str('L_fixation').encode()).hexdigest() 
    L_blink = 'L_blink' #hashlib.md5(str('L_blink').encode()).hexdigest() 
    R_saccade = 'R_saccade' #hashlib.md5(str('R_saccade').encode()).hexdigest() 
    R_fixation = 'R_fixation' #hashlib.md5(str('R_fixation').encode()).hexdigest() 
    R_blink = 'R_blink' #hashlib.md5(str('R_blink').encode()).hexdigest() 

    blockOn = 55 #.md5(str('55').encode()).hexdigest() 
    blockOff = 56 #hashlib.md5(str('56').encode()).hexdigest() 
    stimulusOff = 41 #hashlib.md5(str('L_saccade').encode()).hexdigest() 
    dots_start = [12, 13, 14, 15, 16, 17]
    #dots_start = [hashlib.md5(str('12').encode()).hexdigest(), hashlib.md5(str('13').encode()).hexdigest(), hashlib.md5(str('14').encode()).hexdigest(),
                    #hashlib.md5(str('15').encode()).hexdigest(), hashlib.md5(str('16').encode()).hexdigest(), hashlib.md5(str('17').encode()).hexdigest()]

    # Define lists of event names for easier search 
Lukas Wolf's avatar
Lukas Wolf committed
509
510
511
    fixations = [L_fixation, R_fixation]
    saccades = [L_saccade, R_saccade]
    blinks = [L_blink, R_blink]
512
513
514
    non_stimulus_strings = fixations + saccades + blinks 
    non_stimulus_ints = [blockOff, blockOn, stimulusOff] + dots_start

515
516
    #print(non_stimulus_ints)
    #print(non_stimulus_strings)
517
518
519
520
521
522
523
524
525
526
527
    #print(dots_start)

    # Threshold for the distance between fixation_avg_pos and the expected label
    THRESH_DIFF = 30

    # Compute the positions of the five blocks, are returned as [tarpos, tarposback, tarposflip, tarposflipback, tarpos]
    block_list = get_block_targets()
    # Intialize current target and block 
    current_target_list = block_list[0] # extract the first target list [tar1, tar2, ..., tar27]
    current_target = current_target_list[0] # extract the first expected target [x, y]
    next_target = current_target_list[1] # this would be the target for the next fixation 
Lukas Wolf's avatar
Lukas Wolf committed
528
529
530

    # Loop over all directories and extract and concat the events from all people
    rootdir = './data/calibration_task' # modify it if necessary 
531
532
533
534

    x_list = []
    y_list = []

535
536
537
    # Start searching for events
    isBlock = True

538
539
540
541
542
543
544
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            # Get the correct path to the current file
            path = os.path.join(subdir, file)
            events = load_sEEG_events(path) # access event i via events[i]
            data = load_sEEG_data(path)

545
546
547
548
549
550
551
552
553
            # Reset target and block counter for the new file
            block_cnt = -1 # index for the block we take from the target list
            target_cnt = -1 # index for the current target in the current block

            """
            We now look for fixation-sacade-fixation triplets without blinks or other actions inbetween
            For that matter we look for [fixation, stimulusOff, newStimulus, saccade, fixation] tuples, 
            where both fixations are within a thresholded distance of the expected coordinates
            """
554
            for i in range(len(events)):
555
556
557
558
559
560
                
                # Read the current event
                event_1 = events[i]
                event_1_name = event_1[0][0] # dereference the event name, e.g. 'L_saccade' 
                #event_1_name = hashlib.md5(str(event_1_name).encode()).hexdigest() 

561
                #print("{}".format(i) + ": " + event_1_name)
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620

                try:
                    num = int(event_1_name)
                    isNumeric = True
                except:
                    isNumeric = False 
                #print("is numeric: {}".format(isNumeric))

                # Check wheter the file starts
                if isNumeric:
                    if  int(event_1_name) in dots_start:
                        #print("dots start")
                        continue # this was the start of the DOTS file

                    if int(event_1_name) == stimulusOff:
                        #print("found 41 stimulus off")
                        continue 

                    if int(event_1_name) == blockOff:
                        #print("block off")
                        isBlock = False # Block ends, therefore do not read any of the next events 
                        continue

                    if int(event_1_name) == blockOn:
                        #print("block on")
                        isBlock = True # start looking for events again  
                        block_cnt += 1 # increment to next block
                        current_target_list = block_list[block_cnt] # get the target list of the next block
                        target_cnt = -1 # reset the target pointer to first element of new list
                        continue

                # Ignore events while we are outside of a block 
                if not isBlock:
                    #("not within block")
                    continue
                
                # Consume the new stimulus signal and increment the target 
                if isNumeric:
                    if event_1_name not in non_stimulus_ints:
                        #print("consume stimulus")
                        if target_cnt <= 26:
                            target_cnt += 1
                            current_target = current_target_list[target_cnt]
                        if target_cnt <= 25: # otherwise we are at the end of the current block and will change targets after block ends 
                            next_target = current_target_list[target_cnt+1]
                        continue 

                # From here on it must be a fixation first, therefore if the current event is numeric, we can continue
                if isNumeric:
                    continue
                
                # From here on we look for the quintuple

                if event_1_name not in fixations:
                    #("first event wasn't a fixation")
                    continue # we first want a fixation 


                #print("look for stimulusoff trigger")
621
                        
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
                # We found a fixation as first event, now look for the current trigger to end
                event_2 = events[i + 1]
                event_2_name = event_2[0][0]
                try:
                    if int(event_2_name) != stimulusOff:
                        continue
                except:
                    continue

                #print("look for next target signal")

                # We found the stimulusOff signal, now look for the next target signal
                event_3 = events[i + 2]
                event_3_name = event_3[0][0]
                try:
                    # if valid, then it should be an int, therefore try this first
                    if int(event_3_name) in non_stimulus_ints:
                        continue
                except:
                    #print("wasn't an int")
                    pass

                if event_3_name in non_stimulus_strings:
645
646
                    continue
                
647
648
649
650
651
652
653
654
655
656
657
658
659
660
                
                #print("look for next saccade")


                # We found the next stimuli, now look for the saccade 
                event_4 = events[i + 3]
                event_4_name = event_4[0][0]
                if event_4_name not in saccades:
                    continue


                #print("look for next second fixation")


661
                # We found a following saccade, now look for a following fixation again 
662
663
664
                event_5 = events[i + 4]
                event_5_name = event_5[0][0]
                if event_5_name not in fixations:
665
                    continue
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
                
                #print("Found a quintuple")
                # We found a quintuple, extract the triplet and check if it is good enough
                fixation1 = event_1
                saccade = event_4
                fixation2 = event_5

                # Check the two fixations for distance to targets
                # Extract the labels of the two fixations
                fix1_avg_x = float(fixation1[11])
                fix1_avg_y = float(fixation1[12])
                fix2_avg_x = float(fixation2[11])
                fix2_avg_y = float(fixation2[12])
                # Create the coordinates
                fix1_coord = np.array([fix1_avg_x, fix1_avg_y])
                fix2_coord = np.array([fix2_avg_x, fix2_avg_y])
682

683
684
685
686
687
688
689
                #print("Current target: {} - fix1: {}".format(current_target, fix1_coord))
                #print("Next target: {} - fix2: {}".format(next_target, fix2_coord))

                # Check if both fixations are close to the expected targets
                if not euclidian_dist(fix1_coord, np.array(current_target)) <= THRESH_DIFF or not euclidian_dist(fix2_coord, np.array(next_target)) <= THRESH_DIFF:
                    #print("wasn't good enough, distances are: {} and {}".format(euclidian_dist(fix1_coord, np.array(current_target)), euclidian_dist(fix2_coord, np.array(next_target))))
                    continue 
690
691
692
693
694
695
696
697

                # Pad the first fixation
                fixation1_start_time = int(fixation1[1])
                fixation1_end_time = int(fixation1[4])
                        
                fixation1_datapoint = np.array(data[fixation1_start_time:fixation1_end_time])
                x_len_fix1, y_len_fix1 = fixation1_datapoint.shape

698
                if x_len_fix1 < config['min_fixation'] or x_len_fix1 > config['max_fixation']: # in this task there is no upper limit on fixation length 
699
                    #print("fixation1 not sufficient: {}".format(x_len_fix1))
700
701
                    continue 

702
                fixation1_padding_size = config['max_fixation'] - x_len_fix1
703
704
705
706
707
708
709
710
711
712
713
714
715
716
                if config['padding'] == 'zero':
                    fixation1_datapoint = np.pad(fixation1_datapoint, pad_width=((0,fixation1_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    fixation1_datapoint = np.pad(fixation1_datapoint, pad_width=((0,fixation1_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Pad the saccade
                saccade_start_time = int(saccade[1])
                saccade_end_time = int(saccade[4])  

                saccade_datapoint = np.array(data[saccade_start_time:saccade_end_time])
                x_len_sac, y_len_sac = saccade_datapoint.shape

717
                if x_len_sac < config['min_saccade'] or x_len_sac > config['max_saccade']:
718
                    #print("Saccade not sufficient: {}".format(x_len_sac))
719
720
                    continue

721
                saccade_padding_size = config['max_saccade'] - x_len_sac
722
723
724
725
726
727
728
729
730
731
732
733
734
735
                if config['padding'] == 'zero':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    saccade_datapoint = np.pad(saccade_datapoint, pad_width=((0,saccade_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Pad the second fixation 
                fixation2_start_time = int(fixation2[1])
                fixation2_end_time = int(fixation2[4])
                        
                fixation2_datapoint = np.array(data[fixation2_start_time:fixation2_end_time])
                x_len_fix2, y_len_fix2 = fixation2_datapoint.shape

736
                if x_len_fix2 < config['min_fixation'] or x_len_fix2 > config['max_fixation']: # no upper bound on fixation length
737
                    #print("fixation2 not sufficient: {}".format(x_len_fix2))
738
739
                    continue 

740
                fixation2_padding_size = config['max_fixation'] - x_len_fix2
741
742
743
744
745
746
747
748
749
750
751
                if config['padding'] == 'zero':
                    fixation2_datapoint = np.pad(fixation2_datapoint, pad_width=((0,fixation2_padding_size),(0,0)))
                elif config['padding'] == 'repeat':
                    fixation2_datapoint = np.pad(fixation2_datapoint, pad_width=((0,fixation2_padding_size),(0,0)), mode='reflect')
                else:
                    raise Exception("Choose a valid padding scheme in config.py")

                # Stack the fixation, saccade and fixation as one datapoint 
                x_datapoint = np.concatenate((fixation1_datapoint, saccade_datapoint), axis=0)
                x_datapoint = np.concatenate((x_datapoint, fixation2_datapoint), axis=0)

752
                #print(x_datapoint.shape)
753
754

                # Compute difference of the fixation coordinates
755
756
                dx = fix2_avg_x - fix1_avg_x
                dy = fix2_avg_y - fix1_avg_y
757
758

                # Compute polar coordinates of the difference vector
Lukas Wolf's avatar
Lukas Wolf committed
759
                rho, phi = cart2pol(dx, dy)
760
761
                #y_datapoint = np.array([rho, phi])
                #y_datapoint = np.array([dx, dy])
762

763
                y_datapoint = np.array([phi])
764
765

                # Append to X and y 
766
767
                x_list.append(x_datapoint)
                y_list.append(y_datapoint)
768
769
                #print("-----------------------------------------------WAS GOOD ENOUGH")
                #print("we added a sample")
770
771

    X = np.asarray(x_list)
772
    #print(X.shape)
773
    X = X[:,:,:129]  # Cut off the last 4 columns (time, x, y, pupil size)
774
    # Normalize the data
775
776
    #norm = np.linalg.norm(X)
    #X = X / norm 
777

778
779
780
781
782
783
784
785
    y = np.asarray(y_list)

    if verbose:
        logging.info("y training loaded.")
        logging.info(y.shape)
        logging.info("X training loaded.")
        logging.info(X.shape)

Lukas Wolf's avatar
Lukas Wolf committed
786
    # Save the precomputed data for future usage
787
788
    np.save("./data/precomputed/calibration_task/fix_sacc_fix_X", X)
    np.save("./data/precomputed/calibration_task/fix_sacc_fix_y", y)
789

790
    return X, y
791

Lukas's avatar
Lukas committed
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817

def load_sEEG_events(abs_dir_path):
    """
    Extracts the sEEG.event section of a participants mat file 
    Returns the events as a numpy array, accessible event after event (time series)
    Filters out everything else, like participants pushing buttons 
    """
    f = scipy.io.loadmat(abs_dir_path)
    sEEG = f['sEEG']
    df = pd.DataFrame(sEEG[0])
    events = df['event'][0][0] # dereferenced to obtain the fixation, saccade, blinks, ... 
    #print("Events shape: {}".format(events.shape))
    return events # access the i-th event via events[i]


def load_sEEG_data(abs_dir_path):
    """
    Returns the 133 channels of a participant
    129 EEG channels plus 4 (time, x, y and pupil size)
    Returns the data as a numpy array, accessible via time as first coefficient 
    """
    f = scipy.io.loadmat(abs_dir_path)
    sEEG = f['sEEG']
    df = pd.DataFrame(sEEG[0])
    data = df['data'][0].T # transpose to access time series 
    #print("EEG data shape: {}".format(data.shape))
Lukas Wolf's avatar
Lukas Wolf committed
818
819
820
    return data # access the i-th recorded sample via data[i], recordings at 2ms intervals


821
822
823
824
825
826
827
def euclidian_dist(x, y):
    """
    Returns the euclidian distance between x and y
    """
    return np.linalg.norm(np.array(x) - np.array(y))


Lukas Wolf's avatar
Lukas Wolf committed
828
829
830
831
832
833
834
835
def cart2pol(x, y):
    """
    Transform cartesian to polar coordinates
    """
    rho = np.sqrt(x**2 + y**2)
    phi = np.arctan2(y, x)
    return rho, phi

836

Lukas Wolf's avatar
Lukas Wolf committed
837
838
839
840
841
842
843
def pol2cart(rho, phi):
    """
    Transform polar to cartesian coordinates
    """
    x = rho * np.cos(phi)
    y = rho * np.sin(phi)
    return x, y