flocklab_cleaner.py 13.1 KB
Newer Older
1
2
#! /usr/bin/env python3

3
import sys, os, getopt, errno, traceback, logging, time, __main__, shutil, glob, datetime, subprocess, signal
4
import lib.flocklab as flocklab
5
import flocklab_scheduler as scheduler
6
7
8
9
10
11
12
13


##############################################################################
#
# Usage
#
##############################################################################
def usage():
Reto Da Forno's avatar
Reto Da Forno committed
14
    print("Usage: %s [--debug] [--help]" % __file__)
Reto Da Forno's avatar
Reto Da Forno committed
15
16
17
    print("Options:")
    print("  --debug\t\t\tOptional. Print debug messages to log.")
    print("  --help\t\t\tOptional. Print this help.")
18
19
20
21
22
23
24
25
26
27
### END usage()


##############################################################################
#
# Main
#
##############################################################################
def main(argv):

Reto Da Forno's avatar
Reto Da Forno committed
28
    # Get logger:
Reto Da Forno's avatar
Reto Da Forno committed
29
    logger = flocklab.get_logger()
Reto Da Forno's avatar
Reto Da Forno committed
30
31
    
    # Get config ---
32
    flocklab.load_config()
Reto Da Forno's avatar
Reto Da Forno committed
33
34
35
36
37
38
39
40
41
42
43
    
    # Get the arguments:
    try:
        opts, args = getopt.getopt(argv, "dh", ["debug", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        logger.warn(str(err))
        usage()
        sys.exit(errno.EINVAL)
    except:
        msg = "Error when getting arguments: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1]))
Reto Da Forno's avatar
Reto Da Forno committed
44
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
45
46
47
48
49
50

    for opt, arg in opts:
        if opt in ("-d", "--debug"):
            logger.setLevel(logging.DEBUG)
        elif opt in ("-h", "--help"):
            usage()
51
            sys.exit(flocklab.SUCCESS)
Reto Da Forno's avatar
Reto Da Forno committed
52
53
54
55
56
        else:
            logger.warn("Wrong API usage")
            sys.exit(errno.EINVAL)
    
    # Allow only x instances ---
Reto Da Forno's avatar
Reto Da Forno committed
57
    rs = flocklab.count_running_instances(__file__)
Reto Da Forno's avatar
Reto Da Forno committed
58
    if (rs >= 0):
Reto Da Forno's avatar
Reto Da Forno committed
59
        maxinscount = flocklab.config.getint('cleaner', 'max_instances')
Reto Da Forno's avatar
Reto Da Forno committed
60
        if rs > maxinscount:
Reto Da Forno's avatar
Reto Da Forno committed
61
62
            msg = "Maximum number of instances (%d) for script %s with currently %d instances running exceeded. Aborting..." % (maxinscount, __file__, rs)
            flocklab.error_logandexit(msg, errno.EUSERS)
Reto Da Forno's avatar
Reto Da Forno committed
63
    else:
Reto Da Forno's avatar
Reto Da Forno committed
64
65
        msg = "Error when trying to count running instances of %s. Function returned with %d" % (__file__, rs)
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
66
67
68
    
    # Connect to the database ---
    try:
Reto Da Forno's avatar
Reto Da Forno committed
69
        (cn, cur) = flocklab.connect_to_db()
Reto Da Forno's avatar
Reto Da Forno committed
70
71
    except:
        msg = "Could not connect to database"
Reto Da Forno's avatar
Reto Da Forno committed
72
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
73
74
75
76
77
78
79
80
81
82
    
    # Check for running tests ---
    testisrunning = flocklab.is_test_running(cur)
    
    # Check for work ---
    if testisrunning:
        logger.debug("A test is running, thus exiting...")
    else:
        try:
            # Check for tests to delete ---
83
84
85
86
            sql = """SELECT `serv_tests_key`, `time_start_wish`
                     FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'todelete')
                  """
Reto Da Forno's avatar
Reto Da Forno committed
87
            if ( cur.execute(sql) <= 0 ):
88
                logger.debug("No tests found which are marked to be deleted.")
Reto Da Forno's avatar
Reto Da Forno committed
89
90
91
92
            else:
                rs = cur.fetchall()
                for (testid, starttime) in rs:
                    testid = str(testid)
93
                    logger.debug("Found test ID %s to delete." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
94
95
96
                    # If a test is to be deleted which has not run yet, delete it completely. Otherwise, keep the metadata of the test for statistics:
                    if (starttime > datetime.datetime.today()):
                        delete_all = True
97
                        logger.debug("Test ID %s did not run yet, thus all data (including the test metadata) will be deleted." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
98
99
100
                    else:
                        delete_all = False 
                    # Clean through all relevant tables ---
Reto Da Forno's avatar
Reto Da Forno committed
101
                    relevant_tables = []
Reto Da Forno's avatar
Reto Da Forno committed
102
103
104
                    if delete_all:
                        relevant_tables.append('tbl_serv_map_test_observer_targetimages')
                    for table in relevant_tables:
105
106
107
                        sql = """DELETE FROM %s
                                 WHERE (`test_fk` = %s)
                              """
Reto Da Forno's avatar
Reto Da Forno committed
108
109
110
                        starttime = time.time()
                        num_deleted_rows = cur.execute(sql%(table, testid))
                        cn.commit()
111
                        logger.debug("Deleted %i rows of data in table %s for test ID %s in %f seconds" % (num_deleted_rows, table, testid, (time.time()-starttime)))
Reto Da Forno's avatar
Reto Da Forno committed
112
113
                    
                    # Delete cached test results ---
114
115
                    archive_path = "%s/%s%s" % (flocklab.config.get('archiver','archive_dir'), testid, flocklab.config.get('archiver','archive_ext'))
                    viz_pathes = glob.glob("%s/%s_*" % (flocklab.config.get('viz','imgdir'), testid))
Reto Da Forno's avatar
Reto Da Forno committed
116
117
118
119
120
121
122
123
124
125
126
127
128
                    pathes = [archive_path]
                    pathes.extend(viz_pathes)
                    for path in pathes:
                        if os.path.exists(path):
                            if os.path.isfile(path):
                                os.remove(path)
                            else:
                                shutil.rmtree(path)
                            logger.debug("Removed path %s for test %s."%(path, testid))
                            
                    # Delete test itself ---
                    if delete_all:
                        # Delete test itself:
129
130
131
                        sql =    """DELETE FROM `tbl_serv_tests` 
                                    WHERE (`serv_tests_key` = %s)
                                 """
Reto Da Forno's avatar
Reto Da Forno committed
132
133
134
135
136
137
138
139
140
141
                        starttime = time.time()
                        num_deleted_rows = cur.execute(sql%(testid))
                        cn.commit()
                        logger.debug("Deleted %i rows of data in table tbl_serv_tests for test ID %s in %f seconds" %(num_deleted_rows, testid, (time.time()-starttime)))
                    else:
                        # Set test status to deleted but keep metadata ---
                        flocklab.set_test_status(cur, cn, int(testid), "deleted")
                        logger.debug("Set status for test ID %s to 'deleted'" %(testid))
                    
            # Delete old entries in viz cache ---
Reto Da Forno's avatar
Reto Da Forno committed
142
            keeptime = flocklab.config.getint('cleaner', 'keeptime_viz')
Reto Da Forno's avatar
Reto Da Forno committed
143
            earliest_keeptime = time.time() - (keeptime*86400)
Reto Da Forno's avatar
Reto Da Forno committed
144
            imgdir_path = flocklab.config.get('viz','imgdir')
145
146
147
148
149
150
151
152
153
            if os.path.isdir(imgdir_path):
                for f in os.listdir(imgdir_path):
                    path = os.path.join(imgdir_path, f)
                    if os.stat(path).st_mtime < earliest_keeptime:
                        logger.debug("Removing viz cache %s..."%path)
                        shutil.rmtree(path)
            else:
                logger.debug("Directory '%s' does not exist." % imgdir_path)
              
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
            # Get parameters ---
            now = time.strftime(flocklab.config.get("database", "timeformat"), time.gmtime())
            maxtestcleanuptime = flocklab.config.getint('cleaner', 'max_test_cleanuptime')
            
            # Check for tests that are still running, but should have been stopped ---
            sql = """SELECT `serv_tests_key`, `test_status` FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'running') AND (`time_end_wish` <= '%s') AND (`dispatched` = 0)
                  """
            cur.execute(sql % (now))
            rs = cur.fetchall()
            if rs:
                # start process for each test which has to be stopped
                for test in rs:
                    testid = int(test[0])
                    logger.debug("Call process to stop test %d (status: %s)." %  (testid, test[1]))
                    p = multiprocessing.Process(target=scheduler.test_startstopabort, args=(testid, True))
                    p.start()
171
172
            else:
                logger.debug("No tests found that need to be aborted.")
173
174
            
            # Check for tests that are stuck ---
175
176
            sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
                     WHERE `test_status` IN ('preparing', 'aborting', 'syncing', 'synced')
177
                     AND TIMESTAMPDIFF(MINUTE, `time_end_wish`, '%s') > %d
178
                  """
179
180
            if cur.execute(sql % (now, maxtestcleanuptime)) <= 0:
                logger.debug("No stuck tests found.")
181
182
183
184
185
186
187
188
189
190
            else:
                rs = cur.fetchall()
                testids = []
                for testid in rs:
                    testids.append(str(testid[0]))
                # set test status to failed
                sql = "UPDATE `tbl_serv_tests` SET `test_status`='failed' WHERE `serv_tests_key` IN (%s)" % (", ".join(testids))
                cur.execute(sql)
                cn.commit()
                msg = "Found %d stuck tests in the database (IDs: %s). Test status set to 'failed'." % (len(rs), ", ".join(testids))
191
                logger.info(msg)
192
193
194
195
                emails = flocklab.get_admin_emails(cur)
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
            
196
            # Check for stuck threads
Reto Da Forno's avatar
Reto Da Forno committed
197
            cmd = ["ps", "-U", "flocklab", "-o", "pid:5=,cmd:100=,etime="]
198
199
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
            out, err = p.communicate()
Reto Da Forno's avatar
Reto Da Forno committed
200
            lines = out.split("\n")
201
202
            pids = []
            for line in lines:
Reto Da Forno's avatar
Reto Da Forno committed
203
204
205
206
207
208
                if len(line) > 0:
                    try:
                        pid = int(line[0:6].strip())
                        command = line[6:106].strip()
                        runtime = line[106:].strip()
                    except:
209
                        logger.warn("Failed to parse output of 'ps'. Line was: '%s'" % line)
Reto Da Forno's avatar
Reto Da Forno committed
210
                        break
211
212
213
214
215
216
217
218
219
                    if "testid=" in command:
                        testid = int(command.split('testid=', 1)[1].split()[0])
                        # check stop time of this test
                        sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
                                  WHERE `serv_tests_key`=%d AND TIMESTAMPDIFF(MINUTE, `time_end_wish`, '%s') > %d
                              """
                        if cur.execute(sql % (testid, now, maxtestcleanuptime)) > 0:
                            # thread is stuck -> add to kill list
                            pids.append(pid)
220
221
222
223
            if len(pids) > 0:
                # kill the stuck threads
                for pid in pids:
                    os.kill(pid, signal.SIGKILL)
224
                msg = "%d stuck threads terminated (PIDs: %s)" % (len(pids), ", ".join(pids))
225
                logger.info(msg)
226
227
228
                emails = flocklab.get_admin_emails(cur)
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
229
            else:
230
                logger.debug("No stuck threads found.")
231
            
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
            # Check for offline observers and mark them accordingly in the database
            sql = """SELECT `observer_id`, `ethernet_address`, `status` FROM `tbl_serv_observer`
                     WHERE `status` = 'offline' OR `status` = 'online'
                  """
            cur.execute(sql)
            rs = cur.fetchall()
            if rs:
                for obs in rs:
                    cmd = ["timeout", "1", "ping", "-c", "1", obs[1]]
                    logger.debug("pinging observer fl-%02d with command %s" % (int(obs[0]), " ".join(cmd)))
                    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    ret = p.wait()
                    if ret != 0:
                        logger.error("Observer %d (%s) appears to be offline." % (int(obs[0]), obs[1]))
                        if obs[2] == 'online':
                            cur.execute("UPDATE `tbl_serv_observer` SET status='offline' WHERE observer_id=%d" % int(obs[0]))
                            cn.commit()
                            logger.info("Observer %d (%s) marked as 'offline' in the database." % (int(obs[0]), obs[1]))
                    else:
                        logger.debug("Observer %d (%s) is online." % (int(obs[0]), obs[1]))
                        if obs[2] == 'offline':
                            cur.execute("UPDATE `tbl_serv_observer` SET status='online' WHERE observer_id=%d" % int(obs[0]))
                            cn.commit()
                            logger.info("Observer %d (%s) marked as 'online' in the database." % (int(obs[0]), obs[1]))
Reto Da Forno's avatar
Reto Da Forno committed
256
        except:
257
            msg = "Encountered error: %s: %s\n%s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc())
Reto Da Forno's avatar
Reto Da Forno committed
258
            logger.error(msg)
259
            emails = flocklab.get_admin_emails(cur)
Reto Da Forno's avatar
Reto Da Forno committed
260
            msg = "%s on server %s encountered error:\n\n%s" % (__file__, os.uname()[1], msg)
261
            flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
262
263
264
265
        finally:
            cur.close()
            cn.close()
    
266
    sys.exit(flocklab.SUCCESS)
267
268
269
### END main()

if __name__ == "__main__":
Reto Da Forno's avatar
Reto Da Forno committed
270
271
272
273
    try:
        main(sys.argv[1:])
    except Exception:
        msg = "Encountered error: %s: %s\n%s\nCommand line was: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc(), str(sys.argv))
Reto Da Forno's avatar
Reto Da Forno committed
274
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
275