flocklab_cleaner.py 11.5 KB
Newer Older
1
2
#! /usr/bin/env python3

3
import sys, os, getopt, errno, traceback, logging, time, __main__, shutil, glob, datetime, subprocess, signal
4
import lib.flocklab as flocklab
5
import flocklab_scheduler as scheduler
6
7
8
9
10
11
12
13


##############################################################################
#
# Usage
#
##############################################################################
def usage():
Reto Da Forno's avatar
Reto Da Forno committed
14
    print("Usage: %s [--debug] [--help]" % __file__)
Reto Da Forno's avatar
Reto Da Forno committed
15
16
17
    print("Options:")
    print("  --debug\t\t\tOptional. Print debug messages to log.")
    print("  --help\t\t\tOptional. Print this help.")
18
19
20
21
22
23
24
25
26
27
### END usage()


##############################################################################
#
# Main
#
##############################################################################
def main(argv):

Reto Da Forno's avatar
Reto Da Forno committed
28
    # Get logger:
Reto Da Forno's avatar
Reto Da Forno committed
29
    logger = flocklab.get_logger()
Reto Da Forno's avatar
Reto Da Forno committed
30
31
    
    # Get config ---
32
    flocklab.load_config()
Reto Da Forno's avatar
Reto Da Forno committed
33
34
35
36
37
38
39
40
41
42
43
    
    # Get the arguments:
    try:
        opts, args = getopt.getopt(argv, "dh", ["debug", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        logger.warn(str(err))
        usage()
        sys.exit(errno.EINVAL)
    except:
        msg = "Error when getting arguments: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1]))
Reto Da Forno's avatar
Reto Da Forno committed
44
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
45
46
47
48
49
50

    for opt, arg in opts:
        if opt in ("-d", "--debug"):
            logger.setLevel(logging.DEBUG)
        elif opt in ("-h", "--help"):
            usage()
51
            sys.exit(flocklab.SUCCESS)
Reto Da Forno's avatar
Reto Da Forno committed
52
53
54
55
56
        else:
            logger.warn("Wrong API usage")
            sys.exit(errno.EINVAL)
    
    # Allow only x instances ---
Reto Da Forno's avatar
Reto Da Forno committed
57
    rs = flocklab.count_running_instances(__file__)
Reto Da Forno's avatar
Reto Da Forno committed
58
    if (rs >= 0):
Reto Da Forno's avatar
Reto Da Forno committed
59
        maxinscount = flocklab.config.getint('cleaner', 'max_instances')
Reto Da Forno's avatar
Reto Da Forno committed
60
        if rs > maxinscount:
Reto Da Forno's avatar
Reto Da Forno committed
61
62
            msg = "Maximum number of instances (%d) for script %s with currently %d instances running exceeded. Aborting..." % (maxinscount, __file__, rs)
            flocklab.error_logandexit(msg, errno.EUSERS)
Reto Da Forno's avatar
Reto Da Forno committed
63
    else:
Reto Da Forno's avatar
Reto Da Forno committed
64
65
        msg = "Error when trying to count running instances of %s. Function returned with %d" % (__file__, rs)
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
66
67
68
    
    # Connect to the database ---
    try:
Reto Da Forno's avatar
Reto Da Forno committed
69
        (cn, cur) = flocklab.connect_to_db()
Reto Da Forno's avatar
Reto Da Forno committed
70
71
    except:
        msg = "Could not connect to database"
Reto Da Forno's avatar
Reto Da Forno committed
72
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
73
74
75
76
77
78
79
80
81
82
83
    #logger.debug("Connected to database")
    
    # Check for running tests ---
    testisrunning = flocklab.is_test_running(cur)
    
    # Check for work ---
    if testisrunning:
        logger.debug("A test is running, thus exiting...")
    else:
        try:
            # Check for tests to delete ---
84
85
86
87
            sql = """SELECT `serv_tests_key`, `time_start_wish`
                     FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'todelete')
                  """
Reto Da Forno's avatar
Reto Da Forno committed
88
89
90
91
92
93
            if ( cur.execute(sql) <= 0 ):
                logger.info("No tests found which are marked to be deleted.")
            else:
                rs = cur.fetchall()
                for (testid, starttime) in rs:
                    testid = str(testid)
94
                    logger.debug("Found test ID %s to delete." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
95
96
97
                    # If a test is to be deleted which has not run yet, delete it completely. Otherwise, keep the metadata of the test for statistics:
                    if (starttime > datetime.datetime.today()):
                        delete_all = True
98
                        logger.debug("Test ID %s did not run yet, thus all data (including the test metadata) will be deleted." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
99
100
101
102
103
104
105
                    else:
                        delete_all = False 
                    # Clean through all relevant tables ---
                    relevant_tables = ['tbl_serv_errorlog']
                    if delete_all:
                        relevant_tables.append('tbl_serv_map_test_observer_targetimages')
                    for table in relevant_tables:
106
107
108
                        sql = """DELETE FROM %s
                                 WHERE (`test_fk` = %s)
                              """
Reto Da Forno's avatar
Reto Da Forno committed
109
110
111
                        starttime = time.time()
                        num_deleted_rows = cur.execute(sql%(table, testid))
                        cn.commit()
112
                        logger.debug("Deleted %i rows of data in table %s for test ID %s in %f seconds" % (num_deleted_rows, table, testid, (time.time()-starttime)))
Reto Da Forno's avatar
Reto Da Forno committed
113
114
                    
                    # Delete cached test results ---
115
116
                    archive_path = "%s/%s%s" % (flocklab.config.get('archiver','archive_dir'), testid, flocklab.config.get('archiver','archive_ext'))
                    viz_pathes = glob.glob("%s/%s_*" % (flocklab.config.get('viz','imgdir'), testid))
Reto Da Forno's avatar
Reto Da Forno committed
117
118
119
120
121
122
123
124
125
126
127
128
129
                    pathes = [archive_path]
                    pathes.extend(viz_pathes)
                    for path in pathes:
                        if os.path.exists(path):
                            if os.path.isfile(path):
                                os.remove(path)
                            else:
                                shutil.rmtree(path)
                            logger.debug("Removed path %s for test %s."%(path, testid))
                            
                    # Delete test itself ---
                    if delete_all:
                        # Delete test itself:
130
131
132
                        sql =    """DELETE FROM `tbl_serv_tests` 
                                    WHERE (`serv_tests_key` = %s)
                                 """
Reto Da Forno's avatar
Reto Da Forno committed
133
134
135
136
137
138
139
140
141
142
                        starttime = time.time()
                        num_deleted_rows = cur.execute(sql%(testid))
                        cn.commit()
                        logger.debug("Deleted %i rows of data in table tbl_serv_tests for test ID %s in %f seconds" %(num_deleted_rows, testid, (time.time()-starttime)))
                    else:
                        # Set test status to deleted but keep metadata ---
                        flocklab.set_test_status(cur, cn, int(testid), "deleted")
                        logger.debug("Set status for test ID %s to 'deleted'" %(testid))
                    
            # Delete old entries in viz cache ---
Reto Da Forno's avatar
Reto Da Forno committed
143
            keeptime = flocklab.config.getint('cleaner', 'keeptime_viz')
Reto Da Forno's avatar
Reto Da Forno committed
144
            earliest_keeptime = time.time() - (keeptime*86400)
Reto Da Forno's avatar
Reto Da Forno committed
145
            imgdir_path = flocklab.config.get('viz','imgdir')
Reto Da Forno's avatar
Reto Da Forno committed
146
147
148
149
150
151
152
153
            if not os.path.isdir(imgdir_path):
                os.mkdir(imgdir_path)
            for f in os.listdir(imgdir_path):
                path = os.path.join(imgdir_path, f)
                if os.stat(path).st_mtime < earliest_keeptime:
                    logger.debug("Removing viz cache %s..."%path)
                    shutil.rmtree(path)
            
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
            # Get parameters ---
            now = time.strftime(flocklab.config.get("database", "timeformat"), time.gmtime())
            maxtestcleanuptime = flocklab.config.getint('cleaner', 'max_test_cleanuptime')
            
            # Check for tests that are still running, but should have been stopped ---
            sql = """SELECT `serv_tests_key`, `test_status` FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'running') AND (`time_end_wish` <= '%s') AND (`dispatched` = 0)
                  """
            cur.execute(sql % (now))
            rs = cur.fetchall()
            if rs:
                # start process for each test which has to be stopped
                for test in rs:
                    testid = int(test[0])
                    logger.debug("Call process to stop test %d (status: %s)." %  (testid, test[1]))
                    p = multiprocessing.Process(target=scheduler.test_startstopabort, args=(testid, True))
                    p.start()
            
            # Check for tests that are stuck ---
173
174
            sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
                     WHERE `test_status` IN ('preparing', 'aborting', 'syncing', 'synced')
175
                     AND TIMESTAMPDIFF(MINUTE, `time_end_wish`, NOW()) > %d
176
                  """
177
            if cur.execute(sql % (maxtestcleanuptime)) <= 0:
Reto Da Forno's avatar
Reto Da Forno committed
178
                logger.info("No stuck tests found.")
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
            else:
                rs = cur.fetchall()
                testids = []
                for testid in rs:
                    testids.append(str(testid[0]))
                # set test status to failed
                sql = "UPDATE `tbl_serv_tests` SET `test_status`='failed' WHERE `serv_tests_key` IN (%s)" % (", ".join(testids))
                cur.execute(sql)
                cn.commit()
                msg = "Found %d stuck tests in the database (IDs: %s). Test status set to 'failed'." % (len(rs), ", ".join(testids))
                logger.debug(msg)
                emails = flocklab.get_admin_emails(cur)
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
            
194
            # Check for stuck threads
Reto Da Forno's avatar
Reto Da Forno committed
195
            cmd = ["ps", "-U", "flocklab", "-o", "pid:5=,cmd:100=,etime="]
196
197
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
            out, err = p.communicate()
Reto Da Forno's avatar
Reto Da Forno committed
198
            lines = out.split("\n")
199
200
            pids = []
            for line in lines:
Reto Da Forno's avatar
Reto Da Forno committed
201
202
203
204
205
                if len(line) > 0:
                    try:
                        pid = int(line[0:6].strip())
                        command = line[6:106].strip()
                        runtime = line[106:].strip()
206
207
208
209
210
211
212
213
214
                        if "testid=" in command:
                            testid = int(command.split('testid=', 1)[1])
                            # check stop time of this test
                            sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
                                     WHERE `serv_tests_key`=%d AND TIMESTAMPDIFF(MINUTE, `time_end_wish`, NOW()) > %d
                                  """
                            if cur.execute(sql % (testid, maxtestcleanuptime)) > 0:
                                # thread is stuck -> add to kill list
                                pids.append(pid)
Reto Da Forno's avatar
Reto Da Forno committed
215
216
217
                    except:
                        logger.debug("Failed to parse output of 'ps'. Line was: '%s'" % line)
                        break
218
219
220
221
            if len(pids) > 0:
                # kill the stuck threads
                for pid in pids:
                    os.kill(pid, signal.SIGKILL)
222
                msg = "%d stuck threads terminated (PIDs: %s)" % (len(pids), ", ".join(pids))
223
224
225
226
                logger.debug(msg)
                emails = flocklab.get_admin_emails(cur)
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
227
228
            else:
                logger.info("No stuck threads found.")
229
            
Reto Da Forno's avatar
Reto Da Forno committed
230
        except:
231
            msg = "Encountered error: %s: %s\n%s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc())
Reto Da Forno's avatar
Reto Da Forno committed
232
            logger.error(msg)
233
            emails = flocklab.get_admin_emails(cur)
Reto Da Forno's avatar
Reto Da Forno committed
234
            msg = "%s on server %s encountered error:\n\n%s" % (__file__, os.uname()[1], msg)
235
            flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
236
237
238
239
        finally:
            cur.close()
            cn.close()
    
240
    sys.exit(flocklab.SUCCESS)
241
242
243
### END main()

if __name__ == "__main__":
Reto Da Forno's avatar
Reto Da Forno committed
244
245
246
247
    try:
        main(sys.argv[1:])
    except Exception:
        msg = "Encountered error: %s: %s\n%s\nCommand line was: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc(), str(sys.argv))
Reto Da Forno's avatar
Reto Da Forno committed
248
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
249