flocklab_cleaner.py 14.5 KB
Newer Older
1
2
#! /usr/bin/env python3

3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
"""
Copyright (c) 2010 - 2020, ETH Zurich, Computer Engineering Group
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

"""

35
import sys, os, getopt, errno, traceback, logging, time, __main__, shutil, glob, datetime, subprocess, signal, multiprocessing
36
import lib.flocklab as flocklab
37
import flocklab_scheduler as scheduler
38
39
40
41
42
43
44
45


##############################################################################
#
# Usage
#
##############################################################################
def usage():
Reto Da Forno's avatar
Reto Da Forno committed
46
    print("Usage: %s [--debug] [--help]" % __file__)
Reto Da Forno's avatar
Reto Da Forno committed
47
48
49
    print("Options:")
    print("  --debug\t\t\tOptional. Print debug messages to log.")
    print("  --help\t\t\tOptional. Print this help.")
50
51
52
53
54
55
56
57
58
59
### END usage()


##############################################################################
#
# Main
#
##############################################################################
def main(argv):

Reto Da Forno's avatar
Reto Da Forno committed
60
    # Get logger:
Reto Da Forno's avatar
Reto Da Forno committed
61
    logger = flocklab.get_logger()
Reto Da Forno's avatar
Reto Da Forno committed
62
63
    
    # Get config ---
64
    flocklab.load_config()
Reto Da Forno's avatar
Reto Da Forno committed
65
66
67
68
69
70
    
    # Get the arguments:
    try:
        opts, args = getopt.getopt(argv, "dh", ["debug", "help"])
    except getopt.GetoptError as err:
        print(str(err))
71
        logger.warning(str(err))
Reto Da Forno's avatar
Reto Da Forno committed
72
73
74
75
        usage()
        sys.exit(errno.EINVAL)
    except:
        msg = "Error when getting arguments: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1]))
Reto Da Forno's avatar
Reto Da Forno committed
76
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
77
78
79
80
81
82

    for opt, arg in opts:
        if opt in ("-d", "--debug"):
            logger.setLevel(logging.DEBUG)
        elif opt in ("-h", "--help"):
            usage()
83
            sys.exit(flocklab.SUCCESS)
Reto Da Forno's avatar
Reto Da Forno committed
84
        else:
85
            logger.warning("Wrong API usage")
Reto Da Forno's avatar
Reto Da Forno committed
86
87
88
            sys.exit(errno.EINVAL)
    
    # Allow only x instances ---
Reto Da Forno's avatar
Reto Da Forno committed
89
    rs = flocklab.count_running_instances(__file__)
Reto Da Forno's avatar
Reto Da Forno committed
90
    if (rs >= 0):
Reto Da Forno's avatar
Reto Da Forno committed
91
        maxinscount = flocklab.config.getint('cleaner', 'max_instances')
Reto Da Forno's avatar
Reto Da Forno committed
92
        if rs > maxinscount:
Reto Da Forno's avatar
Reto Da Forno committed
93
94
            msg = "Maximum number of instances (%d) for script %s with currently %d instances running exceeded. Aborting..." % (maxinscount, __file__, rs)
            flocklab.error_logandexit(msg, errno.EUSERS)
Reto Da Forno's avatar
Reto Da Forno committed
95
    else:
Reto Da Forno's avatar
Reto Da Forno committed
96
97
        msg = "Error when trying to count running instances of %s. Function returned with %d" % (__file__, rs)
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
98
99
100
    
    # Connect to the database ---
    try:
Reto Da Forno's avatar
Reto Da Forno committed
101
        (cn, cur) = flocklab.connect_to_db()
Reto Da Forno's avatar
Reto Da Forno committed
102
103
    except:
        msg = "Could not connect to database"
Reto Da Forno's avatar
Reto Da Forno committed
104
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
105
106
107
108
109
110
111
112
113
114
    
    # Check for running tests ---
    testisrunning = flocklab.is_test_running(cur)
    
    # Check for work ---
    if testisrunning:
        logger.debug("A test is running, thus exiting...")
    else:
        try:
            # Check for tests to delete ---
115
116
117
118
            sql = """SELECT `serv_tests_key`, `time_start_wish`
                     FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'todelete')
                  """
Reto Da Forno's avatar
Reto Da Forno committed
119
            if ( cur.execute(sql) <= 0 ):
120
                logger.debug("No tests found which are marked to be deleted.")
Reto Da Forno's avatar
Reto Da Forno committed
121
122
123
124
            else:
                rs = cur.fetchall()
                for (testid, starttime) in rs:
                    testid = str(testid)
125
                    logger.debug("Found test ID %s to delete." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
126
127
128
                    # If a test is to be deleted which has not run yet, delete it completely. Otherwise, keep the metadata of the test for statistics:
                    if (starttime > datetime.datetime.today()):
                        delete_all = True
129
                        logger.debug("Test ID %s did not run yet, thus all data (including the test metadata) will be deleted." % testid)
Reto Da Forno's avatar
Reto Da Forno committed
130
131
132
                    else:
                        delete_all = False 
                    # Clean through all relevant tables ---
Reto Da Forno's avatar
Reto Da Forno committed
133
                    relevant_tables = []
Reto Da Forno's avatar
Reto Da Forno committed
134
135
136
                    if delete_all:
                        relevant_tables.append('tbl_serv_map_test_observer_targetimages')
                    for table in relevant_tables:
137
138
139
                        sql = """DELETE FROM %s
                                 WHERE (`test_fk` = %s)
                              """
Reto Da Forno's avatar
Reto Da Forno committed
140
141
142
                        starttime = time.time()
                        num_deleted_rows = cur.execute(sql%(table, testid))
                        cn.commit()
143
                        logger.debug("Deleted %i rows of data in table %s for test ID %s in %f seconds" % (num_deleted_rows, table, testid, (time.time()-starttime)))
Reto Da Forno's avatar
Reto Da Forno committed
144
145
                    
                    # Delete cached test results ---
146
                    archive_path = "%s/%s%s" % (flocklab.config.get('archiver','archive_dir'), testid, flocklab.config.get('archiver','archive_ext'))
Reto Da Forno's avatar
Reto Da Forno committed
147
148
149
150
151
152
153
                    pathes = [archive_path]
                    for path in pathes:
                        if os.path.exists(path):
                            if os.path.isfile(path):
                                os.remove(path)
                            else:
                                shutil.rmtree(path)
Reto Da Forno's avatar
Reto Da Forno committed
154
155
                            logger.debug("Removed path %s for test %s." % (path, testid))
                    
Reto Da Forno's avatar
Reto Da Forno committed
156
157
158
                    # Delete test itself ---
                    if delete_all:
                        # Delete test itself:
159
160
161
                        sql =    """DELETE FROM `tbl_serv_tests` 
                                    WHERE (`serv_tests_key` = %s)
                                 """
Reto Da Forno's avatar
Reto Da Forno committed
162
                        starttime = time.time()
Reto Da Forno's avatar
Reto Da Forno committed
163
                        num_deleted_rows = cur.execute(sql % (testid))
Reto Da Forno's avatar
Reto Da Forno committed
164
                        cn.commit()
Reto Da Forno's avatar
Reto Da Forno committed
165
                        logger.debug("Deleted %i rows of data in table tbl_serv_tests for test ID %s in %f seconds" % (num_deleted_rows, testid, (time.time()-starttime)))
Reto Da Forno's avatar
Reto Da Forno committed
166
167
168
169
                    else:
                        # Set test status to deleted but keep metadata ---
                        flocklab.set_test_status(cur, cn, int(testid), "deleted")
                        logger.debug("Set status for test ID %s to 'deleted'" %(testid))
Reto Da Forno's avatar
Reto Da Forno committed
170
            
Reto Da Forno's avatar
Reto Da Forno committed
171
            # Delete old entries in viz cache ---
Reto Da Forno's avatar
Reto Da Forno committed
172
            keeptime = flocklab.config.getint('cleaner', 'keeptime_viz')
Reto Da Forno's avatar
Reto Da Forno committed
173
174
175
176
177
            earliest_keeptime = time.time() - (keeptime * 86400)
            vizdir = flocklab.config.get('viz','dir')
            if os.path.isdir(vizdir):
                for f in os.listdir(vizdir):
                    path = os.path.join(vizdir, f)
178
                    if os.stat(path).st_mtime < earliest_keeptime:
Reto Da Forno's avatar
Reto Da Forno committed
179
                        logger.debug("Removing plots %s..." % path)
Reto Da Forno's avatar
Reto Da Forno committed
180
                        os.remove(path)
181
            else:
Reto Da Forno's avatar
Reto Da Forno committed
182
183
                logger.debug("Directory '%s' does not exist." % vizdir)
            
184
185
186
187
188
            # Get parameters ---
            now = time.strftime(flocklab.config.get("database", "timeformat"), time.gmtime())
            maxtestcleanuptime = flocklab.config.getint('cleaner', 'max_test_cleanuptime')
            
            # Check for tests that are stuck ---
189
            sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
190
191
                     WHERE ((`test_status` IN ('preparing', 'aborting', 'syncing', 'synced')) OR (`test_status` = 'running' AND `dispatched` = 1))
                     AND (TIMESTAMPDIFF(MINUTE, `time_end_wish`, '%s') > %d)
192
                  """
193
194
            if cur.execute(sql % (now, maxtestcleanuptime)) <= 0:
                logger.debug("No stuck tests found.")
195
196
197
198
199
200
201
202
203
204
            else:
                rs = cur.fetchall()
                testids = []
                for testid in rs:
                    testids.append(str(testid[0]))
                # set test status to failed
                sql = "UPDATE `tbl_serv_tests` SET `test_status`='failed' WHERE `serv_tests_key` IN (%s)" % (", ".join(testids))
                cur.execute(sql)
                cn.commit()
                msg = "Found %d stuck tests in the database (IDs: %s). Test status set to 'failed'." % (len(rs), ", ".join(testids))
205
                logger.info(msg)
206
                emails = flocklab.get_admin_emails()
207
208
209
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
            
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
            # Check for tests that are still running, but should have been stopped (NOTE: needs to be AFTER the checking for stuck tests!) ---
            sql = """SELECT `serv_tests_key`, `test_status` FROM `tbl_serv_tests` 
                     WHERE (`test_status` = 'running') AND (`time_end_wish` <= '%s') AND (`dispatched` = 0)
                  """
            cur.execute(sql % (now))
            rs = cur.fetchall()
            if rs:
                # start process for each test which has to be stopped
                for test in rs:
                    testid = int(test[0])
                    logger.debug("Call process to stop test %d (status: %s)." %  (testid, test[1]))
                    p = multiprocessing.Process(target=scheduler.test_startstopabort, args=(testid, True))
                    p.start()
            else:
                logger.debug("No tests found that need to be aborted.")
            
226
            # Check for stuck threads
Reto Da Forno's avatar
Reto Da Forno committed
227
            cmd = ["ps", "-U", "flocklab", "-o", "pid:5=,cmd:100=,etime="]
228
229
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, universal_newlines=True)
            out, err = p.communicate()
Reto Da Forno's avatar
Reto Da Forno committed
230
            lines = out.split("\n")
231
232
            pids = []
            for line in lines:
Reto Da Forno's avatar
Reto Da Forno committed
233
234
235
236
237
238
                if len(line) > 0:
                    try:
                        pid = int(line[0:6].strip())
                        command = line[6:106].strip()
                        runtime = line[106:].strip()
                    except:
239
                        logger.warning("Failed to parse output of 'ps'. Line was: '%s'" % line)
Reto Da Forno's avatar
Reto Da Forno committed
240
                        break
241
242
243
244
245
246
247
                    if "testid=" in command:
                        testid = int(command.split('testid=', 1)[1].split()[0])
                        # check stop time of this test
                        sql = """SELECT `serv_tests_key` FROM `tbl_serv_tests`
                                  WHERE `serv_tests_key`=%d AND TIMESTAMPDIFF(MINUTE, `time_end_wish`, '%s') > %d
                              """
                        if cur.execute(sql % (testid, now, maxtestcleanuptime)) > 0:
248
249
250
                            # thread is stuck -> add to list and kill
                            pids.append(str(pid))
                            os.kill(pid, signal.SIGKILL)
251
            if len(pids) > 0:
252
                msg = "%d stuck threads terminated (PIDs: %s)" % (len(pids), ", ".join(pids))
253
                logger.info(msg)
254
                emails = flocklab.get_admin_emails()
255
256
                if emails != flocklab.FAILED:
                    flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
257
            else:
258
                logger.debug("No stuck threads found.")
259
            
260
261
262
263
264
265
266
267
268
            # Check for offline observers and mark them accordingly in the database
            sql = """SELECT `observer_id`, `ethernet_address`, `status` FROM `tbl_serv_observer`
                     WHERE `status` = 'offline' OR `status` = 'online'
                  """
            cur.execute(sql)
            rs = cur.fetchall()
            if rs:
                for obs in rs:
                    cmd = ["timeout", "1", "ping", "-c", "1", obs[1]]
Reto Da Forno's avatar
Reto Da Forno committed
269
                    #logger.debug("pinging observer fl-%02d with command %s" % (int(obs[0]), " ".join(cmd)))
270
271
272
273
274
275
276
277
278
                    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
                    ret = p.wait()
                    if ret != 0:
                        logger.error("Observer %d (%s) appears to be offline." % (int(obs[0]), obs[1]))
                        if obs[2] == 'online':
                            cur.execute("UPDATE `tbl_serv_observer` SET status='offline' WHERE observer_id=%d" % int(obs[0]))
                            cn.commit()
                            logger.info("Observer %d (%s) marked as 'offline' in the database." % (int(obs[0]), obs[1]))
                    else:
Reto Da Forno's avatar
Reto Da Forno committed
279
                        #logger.debug("Observer %d (%s) is online." % (int(obs[0]), obs[1]))
280
281
282
283
                        if obs[2] == 'offline':
                            cur.execute("UPDATE `tbl_serv_observer` SET status='online' WHERE observer_id=%d" % int(obs[0]))
                            cn.commit()
                            logger.info("Observer %d (%s) marked as 'online' in the database." % (int(obs[0]), obs[1]))
Reto Da Forno's avatar
Reto Da Forno committed
284
        except:
285
            msg = "Encountered error: %s: %s\n%s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc())
Reto Da Forno's avatar
Reto Da Forno committed
286
            logger.error(msg)
287
            emails = flocklab.get_admin_emails()
Reto Da Forno's avatar
Reto Da Forno committed
288
            msg = "%s on server %s encountered error:\n\n%s" % (__file__, os.uname()[1], msg)
289
            flocklab.send_mail(subject="[FlockLab Cleaner]", message=msg, recipients=emails)
Reto Da Forno's avatar
Reto Da Forno committed
290
291
292
293
        finally:
            cur.close()
            cn.close()
    
294
    sys.exit(flocklab.SUCCESS)
295
296
297
### END main()

if __name__ == "__main__":
Reto Da Forno's avatar
Reto Da Forno committed
298
299
300
301
    try:
        main(sys.argv[1:])
    except Exception:
        msg = "Encountered error: %s: %s\n%s\nCommand line was: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc(), str(sys.argv))
Reto Da Forno's avatar
Reto Da Forno committed
302
        flocklab.error_logandexit(msg, errno.EAGAIN)
Reto Da Forno's avatar
Reto Da Forno committed
303