Commit a7f73068 authored by Reto Da Forno's avatar Reto Da Forno
Browse files

cleanup and error logging simplified

parent a15e7727
......@@ -35,19 +35,12 @@ def main(argv):
send_email = False
testid = -1
# Set timezone to UTC ---
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger ---
logger = flocklab.get_logger()
# Get config ---
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
#logger.debug("Read configuration file.")
flocklab.load_config()
# Get arguments ---
try:
opts, args = getopt.getopt(argv, "ehdt:", ["email", "help", "debug", "testid=" ])
......
......@@ -30,18 +30,11 @@ def main(argv):
### Global Variables ###
global logger
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
# Get config ---
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
#logger.debug("Read configuration file.")
flocklab.load_config()
# Get the arguments:
try:
......@@ -93,10 +86,10 @@ def main(argv):
else:
try:
# Check for tests to delete ---
sql = """ SELECT `serv_tests_key`, `time_start_wish`
FROM `tbl_serv_tests`
WHERE (`test_status` = 'todelete')
"""
sql = """SELECT `serv_tests_key`, `time_start_wish`
FROM `tbl_serv_tests`
WHERE (`test_status` = 'todelete')
"""
#logger.info("Looking in DB for tests which are marked to be deleted...")
if ( cur.execute(sql) <= 0 ):
logger.info("No tests found which are marked to be deleted.")
......
This diff is collapsed.
#! /usr/bin/env python3
import os, sys, getopt, traceback, MySQLdb, signal, random, time, errno, multiprocessing, subprocess, re, logging, __main__, threading, struct, types, queue, math, shutil, lxml
import os, sys, getopt, traceback, MySQLdb, signal, random, time, errno, multiprocessing, subprocess, re, logging, __main__, threading, struct, types, queue, math, shutil, lxml.etree
import lib.daemon as daemon
import lib.flocklab as flocklab
......@@ -231,28 +231,23 @@ def worker_convert_and_aggregate(queueitem=None, nodeid=None, resultfile_path=No
msg = "%s: Packet size (%i) did not match payload size (%i) @ %d." %(obsdbfile_path, err.expectedSize, err.actualSize, err.fpos)
_errors.append((msg, errno.EIO, obsid))
logqueue.put_nowait((loggername, logging.ERROR, msg))
except:
raise
try:
if (len(conv_values) > 0):
# There is still data left. Do a last commit:
if (viz_f != None) and (len(viz_values) > 0):
#logqueue.put_nowait((loggername, logging.DEBUG, "Viz started..."))
viz_f(testid, owner_fk, viz_values, obsid, vizimgdir, logger)
#logqueue.put_nowait((loggername, logging.DEBUG, "Viz done."))
# Write data to file:
#logqueue.put_nowait((loggername, logging.DEBUG, "Opening file %s for final writing..."%(resultfile_path)))
resultfile_lock.acquire()
f = open(resultfile_path, 'a')
f.writelines(conv_values)
f.close()
resultfile_lock.release()
logqueue.put_nowait((loggername, logging.DEBUG, "Committed final results to %s after %d rows"%(resultfile_path, rows)))
# Remove processed file:
#logqueue.put_nowait((loggername, logging.DEBUG, "Remove %s"%(obsdbfile_path)))
os.unlink(obsdbfile_path)
except:
raise
if (len(conv_values) > 0):
# There is still data left. Do a last commit:
if (viz_f != None) and (len(viz_values) > 0):
#logqueue.put_nowait((loggername, logging.DEBUG, "Viz started..."))
viz_f(testid, owner_fk, viz_values, obsid, vizimgdir, logger)
#logqueue.put_nowait((loggername, logging.DEBUG, "Viz done."))
# Write data to file:
#logqueue.put_nowait((loggername, logging.DEBUG, "Opening file %s for final writing..."%(resultfile_path)))
resultfile_lock.acquire()
f = open(resultfile_path, 'a')
f.writelines(conv_values)
f.close()
resultfile_lock.release()
logqueue.put_nowait((loggername, logging.DEBUG, "Committed final results to %s after %d rows"%(resultfile_path, rows)))
# Remove processed file:
#logqueue.put_nowait((loggername, logging.DEBUG, "Remove %s"%(obsdbfile_path)))
os.unlink(obsdbfile_path)
except:
msg = "Error in worker process: %s: %s\n%s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc())
_errors.append((msg, errno.ECOMM, obsid))
......@@ -405,16 +400,16 @@ class FetchObsThread(threading.Thread):
def __init__(self, obsid, obsethernet, dirname, debugdirname, config, logger, workQueue, stopEvent):
threading.Thread.__init__(self)
self._obsid = obsid
self._obsethernet = obsethernet
self._obsfiledir = dirname
self._obsfiledebugdir = debugdirname
self._obsethernet = obsethernet
self._obsfiledir = dirname
self._obsfiledebugdir = debugdirname
self._workQueue = workQueue
self._stopEvent = stopEvent
self._logger = logger
self._logger = logger
self._min_sleep = flocklab.config.getint("fetcher", "min_sleeptime")
self._max_randsleep = flocklab.config.getint("fetcher", "max_rand_sleeptime")
self._obsdbfolder = "%s/%d" % (flocklab.config.get("observer", "obsdbfolder"), testid)
self._obstestresfolder = "%s/%d" % (flocklab.config.get("observer", "testresultfolder"), testid)
def run(self):
try:
......@@ -436,7 +431,7 @@ class FetchObsThread(threading.Thread):
removelast = False
#self._logger.debug(self._loggerprefix + "Woke up")
# Get list of available files
cmd = ['ssh' ,'%s'%(self._obsethernet), "ls %s/" % self._obsdbfolder]
cmd = ['ssh' ,'%s'%(self._obsethernet), "ls %s/" % self._obstestresfolder]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True) # universal_newlines makes sure that a string is returned instead of a byte object
out, err = p.communicate(None)
rs = p.returncode
......@@ -450,7 +445,7 @@ class FetchObsThread(threading.Thread):
# Check name and append to corresponding list
for service in services.values():
if service.matchFileName(dbfile):
service.addFile("%s/%s" % (self._obsdbfolder, dbfile))
service.addFile("%s/%s" % (self._obstestresfolder, dbfile))
break
copyfilelist = []
# Remove latest from each list as the observer might still be writing into it (unless stop event has been set).
......@@ -492,7 +487,7 @@ class FetchObsThread(threading.Thread):
#DEBUG self._logger.debug(self._loggerprefix + "Put all files onto queue.")
# Remove remote files if any are left:
if (len(copyfilelist) > 0):
cmd = ['ssh' ,'%s'%(self._obsethernet), "cd %s;"%self._obsdbfolder, "rm"]
cmd = ['ssh' ,'%s'%(self._obsethernet), "cd %s;"%self._obstestresfolder, "rm"]
cmd.extend(copyfilelist)
self._logger.debug(self._loggerprefix + "Removing files on observer...")
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
......@@ -506,7 +501,7 @@ class FetchObsThread(threading.Thread):
self._logger.debug(self._loggerprefix + "No files to download from observer.")
if removelast == False: # this is the last execution of the while loop
cmd = ['ssh' ,'%s'%(self._obsethernet), "rm -rf %s" % self._obsdbfolder]
cmd = ['ssh' ,'%s'%(self._obsethernet), "rm -rf %s" % self._obstestresfolder]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
out, err = p.communicate(None)
rs = p.wait()
......@@ -554,8 +549,7 @@ def start_fetcher():
try:
(cn, cur) = flocklab.connect_to_db()
except:
msg = "Could not connect to database"
flocklab.error_logandexit(msg, errno.EAGAIN)
flocklab.error_logandexit("Could not connect to database", errno.EAGAIN)
try:
cur.execute(""" SELECT `a`.observer_id, `a`.ethernet_address
FROM `tbl_serv_observer` AS `a`
......@@ -623,7 +617,7 @@ def stop_fetcher():
if (pid > 0):
# Do not stop this instance if it is the only one running:
if (pid == os.getpid()):
raise
return flocklab.FAILED
logger.debug("Sending SIGTERM signal to process %d" %pid)
try:
os.kill(pid, signal.SIGTERM)
......@@ -638,7 +632,7 @@ def stop_fetcher():
except:
pass
else:
raise
return flocklab.FAILED
except (ValueError):
logger.debug("Fetcher daemon was not running, thus it cannot be stopped.")
# Set DB status in order to allow dispatcher and scheduler to go on.:
......@@ -747,21 +741,14 @@ def main(argv):
stop = False
# Set timezone to UTC ---
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
# Get the config file ---
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
#logger.debug("Read configuration file.")
flocklab.load_config()
# Get command line parameters ---
try:
try:
opts, args = getopt.getopt(argv, "hedt:", ["help", "stop", "debug", "testid="])
except getopt.GetoptError as err:
print(str(err))
......
......@@ -32,10 +32,6 @@ def main(argv):
adapterid = None
platform = None
ret = flocklab.SUCCESS
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get command line parameters.
try:
......@@ -69,9 +65,7 @@ def main(argv):
sys.exit(errno.EINVAL)
# Get the config file:
if flocklab.load_config() != flocklab.SUCCESS:
print("Could not read configuration file.")
sys.exit(errno.EAGAIN)
flocklab.load_config()
# Check if a test is preparing, running or cleaning up. If yes, exit program.
try:
......
......@@ -30,18 +30,11 @@ def main(argv):
### Global Variables ###
global logger
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
# Get config ---
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
#logger.debug("Read configuration file.")
flocklab.load_config()
# Get the arguments:
try:
......
......@@ -30,21 +30,20 @@ def test_startstopabort(testid=None, mode='stop',delay=0):
# change status of test that the next scheduler will skip this test
try:
conn = MySQLdb.connect(host=flocklab.config.get('database','host'), user=flocklab.config.get('database','user'), passwd=flocklab.config.get('database','password'), db=flocklab.config.get('database','database'))
cursor = conn.cursor()
(conn, cursor) = flocklab.connect_to_db()
except:
logger.error("Could not connect to the database because: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1])))
logger.error("Could not connect to the database.")
flocklab.set_test_dispatched(cursor, conn, testid)
# wait for the actual start time of the test
time.sleep(delay)
logger.info("Found test ID %d which should be %sed."%(testid, mode))
logger.info("Found test ID %d which should be %sed." % (testid, mode))
# Add testid to logger name
logger.name += " (Test %d)"%testid
# Call the dispatcher:
cmd = [flocklab.config.get("dispatcher", "dispatcherscript"), '--testid=%d'%testid, '--%s'%mode]
cmd = [flocklab.config.get("dispatcher", "dispatcherscript"), '--testid=%d' % testid, '--%s' % mode]
# Make sure no other instance of the scheduler is running for the same task:
cmd2 = ['pgrep', '-o', '-f', ' '.join(cmd)]
p = subprocess.Popen(cmd2, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
......@@ -60,7 +59,7 @@ def test_startstopabort(testid=None, mode='stop',delay=0):
p.wait()
rs = p.returncode
if (rs != flocklab.SUCCESS):
logger.error("Dispatcher to %s test returned with error %d" %(mode, rs))
logger.error("Dispatcher to %s test returned with error %d" % (mode, rs))
logger.debug("Command executed was: %s"%(str(cmd)))
conn.close()
return errno.EFAULT
......@@ -95,19 +94,12 @@ def main(argv):
### Global Variables ###
global logger
global debug
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
if not logger:
flocklab.error_logandexit("Failed to init logger.")
# Get the config file:
if flocklab.load_config() != flocklab.SUCCESS:
flocklab.error_logandexit("Could not read configuration file.", errno.EAGAIN)
flocklab.load_config()
# Get the arguments:
try:
......
#! /usr/bin/env python3
import os, sys, getopt, traceback, MySQLdb, signal, time, errno, subprocess, logging, __main__, multiprocessing, queue, threading, select, socket, io, lxml
import os, sys, getopt, traceback, MySQLdb, signal, time, errno, subprocess, logging, __main__, multiprocessing, queue, threading, select, socket, io, lxml.etree
import lib.daemon as daemon
import lib.flocklab as flocklab
......@@ -89,6 +89,7 @@ def obs_connect_process(conreqQueue, condoneQueue, _stopevent):
worklist.remove(w)
except ConnectionRefusedError:
logger.info("Could not connect to observer %s on port %d, will retry later.." % (w[0],w[1]))
time.sleep(2)
except Exception:
logger.info("Could not connect to observer %s on port %d: %s, %s\n%s" % (w[0], w[1], str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc()))
pass
......@@ -109,6 +110,7 @@ def update_configuration_from_db():
proxystartport = flocklab.config.getint('serialproxy', 'startport')
obsdataport = flocklab.config.getint('serialproxy', 'obsdataport')
proxyConfig = []
logger.debug("Updating configuration from DB...")
try:
(cn, cur) = flocklab.connect_to_db()
cur.execute('SET time_zone="+0:00"')
......@@ -393,7 +395,7 @@ def start_proxy():
# Catch kill signals ---
signal.signal(signal.SIGTERM, sigterm_handler)
signal.signal(signal.SIGINT, sigterm_handler)
logger.info("Daemon started")
logger.info("Serial proxy daemon started.")
proxy = ProxyConnections()
proxy.reloadConfiguration(proxyConfig)
proxy.run()
......@@ -407,40 +409,36 @@ def start_proxy():
##############################################################################
def sig_proxy(signum):
# Get oldest running instance of the proxy for the selected test ID which is the main process and send it the terminate signal:
try:
searchterm = "%s" % __file__
cmd = ['pgrep', '-o', '-f', searchterm]
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
out, err = p.communicate()
if (p.returncode == 0):
pid = int(out)
# Do not stop this instance if it is the only one running:
if (pid == os.getpid()):
raise
else:
logger.warn("Command failed: %s" % (str(cmd)))
raise
# Signal the process to stop:
if (pid > 0):
logger.debug("Sending signal %d to process %d" %(signum, pid))
try:
os.kill(pid, signum)
if signum == signal.SIGTERM:
logger.debug("Waiting for process to finish...")
# wait for process to finish (timeout..)
shutdown_timeout = flocklab.config.getint("serialproxy", "shutdown_timeout")
pidpath = "/proc/%d"%pid
while os.path.exists(pidpath) & (shutdown_timeout>0):
time.sleep(1)
shutdown_timeout = shutdown_timeout - 1
if os.path.exists(pidpath):
logger.warn("Serial proxy is still running, sending it the SIGKILL signal...")
os.kill(pid, signal.SIGKILL)
except:
logger.warn("Failed to send SIGKILL: %s: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1])))
except (ValueError):
logger.debug("Serial proxy daemon was not running, thus it cannot be stopped.")
searchterm = "%s" % __file__
cmd = ['pgrep', '-o', '-f', searchterm]
p = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
out, err = p.communicate()
if (p.returncode == 0):
pid = int(out)
# Do not stop this instance if it is the only one running:
if (pid == os.getpid()):
return errno.ENOPKG
else:
logger.warn("Command failed: %s" % (str(cmd)))
return errno.ENOPKG
# Signal the process to stop:
if (pid > 0):
logger.debug("Sending signal %d to process %d" %(signum, pid))
try:
os.kill(pid, signum)
if signum == signal.SIGTERM:
logger.debug("Waiting for process to finish...")
# wait for process to finish (timeout..)
shutdown_timeout = flocklab.config.getint("serialproxy", "shutdown_timeout")
pidpath = "/proc/%d"%pid
while os.path.exists(pidpath) & (shutdown_timeout>0):
time.sleep(1)
shutdown_timeout = shutdown_timeout - 1
if os.path.exists(pidpath):
logger.warn("Serial proxy is still running, sending it the SIGKILL signal...")
os.kill(pid, signal.SIGKILL)
except:
logger.warn("Failed to send SIGKILL: %s: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1])))
return flocklab.SUCCESS
### END sig_proxy
......@@ -477,18 +475,12 @@ def main(argv):
start = False
notify = False
# Set timezone to UTC ---
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
logger = flocklab.get_logger(debug=debug)
# Get the config file ---
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
flocklab.load_config()
# Get command line parameters ---
try:
opts, args = getopt.getopt(argv, "hnsed", ["help", "notify", "start", "stop", "debug"])
......@@ -498,15 +490,14 @@ def main(argv):
usage()
sys.exit(errno.EINVAL)
except:
msg = "Error when getting arguments: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1]))
flocklab.error_logandexit(msg, errno.EAGAIN)
logger.error("Error when getting arguments: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1])), errno.EAGAIN)
sys.exit(errno.EINVAL)
for opt, arg in opts:
if opt in ("-h", "--help"):
usage()
sys.exit(flocklab.SUCCESS)
elif opt in ("-d", "--debug"):
debug = True
logger.debug("Detected debug flag.")
logger.setLevel(logging.DEBUG)
elif opt in ("-e", "--stop"):
stop = True
......@@ -523,16 +514,20 @@ def main(argv):
ret = flocklab.SUCCESS
if stop:
ret = sig_proxy(signal.SIGTERM)
logger.debug("SIGTERM sent to serial proxy daemon.")
elif notify:
ret = sig_proxy(signal.SIGINT)
if start or notify and ret == errno.ENOPKG:
logger.debug("SIGINT sent to serial proxy daemon.")
if start or (notify and ret == errno.ENOPKG):
# Start the proxy processes:
ret = flocklab.SUCCESS
try:
logger.debug("Starting serial proxy...")
start_proxy()
except Exception:
logger.info(traceback.format_exc())
raise
logger.error("Failed to start serial proxy daemon.\n%s" % traceback.format_exc())
ret = flocklab.FAILED
sys.exit(ret)
### END main()
......@@ -542,5 +537,4 @@ if __name__ == "__main__":
try:
main(sys.argv[1:])
except Exception:
msg = "Encountered error: %s: %s\n%s\nCommand line was: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc(), str(sys.argv))
flocklab.error_logandexit(msg, errno.EAGAIN)
flocklab.error_logandexit("Encountered error: %s: %s\n%s\nCommand line was: %s" % (str(sys.exc_info()[0]), str(sys.exc_info()[1]), traceback.format_exc(), str(sys.argv)))
......@@ -172,10 +172,6 @@ def main(argv):
observer = ""
status = "'online', 'internal', 'develop'"
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
......@@ -236,9 +232,7 @@ def main(argv):
sys.exit(errno.EINVAL)
# Get the config file:
if flocklab.load_config() != flocklab.SUCCESS:
logger.warn("Could not read configuration file. Exiting...")
sys.exit(errno.EAGAIN)
flocklab.load_config()
# Check if a test is preparing, running or cleaning up. If yes, exit program.
try:
......
This diff is collapsed.
......@@ -34,14 +34,9 @@ def main(argv):
# Open the log and create logger:
logger = flocklab.get_logger()
if not logger:
print("Failed to init logger.")
sys.exit(errno.EINVAL)
# Get the config file:
if flocklab.load_config() != flocklab.SUCCESS:
logger.warn("Could not read configuration file. Exiting...")
sys.exit(errno.EAGAIN)
flocklab.load_config()
# Get command line parameters.
try:
......@@ -81,27 +76,31 @@ def main(argv):
# Connect to the DB:
try:
db = MySQLdb.connect(host=flocklab.config.get('database','host'), user=flocklab.config.get('database','user'), passwd=flocklab.config.get('database','password'), db=flocklab.config.get('database','database'))
cursor = db.cursor()
(cn, cur) = flocklab.connect_to_db()
except:
logger.warn("Could not connect to the database because: %s: %s" %(str(sys.exc_info()[0]), str(sys.exc_info()[1])))
sys.exit(errno.EAGAIN)
flocklab.error_logandexit("Could not connect to the database.")
# Check if platform is registered in database and get platform architecture:
sql = """SELECT `a`.`architecture` FROM `tbl_serv_platforms` LEFT JOIN `tbl_serv_architectures` `a` ON `tbl_serv_platforms`.`serv_platforms_key` = `a`.`platforms_fk` WHERE LOWER(name) = '%s' and `core`=%d;"""
cursor.execute(sql %(str(platform).lower(), core))
ret = cursor.fetchone()
sql = """SELECT `a`.`architecture` FROM `tbl_serv_platforms`
LEFT JOIN `tbl_serv_architectures` `a` ON `tbl_serv_platforms`.`serv_platforms_key` = `a`.`platforms_fk`
WHERE LOWER(name) = '%s' and `core`=%d;
"""
cur.execute(sql %(str(platform).lower(), core))
ret = cur.fetchone()
if not ret:
err_str = "Could not find platform %s in database. Exiting..." % (str(platform))
logger.warn(err_str)
if not quiet:
print(err_str)
db.close()
cn.close()
sys.exit(errno.EINVAL)
else:
arch = ret[0]
arch = arch.lower()
cur.close()
cn.close()
# Validate the image. This is dependent on the architecture of the target platform:
errcnt = 0
if arch == 'msp430':
......
......@@ -227,18 +227,11 @@ def main(argv):
errors = []
_serial_service_file = None
# Set timezone to UTC:
os.environ['TZ'] = 'UTC'
time.tzset()
# Get logger:
logger = flocklab.get_logger()
# Get the config file:
if flocklab.load_config() != flocklab.SUCCESS:
msg = "Could not read configuration file. Exiting..."
flocklab.error_logandexit(msg, errno.EAGAIN)
#logger.debug("Read configuration file.")
flocklab.load_config()