
Commit d7a6964d authored by matthmey

added MH Image source

parent 39bf3d51
# stuett
## Quickstart
You can install the package with
```
pip install poetry
poetry install
```
If you want to install it into an Anaconda environment, run
```
conda create -n stuett python==3.7 poetry
conda activate stuett
poetry install
```
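After installation you can verify the setup with a short sketch. This is a minimal example; `StuettNode` and its request API are taken from this commit's `core.py` and test suite:

```python
import stuett

# StuettNode.configure merges a list of requests into one
node = stuett.core.StuettNode()
merged = node.configure([{"start_time": 0, "end_time": 1}])
print(merged)  # -> {'start_time': 0, 'end_time': 1}
```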
@@ -22,13 +22,15 @@ toml = "^0.9"
# Dependencies with extras
dask = {extras = ["complete"], version = "^2.6.0"}
pandas = "^0.25.3"
toolz = "^0.10.0"
obspy = "^1.1.1"
appdirs = "^1.4.3"
obsplus = "^0.0.2"
zarr = "^2.3.2"
xarray = { git = "https://github.com/niowniow/xarray.git", branch = "strided_rolling" }
pillow = "^6.2.1"
xarray-extras = "^0.4.2"

# Optional dependencies (extras)
@@ -36,3 +38,4 @@ zarr = "^2.3.2"
pytest = "^3.0"
pytest-cov = "^2.4"
black = {version = "^19.10b0", allows-prereleases = true}
flake8 = "^3.7.9"
@@ -3,11 +3,12 @@ from dask.core import get_dependencies, flatten
import numpy as np
import copy


class Node(object):
    def __init__(self):
        pass

    def configure(self, requests):
        """ Before a task graph is executed each node is configured.
            The request is propagated from the end to the beginning
            of the DAG and each node's "configure" routine is called.
@@ -42,16 +43,18 @@ class Node(object):
            node from the task graph.
        """
        if not isinstance(requests, list):
            raise RuntimeError("Please provide a **list** of requests")
        if len(requests) > 1:
            raise RuntimeError(
                "Default configuration function cannot handle "
                "multiple requests. Please provide a custom "
                "configuration implementation"
            )

        return requests

    @dask.delayed
    def __call__(self, x, request=None):
        raise NotImplementedError()

    def get_config(self):
@@ -59,8 +62,9 @@ class Node(object):
        """
        raise NotImplementedError()

class StuettNode(Node):  # TODO: define where this class should be (maybe not here)
    def configure(self, requests):
        """ Default configure for stuett nodes
            Expects two keys per request (*start_time* and *end_time*)
            If multiple requests are passed, they will be merged
@@ -73,24 +77,25 @@ class StuettNode(Node): # TODO: define where this class should be (m
        Returns:
            dict -- Original request or merged requests
        """
        if not isinstance(requests, list):
            raise RuntimeError("Please provide a list of requests")

        # For time requests we just use the union of both time segments
        new_request = requests[0].copy()
        key_func = {"start_time": np.minimum, "end_time": np.maximum}
        for r in requests[1:]:
            for key in ["start_time", "end_time"]:
                if key in r:
                    if key in new_request:
                        new_request[key] = key_func[key](new_request[key], r[key])
                    else:
                        new_request[key] = r[key]

        return new_request
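For example, two overlapping time requests are merged into the union of their segments. A toy sketch using plain integers as timestamps (real requests would carry datetimes):

```python
import stuett

node = stuett.core.StuettNode()
merged = node.configure(
    [
        {"start_time": 5, "end_time": 10},
        {"start_time": 3, "end_time": 8},
    ]
)
# np.minimum / np.maximum take the union of both segments:
print(merged)  # -> {'start_time': 3, 'end_time': 10}
```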

def configuration(delayed, request, keys=None, default_merge=None):
    """ Configures each node of the graph by propagating the request from outputs
        to inputs.
        Each node checks if it can fulfil the request and what it needs to fulfil the request.
@@ -114,20 +119,16 @@ def configuration(delayed,request,keys=None,default_merge=None):
        dask.delayed or list -- Config-optimized delayed object or list of delayed objects
    """
    if not isinstance(delayed, list):
        collections = [delayed]

    # dsk = dask.base.collections_to_dsk(collections)
    dsk, dsk_keys = dask.base._extract_graph_and_keys(collections)
    dependencies, dependants = dask.core.get_deps(dsk)

    if keys is None:
        keys = dsk_keys

    if not isinstance(keys, (list, set)):
        keys = [keys]

    out_keys = []
@@ -135,17 +136,19 @@
    work = list(set(flatten(keys)))

    if isinstance(request, list):
        if len(request) != len(work):
            raise RuntimeError(
                "When passing multiple request items "
                "the number of request items must be the same "
                "as the number of keys"
            )
        requests = {work[i]: [request[i]] for i in range(len(request))}
    else:
        requests = {k: [request] for k in work}

    remove = {k: False for k in work}
    input_requests = {}
    while work:
        new_work = []
@@ -159,7 +162,11 @@
            # check if we have collected all dependencies so far
            # we will come back to this node another time
            # TODO: make a better check for the case when dependants[k] is a set, also: why is it a set in the first place..?
            if (
                k in dependants
                and len(dependants[k]) != len(requests[k])
                and not isinstance(dependants[k], set)
            ):
                # print(f'Waiting at {k}', dependants[k], requests[k])
                continue
@@ -167,11 +174,17 @@
            # set configuration for this node k
            # If we create a delayed object from a class, `self` will be dsk[k][1]
            if isinstance(dsk[k], tuple) and isinstance(
                dsk[k][1], Node
            ):  # Check if we get a node of type Node class
                # current_requests = [r for r in requests[k] if r]  # get all requests belonging to this node
                current_requests = requests[k]
                new_request = dsk[k][1].configure(
                    current_requests
                )  # Call the class configuration function
                if not isinstance(
                    new_request, list
                ):  # prepare the request return value
                    new_request = [new_request]
            else:  # We didn't get a Node class so there is no
                # custom configuration function: pass through
@@ -179,15 +192,22 @@
                if callable(default_merge):
                    new_request = default_merge(requests[k])
                else:
                    raise RuntimeError(
                        "No valid default merger supplied. Cannot merge requests. "
                        "Either convert your function to a class Node or provide "
                        "a default merger"
                    )
            else:
                new_request = requests[k]

            if (
                "requires_request" in new_request[0]
                and new_request[0]["requires_request"] == True
            ):
                del new_request[0]["requires_request"]
                input_requests[k] = copy.deepcopy(
                    new_request[0]
                )  # TODO: check if we need a deepcopy here!

            # update dependencies
            current_deps = get_dependencies(dsk, k, as_list=True)
@@ -197,11 +217,15 @@
                    remove[d] = remove[d] and (not new_request[0])
                else:
                    requests[d] = new_request
                    remove[d] = not new_request[
                        0
                    ]  # if we received an empty dictionary flag deps for removal

                # only configure each node once in a round!
                if d not in new_work and d not in work:  # TODO: verify this
                    new_work.append(
                        d
                    )  # TODO: Do we need to configure dependency if we'll remove it?

        work = new_work
@@ -211,25 +235,23 @@
    for k in input_requests:
        out[k] += (input_requests[k],)

    # convert to delayed object
    from dask.delayed import Delayed

    in_keys = list(flatten(keys))
    # print(in_keys)
    if len(in_keys) > 1:
        collection = [Delayed(key=key, dsk=out) for key in in_keys]
    else:
        collection = Delayed(key=in_keys[0], dsk=out)
        if isinstance(collection, list):
            collection = [collection]

    return collection
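A minimal end-to-end sketch of this propagation, mirroring `test_datasource` further down. `MySource` and `MyNode` are the illustrative classes from the test suite, not part of the library:

```python
import stuett

source = MySource()  # returns request["start_time"]
node = MyNode()      # adds 4; its configure() shifts start_time by 1

x = node(source())   # build the delayed graph
configured = stuett.core.configuration(x, {"start_time": 0, "end_time": 1})
print(configured.compute())  # -> 5, as asserted in the test below
```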

class Freezer(Node):
    def __init__(self, caching=True):
        self.caching = caching

    @dask.delayed
@@ -245,14 +267,16 @@ class Freezer(Node):
            xarray -- Data loaded from cache or input data passed through
        """
        if isinstance(x, dict):
            if self.is_cached(x) and self.caching:
                # TODO: load from cache and return it
                pass
            elif not self.caching:
                raise RuntimeError(f"If caching is disabled cannot perform request {x}")
            else:
                raise RuntimeError(
                    f"Result is not cached but cached result is requested with {x}"
                )

        if self.caching:
            # TODO: store the input data
@@ -260,13 +284,13 @@ class Freezer(Node):
        return x

    def configure(self, requests):
        if self.caching:
            return [{}]

        return config_conflict(requests)


def optimize_freeze(dsk, keys, request_key="request"):
    """ Return new dask with tasks removed which are unnecessary because a later stage
        reads from cache
        ``keys`` may be a single key or list of keys.
@@ -285,14 +309,16 @@ def optimize_freeze(dsk, keys, request_key="request"):
    seen = set()
    dependencies = dict()

    if request_key not in dsk:
        raise RuntimeError(
            f"Please provide a task graph which includes '{request_key}'"
        )

    request = dsk[request_key]

    def is_cached(task, request):
        if isinstance(task, tuple):
            if isinstance(task[0], Freezer):
                return task[0].is_cached(request)
        return False
@@ -303,7 +329,7 @@
        out_keys += work
        deps = []
        for k in work:
            if is_cached(dsk[k], request):
                cached_keys.append(k)
            else:
                deps.append((k, get_dependencies(dsk, k, as_list=True)))
@@ -319,7 +345,7 @@
    out = {k: dsk[k] for k in out_keys}

    # finally we need to replace the input of the caching nodes with the request
    cached = {k: (out[k][0], request_key) for k in cached_keys}
    out.update(cached)

    return out, dependencies
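A hedged sketch of the intended pruning, assuming a `Freezer` whose `is_cached` returns True for the given request (`load_data` is a hypothetical expensive upstream task):

```python
def load_data():
    ...  # hypothetical expensive task

frozen = Freezer()  # assume frozen.is_cached(request) is True here

dsk = {
    "request": {"start_time": 0, "end_time": 1},
    "raw": (load_data,),
    "cached": (frozen, "raw"),  # Freezer wraps the expensive task
}
out, deps = optimize_freeze(dsk, ["cached"])
# "raw" is pruned and the Freezer input is replaced by the request key:
# out["cached"] == (frozen, "request")
```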
from .management import *
from .processing import *
# from .collection import *
@@ -7,25 +7,29 @@ import xarray as xr


class MinMaxDownsampling(StuettNode):
    def __init__(self, rate=1):
        # since we always choose two values (min and max) per bucket the
        # internal downsampling rate must be a factor of two larger than
        # the effective (and desired) downsampling rate
        self.rate = rate * 2

    @dask.delayed
    def __call__(self, x):
        rolling = x.rolling(time=self.rate, stride=self.rate)
        # x_min = rolling.construct("time", stride=self.rate).min("time").dropna('time')
        # x_max = rolling.construct("time", stride=self.rate).max("time").dropna('time')

        x_min = rolling.min().dropna("time")
        x_max = rolling.max().dropna("time")

        x_ds = xr.concat(
            [x_min, x_max], "time"
        )  # TODO: better interleave instead of concat
        x_ds = x_ds.sortby("time")  # TODO: try to avoid this by using interleaving

        return x_ds

class Downsampling(StuettNode):
    ...
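The factor-of-two bookkeeping in `MinMaxDownsampling` above can be sanity-checked with a plain-numpy toy (a standalone sketch, not the class's xarray code path):

```python
import numpy as np

rate = 2                 # desired effective downsampling rate
internal = rate * 2      # bucket size: each bucket emits min AND max
x = np.arange(8.0)       # 8 input samples
buckets = x.reshape(-1, internal)  # 2 buckets of 4 samples each
out = np.sort(np.stack([buckets.min(1), buckets.max(1)], 1).ravel())
print(out)  # [0. 3. 4. 7.] -> 4 samples out of 8, i.e. effective rate 2
```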
@@ -8,16 +8,18 @@ import datetime as dt

# initialize global settings with certain default values
_GLOBAL_CONFIG_DICT = {
    "permasense_server": "http://data.permasense.ch/",
    "reference_time": dt.datetime(2000, 1, 1, 0, 0, 0),
}


def get_global_config():
    return _GLOBAL_CONFIG_DICT


def setting_exists(key):
    return key in _GLOBAL_CONFIG_DICT


def get_setting(key):
    """Returns the global setting for the given key
@@ -31,11 +33,13 @@ def get_setting(key):
    try:
        return _GLOBAL_CONFIG_DICT[key]
    except KeyError:
        print(
            f"""The stuett setting {key} was not found and is required by a function call.
            Set it before your first call to the stuett package.
            This can be done by either providing a settings file via stuett.load_config(),
            updating your user config file or updating the settings directly with
            stuett.set_setting('{key}',value)"""
        )


def get_setting_path(key):
@@ -47,14 +51,16 @@ def get_setting_path(key):
    Returns:
        [type] -- setting value
    """
    path = join(get_setting(key), "")
    if not os.path.isdir(path):
        warnings.warn("stuett requested a path which is invalid: {}".format(path))
    return path


def set_setting(key, value):
    _GLOBAL_CONFIG_DICT[key] = value


def load_config(filename):
    """Load settings from a yaml file.
@@ -71,17 +77,17 @@ def load_config(filename):
    """
    if os.path.isfile(filename):
        with open(filename, "r") as f:
            settings = yaml.safe_load(f)
    else:
        raise IOError("Parameter file not found [%s]" % filename)

    _GLOBAL_CONFIG_DICT.update(settings)


def get_user_config_file():
    return join(appdirs.AppDirs("stuett", "stuett").user_config_dir, "config.yml")


def load_user_config():
    """Check if a user specific config exists and load it
@@ -92,7 +98,9 @@ def load_user_config():
    if os.path.isfile(user_config):
        load_config(user_config)


def get_global_time():
    return dt.datetime(2000, 1, 1, 0, 0, 0)


load_user_config()  # initially load the user settings
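Typical usage of these helpers, assuming they are re-exported at the package level (the wildcard imports in `__init__.py` suggest this, but the exact export list is not shown in the diff; the `user_dir` key and the file path are hypothetical):

```python
import stuett

print(stuett.get_setting("permasense_server"))  # default from above
# -> http://data.permasense.ch/

stuett.set_setting("user_dir", "/tmp/stuett_user")  # hypothetical key

# load_config() reads a YAML mapping into the global dict, e.g.
#   permasense_server: http://data.permasense.ch/
#   user_dir: /home/me/stuett
stuett.load_config("settings.yml")  # hypothetical path
```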
user_dir/*
MHDSLR/*
!.gitkeep
import datetime as dt

"""
Some example data which can be used by tests
"""

channels = ["EHE", "EHN", "EHZ"]
stations = ["MH36", "MH44", "MH48", "MH52", "MH54"]

# start_time = dt.datetime(2017,7,14,7,7,0,tzinfo=dt.timezone.utc)
# end_time = dt.datetime(2017,7,14,7,7,7,tzinfo=dt.timezone.utc)
start_time = dt.datetime(2017, 7, 14, 7, 7, 0, tzinfo=dt.timezone.utc)
end_time = dt.datetime(2017, 7, 14, 7, 7, 10, tzinfo=dt.timezone.utc)
offset = dt.timedelta(days=1)

config = {
    "channel": channels[0],
    "station": stations[0],
    "start_time": start_time,
    "end_time": end_time,
}

# TODO: use @pytest.mark.parametrize
@@ -9,33 +9,37 @@ import pytest


def bypass(x):
    return x


class MyNode(stuett.core.StuettNode):
    @stuett.dat
    def __call__(self, x):
        return x + 4

    def configure(self, requests=None):
        requests = super().configure(requests)
        if "start_time" in requests:
            requests["start_time"] += 1
        return requests


class MyMerge(stuett.core.StuettNode):
    @stuett.dat
    def __call__(self, x, y):
        return x + y


class MySource(stuett.data.DataSource):
    @stuett.dat
    def __call__(self, request=None):
        return request["start_time"]


class TestConfiguration(object):
    def test_configuration(self):
        node = MyNode()

        # create a stuett graph
        x = node({"start_time": 0, "end_time": -1})
        x = bypass(x)
        x = node(x)
@@ -43,9 +47,9 @@
        config = {}

        # configure the graph
        x_configured = stuett.core.configuration(x, config)

        # TODO: finalize test

    def test_datasource(self):
        source = MySource()
@@ -57,16 +61,15 @@
        x = node(x)

        # create a configuration file
        config = {"start_time": 0, "end_time": 1}

        # configure the graph
        configured = stuett.core.configuration(x, config)
        x_configured = configured.compute()

        assert x_configured == 5

    def test_merging(self):
        source = MySource()
        node = MyNode()
@@ -74,17 +77,18 @@
        # create a stuett graph
        import dask