
Commit ce909f1b authored by matthmey

fixed configuration with forward()

parent 8b642f83
@@ -3,4 +3,3 @@ from __future__ import absolute_import
 from . import data
 from . import global_config
 from .convenience import *
@@ -17,4 +17,4 @@ def dat(x):
         dask.delayed object --
     """
-    return dask.delayed(x)
\ No newline at end of file
+    return dask.delayed(x)
@@ -190,15 +190,13 @@ def configuration(delayed, request, keys=None, default_merge=None):
         # print(f"configuring {k}",requests[k])
         # set configuration for this node k
         # If we create a delayed object from a class, `self` will be dsk[k][1]
-        print(dsk[k])
-        print(type(dsk[k][1]))
         argument_is_node = None
-        if isinstance(dsk[k],tuple):
-            # check the first two arguments # TODO: possibly change this to only the first argument
-            for ain in range(2):
-                if hasattr(dsk[k][ain], '__self__'):
+        if isinstance(dsk[k], tuple):
+            # check the first argument
+            # TODO: make tests for all use cases and then remove for-loop
+            for ain in range(1):
+                if hasattr(dsk[k][ain], "__self__"):
                     if isinstance(dsk[k][ain].__self__, Node):
                         argument_is_node = ain

         # Check if we get a node of type Node class
@@ -226,27 +224,24 @@ def configuration(delayed, request, keys=None, default_merge=None):
             else:
                 new_request = requests[k]

-            if (
+            if (new_request[0] is not None and
                 "requires_request" in new_request[0]
                 and new_request[0]["requires_request"] == True
             ):
                 del new_request[0]["requires_request"]
                 # TODO: check if we need a deepcopy here!
-                input_requests[k] = copy.deepcopy(
-                    new_request[0]
-                )
+                input_requests[k] = copy.deepcopy(new_request[0])

             # update dependencies
             current_deps = get_dependencies(dsk, k, as_list=True)
             for i, d in enumerate(current_deps):
                 if d in requests:
                     requests[d] += new_request
-                    remove[d] = remove[d] and (not new_request[0])
+                    remove[d] = remove[d] and (new_request[0] is None)
                 else:
                     requests[d] = new_request
-                    remove[d] = not new_request[
-                        0
-                    ]  # if we received an empty dictionary flag deps for removal
+                    # if we received None
+                    remove[d] = new_request[0] is None

                 # only configure each node once in a round!
                 if d not in new_work and d not in work:  # TODO: verify this
@@ -261,17 +256,20 @@ def configuration(delayed, request, keys=None, default_merge=None):
     # After we have aquired all requests we can input the required_requests as a input node to the requiring node
     # we assume that the last argument is the request
     for k in input_requests:
-        if isinstance(out[k][-1],dict):
-            out[k][-1].update(input_requests[k])
+        # Here we assume that we always receive the same tuple of (bound method, data, request)
+        # If the interface changes this will break #TODO: check for all cases
+        if isinstance(out[k][2], dict):
+            out[k][2].update(input_requests[k])
         else:
             # replace the last entry
-            out[k] = out[k][:-1] + (input_requests[k],)
+            out[k] = out[k][:2] + (input_requests[k],)

     # convert to delayed object
     from dask.delayed import Delayed

     in_keys = list(flatten(keys))
     # print(in_keys)
     if len(in_keys) > 1:
         collection = [Delayed(key=key, dsk=out) for key in in_keys]
     else:
@@ -282,7 +280,6 @@ def configuration(delayed, request, keys=None, default_merge=None):
     return collection


 def optimize_freeze(dsk, keys, request_key="request"):
     """ Return new dask with tasks removed which are unnecessary because a later stage
     reads from cache
...
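For orientation (this note and the sketch below are editorial, not part of the commit): configuration() walks the dask graph from the requested keys back towards the sources, lets each Node merge the requests of its consumers, and, when a node flagged requires_request in its configure() step, injects the merged request into the third slot of its task tuple (bound method, data, request). A minimal, self-contained sketch of that request-propagation idea, using made-up node functions rather than the stuett API:

# Editorial sketch only: propagate a request backwards through a tiny task
# graph and inject it into the request slot of each task, mirroring the
# (callable, data, request) layout used in the diff above.

def source(data, request):
    # pretend to load data for the requested range
    return list(range(request["start_time"], request["end_time"]))

def downsample(data, request):
    # keep every second sample
    return data[::2]

# graph in dask's dict format: key -> (callable, input key or literal, request)
graph = {
    "source": (source, None, None),
    "downsample": (downsample, "source", None),
}

def configure(graph, key, request):
    """Fill the request slot of `key` and of all its dependencies."""
    func, dep, _ = graph[key]
    graph[key] = (func, dep, dict(request))
    if isinstance(dep, str):  # dependency is another task key
        configure(graph, dep, request)

def compute(graph, key):
    func, dep, request = graph[key]
    data = compute(graph, dep) if isinstance(dep, str) else dep
    return func(data, request)

configure(graph, "downsample", {"start_time": 0, "end_time": 10})
print(compute(graph, "downsample"))  # -> [0, 2, 4, 6, 8]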
 import permasense

 import numpy as np
 import pandas as pd
 import pyarrow as pa
 import warnings

 import datetime as dt

 from .management import DataSource
 from ..core import configuration


 class DataCollector(DataSource):
     def __init__(self, data_paths=[], granularities=[]):
         """Add and choose data path according to its granularity.
@@ -21,35 +22,38 @@ class DataCollector(DataSource):
             granularities {list} -- a list of sorted granularities (default: {[]})
         """
         super().__init__()
         self.data_paths = data_paths
         self.granularities = granularities
-        if(len(self.data_paths) != len(self.granularities)):
-            raise ValueError("Each granularity is supposed to have its corresponding data manager")
+        if len(self.data_paths) != len(self.granularities):
+            raise ValueError(
+                "Each granularity is supposed to have its corresponding data manager"
+            )
         if len(self.granularities) > 1 and not self.is_sorted(self.granularities):
-            raise ValueError('Granularities should be sorted')
+            raise ValueError("Granularities should be sorted")

     def forward(self, data=None, request=None):
-        if(len(self.data_paths) != len(self.granularities)):
-            raise ValueError("Each granularity is supposed to have its corresponding data manager")
+        if len(self.data_paths) != len(self.granularities):
+            raise ValueError(
+                "Each granularity is supposed to have its corresponding data manager"
+            )
         if len(self.granularities) > 1 and not self.is_sorted(self.granularities):
-            raise ValueError('Granularities should be sorted')
+            raise ValueError("Granularities should be sorted")

         # TODO: change to generic indices or slices
-        granularity = request['end_time'] - request['start_time']
+        granularity = request["end_time"] - request["start_time"]
         data_path = None
         for i in range(len(self.granularities)):
-            print(i,granularity,'<',self.granularities[i],self.data_paths[i])
+            print(i, granularity, "<", self.granularities[i], self.data_paths[i])
             if granularity < self.granularities[i]:
                 data_path = self.data_paths[i]
                 break

         if data_path is None:
-            raise AttributeError('No data manager can be used for this timeframe')
+            raise AttributeError("No data manager can be used for this timeframe")

         return data_path

     def is_sorted(self, l):
@@ -61,4 +65,4 @@ class DataCollector(DataSource):
         Returns:
             [bool] -- if the list is sorted, return true
         """
         return all(a <= b for a, b in zip(l, l[1:]))
\ No newline at end of file
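As an editorial aside (not part of the diff): DataCollector.forward returns the first data path whose granularity exceeds the requested time span, so coarser requests get routed to more aggressively downsampled sources. A standalone sketch of that selection rule, with made-up path labels instead of stuett nodes:

# Editorial sketch of the path-selection rule in DataCollector.forward.
import pandas as pd

data_paths = ["full_resolution", "downsampled"]  # stand-ins for delayed nodes
granularities = [pd.to_timedelta(180, "s"), pd.to_timedelta(2, "d")]

request = {"start_time": pd.Timestamp("2017-08-01"),
           "end_time": pd.Timestamp("2017-08-02")}
span = request["end_time"] - request["start_time"]  # one day

chosen = None
for path, granularity in zip(data_paths, granularities):
    if span < granularity:  # first granularity that covers the span wins
        chosen = path
        break

print(chosen)  # -> "downsampled" (one day is not below 180 seconds)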
@@ -27,12 +27,15 @@ import datetime as dt
 from pathlib import Path

+def forward(x, data=None, request=None):
+    print("inforward", x, data, request)

 class DataSource(StuettNode):
     def __init__(self, **kwargs):
         super().__init__(kwargs=kwargs)

     def __call__(self, data=None, request=None, delayed=False):
+        print(data,request)
         if data is not None:
             if request is None:
                 request = data
@@ -48,19 +51,23 @@ class DataSource(StuettNode):
             config.update(request)

         # TODO: change when rewriting for general indices
-        if 'start_time' in config and config['start_time'] is not None:
-            config['start_time'] = pd.to_datetime(config['start_time'], utc=True).tz_localize(
+        if "start_time" in config and config["start_time"] is not None:
+            config["start_time"] = pd.to_datetime(
+                config["start_time"], utc=True
+            ).tz_localize(
                 None
             )  # TODO: change when xarray #3291 is fixed

-        if 'end_time' in config and config['end_time'] is not None:
-            config['end_time'] = pd.to_datetime(config['end_time'], utc=True).tz_localize(
+        if "end_time" in config and config["end_time"] is not None:
+            config["end_time"] = pd.to_datetime(
+                config["end_time"], utc=True
+            ).tz_localize(
                 None
             )  # TODO: change when xarray #3291 is fixed

         if delayed:
-            return dask.delayed(self.forward)(None,config)
+            return dask.delayed(self.forward)(None, config)
         else:
-            return self.forward(None,config)
+            return self.forward(None, config)

     def configure(self, requests=None):
         """ Default configure for DataSource nodes
@@ -72,22 +79,38 @@ class DataSource(StuettNode):
         Returns:
             dict -- Original request or merged requests
         """
-        requests = super().configure(requests)
+        requests = super().configure(requests)  # merging request here
         requests["requires_request"] = True

         return requests


 class GSNDataSource(DataSource):
     def __init__(
-        self, deployment=None, vsensor=None, position=None, start_time=None, end_time=None, **kwargs
+        self,
+        deployment=None,
+        vsensor=None,
+        position=None,
+        start_time=None,
+        end_time=None,
+        **kwargs,
     ):
-        super().__init__(deployment=deployment, position=position, vsensor=vsensor, start_time=start_time, end_time=end_time, kwargs=kwargs)
+        super().__init__(
+            deployment=deployment,
+            position=position,
+            vsensor=vsensor,
+            start_time=start_time,
+            end_time=end_time,
+            kwargs=kwargs,
+        )

     def forward(self, data=None, request=None):
         #### 1 - DEFINE VSENSOR-DEPENDENT COLUMNS ####
         colnames = pd.read_csv(
-            Path(get_setting("metadata_directory")).joinpath("vsensor_metadata/{:s}_{:s}.csv".format(
-                request["deployment"], request["vsensor"])
+            Path(get_setting("metadata_directory")).joinpath(
+                "vsensor_metadata/{:s}_{:s}.csv".format(
+                    request["deployment"], request["vsensor"]
+                )
             ),
             skiprows=0,
         )
@@ -131,8 +154,10 @@ class GSNDataSource(DataSource):
             "&c_max[1]={:02d}"
         ).format(
             virtual_sensor,
-            pd.to_datetime(request['start_time'],utc=True).strftime("%d/%m/%Y+%H:%M:%S"),
-            pd.to_datetime(request['end_time'],utc=True).strftime("%d/%m/%Y+%H:%M:%S"),
+            pd.to_datetime(request["start_time"], utc=True).strftime(
+                "%d/%m/%Y+%H:%M:%S"
+            ),
+            pd.to_datetime(request["end_time"], utc=True).strftime("%d/%m/%Y+%H:%M:%S"),
             virtual_sensor,
             int(request["position"]) - 1,
             request["position"],
@@ -158,13 +183,12 @@ class GSNDataSource(DataSource):
             url, skiprows=2
         )  # skip header lines (first 2) for import: skiprows=2
         df = pd.DataFrame(columns=columns_new)
-        df.time = pd.to_datetime(d.generation_time,utc=True)
+        df.time = pd.to_datetime(d.generation_time, utc=True)

         # if depo in ['mh25', 'mh27old', 'mh27new', 'mh30', 'jj22', 'jj25', 'mh-nodehealth']:
         # d = d.convert_objects(convert_numeric=True) # TODO: rewrite to remove warning
         for k in list(df):
-            df[k]=pd.to_numeric(df[k], errors='ignore')
+            df[k] = pd.to_numeric(df[k], errors="ignore")

         # df = pd.DataFrame(columns=columns_new)
         # df.time = timestamp2datetime(d['generation_time']/1000)
@@ -187,7 +211,7 @@ class GSNDataSource(DataSource):
         df = df.set_index("time")
         df = df.sort_index(axis=1)

         x = xr.DataArray(df, dims=["time", "name"], name="CSV")

         try:
...
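For readers unfamiliar with the pattern in the last hunk (again an editorial sketch with invented column names, not stuett code): the GSN response frame is indexed by time, its columns are sorted, and the result is wrapped in an xarray DataArray with explicit dimension names:

import pandas as pd
import xarray as xr

# Minimal stand-in for the GSN dataframe: a time column plus two value columns.
df = pd.DataFrame(
    {
        "time": pd.to_datetime(["2017-08-01 00:00", "2017-08-01 00:01"]),
        "position": [3.0, 3.0],
        "temperature": [1.5, 1.6],
    }
)

df = df.set_index("time")
df = df.sort_index(axis=1)  # deterministic column order

# Same wrapping as in the diff: rows become "time", columns become "name".
x = xr.DataArray(df, dims=["time", "name"], name="CSV")
print(x)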
 import stuett

 from pathlib import Path
 import pandas as pd

 test_data_dir = Path(__file__).absolute().parent.joinpath("..", "data")

 stuett.global_config.set_setting("user_dir", test_data_dir.joinpath("user_dir/"))
@@ -14,43 +14,41 @@ def test_collector():
     node = stuett.data.CsvSource(filename)
     minmax_rate2 = stuett.data.MinMaxDownsampling(rate=2, dim="time")

-    print('creating delayed node')
+    print("creating delayed node")
     x = node(delayed=True)

-    print('downsampled delayed node')
-    downsampled = minmax_rate2(x,delayed=True)
+    print("downsampled delayed node")
+    downsampled = minmax_rate2(x, delayed=True)

-    print('DataCollector node')
-    data_paths = [x,downsampled]
-    granularities = [stuett.to_timedelta(180,'s'), stuett.to_timedelta(2,'d')]
+    print("DataCollector node")
+    data_paths = [x, downsampled]
+    granularities = [stuett.to_timedelta(180, "s"), stuett.to_timedelta(2, "d")]
     collector_node = stuett.data.DataCollector(data_paths, granularities)

-    print('Instatiating DataCollector node')
-    request = {'start_time':'2017-08-01', 'end_time':'2017-08-02'}
+    print("Instatiating DataCollector node")
+    request = {"start_time": "2017-08-01", "end_time": "2017-08-02"}
     path = collector_node(request=request)
     # print(type(path))

-    print('Configuration node')
+    print("Configuration node")
     import dask
     dsk, dsk_keys = dask.base._extract_graph_and_keys([path])

-    path = stuett.core.configuration(path,request)
+    path = stuett.core.configuration(path, request)
     dsk, dsk_keys = dask.base._extract_graph_and_keys([path])
     print(dsk)

     print(type(path))
-    print('executing delayed node')
+    print("executing delayed node")
     print(path.compute())

     # request = {'start_time':'2017-08-01 10:01:00', 'end_time':'2017-08-01 10:02:00'}
     # path = collector_node(request=request)
     # path = stuett.core.configuration(path,request)
     # print(path.compute())

-test_collector()
\ No newline at end of file
+test_collector()
@@ -15,6 +15,7 @@ class MyNode(stuett.core.StuettNode):
         super().__init__()

     def forward(self, data=None, request=None):
+        print(data, request)
         return data + 4

     def configure(self, requests=None):
@@ -24,16 +25,16 @@ class MyNode(stuett.core.StuettNode):
         return requests


-# class MyMerge(stuett.core.StuettNode):
-#     def forward(self, x, y):
-#         return x + y
+class MyMerge(stuett.core.StuettNode):
+    def forward(self, data, request):
+        return data[0] + data[1]


 class MySource(stuett.data.DataSource):
-    def __init__(self,start_time=None):
+    def __init__(self, start_time=None):
         super().__init__(start_time=start_time)

-    def forward(self, request=None):
+    def forward(self, data=None, request=None):
         return request["start_time"]
@@ -41,18 +42,40 @@ def test_configuration():
     node = MyNode()

     # create a stuett graph
-    x = node({"start_time": 0, "end_time": -1})
-    x = bypass(x)
-    x = node(x)
+    x_in = node(data=5, delayed=True)  # data input
+    x = bypass(x_in)
+    x = node(x, delayed=True)
+    print(x.compute())
+    # x.visualize()
+
+    import dask
+    dsk, dsk_keys = dask.base._extract_graph_and_keys([x])
+    print(dict(dsk))

     # create a configuration file
     config = {}

     # configure the graph
-    x_configured = stuett.core.configuration(x, config)
+    x_configured = stuett.core.configuration(
+        x, config
+    )  # BUG: somehow config cuts the graph off
+
+    import dask
+    dsk, dsk_keys = dask.base._extract_graph_and_keys([x_configured])
+    print(dsk)
+
+    x = x_configured.compute()
+    print(x)

     # TODO: finalize test

+
+test_configuration()
+

 def test_datasource():
     source = MySource()
     node = MyNode()
@@ -60,51 +83,84 @@ def test_datasource():
     # create a stuett graph
     x = source(delayed=True)
     x = bypass(x)