Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
M
mossutils
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Operations
Operations
Incidents
Environments
Packages & Registries
Packages & Registries
Container Registry
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
dinfk-lecturers
mossutils
Commits
49f918b9
Commit
49f918b9
authored
May 02, 2020
by
scmalte
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
aggr.py: joining cluster data with exports from CX and eDoz
parent
d6652bb1
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
57 additions
and
13 deletions
+57
-13
mossutils/aggr.py
mossutils/aggr.py
+57
-13
No files found.
mossutils/aggr.py
View file @
49f918b9
...
...
@@ -10,23 +10,67 @@ def main(
clusters_summary_csv_file
=
DEFAULT_CLUSTERS_SUMMARY_CSV_FILE
,
cx_course_students_csv_file
=
DEFAULT_CX_COURSE_STUDENTS_CSV_FILE
):
clusters_csv
=
pd
.
read_csv
(
clusters_summary_csv_file
)
students_csv
=
pd
.
read_csv
(
cx_course_students_csv_file
)
clusters_csv
:
pd
.
DataFrame
=
pd
.
read_csv
(
clusters_summary_csv_file
)
# Read CX course data, reduce to relevant columns, set index column
course_csv
:
pd
.
DataFrame
=
pd
.
read_csv
(
cx_course_students_csv_file
)
course_csv
=
course_csv
[[
"Legi"
,
"Lastname"
,
"Firstname"
,
"Email"
]]
course_csv
.
set_index
(
"Legi"
,
inplace
=
True
)
clusters
=
clusters_csv
.
groupby
(
"cluster_id"
)
for
name
,
cluster
in
clusters
:
ids
=
pd
.
concat
([
cluster
[
"id1"
],
cluster
[
"id2"
]],
ignore_index
=
False
)
# print(cluster[["id1", "id2"]].unique())
print
(
ids
.
unique
())
# Analogous for eDoz course data
edoz1_csv
:
pd
.
DataFrame
=
pd
.
read_csv
(
"edoz-252083200L.csv"
,
sep
=
"\t"
)
edoz1_csv
=
edoz1_csv
[[
"Nummer"
,
"Departement"
]]
edoz1_csv
.
rename
(
columns
=
{
"Nummer"
:
"Legi"
},
inplace
=
True
)
edoz1_csv
.
set_index
(
"Legi"
,
inplace
=
True
)
# print(edoz1_csv)
# print("edoz1_csv.index.is_unique = {}".format(edoz1_csv.index.is_unique))
edoz2_csv
:
pd
.
DataFrame
=
pd
.
read_csv
(
"edoz-252084800L.csv"
,
sep
=
"\t"
)
edoz2_csv
=
edoz2_csv
[[
"Nummer"
,
"Departement"
]]
edoz2_csv
.
rename
(
columns
=
{
"Nummer"
:
"Legi"
},
inplace
=
True
)
edoz2_csv
.
set_index
(
"Legi"
,
inplace
=
True
)
# print(edoz2_csv.index)
# print("edoz2_csv.index.is_unique = {}".format(edoz2_csv.index.is_unique))
# for wtf in clusters[["id1", "id2"]]:
# print(wtf)
# Vertically concatenate the eDoz data. Since students may be enrolled in
# multiple courses, duplicate rows are dropped afterwards.
edoz_csv
:
pd
.
DataFrame
=
pd
.
concat
([
edoz1_csv
,
edoz2_csv
])
edoz_csv
.
drop_duplicates
(
inplace
=
True
)
assert
edoz_csv
.
index
.
is_unique
,
"Expected unique indices (= legis) in edoz_csv"
# # Show rows with non-unique indices (https://stackoverflow.com/questions/20199129)
# print(edoz_csv[edoz_csv.index.duplicated(keep=False)])
cluster_groups
:
pd
.
DataFrameGroupBy
=
clusters_csv
.
groupby
(
"cluster_id"
)
for
_
,
cluster
in
cluster_groups
:
# cluster: pd.DataFrame
print
(
"-"
*
60
)
# Get all ids (= legis) participating in a cluster
ids_values
:
numpy
.
ndarray
=
pd
.
concat
([
cluster
[
"id1"
],
cluster
[
"id2"
]]).
unique
()
# ids = pd.Series(ids_values, name="Legi", index=ids_values)
# for cluster in clusters.groups:
# print(cluster)
# Performs an inner join on the keys; here: legis
# https://pandas.pydata.org/pandas-docs/stable/getting_started/comparison/comparison_with_sql.html#compare-with-sql-join
# join = pd.merge(ids, course_csv, left_index=True, right_index=True)
# for cluster in clusters.groupby("cluster_id"):
# print(cluster["id1"])
join
=
course_csv
.
loc
[
ids_values
]
print
(
"========== ids_values"
)
print
(
ids_values
.
shape
)
print
(
ids_values
)
print
(
"========== course_csv"
)
print
(
course_csv
)
print
(
"========== join"
)
print
(
join
.
shape
)
print
(
join
)
# print(edoz_csv)
# full_cluster_data = pd.merge(
# join,
# edoz_csv,
# left_index=True, right_index=True)
# print(full_cluster_data)
break
if
__name__
==
"__main__"
:
main
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment