Updated thesis

parent 501ba5da
 ... ... @@ -257,13 +257,6 @@ class CellList: min_ = min(cellCounts) avg = n / self.totalCells """ print("Cell statistics:") print("Number of cells (%d, %d, %d)" % (self.nCells, self.nCells, self.nCells)) print("Max. pores per cell: %d (%f%%)" % (max_, max_ / n * 100.)) print("Min. pores per cell: %d (%f%%)" % (min_, min_ / n * 100.)) print("Avg. pores per cell: %.1f (%f%%)" % (avg, avg / n * 100.)) """ return (max_, min_, avg) # Remove pore from sorted pores ... ... @@ -805,7 +798,7 @@ def generate_dendrogram(basenet: Network, targetsize: List[int], \ serialThresh = nthreads * 4 # Pores in interior that are not fully-connected yet poresRemain = pores[:n] while (throatsUnrealized > 0): while (throatsLeft > 0): if (not mute): print("Throats left: %d Throats Unrealized: %d (%.2f%%)" % \ ... ... @@ -846,6 +839,7 @@ def generate_dendrogram(basenet: Network, targetsize: List[int], \ if (len(poreTtbr) == 0): continue # Nothing left to do # Set of indices of connected pores pConn = cellList.connPores[pore.index] # Compute neighbor matches for pore matches = cellList.fetch_matches(pore) for throatIdx, match in zip(pore.throats, matches): ... ... @@ -938,12 +932,12 @@ def generate_dendrogram(basenet: Network, targetsize: List[int], \ cellList.connPores[pore.index].clear() poresRemain = nextPores throatsUnrealized = throatCount // 2 # Counted all throats twice throatsLeft = totalThroats - len(throats) throatsLeft = throatCount // 2 # Counted all throats twice throatsUnrealized = totalThroats - len(throats) # Unrealized throats nUnrealized = throatsLeft nUnrealized = throatsUnrealized # Free memory (not needed anymore) del cellList ... ...
 ... ... @@ -45,29 +45,35 @@ \newlabel{fig:balance}{{1}{4}{Pressures $p_{\mathrm {in}}$ and $p_{\mathrm {out}}$ are applied to in-pores and out-pores respectively, driving the network flow. The resulting pressure system is solved with the available solvers and the different fluxes are shown for the individual pores in the case of PETSc (using 4 processes). Since the sequential solvers produce an identical plot, they are omitted here. \relax }{figure.caption.1}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {3}Parallel Network Generation}{4}{section.3}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}Cell Lists}{4}{subsection.3.1}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Multiprocessing}{4}{subsection.3.2}\protected@file@percent } \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {2}{\ignorespaces Cell lists visualized in 2D. The neighborhood of the pore highlighted in red is marked in blue. Because the cell-size is the maximally permissible throat length $L_m$, only the pores contained within the blue region must be considered during neighbor search. Finally, the periodic buffer layers, containing copies of pores on the opposite side from the interior, are painted in orange.\relax }}{5}{figure.caption.2}\protected@file@percent } \newlabel{fig:cell}{{2}{5}{Cell lists visualized in 2D. The neighborhood of the pore highlighted in red is marked in blue. Because the cell-size is the maximally permissible throat length $L_m$, only the pores contained within the blue region must be considered during neighbor search. 
Finally, the periodic buffer layers, containing copies of pores on the opposite side from the interior, are painted in orange.\relax }{figure.caption.2}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {3.2}Iterative Algorithm}{5}{subsection.3.2}\protected@file@percent } \@writefile{loa}{\defcounter {refsection}{0}\relax }\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Connect pores in parallel\relax }}{6}{algorithm.1}\protected@file@percent } \newlabel{alg:connect}{{1}{6}{Connect pores in parallel\relax }{algorithm.1}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {3.3}Iterative Algorithm}{6}{subsection.3.3}\protected@file@percent } \@writefile{lot}{\defcounter {refsection}{0}\relax }\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.\relax }}{6}{table.caption.3}\protected@file@percent } \newlabel{table:iter}{{1}{6}{Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. 
The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.\relax }{table.caption.3}{}} \@writefile{loa}{\defcounter {refsection}{0}\relax }\@writefile{loa}{\contentsline {algorithm}{\numberline {1}{\ignorespaces Connect pores in parallel\relax }}{7}{algorithm.1}\protected@file@percent } \newlabel{alg:connect}{{1}{7}{Connect pores in parallel\relax }{algorithm.1}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {3.4}Results}{7}{subsection.3.4}\protected@file@percent } \abx@aux@cite{MEYER2021101592} \abx@aux@segm{0}{0}{MEYER2021101592} \@writefile{lot}{\defcounter {refsection}{0}\relax }\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.\relax }}{7}{table.caption.3}\protected@file@percent } \newlabel{table:iter}{{1}{7}{Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. 
The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.\relax }{table.caption.3}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {4}Performance Analysis}{7}{section.4}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Strong Scaling}{7}{subsection.4.1}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Comparison}{7}{subsection.4.2}\protected@file@percent } \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {3}{\ignorespaces For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network in total, we compute the \emph {speedup} for $1, 2, 4, 8, 16$ and $32$ threads. The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.\relax }}{8}{figure.caption.4}\protected@file@percent } \newlabel{fig:strong}{{3}{8}{For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network in total, we compute the \emph {speedup} for $1, 2, 4, 8, 16$ and $32$ threads. The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.\relax }{figure.caption.4}{}} \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node (same as above). The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. 
The parallel version is consistently between $10$ and $20$ times faster.\relax }}{8}{figure.caption.4}\protected@file@percent } \newlabel{fig:comp}{{4}{8}{Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node (same as above). The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. The parallel version is consistently between $10$ and $20$ times faster.\relax }{figure.caption.4}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Flow Simulation}{9}{subsection.4.3}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {4}Performance Analysis}{8}{section.4}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.1}Strong Scaling}{8}{subsection.4.1}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.2}Comparison}{8}{subsection.4.2}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {subsection}{\numberline {4.3}Flow Simulation}{8}{subsection.4.3}\protected@file@percent } \abx@aux@cite{MEYER2021101592} \abx@aux@segm{0}{0}{MEYER2021101592} \newlabel{fig:strong}{{3a}{9}{For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network, we compute the \emph {speedup} for $1, 2, 4, 8, 16$ and $32$ threads. 
The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.\relax }{figure.caption.4}{}} \newlabel{sub@fig:strong}{{a}{9}{For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network, we compute the \emph {speedup} for $1, 2, 4, 8, 16$ and $32$ threads. The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.\relax }{figure.caption.4}{}} \newlabel{fig:comp}{{3b}{9}{Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node. Measurements are only performed once, since the impact of noise is negligible for sufficiently large networks. The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. The parallel version is consistently between $10$ and $20$ times faster.\relax }{figure.caption.4}{}} \newlabel{sub@fig:comp}{{b}{9}{Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node. Measurements are only performed once, since the impact of noise is negligible for sufficiently large networks. The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. 
The parallel version is consistently between $10$ and $20$ times faster.\relax }{figure.caption.4}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {5}Discussion}{9}{section.5}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {6}Acknowledgments}{9}{section.6}\protected@file@percent } \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {A}PETSc Installation}{10}{appendix.A}\protected@file@percent } \newlabel{appendix:install}{{A}{10}{PETSc Installation}{appendix.A}{}} \@writefile{toc}{\defcounter {refsection}{0}\relax }\@writefile{toc}{\contentsline {section}{\numberline {B}Base Network}{10}{appendix.B}\protected@file@percent } \newlabel{appendix:base}{{B}{10}{Base Network}{appendix.B}{}} \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {5}{\ignorespaces Network consisting of 2636 pores and 4291 throats, inscribed within a cube extending $1.07$ mm in each spatial direction. The pore-arrangement is obviously \emph {not uniform}, as can be seen by the clustering of pores in some regions, while others are mostly undisturbed. The \emph {porosity}, measured as the relative fraction of volume taken up by the void-space geometry, is roughly $32\%$.\relax }}{10}{figure.caption.5}\protected@file@percent } \newlabel{fig:base}{{5}{10}{Network consisting of 2636 pores and 4291 throats, inscribed within a cube extending $1.07$ mm in each spatial direction. The pore-arrangement is obviously \emph {not uniform}, as can be seen by the clustering of pores in some regions, while others are mostly undisturbed. 
The \emph {porosity}, measured as the relative fraction of volume taken up by the void-space geometry, is roughly $32\%$.\relax }{figure.caption.5}{}} \@writefile{lof}{\defcounter {refsection}{0}\relax }\@writefile{lof}{\contentsline {figure}{\numberline {4}{\ignorespaces Network consisting of 2636 pores and 4291 throats, inscribed within a cube extending $1.07$ mm in each spatial direction. The pore-arrangement is obviously \emph {not uniform}, as can be seen by the clustering of pores in some regions, while others are mostly undisturbed. The \emph {porosity}, measured as the relative fraction of volume taken up by the void-space geometry, is roughly $32\%$.\relax }}{10}{figure.caption.5}\protected@file@percent } \newlabel{fig:base}{{4}{10}{Network consisting of 2636 pores and 4291 throats, inscribed within a cube extending $1.07$ mm in each spatial direction. The pore-arrangement is obviously \emph {not uniform}, as can be seen by the clustering of pores in some regions, while others are mostly undisturbed. The \emph {porosity}, measured as the relative fraction of volume taken up by the void-space geometry, is roughly $32\%$.\relax }{figure.caption.5}{}} \abx@aux@refcontextdefaultsdone \abx@aux@defaultrefcontext{0}{MEYER2021103936}{none/global//global/global} \abx@aux@defaultrefcontext{0}{MEYER2021101592}{none/global//global/global} ... ...
 ... ... @@ -2193,6 +2193,7 @@ MEYER2021101592 MEYER2021103936 MEYER2021101592 MEYER2021101592 ... ...
This diff is collapsed.
 ... ... @@ -6,10 +6,14 @@ \BOOKMARK [-]{subsection.2.4}{Results}{section.2}% 6 \BOOKMARK [-]{section.3}{Parallel Network Generation}{}% 7 \BOOKMARK [-]{subsection.3.1}{Cell Lists}{section.3}% 8 \BOOKMARK [-]{subsection.3.2}{Iterative Algorithm}{section.3}% 9 \BOOKMARK [-]{section.4}{Performance Analysis}{}% 10 \BOOKMARK [-]{subsection.4.1}{Strong Scaling}{section.4}% 11 \BOOKMARK [-]{subsection.4.2}{Comparison}{section.4}% 12 \BOOKMARK [-]{subsection.4.3}{Flow Simulation}{section.4}% 13 \BOOKMARK [-]{appendix.A}{PETSc Installation}{}% 14 \BOOKMARK [-]{appendix.B}{Base Network}{}% 15 \BOOKMARK [-]{subsection.3.2}{Multiprocessing}{section.3}% 9 \BOOKMARK [-]{subsection.3.3}{Iterative Algorithm}{section.3}% 10 \BOOKMARK [-]{subsection.3.4}{Results}{section.3}% 11 \BOOKMARK [-]{section.4}{Performance Analysis}{}% 12 \BOOKMARK [-]{subsection.4.1}{Strong Scaling}{section.4}% 13 \BOOKMARK [-]{subsection.4.2}{Comparison}{section.4}% 14 \BOOKMARK [-]{subsection.4.3}{Flow Simulation}{section.4}% 15 \BOOKMARK [-]{section.5}{Discussion}{}% 16 \BOOKMARK [-]{section.6}{Acknowledgments}{}% 17 \BOOKMARK [-]{appendix.A}{PETSc Installation}{}% 18 \BOOKMARK [-]{appendix.B}{Base Network}{}% 19
No preview for this file type
No preview for this file type
 ... ... @@ -6,10 +6,13 @@ % packages \usepackage[backend=biber, sorting=none]{biblatex} \usepackage[labelfont=bf]{caption} % figure captions (bold-face) \usepackage{caption} \usepackage{subcaption} % sub-figures \usepackage[labelfont=bf]{caption, subcaption} % figure captions (bold-face) \usepackage{multicol} % multicolumn environment \usepackage{tikz} % figures \usepackage{xcolor} % color for cell lists picture \usepackage[bottom]{footmisc} % footnotes below figures \usepackage{hyperref} % web-links \usepackage{listings} % line break in verb environment \usepackage{amsmath} % math commands ... ... @@ -49,7 +52,7 @@ \newpage \begin{center} \Large{\textbf{Abstract}} \end{center} \hspace{0.5cm}To study the flow properties of large void-space geometries found in porous media such as f.ex. soil or gravel, \cite{MEYER2021103936} describes and implements routines for the generation \& simulation of flow networks in the Python library \emph{netflow} \cite{MEYER2021101592}. Based on a relatively small base network acquired via tomographic scans, the generated flow network is of intermediate size (millions of pores). To extend this procedure to even larger networks (up to 100 millions of pores), parallel computing is employed for both generation of pore-networks as well as solving the flow for said networks. In the latter, we rely on existing MPI-based parallel solvers from the PETSc \cite{petsc-web-page} toolkit. See Appendix ~\ref{appendix:install} for installation details. To study the flow properties of large void-space geometries found in porous media such as f.ex. soil or gravel, \cite{MEYER2021103936} describes and implements routines for the generation \& simulation of flow networks in the Python library \emph{netflow} \cite{MEYER2021101592}. Based on a relatively small base network acquired via tomographic scans, the generated flow network is of intermediate size (millions of pores). 
To extend this procedure to even larger networks (up to 100 millions of pores), parallel computing is employed for both generation of pore-networks as well as solving the flow for said networks. In the latter, we rely on existing MPI-based parallel solvers from the PETSc \cite{petsc-web-page} toolkit. See Appendix ~\ref{appendix:install} for installation details. \vspace{5ex} \begin{multicols}{2} ... ... @@ -82,12 +85,11 @@ \hspace{0.5cm}The existing serial dendrogram-based network generation algorithm, as presented in \cite{MEYER2021101592}, is now modified. Concretely, to allow connecting all pores that populate the larger, generated network domain in parallel, we rely on a shared memory approach via the \verb|multiprocessing| Python module. To accomplish this, we first shift our attention to \emph{cell lists}, which offer a direct application of the already existing \textbf{maximal throat length} parameter $L_m$ as a suitable \textbf{cell size}. \subsection{Cell Lists} \hspace{0.5cm}To speed up neighbor search for the \emph{stationary} pores, we have opted to use the well-known cell lists data structure instead of the triangulation based approach outlined in \cite{MEYER2021103936}. This choice is supported by the useful properties of cell lists for our purpose of generating a network of similar pore-arrangement. It is also favorable over the triangulation method as the cell lists are only initialized once, and pores that have already been processed by the algorithm can be removed efficiently. Primarily however, it allows a straight-forward application of parallel computing, by distributing the work needed to find the neighbors of individual pores evenly. See Figure ~\ref{fig:cell} for a visualization. 
\subsection{Multiprocessing} \hspace{0.5cm}Using the \verb|multiprocessing| module, we can initialize a \emph{shared memory} region in the form of a \verb|RawArray| to store the indices\footnote{RawArray only supports primitive data types, therefore indices are used to uniquely identify every pore.} of best matching neighbor pores for all throats of any given pore. This so-called pore-match table or array is filled, massively in parallel by the individual threads, without any need for synchronization, as each process only operates on its dedicated part. This has excellent performance implications and is discussed in more detail in section 4. A \textbf{single thread} then iterates over its assigned pores, and \textbf{among all neighboring pores} contained within adjacent cells of the current pore, finds for each throat, carried over from the base network, the closest match. Ignored as possible match candidates are, trivially, the pore itself, any pore \emph{already} connected to it and pores farther away than the \emph{maximal throat length} $L_m$. It is also ensured that the matched pores are unique\footnote{There is a minor edge-case here that applies only if the generated network is of the \textbf{same size} as the original in any one direction. The issue is that both the original pore as well as one of its periodic copies are part of the same neighborhood. However, this is handled accordingly.}. If at any point no possible matches are found, then the corresponding entry in the pore-match table is marked as \emph{invalid}. \end{multicols} \newpage \begin{figure}[ht] \vspace{-0.5cm} \begin{figure}[b!] \centering \begin{tikzpicture} \draw[black] (-3,-3) grid (3,3); ... ... @@ -208,95 +210,120 @@ \caption{Cell lists visualized in 2D. The neighborhood of the pore highlighted in red is marked in blue. Because the cell-size is the maximally permissible throat length $L_m$, only the pores contained within the blue region must be considered during neighbor search. 
Finally, the periodic buffer layers, containing copies of pores on the opposite side from the interior, are painted in orange.} \label{fig:cell} \end{figure} \newpage \begin{multicols}{2} \subsection{Iterative Algorithm} \hspace{0.5cm}To connect the generated pores by throats, we employ an \textbf{iterative} strategy. Beforehand however, the cell list is constructed based on the extent of the full domain (including periodic buffer layer) and cell-size $L_m$. Next, all pores are placed in their respective cell computed from their position. Now, for each pore and for each of its throats, which are copied from the base network and are sought to be realized, we find an \emph{ideal} match from its \textbf{neighboring cells}. Here, ideal refers to minimal absolute difference between physical distance of the pores and original length of the current throat $L_t$. Given position of $i$-th pore $\mathbf{p}_i$ and original throat length $L_t$ we seek: \begin{equation} j^* = \argmin_{j \in \mathcal{N}(i)}\Bigr| ||\mathbf{p}_i - \mathbf{p}_j||_2 - L_t \Bigr| \end{equation} Where $\mathcal{N}(i)$ is the set of all pores in adjacent cells to pore $i$. Finding $j^*$ for different pores is \emph{embarrassingly parallel} and can therefore be computed by a large number of threads, storing their results in shared memory. Each thread works on an even chunk of the pores located \emph{inside} the domain. Subsequently, these ideal matches are connected, while avoiding \textbf{conflicts} of pores seeking either a neighbor that is already fully-connected or that was previously connected to them. Finally we repeat the two steps from above, now only considering pores that still have throats left in \emph{random} order (for improved \textbf{load balancing}), until \textbf{no more connections} can be found. Empirically, this process converges very fast and consistently, for different target sizes of the full domain, after merely 7-8 iterations. 
The first iteration alone leaves only $\approx 18\%$ of all possible throats left, see Table ~\ref{table:iter}. Final iterations may be performed serially if only few pores remain as to avoid costly spawning of threads without speed gain. The procedure is summarized as pseudo-code in Algorithm ~\ref{alg:connect}. Where $\mathcal{N}(i)$ is the set of all pores in adjacent cells to pore $i$. Finding $j^*$ for different pores is \emph{embarrassingly parallel} and can therefore be computed by a large number of threads, storing their results in shared memory. Each thread works on an even chunk of the pores located \emph{inside} the domain. Subsequently, these ideal matches are connected by throats, while avoiding \textbf{conflicts} of pores seeking either a neighbor that is already fully-connected or that was previously connected to them. Finally we repeat the two steps from above, now only considering pores that still have throats left in \emph{random} order (for improved \textbf{load balancing}), until \textbf{no more valid matches} can be found. Empirically, this process converges very fast and consistently, for different target sizes of the full domain, after merely 7-8 iterations. The first iteration alone leaves only $\approx 18\%$ of all possible throats left, see Table ~\ref{table:iter}. Final iterations may be performed serially if only few pores remain as to avoid costly spawning of threads without speed gain. The procedure is summarized as pseudo-code in Algorithm ~\ref{alg:connect}. \end{multicols} \begin{algorithm} \begin{table}[b] \vspace{-0.5cm} \centering \caption{Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.} \begin{tabular}{|c|c|c|} \hline \textbf{Iteration} & \textbf{Throats left} & \textbf{Rel. 
percentage} \\ \hline 0 & 103464 & 100.0\% \\ 1 & 18690 & 18.1\% \\ 2 & 4329 & 4.2\% \\ 3 & 1096 & 1.1\% \\ 4 & 282 & 0.3\% \\ 5 & 73 & 0.1\% \\ 6 & 30 & 0.03\% \\ 7 & 27 & 0.026\% \\ \hline \end{tabular} \label{table:iter} \end{table} \newpage \begin{algorithm}[ht] \caption{Connect pores in parallel} \begin{algorithmic} \State Initialize $cellList$ using $pores$ and compute $totalThroats$ \State $poresRemain \gets pores$ \State $throatsLeft \gets totalThroats$ \State $throatsUnrealized \gets totalThroats$ \While{$throatsUnrealized > 0$} \While{$throatsLeft > 0$} \State Spawn $nthreads$ threads \State Compute best matches for all pores in $poresRemain$ using threads \State Store result in shared memory location $poreMatchTable$ \For{$pore$ \textbf{in} $poresRemain$} \For{$throat$ \textbf{in} $pore.throats$} \State Fetch $match$ pore from $poreMatchTable$ \If{$throat$ is not already realized \textbf{and} $match$ is found} \If{$throat$ is not already realized \textbf{and} $match$ is valid} \State Realize $throat$ \EndIf \EndFor \EndFor \State Compute list of pores with throats left in random order: $nextPores$ \State $poresRemain \gets nextPores$ \State Count throats that are still left: $nextThroats$ \State Count remaining $throats$ having valid matches: $nextThroats$ \State $throatsLeft \gets nextThroats$ \State Count $throats$ that have been successfully realized: $realizedThroats$ \State $throatsUnrealized \gets totalThroats - realizedThroats$ \EndWhile \end{algorithmic} \label{alg:connect} \end{algorithm} \begin{table} \vspace{-0.5cm} \centering \caption{Sample run of parallel pore-connecting algorithm using 4 threads. The generated network is 3 times as large as the base network in all directions. The maximal feasible number of throats is 103464, of which 27 were not realized due to there being no possible candidates left for these remaining pores.} \begin{tabular}{|c|c|c|} \hline \textbf{Iteration} & \textbf{Throats left} & \textbf{Rel. 
percentage} \\ \hline 0 & 103464 & 100.0\% \\ 1 & 18690 & 18.1\% \\ 2 & 4329 & 4.2\% \\ 3 & 1096 & 1.1\% \\ 4 & 282 & 0.3\% \\ 5 & 73 & 0.1\% \\ 6 & 30 & 0.03\% \\ 7 & 27 & 0.026\% \\ \hline \end{tabular} \label{table:iter} \end{table} \begin{multicols}{2} \subsection{Results} \hspace{0.5cm}Lastly, we discuss the quality of network realizations generated by our method. The immediate advantage of this iterative algorithm is that it connects as many pores as it can, achieving a much \textbf{larger fraction of realized throats}. \end{multicols} \newpage \begin{multicols}{2} \section{Performance Analysis} \hspace{0.5cm}In the following we discuss the results of our parallel algorithm and compare it to the existing serial implementation from \cite{MEYER2021101592}. Particularly, we are interested in the \textbf{scaling} of our method with respect to the number of processes and the difference in \textbf{run-time} between the two implementations for varying target sizes of the generated domain. All performance measurements were conducted on \emph{Euler} and the specific type of node used is mentioned together with the results. \hspace{0.5cm}In the following we discuss the results of our parallel pore-connecting algorithm and compare it to the existing serial implementation from \cite{MEYER2021101592}. Particularly, we are interested in the \textbf{scaling} of our method with respect to the number of processes and the difference in \textbf{run-time} between the two implementations for varying target sizes of the generated domain. All performance measurements were conducted on \emph{Euler} and the specific type of node used is mentioned together with the results. The last sub-section is then dedicated to the distributed flow solver, comparing it to its serial counterpart for increasing numbers of pores. \subsection{Strong Scaling} \hspace{0.5cm}Firstly, we investigate the \textbf{strong scaling} as shown in Figure ~\ref{fig:strong}. 
We observe that our method scales fairly well for an increasing number of processes, as the measurements lie reasonably close to the \emph{ideal} case. As expected, the speedup is not perfect, since the network generation function is not fully parallelized. In particular, the distribution of pores using dendrogram-based clustering is still serial. Additionally, connecting the individual pores cannot be parallelized in a meaningful way, as it involves substantial complexity concerning synchronization. It is also significantly faster and computationally cheaper than the parallel match finding. Despite all this, because these serial parts only account for a small fraction of the total computation time, we still observe a large \textbf{speedup} of up to a factor $20$. \hspace{0.5cm}Firstly, we investigate the \textbf{strong scaling} as shown in Figure ~\ref{fig:strong}. We observe that our method scales fairly well for an increasing number of processes, as the measurements lie reasonably close to the \emph{ideal} case. As expected, the speedup is not perfect, since the network generation function is not fully parallelized. In particular, the distribution of pores using dendrogram-based clustering is still serial. Additionally, connecting the individual pores cannot be parallelized in a useful way, as it involves substantial complexity concerning synchronization. It is also significantly faster and computationally cheaper than the parallel match finding. Despite all this, because these serial parts only account for a small fraction of the total computation time, we still observe a large \textbf{speedup} of up to a factor $20$. \subsection{Comparison} \hspace{0.5cm}Secondly, we compare the performance of the concurrent algorithm with that of the serial version. To do this, we consider increasing target sizes, starting from the original size of the network. Next, we generate networks that are twice as large in the first, then also the second spatial direction etc. 
We continue this process of doubling individual dimension sizes until we reach a network that is 4 times as large in every direction, or in total 64 times as large as the original base network. The results, which are depicted in Figure ~\ref{fig:comp}, indicate a drastic improvement in terms of computation time over the original implementation. \end{multicols} \begin{figure}[ht!] \vspace{-0.5cm} \centering \includegraphics[width=0.8\textwidth]{plots/strong_scaling.png} \caption{For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network in total, we compute the \emph{speedup} for $1, 2, 4, 8, 16$ and $32$ threads. The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.} \begin{subfigure}[t]{0.45\textwidth} \centering \includegraphics[width=\textwidth]{plots/strong_scaling.png} \caption{For a fixed problem size where the network to be generated is $3^3 = 27$ times as large as the base network, we compute the \emph{speedup} for $1, 2, 4, 8, 16$ and $32$ threads. The time measurements of the parallel generation algorithm were conducted on an Euler IV node equipped with Intel Xeon Gold 6150 CPUs and $36$ cores in total.} \label{fig:strong} \includegraphics[width=0.8\textwidth]{plots/comparison_time.png} \caption{Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node (same as above). The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. The parallel version is consistently between $10$ and $20$ times faster.} \end{subfigure} \hfill \begin{subfigure}[t]{0.45\textwidth} \centering \includegraphics[width=\textwidth]{plots/comparison_time.png} \caption{Run-times of both serial and parallel algorithms in seconds, as measured on Euler IV node. 
Measurements are only performed once, since the impact of noise is negligible for sufficiently large networks. The generated size on the $x$-axis refers to the number of times the base network fits inside the generated network. The parallel version is consistently between $10$ and $20$ times faster.} \label{fig:comp} \end{subfigure} \end{figure} \newpage \begin{multicols}{2} \subsection{Flow Simulation} \hspace{0.5cm}In the next step we evaluate the performance of our distributed flow solver as introduced in Section 2. \end{multicols} \newpage \begin{multicols}{2} \section{Discussion} \hspace{0.5cm}In this work we have achieved large performance improvements over the previous, serial implementations from \cite{MEYER2021101592}, especially regarding the generation algorithm. This allows us to compute much larger network realizations of identical quality and simulate the flow through them within a reasonable time-frame of a few hours. \section{Acknowledgments} \hspace{0.5cm}The author gratefully acknowledges the various discussions held with supervisor Daniel W. Meyer, who has guided the course of this thesis and offered helpful insight during development of the code. \end{multicols} \newpage \appendix \begin{center} \Large{\textbf{Appendix}} \end{center} \section{PETSc Installation} ... ...
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!