diff --git a/.github/workflows/latex.yml b/.github/workflows/latex.yml
index 35b6f07..2c21e7d 100644
--- a/.github/workflows/latex.yml
+++ b/.github/workflows/latex.yml
@@ -71,21 +71,23 @@ jobs:
run: |
bash ./a.cli setup
-# - name: Update BibTeX references
-# run: |
-# bash ./a.cli update-bibtex
-#
-# - name: Commit new references.bib
-# run: |
-# git config --local user.email "${{ github.actor }}@users.noreply.github.com"
-# git config --local user.name "${{ github.actor }}"
-# git add references.bib
-# if ! git diff --cached --quiet; then
-# git commit -m "Update references.bib from Zotero"
-# git push
-# else
-# echo "No changes in references.bib to commit."
-# fi
+ - name: Update BibTeX references
+ if: ${{ github.ref != 'refs/heads/main' }}
+ run: |
+ bash ./a.cli update-bibtex
+
+ - name: Commit new references.bib
+ if: ${{ github.ref != 'refs/heads/main' }}
+ run: |
+ git config --local user.email "${{ github.actor }}@users.noreply.github.com"
+ git config --local user.name "${{ github.actor }}"
+ git add references.bib
+ if ! git diff --cached --quiet; then
+ git commit -m "Update references.bib from Zotero"
+ git push
+ else
+ echo "No changes in references.bib to commit."
+ fi
- name: Compile LaTeX document
uses: xu-cheng/latex-action@v3
diff --git a/.gitignore b/.gitignore
index e154f3a..dd2b133 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,7 +6,7 @@ _minted*
**/*.out
**/*.pyg
**/*.fls
-**/*.synctex.gz
+**/*.synctex*
main.pdf
*.toc
article.template.pdf
@@ -23,5 +23,5 @@ exa-ma-d7.1.pdf
*.chl
*.lof
*.lot
-*.pdf
-.venv
\ No newline at end of file
+.venv
+.DS_Store
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 2052ecc..6ba4008 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -38,8 +38,10 @@
]
}
],
- "latex-workshop.latex.autoBuild.run": "onFileChange",
+ "latex-workshop.latex.autoBuild.run": "onSave",
"latex-workshop.latex.autoBuild.enabled": true,
"latex-workshop.latex.build.showOutput": "always",
- "latex-workshop.latex.outDir": "%DIR%"
+ "latex-workshop.latex.outDir": "%DIR%",
+ "latex-workshop.message.badbox.show": "none",
+ "workbench.editor.pinnedTabsOnSeparateRow": true
}
\ No newline at end of file
diff --git a/README.adoc b/README.adoc
index a79ec49..7061d6b 100644
--- a/README.adoc
+++ b/README.adoc
@@ -1,5 +1,5 @@
= Exa-MA D7.1 Report Automation
-
+:experimental: true
//.Zenodo DOI
//--
//image::https://zenodo.org/badge/DOI/10.5281/zenodo.13341126.svg[DOI, link=https://doi.org/10.5281/zenodo.13341126]
@@ -33,15 +33,60 @@ NOTE: `a.cli` is a shell script to setup the repo, create release and update bib
.Command line to compile fully the deliverable
[source, shell]
----
-latexmk -shell-escape -pdf -file-line-error -halt-on-error -interaction=nonstopmode exa-ma-d7.1.tex
+latexmk --shell-escape -pdf -file-line-error -halt-on-error -interaction=nonstopmode exa-ma-d7.1.tex
----
.Simple command line to compile once
[source, shell]
----
-pdflatex -shell-escape exa-ma-d7.1.tex
+pdflatex --shell-escape exa-ma-d7.1.tex
----
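+
+The build artifacts produced by these commands can be removed with the `clean` command provided by `a.cli` (a minimal usage sketch; it deletes the aux, log, synctex and other generated files, including the compiled PDF, in the current directory):
+
+.Command line to clean the LaTeX build files
+[source, shell]
+----
+bash ./a.cli clean
+----
+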
+== Zotero
+
+=== Update Zotero references
+
+To update the references (`references.bib`), you have several options:
+
+Download from Zotero:: download the file `references.bib` by exporting the exa-ma library from the Zotero GUI
+Using `a.cli`:: set the `ZOTERO_API_KEY` environment variable and run `a.cli update-bibtex` (the underlying API call is sketched after this list)
++
+[,console]
+----
+export ZOTERO_API_KEY=xxx
+bash ./a.cli update-bibtex
+----
++
+To create the key, go to the https://www.zotero.org/settings/security[Zotero Security page] (and log in).
++
+Then go to the **Applications** section at the bottom of the **Security** page:
++
+image:graphics/zotero/zotero-applications.png[]
++
+Click btn:[Create new private key], then select the group permission **Read only**, and click btn:[Save Key].
++
+image:graphics/zotero/zotero-newkey.png[]
++
+Then keep the key in a safe place and use it to define `ZOTERO_API_KEY`.
+
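+Under the hood, `a.cli update-bibtex` pages through the Zotero web API with `curl`, 100 items at a time, and appends each page to the BibTeX file. A minimal sketch of a single page request for the group library (the script falls back to the user library when `ZOTERO_GROUP_ID` is unset):
+
+[source, shell]
+----
+# fetch the first 100 items of the group library as BibTeX
+curl -s -H "Zotero-API-Key: $ZOTERO_API_KEY" \
+  "https://api.zotero.org/groups/$ZOTERO_GROUP_ID/items?format=bibtex&start=0&limit=100"
+----
+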
+=== How to set citation keys
+
+[.right]
+image:graphics/zotero/zotero-citation-keys.png[]
+
+
+To set your own citation key, simply enter the following line in Zotero's **Extra** field:
+[source,text]
+----
+Citation Key: saigre_coupled_2024_paper
+----
+
+When the bibliography is exported, this key will be used for the reference.
+
+NOTE: if you set citation keys yourself, you also have to handle any conflicts between keys yourself.
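+
+A quick way to spot duplicate citation keys after an export is to list the keys in `references.bib` and print any that occur more than once (a minimal sketch, to be run from the repository root):
+
+[source, shell]
+----
+grep -oE '^@[a-zA-Z]+[{][^,]+' references.bib | cut -d'{' -f2 | sort | uniq -d
+----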
+
+
== Workflow Description
The repository utilizes GitHub Actions to automate the following tasks:
diff --git a/a.cli b/a.cli
index eb7deb2..318e406 100644
--- a/a.cli
+++ b/a.cli
@@ -24,7 +24,7 @@ parse_args() {
--zotero-api-key) ZOTERO_API_KEY="$2"; shift ;;
--zotero-group-id) ZOTERO_GROUP_ID="$2"; shift ;;
--bibtex-file) BIBTEX_FILE="$2"; shift ;;
- --help)
+ --help)
echo "Usage: $0 [options] {setup|create|list|delete|update-bibtex} [version]"
echo "Options:"
echo " --zotero-user-id Zotero User ID"
@@ -109,6 +109,11 @@ list_releases() {
git tag --sort=-creatordate | head -n $1
}
+# Function to clean latex build files
+clean() {
+ # clean latex build files
+ rm -f *.aux *.bbl *.blg *.log *.out *.pyg *.fls *.synctex* *.toc *.fdb_latexmk *.fls *.idx *.ilg *.ind *.chl *.lof *.lot *.pdf
+}
# Function to delete a release
delete_release() {
VERSION=$1
@@ -146,16 +151,17 @@ update_bibtex() {
# Define the URL to fetch BibTeX
if [ -z "$ZOTERO_GROUP_ID" ]; then
- URL="https://api.zotero.org/users/$ZOTERO_USER_ID/items?format=bibtex"
+ URL="https://api.zotero.org/users/$ZOTERO_USER_ID/items?format=biblatex"
else
- URL="https://api.zotero.org/groups/$ZOTERO_GROUP_ID/items?format=bibtex"
+ URL="https://api.zotero.org/groups/$ZOTERO_GROUP_ID/items?format=biblatex"
fi
start=0
limit=100
+ has_more=true
echo "" > "$BIBTEX_FILE"
while $has_more; do
response=$(curl -s -H "Zotero-API-Key: $ZOTERO_API_KEY" "$URL&start=$start&limit=$limit")
-
+
if [ -z "$response" ]; then
echo "No more items to fetch."
has_more=false
@@ -176,6 +182,69 @@ update_bibtex() {
echo "BibTeX entries updated successfully in $BIBTEX_FILE."
}
+update_bibtex2() {
+ if [ -z "$ZOTERO_GROUP_ID" ]; then
+ if [ -z "$ZOTERO_USER_ID" ]; then
+ echo "Zotero group ID and user ID are not set, one of them is required."
+ echo "If Group ID is set, User ID is not required."
+ exit 1
+ fi
+ fi
+
+ if [ -z "$ZOTERO_API_KEY" ]; then
+ echo "Zotero API key is not set."
+ exit 1
+ fi
+
+ echo "Fetching BibTeX entries from Zotero..."
+
+ # Define the URL to fetch BibTeX
+ if [ -z "$ZOTERO_GROUP_ID" ]; then
+ URL="https://api.zotero.org/users/$ZOTERO_USER_ID/items?format=bibtex"
+ else
+ URL="https://api.zotero.org/groups/$ZOTERO_GROUP_ID/items?format=bibtex"
+ fi
+
+ start=0
+ limit=100
+ has_more=true # Initialize the loop control variable
+ echo "" > "$BIBTEX_FILE"
+
+ while $has_more; do
+ # Send the request
+ response=$(curl -s -w "%{http_code}" -H "Zotero-API-Key: $ZOTERO_API_KEY" "$URL&start=$start&limit=$limit")
+
+ # Separate the response content and the status code
+ http_code=$(echo "$response" | tail -n1)
+ content=$(echo "$response" | sed '$d') # Everything except the last line (status code)
+
+ # Check if the request was successful (status code 200)
+ if [ "$http_code" -ne 200 ]; then
+ echo "Error fetching data: HTTP status $http_code"
+ exit 1
+ fi
+ if [ -z "$content" ]; then
+ echo "No more items to fetch."
+ has_more=false
+ break
+ fi
+ # Append the content to the BibTeX file
+ echo "$content" >> "$BIBTEX_FILE"
+
+ # Check if there are more items by inspecting the Link header or counting the items
+ num_items=$(echo "$content" | grep -c "@") # Counting BibTeX entries
+ echo "downloaded $num_items items"
+ if [ "$num_items" = "0" ]; then
+ has_more=false
+ else
+ start=$((start + limit))
+ fi
+ echo "start=$start, has_more=$has_more"
+ done
+
+ echo "BibTeX entries updated successfully in $BIBTEX_FILE."
+}
+
# Main script logic
parse_args "$@"
@@ -183,6 +252,9 @@ case "$1" in
setup)
setup
;;
+ clean)
+ clean
+ ;;
create)
create_release "$2"
;;
@@ -193,7 +265,7 @@ case "$1" in
delete_release "$2"
;;
update-bibtex)
- update_bibtex
+ update_bibtex2
;;
*)
echo "Usage: $0 [options] {setup|create|list|delete|update-bibtex} [version]"
@@ -204,6 +276,7 @@ case "$1" in
echo " --bibtex-file Path to the BibTeX file (default: your_bibtex_file.bib)"
echo "Commands:"
echo " setup : Setup hooks for commit, checkout, and merge"
+ echo " clean : Clean latex build files"
echo " create : Create a new release with the provided version"
echo " list : List all existing releases"
echo " delete : Delete the release with the provided version"
diff --git a/chapters/00-index.tex b/chapters/00-index.tex
index ff8f570..cf552a7 100644
--- a/chapters/00-index.tex
+++ b/chapters/00-index.tex
@@ -1,24 +1,30 @@
\chapter{WP1 - Discretization}
+\label{chap:wp1}
\clearpage
\subimport{./WP1}{00-index}
\chapter{WP2 - Model order, Surrogate, Scientific Machine Learning methods}
+\label{chap:wp2}
\clearpage
\subimport{./WP2}{00-index}
\chapter{WP3 - Solvers}
+\label{chap:wp3}
\clearpage
\subimport{./WP3}{00-index}
\chapter{WP4 - Data assimilation}
+\label{chap:wp4}
\clearpage
\subimport{./WP4}{00-index}
\chapter{WP5 - Optimization}
+\label{chap:wp5}
\clearpage
\subimport{./WP5}{00-index}
\chapter{WP6 - Uncertainty quantification}
+\label{chap:wp6}
\clearpage
\subimport{./WP6}{00-index}
diff --git a/chapters/WP3/00-index.tex b/chapters/WP3/00-index.tex
index cc33c39..910b47a 100644
--- a/chapters/WP3/00-index.tex
+++ b/chapters/WP3/00-index.tex
@@ -1,4 +1,4 @@
-\input{software/arcane-framework/WP3/WP3.tex}
+%%\input{software/arcane-framework/WP3/WP3.tex}
\input{software/composyx/WP3/WP3.tex}
\input{software/feelpp/WP3/WP3.tex}
\input{software/freefempp/WP3/WP3.tex}
diff --git a/chapters/WP7/00-index.tex b/chapters/WP7/00-index.tex
index 72650e0..0c6168b 100644
--- a/chapters/WP7/00-index.tex
+++ b/chapters/WP7/00-index.tex
@@ -1,4 +1,4 @@
-\input{software/arcane-framework/WP7/WP7.tex}
+%\input{software/arcane-framework/WP7/WP7.tex}
\input{software/feelpp/WP7/WP7.tex}
\input{software/freefempp/WP7/WP7.tex}
\input{software/manta/WP7/WP7.tex}
diff --git a/chapters/benchmarking.tex b/chapters/benchmarking.tex
index b74dd23..140b454 100644
--- a/chapters/benchmarking.tex
+++ b/chapters/benchmarking.tex
@@ -402,7 +402,7 @@ \section{Conclusion}
We presented a phased benchmarking strategy that integrates testing, validation, profiling, and continuous integration to ensure that the software is ready for exascale environments. The testing processes, including non-regression, verification, and validation, emphasize maintaining performance integrity across updates and ensuring that new algorithmic improvements lead to meaningful progress.
-The benchmarking strategy involves comprehensive testing across \textbf{CPU}, \textbf{GPU}, and \textbf{hybrid} CPU-GPU architectures to ensure that software can scale efficiently. This includes separate tests for CPU and GPU performance, as well as hybrid configurations where workloads are distributed across both architectures.
+The benchmarking strategy involves testing across \textbf{CPU}, \textbf{GPU}, and \textbf{hybrid} CPU-GPU architectures to ensure that software can scale efficiently. This includes separate tests for CPU and GPU performance, as well as hybrid configurations where workloads are distributed across both architectures.
Data management and I/O strategies (\ac{B6}, \ac{B9}, \ac{B10}, \ac{B11}) were also highlighted to address I/O bottlenecks and ensure efficient handling of large datasets, leveraging fault-tolerant mechanisms for data integrity. As the project progresses, more advanced fault tolerance strategies may be integrated to ensure resilience against hardware or software failures.
diff --git a/chapters/software.tex b/chapters/software.tex
index c09d902..03548a6 100644
--- a/chapters/software.tex
+++ b/chapters/software.tex
@@ -1,6 +1,6 @@
\clearpage
\chapter{Software}
-\label{sec:software}
+\label{chap:software}
This chapter presents the software developed within Exa-MA, focusing on features, mathematics, functionalities, publications, acknowledgments, and contact details.
@@ -173,14 +173,14 @@ \subsection{DevOps - Testing}
\end{figure}
-\input{software/arcane-framework/arcane-framework.tex}
+%\input{software/arcane-framework/arcane-framework.tex}
\input{software/cgal/cgal.tex}
\input{software/composyx/composyx.tex}
\input{software/feelpp/feelpp.tex}
\input{software/freefempp/freefempp.tex}
\input{software/hawen/hawen.tex}
\input{software/hpddm/hpddm.tex}
-\input{software/mahyco/mahyco.tex}
+%%\input{software/mahyco/mahyco.tex}
\input{software/manta/manta.tex}
\input{software/pbb/pbb.tex}
\input{software/samurai/samurai.tex}
diff --git a/defs.tex b/defs.tex
index b093d67..1072b27 100644
--- a/defs.tex
+++ b/defs.tex
@@ -6,3 +6,7 @@
\def\exaatow{\textsc{Exa-Atow}\xspace}
\def\exadi{\textsc{Exa-DI}\xspace}
\def\numpex{NumPEx}
+\def\Rplus{\protect\hspace{-0em}\protect\raisebox{.35ex}{{\smaller\textbf{+}}}}
+\def\Cpp#1{\mbox{C\Rplus\Rplus#1}\xspace}
+\def\feelpp{\ac{feelpp}\xspace}
+\def\Feelpp{\mbox{Feel\Rplus\Rplus}\xspace}
\ No newline at end of file
diff --git a/exa-ma-d7.1.tex b/exa-ma-d7.1.tex
index bced6dc..c697563 100644
--- a/exa-ma-d7.1.tex
+++ b/exa-ma-d7.1.tex
@@ -4,24 +4,39 @@
\documentclass[11pt]{report}
-\usepackage{numpex} % numpex specific definitions and styles
+\usepackage{numpex} % numpex specific definitions and styles
\usepackage[utf8]{inputenc} % UTF8 package
\usepackage[T1]{fontenc}
\usepackage{textcomp} % common special chars
\usepackage{amsmath} % math formula
+\usepackage{empheq}
\usepackage{fancybox}
\usepackage{anyfontsize} % fonts
\usepackage{lipsum}
\usepackage{cleveref}
-\usepackage{standalone}
+\usepackage{standalone}
\usepackage[]{acronym}
\usepackage{import}
\usepackage{booktabs} % For nicer tables
-\usepackage{geometry}
-\usepackage{tabularx}
+\usepackage{geometry}
+\usepackage{tabularx,longtable}
+\usepackage{paralist} % For compactitem
+\usepackage{multirow} % For multirow in tables
\usepackage{pgf-pie} % For pie charts
\usepackage{tikz} % Required for drawing graphics
-\usepackage{xspace,float} % Required for controlling the position of objects
+\usepackage{xspace,float,relsize} % Required for controlling the position of objects
+\usepackage{minted}
+\usepackage{siunitx} % Required for typesetting units
+\DeclareSIUnit\mmHg{mmHg}
+\usepackage{currfile} % Required for getting the current file name
+\usepackage{pgfplots}
+\usepackage{pgfplotstable}
+\AtBeginEnvironment{tikzpicture}{\tracinglostchars=0\relax}
+\pgfplotsset{compat=newest}
+\usetikzlibrary{positioning,fit,matrix}
+
+\definecolor{bgcolor}{gray}{0.95}
\definecolor{lightblue}{RGB}{173,216,230}
\definecolor{darkgreen}{RGB}{0,100,0}
\definecolor{pink}{RGB}{255,192,203}
@@ -30,18 +45,34 @@
\definecolor{peach}{RGB}{255, 229, 180}
\definecolor{lavender}{RGB}{230, 190, 255}
+\definecolor{customdarkblue}{HTML}{0C2472}
+\definecolor{customcyan}{HTML}{00BFFF}
+\definecolor{customorange}{HTML}{F1892C}
+\definecolor{customyellow}{HTML}{FFD700}
+\definecolor{custompurple}{HTML}{800080}
+\definecolor{customgreen}{HTML}{00FF00}
+
\definecolor{CustomBlue}{rgb}{0.25, 0.41, 0.88} % RoyalBlue
\hypersetup{
pdftitle={Benchmarking analysis report},
- pdfauthor={[Names of co-authors (partners short names)]},
+    pdfauthor={[Christophe Prud'homme (UNISTRA), Pierre Alliez (INRIA), Vincent Chabannes (UNISTRA), Rudy Chocat (CEA), Emmanuel Franck (INRIA), Vincent Faucher (CEA), Florian Faucher (INRIA), Clément Gauchy (CEA), Christos Georgiadis (INRIA), Luc Giraud (INRIA), Frédéric Hecht (SU), Guillaume Helbecque (U Luxembourg), Pierre Jolivet (CNRS), Olivier Jamond (CEA), Pierre Ledac (CEA), Nouredine Melab (U. Lille), Victor Michel-Dansac (INRIA), Frédéric Nataf (SU), Lucas Palazzolo (INRIA), Yannick Privat (UL), Thomas Saigre-Tardif (UNISTRA), El-Ghazali Talbi (U Lille), Pierre Henri Tournier (SU), Christophe Trophime (CNRS), Céline Van Landeghem (UNISTRA), Raphael Zanella (SU)
+ ]},
pdfkeywords={HPC, Exascale, Benchmarking},
- bookmarksnumbered,bookmarksopen,linktocpage,
+ bookmarksnumbered,linktocpage,
colorlinks=true,
citecolor=CustomBlue,
linkcolor=CustomBlue,
urlcolor=blue
}
+\makeatletter
+% Deal with stacked plots: reset pgfplots' stacked-plot state between plots
+\newcommand\resetstackedplots{%
+  \pgfplots@stacked@isfirstplottrue
+}
+\makeatother
+
\IfFileExists{.git/gitHeadInfo.gin}{
\usepackage[pcount,grumpy,mark,markifdirty]{gitinfo2}
}{%
@@ -64,7 +95,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% URL style same as regular text
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-
+
\urlstyle{same}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -92,20 +123,20 @@
\delivShortTile{BAR}
% Deliverable Responsible Partner
-\delivResponsible{UNISTRA}
+\delivResponsible{UNISTRA}
% Deliverable Version, Contractual and Actual Date, Dissemination Level, Type
-\delivVersion{v0.0.6}
-\ContractualDate{15/10/2024}
+\delivVersion{v1.1.0}
+\ContractualDate{22/10/2024}
\ActualDate{\today}
\delivDissLevel{PU} % PU, PP, RE, CO
\delivType{Report}
% List of Main Authors (usually from the responsible partner)
-\delivAuthor{[Names of co-authors (partners short names)]}
+\delivAuthor{[Christophe Prud'homme (UNISTRA)]}
% List of Co-Authors (all other co-authors should be listed here)
-\delivFPAuthor{[Names of co-authors (partners short names)]}
+\delivFPAuthor{[Pierre Alliez (INRIA), Vincent Chabannes (UNISTRA), Rudy Chocat (CEA), Emmanuel Franck (INRIA), Vincent Faucher (CEA), Florian Faucher (INRIA), Clément Gauchy (CEA), Christos Georgiadis (INRIA), Luc Giraud (INRIA), Frédéric Hecht (SU), Guillaume Helbecque (U Luxembourg), Pierre Jolivet (CNRS), Olivier Jamond (CEA), Pierre Ledac (CEA), Nouredine Melab (U. Lille), Victor Michel-Dansac (INRIA), Frédéric Nataf (SU), Lucas Palazzolo (INRIA), Yannick Privat (UL), Thomas Saigre-Tardif (UNISTRA), El-Ghazali Talbi (U Lille), Pierre Henri Tournier (SU), Christophe Trophime (CNRS), Céline Van Landeghem (UNISTRA), Raphael Zanella (SU)]}
% Provision of Keywords (about 5-10)
\delivKeywords{HPC, Exascale, Benchmarking, Software}
@@ -117,13 +148,17 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Change Log
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+\istChange{22/10/2024}{v1.1.0}{Josselin Garnier (EP), Loic Gouarin (EP), Florian Faucher (INRIA), Vincent Faucher (CEA), Pierre Jolivet (CNRS), Prud'homme Christophe (UNISTRA), Raphael Zanella (SU)}{fixed undefined references and citations; fixed École Polytechnique logo; removed last boilerplate text in Trust, Manta and Hawen}
+\istChange{15/10/2024}{v1.0.0 (submitted to ANR)}{\href{https://github.com/numpex/exa-ma-d7.1/graphs/contributors}{26 Contributors}}{Finalized contributions and reviews}
+\istChange{11/10/2024}{v0.2.0}{\href{https://github.com/numpex/exa-ma-d7.1/graphs/contributors}{+14 Contributors}}{Initial contributions}
+\istChange{30/09/2024}{v0.1.0}{Prud'homme Christophe (UNISTRA)}{setup architecture of D7.1, update profiling tools in ToC, update methodology chapter, add information stored in the Excel sheet to the report, update benchmark methodology}
\istChange{27/09/2024}{v0.0.6}{Prud'homme Christophe (UNISTRA)}{In the methodology chapter, link the deliverable to the bottlenecks identified in Exa-MA scientific document. Add resilience stats and methdology, benchmark mmg and parmmg indirectly.}
\istChange{26/09/2024}{v0.0.6}{Pierre Jolivet (CNRS)}{Review of the document}
\istChange{16/09/2024}{v0.0.5}{Prud'homme Christophe (UNISTRA)}{ToC: add list of computer science features per software, add list of math features per workpackage, add statistics about Exa-MA software in chapter Software}
\istChange{02/09/2024}{v0.0.4}{Prud'homme Christophe (UNISTRA)}{ToC: add profiling tools in methodology chapter and udated the chapter overall}
\istChange{30/08/2024}{v0.0.3}{Prud'homme Christophe (UNISTRA)}{ToC: setup architecture of D7.1;add benchmarking methodology chapter;add samurai software to be benchmarked}
\istChange{20/08/2024}{v0.0.1}{Prud'homme Christophe (UNISTRA)}{Draft report template}
-\istChange{}{}{}{}
+%%\istChange{}{}{}{}
\begin{document}
@@ -132,7 +167,7 @@
%%% Cover Page
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\makecover
+\makecover%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Table of Contents
@@ -140,7 +175,7 @@
\clearpage
\fancypagestyle{plain}{}
-\settableofcontents
+\settableofcontents%
\tableofcontents
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -148,7 +183,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
-\setlistoffigures
+\setlistoffigures%
\listoffigures
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -156,7 +191,7 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
-\setlistoftables
+\setlistoftables%
\listoftables
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -169,7 +204,8 @@
%%% Deliverable Content
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\chapter[]{Introductions}
+\chapter{Introduction}
+\label{chap:introduction}
\input{sections/summary}
\input{sections/introduction}
\import{chapters/}{benchmarking}
@@ -177,6 +213,7 @@
\import{chapters/}{00-index}
\chapter{Conclusions}
+\label{chap:conclusions}
\input{sections/conclusions}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -194,7 +231,7 @@ \chapter{Conclusions}
\appendix
\input{sections/appendix-a}
-\input{sections/appendix-b}
+%%\input{sections/appendix-b}
@@ -202,8 +239,8 @@ \chapter{Conclusions}
%%% Back Page
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\makedisclaimer
+\makedisclaimer%
\end{document}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/gitHeadLocal.gin b/gitHeadLocal.gin
index 70a2f4f..416f401 100644
--- a/gitHeadLocal.gin
+++ b/gitHeadLocal.gin
@@ -1,17 +1,17 @@
\usepackage[%
- shash={b26bd7c},
- lhash={b26bd7c5389f87390796c7310db5829e332eb9b6},
+ shash={ffcd14b},
+ lhash={ffcd14b4feb86148df32b483d2a3c49441e6442f},
authname={Christophe Prud'homme},
authemail={christophe.prudhomme@cemosis.fr},
- authsdate={2024-09-27},
- authidate={2024-09-27 13:21:07 +0200},
- authudate={1727436067},
+ authsdate={2024-10-16},
+ authidate={2024-10-16 19:35:35 +0200},
+ authudate={1729100135},
commname={Christophe Prud'homme},
commemail={christophe.prudhomme@cemosis.fr},
- commsdate={2024-09-27},
- commidate={2024-09-27 13:21:07 +0200},
- commudate={1727436067},
- refnames={ (HEAD -> main, tag: v0.1.0, origin/main, origin/HEAD)},
- firsttagdescribe={v0.1.0},
- reltag={v0.1.0-0-gb26bd7c}
+ commsdate={2024-10-16},
+ commidate={2024-10-16 19:35:35 +0200},
+ commudate={1729100135},
+ refnames={ (HEAD -> main, tag: v1.1.0-preview.1, origin/main, origin/HEAD)},
+ firsttagdescribe={v1.1.0-preview.1},
+ reltag={v1.1.0-preview.1-0-gffcd14b}
]{gitexinfo}
\ No newline at end of file
diff --git a/graphics/cgal/fan.png b/graphics/cgal/fan.png
new file mode 100644
index 0000000..4feb471
Binary files /dev/null and b/graphics/cgal/fan.png differ
diff --git a/graphics/cgal/refinement_speedup.png b/graphics/cgal/refinement_speedup.png
new file mode 100644
index 0000000..b1b8a97
Binary files /dev/null and b/graphics/cgal/refinement_speedup.png differ
diff --git a/graphics/composyx/composyx-solverstack.png b/graphics/composyx/composyx-solverstack.png
new file mode 100644
index 0000000..9086f6a
Binary files /dev/null and b/graphics/composyx/composyx-solverstack.png differ
diff --git a/graphics/composyx/composyx-solverstack.svg b/graphics/composyx/composyx-solverstack.svg
new file mode 100644
index 0000000..ab904a9
--- /dev/null
+++ b/graphics/composyx/composyx-solverstack.svg
@@ -0,0 +1,172 @@
+
+
+
+
+
diff --git a/graphics/numpex-cover-bkg.pdf b/graphics/exama-cover-bkg.pdf
similarity index 67%
rename from graphics/numpex-cover-bkg.pdf
rename to graphics/exama-cover-bkg.pdf
index 0eb2215..b3a8fd5 100644
Binary files a/graphics/numpex-cover-bkg.pdf and b/graphics/exama-cover-bkg.pdf differ
diff --git a/graphics/exama-disclaimer.pdf b/graphics/exama-disclaimer.pdf
new file mode 100644
index 0000000..9292be7
Binary files /dev/null and b/graphics/exama-disclaimer.pdf differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-current_density.png b/graphics/feelpp/feelpp-benchmark-HL-31-current_density.png
new file mode 100644
index 0000000..e8b5e16
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-current_density.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-geo-zoom.png b/graphics/feelpp/feelpp-benchmark-HL-31-geo-zoom.png
new file mode 100644
index 0000000..a575fb1
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-geo-zoom.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-geo.png b/graphics/feelpp/feelpp-benchmark-HL-31-geo.png
new file mode 100644
index 0000000..5c091d9
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-geo.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-potential_density_streamines.png b/graphics/feelpp/feelpp-benchmark-HL-31-potential_density_streamines.png
new file mode 100644
index 0000000..13c0297
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-potential_density_streamines.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-temperature-streamlines.png b/graphics/feelpp/feelpp-benchmark-HL-31-temperature-streamlines.png
new file mode 100644
index 0000000..ea59082
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-temperature-streamlines.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-HL-31-temperature.png b/graphics/feelpp/feelpp-benchmark-HL-31-temperature.png
new file mode 100644
index 0000000..6d94912
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-HL-31-temperature.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-contact-temperature.png b/graphics/feelpp/feelpp-benchmark-contact-temperature.png
new file mode 100644
index 0000000..1c873c8
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-contact-temperature.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-flowprone.png b/graphics/feelpp/feelpp-benchmark-heatfluid-flowprone.png
new file mode 100644
index 0000000..865fd75
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-heatfluid-flowprone.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-flowstanding.png b/graphics/feelpp/feelpp-benchmark-heatfluid-flowstanding.png
new file mode 100644
index 0000000..43db44d
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-heatfluid-flowstanding.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-flowsupine.png b/graphics/feelpp/feelpp-benchmark-heatfluid-flowsupine.png
new file mode 100644
index 0000000..c316203
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-heatfluid-flowsupine.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-prone.tex b/graphics/feelpp/feelpp-benchmark-heatfluid-prone.tex
new file mode 100644
index 0000000..80d6994
--- /dev/null
+++ b/graphics/feelpp/feelpp-benchmark-heatfluid-prone.tex
@@ -0,0 +1,73 @@
+\begin{tikzpicture}
+ \begin{axis}[
+ colorbar,
+ colormap/jet, % Choose the colormap you prefer
+ % axis equal image,
+ enlargelimits=false,
+ colorbar horizontal,
+ point meta max=15.5,
+ point meta min=15.395625664,
+ axis line style = {draw=none},
+ tick style = {draw=none},
+ xtick = \empty, ytick = \empty,
+ colorbar style={
+ % xlabel style={
+ % at={(0.5,1.1)},
+ % anchor=south,
+ % },
+ xlabel = {$p$ [\si{\mmHg}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.02)},
+ anchor=center,
+ tick label style={font=\footnotesize},
+ },
+ colorbar/draw/.append code={
+ \begin{axis}[
+ colormap={Gray and Red}{
+ rgb255(-1cm)=(26,26,26);
+ rgb255(-0.87451cm)=(58,58,58);
+ rgb255(-0.74902cm)=(91,91,91);
+ rgb255(-0.623529cm)=(128,128,128);
+ rgb255(-0.498039cm)=(161,161,161);
+ rgb255(-0.372549cm)=(191,191,191);
+ rgb255(-0.247059cm)=(215,215,215);
+ rgb255(-0.121569cm)=(236,236,236);
+ rgb255(0.00392157cm)=(254,254,253);
+ rgb255(0.129412cm)=(253,231,218);
+ rgb255(0.254902cm)=(250,204,180);
+ rgb255(0.380392cm)=(244,170,136);
+ rgb255(0.505882cm)=(228,128,101);
+ rgb255(0.631373cm)=(208,84,71);
+ rgb255(0.756863cm)=(185,39,50);
+ rgb255(0.882353cm)=(147,14,38);
+ rgb255(1cm)=(103,0,31);
+ },
+ colorbar horizontal,
+ point meta min=0,
+ point meta max=0.0000062121127024231835,
+ every colorbar,
+ anchor=center,
+ colorbar shift,
+ colorbar=false,
+ xlabel = {$\vct{u}$ [\si{\meter\per\second}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.1*3.5*\pgfkeysvalueof{/pgfplots/parent axis height})},
+ tick label style={font=\footnotesize},
+ ]
+ \pgfkeysvalueof{/pgfplots/colorbar addplot}
+ \end{axis}
+ },
+ width=\subfigwidth
+ ]
+ \addplotgraphicsnatural[xmin=0, xmax=1, ymin=0, ymax=1]{graphics/feelpp/feelpp-benchmark-heatfluid-flowprone.png}
+
+ \draw[->] (0.2, 0.88) -- (0.1, 0.88) node[midway, anchor=south] {$\vct{g}$};
+
+ \draw[->, red] (0.1, 0.1) -- (0.2, 0.1) node[pos=1, anchor=north] {$x$};
+ \draw[->, green!60!black] (0.1, 0.1) -- (0.1, 0.2) node[pos=1, anchor=east] {$y$};
+ \draw[blue] (0.1, 0.1) node {$\odot$};
+ \draw[blue] (0.1, 0.1) node[anchor=north east] {$z$};
+ \end{axis}
+\end{tikzpicture}
\ No newline at end of file
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.png b/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.png
new file mode 100644
index 0000000..a6d6c20
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.tex b/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.tex
new file mode 100644
index 0000000..ea9fc47
--- /dev/null
+++ b/graphics/feelpp/feelpp-benchmark-heatfluid-resheat.tex
@@ -0,0 +1,31 @@
+\begin{tikzpicture}
+ \begin{axis}[
+ colorbar,
+ colormap/jet, % Choose the colormap you prefer
+ % axis equal image,
+ enlargelimits=false,
+ point meta max=310.14813232421875,
+ point meta min=308.0091857910156,
+ colorbar horizontal,
+ axis line style = {draw=none},
+ tick style = {draw=none},
+ xtick = \empty, ytick = \empty,
+ colorbar style={
+ xlabel = {$T$ [\si{\kelvin}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.02)}, % Adjust the position to center vertically
+ anchor=center, % Adjust the anchor point
+ },
+ width=0.6\textwidth
+ ]
+ \addplotgraphicsnatural[xmin=0, xmax=1, ymin=0, ymax=1]{graphics/feelpp/feelpp-benchmark-heatfluid-resheat.png}
+
+ \draw[->] (0.8, 0.9) -- (0.8, 0.8) node[midway, anchor=west] {$\vct{g}$};
+
+ \draw[->, red] (0.1, 0.1) -- (0.2, 0.1) node[pos=1, anchor=north] {$x$};
+ \draw[->, green!60!black] (0.1, 0.1) -- (0.1, 0.2) node[pos=1, anchor=east] {$y$};
+ \draw[blue] (0.1, 0.1) node {$\odot$};
+ \draw[blue] (0.1, 0.1) node[anchor=north east] {$z$};
+ \end{axis}
+\end{tikzpicture}
\ No newline at end of file
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-standing.tex b/graphics/feelpp/feelpp-benchmark-heatfluid-standing.tex
new file mode 100644
index 0000000..cd0dc49
--- /dev/null
+++ b/graphics/feelpp/feelpp-benchmark-heatfluid-standing.tex
@@ -0,0 +1,73 @@
+\begin{tikzpicture}
+ \begin{axis}[
+ colorbar,
+ colormap/jet, % Choose the colormap you prefer
+ % axis equal image,
+ enlargelimits=false,
+ colorbar horizontal,
+ point meta max=15.5,
+ point meta min=15.045621357,
+ axis line style = {draw=none},
+ tick style = {draw=none},
+ xtick = \empty, ytick = \empty,
+ colorbar style={
+ % xlabel style={
+ % at={(0.5,1.1)},
+ % anchor=south,
+ % },
+ xlabel = {$p$ [\si{\mmHg}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.02)},
+ anchor=center,
+ tick label style={font=\footnotesize},
+ },
+ colorbar/draw/.append code={
+ \begin{axis}[
+ colormap={Gray and Red}{
+ rgb255(-1cm)=(26,26,26);
+ rgb255(-0.87451cm)=(58,58,58);
+ rgb255(-0.74902cm)=(91,91,91);
+ rgb255(-0.623529cm)=(128,128,128);
+ rgb255(-0.498039cm)=(161,161,161);
+ rgb255(-0.372549cm)=(191,191,191);
+ rgb255(-0.247059cm)=(215,215,215);
+ rgb255(-0.121569cm)=(236,236,236);
+ rgb255(0.00392157cm)=(254,254,253);
+ rgb255(0.129412cm)=(253,231,218);
+ rgb255(0.254902cm)=(250,204,180);
+ rgb255(0.380392cm)=(244,170,136);
+ rgb255(0.505882cm)=(228,128,101);
+ rgb255(0.631373cm)=(208,84,71);
+ rgb255(0.756863cm)=(185,39,50);
+ rgb255(0.882353cm)=(147,14,38);
+ rgb255(1cm)=(103,0,31);
+ },
+ colorbar horizontal,
+ point meta min=1.023663860179765e-8,
+ point meta max=0.00012864508759841435,
+ every colorbar,
+ anchor=center,
+ colorbar shift,
+ colorbar=false,
+ xlabel = {$\vct{u}$ [\si{\meter\per\second}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.1*3.5*\pgfkeysvalueof{/pgfplots/parent axis height})},
+ tick label style={font=\footnotesize},
+ ]
+ \pgfkeysvalueof{/pgfplots/colorbar addplot}
+ \end{axis}
+ },
+ width=\subfigwidth
+ ]
+ \addplotgraphicsnatural[xmin=0, xmax=1, ymin=0, ymax=1]{graphics/feelpp/feelpp-benchmark-heatfluid-flowstanding.png}
+
+ \draw[->] (0.1, 1) -- (0.1, 0.9) node[midway, anchor=west] {$\vct{g}$};
+
+ \draw[->, red] (0.1, 0.1) -- (0.2, 0.1) node[pos=1, anchor=north] {$x$};
+ \draw[->, green!60!black] (0.1, 0.1) -- (0.1, 0.2) node[pos=1, anchor=east] {$y$};
+ \draw[blue] (0.1, 0.1) node {$\odot$};
+ \draw[blue] (0.1, 0.1) node[anchor=north east] {$z$};
+ \end{axis}
+\end{tikzpicture}
\ No newline at end of file
diff --git a/graphics/feelpp/feelpp-benchmark-heatfluid-supine.tex b/graphics/feelpp/feelpp-benchmark-heatfluid-supine.tex
new file mode 100644
index 0000000..d7171dc
--- /dev/null
+++ b/graphics/feelpp/feelpp-benchmark-heatfluid-supine.tex
@@ -0,0 +1,73 @@
+\begin{tikzpicture}
+ \begin{axis}[
+ colorbar,
+ colormap/jet, % Choose the colormap you prefer
+ % axis equal image,
+ enlargelimits=false,
+ colorbar horizontal,
+ point meta max=15.5,
+ point meta min=15.395051958,
+ axis line style = {draw=none},
+ tick style = {draw=none},
+ xtick = \empty, ytick = \empty,
+ colorbar style={
+ % xlabel style={
+ % at={(0.5,1.1)},
+ % anchor=south,
+ % },
+ xlabel = {$p$ [\si{\mmHg}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.02)},
+ anchor=center,
+ tick label style={font=\footnotesize},
+ },
+ colorbar/draw/.append code={
+ \begin{axis}[
+ colormap={Gray and Red}{
+ rgb255(-1cm)=(26,26,26);
+ rgb255(-0.87451cm)=(58,58,58);
+ rgb255(-0.74902cm)=(91,91,91);
+ rgb255(-0.623529cm)=(128,128,128);
+ rgb255(-0.498039cm)=(161,161,161);
+ rgb255(-0.372549cm)=(191,191,191);
+ rgb255(-0.247059cm)=(215,215,215);
+ rgb255(-0.121569cm)=(236,236,236);
+ rgb255(0.00392157cm)=(254,254,253);
+ rgb255(0.129412cm)=(253,231,218);
+ rgb255(0.254902cm)=(250,204,180);
+ rgb255(0.380392cm)=(244,170,136);
+ rgb255(0.505882cm)=(228,128,101);
+ rgb255(0.631373cm)=(208,84,71);
+ rgb255(0.756863cm)=(185,39,50);
+ rgb255(0.882353cm)=(147,14,38);
+ rgb255(1cm)=(103,0,31);
+ },
+ colorbar horizontal,
+ point meta min=1.6125707421871079e-9,
+ point meta max=0.000007417459292689545,
+ every colorbar,
+ anchor=center,
+ colorbar shift,
+ colorbar=false,
+ xlabel = {$\vct{u}$ [\si{\meter\per\second}]},
+ height=0.05*\pgfkeysvalueof{/pgfplots/parent axis height},
+ width=0.9*\pgfkeysvalueof{/pgfplots/parent axis width},
+ at={(0.5,-0.1*3.5*\pgfkeysvalueof{/pgfplots/parent axis height})},
+ tick label style={font=\footnotesize},
+ ]
+ \pgfkeysvalueof{/pgfplots/colorbar addplot}
+ \end{axis}
+ },
+ width=\subfigwidth
+ ]
+ \addplotgraphicsnatural[xmin=0, xmax=1, ymin=0, ymax=1]{graphics/feelpp/feelpp-benchmark-heatfluid-flowsupine.png}
+
+ \draw[->] (0.1, 0.88) -- (0.2, 0.88) node[midway, anchor=south] {$\vct{g}$};
+
+ \draw[->, red] (0.1, 0.1) -- (0.2, 0.1) node[pos=1, anchor=north] {$x$};
+ \draw[->, green!60!black] (0.1, 0.1) -- (0.1, 0.2) node[pos=1, anchor=east] {$y$};
+ \draw[blue] (0.1, 0.1) node {$\odot$};
+ \draw[blue] (0.1, 0.1) node[anchor=north east] {$z$};
+ \end{axis}
+ \end{tikzpicture}
\ No newline at end of file
diff --git a/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-disp.png b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-disp.png
new file mode 100644
index 0000000..8bf2cd0
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-disp.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-tresca-wrap.png b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-tresca-wrap.png
new file mode 100644
index 0000000..003f8bb
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-tresca-wrap.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-vonmises.png b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-vonmises.png
new file mode 100644
index 0000000..1f013b9
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-nafems-le10-solution-vonmises.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-thermalbridges-geom.png b/graphics/feelpp/feelpp-benchmark-thermalbridges-geom.png
new file mode 100644
index 0000000..61727e7
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-thermalbridges-geom.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-thermalbridges-geom2.png b/graphics/feelpp/feelpp-benchmark-thermalbridges-geom2.png
new file mode 100644
index 0000000..ea8a30d
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-thermalbridges-geom2.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-thermalbridges-pid.png b/graphics/feelpp/feelpp-benchmark-thermalbridges-pid.png
new file mode 100644
index 0000000..9535d6e
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-thermalbridges-pid.png differ
diff --git a/graphics/feelpp/feelpp-benchmark-thermalbridges-solution.png b/graphics/feelpp/feelpp-benchmark-thermalbridges-solution.png
new file mode 100644
index 0000000..f058f36
Binary files /dev/null and b/graphics/feelpp/feelpp-benchmark-thermalbridges-solution.png differ
diff --git a/graphics/feelpp/feelpp-cb-workflow.png b/graphics/feelpp/feelpp-cb-workflow.png
new file mode 100644
index 0000000..eb636bd
Binary files /dev/null and b/graphics/feelpp/feelpp-cb-workflow.png differ
diff --git a/graphics/feelpp/feelpp-ci-workflow.png b/graphics/feelpp/feelpp-ci-workflow.png
new file mode 100644
index 0000000..d5d88cb
Binary files /dev/null and b/graphics/feelpp/feelpp-ci-workflow.png differ
diff --git a/graphics/feelpp/feelpp-components.tex b/graphics/feelpp/feelpp-components.tex
new file mode 100644
index 0000000..9545c98
--- /dev/null
+++ b/graphics/feelpp/feelpp-components.tex
@@ -0,0 +1,80 @@
+%!TEX root = ../exa-ma-d7.1.tex
+
+\definecolor{mybluei}{RGB}{70, 82, 186}
+\definecolor{myblueii}{RGB}{73,121,193}
+\definecolor{mygreen}{RGB}{142, 209, 79}
+\definecolor{mypink}{RGB}{255,248,241}
+
+\newcommand\widernode[5][widebox]{
+ \node[
+ #1,
+ fit={(#2) (#3)},
+ label=center:{\sffamily\bfseries\color{white}#4}] (#5) {};
+}
+
+\begin{tikzpicture}[node distance=3pt,outer sep=0pt,
+boxstyle/.style={
+ draw=white,
+ fill=#1,
+ rounded corners,
+ font={\sffamily\bfseries\color{white}},
+ align=center,
+ minimum height=30pt
+},
+box/.style={
+ boxstyle=#1,
+ text width=2.5cm},
+box/.default=mybluei,
+title/.style={font={\sffamily\bfseries\color{white}}},
+widebox/.style={draw=white,inner sep=0pt, rounded corners,fill=#1},
+widebox/.default=mybluei,
+mylabel/.style={font={\sffamily\bfseries\color{white}}},
+]
+
+\matrix (stack) [boxstyle=mybluei!40, draw=black,%
+ column sep=3pt, row sep=3pt, inner sep=4mm,%
+ matrix of nodes,%
+ nodes={box, outer sep=0pt, anchor=center, inner sep=3pt},%
+ nodes in empty cells,
+ %row 1/.style={nodes={fill=none,draw=none,minimum height=3mm}},
+]
+{
+ & & & & \\
+& & & & \\
+& & & & \\
+& & & & \\
+ & & & & Models Description\\
+Space Cartesian Products & & PDE based Preconditioners & Ray-Tracing (BVH) & Level-Set (FastMarching) \\
+ & & & & Forms\\
+Finite Elements & Geometric Mapping & & & Export/Import\\
+ Timings & & & & Quadratures \\
+& & & & \\};
+\widernode[widebox=mygreen]{stack-1-1}{stack-1-5}{\begin{tabular}{c}Advanced Methods and Analysis:\\ Inverse Problems, Data Assimilation, UQ, ML\ldots \end{tabular}}{Analysis}
+\widernode[widebox=mygreen]{stack-2-1}{stack-2-5}{\begin{tabular}{c}Python bindings:\\ Core, \Feelpp Libs, Toolboxes, MOR\end{tabular}}{Python}
+\widernode[widebox=mygreen]{stack-3-1}{stack-3-5}{\begin{tabular}{c}ROM: \\RB(Greedy,POD,Error Bounds, SCM/Min-$\theta$, NL-C), (D,G)EIM, PBDW, NIRB\end{tabular}}{RB}
+\widernode[widebox=mygreen]{stack-4-1}{stack-4-5}{\begin{tabular}{c}Toolboxes: \\ ALE, CFPDEs, CFD, CSM, Heat, HeatFluid, FSI, ThermoElectric, Maxwell\end{tabular}}{Toolboxes}
+\widernode{stack-5-3}{stack-5-4}{Models Description: JSON...}{model}
+\widernode{stack-5-1}{stack-5-2}{DSEL for Galerkin Methods}{dsel}
+\widernode{stack-6-1}{stack-6-2}{\begin{tabular}{c}Cartesian Products:\\ Spaces, Functions and Forms\end{tabular}}{prod}
+\widernode{stack-7-3}{stack-7-4}{Interpolation}{interp}
+\widernode{stack-7-1}{stack-7-2}{Function Spaces}{fespace}
+\widernode{stack-8-3}{stack-8-3}{Mesh}{Mesh}
+\widernode{stack-8-4}{stack-8-4}{Time Discr.}{TimeD}
+%\widernode{stack-5-4}{stack-5-5}{Export/Import}{ExImp}
+\widernode{stack-9-2}{stack-9-4}{Linear Algebra:
+ preconditioner framework, ...}{Alg}
+\widernode[widebox=mygreen]{stack-10-1}{stack-10-5}{Core: Environment,
+ Logging, Monitoring, Events, Communicators ...}{Core}
+
+\node [fit={(stack.north west)(stack.north
+ east)},boxstyle=mybluei!60,draw=black,inner sep=0pt,above=3pt of
+stack.north,anchor=south,label={[mylabel]center: Applications: Ktirio Urban Building, Eye2Brain, Swimmer, HifiMagnet}] (AnalysisTools) {};
+\node [fit={(stack.south west)(stack.south
+ east)},boxstyle=mybluei!60,draw=black,inner sep=0pt,below=3pt of
+stack.south,anchor=north,label={[mylabel]center: Python3, Gmsh, MMG, CGAL, OpenTURNS, OpenModelica, Salome}] (DepsTools) {};
+ \node [fit={(DepsTools.south west)(DepsTools.south
+ east)},boxstyle=mybluei!60,draw=black,inner sep=0pt,below=3pt of
+DepsTools.south,anchor=north,label={[mylabel]center: C++17, MPI, Boost,
+ Google::Glog, MongoDB, Eigen3, PETSc/SLEPc}] (Deps) {};
+
+\end{tikzpicture}
diff --git a/graphics/freefempp/samplecode.png b/graphics/freefempp/samplecode.png
new file mode 100644
index 0000000..3bca260
Binary files /dev/null and b/graphics/freefempp/samplecode.png differ
diff --git a/graphics/freefempp/sampleplot.pdf b/graphics/freefempp/sampleplot.pdf
new file mode 100644
index 0000000..e516c70
Binary files /dev/null and b/graphics/freefempp/sampleplot.pdf differ
diff --git a/graphics/hawen/haven_inversion.pdf b/graphics/hawen/haven_inversion.pdf
new file mode 100644
index 0000000..52d8caa
Binary files /dev/null and b/graphics/hawen/haven_inversion.pdf differ
diff --git a/graphics/hawen/skeleton_3D_seam.pdf b/graphics/hawen/skeleton_3D_seam.pdf
new file mode 100644
index 0000000..4e00f2e
Binary files /dev/null and b/graphics/hawen/skeleton_3D_seam.pdf differ
diff --git a/graphics/logo-ep-horizontal.png b/graphics/logo-ep-horizontal.png
new file mode 100644
index 0000000..4579870
Binary files /dev/null and b/graphics/logo-ep-horizontal.png differ
diff --git a/graphics/logo-ep.png b/graphics/logo-ep-vertical.png
similarity index 100%
rename from graphics/logo-ep.png
rename to graphics/logo-ep-vertical.png
diff --git a/graphics/samurai/p4est_3.png b/graphics/samurai/p4est_3.png
new file mode 100644
index 0000000..1886dd6
Binary files /dev/null and b/graphics/samurai/p4est_3.png differ
diff --git a/graphics/samurai/samurai.png b/graphics/samurai/samurai.png
new file mode 100644
index 0000000..783162e
Binary files /dev/null and b/graphics/samurai/samurai.png differ
diff --git a/graphics/zotero/zotero-applications.png b/graphics/zotero/zotero-applications.png
new file mode 100644
index 0000000..184ceed
Binary files /dev/null and b/graphics/zotero/zotero-applications.png differ
diff --git a/graphics/zotero/zotero-citation-keys.png b/graphics/zotero/zotero-citation-keys.png
new file mode 100644
index 0000000..135c343
Binary files /dev/null and b/graphics/zotero/zotero-citation-keys.png differ
diff --git a/graphics/zotero/zotero-newkey.png b/graphics/zotero/zotero-newkey.png
new file mode 100644
index 0000000..3bec21a
Binary files /dev/null and b/graphics/zotero/zotero-newkey.png differ
diff --git a/istcover.sty b/istcover.sty
index 29c00d6..ff3e436 100644
--- a/istcover.sty
+++ b/istcover.sty
@@ -29,7 +29,7 @@
\newpage
\newgeometry{top=8.5mm,left=8.5mm,right=15mm,bottom=-14mm}
% shadowed frame
- \ThisCenterWallPaper{1}{graphics/numpex-cover-bkg}
+ \ThisCenterWallPaper{1}{graphics/exama-cover-bkg}
% \global\@topnum\z@ % prevents figures from going at top of page
\thispagestyle{empty} % so we get no page number on title page
\setcounter{footnote}{0}
diff --git a/istprog.sty b/istprog.sty
index f4903dc..44616af 100644
--- a/istprog.sty
+++ b/istprog.sty
@@ -146,12 +146,12 @@
\newcommand{\makedisclaimer}{
\cleardoublepage
\thispagestyle{empty}
- \ThisCenterWallPaper{1}{graphics/numpex-cover-bkg}
+ \ThisCenterWallPaper{1}{graphics/exama-cover-bkg}
\vspace*{5cm}
{\fontsize{12}{15}\bf\color{numpexblue}Consortium}\\[2ex]
- {\includegraphics[width=0.98\linewidth,page=2]{graphics/numpex-cover-bkg.pdf}}\\[1ex]
+ {\includegraphics[width=0.98\linewidth]{graphics/exama-disclaimer.pdf}}\\[1ex]
{\fontsize{12}{15}\bf\color{numpexblue}Disclaimer}\\[2ex]
{\small All information provided reflects the status of the \projacronym{} project at the time of writing and may be subject to change.
diff --git a/numpex.sty b/numpex.sty
index b5451f1..2032ebd 100644
--- a/numpex.sty
+++ b/numpex.sty
@@ -23,12 +23,14 @@
\usepackage{istprog}
\RequirePackage{a4wide} % A4
\usepackage{etoolbox}
+\usepackage{caption}
+\usepackage{subcaption}
\usepackage{tocloft} % formatting of the table of contents.
\usepackage[linktocpage=true]{hyperref}
\hypersetup{
colorlinks=true,
linkcolor=black,
- filecolor=black,
+ filecolor=black,
citecolor=black,
urlcolor=blue,
pdftitle=\deliv@stitle,
@@ -66,7 +68,7 @@
\ProjectStartDuration{1 Septembre 2023 / 60 Months}
\Security{Public}
-% \partners{
+% \partners{
% \textbf{AIT Austrian Institute of Technology} \\
% }
@@ -82,7 +84,7 @@
% }
\usepackage{graphicx}
-\usepackage[usenames,dvipsnames,svgnames]{xcolor}
+\usepackage[dvipsnames,svgnames]{xcolor}
\usepackage{colortbl}
\usepackage{geometry}
\usepackage{wallpaper}
@@ -107,7 +109,7 @@
\usepackage[parfill]{parskip}
%% title page
-%%
+%%
\usepackage{istcover}
% toc design
@@ -139,9 +141,9 @@
% lof design
\def\setlistoffigures{%
-\renewcommand{\cftfigfont}{\small}
-\renewcommand{\cftfigpagefont}{\small}
-\setlength{\cftfigindent}{0mm}
+\renewcommand{\cftfigfont}{\small}
+\renewcommand{\cftfigpagefont}{\small}
+\setlength{\cftfigindent}{0mm}
\setlength{\cftbeforeloftitleskip}{-0.4cm}
\setlength{\cftafterloftitleskip}{3mm}
\renewcommand{\cftloftitlefont}{\normalfont\fontsize{18}{23}\sffamily\bfseries\color{numpexblue}}
@@ -156,8 +158,8 @@
% lot design
\def\setlistoftables{%
-\renewcommand{\cfttabfont}{\small}
-\renewcommand{\cfttabpagefont}{\small}
+\renewcommand{\cfttabfont}{\small}
+\renewcommand{\cfttabpagefont}{\small}
\setlength{\cfttabindent}{0mm}
\setlength{\cftbeforelottitleskip}{-0.4cm}
\setlength{\cftafterlottitleskip}{3mm}
diff --git a/references.bib b/references.bib
index 4f00509..3ee7d21 100644
--- a/references.bib
+++ b/references.bib
@@ -1,732 +1,2008 @@
-@inproceedings{haidar_harnessing_2018,
- location = {Dallas, {TX}, {USA}},
- title = {Harnessing {GPU} Tensor Cores for Fast {FP}16 Arithmetic to Speed up Mixed-Precision Iterative Refinement Solvers},
- isbn = {978-1-5386-8384-2},
- url = {https://ieeexplore.ieee.org/document/8665777/},
- doi = {10.1109/SC.2018.00050},
- eventtitle = {{SC}18: International Conference for High Performance Computing, Networking, Storage and Analysis},
- pages = {603--613},
- booktitle = {{SC}18: International Conference for High Performance Computing, Networking, Storage and Analysis},
- publisher = {{IEEE}},
- author = {Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack and Higham, Nicholas J.},
- urldate = {2024-06-28},
- date = {2018-11},
- file = {Full Text:files/682/Haidar et al. - 2018 - Harnessing GPU Tensor Cores for Fast FP16 Arithmet.pdf:application/pdf},
-}
-@misc{ootomo_dgemm_2024,
- title = {{DGEMM} on Integer Matrix Multiplication Unit},
- url = {http://arxiv.org/abs/2306.11975},
- abstract = {Deep learning hardware achieves high throughput and low power consumption by reducing computing precision and specializing in matrix multiplication. For machine learning inference, fixed-point value computation is commonplace, where the input and output values and the model parameters are quantized. Thus, many processors are now equipped with fast integer matrix multiplication units ({IMMU}). It is of significant interest to find a way to harness these {IMMUs} to improve the performance of {HPC} applications while maintaining accuracy. We focus on the Ozaki scheme, which computes a high-precision matrix multiplication by using lower-precision computing units, and show the advantages and disadvantages of using {IMMU}. The experiment using integer Tensor Cores shows that we can compute double-precision matrix multiplication faster than {cuBLAS} and an existing Ozaki scheme implementation on {FP}16 Tensor Cores on {NVIDIA} consumer {GPUs}. Furthermore, we demonstrate accelerating a quantum circuit simulation by up to 4.33 while maintaining the {FP}64 accuracy.},
- number = {{arXiv}:2306.11975},
- publisher = {{arXiv}},
- author = {Ootomo, Hiroyuki and Ozaki, Katsuhisa and Yokota, Rio},
- urldate = {2024-06-28},
- date = {2024-03-30},
- eprinttype = {arxiv},
- eprint = {2306.11975 [cs]},
- keywords = {Computer Science - Distributed, Parallel, and Cluster Computing},
- file = {arXiv Fulltext PDF:files/685/Ootomo et al. - 2024 - DGEMM on Integer Matrix Multiplication Unit.pdf:application/pdf;arXiv.org Snapshot:files/686/2306.html:text/html},
+@misc{cardosi_specx_2023,
+ title = {Specx: a {C}++ task-based runtime system for heterogeneous distributed architectures},
+ copyright = {Creative Commons Attribution 4.0 International},
+ shorttitle = {Specx},
+ url = {https://arxiv.org/abs/2308.15964},
+ doi = {10.48550/ARXIV.2308.15964},
+ abstract = {Parallelization is needed everywhere, from laptops and mobile phones to supercomputers. Among parallel programming models, task-based programming has demonstrated a powerful potential and is widely used in high-performance scientific computing. Not only does it allow for efficient parallelization across distributed heterogeneous computing nodes, but it also allows for elegant source code structuring by describing hardware-independent algorithms. In this paper, we present Specx, a task-based runtime system written in modern C++. Specx supports distributed heterogeneous computing by simultaneously exploiting CPUs and GPUs (CUDA/HIP) and incorporating communication into the task graph. We describe the specificities of Specx and demonstrate its potential by running parallel applications.},
+ urldate = {2024-10-15},
+ publisher = {arXiv},
+ author = {Cardosi, Paul and Bramas, Bérenger},
+ year = {2023},
+ note = {Version Number: 1},
+ keywords = {Distributed, Parallel, and Cluster Computing (cs.DC), FOS: Computer and information sciences, Software Engineering (cs.SE)},
}
-@software{prudhomme_feelppfeelpp_2024,
- title = {feelpp/feelpp: Feel++ Release V111 preview.10},
- rights = {Creative Commons Attribution 4.0 International, {GNU} Lesser General Public License v3.0 or later, {GNU} General Public License v3.0 or later},
- url = {https://zenodo.org/doi/10.5281/zenodo.591797},
- shorttitle = {feelpp/feelpp},
- abstract = {🎉 We're happy to share our developments as we approach the V111 release of Feel++. Following a refreshed naming strategy, we've moved to the -preview.x suffix from the conventional -alpha.x, -beta, or -rc labels. This change signifies our dedication to enhancing transparency and setting clear expectations for our pre-release versions.
+@misc{Palazollo_Feel_Shape_Optimization,
+ title = {Feel++ shape optimization toolbox},
+ copyright = {LGPL-3.0-or-later},
+ url = {https://github.com/feelpp/feelpp-shapo},
+	author = {Palazzolo, Lucas and Prud'homme, Christophe},
+ year = {2024},
+}
-Each pre-release version of Feel++ undergoes a rigorous process, encompassing detailed reviews, extensive tests across varied scenarios, and careful packaging. Our commitment to delivering a high-quality, reliable experience is reflected in our comprehensive platform support strategy. Alongside offering support for the latest two Long-Term Support ({LTS}) versions of Ubuntu and the newest {LTS} version of Debian, we're excited to announce that Feel++ is now accessible to Windows users through the Windows Subsystem for Linux ({WSL}) and to Mac users via {MacPorts}, Homebrew, Docker and now Apptainer. This expansion of platform support is a testament to our commitment to making Feel++ as accessible and versatile as possible for our diverse user base.
+@book{britain_standard_1990,
+ title = {The {Standard} {NAFEMS} {Benchmarks}},
+ publisher = {NAFEMS},
+	author = {{National Agency for Finite Element Methods \& Standards (Great Britain)}},
+ year = {1990},
+}
-As we continue to refine and enhance Feel++, the V111 release promises to bring forward significant innovations and improvements. Stay tuned for further updates of Feel++.
+@article{nguessan_high_2021,
+ series = {Numerical {Solution} of {Differential} and {Differential}-{Algebraic} {Equations}. {Selected} {Papers} from {NUMDIFF}-15},
+ title = {High order time integration and mesh adaptation with error control for incompressible {Navier}–{Stokes} and scalar transport resolution on dual grids},
+ volume = {387},
+ issn = {0377-0427},
+ url = {https://www.sciencedirect.com/science/article/pii/S0377042719305473},
+ doi = {10.1016/j.cam.2019.112542},
+ abstract = {Relying on a building block developed by the authors in order to resolve the incompressible Navier–Stokes equation with high order implicit time stepping and dynamic mesh adaptation based on multiresolution analysis with collocated variables, the present contribution investigates the ability to extend such a strategy for scalar transport at relatively large Schmidt numbers using a finer level of refinement compared to the resolution of the hydrodynamic variables, while preserving space adaptation with error control. This building block is a key part of a strategy to construct a low-Mach number code based on a splitting strategy for combustion applications, where several spatial scales are into play. The computational efficiency and accuracy of the proposed strategy is assessed on a well-chosen three-vortex simulation.},
+ urldate = {2024-10-15},
+ journal = {Journal of Computational and Applied Mathematics},
+ author = {N’Guessan, Marc-Arthur and Massot, Marc and Séries, Laurent and Tenaud, Christian},
+ month = may,
+ year = {2021},
+ keywords = {Dual grid with error control, Dynamic mesh adaptation, High order implicit Runge Kutta, Incompressible Navier–Stokes, Multiresolution analysis, Scalar transport},
+ pages = {112542},
+}
-Packages
+@article{lecointre_hydrogen_nodate,
+ title = {Hydrogen flame acceleration in non-uniform mixtures},
+ abstract = {This thesis, carried out with the support of the CEA, presents the development of numerical methods dedicated to the simulation of the acceleration process of a hydrogen flame.},
+ language = {fr},
+ author = {Lecointre, Luc},
+}
+@article{duarte_adaptive_nodate,
+ title = {Adaptive numerical methods in time and space for the simulation of multi-scale reaction fronts.},
+ language = {fr},
+ author = {Duarte, Max Pedro},
+}
+@article{helbecque_parallel_2023,
+ title = {Parallel distributed productivity‐aware tree‐search using {Chapel}},
+ volume = {35},
+ issn = {1532-0626, 1532-0634},
+ url = {https://onlinelibrary.wiley.com/doi/10.1002/cpe.7874},
+ doi = {10.1002/cpe.7874},
+	abstract = {With the recent arrival of the exascale era, modern supercomputers are increasingly big making their programming much more complex. In addition to performance, software productivity is a major concern to choose a programming language, such as Chapel, designed for exascale computing. In this paper, we investigate the design of a parallel distributed tree‐search algorithm, namely P3D‐DFS, and its implementation using Chapel. The design is based on the Chapel's DistBag data structure, revisited by: (1) redefining the data structure for Depth‐First tree‐Search (DFS), henceforth renamed DistBag‐DFS; (2) redesigning the underlying load balancing mechanism. In addition, we propose two instantiations of P3D‐DFS considering the Branch‐and‐Bound (B\&B) and Unbalanced Tree Search (UTS) algorithms. In order to evaluate how much performance is traded for productivity, we compare the Chapel‐based implementations of B\&B and UTS to their best‐known counterparts based on traditional OpenMP (intra‐node) and MPI+X (inter‐node). For experimental validation using 4096 processing cores, we consider the permutation flow‐shop scheduling problem for B\&B and synthetic literature benchmarks for UTS. The reported results show that P3D‐DFS competes with its OpenMP baselines for coarser‐grained shared‐memory scenarios, and with its MPI+X counterparts for distributed‐memory settings, considering both performance and productivity‐awareness. In the context of this work, this makes Chapel an alternative to OpenMP/MPI+X for exascale programming.},
+ language = {en},
+ number = {27},
+ urldate = {2024-10-15},
+ journal = {Concurrency and Computation: Practice and Experience},
+ author = {Helbecque, Guillaume and Gmys, Jan and Melab, Nouredine and Carneiro, Tiago and Bouvry, Pascal},
+ month = dec,
+ year = {2023},
+ pages = {e7874},
+}
-📦 Ubuntu packages
+@incollection{franco_pgas_2024,
+ address = {Cham},
+ title = {{PGAS} {Data} {Structure} for {Unbalanced} {Tree}-{Based} {Algorithms} at {Scale}},
+ volume = {14834},
+ isbn = {978-3-031-63758-2 978-3-031-63759-9},
+ url = {https://link.springer.com/10.1007/978-3-031-63759-9_13},
+ language = {en},
+ urldate = {2024-10-15},
+ booktitle = {Computational {Science} – {ICCS} 2024},
+ publisher = {Springer Nature Switzerland},
+ author = {Helbecque, Guillaume and Carneiro, Tiago and Melab, Nouredine and Gmys, Jan and Bouvry, Pascal},
+ editor = {Franco, Leonardo and De Mulatier, Clélia and Paszynski, Maciej and Krzhizhanovskaya, Valeria V. and Dongarra, Jack J. and Sloot, Peter M. A.},
+ year = {2024},
+ doi = {10.1007/978-3-031-63759-9_13},
+ note = {Series Title: Lecture Notes in Computer Science},
+ pages = {103--111},
+}
-📦 Debian packages
+@article{gmys_exactly_2022,
+ title = {Exactly {Solving} {Hard} {Permutation} {Flowshop} {Scheduling} {Problems} on {Peta}-{Scale} {GPU}-{Accelerated} {Supercomputers}},
+ volume = {34},
+ issn = {1091-9856, 1526-5528},
+ url = {https://pubsonline.informs.org/doi/10.1287/ijoc.2022.1193},
+ doi = {10.1287/ijoc.2022.1193},
+ abstract = {Makespan minimization in permutation flow-shop scheduling is a well-known hard combinatorial optimization problem. Among the 120 standard benchmark instances proposed by E. Taillard in 1993, 23 have remained unsolved for almost three decades. In this paper, we present our attempts to solve these instances to optimality using parallel Branch-and-Bound (BB) on the GPU-accelerated Jean Zay supercomputer. We report the exact solution of 11 previously unsolved problem instances and improved upper bounds for eight instances. The solution of these problems requires both algorithmic improvements and leveraging the computing power of peta-scale high-performance computing platforms. The challenge consists in efficiently performing parallel depth-first traversal of a highly irregular and fine-grained search tree on distributed systems composed of hundreds of massively parallel accelerator devices and multicore processors. We present and discuss the design and implementation of our permutation-based BB and experimentally evaluate its parallel performance on up to 384 V100 GPUs (2 million CUDA cores) and 3840 CPU cores. The optimality proof for the largest solved instance requires about 64 CPU-years of computation—using 256 GPUs and over 4 million parallel search agents, the traversal of the search tree is completed in 13 hours, exploring [Formula: see text] nodes.},
+ language = {en},
+ number = {5},
+ urldate = {2024-10-15},
+ journal = {INFORMS Journal on Computing},
+ author = {Gmys, Jan},
+ month = sep,
+ year = {2022},
+ pages = {2502--2522},
+}
-📦 Docker images
+@article{delorme_novel_nodate,
+ title = {Novel numerical methods for solar convection: the {Dyablo} {Whole}-{Sun} adaptative mesh refinement code},
+ abstract = {We present a new solar simulation code named Dyablo Whole-Sun (DWS) and the first steps of its validation. DWS is a novel portable high-performance code aiming at making the first holistic simulations of the Sun, from the radiative interior to the corona. We discuss the validation of the development of the code using a solar convection benchmark in Cartesian geometry.},
+ language = {en},
+ author = {Delorme, M and Durocher, A and Brun, A S and Strugarek, A},
+}
+@article{dubey_survey_2014,
+ series = {Domain-{Specific} {Languages} and {High}-{Level} {Frameworks} for {High}-{Performance} {Computing}},
+ title = {A survey of high level frameworks in block-structured adaptive mesh refinement packages},
+ volume = {74},
+ issn = {0743-7315},
+ url = {https://www.sciencedirect.com/science/article/pii/S0743731514001178},
+ doi = {10.1016/j.jpdc.2014.07.001},
+ abstract = {Over the last decade block-structured adaptive mesh refinement (SAMR) has found increasing use in large, publicly available codes and frameworks. SAMR frameworks have evolved along different paths. Some have stayed focused on specific domain areas, others have pursued a more general functionality, providing the building blocks for a larger variety of applications. In this survey paper we examine a representative set of SAMR packages and SAMR-based codes that have been in existence for half a decade or more, have a reasonably sized and active user base outside of their home institutions, and are publicly available. The set consists of a mix of SAMR packages and application codes that cover a broad range of scientific domains. We look at their high-level frameworks, their design trade-offs and their approach to dealing with the advent of radical changes in hardware architecture. The codes included in this survey are BoxLib, Cactus, Chombo, Enzo, FLASH, and Uintah.},
+ number = {12},
+ urldate = {2024-10-14},
+ journal = {Journal of Parallel and Distributed Computing},
+ author = {Dubey, Anshu and Almgren, Ann and Bell, John and Berzins, Martin and Brandt, Steve and Bryan, Greg and Colella, Phillip and Graves, Daniel and Lijewski, Michael and Löffler, Frank and O’Shea, Brian and Schnetter, Erik and Van Straalen, Brian and Weide, Klaus},
+ month = dec,
+ year = {2014},
+ keywords = {BoxLib, Cactus, Chombo, Enzo, FLASH, SAMR, Uintah},
+ pages = {3217--3227},
+}
-docker pull ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy
-docker run ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy ls
+@article{dubey_survey_2014-1,
+ series = {Domain-{Specific} {Languages} and {High}-{Level} {Frameworks} for {High}-{Performance} {Computing}},
+ title = {A survey of high level frameworks in block-structured adaptive mesh refinement packages},
+ volume = {74},
+ issn = {0743-7315},
+ url = {https://www.sciencedirect.com/science/article/pii/S0743731514001178},
+ doi = {10.1016/j.jpdc.2014.07.001},
+ abstract = {Over the last decade block-structured adaptive mesh refinement (SAMR) has found increasing use in large, publicly available codes and frameworks. SAMR frameworks have evolved along different paths. Some have stayed focused on specific domain areas, others have pursued a more general functionality, providing the building blocks for a larger variety of applications. In this survey paper we examine a representative set of SAMR packages and SAMR-based codes that have been in existence for half a decade or more, have a reasonably sized and active user base outside of their home institutions, and are publicly available. The set consists of a mix of SAMR packages and application codes that cover a broad range of scientific domains. We look at their high-level frameworks, their design trade-offs and their approach to dealing with the advent of radical changes in hardware architecture. The codes included in this survey are BoxLib, Cactus, Chombo, Enzo, FLASH, and Uintah.},
+ number = {12},
+ urldate = {2024-10-14},
+ journal = {Journal of Parallel and Distributed Computing},
+ author = {Dubey, Anshu and Almgren, Ann and Bell, John and Berzins, Martin and Brandt, Steve and Bryan, Greg and Colella, Phillip and Graves, Daniel and Lijewski, Michael and Löffler, Frank and O’Shea, Brian and Schnetter, Erik and Van Straalen, Brian and Weide, Klaus},
+ month = dec,
+ year = {2014},
+ keywords = {BoxLib, Cactus, Chombo, Enzo, FLASH, SAMR, Uintah},
+ pages = {3217--3227},
+}
+@article{cohen_fully_2003,
+ title = {Fully adaptive multiresolution finite volume schemes for conservation laws},
+ volume = {72},
+ issn = {0025-5718, 1088-6842},
+ url = {https://www.ams.org/mcom/2003-72-241/S0025-5718-01-01391-6/},
+ doi = {10.1090/S0025-5718-01-01391-6},
+ language = {English},
+ number = {241},
+ urldate = {2024-10-14},
+ journal = {Mathematics of Computation},
+ author = {Cohen, Albert and Kaber, Sidi and Müller, Siegfried and Postel, Marie},
+ year = {2003},
+ keywords = {Conservation laws, adaptivity, finite volume schemes, multiresolution, wavelets.},
+ pages = {183--225},
+}
+@article{krah_wavelet_2022,
+ title = {Wavelet adaptive proper orthogonal decomposition for large-scale flow data},
+ volume = {48},
+ issn = {1572-9044},
+ url = {https://doi.org/10.1007/s10444-021-09922-2},
+ doi = {10.1007/s10444-021-09922-2},
+ abstract = {The proper orthogonal decomposition (POD) is a powerful classical tool in fluid mechanics used, for instance, for model reduction and extraction of coherent flow features. However, its applicability to high-resolution data, as produced by three-dimensional direct numerical simulations, is limited owing to its computational complexity. Here, we propose a wavelet-based adaptive version of the POD (the wPOD), in order to overcome this limitation. The amount of data to be analyzed is reduced by compressing them using biorthogonal wavelets, yielding a sparse representation while conveniently providing control of the compression error. Numerical analysis shows how the distinct error contributions of wavelet compression and POD truncation can be balanced under certain assumptions, allowing us to efficiently process high-resolution data from three-dimensional simulations of flow problems. Using a synthetic academic test case, we compare our algorithm with the randomized singular value decomposition. Furthermore, we demonstrate the ability of our method analyzing data of a two-dimensional wake flow and a three-dimensional flow generated by a flapping insect computed with direct numerical simulation.},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-14},
+ journal = {Advances in Computational Mathematics},
+ author = {Krah, Philipp and Engels, Thomas and Schneider, Kai and Reiss, Julius},
+ month = feb,
+ year = {2022},
+ keywords = {Biorthogonal wavelets, Fluid dynamics, Proper orthogonal decomposition, Reduced order models, Wavelet adaptive block-based grids},
+ pages = {10},
+}
+@article{gillis_murphy---scalable_2022,
+ title = {{MURPHY}---{A} {Scalable} {Multiresolution} {Framework} for {Scientific} {Computing} on {3D} {Block}-{Structured} {Collocated} {Grids}},
+ volume = {44},
+ issn = {1064-8275},
+ url = {https://epubs.siam.org/doi/abs/10.1137/21M141676X},
+ doi = {10.1137/21M141676X},
+ number = {5},
+ urldate = {2024-10-14},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Gillis, Thomas and van Rees, Wim M.},
+ month = oct,
+ year = {2022},
+ note = {Publisher: Society for Industrial and Applied Mathematics},
+ pages = {C367--C398},
+}
-📦 Apptainer images
+@article{zhang_amrex_2021,
+ title = {{AMReX}: {Block}-structured adaptive mesh refinement for multiphysics applications},
+ volume = {35},
+ issn = {1094-3420},
+ shorttitle = {{AMReX}},
+ url = {https://doi.org/10.1177/10943420211022811},
+ doi = {10.1177/10943420211022811},
+ abstract = {Block-structured adaptive mesh refinement (AMR) provides the basis for the temporal and spatial discretization strategy for a number of Exascale Computing Project applications in the areas of accelerator design, additive manufacturing, astrophysics, combustion, cosmology, multiphase flow, and wind plant modeling. AMReX is a software framework that provides a unified infrastructure with the functionality needed for these and other AMR applications to be able to effectively and efficiently utilize machines from laptops to exascale architectures. AMR reduces the computational cost and memory footprint compared to a uniform mesh while preserving accurate descriptions of different physical processes in complex multiphysics algorithms. AMReX supports algorithms that solve systems of partial differential equations in simple or complex geometries and those that use particles and/or particle–mesh operations to represent component physical processes. In this article, we will discuss the core elements of the AMReX framework such as data containers and iterators as well as several specialized operations to meet the needs of the application projects. In addition, we will highlight the strategy that the AMReX team is pursuing to achieve highly performant code across a range of accelerator-based architectures for a variety of different applications.},
+ language = {en},
+ number = {6},
+ urldate = {2024-10-14},
+ journal = {The International Journal of High Performance Computing Applications},
+ author = {Zhang, Weiqun and Myers, Andrew and Gott, Kevin and Almgren, Ann and Bell, John},
+ month = nov,
+ year = {2021},
+ note = {Publisher: SAGE Publications Ltd STM},
+ pages = {508--526},
+}
+@article{burstedde_p4est_2011,
+ title = {p4est: {Scalable} {Algorithms} for {Parallel} {Adaptive} {Mesh} {Refinement} on {Forests} of {Octrees}},
+ volume = {33},
+ issn = {1064-8275},
+ shorttitle = {p4est},
+ url = {https://epubs.siam.org/doi/abs/10.1137/100791634},
+ doi = {10.1137/100791634},
+ abstract = {In this article, we propose new parallel algorithms for the construction and 2:1 balance refinement of large linear octrees on distributed memory machines. Such octrees are used in many problems in computational science and engineering, e.g., object representation, image analysis, unstructured meshing, finite elements, adaptive mesh refinement, and N-body simulations. Fixed-size scalability and isogranular analysis of the algorithms using an MPI-based parallel implementation was performed on a variety of input data and demonstrated good scalability for different processor counts (1 to 1024 processors) on the Pittsburgh Supercomputing Center's TCS-1 AlphaServer. The results are consistent for different data distributions. Octrees with over a billion octants were constructed and balanced in less than a minute on 1024 processors. Like other existing algorithms for constructing and balancing octrees, our algorithms have \${\textbackslash}mathcal\{O\}(N{\textbackslash}log N)\$ work and \${\textbackslash}mathcal\{O\}(N)\$ storage complexity. Under reasonable assumptions on the distribution of octants and the work per octant, the parallel time complexity is \${\textbackslash}mathcal\{O\}({\textbackslash}frac\{N\}\{n\_p\}{\textbackslash}log({\textbackslash}frac\{N\}\{n\_p\})+n\_p{\textbackslash}log n\_p)\$, where N is the size of the final linear octree and \$n\_p\$ is the number of processors.},
+ number = {3},
+ urldate = {2024-10-14},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Burstedde, Carsten and Wilcox, Lucas C. and Ghattas, Omar},
+ month = jan,
+ year = {2011},
+ note = {Publisher: Society for Industrial and Applied Mathematics},
+ pages = {1103--1133},
+}
-apptainer pull -F oras://ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy-sif
-apptainer exec feelpp\_v0.111.0-preview.10-jammy-sif.sif feelpp\_toolbox\_fluid --version
+@article{bellotti_numerical_nodate,
+ title = {Numerical analysis of lattice {Boltzmann} schemes: from fundamental issues to efficient and accurate adaptive methods},
+ language = {en},
+ author = {Bellotti, Thomas},
+}
+@article{bellotti_multiresolution-based_2022,
+ title = {Multiresolution-{Based} {Mesh} {Adaptation} and {Error} {Control} for {Lattice} {Boltzmann} {Methods} with {Applications} to {Hyperbolic} {Conservation} {Laws}},
+ volume = {44},
+ issn = {1064-8275},
+ url = {https://epubs.siam.org/doi/abs/10.1137/21M140256X},
+ doi = {10.1137/21M140256X},
+ abstract = {Lattice Boltzmann methods (LBM) stand out for their simplicity and computational efficiency while offering the possibility of simulating complex phenomena. While they are optimal for Cartesian meshes, adapted meshes have traditionally been a stumbling block since it is difficult to predict the right physics through various levels of meshes. In this work, we design a class of fully adaptive LBM methods with dynamic mesh adaptation and error control relying on multiresolution analysis. This wavelet-based approach allows us to adapt the mesh based on the regularity of the solution and leads to a very efficient compression of the solution without loosing its quality and with the preservation of the properties of the original LBM method on the finest grid. This yields a general approach for a large spectrum of schemes and allows precise error bounds, without the need for deep modifications on the reference scheme. An error analysis is proposed. For the purpose of validating this error analysis, we conduct a series of test cases for various schemes and scalar and systems of conservation laws, where solutions with shocks are to be found and local mesh adaptation is especially relevant. Theoretical estimates are retrieved while a reduced memory footprint is observed. It paves the way to an implementation in a multidimensional framework and high computational efficiency of the method for both parabolic and hyperbolic equations, which is the subject of a companion paper.},
+ keywords = {lattice Boltzmann method, multiresolution analysis, wavelets, dynamic mesh adaptation, error control, hyperbolic conservation laws},
+ number = {4},
+ urldate = {2024-10-14},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Bellotti, Thomas and Gouarin, Loïc and Graille, Benjamin and Massot, Marc},
+ month = aug,
+ year = {2022},
+ note = {Publisher: Society for Industrial and Applied Mathematics},
+ pages = {A2599--A2627},
+}
-What's Changed
+@article{bellotti_multidimensional_2022,
+ title = {Multidimensional fully adaptive lattice {Boltzmann} methods with error control based on multiresolution analysis},
+ volume = {471},
+ issn = {0021-9991},
+ url = {https://www.sciencedirect.com/science/article/pii/S0021999122007331},
+ doi = {10.1016/j.jcp.2022.111670},
+ abstract = {Lattice-Boltzmann methods are known for their simplicity, efficiency and ease of parallelization, usually relying on uniform Cartesian meshes with a strong bond between spatial and temporal discretization. This fact complicates the crucial issue of reducing the computational cost and the memory impact by automatically coarsening the grid where a fine mesh is unnecessary, still ensuring the overall quality of the numerical solution through error control. This work provides a possible answer to this interesting question, by connecting, for the first time, the field of lattice-Boltzmann Methods (LBM) to the adaptive multiresolution (MR) approach based on wavelets. To this end, we employ a MR multi-scale transform to adapt the mesh as the solution evolves in time according to its local regularity. The collision phase is not affected due to its inherent local nature and because we do not modify the speed of the sound, contrarily to most of the LBM/Adaptive Mesh Refinement (AMR) strategies proposed in the literature, thus preserving the original structure of any LBM scheme. Besides, an original use of the MR allows the scheme to resolve the proper physics by efficiently controlling the accuracy of the transport phase. We carefully test our method to conclude on its adaptability to a wide family of existing lattice Boltzmann schemes, treating both hyperbolic and parabolic systems of equations, thus being less problem-dependent than the AMR approaches, which have a hard time guaranteeing an effective control on the error. The ability of the method to yield a very efficient compression rate and thus a computational cost reduction for solutions involving localized structures with loss of regularity is also shown, while guaranteeing a precise control on the approximation error introduced by the spatial adaptation of the grid. The numerical strategy is implemented on a specific open-source platform called SAMURAI with a dedicated data-structure relying on set algebra.},
+ urldate = {2024-10-14},
+ journal = {Journal of Computational Physics},
+ author = {Bellotti, Thomas and Gouarin, Loïc and Graille, Benjamin and Massot, Marc},
+ month = dec,
+ year = {2022},
+ keywords = {Dynamic mesh adaptation, Error control, Hyperbolic systems of conservation laws, Incompressible Navier-Stokes equations, Lattice Boltzmann method, Multiresolution analysis},
+ pages = {111670},
+}
-Exciting New Features 🎉
+@inproceedings{jamond_manta_2022,
+ address = {Giens, France},
+ title = {{MANTA} : un code {HPC} généraliste pour la simulation de problèmes complexes en mécanique},
+ shorttitle = {{MANTA}},
+ url = {https://hal.science/hal-03688160},
+ abstract = {Le code MANTA a l’ambition de permettre la réalisation de simulations complexes en mécanique sur des supercalculateurs actuels et futurs tout en préservant les fondamentaux des codes développés au CEA : adaptabilité au problème posé, robustesse des algorithmes, pérennité des modèles et du code. On expose les principes de développement de ce code de nouvelle génération, et quelques exemples représentatifs de ses capacités actuelles sont également décrits.},
+ urldate = {2024-10-14},
+ booktitle = {{CSMA} 2022 15ème {Colloque} {National} en {Calcul} des {Structures}},
+ author = {Jamond, Olivier and Lelong, Nicolas and Fourmont, Axel and Bluthé, Joffrey and Breuze, Matthieu and Bouda, Pascal and Brooking, Guillaume and Drui, Florence and Epalle, Alexandre and Fandeur, Olivier and Folzan, Gauthier and Helfer, Thomas and Kloss, Francis and Latu, Guillaume and Motte, Antoine and Nahed, Christopher and Picard, Alexis and Prat, Raphael and Ramière, Isabelle and Steins, Morgane and Prabel, Benoit},
+ month = may,
+ year = {2022},
+ keywords = {Code de calcul, Eléments finis, HPC, Implicite - explicite, Mécanique des fluides, Mécanique des structures, Toolbox, Volumes finis},
+}
+@inproceedings{jamond_manta_2024,
+ address = {Giens, France},
+ title = {{MANTA}: an industrial-strength open-source high performance explicit and implicit multi-physics solver},
+ shorttitle = {{MANTA}},
+ url = {https://hal.science/hal-04610968},
+ urldate = {2024-10-14},
+ booktitle = {16ème {Colloque} {National} en {Calcul} de {Structures}},
+ publisher = {CNRS, CSMA, ENS Paris-Saclay, CentraleSupélec},
+ author = {Jamond, Olivier and Lelong, Nicolas and Brooking, Guillaume and Helfer, Thomas and Prabel, Benoit and Prat, Raphael and Jaccon, Adrien},
+ month = may,
+ year = {2024},
+ keywords = {HPC, Industrial applications, PDEs solving, fluid mechanics, multiphysics coupling, structural mechanics},
+}
+@misc{noauthor_16eme_nodate,
+ title = {16ème {Colloque} {National} en {Calcul} de {Structures} - {Sciencesconf}.org},
+ url = {https://csma2024.sciencesconf.org/517460},
+ urldate = {2024-10-14},
+}
-resolve 2231 : Support parts configuration in exporter by @vincentchabannes in https://github.com/feelpp/feelpp/pull/2232
+@phdthesis{daver2016,
+ type = {phd},
+ title = {Reduced basis method applied to large non-linear multi-physics problems: application to high field magnets design},
+ url = {http://www.theses.fr/2016STRAD019},
+ author = {Daversin-Catty, Cécile},
+ year = {2016},
+ note = {2016STRAD019},
+}
-resolves 1489 and 2175: enrich range object and simplify {FunctionSpace} by @prudhomm in https://github.com/feelpp/feelpp/pull/2176
+@phdthesis{Hild2020,
+ type = {phd},
+ title = {Control and optimization of high magnetic fields},
+ url = {http://www.theses.fr/2020STRAD031},
+ author = {Hild, Romain},
+ year = {2020},
+ note = {2020STRAD031},
+}
-resolves 2191 and 2196: cleanup and python wrapper for forms and implement feelpp namespace package by @prudhomm in https://github.com/feelpp/feelpp/pull/2227
+@article{wang_fluid_2016,
+ title = {Fluid and structure coupling analysis of the interaction between aqueous humor and iris},
+ volume = {15},
+ issn = {1475-925X},
+ url = {http://biomedical-engineering-online.biomedcentral.com/articles/10.1186/s12938-016-0261-3},
+ doi = {10.1186/s12938-016-0261-3},
+ language = {en},
+ number = {S2},
+ urldate = {2024-10-14},
+ journal = {BioMedical Engineering OnLine},
+ author = {Wang, Wenjia and Qian, Xiuqing and Song, Hongfang and Zhang, Mindi and Liu, Zhicheng},
+ month = dec,
+ year = {2016},
+ pages = {133},
+}
-resolves 2233: improve hdg toolbox, add new terms by @prudhomm in https://github.com/feelpp/feelpp/pull/2236
+@book{ansorge_programming_2022,
+ edition = {1},
+ title = {Programming in {Parallel} with {CUDA}: {A} {Practical} {Guide}},
+ copyright = {https://www.cambridge.org/core/terms},
+ isbn = {978-1-108-85527-3 978-1-108-47953-0},
+ shorttitle = {Programming in {Parallel} with {CUDA}},
+ url = {https://www.cambridge.org/core/product/identifier/9781108855273/type/book},
+ abstract = {CUDA is now the dominant language used for programming GPUs, one of the most exciting hardware developments of recent decades. With CUDA, you can use a desktop PC for work that would have previously required a large cluster of PCs or access to a HPC facility. As a result, CUDA is increasingly important in scientific and technical computing across the whole STEM community, from medical physics and financial modelling to big data applications and beyond. This unique book on CUDA draws on the author's passion for and long experience of developing and using computers to acquire and analyse scientific data. The result is an innovative text featuring a much richer set of examples than found in any other comparable book on GPU computing. Much attention has been paid to the C++ coding style, which is compact, elegant and efficient. A code base of examples and supporting material is available online, which readers can build on for their own projects.},
+ language = {en},
+ urldate = {2024-10-11},
+ publisher = {Cambridge University Press},
+ author = {Ansorge, Richard},
+ month = may,
+ year = {2022},
+ doi = {10.1017/9781108855273},
+}
-resolves 2259: add script to get feelpp version and improve packaging workflow by @prudhomm in https://github.com/feelpp/feelpp/pull/2260
+@article{noauthor_cuda_nodate,
+ title = {{CUDA} {GRAPHS} in {GROMACS}},
+ language = {en},
+}
+@article{schoonover_mpi_nodate,
+ title = {{MPI}+ {Programming} with {HIP} and {OpenMP}},
+ language = {en},
+ author = {Schoonover, Joe},
+}
-{HPC} Changes
+@article{edvalson_readthedocs-breathe_nodate,
+ title = {{ReadTheDocs}-{Breathe} {Documentation}},
+ language = {en},
+ author = {Edvalson, Thomas},
+}
+@article{maia_rocm_nodate,
+ title = {{ROCm}™ {Library} {Support} \& {Profiling} {Tools}},
+ language = {en},
+ author = {Maia, Julio and Chalmers, Noel and Bauman, Paul T and Curtis, Nicholas and Malaya, Nicholas and McDougall, Damon and van Oostrum, Rene},
+}
+@article{malavally_amd_nodate,
+ title = {{AMD} {HIP} {Programming} {Guide}},
+ language = {en},
+ author = {Malavally, Roopa},
+}
-resolves 2246: fix non blocking mpi communication for large scale communications by @vincentchabannes in https://github.com/feelpp/feelpp/pull/2249
+@article{noauthor_use_nodate,
+ title = {Use {ROCm}™ on {Radeon}™ {GPUs} {Documentation}},
+ language = {en},
+}
+@article{edvalson_readthedocs-breathe_nodate-1,
+ title = {{ReadTheDocs}-{Breathe} {Documentation}},
+ language = {en},
+ author = {Edvalson, Thomas},
+}
-Recent Publications using Feel++
+@article{noauthor_cuda_nodate-1,
+ title = {{CUDA} {C}++ {Programming} {Guide}},
+}
+@article{parks_recycling_2006,
+ title = {Recycling {Krylov} {Subspaces} for {Sequences} of {Linear} {Systems}},
+ volume = {28},
+ issn = {1064-8275, 1095-7197},
+ url = {http://epubs.siam.org/doi/10.1137/040607277},
+ doi = {10.1137/040607277},
+ language = {en},
+ number = {5},
+ urldate = {2024-10-11},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Parks, Michael L. and De Sturler, Eric and Mackey, Greg and Johnson, Duane D. and Maiti, Spandan},
+ month = jan,
+ year = {2006},
+ pages = {1651--1674},
+}
+@article{robbe_exact_2006,
+ title = {Exact and inexact breakdowns in the block {GMRES} method},
+ volume = {419},
+ copyright = {https://www.elsevier.com/tdm/userlicense/1.0/},
+ issn = {00243795},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0024379506002230},
+ doi = {10.1016/j.laa.2006.04.018},
+ language = {en},
+ number = {1},
+ urldate = {2024-10-11},
+ journal = {Linear Algebra and its Applications},
+ author = {Robbé, Mickaël and Sadkane, Miloud},
+ month = nov,
+ year = {2006},
+ pages = {265--285},
+}
-Ktirio Urban Building: A Computational Framework for City Energy Simulations Enhanced by {CI}/{CD} Innovations on {EuroHPC} Systems
+@unpublished{saigre_coupled_2024_paper,
+ type = {In preparation},
+ title = {A coupled fluid-dynamics-heat transfer model for {3D} simulations of the aqueous humor flow in the human eye},
+ abstract = {Understanding human eye behavior involves intricate interactions between physical phenomena such as heat transfer and fluid dynamics. Accurate computational models are vital for comprehending ocular diseases and therapeutic interventions.
+This work focuses on modeling and simulating aqueous humor flow in the anterior and posterior chambers of the eye, coupled with overall heat transfer.
+Aqueous humor dynamics regulates intraocular pressure, which is crucial for understanding conditions like glaucoma.
+Convective effects from temperature disparities also influence this flow.
+Extending prior research, this work develops a comprehensive three-dimensional computational model to simulate a coupled fluid-dynamic-heat transfer model, thus contributing to the understanding of ocular physiology.},
+ author = {Saigre, Thomas and Chabannes, Vincent and Prud'Homme, Christophe and Szopos, Marcela},
+}
-Nonlinear compressive reduced basis approximation for multi-parameter elliptic problem
+@inproceedings{jolivet_block_2016,
+ address = {Salt Lake City, Utah},
+ series = {{SC} '16},
+ title = {Block iterative methods and recycling for improved scalability of linear solvers},
+ isbn = {9781467388153},
+ abstract = {Contemporary large-scale Partial Differential Equation (PDE) simulations usually require the solution of large and sparse linear systems. Moreover, it is often needed to solve these linear systems with different or multiple Right-Hand Sides (RHSs). In this paper, various strategies will be presented to extend the scalability of existing multigrid or domain decomposition linear solvers using appropriate recycling strategies or block methods---i.e., by treating multiple right-hand sides simultaneously. The scalability of this work is assessed by performing simulations on up to 8,192 cores for solving linear systems arising from various physical phenomena modeled by Poisson's equation, the system of linear elasticity, or Maxwell's equation. This work is shipped as part of an open-source software, readily available and usable in any C/C++, Python, or Fortran code. In particular, some simulations are performed on top of a well-established library, PETSc, and it is shown how our approaches can be used to decrease time to solution down by 30\%.},
+ urldate = {2024-10-10},
+ booktitle = {Proceedings of the {International} {Conference} for {High} {Performance} {Computing}, {Networking}, {Storage} and {Analysis}},
+ publisher = {IEEE Press},
+ author = {Jolivet, Pierre and Tournier, Pierre-Henri},
+ month = nov,
+ year = {2016},
+ pages = {1--14},
+}
-2D Axisymmetric Modeling of the {HTS} Insert Nougat in a Background Magnetic Field Generated by Resistive Magnet
+@inproceedings{jolivet_scalable_2013,
+ address = {Denver Colorado},
+ title = {Scalable domain decomposition preconditioners for heterogeneous elliptic problems},
+ isbn = {9781450323789},
+ url = {https://dl.acm.org/doi/10.1145/2503210.2503212},
+ doi = {10.1145/2503210.2503212},
+ language = {en},
+ urldate = {2024-10-10},
+ booktitle = {Proceedings of the {International} {Conference} on {High} {Performance} {Computing}, {Networking}, {Storage} and {Analysis}},
+ publisher = {ACM},
+ author = {Jolivet, Pierre and Hecht, Frédéric and Nataf, Frédéric and Prud'homme, Christophe},
+ month = nov,
+ year = {2013},
+ pages = {1--11},
+}
+@article{jolivet_ksphpddm_2021,
+ title = {{KSPHPDDM} and {PCHPDDM}: {Extending} {PETSc} with advanced {Krylov} methods and robust multilevel overlapping {Schwarz} preconditioners},
+ volume = {84},
+ issn = {08981221},
+ shorttitle = {{KSPHPDDM} and {PCHPDDM}},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0898122121000055},
+ doi = {10.1016/j.camwa.2021.01.003},
+ language = {en},
+ urldate = {2024-10-10},
+ journal = {Computers \& Mathematics with Applications},
+ author = {Jolivet, Pierre and Roman, Jose E. and Zampini, Stefano},
+ month = feb,
+ year = {2021},
+ pages = {277--295},
+}
-Enjoy!
+@article{al_daas_multilevel_2021,
+ title = {A {Multilevel} {Schwarz} {Preconditioner} {Based} on a {Hierarchy} of {Robust} {Coarse} {Spaces}},
+ volume = {43},
+ issn = {1064-8275, 1095-7197},
+ url = {https://epubs.siam.org/doi/10.1137/19M1266964},
+ doi = {10.1137/19M1266964},
+ language = {en},
+ number = {3},
+ urldate = {2024-10-10},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Al Daas, Hussam and Grigori, Laura and Jolivet, Pierre and Tournier, Pierre-Henri},
+ month = jan,
+ year = {2021},
+ pages = {A1907--A1928},
+}
-Full Changelog: https://github.com/feelpp/feelpp/compare/v0.111.0-preview.9...v0.111.0-preview.10},
- version = {v0.111.0-preview.10},
- publisher = {Cemosis},
- author = {Prud'homme, Christophe and Chabannes, Vincent and Saigre, Thomas and Trophime, Christophe and Berti, Luca and Samaké, Abdoulaye and Van Landeghem, Céline and Szopos, Marcela and Giraldi, Laetitia and Bertoluzza, Silvia and Maday, Yvon},
- urldate = {2024-09-04},
- date = {2024-07-15},
- doi = {10.5281/ZENODO.591797},
+@book{bernardi_mathematics_2024,
+ address = {Philadelphia, PA},
+ title = {Mathematics and {Finite} {Element} {Discretizations} of {Incompressible} {Navier}–{Stokes} {Flows}},
+ isbn = {9781611978117 9781611978124},
+ url = {https://epubs.siam.org/doi/book/10.1137/1.9781611978124},
+ language = {en},
+ urldate = {2024-10-09},
+ publisher = {Society for Industrial and Applied Mathematics},
+ author = {Bernardi, Christine and Girault, Vivette and Hecht, Frédéric and Raviart, Pierre-Arnaud and Rivière, Beatrice},
+ month = jan,
+ year = {2024},
+ doi = {10.1137/1.9781611978124},
}
-@misc{balay_petsc_2024,
- title = {{PETSc} Web page},
- url = {https://petsc.org/},
- author = {Balay, Satish and Abhyankar, Shrirang and Adams, Mark F. and Benson, Steven and Brown, Jed and Brune, Peter and Buschelman, Kris and Constantinescu, Emil M. and Dalcin, Lisandro and Dener, Alp and Eijkhout, Victor and Faibussowitsch, Jacob and Gropp, William D. and Hapla, Václav and Isaac, Tobin and Jolivet, Pierre and Karpeev, Dmitry and Kaushik, Dinesh and Knepley, Matthew G. and Kong, Fande and Kruger, Scott and May, Dave A. and {McInnes}, Lois Curfman and Mills, Richard Tran and Mitchell, Lawrence and Munson, Todd and Roman, Jose E. and Rupp, Karl and Sanan, Patrick and Sarich, Jason and Smith, Barry F. and Zampini, Stefano and Zhang, Hong and Zhang, Hong and Zhang, Junchao},
- date = {2024},
+@article{dapogny_geometrical_2018,
+ title = {Geometrical shape optimization in fluid mechanics using {FreeFem}++},
+ volume = {58},
+ issn = {1615-147X, 1615-1488},
+ url = {http://link.springer.com/10.1007/s00158-018-2023-2},
+ doi = {10.1007/s00158-018-2023-2},
+ language = {en},
+ number = {6},
+ urldate = {2024-10-09},
+ journal = {Structural and Multidisciplinary Optimization},
+ author = {Dapogny, Charles and Frey, Pascal and Omnès, Florian and Privat, Yannick},
+ month = dec,
+ year = {2018},
+ pages = {2761--2788},
}
-@report{balay_petsctao_2024,
- title = {{PETSc}/{TAO} Users Manual},
- number = {{ANL}-21/39 - Revision 3.21},
- institution = {Argonne National Laboratory},
- author = {Balay, Satish and Abhyankar, Shrirang and Adams, Mark F. and Benson, Steven and Brown, Jed and Brune, Peter and Buschelman, Kris and Constantinescu, Emil and Dalcin, Lisandro and Dener, Alp and Eijkhout, Victor and Faibussowitsch, Jacob and Gropp, William D. and Hapla, Václav and Isaac, Tobin and Jolivet, Pierre and Karpeev, Dmitry and Kaushik, Dinesh and Knepley, Matthew G. and Kong, Fande and Kruger, Scott and May, Dave A. and {McInnes}, Lois Curfman and Mills, Richard Tran and Mitchell, Lawrence and Munson, Todd and Roman, Jose E. and Rupp, Karl and Sanan, Patrick and Sarich, Jason and Smith, Barry F. and Zampini, Stefano and Zhang, Hong and Zhang, Hong and Zhang, Junchao},
- date = {2024},
- doi = {10.2172/2205494},
+@article{zhu_89-line_2021,
+ title = {An 89-line code for geometrically nonlinear topology optimization written in {FreeFEM}},
+ volume = {63},
+ issn = {1615-147X, 1615-1488},
+ url = {https://link.springer.com/10.1007/s00158-020-02733-x},
+ doi = {10.1007/s00158-020-02733-x},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-09},
+ journal = {Structural and Multidisciplinary Optimization},
+ author = {Zhu, Benliang and Zhang, Xianmin and Li, Hai and Liang, Junwen and Wang, Rixin and Li, Hao and Nishiwaki, Shinji},
+ month = feb,
+ year = {2021},
+ pages = {1015--1027},
}
-@article{dalcin_parallel_2011,
- title = {Parallel distributed computing using Python},
- volume = {34},
- issn = {0309-1708},
- doi = {10.1016/j.advwatres.2011.04.013},
- pages = {1124 -- 1139},
- number = {9},
- journaltitle = {Advances in Water Resources},
- author = {Dalcin, Lisandro D. and Paz, Rodrigo R. and Kler, Pablo A. and Cosimo, Alejandro},
- date = {2011},
+@article{sadaka_finite_2024,
+ title = {A finite element toolbox for the {Bogoliubov}-de {Gennes} stability analysis of {Bose}-{Einstein} condensates},
+ volume = {294},
+ issn = {00104655},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S001046552300293X},
+ doi = {10.1016/j.cpc.2023.108948},
+ language = {en},
+ urldate = {2024-10-09},
+ journal = {Computer Physics Communications},
+ author = {Sadaka, Georges and Kalt, Victor and Danaila, Ionut and Hecht, Frédéric},
+ month = jan,
+ year = {2024},
+ pages = {108948},
}
-@article{zhang_petscsf_2022,
- title = {The {PetscSF} Scalable Communication Layer},
- volume = {33},
- pages = {842--853},
- number = {4},
- journaltitle = {{IEEE} Transactions on Parallel and Distributed Systems},
- author = {Zhang, Junchao and Brown, Jed and Balay, Satish and Faibussowitsch, Jacob and Knepley, Matthew and Marin, Oana and Mills, Richard Tran and Munson, Todd and Smith, Barry F. and Zampini, Stefano},
- date = {2022},
+@article{golse_radiative_2023,
+ title = {Radiative transfer for variable three-dimensional atmospheres},
+ volume = {475},
+ issn = {00219991},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0021999122009275},
+ doi = {10.1016/j.jcp.2022.111864},
+ language = {en},
+ urldate = {2024-10-09},
+ journal = {Journal of Computational Physics},
+ author = {Golse, F. and Hecht, F. and Pironneau, O. and Smets, D. and Tournier, P.-H.},
+ month = feb,
+ year = {2023},
+ pages = {111864},
}
-@inproceedings{balay_efficient_1997,
- title = {Efficient Management of Parallelism in Object Oriented Numerical Software Libraries},
- pages = {163--202},
- booktitle = {Modern Software Tools in Scientific Computing},
- publisher = {Birkhäuser Press},
- author = {Balay, Satish and Gropp, William D. and {McInnes}, Lois Curfman and Smith, Barry F.},
- editor = {Arge, E. and Bruaset, A. M. and Langtangen, H. P.},
- date = {1997},
+@article{li_three-dimensional_2022,
+ title = {Three-dimensional topology optimization of a fluid–structure system using body-fitted mesh adaption based on the level-set method},
+ volume = {101},
+ issn = {0307904X},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0307904X21003966},
+ doi = {10.1016/j.apm.2021.08.021},
+ language = {en},
+ urldate = {2024-10-09},
+ journal = {Applied Mathematical Modelling},
+ author = {Li, Hao and Kondoh, Tsuguo and Jolivet, Pierre and Furuta, Kozo and Yamada, Takayuki and Zhu, Benliang and Izui, Kazuhiro and Nishiwaki, Shinji},
+ month = jan,
+ year = {2022},
+ pages = {276--308},
}
-@article{ballout_nonlinear_2024,
- title = {Nonlinear compressive reduced basis approximation for multi-parameter elliptic problem},
- rights = {Creative Commons Attribution 4.0 International},
- url = {https://zenodo.org/doi/10.5281/zenodo.13336083},
- doi = {10.5281/ZENODO.13336083},
- abstract = {What's Changed
+@article{nataf_geneo_2024,
+ title = {A {GenEO} {Domain} {Decomposition} method for {Saddle} {Point} problems},
+ volume = {351},
+ issn = {1873-7234},
+ url = {https://comptes-rendus.academie-sciences.fr/mecanique/articles/10.5802/crmeca.175/},
+ doi = {10.5802/crmeca.175},
+ language = {en},
+ number = {S1},
+ urldate = {2024-10-09},
+ journal = {Comptes Rendus. Mécanique},
+ author = {Nataf, Frédéric and Tournier, Pierre-Henri},
+ month = apr,
+ year = {2024},
+ pages = {667--684},
+}
+@article{tournier_three-dimensional_2022,
+ title = {Three-dimensional finite-difference finite-element frequency-domain wave simulation with multi-level optimized additive {Schwarz} domain-decomposition preconditioner: {A} tool for {FWI} of sparse node datasets},
+ issn = {0016-8033, 1942-2156},
+ shorttitle = {Three-dimensional finite-difference finite-element frequency-domain wave simulation with multi-level optimized additive {Schwarz} domain-decomposition preconditioner},
+ url = {https://library.seg.org/doi/10.1190/geo2021-0702.1},
+ doi = {10.1190/geo2021-0702.1},
+ abstract = {Efficient frequency-domain full-waveform inversion (FWI) of long-offset node data can be designed with a few discrete frequencies, which lead to modest data volumes to be managed during the inversion process. Moreover, attenuation effects can be straightforwardly implemented in the forward problem without the computational overhead. However, 3D frequency-domain seismic modeling is challenging because it requires solving a large and sparse linear indefinite system for each frequency with multiple right-hand sides (RHSs). This linear system can be solved by direct or iterative methods. The former allows efficient processing of multiple RHSs but may suffer from limited scalability for very large problems. Iterative methods equipped with a domain-decomposition preconditioner provide a suitable alternative to process large computational domains for sparse-node acquisition. We have investigated the domain-decomposition preconditioner based on the optimized restricted additive Schwarz (ORAS) method, in which a Robin or perfectly matched layer condition is implemented at the boundaries between the subdomains. The preconditioned system is solved by a Krylov subspace method, whereas a block low-rank lower-upper decomposition of the local matrices is performed at a preprocessing stage. Multiple sources are processed in groups with a pseudoblock method. The accuracy, the computational cost, and the scalability of the ORAS solver are assessed against several realistic benchmarks. In terms of discretization, we compare a compact wavelength-adaptive 27-point finite-difference stencil on a regular Cartesian grid with a P3
+ finite-element method on h-adaptive tetrahedral mesh. Although both schemes have comparable accuracy, the former is more computationally efficient, the latter being beneficial to comply with known boundaries such as bathymetry. The scalability of the method, the block processing of multiple RHSs, and the straightforward implementation of attenuation, which further improves the convergence of the iterative solver, make the method a versatile forward engine for large-scale 3D FWI applications from sparse node data sets.},
+ language = {en},
+ urldate = {2024-10-09},
+ journal = {GEOPHYSICS},
+ author = {Tournier, Pierre-Henri and Jolivet, Pierre and Dolean, Victorita and Aghamiry, Hossein S. and Operto, Stéphane and Riffo, Sebastian},
+ month = jul,
+ year = {2022},
+ pages = {1--84},
+}
+@article{tournier_numerical_2017,
+ title = {Numerical {Modeling} and {High}-{Speed} {Parallel} {Computing}: {New} {Perspectives} on {Tomographic} {Microwave} {Imaging} for {Brain} {Stroke} {Detection} and {Monitoring}},
+ volume = {59},
+ issn = {1558-4143},
+ shorttitle = {Numerical {Modeling} and {High}-{Speed} {Parallel} {Computing}},
+ url = {https://ieeexplore.ieee.org/abstract/document/8014422},
+ doi = {10.1109/MAP.2017.2731199},
+ abstract = {This article deals with microwave tomography for brain stroke imaging using state-of-the-art numerical modeling and massively parallel computing. Iterative microwave tomographic imaging requires the solution of an inverse problem based on a minimization algorithm (e.g., gradient based) with successive solutions of a direct problem such as the accurate modeling of a whole-microwave measurement system. Moreover, a sufficiently high number of unknowns is required to accurately represent the solution. As the system will be used for detecting a brain stroke (ischemic or hemorrhagic) as well as for monitoring during the treatment, the running times for the reconstructions should be reasonable. The method used is based on high-order finite elements, parallel preconditioners from the domain decomposition method and domain-specific language with the opensource FreeFEM++ solver.},
+ number = {5},
+ urldate = {2024-10-09},
+ journal = {IEEE Antennas and Propagation Magazine},
+ author = {Tournier, Pierre-Henri and Bonazzoli, Marcella and Dolean, Victorita and Rapetti, Francesca and Hecht, Frederic and Nataf, Frederic and Aliferis, Iannis and El Kanfoud, Ibtissam and Migliaccio, Claire and de Buhan, Maya and Darbas, Marion and Semenov, Serguei and Pichot, Christian},
+ month = oct,
+ year = {2017},
+ keywords = {Antenna measurements, Boundary conditions, Brain modeling, Computational modeling, Finite element analysis, Tomography},
+ pages = {98--110},
+}
-add citation file by @prudhomm in https://github.com/feelpp/article.nl-c-rbm/pull/4
+@article{sadaka_parallel_2020,
+ title = {Parallel finite-element codes for the simulation of two-dimensional and three-dimensional solid–liquid phase-change systems with natural convection},
+ volume = {257},
+ issn = {00104655},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0010465520302319},
+ doi = {10.1016/j.cpc.2020.107492},
+ language = {en},
+ urldate = {2024-10-09},
+ journal = {Computer Physics Communications},
+ author = {Sadaka, Georges and Rakotondrandisa, Aina and Tournier, Pierre-Henri and Luddens, Francky and Lothodé, Corentin and Danaila, Ionut},
+ month = dec,
+ year = {2020},
+ pages = {107492},
+}
+@book{dolean_introduction_2015,
+ address = {Philadelphia, PA},
+ title = {An {Introduction} to {Domain} {Decomposition} {Methods}: {Algorithms}, {Theory}, and {Parallel} {Implementation}},
+ isbn = {9781611974058 9781611974065},
+ shorttitle = {An {Introduction} to {Domain} {Decomposition} {Methods}},
+ url = {http://epubs.siam.org/doi/book/10.1137/1.9781611974065},
+ language = {en},
+ urldate = {2024-10-09},
+ publisher = {Society for Industrial and Applied Mathematics},
+ author = {Dolean, Victorita and Jolivet, Pierre and Nataf, Frédéric},
+ month = nov,
+ year = {2015},
+ doi = {10.1137/1.9781611974065},
+}
-Full Changelog: https://github.com/feelpp/article.nl-c-rbm/compare/v1.0.1...v1.1.0},
- author = {Ballout, Hassan and Maday, Yvon and Prud'homme, Christophe},
- urldate = {2024-09-04},
- date = {2024-08-17},
- note = {Publisher: Zenodo
-Version Number: v1.1.0},
+@book{hecht_pde-constrained_2024,
+ title = {{PDE}-constrained optimization within {FreeFEM}},
+ url = {https://hal.science/hal-04724788},
+ abstract = {This book is aimed at students and researchers who want to learn how to efficiently solve constrained optimization problems involving partial differential equations (PDE) using the FreeFEM software.},
+ urldate = {2024-10-09},
+ author = {Hecht, Frédéric and Lance, Gontran and Trélat, Emmanuel},
+ year = {2024},
}
-@software{karakasis_reframe-hpcreframe_2024,
- title = {reframe-hpc/reframe: {ReFrame} 4.6.0},
- url = {https://doi.org/10.5281/zenodo.11002528},
- version = {v4.6.0},
- publisher = {Zenodo},
- author = {Karakasis, Vasileios and Manitaras, Theofilos and Otero, Javier and Koutsaniti, Eirini and {jgp} and {rsarm} and Bignamini, Christopher and {victorusu} and Jocksch, Andreas and {kraushm} and {lucamar} and Keller, Sebastian and Omlin, Samuel and Kliavinek, Sergei and Mendonça, Henrique and Giordano, Mosè and {MarkLTurner} and {GiuseppeLoRe} and Grassano, Davide and Boissonneault, Maxime and Leak, Steve and Paipuri, Mahendra and {jfavre} and {Vanessasaurus} and Morrison, Jack and Moors, Sam and You, Zhi-Qiang and Sandgren, Ake and {brandon-biggs}},
- date = {2024-04},
- doi = {10.5281/zenodo.11002528},
+@inproceedings{saigre_coupled_2024_abstract,
+ address = {Arlington (Virginia), United States},
+ title = {A coupled fluid-dynamics-heat transfer model for {3D} simulations of the aqueous humor flow in the human eye},
+ url = {https://hal.science/hal-04558924},
+ booktitle = {{CMBE24}},
+ author = {Saigre, Thomas and Prud'Homme, Christophe and Szopos, Marcela and Chabannes, Vincent},
+ month = jun,
+ year = {2024},
+ keywords = {Thermo-fluid dynamics, finite element method, mathematical and computational ophthalmology},
}
-@book{slurm_development_team_slurm_2024,
- title = {{SLURM} Workload Manager},
- url = {https://slurm.schedmd.com/documentation.html},
- author = {{SLURM Development Team}},
- date = {2024},
+@article{giraud_block_2022,
+ title = {A {Block} {Minimum} {Residual} {Norm} {Subspace} {Solver} with {Partial} {Convergence} {Management} for {Sequences} of {Linear} {Systems}},
+ volume = {43},
+ issn = {0895-4798, 1095-7162},
+ url = {https://epubs.siam.org/doi/10.1137/21M1401127},
+ doi = {10.1137/21M1401127},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-09},
+ journal = {SIAM Journal on Matrix Analysis and Applications},
+ author = {Giraud, Luc and Jing, Yan-Fei and Xiang, Yanfei},
+ month = jun,
+ year = {2022},
+ pages = {710--739},
}
-@misc{apptainer_contributors_apptainer_2024,
- title = {Apptainer User Documentation},
- url = {https://apptainer.org/docs},
- author = {{Apptainer Contributors}},
- date = {2024},
+@article{agullo_robust_2019,
+ title = {Robust {Preconditioners} via {Generalized} {Eigenproblems} for {Hybrid} {Sparse} {Linear} {Solvers}},
+ volume = {40},
+ issn = {0895-4798, 1095-7162},
+ url = {https://epubs.siam.org/doi/10.1137/17M1153765},
+ doi = {10.1137/17M1153765},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-09},
+ journal = {SIAM Journal on Matrix Analysis and Applications},
+ author = {Agullo, Emmanuel and Giraud, Luc and Poirel, Louis},
+ month = jan,
+ year = {2019},
+ pages = {417--439},
}
-@incollection{baudin_openturns_2016,
- location = {Cham},
- title = {{OpenTURNS}: An Industrial Software for Uncertainty Quantification in Simulation},
- isbn = {978-3-319-11259-6},
- url = {https://doi.org/10.1007/978-3-319-11259-6_64-1},
- pages = {1--38},
- booktitle = {Handbook of Uncertainty Quantification},
- publisher = {Springer International Publishing},
- author = {Baudin, Michaël and Dutfoy, Anne and Iooss, Bertrand and Popelin, Anne-Laure},
- editor = {Ghanem, Roger and Higdon, David and Owhadi, Houman},
- date = {2016},
- doi = {10.1007/978-3-319-11259-6_64-1},
+@article{agullo_resiliency_2022,
+ title = {Resiliency in numerical algorithm design for extreme scale simulations},
+ volume = {36},
+ issn = {1094-3420, 1741-2846},
+ url = {https://journals.sagepub.com/doi/10.1177/10943420211055188},
+ doi = {10.1177/10943420211055188},
+ abstract = {This work is based on the seminar titled ‘Resiliency in Numerical Algorithm Design for Extreme Scale Simulations’ held March 1–6, 2020, at Schloss Dagstuhl, that was attended by all the authors. Advanced supercomputing is characterized by very high computation speeds at the cost of involving an enormous amount of resources and costs. A typical large-scale computation running for 48 h on a system consuming 20 MW, as predicted for exascale systems, would consume a million kWh, corresponding to about 100k Euro in energy cost for executing 10$^{23}$
+ floating-point operations. It is clearly unacceptable to lose the whole computation if any of the several million parallel processes fails during the execution. Moreover, if a single operation suffers from a bit-flip error, should the whole computation be declared invalid? What about the notion of reproducibility itself: should this core paradigm of science be revised and refined for results that are obtained by large-scale simulation? Naive versions of conventional resilience techniques will not scale to the exascale regime: with a main memory footprint of tens of Petabytes, synchronously writing checkpoint data all the way to background storage at frequent intervals will create intolerable overheads in runtime and energy consumption. Forecasts show that the mean time between failures could be lower than the time to recover from such a checkpoint, so that large calculations at scale might not make any progress if robust alternatives are not investigated. More advanced resilience techniques must be devised. The key may lie in exploiting both advanced system features as well as specific application knowledge. Research will face two essential questions: (1) what are the reliability requirements for a particular computation and (2) how do we best design the algorithms and software to meet these requirements? While the analysis of use cases can help understand the particular reliability requirements, the construction of remedies is currently wide open. One avenue would be to refine and improve on system- or application-level checkpointing and rollback strategies in the case an error is detected. Developers might use fault notification interfaces and flexible runtime systems to respond to node failures in an application-dependent fashion. Novel numerical algorithms or more stochastic computational approaches may be required to meet accuracy requirements in the face of undetectable soft errors. These ideas constituted an essential topic of the seminar. The goal of this Dagstuhl Seminar was to bring together a diverse group of scientists with expertise in exascale computing to discuss novel ways to make applications resilient against detected and undetected faults. In particular, participants explored the role that algorithms and applications play in the holistic approach needed to tackle this challenge. This article gathers a broad range of perspectives on the role of algorithms, applications and systems in achieving resilience for extreme scale simulations. The ultimate goal is to spark novel ideas and encourage the development of concrete solutions for achieving such resilience holistically.},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-09},
+ journal = {The International Journal of High Performance Computing Applications},
+ author = {Agullo, Emmanuel and Altenbernd, Mirco and Anzt, Hartwig and Bautista-Gomez, Leonardo and Benacchio, Tommaso and Bonaventura, Luca and Bungartz, Hans-Joachim and Chatterjee, Sanjay and Ciorba, Florina M and DeBardeleben, Nathan and Drzisga, Daniel and Eibl, Sebastian and Engelmann, Christian and Gansterer, Wilfried N and Giraud, Luc and Göddeke, Dominik and Heisig, Marco and Jézéquel, Fabienne and Kohl, Nils and Li, Xiaoye Sherry and Lion, Romain and Mehl, Miriam and Mycek, Paul and Obersteiner, Michael and Quintana-Ortí, Enrique S and Rizzi, Francesco and Rüde, Ulrich and Schulz, Martin and Fung, Fred and Speck, Robert and Stals, Linda and Teranishi, Keita and Thibault, Samuel and Thönnes, Dominik and Wagner, Andreas and Wohlmuth, Barbara},
+ month = mar,
+ year = {2022},
+ pages = {251--285},
}
-@incollection{baudin_openturns_2016-1,
- location = {Cham},
- title = {{OpenTURNS}: An Industrial Software for Uncertainty Quantification in Simulation},
- isbn = {978-3-319-11259-6},
- url = {https://doi.org/10.1007/978-3-319-11259-6_64-1},
- pages = {1--38},
- booktitle = {Handbook of Uncertainty Quantification},
- publisher = {Springer International Publishing},
- author = {Baudin, Michaël and Dutfoy, Anne and Iooss, Bertrand and Popelin, Anne-Laure},
- editor = {Ghanem, Roger and Higdon, David and Owhadi, Houman},
- date = {2016},
- doi = {10.1007/978-3-319-11259-6_64-1},
+@article{agullo_soft_2020,
+ title = {On {Soft} {Errors} in the {Conjugate} {Gradient} {Method}: {Sensitivity} and {Robust} {Numerical} {Detection}},
+ volume = {42},
+ issn = {1064-8275, 1095-7197},
+ shorttitle = {On {Soft} {Errors} in the {Conjugate} {Gradient} {Method}},
+ url = {https://epubs.siam.org/doi/10.1137/18M122858X},
+ doi = {10.1137/18M122858X},
+ language = {en},
+ number = {6},
+ urldate = {2024-10-09},
+ journal = {SIAM Journal on Scientific Computing},
+ author = {Agullo, Emmanuel and Cools, Siegfried and Yetkin, Emrullah Fatih and Giraud, Luc and Schenkels, Nick and Vanroose, Wim},
+ month = jan,
+ year = {2020},
+ pages = {C335--C358},
}
-@article{faucher_hawen_2021,
- title = {hawen: time-harmonic wave modeling and inversion using hybridizable discontinuous Galerkin discretization},
- volume = {6},
- rights = {http://creativecommons.org/licenses/by/4.0/},
- issn = {2475-9066},
- url = {https://joss.theoj.org/papers/10.21105/joss.02699},
- doi = {10.21105/joss.02699},
- shorttitle = {hawen},
- pages = {2699},
- number = {57},
- journaltitle = {Journal of Open Source Software},
- shortjournal = {{JOSS}},
- author = {Faucher, Florian},
- urldate = {2024-09-05},
- date = {2021-01-24},
- file = {Full Text:files/1144/Faucher - 2021 - hawen time-harmonic wave modeling and inversion u.pdf:application/pdf},
+@article{pham_assembling_2024,
+ title = {Assembling algorithm for {Green}'s tensors and absorbing boundary conditions for {Galbrun}'s equation in radial symmetry},
+ volume = {519},
+ issn = {0021-9991},
+ url = {https://www.sciencedirect.com/science/article/pii/S0021999124006922},
+ doi = {10.1016/j.jcp.2024.113444},
+ abstract = {Solar oscillations can be modeled by Galbrun's equation which describes Lagrangian wave displacement in a self-gravitating stratified medium. For spherically symmetric backgrounds, we construct an algorithm to compute efficiently and accurately the coefficients of the Green's tensor of the time-harmonic equation in vector spherical harmonic basis. With only two resolutions, our algorithm provides values of the kernels for all heights of source and receiver, and prescribes analytically the singularities of the kernels. We also derive absorbing boundary conditions (ABC) to model wave propagation in the atmosphere above the cut-off frequency. The construction of ABC, which contains varying gravity terms, is rendered difficult by the complex behavior of the solar potential in low atmosphere and for frequencies below the Lamb frequency. We carry out extensive numerical investigations to compare and evaluate the efficiency of the ABCs in capturing outgoing solutions. Finally, as an application towards helioseismology, we compute synthetic solar power spectra that contain pressure modes as well as internal-gravity (g-) and surface-gravity (f-) ridges which are missing in simpler approximations of the wave equation. For purpose of validation, the locations of the ridges in the synthetic power spectra are compared with observed solar modes.},
+ urldate = {2024-10-09},
+ journal = {Journal of Computational Physics},
+ author = {Pham, Ha and Faucher, Florian and Fournier, Damien and Barucq, Hélène and Gizon, Laurent},
+ month = dec,
+ year = {2024},
+ pages = {113444},
}
-@report{adams_dakota_2022,
- title = {Dakota, A Multilevel Parallel Object-Oriented Framework for Design Optimization, Parameter Estimation, Uncertainty Quantification, and Sensitivity Analysis: Version 6.16 User’s Manual},
- number = {{SAND}2022-6171},
- institution = {Sandia National Laboratories},
- author = {Adams, B. M. and Bohnhoff, W. J. and Dalbey, K. R. and Ebeida, M. S. and Eddy, J. P. and Eldred, M. S. and Hooper, R. W. and Hough, P. D. and Hu, K. T. and Jakeman, J. D. and Khalil, M. and Maupin, K. A. and Monschke, J. A. and Ridgway, E. M. and Rushdi, A. A. and Seidl, D. T. and Stephens, J. A. and Swiler, L. P. and Winokur, J. G.},
- date = {2022-05},
+@book{elman_finite_2014,
+ address = {Oxford},
+ edition = {2nd ed.},
+ series = {Numerical mathematics and scientific computation},
+ title = {Finite elements and fast iterative solvers: with applications in incompressible fluid dynamics},
+ isbn = {978-0-19-967879-2},
+ shorttitle = {Finite elements and fast iterative solvers},
+ language = {eng},
+ publisher = {Oxford Univ. Press},
+ author = {Elman, Howard C. and Silvester, David J. and Wathen, Andrew J.},
+ year = {2014},
}
-@article{vallet_toward_2022,
- title = {Toward practical transparent verifiable and long-term reproducible research using Guix},
- volume = {9},
- issn = {2052-4463},
- url = {https://www.nature.com/articles/s41597-022-01720-9},
- doi = {10.1038/s41597-022-01720-9},
- abstract = {Abstract
- Reproducibility crisis urge scientists to promote transparency which allows peers to draw same conclusions after performing identical steps from hypothesis to results. Growing resources are developed to open the access to methods, data and source codes. Still, the computational environment, an interface between data and source code running analyses, is not addressed. Environments are usually described with software and library names associated with version labels or provided as an opaque container image. This is not enough to describe the complexity of the dependencies on which they rely to operate on. We describe this issue and illustrate how open tools like Guix can be used by any scientist to share their environment and allow peers to reproduce it. Some steps of research might not be fully reproducible, but at least, transparency for computation is technically addressable. These tools should be considered by scientists willing to promote transparency and open science.},
- pages = {597},
+@article{prudhomme_reliable_2002,
+ title = {Reliable {Real}-{Time} {Solution} of {Parametrized} {Partial} {Differential} {Equations}: {Reduced}-{Basis} {Output} {Bound} {Methods}},
+ volume = {124},
+ issn = {0098-2202, 1528-901X},
+ shorttitle = {Reliable {Real}-{Time} {Solution} of {Parametrized} {Partial} {Differential} {Equations}},
+ url = {https://asmedigitalcollection.asme.org/fluidsengineering/article/124/1/70/462808/Reliable-RealTime-Solution-of-Parametrized-Partial},
+ doi = {10.1115/1.1448332},
+ abstract = {We present a technique for the rapid and reliable prediction of linear-functional outputs of elliptic (and parabolic) partial differential equations with affine parameter dependence. The essential components are (i) (provably) rapidly convergent global reduced-basis approximations—Galerkin projection onto a space WN spanned by solutions of the governing partial differential equation at N selected points in parameter space; (ii) a posteriori error estimation—relaxations of the error-residual equation that provide inexpensive yet sharp and rigorous bounds for the error in the outputs of interest; and (iii) off-line/on-line computational procedures methods which decouple the generation and projection stages of the approximation process. The operation count for the on-line stage in which, given a new parameter value, we calculate the output of interest and associated error bound, depends only on N (typically very small) and the parametric complexity of the problem; the method is thus ideally suited for the repeated and rapid evaluations required in the context of parameter estimation, design, optimization, and real-time control.},
+ language = {en},
number = {1},
- journaltitle = {Scientific Data},
- shortjournal = {Sci Data},
- author = {Vallet, Nicolas and Michonneau, David and Tournier, Simon},
- urldate = {2024-09-05},
- date = {2022-10-04},
- langid = {english},
- file = {Full Text:files/1150/Vallet et al. - 2022 - Toward practical transparent verifiable and long-t.pdf:application/pdf},
+ journal = {Journal of Fluids Engineering},
+ author = {Prud’homme, C. and Rovas, D. V. and Veroy, K. and Machiels, L. and Maday, Y. and Patera, A. T. and Turinici, G.},
+ month = mar,
+ year = {2002},
+ pages = {70--80},
}
-@inproceedings{gamblin_spack_2015,
- location = {Austin Texas},
- title = {The Spack package manager: bringing order to {HPC} software chaos},
- isbn = {978-1-4503-3723-6},
- url = {https://dl.acm.org/doi/10.1145/2807591.2807623},
- doi = {10.1145/2807591.2807623},
- shorttitle = {The Spack package manager},
- eventtitle = {{SC}15: The International Conference for High Performance Computing, Networking, Storage and Analysis},
- pages = {1--12},
- booktitle = {Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
- publisher = {{ACM}},
- author = {Gamblin, Todd and {LeGendre}, Matthew and Collette, Michael R. and Lee, Gregory L. and Moody, Adam and De Supinski, Bronis R. and Futral, Scott},
- urldate = {2024-09-05},
- date = {2015-11-15},
- langid = {english},
+@article{Virieux2009,
+ title = {An overview of full-waveform inversion in exploration geophysics},
+ volume = {74},
+ doi = {10.1190/1.3238367},
+ number = {6},
+ journal = {Geophysics},
+ author = {Virieux, Jean and Operto, Stéphane},
+ year = {2009},
+ note = {Publisher: Society of Exploration Geophysicists},
+ pages = {WCC1--WCC26},
}
-@article{hecht_new_2012,
- title = {New development in {FreeFem}++},
- volume = {20},
- issn = {1570-2820},
- url = {https://freefem.org/},
- pages = {251--265},
- number = {3},
- journaltitle = {Journal of Numerical Mathematics},
- shortjournal = {J. Numer. Math.},
- author = {Hecht, F.},
- date = {2012},
- mrnumber = {3043640},
+@article{Pham2020Siam,
+ title = {Efficient and accurate algorithm for the full modal {Green}'s kernel of the scalar wave equation in helioseismology},
+ volume = {80},
+ doi = {10.1137/20M1336709},
+ number = {6},
+ journal = {SIAM Journal on Applied Mathematics},
+ author = {Barucq, Hélène and Faucher, Florian and Fournier, Damien and Gizon, Laurent and Pham, Ha},
+ year = {2020},
+ pages = {2657--2683},
}
-@incollection{alliez_3d_2024,
- edition = {5.6.1},
- title = {3D Alpha Wrapping},
- url = {https://doc.cgal.org/5.6.1/Manual/packages.html#PkgAlphaWrap3},
- booktitle = {{CGAL} User and Reference Manual},
- publisher = {{CGAL} Editorial Board},
- author = {Alliez, Pierre and Cohen-Steiner, David and Hemmer, Michael and Portaneri, Cédric and Rouxel-Labbé, Mael},
- date = {2024},
- keywords = {cgal},
+@article{Pham2019radiationBC,
+ title = {Outgoing solutions and radiation boundary conditions for the ideal atmospheric scalar wave equation in helioseismology},
+ volume = {54},
+ doi = {10.1051/m2an/2019088},
+ number = {4},
+ journal = {ESAIM: Mathematical Modelling and Numerical Analysis},
+ author = {Barucq, Hélène and Faucher, Florian and Pham, Ha},
+ year = {2020},
+ pages = {1111--1138},
}
-@book{the_cgal_project_cgal_2024,
- edition = {5.6.1},
- title = {{CGAL} User and Reference Manual},
- url = {https://doc.cgal.org/5.6.1/Manual/packages.html},
- publisher = {{CGAL} Editorial Board},
- author = {{The CGAL Project}},
- date = {2024},
- keywords = {cgal},
+@article{Liu2024,
+ title = {{WaveBench}: {Benchmarking} data-driven solvers for linear wave propagation {PDEs}},
+ issn = {2835-8856},
+ url = {https://openreview.net/forum?id=6wpInwnzs8},
+ journal = {Transactions on Machine Learning Research},
+ author = {Liu, Tianlin and Benitez, Jose Antonio Lara and Faucher, Florian and Khorashadizadeh, AmirEhsan and de Hoop, Maarten V. and Dokmanić, Ivan},
+ year = {2024},
}
-@misc{belieres--frendo_volume-preserving_2024,
- title = {Volume-preserving geometric shape optimization of the Dirichlet energy using variational neural networks},
- url = {http://arxiv.org/abs/2407.19064},
- abstract = {In this work, we explore the numerical solution of geometric shape optimization problems using neural network-based approaches. This involves minimizing a numerical criterion that includes solving a partial differential equation with respect to a domain, often under geometric constraints like constant volume. Our goal is to develop a proof of concept using a flexible and parallelizable methodology to tackle these problems. We focus on a prototypal problem: minimizing the so-called Dirichlet energy with respect to the domain under a volume constraint, involving a Poisson equation in \${\textbackslash}mathbb R{\textasciicircum}2\$. We use physics-informed neural networks ({PINN}) to approximate the Poisson equation's solution on a given domain and represent the shape through a neural network that approximates a volume-preserving transformation from an initial shape to an optimal one. These processes are combined in a single optimization algorithm that minimizes the Dirichlet energy. One of the significant advantages of this approach is its parallelizable nature, which makes it easy to handle the addition of parameters. Additionally, it does not rely on shape derivative or adjoint calculations. Our approach is tested on Dirichlet and Robin boundary conditions, parametric right-hand sides, and extended to Bernoulli-type free boundary problems. The source code for solving the shape optimization problem is open-source and freely available.},
- number = {{arXiv}:2407.19064},
- publisher = {{arXiv}},
- author = {Bélières--Frendo, Amaury and Franck, Emmanuel and Michel-Dansac, Victor and Privat, Yannick},
- urldate = {2024-09-17},
- date = {2024-08-10},
- eprinttype = {arxiv},
- eprint = {2407.19064 [cs, math]},
- keywords = {Mathematics - Numerical Analysis, Mathematics - Optimization and Control},
+@article{Benitez2024,
+ title = {Out-of-distributional risk bounds for neural operators with applications to the {Helmholtz} equation},
+ doi = {10.1016/j.jcp.2024.113168},
+ journal = {Journal of Computational Physics},
+ author = {Benitez, Jose Antonio Lara and Furuya, Takashi and Faucher, Florian and Kratsios, Anastasis and Tricoche, Xavier and de Hoop, Maarten V},
+ year = {2024},
+ note = {Publisher: Elsevier},
+ pages = {113168},
}
-@unpublished{prudhomme_ktirio_2024,
- title = {Ktirio Urban Building: A Computational Framework for City Energy Simulations Enhanced by {CI}/{CD} Innovations on {EuroHPC} Systems},
- url = {https://hal.science/hal-04590586},
- author = {Prud'Homme, Christophe and Chabannes, Vincent and Berti, Luca and Maslek, Maryam and Pincon, Philippe and Cladellas, Javier and Diallo, Abdoulaye},
- date = {2024-05},
- keywords = {City Energy Simulation, {HPC}, {HPC} {HPCOps} Urban building City Energy Simulation, {HPCOps}, Urban building},
+@article{Faucher2023viscoacoustic,
+ title = {Quantitative inverse problem in visco-acoustic media under attenuation model uncertainty},
+ volume = {472},
+ doi = {10.1016/j.jcp.2022.111685},
+ journal = {Journal of Computational Physics},
+ author = {Faucher, Florian and Scherzer, Otmar},
+ year = {2023},
+ note = {Publisher: Elsevier},
+ pages = {111685},
}
-@unpublished{mary_error_2024,
- title = {Error analysis of the Gram low-rank approximation (and why it is not as unstable as one may think)},
- url = {https://hal.science/hal-04554516},
- author = {Mary, Théo},
- date = {2024-04},
- keywords = {eigenvalue decomposition, finite precision arithmetic, Gram matrix, iterative refinement, low-rank approximation, mixed precision, rounding error analysis, singular value decomposition},
- file = {HAL PDF Full Text:files/1200/Mary - 2024 - Error analysis of the Gram low-rank approximation (and why it is not as unstable as one may think).pdf:application/pdf},
+@article{Faucher2020DAS,
+ title = {Reciprocity-gap misfit functional for {Distributed} {Acoustic} {Sensing}, combining data from passive and active sources},
+ volume = {86},
+ issn = {0016-8033},
+ doi = {10.1190/geo2020-0305.1},
+ number = {2},
+ journal = {Geophysics},
+ author = {Faucher, Florian and De Hoop, Maarten V and Scherzer, Otmar},
+ year = {2020},
+ pages = {1--46},
}
-@unpublished{mary_error_2024-1,
- title = {Error analysis of matrix multiplication with narrow range floating-point arithmetic},
- url = {https://hal.science/hal-04671474},
- author = {Mary, Théo and Mikaitis, Mantas},
- date = {2024-08},
- keywords = {mixed precision, rounding error analysis, floating-point arithmetic, {GPUs}, matrix multiplication, multiword arithmetic, overflow, reduced precision, scaling, underflow},
+@article{Faucher2020adjoint,
+ title = {Adjoint-state method for {Hybridizable} {Discontinuous} {Galerkin} discretization, application to the inverse acoustic wave problem},
+ volume = {372},
+ issn = {0045-7825},
+ doi = {10.1016/j.cma.2020.113406},
+ journal = {Computer Methods in Applied Mechanics and Engineering},
+ author = {Faucher, Florian and Scherzer, Otmar},
+ year = {2020},
+ pages = {113406},
}
-@article{pham_numerical_2024,
- title = {Numerical investigation of stabilization in the Hybridizable Discontinuous Galerkin method for linear anisotropic elastic equation},
- url = {https://hal.science/hal-04503407},
- doi = {10.1016/j.cma.2024.117080},
- pages = {117080},
- journaltitle = {Computer Methods in Applied Mechanics and Engineering},
- author = {Pham, Ha and Faucher, Florian and Barucq, Hélène},
- date = {2024-06},
- note = {Publisher: Elsevier},
+@article{Faucher2019FRgWIGeo,
+ title = {Full {Reciprocity}-{Gap} {Waveform} {Inversion}, enabling sparse-source acquisition},
+ volume = {85},
+ doi = {10.1190/geo2019-0527.1},
+ number = {6},
+ journal = {Geophysics},
+ author = {Faucher, Florian and Alessandrini, Giovanni and Barucq, Hélène and de Hoop, Maarten and Gaburro, Romina and Sincich, Eva},
+ year = {2020},
+ note = {Publisher: Society of Exploration Geophysicists},
+ pages = {R461--R476},
}
-@unpublished{aghili_accelerating_2024,
- title = {Accelerating the convergence of Newton's method for nonlinear elliptic {PDEs} using Fourier neural operators},
- url = {https://hal.science/hal-04440076},
- author = {Aghili, Joubine and Hild, Romain and Michel-Dansac, Victor and Vigon, Vincent and Franck, Emmanuel},
- date = {2024-02},
- keywords = {Fourier neural operators, Neural operators, Newton's method, Nonlinear elliptic {PDEs}},
+@article{bonazzoli_domain_2019,
+ title = {Domain decomposition preconditioning for the high-frequency time-harmonic {Maxwell} equations with absorption},
+ volume = {88},
+ copyright = {https://www.ams.org/publications/copyright-and-permissions},
+ issn = {0025-5718, 1088-6842},
+ url = {https://www.ams.org/mcom/2019-88-320/S0025-5718-2019-03447-6/},
+ doi = {10.1090/mcom/3447},
+ language = {en},
+ number = {320},
+ urldate = {2024-10-08},
+ journal = {Mathematics of Computation},
+ author = {Bonazzoli, M. and Dolean, V. and Graham, I. G. and Spence, E. A. and Tournier, P.-H.},
+ month = may,
+ year = {2019},
+ pages = {2559--2604},
}
-@article{saigre_model_2024,
- title = {Model order reduction and sensitivity analysis for complex heat transfer simulations inside the human eyeball},
- url = {https://hal.science/hal-04361954},
- doi = {10.1002/cnm.3864},
- pages = {e3864},
- journaltitle = {International Journal for Numerical Methods in Biomedical Engineering},
- author = {Saigre, Thomas and Prud'Homme, Christophe and Szopos, Marcela},
- date = {2024-09},
- note = {Publisher: John Wiley and Sons},
- keywords = {Heat transfer, Sensitivity analysis, Mathematical and computational ophthalmology, real-time model order reduction, Uncertainty qualification, Validation},
+@article{noauthor_benchmarking_nodate,
+ title = {Benchmarking analysis report},
+ language = {en},
}
-@unpublished{buttari_modular_2024,
- title = {A modular framework for the backward error analysis of {GMRES}},
- url = {https://hal.science/hal-04525918},
- author = {Buttari, Alfredo and Higham, Nicholas J and Mary, Théo and Vieublé, Bastien},
- date = {2024-03},
- keywords = {Computer arithmetic, {GMRES}, Iterative solvers, Linear system of equations, Rounding error analysis},
+@book{asch_data_2016,
+ address = {Philadelphia, PA},
+ title = {Data assimilation},
+ url = {https://epubs.siam.org/doi/abs/10.1137/1.9781611974546},
+ publisher = {Society for Industrial and Applied Mathematics},
+ author = {Asch, Mark and Bocquet, Marc and Nodet, Maëlle},
+ year = {2016},
+ note = {Citation Key: doi:10.1137/1.9781611974546; tex.eprint: https://epubs.siam.org/doi/pdf/10.1137/1.9781611974546},
}
-@unpublished{beuzeville_deterministic_2024,
- title = {Deterministic and probabilistic backward error analysis of neural networks in floating-point arithmetic},
- url = {https://hal.science/hal-04663142},
- author = {Beuzeville, Théo and Buttari, Alfredo and Gratton, Serge and Mary, Theo},
- date = {2024-07},
- keywords = {floating-point arithmetic, artificial neural networks, backward error, error analysis, probabilistic error analysis, rounding errors},
+@article{CRMATH_2009__347_7-8_435_0,
+ title = {Une méthode combinée d'éléments finis à deux grilles/bases réduites pour l'approximation des solutions d'une {E}.{D}.{P}. paramétrique},
+ volume = {347},
+ url = {http://www.numdam.org/articles/10.1016/j.crma.2009.02.019/},
+ doi = {10.1016/j.crma.2009.02.019},
+ language = {fr},
+ number = {7-8},
+ journal = {Comptes Rendus. Mathématique},
+ author = {Chakir, Rachida and Maday, Yvon},
+ year = {2009},
+ note = {Publisher: Elsevier},
+ pages = {435--440},
}
-@inproceedings{ouertatani_accelerated_2024,
- location = {Lugano, Switzerland},
- title = {Accelerated {NAS} via pretrained ensembles and multi-fidelity Bayesian Optimization},
- url = {https://hal.science/hal-04611343},
- booktitle = {33rd International Conference on Artificial Neural Networks ({ICANN})},
- author = {Ouertatani, Houssem and Maxim, Cristian and Niar, Smail and Talbi, El-Ghazali},
- date = {2024-09},
- keywords = {Deep Ensembles, Multi-fidelity {BO}, Neural Architecture Search},
-}
-@online{fault_tolerance_working_group_mpi_forum_user_2024,
- title = {User Level Failure Mitigation ({ULFM})},
- url = {https://fault-tolerance.org/},
- abstract = {The User Level Failure Mitigation ({ULFM}) proposal is developed by the {MPI} Forum’s Fault Tolerance Working Group to support the continued operation of {MPI} programs after crash (node failures) have impacted the execution. The key principle is that no {MPI} call (point-to-point, collective, {RMA}, {IO}, …) can block indefinitely after a failure, but must either succeed or raise an {MPI} error. In addition the design is centered around user needs and flexibility, the {API} should allow varied fault tolerant models to be built as external libraries.},
- author = {{Fault Tolerance Working Group, MPI Forum}},
- date = {2024},
- keywords = {fault tolerance, resilience},
+@article{calvin_object-oriented_2002,
+ title = {An object-oriented approach to the design of fluid mechanics software},
+ volume = {36},
+ issn = {0764-583X, 1290-3841},
+ url = {http://www.esaim-m2an.org/10.1051/m2an:2002038},
+ doi = {10.1051/m2an:2002038},
+ number = {5},
+ urldate = {2024-10-07},
+ journal = {ESAIM: Mathematical Modelling and Numerical Analysis},
+ author = {Calvin, Christophe and Cueto, Olga and Emonot, Philippe},
+ month = sep,
+ year = {2002},
+ pages = {907--921},
}
-@online{open_mpi_documentation_team_user_2024,
- title = {User Level Failure Mitigation ({ULFM}) in Open {MPI}},
- url = {https://docs.open-mpi.org/en/v5.0.x/features/ulfm.html},
- abstract = {This chapter documents the features and options specific to the User Level Failure Mitigation ({ULFM}) Open {MPI} implementation. The {ULFM} proposal is developed by the {MPI} Forum’s Fault Tolerance Working Group to support the continued operation of {MPI} programs after failures, both hard and soft, have impacted execution. No {MPI} call can block indefinitely after a failure, and errors are not necessarily fatal, as the {MPI} implementation makes a best effort to maintain the execution environment.},
- author = {{Open MPI Documentation Team}},
- date = {2024},
- keywords = {fault tolerance, resilience},
+@misc{genci_cines,
+ title = {Centre informatique national de l'{Enseignement} supérieur ({CINES})},
+ url = {https://www.genci.fr/centre-informatique-national-de-lenseignement-superieur-cines},
+ author = {{GENCI}},
+ year = {2024},
+ keywords = {cines, genci},
}
-@inproceedings{bautista-gomez_fti_2011,
- location = {New York, {NY}, {USA}},
- title = {{FTI}: high performance fault tolerance interface for hybrid systems},
- isbn = {978-1-4503-0771-0},
- url = {https://doi.org/10.1145/2063384.2063427},
- doi = {10.1145/2063384.2063427},
- series = {{SC} '11},
- abstract = {Large scientific applications deployed on current petascale systems expend a significant amount of their execution time dumping checkpoint files to remote storage. New fault tolerant techniques will be critical to efficiently exploit post-petascale systems. In this work, we propose a low-overhead high-frequency multi-level checkpoint technique in which we integrate a highly-reliable topology-aware Reed-Solomon encoding in a three-level checkpoint scheme. We efficiently hide the encoding time using one Fault-Tolerance dedicated thread per node. We implement our technique in the Fault Tolerance Interface {FTI}. We evaluate the correctness of our performance model and conduct a study of the reliability of our library. To demonstrate the performance of {FTI}, we present a case study of the Mw9.0 Tohoku Japan earthquake simulation with {SPECFEM}3D on {TSUBAME}2.0. We demonstrate a checkpoint overhead as low as 8\% on sustained 0.1 petaflops runs (1152 {GPUs}) while checkpointing at high frequency.},
- booktitle = {Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis},
- publisher = {Association for Computing Machinery},
- author = {Bautista-Gomez, Leonardo and Tsuboi, Seiji and Komatitsch, Dimitri and Cappello, Franck and Maruyama, Naoya and Matsuoka, Satoshi},
- date = {2011},
- note = {event-place: Seattle, Washington},
+@misc{genci_tgcc,
+ title = {Très grand centre de calcul du {CEA} ({TGCC})},
+ url = {https://www.genci.fr/tres-grand-centre-de-calcul-du-cea-tgcc},
+ author = {{GENCI}},
+ year = {2024},
+ keywords = {genci, tgcc},
}
-@book{fti_documentation_team_fti_2024,
- title = {{FTI}: Fault Tolerance Interface - Examples},
- url = {https://fault-tolerance-interface.readthedocs.io/en/latest/examples.html},
- abstract = {This webpage provides examples of how to use the Fault Tolerance Interface ({FTI}) for implementing fault tolerance in {HPC} applications. The examples demonstrate checkpointing, recovery, and handling failures with {FTI}.},
- author = {{FTI Documentation Team}},
- date = {2024},
- keywords = {fault tolerance, resilience},
+@misc{genci_idris,
+ title = {Institut du développement et des ressources en informatique scientifique ({IDRIS})},
+ url = {https://www.genci.fr/institut-du-developpement-et-des-ressources-en-informatique-scientifique-idris},
+ author = {{GENCI}},
+ year = {2024},
+ keywords = {genci, idris},
}
-@inproceedings{koziol_extreme_2012,
- title = {Extreme I/O Scaling with {HDF}5},
- url = {https://cscads.rice.edu/HDF5-CScADS.pdf},
- booktitle = {{XSEDE} 12 - Extreme Scaling Workshop},
- publisher = {The {HDF} Group},
- author = {Koziol, Quincey},
- date = {2012},
- file = {Full Text:files/1238/Koziol - 2012 - Extreme IO Scaling with HDF5.pdf:application/pdf},
+@misc{eurohpc_supercomputers,
+ title = {{EuroHPC} supercomputers},
+ url = {https://eurohpc-ju.europa.eu/supercomputers/our-supercomputers_en},
+ author = {{EuroHPC JU}},
+ year = {2024},
}
-@online{laboratory_llnl_scalable_2024,
- title = {Scalable Checkpoint/Restart for {MPI} ({SCR})},
- url = {https://computing.llnl.gov/projects/scalable-checkpoint-restart-for-mpi},
- abstract = {The Scalable Checkpoint/Restart ({SCR}) library enables efficient, scalable checkpointing of {MPI} applications. This project from {LLNL} focuses on reducing checkpoint overhead to improve resilience in high-performance computing environments.},
- author = {Laboratory ({LLNL}), Lawrence Livermore National},
- date = {2024},
+@article{cea_conception_1986,
+ title = {Conception optimale ou identification de formes, calcul rapide de la dérivée directionnelle de la fonction coût},
+ volume = {20},
+ issn = {0764-583X, 1290-3841},
+ url = {http://www.esaim-m2an.org/10.1051/m2an/1986200303711},
+ doi = {10.1051/m2an/1986200303711},
+ number = {3},
+ urldate = {2024-10-07},
+ journal = {ESAIM: Mathematical Modelling and Numerical Analysis},
+ author = {Cea, Jean},
+ year = {1986},
+ pages = {371--402},
}
-@inproceedings{liang_daos_2020,
- location = {Berlin, Heidelberg},
- title = {{DAOS}: A Scale-Out High Performance Storage Stack for Storage Class Memory},
- isbn = {978-3-030-48841-3},
- url = {https://doi.org/10.1007/978-3-030-48842-0_3},
- doi = {10.1007/978-3-030-48842-0_3},
- abstract = {The Distributed Asynchronous Object Storage ({DAOS}) is an open source scale-out storage system that is designed from the ground up to support Storage Class Memory ({SCM}) and {NVMe} storage in user space. Its advanced storage {API} enables the native support of structured, semi-structured and unstructured data models, overcoming the limitations of traditional {POSIX} based parallel filesystem. For {HPC} workloads, {DAOS} provides direct {MPI}-{IO} and {HDF}5 support as well as {POSIX} access for legacy applications. In this paper we present the architecture of the {DAOS} storage engine and its high-level application interfaces. We also describe initial performance results of {DAOS} for {IO}500 benchmarks.},
- pages = {40--54},
- booktitle = {Supercomputing Frontiers: 6th Asian Conference, {SCFA} 2020, Singapore, February 24–27, 2020, Proceedings},
- publisher = {Springer-Verlag},
- author = {Liang, Zhen and Lombardi, Johann and Chaarawi, Mohamad and Hennecke, Michael},
- date = {2020},
- note = {event-place: Singapore, Singapore},
- keywords = {{DAOS}, Distributed storage system, {NVMe}, Parallel filesystem, Persistent memory, {RAFT}, {SCM}, {SWIM}},
+@mastersthesis{palazzolo2023shape,
+ title = {Shape optimisation for rigid objects in a {Stokes} flow},
+ school = {Internship report},
+ author = {Palazzolo, Lucas},
+ month = aug,
+ year = {2023},
+ note = {tex.supervisors: Luca Berti, Michaël Binois, Laetitia Giraldi, Christophe Prud’homme},
}
-@book{project_zfs_2024,
- title = {{ZFS} Administration Guide},
- url = {https://openzfs.github.io/openzfs-docs/man/master/8/zfs.8.html},
- abstract = {The {ZFS} administration guide provides detailed documentation on managing {ZFS}, a robust file system and volume manager for high-performance computing environments. It covers various commands for managing file systems, snapshots, and data integrity.},
- author = {Project, {OpenZFS}},
- date = {2024},
+@article{pironneau_optimum_1974,
+ title = {On optimum design in fluid mechanics},
+ volume = {64},
+ copyright = {https://www.cambridge.org/core/terms},
+ issn = {0022-1120, 1469-7645},
+ url = {https://www.cambridge.org/core/product/identifier/S0022112074002023/type/journal_article},
+ doi = {10.1017/S0022112074002023},
+ abstract = {In this paper, the change in energy dissipation due to a small hump on a body in a uniform steady flow is calculated. The result is used in conjunction with the variational methods of optimal control to obtain the optimality conditions for four minimum-drag problems of fluid mechanics. These conditions imply that the unit-area profile of smallest drag has a front end shaped like a wedge of angle 90°.},
+ language = {en},
+ number = {1},
+ urldate = {2024-10-07},
+ journal = {Journal of Fluid Mechanics},
+ author = {Pironneau, O.},
+ month = jun,
+ year = {1974},
+ pages = {97--110},
}
-@misc{wikipedia_contributors_zfs_2024,
- title = {{ZFS}},
- url = {https://en.wikipedia.org/wiki/ZFS},
- abstract = {{ZFS} is a combined file system and logical volume manager designed by Sun Microsystems. It is known for its data integrity, support for high storage capacities, and protection against data corruption. This Wikipedia entry provides an overview of its history, features, and applications.},
- author = {{Wikipedia contributors}},
- date = {2024},
+@phdthesis{saigre_mathematical_2024,
+ address = {Strasbourg, France},
+ type = {{PhD}},
+ title = {Mathematical modeling, simulation, and order reduction of ocular fluid flows and their interactions: {Building} the digital twin of the eye},
+ school = {Université de Strasbourg},
+ author = {Saigre, Thomas},
+ month = dec,
+ year = {2024},
+ note = {In preparation},
+ keywords = {aqueous humor, heat transfer, model reduction, ocular fluid dynamics, reduced basis method, sensitivity analysis},
}
-@inproceedings{firmin_massively_2023,
- title = {Massively Parallel Asynchronous Fractal Optimization},
- doi = {10.1109/IPDPSW59300.2023.00151},
- pages = {930--938},
- booktitle = {2023 {IEEE} International Parallel and Distributed Processing Symposium Workshops ({IPDPSW})},
- author = {Firmin, Thomas and Talbi, El-Ghazali},
- date = {2023},
- keywords = {Asynchronous metaheuristic, Conferences, Continuous optimization, Distributed processing, Fractals, Hierarchical decomposition, Linear programming, Search problems, Software, Software algorithms},
+@unpublished{van_landeghem_motion_nodate,
+ type = {Unpublished paper},
+ title = {Motion of soft bodies in fluids in complex environments {Part} {I}: {Elasticity} modeling and simulation},
+ author = {Van Landeghem, Céline and Prud'homme, Christophe and Chabannes, Vincent and Giraldi, Laetitia and Chouippe, Agathe},
}
-@article{blanchard_uranie_2019,
- title = {The Uranie platform: an open-source software for optimisation, meta-modelling and uncertainty analysis},
- volume = {5},
- rights = {© J.-B. Blanchard et al., published by {EDP} Sciences, 2019},
- issn = {2491-9292},
- url = {https://www.epj-n.org/articles/epjn/abs/2019/01/epjn180009/epjn180009.html},
- doi = {10.1051/epjn/2018050},
- shorttitle = {The Uranie platform},
- abstract = {The high-performance computing resources and the constant improvement of both numerical simulation accuracy and the experimental measurements with which they are confronted bring a new compulsory step to strengthen the credence given to the simulation results: uncertainty quantification. This can have different meanings, according to the requested goals (rank uncertainty sources, reduce them, estimate precisely a critical threshold or an optimal working point), and it could request mathematical methods with greater or lesser complexity. This paper introduces the Uranie platform, an open-source framework developed at the Alternative Energies and Atomic Energy Commission ({CEA}), in the nuclear energy division, in order to deal with uncertainty propagation, surrogate models, optimisation issues, code calibration, etc. This platform benefits from both its dependencies and from personal developments, to offer an efficient data handling model, a C++ and Python interface, advanced graphi graphical tools, several parallelisation solutions, etc. These methods can then be applied to many kinds of code (considered as black boxes by Uranie) so to many fields of physics as well. In this paper, the example of thermal exchange between a plate-sheet and a fluid is introduced to show how Uranie can be used to perform a large range of analysis.},
- pages = {4},
- journaltitle = {{EPJ} Nuclear Sciences \& Technologies},
- shortjournal = {{EPJ} Nuclear Sci. Technol.},
- author = {Blanchard, Jean-Baptiste and Damblin, Guillaume and Martinez, Jean-Marc and Arnaud, Gilles and Gaudier, Fabrice},
- urldate = {2024-09-30},
- date = {2019},
- langid = {english},
- file = {Full Text PDF:files/1286/Blanchard et al. - 2019 - The Uranie platform an open-source software for o.pdf:application/pdf},
+@article{prudhomme_feel_2012,
+ title = {Feel++: {A} computational framework for {Galerkin} {Methods} and {Advanced} {Numerical} {Methods}},
+ volume = {38},
+ issn = {1270-900X},
+ shorttitle = {Feel++},
+ url = {http://www.esaim-proc.org/10.1051/proc/201238024},
+ doi = {10.1051/proc/201238024},
+ urldate = {2024-10-07},
+ journal = {ESAIM: Proceedings},
+ author = {Prud’homme, Christophe and Chabannes, Vincent and Doyeux, Vincent and Ismail, Mourad and Samake, Abdoulaye and Pena, Goncalo},
+ editor = {Coquel, F. and Gutnic, M. and Helluy, P. and Lagoutière, F. and Rohde, C. and Seguin, N.},
+ month = dec,
+ year = {2012},
+ keywords = {dsel, feelpp},
+ pages = {429--455},
}
-@inproceedings{firmin_comparative_2023,
- title = {A Comparative Study of Fractal-Based Decomposition Optimization},
- volume = {1824},
- url = {https://doi.org/10.1007/978-3-031-34020-8\_1},
- doi = {10.1007/978-3-031-34020-8_1},
- series = {Communications in Computer and Information Science},
- pages = {3--20},
- booktitle = {Optimization and Learning - 6th International Conference, {OLA} 2023, Malaga, Spain, May 3-5, 2023, Proceedings},
- publisher = {Springer},
- author = {Firmin, Thomas and Talbi, El-Ghazali},
- editor = {Dorronsoro, Bernabé and Chicano, Francisco and Danoy, Grégoire and Talbi, El-Ghazali},
- date = {2023},
+@article{saikali_highly_2019,
+ title = {Highly resolved large eddy simulations of a binary mixture flow in a cavity with two vents: {Influence} of the computational domain},
+ volume = {44},
+ issn = {03603199},
+ shorttitle = {Highly resolved large eddy simulations of a binary mixture flow in a cavity with two vents},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S0360319918326521},
+ doi = {10.1016/j.ijhydene.2018.08.108},
+ language = {en},
+ number = {17},
+ urldate = {2024-10-06},
+ journal = {International Journal of Hydrogen Energy},
+ author = {Saikali, E. and Bernard-Michel, G. and Sergent, A. and Tenaud, C. and Salem, R.},
+ month = apr,
+ year = {2019},
+ keywords = {trust},
+ pages = {8856--8873},
}
-@article{van_landeghem_mathematical_2024,
- title = {Mathematical and computational framework for moving and colliding rigid bodies in a Newtonian fluid},
- volume = {9},
- issn = {2380288X, 23802898},
- url = {https://www.intlpress.com/site/pub/pages/journals/items/amsa/content/vols/0009/0001/a002/},
- doi = {10.4310/AMSA.2024.v9.n1.a2},
- pages = {59--89},
+@article{angeli_wall-resolved_2022,
+ title = {Wall-{Resolved} {Large} {Eddy} {Simulations} of the {Transient} {Turbulent} {Fluid} {Mixing} in a {Closed} {System} {Replicating} a {Pressurized} {Thermal} {Shock}},
+ volume = {108},
+ issn = {1386-6184, 1573-1987},
+ url = {https://link.springer.com/10.1007/s10494-021-00272-z},
+ doi = {10.1007/s10494-021-00272-z},
+ language = {en},
number = {1},
- journaltitle = {Annals of Mathematical Sciences and Applications},
- author = {Van Landeghem, Céline and Berti, Luca and Chabannes, Vincent and Chouippe, Agathe and Giraldi, Laetitia and Hoarau, Yannick and Prud’homme, Christophe},
- urldate = {2024-05-30},
- date = {2024},
- file = {Submitted Version:files/1289/Van Landeghem et al. - 2024 - Mathematical and computational framework for movin.pdf:application/pdf},
+ urldate = {2024-10-06},
+ journal = {Flow, Turbulence and Combustion},
+ author = {Angeli, Pierre-Emmanuel},
+ month = jan,
+ year = {2022},
+ keywords = {trust},
+ pages = {43--75},
}
-@misc{noauthor_iso_2017,
- title = {{ISO} 10211:2017 - Thermal bridges in building construction — Heat flows and surface temperatures — Detailed calculations},
- url = {https://www.iso.org/standard/65710.html},
- date = {2017},
- keywords = {building construction, detailed calculations, heat flows, surface temperatures, thermal bridges},
- file = {PDF:files/1303/2017 - ISO 102112017 - Thermal bridges in building construction — Heat flows and surface temperatures — De.pdf:application/pdf},
+@article{chouly_explicit_2018,
+ title = {Explicit {Verlet} time-integration for a {Nitsche}-based approximation of elastodynamic contact problems},
+ volume = {5},
+ journal = {Advanced Modeling and Simulation in Engineering Sciences},
+ author = {Chouly, Franz and Renard, Yves},
+ year = {2018},
+ note = {Publisher: Springer},
+ pages = {1--38},
}
-@book{advanced_micro_devices_inc_rocthrust_2024,
- edition = {Release 3.0.1},
- title = {{rocThrust} Documentation},
- url = {https://rocm.docs.amd.com/projects/rocThrust/en/latest/index.html},
- author = {{Advanced Micro Devices, Inc.}},
- date = {2024-08},
- keywords = {gpu, {AMD}, rocm},
- file = {PDF:files/1299/Advanced Micro Devices, Inc. - 2024 - rocThrust Documentation.pdf:application/pdf},
+@misc{balay_petsc_2024,
+ title = {{PETSc} {Web} page},
+ url = {https://petsc.org/},
+ author = {Balay, Satish and Abhyankar, Shrirang and Adams, Mark F. and Benson, Steven and Brown, Jed and Brune, Peter and Buschelman, Kris and Constantinescu, Emil M. and Dalcin, Lisandro and Dener, Alp and Eijkhout, Victor and Faibussowitsch, Jacob and Gropp, William D. and Hapla, Václav and Isaac, Tobin and Jolivet, Pierre and Karpeev, Dmitry and Kaushik, Dinesh and Knepley, Matthew G. and Kong, Fande and Kruger, Scott and May, Dave A. and McInnes, Lois Curfman and Mills, Richard Tran and Mitchell, Lawrence and Munson, Todd and Roman, Jose E. and Rupp, Karl and Sanan, Patrick and Sarich, Jason and Smith, Barry F. and Zampini, Stefano and Zhang, Hong and Zhang, Hong and Zhang, Junchao},
+ year = {2024},
}
-@book{advanced_micro_devices_inc_hip_2024,
- edition = {Release 6.1.40091},
- title = {{HIP} Documentation},
- url = {https://rocm.docs.amd.com/projects/HIP/en/latest/index.html},
- author = {{Advanced Micro Devices, Inc.}},
- date = {2024-05},
- keywords = {{AMD}, {HIP}, {GPU}},
- file = {PDF:files/1302/Advanced Micro Devices, Inc. - 2024 - HIP Documentation.pdf:application/pdf},
+@article{bruna_neural_2024,
+ title = {Neural {Galerkin} schemes with active learning for high-dimensional evolution equations},
+ volume = {496},
+ issn = {0021-9991},
+ doi = {10.1016/j.jcp.2023.112588},
+ journal = {Journal of Computational Physics},
+ author = {Bruna, J. and Peherstorfer, B. and Vanden-Eijnden, E.},
+ year = {2024},
+ note = {Publisher: Elsevier BV},
+ pages = {112588},
}
-@article{feppon_shape_2019,
- title = {Shape optimization of a coupled thermal fluid-structure problem in a level set mesh evolution framework},
- volume = {76},
- issn = {2254-3902},
- doi = {10.1007/s40324-018-00185-4},
- pages = {413--458},
+@article{lu_learning_2021,
+ title = {Learning nonlinear operators via {DeepONet} based on the universal approximation theorem of operators},
+ volume = {3},
+ issn = {2522-5839},
+ doi = {10.1038/s42256-021-00302-5},
number = {3},
- journaltitle = {{SeMA}},
- author = {Feppon, F. and Allaire, G. and Bordeu, F. and Cortial, J. and Dapogny, C.},
- date = {2019-09},
- note = {Publisher: Springer International Publishing},
- keywords = {shape optimisation, multiphysics, fsi},
- file = {Submitted Version:files/1314/Feppon et al. - 2019 - Shape optimization of a coupled thermal fluid-structure problem in a level set mesh evolution framew.pdf:application/pdf},
+ journal = {Nat. Mach. Intell.},
+ author = {Lu, L. and Jin, P. and Pang, G. and Zhang, Z. and Karniadakis, G. E.},
+ year = {2021},
+ note = {Publisher: Springer Science and Business Media LLC},
+ pages = {218--229},
}
-@article{feppon_f_null_2020,
- title = {Null space gradient flows for constrained optimization with applications to shape optimization},
- volume = {26},
- url = {https://doi.org/10.1051/cocv/2020015},
- doi = {10.1051/cocv/2020015},
- pages = {90},
- journaltitle = {{ESAIM}: {COCV}},
- author = {{Feppon, F.} and {Allaire, G.} and {Dapogny, C.}},
- date = {2020},
- keywords = {shape optimisation, multiphysics},
- file = {Full Text PDF:files/1316/Feppon, F. et al. - 2020 - Null space gradient flows for constrained optimization with applications to shape optimization.pdf:application/pdf},
+@article{karniadakis_physics-informed_2021,
+ title = {Physics-informed machine learning},
+ volume = {3},
+ issn = {2522-5820},
+ doi = {10.1038/s42254-021-00314-5},
+ number = {6},
+ journal = {Nat. Rev. Phys.},
+ author = {Karniadakis, G. E. and Kevrekidis, I. G. and Lu, L. and Perdikaris, P. and Wang, S. and Yang, L.},
+ year = {2021},
+ note = {Publisher: Springer Science and Business Media LLC},
+ pages = {422--440},
}
-@thesis{feppon_shape_2019-1,
- title = {Shape and topology optimization of multiphysics systems},
- institution = {Thèse de doctorat de l'Universit'e Paris-Saclay pr'epar'ee à l”Ecole polytechnique},
- type = {phdthesis},
- author = {Feppon, Florian},
- date = {2019},
- keywords = {shape optimisation, multiphysics},
- file = {PDF:files/1318/Feppon - 2019 - Shape and topology optimization of multiphysics systems.pdf:application/pdf},
+@misc{saigre_mesh_2024,
+ title = {Mesh and configuration files to perform coupled heat+fluid simulations on a realistic human eyeball geometry with {Feel}++},
+ copyright = {Creative Commons Attribution 4.0 International},
+ url = {https://zenodo.org/doi/10.5281/zenodo.13886143},
+ doi = {10.5281/ZENODO.13886143},
+ abstract = {Run the simulation
+
+With slurm
+
+Set up position and desired mesh in the `run.slurm` file. Then, submit the job with the following command:
+
+sbatch run.slurm
+
+Without slurm
+
+Run by hand the command of the run.slurm file:
+
+POSITION=prone \# prone supine standing
+SOLVER\_TYPE=simple \# simple lsc
+MESH\_INDEX=M4 \# M1 M2 M3 M4 M5
+
+mpirun -np 128 feelpp\_toolbox\_heatfluid {\textbackslash} --config-files eye-\$\{POSITION\}.cfg pc\_\$\{SOLVER\_TYPE\}.cfg {\textbackslash} --heat-fluid.json.patch='\{ "op": "replace", "path": "/Meshes/heatfluid/Import/filename", "value": "\$cfgdir/mesh/Mr/'\$\{MESH\_INDEX\}'/Eye\_Mesh3D\_p\$np.json" \}' {\textbackslash} --heat-fluid.scalability-save=1 --heat-fluid.heat.scalability-save=1 --heat-fluid.fluid.scalability-save=1
+
+Available meshes
+
+The meshes are available and are already partitioned for parallel computing:
+
+M0 : 1, 64, 128, 256, 384, 512, 640, 768
+M1 : 1, 64, 128, 256, 384, 512, 640, 768
+M2 : 1, 64, 128, 256, 384, 512, 640, 768
+M3 : 1, 64, 128, 256, 384, 512, 640, 768
+M4 : 1, 64, 128, 256, 384, 512, 640, 768
+M5 : 1, 64, 128, 256, 384, 512, 640, 768
+M6 : 128, 256, 384, 512, 640, 768},
+ urldate = {2024-10-04},
+ publisher = {Zenodo},
+ author = {Saigre, Thomas and Prud'homme, Christophe and Szopos, Marcela and Chabannes, Vincent},
+ month = oct,
+ year = {2024},
}
-@article{stewart_assessment_2012,
- title = {Assessment of {CFD} Performance in Simulations of an Idealized Medical Device: Results of {FDA}’s First Computational Interlaboratory Study},
- volume = {3},
- issn = {1869-408X},
- url = {http://dx.doi.org/10.1007/s13239-012-0087-5},
- doi = {10.1007/s13239-012-0087-5},
- pages = {139--160},
- number = {2},
- journaltitle = {Cardiovascular Engineering and Technology},
- author = {Stewart, {SandyF}.C. and Paterson, {EricG}. and Burgreen, {GregW}. and Hariharan, Prasanna and Giarra, Matthew and Reddy, Varun and Day, {StevenW}. and Manning, {KeefeB}. and Deutsch, Steven and Berman, {MichaelR}. and Myers, {MatthewR}. and Malinauskas, {RichardA}.},
- date = {2012},
- keywords = {Blood damage, Computational fluid dynamics, Experimental validation, Medical devices, {FDA}},
+@incollection{baudin_openturns_2016,
+ address = {Cham},
+ title = {{OpenTURNS}: {An} {Industrial} {Software} for {Uncertainty} {Quantification} in {Simulation}},
+ isbn = {978-3-319-11259-6},
+ url = {https://doi.org/10.1007/978-3-319-11259-6_64-1},
+ booktitle = {Handbook of {Uncertainty} {Quantification}},
+ publisher = {Springer International Publishing},
+ author = {Baudin, Michaël and Dutfoy, Anne and Iooss, Bertrand and Popelin, Anne-Laure},
+ editor = {Ghanem, Roger and Higdon, David and Owhadi, Houman},
+ year = {2016},
+ doi = {10.1007/978-3-319-11259-6_64-1},
+ pages = {1--38},
}
-@article{hariharan_multilaboratory_2011,
- title = {Multilaboratory Particle Image Velocimetry Analysis of the {FDA} Benchmark Nozzle Model to Support Validation of Computational Fluid Dynamics Simulations},
- volume = {133},
- issn = {0148-0731},
- url = {http://dx.doi.org/10.1115/1.4003440},
- doi = {doi: 10.1115/1.4003440},
- journaltitle = {Journal of Biomechanical Engineering},
- author = {Hariharan, Prasanna and Giarra, Matthew and Reddy, Varun and Day, Steven W. and Manning, Keefe B. and Deutsch, Steven and Stewart, Sandy F. C. and Myers, Matthew R. and Berman, Michael R. and Burgreen, Greg W. and Paterson, Eric G. and Malinauskas, Richard A.},
- date = {2011-02},
- keywords = {{FDA}},
+@misc{prudhomme_feelppfeelpp_2024,
+ title = {feelpp/feelpp: {Feel}++ {Release} {V111} preview.10},
+ copyright = {Creative Commons Attribution 4.0 International, GNU Lesser General Public License v3.0 or later, GNU General Public License v3.0 or later},
+ shorttitle = {feelpp/feelpp},
+ url = {https://zenodo.org/doi/10.5281/zenodo.591797},
+ abstract = {🎉 We're happy to share our developments as we approach the V111 release of Feel++. Following a refreshed naming strategy, we've moved to the -preview.x suffix from the conventional -alpha.x, -beta, or -rc labels. This change signifies our dedication to enhancing transparency and setting clear expectations for our pre-release versions.
+
+Each pre-release version of Feel++ undergoes a rigorous process, encompassing detailed reviews, extensive tests across varied scenarios, and careful packaging. Our commitment to delivering a high-quality, reliable experience is reflected in our comprehensive platform support strategy. Alongside offering support for the latest two Long-Term Support (LTS) versions of Ubuntu and the newest LTS version of Debian, we're excited to announce that Feel++ is now accessible to Windows users through the Windows Subsystem for Linux (WSL) and to Mac users via MacPorts, Homebrew, Docker and now Apptainer. This expansion of platform support is a testament to our commitment to making Feel++ as accessible and versatile as possible for our diverse user base.
+
+As we continue to refine and enhance Feel++, the V111 release promises to bring forward significant innovations and improvements. Stay tuned for further updates of Feel++.
+
+Packages
+
+
+
+📦 Ubuntu packages
+
+📦 Debian packages
+
+📦 Docker images
+
+
+docker pull ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy
+docker run ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy ls
+
+
+
+
+📦 Apptainer images
+
+
+apptainer pull -F oras://ghcr.io/feelpp/feelpp:v0.111.0-preview.10-jammy-sif
+apptainer exec feelpp\_v0.111.0-preview.10-jammy-sif.sif feelpp\_toolbox\_fluid --version
+
+
+What's Changed
+
+Exciting New Features 🎉
+
+
+
+resolve 2231 : Support parts configuration in exporter by @vincentchabannes in https://github.com/feelpp/feelpp/pull/2232
+
+resolves 1489 and 2175: enrich range object and simplify FunctionSpace by @prudhomm in https://github.com/feelpp/feelpp/pull/2176
+
+resolves 2191 and 2196: cleanup and python wrapper for forms and implement feelpp namespace package by @prudhomm in https://github.com/feelpp/feelpp/pull/2227
+
+resolves 2233: improve hdg toolbox, add new terms by @prudhomm in https://github.com/feelpp/feelpp/pull/2236
+
+resolves 2259: add script to get feelpp version and improve packaging workflow by @prudhomm in https://github.com/feelpp/feelpp/pull/2260
+
+
+HPC Changes
+
+
+
+resolves 2246: fix non blocking mpi communication for large scale communications by @vincentchabannes in https://github.com/feelpp/feelpp/pull/2249
+
+
+Recent Publications using Feel++
+
+
+
+Ktirio Urban Building: A Computational Framework for City Energy Simulations Enhanced by CI/CD Innovations on EuroHPC Systems
+
+Nonlinear compressive reduced basis approximation for multi-parameter elliptic problem
+
+2D Axisymmetric Modeling of the HTS Insert Nougat in a Background Magnetic Field Generated by Resistive Magnet
+
+
+Enjoy!
+
+Full Changelog: https://github.com/feelpp/feelpp/compare/v0.111.0-preview.9...v0.111.0-preview.10},
+ urldate = {2024-09-04},
+ publisher = {Cemosis},
+ author = {Prud'homme, Christophe and Chabannes, Vincent and Saigre, Thomas and Trophime, Christophe and Berti, Luca and Samaké, Abdoulaye and Van Landeghem, Céline and Szopos, Marcela and Giraldi, Laetitia and Bertoluzza, Silvia and Maday, Yvon},
+ month = jul,
+ year = {2024},
+ doi = {10.5281/ZENODO.591797},
}
-@inproceedings{saigre_coupled_2024,
- location = {Arlington (Virginia), United States},
- title = {A coupled fluid-dynamics-heat transfer model for 3D simulations of the aqueous humor flow in the human eye},
- url = {https://hal.science/hal-04558924},
- booktitle = {{CMBE}24},
- author = {Saigre, Thomas and Prud'Homme, Christophe and Szopos, Marcela and Chabannes, Vincent},
- date = {2024-06},
- keywords = {finite element method, mathematical and computational ophthalmology, thermo-fluid dynamics, Thermo-fluid dynamics},
+@misc{chabannes_3d_2024,
+ title = {A {3D} geometrical model and meshing procedures for the human eyeball},
+ copyright = {Creative Commons Attribution 4.0 International},
+ url = {https://zenodo.org/doi/10.5281/zenodo.13829740},
+ abstract = {What's Changed
+
+
+
+up README with author + sort cff by @prudhomm in https://github.com/feelpp/mesh.eye/pull/2
+
+
+New Contributors
+
+
+
+@prudhomm made their first contribution in https://github.com/feelpp/mesh.eye/pull/2
+
+
+Full Changelog: https://github.com/feelpp/mesh.eye/compare/1.0.0-preview.1...v1.0.0-preview.2},
+ urldate = {2024-10-04},
+ publisher = {Zenodo},
+ author = {Chabannes, Vincent and Prud'homme, Christophe and Saigre, Thomas and Lorenzo, Sala and Szopos, Marcela and Trophime, Christophe},
+ month = sep,
+ year = {2024},
+ doi = {10.5281/ZENODO.13829740},
}
-@inproceedings{chabannes_high_2017,
- location = {Pittsburgh, {PA}, United States},
- title = {High order finite element simulations for fluid dynamics validated by experimental data from the fda benchmark nozzle model},
- url = {https://hal.science/hal-01429685},
- booktitle = {5th International Conference on Computational and Mathematical Biomedical Engineering - {CMBE}2017},
- author = {Chabannes, Vincent and Prud'Homme, Christophe and Szopos, Marcela and Tarabay, Ranine},
- date = {2017-04},
- keywords = {{CFD}, validation, medical device, open source finite element software},
- file = {HAL PDF Full Text:files/1326/Chabannes et al. - 2017 - High order finite element simulations for fluid dynamics validated by experimental data from the fda.pdf:application/pdf},
+@misc{noauthor_master-csmioverfitting-underfitting_nodate,
+ title = {master-csmi/overfitting-underfitting},
+ url = {https://github.com/master-csmi/overfitting-underfitting},
+ urldate = {2024-10-03},
+}
+
+@article{ooi_simulation_2008,
+ title = {Simulation of aqueous humor hydrodynamics in human eye heat transfer},
+ volume = {38},
+ copyright = {https://www.elsevier.com/tdm/userlicense/1.0/},
+ issn = {00104825},
+ url = {https://linkinghub.elsevier.com/retrieve/pii/S001048250700176X},
+ doi = {10.1016/j.compbiomed.2007.10.007},
+ language = {en},
+ number = {2},
+ urldate = {2024-10-03},
+ journal = {Computers in Biology and Medicine},
+ author = {Ooi, Ean-Hin and Ng, Eddie Yin-Kwee},
+ month = feb,
+ year = {2008},
+ pages = {252--262},
+}
+
+@incollection{kilgour_operator_2021,
+ address = {Cham},
+ title = {Operator {Splitting} for the {Simulation} of {Aqueous} {Humor} {Thermo}-{Fluid}-{Dynamics} in the {Anterior} {Chamber}},
+ volume = {343},
+ isbn = {978-3-030-63590-9 978-3-030-63591-6},
+ url = {https://link.springer.com/10.1007/978-3-030-63591-6_45},
+ language = {en},
+ urldate = {2024-10-03},
+ booktitle = {Recent {Developments} in {Mathematical}, {Statistical} and {Computational} {Sciences}},
+ publisher = {Springer International Publishing},
+ author = {Abdelhafid, Farah and Guidoboni, Giovanna and Okumura, Naoki and Koizumi, Noriko and Srinivas, Sangly P.},
+ editor = {Kilgour, D. Marc and Kunze, Herb and Makarov, Roman and Melnik, Roderick and Wang, Xu},
+ year = {2021},
+ doi = {10.1007/978-3-030-63591-6_45},
+ note = {Series Title: Springer Proceedings in Mathematics \& Statistics},
+ pages = {489--499},
}
@article{portaneri_alpha_2022,
- title = {Alpha Wrapping with an Offset},
+ title = {Alpha {Wrapping} with an {Offset}},
volume = {41},
url = {https://inria.hal.science/hal-03688637},
doi = {10.1145/3528223.3530152},
- abstract = {Given an input 3D geometry such as a triangle soup or a point set, we address the problem of generating a watertight and orientable surface triangle mesh that strictly encloses the input. The output mesh is obtained by greedily refining and carving a 3D Delaunay triangulation on an offset surface of the input, while carving with empty balls of radius alpha. The proposed algorithm is controlled via two user-defined parameters: alpha and offset. Alpha controls the size of cavities or holes that cannot be traversed during carving, while offset controls the distance between the vertices of the output mesh and the input. Our algorithm is guaranteed to terminate and to yield a valid and strictly enclosing mesh, even for defect-laden inputs. Genericity is achieved using an abstract interface probing the input, enabling any geometry to be used, provided a few basic geometric queries can be answered. We benchmark the algorithm on large public datasets such as Thingi10k, and compare it to state-of-the-art approaches in terms of robustness, approximation, output complexity, speed, and peak memory consumption. Our implementation is available through the {CGAL} library.},
- pages = {1},
+ abstract = {Given an input 3D geometry such as a triangle soup or a point set, we address the problem of generating a watertight and orientable surface triangle mesh that strictly encloses the input. The output mesh is obtained by greedily refining and carving a 3D Delaunay triangulation on an offset surface of the input, while carving with empty balls of radius alpha. The proposed algorithm is controlled via two user-defined parameters: alpha and offset. Alpha controls the size of cavities or holes that cannot be traversed during carving, while offset controls the distance between the vertices of the output mesh and the input. Our algorithm is guaranteed to terminate and to yield a valid and strictly enclosing mesh, even for defect-laden inputs. Genericity is achieved using an abstract interface probing the input, enabling any geometry to be used, provided a few basic geometric queries can be answered. We benchmark the algorithm on large public datasets such as Thingi10k, and compare it to state-of-the-art approaches in terms of robustness, approximation, output complexity, speed, and peak memory consumption. Our implementation is available through the CGAL library.},
+ language = {en},
number = {4},
- journaltitle = {{ACM} Transactions on Graphics},
- author = {Portaneri, Cédric and Rouxel-Labbé, Mael and Hemmer, Michael and Cohen-Steiner, David and Alliez, Pierre},
urldate = {2024-10-02},
- date = {2022-06-01},
- langid = {english},
+ journal = {ACM Transactions on Graphics},
+ author = {Portaneri, Cédric and Rouxel-Labbé, Mael and Hemmer, Michael and Cohen-Steiner, David and Alliez, Pierre},
+ month = jun,
+ year = {2022},
keywords = {cgal},
+ pages = {1},
}
@article{jamin_cgalmesh_2015,
- title = {{CGALmesh}: a Generic Framework for Delaunay Mesh Generation},
+ title = {{CGALmesh}: a {Generic} {Framework} for {Delaunay} {Mesh} {Generation}},
volume = {41},
+ shorttitle = {{CGALmesh}},
url = {https://inria.hal.science/hal-01071759},
doi = {10.1145/2699463},
- shorttitle = {{CGALmesh}},
- abstract = {{CGALmesh} is the mesh generation software package of the Computational Geometry Algorithm Library ({CGAL}). It generates isotropic simplicial meshes -- surface triangular meshes or volume tetrahedral meshes -- from input surfaces, 3D domains as well as 3D multi-domains, with or without sharp features. The underlying meshing algorithm relies on restricted Delaunay triangulations to approximate domains and surfaces, and on Delaunay refinement to ensure both approximation accuracy and mesh quality. {CGALmesh} provides guarantees on approximation quality as well as on the size and shape of the mesh elements. It provides four optional mesh optimization algorithms to further improve the mesh quality. A distinctive property of {CGALmesh} is its high flexibility with respect to the input domain representation. Such a flexibility is achieved through a careful software design, gathering into a single abstract concept, denoted by the oracle, all required interface features between the meshing engine and the input domain. We already provide oracles for domains defined by polyhedral and implicit surfaces.},
- pages = {24},
+ abstract = {CGALmesh is the mesh generation software package of the Computational Geometry Algorithm Library (CGAL). It generates isotropic simplicial meshes -- surface triangular meshes or volume tetrahedral meshes -- from input surfaces, 3D domains as well as 3D multi-domains, with or without sharp features. The underlying meshing algorithm relies on restricted Delaunay triangulations to approximate domains and surfaces, and on Delaunay refinement to ensure both approximation accuracy and mesh quality. CGALmesh provides guarantees on approximation quality as well as on the size and shape of the mesh elements. It provides four optional mesh optimization algorithms to further improve the mesh quality. A distinctive property of CGALmesh is its high flexibility with respect to the input domain representation. Such a flexibility is achieved through a careful software design, gathering into a single abstract concept, denoted by the oracle, all required interface features between the meshing engine and the input domain. We already provide oracles for domains defined by polyhedral and implicit surfaces.},
+ language = {en},
number = {4},
- journaltitle = {{ACM} Transactions on Mathematical Software},
- author = {Jamin, Clément and Alliez, Pierre and Yvinec, Mariette and Boissonnat, Jean-Daniel},
urldate = {2024-10-02},
- date = {2015},
- langid = {english},
+ journal = {ACM Transactions on Mathematical Software},
+ author = {Jamin, Clément and Alliez, Pierre and Yvinec, Mariette and Boissonnat, Jean-Daniel},
+ year = {2015},
keywords = {cgal},
+ pages = {24},
}
-@incollection{alliez_3d_2024-1,
+@book{the_cgal_project_cgal_2024,
+ edition = {5.6.1},
+ title = {{CGAL} {User} and {Reference} {Manual}},
+ url = {https://doc.cgal.org/5.6.1/Manual/packages.html},
+ publisher = {CGAL Editorial Board},
+ author = {{The CGAL Project}},
+ year = {2024},
+ keywords = {cgal},
+}
+
+@incollection{alliez_3d_2024,
edition = {6.0},
- title = {3D mesh generation},
+ title = {{3D} mesh generation},
url = {https://doc.cgal.org/6.0/Manual/packages.html#PkgMesh3},
booktitle = {{CGAL} user and reference manual},
- publisher = {{CGAL} Editorial Board},
+ publisher = {CGAL Editorial Board},
author = {Alliez, Pierre and Jamin, Clément and Rineau, Laurent and Tayeb, Stéphane and Tournois, Jane and Yvinec, Mariette},
- date = {2024},
+ year = {2024},
+ keywords = {cgal},
+}
+
+@incollection{alliez_3d_2024-1,
+ edition = {5.6.1},
+ title = {{3D} {Alpha} {Wrapping}},
+ url = {https://doc.cgal.org/5.6.1/Manual/packages.html#PkgAlphaWrap3},
+ booktitle = {{CGAL} {User} and {Reference} {Manual}},
+ publisher = {CGAL Editorial Board},
+ author = {Alliez, Pierre and Cohen-Steiner, David and Hemmer, Michael and Portaneri, Cédric and Rouxel-Labbé, Mael},
+ year = {2024},
keywords = {cgal},
}
+
+@inproceedings{chabannes_high_2017,
+ address = {Pittsburgh, PA, United States},
+ title = {High order finite element simulations for fluid dynamics validated by experimental data from the {FDA} benchmark nozzle model},
+ url = {https://hal.science/hal-01429685},
+ booktitle = {5th {International} {Conference} on {Computational} and {Mathematical} {Biomedical} {Engineering} - {CMBE2017}},
+ author = {Chabannes, Vincent and Prud'Homme, Christophe and Szopos, Marcela and Tarabay, Ranine},
+ month = apr,
+ year = {2017},
+ keywords = {CFD, medical device, open source finite element software, validation},
+}
+
+@article{stewart_assessment_2012,
+ title = {Assessment of {CFD} {Performance} in {Simulations} of an {Idealized} {Medical} {Device}: {Results} of {FDA}’s {First} {Computational} {Interlaboratory} {Study}},
+ volume = {3},
+ issn = {1869-408X},
+ url = {http://dx.doi.org/10.1007/s13239-012-0087-5},
+ doi = {10.1007/s13239-012-0087-5},
+ number = {2},
+ journal = {Cardiovascular Engineering and Technology},
+ author = {Stewart, Sandy F. C. and Paterson, Eric G. and Burgreen, Greg W. and Hariharan, Prasanna and Giarra, Matthew and Reddy, Varun and Day, Steven W. and Manning, Keefe B. and Deutsch, Steven and Berman, Michael R. and Myers, Matthew R. and Malinauskas, Richard A.},
+ year = {2012},
+ keywords = {Blood damage, Computational fluid dynamics, Experimental validation, FDA, Medical devices},
+ pages = {139--160},
+}
+
+@article{hariharan_multilaboratory_2011,
+ title = {Multilaboratory {Particle} {Image} {Velocimetry} {Analysis} of the {FDA} {Benchmark} {Nozzle} {Model} to {Support} {Validation} of {Computational} {Fluid} {Dynamics} {Simulations}},
+ volume = {133},
+ issn = {0148-0731},
+ url = {http://dx.doi.org/10.1115/1.4003440},
+ doi = {10.1115/1.4003440},
+ journal = {Journal of Biomechanical Engineering},
+ author = {Hariharan, Prasanna and Giarra, Matthew and Reddy, Varun and Day, Steven W. and Manning, Keefe B. and Deutsch, Steven and Stewart, Sandy F. C. and Myers, Matthew R. and Berman, Michael R. and Burgreen, Greg W. and Paterson, Eric G. and Malinauskas, Richard A.},
+ month = feb,
+ year = {2011},
+ keywords = {FDA},
+}
+
+@article{feppon_f_null_2020,
+ title = {Null space gradient flows for constrained optimization with applications to shape optimization},
+ volume = {26},
+ url = {https://doi.org/10.1051/cocv/2020015},
+ doi = {10.1051/cocv/2020015},
+ journal = {ESAIM: COCV},
+ author = {{Feppon, F.} and {Allaire, G.} and {Dapogny, C.}},
+ year = {2020},
+ keywords = {multiphysics, shape optimisation},
+ pages = {90},
+}
+
+@article{feppon_shape_2019,
+ title = {Shape optimization of a coupled thermal fluid-structure problem in a level set mesh evolution framework},
+ volume = {76},
+ issn = {2254-3902},
+ doi = {10.1007/s40324-018-00185-4},
+ number = {3},
+ journal = {SeMA},
+ author = {Feppon, F. and Allaire, G. and Bordeu, F. and Cortial, J. and Dapogny, C.},
+ month = sep,
+ year = {2019},
+ note = {Publisher: Springer International Publishing},
+ keywords = {fsi, multiphysics, shape optimisation},
+ pages = {413--458},
+}
+
+@phdthesis{feppon_shape_2019-1,
+ type = {{PhD} {Thesis}},
+ title = {Shape and topology optimization of multiphysics systems},
+ school = {Thèse de doctorat de l'Université Paris-Saclay préparée à l'École polytechnique},
+ author = {Feppon, Florian},
+ year = {2019},
+ keywords = {multiphysics, shape optimisation},
+}
+
+@misc{noauthor_iso_2017,
+ title = {{ISO} 10211:2017 - {Thermal} bridges in building construction — {Heat} flows and surface temperatures — {Detailed} calculations},
+ url = {https://www.iso.org/standard/65710.html},
+ year = {2017},
+ keywords = {building construction, detailed calculations, heat flows, surface temperatures, thermal bridges},
+}
+
+@book{advanced_micro_devices_inc_hip_2024,
+ edition = {Release 6.1.40091},
+ title = {{HIP} {Documentation}},
+ url = {https://rocm.docs.amd.com/projects/HIP/en/latest/index.html},
+ author = {{Advanced Micro Devices, Inc.}},
+ month = may,
+ year = {2024},
+ keywords = {AMD, GPU, HIP},
+}
+
+@book{advanced_micro_devices_inc_rocthrust_2024,
+ edition = {Release 3.0.1},
+ title = {{rocThrust} {Documentation}},
+ url = {https://rocm.docs.amd.com/projects/rocThrust/en/latest/index.html},
+ author = {{Advanced Micro Devices, Inc.}},
+ month = aug,
+ year = {2024},
+ keywords = {AMD, gpu, rocm},
+}
+
+@article{van_landeghem_mathematical_2024,
+ title = {Mathematical and computational framework for moving and colliding rigid bodies in a {Newtonian} fluid},
+ volume = {9},
+ issn = {2380288X, 23802898},
+ url = {https://www.intlpress.com/site/pub/pages/journals/items/amsa/content/vols/0009/0001/a002/},
+ doi = {10.4310/AMSA.2024.v9.n1.a2},
+ number = {1},
+ urldate = {2024-05-30},
+ journal = {Annals of Mathematical Sciences and Applications},
+ author = {Van Landeghem, Céline and Berti, Luca and Chabannes, Vincent and Chouippe, Agathe and Giraldi, Laetitia and Hoarau, Yannick and Prud’homme, Christophe},
+ year = {2024},
+ pages = {59--89},
+}
+
+@inproceedings{firmin_comparative_2023,
+ series = {Communications in {Computer} and {Information} {Science}},
+ title = {A {Comparative} {Study} of {Fractal}-{Based} {Decomposition} {Optimization}},
+ volume = {1824},
+ url = {https://doi.org/10.1007/978-3-031-34020-8\_1},
+ doi = {10.1007/978-3-031-34020-8_1},
+ booktitle = {Optimization and {Learning} - 6th {International} {Conference}, {OLA} 2023, {Malaga}, {Spain}, {May} 3-5, 2023, {Proceedings}},
+ publisher = {Springer},
+ author = {Firmin, Thomas and Talbi, El-Ghazali},
+ editor = {Dorronsoro, Bernabé and Chicano, Francisco and Danoy, Grégoire and Talbi, El-Ghazali},
+ year = {2023},
+ pages = {3--20},
+}
+
+@inproceedings{firmin_massively_2023,
+ title = {Massively {Parallel} {Asynchronous} {Fractal} {Optimization}},
+ doi = {10.1109/IPDPSW59300.2023.00151},
+ booktitle = {2023 {IEEE} {International} {Parallel} and {Distributed} {Processing} {Symposium} {Workshops} ({IPDPSW})},
+ author = {Firmin, Thomas and Talbi, El-Ghazali},
+ year = {2023},
+ keywords = {Asynchronous metaheuristic, Conferences, Continuous optimization, Distributed processing, Fractals, Hierarchical decomposition, Linear programming, Search problems, Software, Software algorithms},
+ pages = {930--938},
+}
+
+@article{blanchard_uranie_2019,
+ title = {The {Uranie} platform: an open-source software for optimisation, meta-modelling and uncertainty analysis},
+ volume = {5},
+ copyright = {© J.-B. Blanchard et al., published by EDP Sciences, 2019},
+ issn = {2491-9292},
+ shorttitle = {The {Uranie} platform},
+ url = {https://www.epj-n.org/articles/epjn/abs/2019/01/epjn180009/epjn180009.html},
+ doi = {10.1051/epjn/2018050},
+ abstract = {The high-performance computing resources and the constant improvement of both numerical simulation accuracy and the experimental measurements with which they are confronted bring a new compulsory step to strengthen the credence given to the simulation results: uncertainty quantification. This can have different meanings, according to the requested goals (rank uncertainty sources, reduce them, estimate precisely a critical threshold or an optimal working point), and it could request mathematical methods with greater or lesser complexity. This paper introduces the Uranie platform, an open-source framework developed at the Alternative Energies and Atomic Energy Commission (CEA), in the nuclear energy division, in order to deal with uncertainty propagation, surrogate models, optimisation issues, code calibration, etc. This platform benefits from both its dependencies and from personal developments, to offer an efficient data handling model, a C++ and Python interface, advanced graphi graphical tools, several parallelisation solutions, etc. These methods can then be applied to many kinds of code (considered as black boxes by Uranie) so to many fields of physics as well. In this paper, the example of thermal exchange between a plate-sheet and a fluid is introduced to show how Uranie can be used to perform a large range of analysis.},
+ language = {en},
+ urldate = {2024-09-30},
+ journal = {EPJ Nuclear Sciences \& Technologies},
+ author = {Blanchard, Jean-Baptiste and Damblin, Guillaume and Martinez, Jean-Marc and Arnaud, Gilles and Gaudier, Fabrice},
+ year = {2019},
+ pages = {4},
+}
+
+@misc{wikipedia_contributors_zfs_2024,
+ title = {{ZFS}},
+ url = {https://en.wikipedia.org/wiki/ZFS},
+ abstract = {ZFS is a combined file system and logical volume manager designed by Sun Microsystems. It is known for its data integrity, support for high storage capacities, and protection against data corruption. This Wikipedia entry provides an overview of its history, features, and applications.},
+ author = {{Wikipedia contributors}},
+ year = {2024},
+}
+
+@book{project_zfs_2024,
+ title = {{ZFS} {Administration} {Guide}},
+ url = {https://openzfs.github.io/openzfs-docs/man/master/8/zfs.8.html},
+ abstract = {The ZFS administration guide provides detailed documentation on managing ZFS, a robust file system and volume manager for high-performance computing environments. It covers various commands for managing file systems, snapshots, and data integrity.},
+ author = {{OpenZFS Project}},
+ year = {2024},
+}
+
+@inproceedings{liang_daos_2020,
+ address = {Berlin, Heidelberg},
+ title = {{DAOS}: {A} {Scale}-{Out} {High} {Performance} {Storage} {Stack} for {Storage} {Class} {Memory}},
+ isbn = {978-3-030-48841-3},
+ url = {https://doi.org/10.1007/978-3-030-48842-0_3},
+ doi = {10.1007/978-3-030-48842-0_3},
+ abstract = {The Distributed Asynchronous Object Storage (DAOS) is an open source scale-out storage system that is designed from the ground up to support Storage Class Memory (SCM) and NVMe storage in user space. Its advanced storage API enables the native support of structured, semi-structured and unstructured data models, overcoming the limitations of traditional POSIX based parallel filesystem. For HPC workloads, DAOS provides direct MPI-IO and HDF5 support as well as POSIX access for legacy applications. In this paper we present the architecture of the DAOS storage engine and its high-level application interfaces. We also describe initial performance results of DAOS for IO500 benchmarks.},
+ booktitle = {Supercomputing {Frontiers}: 6th {Asian} {Conference}, {SCFA} 2020, {Singapore}, {February} 24–27, 2020, {Proceedings}},
+ publisher = {Springer-Verlag},
+ author = {Liang, Zhen and Lombardi, Johann and Chaarawi, Mohamad and Hennecke, Michael},
+ year = {2020},
+ note = {event-place: Singapore, Singapore},
+ keywords = {DAOS, Distributed storage system, NVMe, Parallel filesystem, Persistent memory, RAFT, SCM, SWIM},
+ pages = {40--54},
+}
+
+@misc{laboratory_llnl_scalable_2024,
+ title = {Scalable {Checkpoint}/{Restart} for {MPI} ({SCR})},
+ url = {https://computing.llnl.gov/projects/scalable-checkpoint-restart-for-mpi},
+ abstract = {The Scalable Checkpoint/Restart (SCR) library enables efficient, scalable checkpointing of MPI applications. This project from LLNL focuses on reducing checkpoint overhead to improve resilience in high-performance computing environments.},
+ author = {{Lawrence Livermore National Laboratory (LLNL)}},
+ year = {2024},
+}
+
+@misc{fault_tolerance_working_group_mpi_forum_user_2024,
+ title = {User {Level} {Failure} {Mitigation} ({ULFM})},
+ url = {https://fault-tolerance.org/},
+ abstract = {The User Level Failure Mitigation (ULFM) proposal is developed by the MPI Forum’s Fault Tolerance Working Group to support the continued operation of MPI programs after crash (node failures) have impacted the execution. The key principle is that no MPI call (point-to-point, collective, RMA, IO, …) can block indefinitely after a failure, but must either succeed or raise an MPI error. In addition the design is centered around user needs and flexibility, the API should allow varied fault tolerant models to be built as external libraries.},
+ author = {{Fault Tolerance Working Group, MPI Forum}},
+ year = {2024},
+ keywords = {fault tolerance, resilience},
+}
+
+@book{fti_documentation_team_fti_2024,
+ title = {{FTI}: {Fault} {Tolerance} {Interface} - {Examples}},
+ url = {https://fault-tolerance-interface.readthedocs.io/en/latest/examples.html},
+ abstract = {This webpage provides examples of how to use the Fault Tolerance Interface (FTI) for implementing fault tolerance in HPC applications. The examples demonstrate checkpointing, recovery, and handling failures with FTI.},
+ author = {{FTI Documentation Team}},
+ year = {2024},
+ keywords = {fault tolerance, resilience},
+}
+
+@inproceedings{koziol_extreme_2012,
+ title = {Extreme {I}/{O} {Scaling} with {HDF5}},
+ url = {https://cscads.rice.edu/HDF5-CScADS.pdf},
+ booktitle = {{XSEDE} 12 - {Extreme} {Scaling} {Workshop}},
+ publisher = {The HDF Group},
+ author = {Koziol, Quincey},
+ year = {2012},
+}
+
+@inproceedings{bautista-gomez_fti_2011,
+ address = {New York, NY, USA},
+ series = {{SC} '11},
+ title = {{FTI}: high performance fault tolerance interface for hybrid systems},
+ isbn = {978-1-4503-0771-0},
+ url = {https://doi.org/10.1145/2063384.2063427},
+ doi = {10.1145/2063384.2063427},
+ abstract = {Large scientific applications deployed on current petascale systems expend a significant amount of their execution time dumping checkpoint files to remote storage. New fault tolerant techniques will be critical to efficiently exploit post-petascale systems. In this work, we propose a low-overhead high-frequency multi-level checkpoint technique in which we integrate a highly-reliable topology-aware Reed-Solomon encoding in a three-level checkpoint scheme. We efficiently hide the encoding time using one Fault-Tolerance dedicated thread per node. We implement our technique in the Fault Tolerance Interface FTI. We evaluate the correctness of our performance model and conduct a study of the reliability of our library. To demonstrate the performance of FTI, we present a case study of the Mw9.0 Tohoku Japan earthquake simulation with SPECFEM3D on TSUBAME2.0. We demonstrate a checkpoint overhead as low as 8\% on sustained 0.1 petaflops runs (1152 GPUs) while checkpointing at high frequency.},
+ booktitle = {Proceedings of 2011 {International} {Conference} for {High} {Performance} {Computing}, {Networking}, {Storage} and {Analysis}},
+ publisher = {Association for Computing Machinery},
+ author = {Bautista-Gomez, Leonardo and Tsuboi, Seiji and Komatitsch, Dimitri and Cappello, Franck and Maruyama, Naoya and Matsuoka, Satoshi},
+ year = {2011},
+ note = {event-place: Seattle, Washington},
+}
+
+@misc{open_mpi_documentation_team_user_2024,
+ title = {User {Level} {Failure} {Mitigation} ({ULFM}) in {Open} {MPI}},
+ url = {https://docs.open-mpi.org/en/v5.0.x/features/ulfm.html},
+ abstract = {This chapter documents the features and options specific to the User Level Failure Mitigation (ULFM) Open MPI implementation. The ULFM proposal is developed by the MPI Forum’s Fault Tolerance Working Group to support the continued operation of MPI programs after failures, both hard and soft, have impacted execution. No MPI call can block indefinitely after a failure, and errors are not necessarily fatal, as the MPI implementation makes a best effort to maintain the execution environment.},
+ author = {{Open MPI Documentation Team}},
+ year = {2024},
+ keywords = {fault tolerance, resilience},
+}
+
+@inproceedings{ouertatani_accelerated_2024,
+ address = {Lugano, Switzerland},
+ title = {Accelerated {NAS} via pretrained ensembles and multi-fidelity {Bayesian} {Optimization}},
+ url = {https://hal.science/hal-04611343},
+ booktitle = {33rd {International} {Conference} on {Artificial} {Neural} {Networks} ({ICANN})},
+ author = {Ouertatani, Houssem and Maxim, Cristian and Niar, Smail and Talbi, El-Ghazali},
+ month = sep,
+ year = {2024},
+ keywords = {Deep Ensembles, Multi-fidelity BO, Neural Architecture Search},
+}
+
+@unpublished{beuzeville_deterministic_2024,
+ title = {Deterministic and probabilistic backward error analysis of neural networks in floating-point arithmetic},
+ url = {https://hal.science/hal-04663142},
+ author = {Beuzeville, Théo and Buttari, Alfredo and Gratton, Serge and Mary, Theo},
+ month = jul,
+ year = {2024},
+ keywords = {artificial neural networks, backward error, error analysis, floating-point arithmetic, probabilistic error analysis, rounding errors},
+}
+
+@unpublished{buttari_modular_2024,
+ title = {A modular framework for the backward error analysis of {GMRES}},
+ url = {https://hal.science/hal-04525918},
+ author = {Buttari, Alfredo and Higham, Nicholas J and Mary, Théo and Vieublé, Bastien},
+ month = mar,
+ year = {2024},
+ keywords = {Computer arithmetic, GMRES, Iterative solvers, Linear system of equations, Rounding error analysis},
+}
+
+@article{saigre_model_2024,
+ title = {Model order reduction and sensitivity analysis for complex heat transfer simulations inside the human eyeball},
+ url = {https://hal.science/hal-04361954},
+ doi = {10.1002/cnm.3864},
+ journal = {International Journal for Numerical Methods in Biomedical Engineering},
+ author = {Saigre, Thomas and Prud'Homme, Christophe and Szopos, Marcela},
+ month = sep,
+ year = {2024},
+ note = {Publisher: John Wiley and Sons},
+ keywords = {Heat transfer, Mathematical and computational ophthalmology, Sensitivity analysis, Uncertainty qualification, Validation, real-time model order reduction},
+ pages = {e3864},
+}
+
+@unpublished{aghili_accelerating_2024,
+ title = {Accelerating the convergence of {Newton}'s method for nonlinear elliptic {PDEs} using {Fourier} neural operators},
+ url = {https://hal.science/hal-04440076},
+ author = {Aghili, Joubine and Hild, Romain and Michel-Dansac, Victor and Vigon, Vincent and Franck, Emmanuel},
+ month = feb,
+ year = {2024},
+ keywords = {Fourier neural operators, Neural operators, Newton's method, Nonlinear elliptic PDEs},
+}
+
+@article{pham_numerical_2024,
+ title = {Numerical investigation of stabilization in the {Hybridizable} {Discontinuous} {Galerkin} method for linear anisotropic elastic equation},
+ url = {https://hal.science/hal-04503407},
+ doi = {10.1016/j.cma.2024.117080},
+ journal = {Computer Methods in Applied Mechanics and Engineering},
+ author = {Pham, Ha and Faucher, Florian and Barucq, Hélène},
+ month = jun,
+ year = {2024},
+ note = {Publisher: Elsevier},
+ pages = {117080},
+}
+
+@unpublished{mary_error_2024,
+ title = {Error analysis of matrix multiplication with narrow range floating-point arithmetic},
+ url = {https://hal.science/hal-04671474},
+ author = {Mary, Théo and Mikaitis, Mantas},
+ month = aug,
+ year = {2024},
+ keywords = {GPUs, floating-point arithmetic, matrix multiplication, mixed precision, multiword arithmetic, overflow, reduced precision, rounding error analysis, scaling, underflow},
+}
+
+@unpublished{mary_error_2024-1,
+ title = {Error analysis of the {Gram} low-rank approximation (and why it is not as unstable as one may think)},
+ url = {https://hal.science/hal-04554516},
+ author = {Mary, Théo},
+ month = apr,
+ year = {2024},
+ keywords = {Gram matrix, eigenvalue decomposition, finite precision arithmetic, iterative refinement, low-rank approximation, mixed precision, rounding error analysis, singular value decomposition},
+}
+
+@unpublished{prudhomme_ktirio_2024,
+ title = {Ktirio {Urban} {Building}: {A} {Computational} {Framework} for {City} {Energy} {Simulations} {Enhanced} by {CI}/{CD} {Innovations} on {EuroHPC} {Systems}},
+ url = {https://hal.science/hal-04590586},
+ author = {Prud'Homme, Christophe and Chabannes, Vincent and Berti, Luca and Maslek, Maryam and Pincon, Philippe and Cladellas, Javier and Diallo, Abdoulaye},
+ month = may,
+ year = {2024},
+ keywords = {City Energy Simulation, HPC, HPC HPCOps Urban building City Energy Simulation, HPCOps, Urban building},
+}
+
+@misc{belieres--frendo_volume-preserving_2024,
+ title = {Volume-preserving geometric shape optimization of the {Dirichlet} energy using variational neural networks},
+ url = {http://arxiv.org/abs/2407.19064},
+ abstract = {In this work, we explore the numerical solution of geometric shape optimization problems using neural network-based approaches. This involves minimizing a numerical criterion that includes solving a partial differential equation with respect to a domain, often under geometric constraints like constant volume. Our goal is to develop a proof of concept using a flexible and parallelizable methodology to tackle these problems. We focus on a prototypal problem: minimizing the so-called Dirichlet energy with respect to the domain under a volume constraint, involving a Poisson equation in \${\textbackslash}mathbb R{\textasciicircum}2\$. We use physics-informed neural networks (PINN) to approximate the Poisson equation's solution on a given domain and represent the shape through a neural network that approximates a volume-preserving transformation from an initial shape to an optimal one. These processes are combined in a single optimization algorithm that minimizes the Dirichlet energy. One of the significant advantages of this approach is its parallelizable nature, which makes it easy to handle the addition of parameters. Additionally, it does not rely on shape derivative or adjoint calculations. Our approach is tested on Dirichlet and Robin boundary conditions, parametric right-hand sides, and extended to Bernoulli-type free boundary problems. The source code for solving the shape optimization problem is open-source and freely available.},
+ urldate = {2024-09-17},
+ publisher = {arXiv},
+ author = {Bélières--Frendo, Amaury and Franck, Emmanuel and Michel-Dansac, Victor and Privat, Yannick},
+ month = aug,
+ year = {2024},
+ note = {arXiv:2407.19064 [cs, math]},
+ keywords = {Mathematics - Numerical Analysis, Mathematics - Optimization and Control},
+}
+
+@article{hecht_new_2012,
+ title = {New development in {FreeFem}++},
+ volume = {20},
+ issn = {1570-2820},
+ url = {https://freefem.org/},
+ number = {3-4},
+ journal = {Journal of Numerical Mathematics},
+ author = {Hecht, F.},
+ year = {2012},
+ mrnumber = {3043640},
+ pages = {251--265},
+}
+
+@inproceedings{gamblin_spack_2015,
+ address = {Austin Texas},
+ title = {The {Spack} package manager: bringing order to {HPC} software chaos},
+ isbn = {978-1-4503-3723-6},
+ shorttitle = {The {Spack} package manager},
+ url = {https://dl.acm.org/doi/10.1145/2807591.2807623},
+ doi = {10.1145/2807591.2807623},
+ language = {en},
+ urldate = {2024-09-05},
+ booktitle = {Proceedings of the {International} {Conference} for {High} {Performance} {Computing}, {Networking}, {Storage} and {Analysis}},
+ publisher = {ACM},
+ author = {Gamblin, Todd and LeGendre, Matthew and Collette, Michael R. and Lee, Gregory L. and Moody, Adam and De Supinski, Bronis R. and Futral, Scott},
+ month = nov,
+ year = {2015},
+ pages = {1--12},
+}
+
+@article{vallet_toward_2022,
+ title = {Toward practical transparent verifiable and long-term reproducible research using {Guix}},
+ volume = {9},
+ issn = {2052-4463},
+ url = {https://www.nature.com/articles/s41597-022-01720-9},
+ doi = {10.1038/s41597-022-01720-9},
+ abstract = {Abstract
+ Reproducibility crisis urge scientists to promote transparency which allows peers to draw same conclusions after performing identical steps from hypothesis to results. Growing resources are developed to open the access to methods, data and source codes. Still, the computational environment, an interface between data and source code running analyses, is not addressed. Environments are usually described with software and library names associated with version labels or provided as an opaque container image. This is not enough to describe the complexity of the dependencies on which they rely to operate on. We describe this issue and illustrate how open tools like Guix can be used by any scientist to share their environment and allow peers to reproduce it. Some steps of research might not be fully reproducible, but at least, transparency for computation is technically addressable. These tools should be considered by scientists willing to promote transparency and open science.},
+ language = {en},
+ number = {1},
+ urldate = {2024-09-05},
+ journal = {Scientific Data},
+ author = {Vallet, Nicolas and Michonneau, David and Tournier, Simon},
+ month = oct,
+ year = {2022},
+ pages = {597},
+}
+
+@techreport{adams_dakota_2022,
+ title = {Dakota, {A} {Multilevel} {Parallel} {Object}-{Oriented} {Framework} for {Design} {Optimization}, {Parameter} {Estimation}, {Uncertainty} {Quantification}, and {Sensitivity} {Analysis}: {Version} 6.16 {User}’s {Manual}},
+ number = {SAND2022-6171},
+ institution = {Sandia National Laboratories},
+ author = {Adams, B. M. and Bohnhoff, W. J. and Dalbey, K. R. and Ebeida, M. S. and Eddy, J. P. and Eldred, M. S. and Hooper, R. W. and Hough, P. D. and Hu, K. T. and Jakeman, J. D. and Khalil, M. and Maupin, K. A. and Monschke, J. A. and Ridgway, E. M. and Rushdi, A. A. and Seidl, D. T. and Stephens, J. A. and Swiler, L. P. and Winokur, J. G.},
+ month = may,
+ year = {2022},
+}
+
+@article{faucher_hawen_2021,
+ title = {hawen: time-harmonic wave modeling and inversion using hybridizable discontinuous {Galerkin} discretization},
+ volume = {6},
+ copyright = {http://creativecommons.org/licenses/by/4.0/},
+ issn = {2475-9066},
+ shorttitle = {hawen},
+ url = {https://joss.theoj.org/papers/10.21105/joss.02699},
+ doi = {10.21105/joss.02699},
+ number = {57},
+ urldate = {2024-09-05},
+ journal = {Journal of Open Source Software},
+ author = {Faucher, Florian},
+ month = jan,
+ year = {2021},
+ pages = {2699},
+}
+
+@misc{apptainer_contributors_apptainer_2024,
+ title = {Apptainer {User} {Documentation}},
+ url = {https://apptainer.org/docs},
+ author = {{Apptainer Contributors}},
+ year = {2024},
+}
+
+@book{slurm_development_team_slurm_2024,
+ title = {{SLURM} {Workload} {Manager}},
+ url = {https://slurm.schedmd.com/documentation.html},
+ author = {{SLURM Development Team}},
+ year = {2024},
+}
+
+@misc{karakasis_reframe-hpcreframe_2024,
+ title = {reframe-hpc/reframe: {ReFrame} 4.6.0},
+ url = {https://doi.org/10.5281/zenodo.11002528},
+ publisher = {Zenodo},
+ author = {Karakasis, Vasileios and Manitaras, Theofilos and Otero, Javier and Koutsaniti, Eirini and {jgp} and {rsarm} and Bignamini, Christopher and {victorusu} and Jocksch, Andreas and {kraushm} and {lucamar} and Keller, Sebastian and Omlin, Samuel and Kliavinek, Sergei and Mendonça, Henrique and Giordano, Mosè and {MarkLTurner} and {GiuseppeLoRe} and Grassano, Davide and Boissonneault, Maxime and Leak, Steve and Paipuri, Mahendra and {jfavre} and {Vanessasaurus} and Morrison, Jack and Moors, Sam and You, Zhi-Qiang and Sandgren, Ake and {brandon-biggs}},
+ month = apr,
+ year = {2024},
+ doi = {10.5281/zenodo.11002528},
+}
+
+@article{ballout_nonlinear_2024,
+ title = {Nonlinear compressive reduced basis approximation for multi-parameter elliptic problem},
+ copyright = {Creative Commons Attribution 4.0 International},
+ url = {https://zenodo.org/doi/10.5281/zenodo.13336083},
+ doi = {10.5281/ZENODO.13336083},
+ urldate = {2024-09-04},
+ author = {Ballout, Hassan and Maday, Yvon and Prud'homme, Christophe},
+ month = aug,
+ year = {2024},
+ note = {Publisher: Zenodo
+Version Number: v1.1.0},
+}
+
+@inproceedings{balay_efficient_1997,
+ title = {Efficient {Management} of {Parallelism} in {Object} {Oriented} {Numerical} {Software} {Libraries}},
+ booktitle = {Modern {Software} {Tools} in {Scientific} {Computing}},
+ publisher = {Birkhäuser Press},
+ author = {Balay, Satish and Gropp, William D. and McInnes, Lois Curfman and Smith, Barry F.},
+ editor = {Arge, E. and Bruaset, A. M. and Langtangen, H. P.},
+ year = {1997},
+ pages = {163--202},
+}
+
+@article{zhang_petscsf_2022,
+ title = {The {PetscSF} {Scalable} {Communication} {Layer}},
+ volume = {33},
+ number = {4},
+ journal = {IEEE Transactions on Parallel and Distributed Systems},
+ author = {Zhang, Junchao and Brown, Jed and Balay, Satish and Faibussowitsch, Jacob and Knepley, Matthew and Marin, Oana and Mills, Richard Tran and Munson, Todd and Smith, Barry F. and Zampini, Stefano},
+ year = {2022},
+ pages = {842--853},
+}
+
+@article{dalcin_parallel_2011,
+ title = {Parallel distributed computing using {Python}},
+ volume = {34},
+ issn = {0309-1708},
+ doi = {10.1016/j.advwatres.2011.04.013},
+ number = {9},
+ journal = {Advances in Water Resources},
+ author = {Dalcin, Lisandro D. and Paz, Rodrigo R. and Kler, Pablo A. and Cosimo, Alejandro},
+ year = {2011},
+ pages = {1124 -- 1139},
+}
+
+@techreport{balay_petsctao_2024,
+ title = {{PETSc}/{TAO} {Users} {Manual}},
+ number = {ANL-21/39 - Revision 3.21},
+ institution = {Argonne National Laboratory},
+ author = {Balay, Satish and Abhyankar, Shrirang and Adams, Mark F. and Benson, Steven and Brown, Jed and Brune, Peter and Buschelman, Kris and Constantinescu, Emil and Dalcin, Lisandro and Dener, Alp and Eijkhout, Victor and Faibussowitsch, Jacob and Gropp, William D. and Hapla, Václav and Isaac, Tobin and Jolivet, Pierre and Karpeev, Dmitry and Kaushik, Dinesh and Knepley, Matthew G. and Kong, Fande and Kruger, Scott and May, Dave A. and McInnes, Lois Curfman and Mills, Richard Tran and Mitchell, Lawrence and Munson, Todd and Roman, Jose E. and Rupp, Karl and Sanan, Patrick and Sarich, Jason and Smith, Barry F. and Zampini, Stefano and Zhang, Hong and Zhang, Hong and Zhang, Junchao},
+ year = {2024},
+ doi = {10.2172/2205494},
+}
+
+@misc{ootomo_dgemm_2024,
+ title = {{DGEMM} on {Integer} {Matrix} {Multiplication} {Unit}},
+ url = {http://arxiv.org/abs/2306.11975},
+ abstract = {Deep learning hardware achieves high throughput and low power consumption by reducing computing precision and specializing in matrix multiplication. For machine learning inference, fixed-point value computation is commonplace, where the input and output values and the model parameters are quantized. Thus, many processors are now equipped with fast integer matrix multiplication units (IMMU). It is of significant interest to find a way to harness these IMMUs to improve the performance of HPC applications while maintaining accuracy. We focus on the Ozaki scheme, which computes a high-precision matrix multiplication by using lower-precision computing units, and show the advantages and disadvantages of using IMMU. The experiment using integer Tensor Cores shows that we can compute double-precision matrix multiplication faster than cuBLAS and an existing Ozaki scheme implementation on FP16 Tensor Cores on NVIDIA consumer GPUs. Furthermore, we demonstrate accelerating a quantum circuit simulation by up to 4.33 while maintaining the FP64 accuracy.},
+ urldate = {2024-06-28},
+ publisher = {arXiv},
+ author = {Ootomo, Hiroyuki and Ozaki, Katsuhisa and Yokota, Rio},
+ month = mar,
+ year = {2024},
+ note = {arXiv:2306.11975 [cs]},
+ keywords = {Computer Science - Distributed, Parallel, and Cluster Computing},
+}
+
+@inproceedings{haidar_harnessing_2018,
+ address = {Dallas, TX, USA},
+ title = {Harnessing {GPU} {Tensor} {Cores} for {Fast} {FP16} {Arithmetic} to {Speed} up {Mixed}-{Precision} {Iterative} {Refinement} {Solvers}},
+ isbn = {978-1-5386-8384-2},
+ url = {https://ieeexplore.ieee.org/document/8665777/},
+ doi = {10.1109/SC.2018.00050},
+ urldate = {2024-06-28},
+ booktitle = {{SC18}: {International} {Conference} for {High} {Performance} {Computing}, {Networking}, {Storage} and {Analysis}},
+ publisher = {IEEE},
+ author = {Haidar, Azzam and Tomov, Stanimire and Dongarra, Jack and Higham, Nicholas J.},
+ month = nov,
+ year = {2018},
+ pages = {603--613},
+}
diff --git a/sections/abbreviations.tex b/sections/abbreviations.tex
index b74f1c6..2941d35 100644
--- a/sections/abbreviations.tex
+++ b/sections/abbreviations.tex
@@ -10,8 +10,10 @@ \section*{List of Abbreviations}
\begin{acronym}[ABCDEF]
\acro{exama}[\textsc{Exa-MA}]{Methods and Algorithms for Exascale Computing}
\acro{DoA}{Description of Action}
- \acro{EC}{European Commission}
+ \acro{EC}{European Commission}
+ \acro{GENCI}{Grand Equipement National de Calcul Intensif}
\acro{FAIR}{Findable, Accessible, Interoperable, and Reusable}
+ \acro{feelpp}[\Feelpp]{Finite Element Embedded Language in \Cpp{}}
\acro{WP}{Work Package}
%% Bottlenecks for Exa-MA
\acro{B1}{Energy Efficiency \acroextra{: Develop energy-efficient technologies to meet the 20 MW target for exascale systems.}}
@@ -34,6 +36,19 @@ \section*{List of Abbreviations}
\acro{O4}{Objective 4 \acroextra{: Enable AI algorithms to achieve exascale performance by leveraging the methods (O1) and libraries (O2) developed in the project.}}
\acro{O5}{Objective 5 \acroextra{: Provide demonstrators through mini-apps and proxy-apps that will be openly available and benchmarked to showcase exascale readiness.}}
+ \acro{specx}[SPECX]{SPECX runtime environment}
+ \acro{DSEL}{Domain Specific Embedded Language}
+ \acro{DSL}{Domain Specific Language}
+ \acro{RB}{Reduced Basis methods}
+ \acro{NIRB}{Non-Intrusive Reduced Basis methods}
+ \acro{CRB}{Certified Reduced Basis methods}
+ \acro{ROM}{Reduced Order Models}
+ \acro{POD}{Proper Orthogonal Decomposition}
+ \acro{PGD}{Proper Generalized Decomposition}
+ \acro{FEM}{Finite Element Method}
+ \acro{FVM}{Finite Volume Method}
+ \acro{FDM}{Finite Difference Method}
+ \acro{CFD}{Computational Fluid Dynamics}
\end{acronym}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
diff --git a/sections/appendix-a.tex b/sections/appendix-a.tex
index d503b18..d88aaee 100644
--- a/sections/appendix-a.tex
+++ b/sections/appendix-a.tex
@@ -4,24 +4,185 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\clearpage
-\section*{Appendix A}
-\addcontentsline{toc}{section}{Appendix A}
-\label{sec:appendix-a}
+\appendix
+\section*{Appendix A. Computing and Data Storage Infrastructures}
+\addcontentsline{toc}{section}{Appendix A. Computing and Data Storage Infrastructures}
+\label{sec:app:architectures}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Section content, please change!
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\subsection*{Appendix A.1}
-\label{sec:appendix-a1}
-\addcontentsline{toc}{subsection}{Appendix A.1}
+\subsection*{EuroHPC JU}
+\label{sec:arch:eurohpc-ju}
+\addcontentsline{toc}{subsection}{EuroHPC JU}
-\lipsum[1]
+EuroHPC JU is a joint undertaking between the European Union, European countries, and private partners to develop and deploy (pre-)exascale supercomputers in Europe.
+The EuroHPC JU is responsible for the procurement and operation of these supercomputers and has selected eight sites in eight countries to host them. The systems were expected to be operational by 2022 and to deliver a combined peak performance of over 200 Petaflops. They serve a wide range of applications, including weather forecasting, climate modeling, drug discovery, and materials science, and also support the European Open Science Cloud, a pan-European data infrastructure for research.
+The next two supercomputers, expected in 2024 (Germany, Jülich) and 2025 (France, TGCC), will host the first exascale systems in Europe.
-\subsection*{Appendix A.2}
-\label{sec:appendix-a2}
-\addcontentsline{toc}{subsection}{Appendix A.2}
+In~\cref{tab:eurohpc_flops_cpu_gpu} and~\cref{tab:eurohpc_storage_interconnect} below, the peak performance (FLOPS) is given in Petaflops and the storage in Petabytes.
+
+\begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l l l}
+ \toprule
+ \textbf{Name} & \textbf{FLOPS} & \textbf{CPU} & \textbf{Cores} & \textbf{GPU} \\
+ & & \textbf{/node} & \textbf{/node} & \\
+ \midrule
+ LUMI & 386 & 2x AMD EPYC 7763 & 128 & AMD Instinct \\
+ LEONARDO & 249 & Intel Ice-Lake (Booster) & - & NVIDIA Ampere \\
+ & & Intel Sapphire Rapids (Data-centric) & & \\
+ MARENOSTRUM5 & 215 & Intel Sapphire Rapids (GPP, ACC) & - & NVIDIA Hopper \\
+ & & Intel Emerald Rapids (NGT ACC) & & Intel Rialto Bridge\\
+ & & NVIDIA Grace (NGT GPP) & & \\
+ MELUXINA & 12.81 & 2x AMD EPYC 7H12 & 128 & NVIDIA Ampere \\
+ KAROLINA & 9.59 & 2x AMD EPYC 7H12 & 128 & NVIDIA A100 \\
+ DISCOVERER & 4.52 & 2x AMD EPYC 7H12 & 128 & None \\
+ VEGA & 6.92 & 2x AMD EPYC 7H12 & 128 & NVIDIA A100 \\
+ DEUCALION & 3.96 & ARM A64FX & 1632 & \\
+ & & AMD EPYC 7763 & & \\
+ & & & & NVIDIA Ampere \\
+ \bottomrule
+ \end{tabular}
+    \caption{EuroHPC Systems and Their Features (FLOPS, CPU, Cores/node, GPU)}
+ \label{tab:eurohpc_flops_cpu_gpu}
+\end{table}
+
+\begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l l}
+ \toprule
+ \textbf{Name} & \textbf{Storage (PB)} & \textbf{Interconnect} \\
+ \midrule
+ LUMI & 117 & Slingshot-11 @ 200 Gb/s \\
+ LEONARDO & 100+ & - \\
+ MARENOSTRUM 5 & 248 & - \\
+ MELUXINA & 20 & InfiniBand @ 200 Gb/s \\
+ KAROLINA & 1 & InfiniBand @ 200 Gb/s \\
+ DISCOVERER & 2 & InfiniBand @ 200 Gb/s \\
+ VEGA & 24 & InfiniBand @ 200 Gb/s \\
+ DEUCALION & 11 & - \\
+ \bottomrule
+ \end{tabular}
+ \caption{EuroHPC Systems Storage and Interconnect}
+ \label{tab:eurohpc_storage_interconnect}
+\end{table}
+
+For more details check out~\cite{eurohpc_supercomputers}.
+
+%%
+%% Genci
+%%
+\subsection*{Genci (France)}
+\label{sec:arch:genci}
+\addcontentsline{toc}{subsection}{Genci}
+
+\ac{GENCI} is the French national high-performance computing agency. It provides access to supercomputers and high-performance computing resources for French researchers and their international collaborators. The agency operates three supercomputers, Adastra, Jean Zay, and Joliot-Curie, hosted at the CINES, IDRIS, and TGCC computing centers, respectively; they are described in Tables~\ref{tab:genci_flops_cpu_gpu} and~\ref{tab:genci_storage_interconnect}.
+For more details, check out:
+\begin{description}
+ \item[Cines] \fullcite{genci_cines}
+ \item[Idris] \fullcite{genci_idris}
+ \item[TGCC] \fullcite{genci_tgcc}
+\end{description}
+
+
+ \begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l l}
+ \toprule
+ \textbf{Name} & \textbf{FLOPS} & \textbf{CPU} & \textbf{GPU} \\
+ \midrule
+ CINES (Adastra) & 3.5 & 2x AMD EPYC Genoa (Genoa) & None \\
+ & & & GPU: AMD EPYC Trento (MI250x) \\
+ & & & APU: AMD MI300A (MI300A) \\
+ & & AMD EPYC Genoa (HPDA) & \\
+ IDRIS (Jean Zay) & 28 & CSL: Intel Cascade Lake & \\
+ & & & Nvidia V100 SXM2 \\
+ & & AMD EPYC Milan & Nvidia A100 SXM4\\
+ & & Intel Sapphire Rapids & Nvidia H100 SXM5 \\
+ TGCC (Joliot-Curie) & 22 & Rome: 2x AMD EPYC & None \\
+ & & SKL: Intel Skylake & \\
+ & & Intel Cascade Lake & NVIDIA V100 \\
+ \bottomrule
+ \end{tabular}
+ \caption{GENCI Systems - FLOPS, CPU, and GPU}
+ \label{tab:genci_flops_cpu_gpu}
+ \end{table}
+
+
+ \begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l}
+ \toprule
+ \textbf{Name} & \textbf{Storage (PB)} & \textbf{Interconnect} \\
+ \midrule
+ CINES (Adastra) & 1.9 & Slingshot \\
+ IDRIS (Jean Zay) & 50+ & Infiniband \\
+ TGCC (Joliot-Curie) & 100+ & Infiniband \\
+ \bottomrule
+ \end{tabular}
+ \caption{GENCI Systems - Storage and Interconnect}
+ \label{tab:genci_storage_interconnect}
+ \end{table}
+
+\subsection*{CNRS-Unistra / IRMA / Gaya (France)}
+\label{sec:arch:gaya}
+\addcontentsline{toc}{subsection}{Gaya}
+
+The Gaya system is located at the University of Strasbourg, France, and is described in Tables~\ref{tab:gaya_flops_cpu_gpu} and~\ref{tab:gaya_storage_interconnect}. It supports large-scale simulations and data analysis tasks on up to 6 nodes and 768 cores, and includes 3 AMD Instinct MI210 GPUs for development and experiments.
+The front-end node has 96 cores and 0.5 TB of RAM and is used for generating large-scale meshes and the associated partitioning.
+
+\begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l l l l}
+ \toprule
+    \textbf{Name} & \textbf{Nodes} & \textbf{Memory (GB)} & \textbf{CPU} & \textbf{Cores} & \textbf{GPU} \\
+ & & \textbf{/ node} & \textbf{/ node} & \textbf{/ node} & \\
+ \midrule
+ Gaya & 6 & 512 & 2x AMD EPYC 7713 & 128 & \\
+ & 1 & 256 & 2x AMD EPYC 7313 & 32 & 3x AMD Instinct MI210 \\
+ \bottomrule
+ \end{tabular}
+    \caption{Gaya System - Nodes, Memory/node (GB), CPU, Cores/node, and GPU}
+ \label{tab:gaya_flops_cpu_gpu}
+\end{table}
+
+ \begin{table}[!ht]
+ \centering
+ \begin{tabular}{l l l}
+ \toprule
+ \textbf{Name} & \textbf{Storage (PB)} & \textbf{Interconnect} \\
+ \midrule
+ Gaya & 0.1+ & Infiniband \\
+ \bottomrule
+ \end{tabular}
+ \caption{Gaya System - Storage and Interconnect}
+ \label{tab:gaya_storage_interconnect}
+ \end{table}
+
+\subsection*{CNRS-Unistra / IRMA / Girder (France)}
+\label{sec:arch:girder:unistra}
+\addcontentsline{toc}{subsection}{Data Storage: girder.math.unistra.fr}
+
+A Girder data management system is hosted at the University of Strasbourg, France.
+It provides 150 TB of storage and hosts both public and private data collections.
+
+Check out \url{https://girder.math.unistra.fr} for more details.
+
+\subsection*{Zenodo (EU)}
+\label{sec:arch:zenodo}
+\addcontentsline{toc}{subsection}{Data Storage: zenodo.org}
+
+Zenodo is a general-purpose open-access repository developed under the European OpenAIRE program and operated by CERN.
+It allows researchers to deposit data sets, research software, reports, and any other research-related digital artifacts.
+Zenodo assigns a DOI to each uploaded artifact, making it citable in scientific publications.
+Finally, GitHub integration is available to automatically upload releases to Zenodo; this feature is used to store the NumPEx software releases.
+
+Check out \url{https://zenodo.org/communities/numpex} for more details.
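+
+As an illustrative sketch of this integration (assuming the Zenodo webhook is enabled for the repository and the GitHub CLI \texttt{gh} is available; the tag and messages below are placeholders), a tagged GitHub release that Zenodo then archives under a DOI can be created as follows:
+
+\begin{minted}{bash}
+# Tag the commit to be archived (placeholder version number).
+git tag -a v0.1.0 -m "Snapshot to be archived on Zenodo"
+git push origin v0.1.0
+
+# Create the corresponding GitHub release; with the Zenodo integration
+# enabled, Zenodo picks up the release and mints a DOI for it.
+gh release create v0.1.0 --title "v0.1.0" --notes "Snapshot archived on Zenodo"
+\end{minted}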
-\lipsum[1]
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
diff --git a/sections/conclusions.tex b/sections/conclusions.tex
index 774ce50..0d3a3d4 100644
--- a/sections/conclusions.tex
+++ b/sections/conclusions.tex
@@ -7,10 +7,56 @@
\section{Conclusions}
\label{sec:conclusions}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%% Section content, please change!
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+In this deliverable, we have presented the initial benchmarking efforts and methodologies developed within the Exa-MA project, a component of the PEPR NumPEx program.
+The Exa-MA project aims to advance high-performance computing towards and beyond the exascale barrier by developing relevant numerical methods and ensuring that software is production-ready for exascale computing by the project's conclusion.
+
+We introduced a benchmarking methodology designed to address the challenges and bottlenecks associated with exascale computing.
+This methodology focuses on several critical aspects:
+
+\begin{itemize}
+ \item \textbf{Development of Numerical Methods and Algorithms}: Advancing state-of-the-art methods optimized for exascale architectures.
+ \item \textbf{Testing and Validation Processes}: Establishing structured non-regression testing, verification, and validation to ensure correctness and performance integrity.
+ \item \textbf{Benchmarking Strategy}: Implementing a phased approach to measure performance, scalability, and energy efficiency on advanced computational architectures.
+ \item \textbf{Data Management and I/O Strategies}: Addressing I/O bottlenecks through efficient data management techniques.
+ \item \textbf{Profiling and Measurement Tools}: Utilizing advanced tools to gather detailed performance insights.
+ \item \textbf{Containerization and Packaging}: Ensuring reproducible and portable execution environments.
+ \item \textbf{Continuous Integration and Deployment (CI/CD)}: Maintaining reproducibility and sustained high performance through continuous benchmarking and regression testing.
+ \item \textbf{Fault Tolerance Strategies}: Enhancing system resilience and data integrity.
+\end{itemize}
+
+The document provided an overview of the software packages developed within Exa-MA, highlighting their features, parallel capabilities, and the technologies employed. General statistics offered insights into their characteristics and technological choices, emphasizing the diversity and adaptability of the project's software components.
+
+In the work package chapters (WP1 to WP6), we presented software relevant to each focus area, detailing the numerical methods and algorithms they implement, parallel capabilities, benchmarks and metrics developed, challenges identified, and a 12-month roadmap for each.
+This approach ensures that all aspects of exascale computing, from discretization to uncertainty quantification, are addressed cohesively.
+
+The initial benchmarking results capture the current state of development of exascale-ready numerical methods and software tools.
+The challenges identified during this process have informed our future work.
+Depending on the software, the presentation of the results varies, and the coverage is in general unbalanced, not only within each work package but also across work packages.
+We do not consider this a problem, as the various software packages are at different stages of development and benchmarking, and not all benchmarking results are readily available yet.
+However, during the coming months we should strive to set up the methodology for all software so that current results can be presented at any moment.
+The next release of this document should offer a more balanced and systematic presentation of the results within and across the work packages.
+
+
+\section*{Future Work}
+
+Looking ahead, the Exa-MA project will continue to refine and expand upon the methodologies and tools presented in this deliverable.
+Our future efforts will focus on:
+
+\begin{itemize}
+ \item \textbf{Incorporating new methods and algorithms}: Developing and integrating novel numerical methods and algorithms optimized for exascale architectures.
+ \item \textbf{Enhancing Scalability and Performance}: Further optimizing numerical methods and software tools to fully leverage emerging exascale platforms, ensuring efficient utilization of computational resources.
+
+ \item \textbf{Extending Benchmarking Metrics}: Incorporating new metrics and evaluation criteria relevant to exascale computing, such as energy consumption, resilience, and data movement efficiency.
+ \item \textbf{Fault Tolerance Mechanisms}: Implementing fault tolerance strategies to maintain resilience in the face of hardware and software failures common in exascale environments.
+ \item \textbf{Strengthening Community Collaboration}: Engaging with the broader HPC community to share insights, tools, and best practices, fostering a collaborative ecosystem that accelerates progress.
+ \item \textbf{Addressing Identified Challenges}: Focusing research and development efforts on overcoming the specific bottlenecks and challenges identified in this initial benchmarking phase.
+\end{itemize}
+
+\section*{Final Remarks}
+
+This deliverable marks a significant step towards advancing exascale computing by providing a structured approach to the development and benchmarking of numerical methods, algorithms, and software. By adhering to the proposed benchmarking methodology and continuously improving our approaches, we aim to make substantial contributions to the field of high-performance computing.
+
+The progress achieved thus far for some of the software packages demonstrates the feasibility and importance of a methodical approach to overcoming the challenges of exascale computing.
-Each deliverable should end with conclusions and plans for further work.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
diff --git a/sections/introduction.tex b/sections/introduction.tex
index 59ad6d3..e9b153e 100644
--- a/sections/introduction.tex
+++ b/sections/introduction.tex
@@ -14,23 +14,45 @@ \section{Introduction}
\subsection{Purpose and Scope of the Document}
\label{sec:purpose}
-The Exa-MA project, part of the PEPR NumPEx program funded under France’s Plan d’Investissement d’Avenir, is dedicated to advancing the frontier of high-performance computing towards and beyond the exascale barrier.
-This deliverable, D7.1, represents the first in a series of annual benchmarking reports designed to evaluate the performance of key software tools and libraries that will underpin the next generation of scientific computing.
-Exascale computing promises significant breakthroughs across a wide range of disciplines, from climate science to materials physics, by enabling simulations and analyses at unprecedented scales and resolutions.
-However, realizing the full potential of exascale computing requires overcoming substantial challenges in software architecture, scalability, and efficiency.
-Benchmarking plays a crucial role in this context by providing a rigorous framework to measure, analyze, and optimize the performance of computational tools on state-of-the-art hardware architectures.
+The Exa-MA project, part of the PEPR NumPEx program funded under France’s Plan d’Investissement d’Avenir, is dedicated to advancing the frontier of high-performance computing towards and beyond the exascale barrier. This deliverable, D7.1, represents the first in a series of annual benchmarking reports designed to evaluate the performance of key software tools and libraries that will underpin the next generation of scientific computing.
-In this deliverable, we will assess several key software components on their ability to leverage advanced computational architectures (CPU, GPU, and Hybrid systems) effectively.
-Our focus is on elucidating each software’s features, its parallel computing capabilities, and initial performance metrics as these tools move towards exascale readiness.
-This evaluation will inform ongoing development efforts and help streamline the transition of these tools to fully exploit the capabilities of emerging exascale systems.
+Exascale computing promises significant breakthroughs across a wide range of disciplines, from climate science to materials physics, by enabling simulations and analyses at unprecedented scales and resolutions. However, realizing the full potential of exascale computing requires overcoming substantial challenges in software architecture, scalability, efficiency, and resilience. Benchmarking plays a crucial role in this context by providing a rigorous framework to measure, analyze, and optimize the performance of computational tools on state-of-the-art hardware architectures.
-Through testing and analysis, we aim to ensure that the Exa-MA suite of tools not only meets the high computational demands of future exascale applications but also adheres to the principles of energy efficiency, scalability, and robustness required in the exascale era.
+In this deliverable, we present a benchmarking methodology designed for developing and evaluating software tailored to exascale computing environments. This methodology addresses key bottlenecks identified in exascale systems, including interconnect technology, memory hierarchy, data management, exascale algorithms, and reproducibility challenges. The methodology integrates several core components:
+
+\begin{itemize}
+ \item \textbf{Testing and Validation Processes:} Establishing structured non-regression testing, verification, and validation to ensure software correctness and performance integrity across updates.
+ \item \textbf{Benchmarking Strategy:} Implementing a phased benchmarking approach to measure performance, scalability, and energy efficiency on advanced computational architectures (CPU, GPU, and hybrid systems).
+ \item \textbf{Data Management and I/O Strategies:} Addressing I/O bottlenecks through efficient data management techniques, including leveraging high-performance parallel file systems and advanced data compression methods.
+ \item \textbf{Profiling and Measurement Tools:} Utilizing advanced profiling tools such as EZTrace, Extrae, Score-P, TAU, Vampir, and Nsight to gather detailed performance insights across different architectures.
+ \item \textbf{Containerization and Packaging:} Employing packaging and containerization technologies like Spack, Guix-HPC, Docker, and Apptainer/Singularity to ensure reproducible and portable execution environments.
+ \item \textbf{Continuous Integration and Deployment (CI/CD):} Integrating continuous benchmarking and regression testing into CI/CD pipelines to maintain reproducibility, portability, and sustained high performance across different HPC systems.
+ \item \textbf{Fault Tolerance Strategies:} Developing fault tolerance mechanisms to enhance system resilience and data integrity, including checkpoint/restart techniques and advanced fault-tolerant I/O frameworks.
+\end{itemize}
+
+Furthermore, we provide a list of software components developed within the Exa-MA project. These software tools are essential for enabling exascale applications and have been chosen for their potential impact and alignment with the project's objectives. We present general statistics about these software packages, including their supported hardware architectures (CPU, GPU, and hybrid systems), programming languages, parallel computing technologies, data formats, and DevOps practices such as continuous integration, testing, and deployment. This overview offers insights into the diversity and technological choices within the project, highlighting the widespread usage of various technologies and the commitment to quality and maintainability.
+
+Through rigorous testing and analysis, and by employing our benchmarking methodology, we aim to ensure that the Exa-MA suite of tools not only meets the high computational demands of future exascale applications but also adheres to the principles of energy efficiency, scalability, robustness, and reproducibility required in the exascale era. Our methodology is key to ensuring the long-term maintainability and sustainability of this effort, as it establishes standardized practices and frameworks that facilitate ongoing development and adaptation to evolving technologies. By documenting our methodology and sharing our findings, we contribute to the broader HPC community’s efforts in overcoming the challenges associated with exascale computing while promoting sustainable and maintainable software practices.
\subsection{Structure of the Document}
\label{sec:structure}
-This document is organised as follows: \Cref{sec:introduction} provides information about the report content whereas \Cref{sec:guidelines} introduces guides for writing \projacronym{} deliverables. The deliverable is concluded in \Cref{sec:conclusions}. Finally, additional information is provided in the \nameref{sec:appendix-a} and \nameref{sec:appendix-b}.
-
+This document is organized as follows:
+
+\begin{itemize}
+ \item \Cref{chap:introduction} introduces the purpose and scope of the deliverable, providing an overview of the Exa-MA project.
+ \item \Cref{chap:methodology} presents the benchmarking methodology that will be developed for exascale software, detailing the key objectives, testing processes, data management strategies, fault tolerance mechanisms, and the use of demonstrators.
+ \item \Cref{chap:software} provides a general overview of the software developed within Exa-MA, focusing on their features, mathematical foundations, functionalities, relevant publications, acknowledgments, and contact details. It includes general statistics about the software, offering insights into their characteristics and technological choices, such as supported architectures, programming languages, parallelism technologies, data formats, and DevOps practices.
+ \item \Cref{chap:wp1} (\textbf{WP1 - Discretization}) presents software relevant to the discretization methods developed within this work package. For each software, we detail their features, the methods they implement, parallel capabilities, benchmarks and metrics developed, challenges identified, and provide a 12-month roadmap.
+ \item \Cref{chap:wp2} (\textbf{WP2 - Model Order, Surrogate, Scientific Machine Learning Methods}) covers software related to reduced-order models, surrogate modeling, and scientific machine learning techniques tailored for exascale applications. Each software is presented with its implemented methods, parallel capabilities, associated benchmarks and metrics, challenges identified, and a 12-month roadmap.
+ \item \Cref{chap:wp3} (\textbf{WP3 - Solvers}) focuses on software implementing scalable solver algorithms suitable for exascale architectures. We discuss their features, parallel capabilities, benchmarks and metrics developed, challenges identified, and include a 12-month roadmap for each.
+ \item \Cref{chap:wp4} (\textbf{WP4 - Data Assimilation}) presents software and tools for inverse problems and data assimilation in large-scale simulations. For each software, we explore the methods they implement, parallel capabilities, benchmarks and metrics developed, challenges identified, and provide a 12-month roadmap.
+ \item \Cref{chap:wp5} (\textbf{WP5 - Optimization}) addresses software related to optimization techniques for exascale applications, including shape optimization and auto-ML tuning. Each software is presented with its features, implemented methods, parallel capabilities, benchmarks and metrics developed, challenges identified, and a 12-month roadmap.
+ \item \Cref{chap:wp6} (\textbf{WP6 - Uncertainty Quantification}) explores software for quantifying uncertainty in computational models at exascale, discussing stochastic modeling and probabilistic approaches. Each software includes details on the methods implemented, parallel capabilities, benchmarks and metrics developed, challenges identified, and a 12-month roadmap.
+ \item \Cref{chap:conclusions} concludes the deliverable, summarizing the key findings and outlining future directions for the Exa-MA project.
+ \item The \textbf{References} compiles all the bibliographic citations used throughout the document.
+ \item The \textbf{Appendices} provide additional information, including detailed descriptions of hardware architectures (\nameref{sec:app:architectures}) and supplementary materials relevant to the benchmarking activities.
+\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
diff --git a/sections/summary.tex b/sections/summary.tex
index 3da354d..4b176c7 100644
--- a/sections/summary.tex
+++ b/sections/summary.tex
@@ -8,30 +8,49 @@ \section*{Executive Summary}
\addcontentsline{toc}{section}{Executive Summary}
\label{sec:summary}
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%%% Section content, please change!
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+This deliverable, D7.1 of the Exa-MA project within the PEPR NumPEx program, is the first in a series of annual benchmarking reports aimed at advancing high-performance computing towards and beyond the exascale barrier. Exa-MA (\emph{Methods and Algorithms for Exascale}) focuses on developing relevant numerical methods and ensuring that software is production-ready for exascale computing by the end of the project.
+
+\subsection*{Summary}
+
+Exascale computing promises significant breakthroughs across various disciplines by enabling simulations and analyses at unprecedented scales and resolutions. Realizing this potential requires overcoming substantial challenges in numerical methods, algorithms, software architecture, scalability, efficiency, and resilience.
-From Wikipedia:\footnote{\url{https://en.wikipedia.org/wiki/Executive_summary}}
+This report introduces a benchmarking methodology designed for developing and evaluating numerical methods and software tailored to exascale computing environments. The methodology addresses key bottlenecks identified in exascale systems, including interconnect technology, memory hierarchy, data management, exascale algorithms, and reproducibility challenges.
+It integrates several core components:
+
+\begin{itemize}
+ \item \textbf{Development of Numerical Methods and Algorithms}: Advancing state-of-the-art methods and algorithms optimized for exascale architectures.
+ \item \textbf{Testing and Validation Processes}: Establishing structured non-regression testing, verification, and validation to ensure correctness and performance integrity across updates.
+ \item \textbf{Benchmarking Strategy}: Implementing a phased approach to measure performance, scalability, and energy efficiency on advanced computational architectures (CPU, GPU, and hybrid systems).
+ \item \textbf{Data Management and I/O Strategies}: Addressing I/O bottlenecks through efficient data management techniques.
+ \item \textbf{Profiling and Measurement Tools}: Utilizing advanced tools to gather detailed performance insights across different architectures.
+ \item \textbf{Containerization and Packaging}: Employing technologies to ensure reproducible and portable execution environments.
+ \item \textbf{Continuous Integration and Deployment (CI/CD)}: Integrating continuous benchmarking and regression testing into CI/CD pipelines to maintain reproducibility and sustained high performance.
+ \item \textbf{Fault Tolerance Strategies}: Developing mechanisms to enhance system resilience and data integrity.
+\end{itemize}
-An executive summary, or management summary, is a short document or section of a document, produced for business purposes, that summarises a longer report or proposal or a group of related reports in such a way that readers can rapidly become acquainted with a large body of material without having to read it all. It usually contains a brief statement of the problem or proposal covered in the major document(s), background information, concise analysis and main conclusions.
-It is intended as an aid to decision-making by managers and has been described as the most important part of a business plan.
+The document provides an overview of the software developed within Exa-MA, focusing on their features and parallel capabilities. General statistics offer insights into their characteristics and technological choices, such as supported architectures, programming languages, parallelism technologies, data formats, and DevOps practices.
-An executive summary differs from an abstract in that an abstract will usually be shorter and is intended to provide a neutral overview or orientation rather than being a condensed version of the full document. Abstracts are extensively used in academic research where the concept of the executive summary would be meaningless. ``An abstract is a brief summarising statement~\dots read by parties who are trying to decide whether or not to read the main document'', while ``an executive summary, unlike an abstract, is a document in miniature that may be read in place of the longer document''.
+In the work package chapters (WP1 to WP6), we present software relevant to each area, listing the numerical methods and algorithms they implement, parallel capabilities, benchmarks developed, challenges identified, and a 12-month roadmap for each. The work packages cover:
-Structure:
\begin{itemize}
- \item Be approximately 1-2 pages long
- \item Be written in language appropriate for the target audience
- \item Consist of short, concise paragraphs
- \item Begin with a summary
- \item Be written in the same order as the main report
- \item Only include material present in the main report
- \item Make recommendations
- \item Provide a justification
- \item Have a conclusion
- \item Be readable separately from the main report
- \item Sometimes summarise more than one document
+ \item \textbf{WP1 - Discretization}: Advancements in discretization methods for exascale computing.
+ \item \textbf{WP2 - Model Order, Surrogate, Scientific Machine Learning Methods}: Development of reduced-order models, surrogate modeling, and scientific machine learning techniques.
+ \item \textbf{WP3 - Solvers}: Scalable solver algorithms suitable for exascale architectures.
+ \item \textbf{WP4 - Data Assimilation}: Methods and tools for inverse problems and data assimilation in large-scale simulations.
+ \item \textbf{WP5 - Optimization}: Optimization techniques for exascale applications, including shape optimization and auto-ML tuning.
+ \item \textbf{WP6 - Uncertainty Quantification}: Methods for quantifying uncertainty in computational models at exascale.
\end{itemize}
+\subsection*{Recommendations}
+
+We recommend adopting the proposed benchmarking methodology across the Exa-MA project to ensure that the numerical methods and software tools meet the computational demands of exascale applications while adhering to principles of energy efficiency, scalability, robustness, reproducibility, maintainability, and sustainability. This methodology is key to the project's long-term success, establishing standardized practices that facilitate ongoing development and adaptation to evolving technologies.
+
+\subsection*{Justification}
+
+Implementing this methodology is crucial for overcoming the challenges associated with exascale computing. By focusing on advanced numerical methods and rigorous testing practices, the Exa-MA project can create high-quality, efficient, and sustainable tools. This approach contributes to the broader high-performance computing community by providing insights and frameworks applicable to other exascale initiatives.
+
+\subsection*{Conclusion}
+
+By developing advanced numerical methods and employing our benchmarking methodology, the Exa-MA project aims to meet the high computational demands of future exascale applications while adhering to essential principles like energy efficiency and scalability. Sharing our methodology and findings contributes to the broader HPC community's efforts to overcome exascale computing challenges and promotes sustainable practices needed for future scientific breakthroughs.
+
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ No newline at end of file
diff --git a/software/cgal/WP1/WP1.tex b/software/cgal/WP1/WP1.tex
index 2eae83f..e681a32 100644
--- a/software/cgal/WP1/WP1.tex
+++ b/software/cgal/WP1/WP1.tex
@@ -1,3 +1,4 @@
+%!TEX root = ../../../exa-ma-d7.1.tex
\section{Software: CGAL}
\label{sec:WP1:CGAL:software}
@@ -42,7 +43,30 @@ \section{Software: CGAL}
\subsection{Software Overview}
\label{sec:WP1:CGAL:summary}
-In~\cref{tab:WP1:CGAL:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+%\textbf{WP1 description from white paper:}
+
+The focus of WP1 is on revisiting discretization methodologies, particularly for large-scale models
+that incorporate multiple phenomena and process couplings. The main tasks include mesh generation, adaptive refinement,
+and the development of finite element frameworks. Additionally, emphasis is placed on creating parallelization
+strategies that can significantly enhance the scalability and overall performance of the algorithms.
+
+CGAL will serve as the foundational library providing the geometric computing building blocks to develop some of the
+relevant discretization algorithms. The focus will be on robustness, geometric fidelity and performance.
+CGAL already provides a variety of algorithms for the manipulation of geometric data.
+In particular, we will focus on two components for mesh generation given a wide range of inputs: 3D mesh generation \cite{jamin_cgalmesh_2015,alliez_3d_2024} and 3D alpha wrapping \cite{portaneri_alpha_2022,alliez_3d_2024-1}.
+
+The 3D mesh generation package relies upon filtering and refinement of 3D Delaunay triangulations, ensuring the creation of high-quality isotropic tetrahedral meshes for complex geometric domains.
+The algorithm guarantees that the mesh conforms accurately to the input surface, while respecting user-defined constraints on parameters such as size of elements, boundary approximation, and element quality. In addition to mesh generation, CGAL provides refinement strategies that adaptively improve mesh quality, while ensuring that the resulting structure remains optimized as the domain's complexity grows.
+
+
+
+Nowadays, real-world input data come from a variety of sources and sensors, and are often ill-defined for accurate representation of the domain,
+such as unorganized 3D point clouds obtained by scanning an entire city. 3D data are commonly acquired through measurements followed by shape reconstruction,
+or generated through imperfect automated processes. Consequently, such data can exhibit a wide range of defects, including gaps,
+missing elements, self-intersections, degeneracies like zero-volume structures, and non-manifold features, making accurate
+and reliable geometric processing a challenge. The 3D Alpha Wrapping component offers a way to deal with these kinds of data by providing an algorithm that creates a valid 3D triangulation that strictly encloses the input. The algorithm offers unconditional robustness to the input, i.e., the output is guaranteed to be a watertight and orientable surface triangle mesh, regardless of the input quality.
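+For illustration only, the following minimal C++ sketch shows how such a wrapping can be invoked through the public CGAL interface; it is not the driver used for the experiments reported in this section, and the function name \texttt{wrap\_of} as well as the \texttt{alpha} and \texttt{offset} values are placeholders.
+\begin{minted}{cpp}
+// Minimal sketch: wrapping a possibly defect-laden triangle mesh with the
+// CGAL 3D Alpha Wrapping package. The alpha/offset values are placeholders;
+// in practice they are usually chosen relative to the bounding-box diagonal.
+#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
+#include <CGAL/Surface_mesh.h>
+#include <CGAL/alpha_wrap_3.h>
+
+using K     = CGAL::Exact_predicates_inexact_constructions_kernel;
+using Point = K::Point_3;
+using Mesh  = CGAL::Surface_mesh<Point>;
+
+Mesh wrap_of(const Mesh& input)
+{
+  const double alpha  = 0.02;  // size of the smallest traversable gap
+  const double offset = 0.002; // distance between the wrap and the input
+  Mesh wrap;
+  // The output is a watertight, orientable triangle mesh enclosing the input.
+  CGAL::alpha_wrap_3(input, alpha, offset, wrap);
+  return wrap;
+}
+\end{minted}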
+
+In~\cref{tab:WP1:CGAL:features}, we provide a summary of the software features relevant to the work package, which are briefly discussed.
\begin{table}[h!]
\centering
@@ -56,7 +80,7 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} unstructured mesh & provide short description here \\
+\rowcolor{white} unstructured mesh & 3D Delaunay Mesh Generation, 3D Alpha Wrapping \\
\end{tabular}
}
}
@@ -68,60 +92,127 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP1:CGAL:performances}
+%\begin{itemize}
+% \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
+% \item describe the parallel computation environment: type of architecture and super computer used.
+% \item describe the parallel capabilities of the software
+% \item \textbf{Scalability:} Describe the general scalability properties of the software
+% \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+%\end{itemize}
+
+CGAL does not yet offer a distributed-memory parallel implementation. Nevertheless, its 3D mesh generation package supports multi-core shared-memory parallelism via Intel's Threading Building Blocks (TBB) library; a minimal configuration sketch is given at the end of this subsection.
+
+The 3D Alpha Wrapping algorithm is not yet parallelized and runs on a single thread.
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+For future numerical experiments involving parallel algorithms,
+we will use a Dell Precision workstation equipped with two 26-core Intel Xeon Gold 6230R CPUs clocked at 2.10 GHz, providing a total of 104 threads, running on a 64-bit architecture with 512 GB of RAM.
+
+%\url{https://doc.cgal.org/latest/Mesh_3/index.html#Mesh_3ParallelAlgorithms}
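+The sketch below illustrates how the TBB-backed parallel refinement is selected through the concurrency tag. It follows the pattern of the public \texttt{Mesh\_3} examples; the function name \texttt{mesh\_closed\_surface} and the mesh criteria values are illustrative placeholders, not the settings used in the benchmarks reported below.
+\begin{minted}{cpp}
+// Minimal sketch: selecting the TBB-backed parallel variant of CGAL 3D mesh
+// generation. Assumes CGAL is configured with TBB and compiled with
+// -DCGAL_CONCURRENT_MESH_3; criteria values are illustrative only.
+#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
+#include <CGAL/Polyhedron_3.h>
+#include <CGAL/Polyhedral_mesh_domain_3.h>
+#include <CGAL/Mesh_triangulation_3.h>
+#include <CGAL/Mesh_complex_3_in_triangulation_3.h>
+#include <CGAL/Mesh_criteria_3.h>
+#include <CGAL/make_mesh_3.h>
+
+using K           = CGAL::Exact_predicates_inexact_constructions_kernel;
+using Polyhedron  = CGAL::Polyhedron_3<K>;
+using Mesh_domain = CGAL::Polyhedral_mesh_domain_3<Polyhedron, K>;
+
+#ifdef CGAL_CONCURRENT_MESH_3
+using Concurrency_tag = CGAL::Parallel_tag;   // shared-memory (TBB) refinement
+#else
+using Concurrency_tag = CGAL::Sequential_tag; // fallback: serial refinement
+#endif
+
+using Tr   = CGAL::Mesh_triangulation_3<Mesh_domain, CGAL::Default,
+                                        Concurrency_tag>::type;
+using C3t3 = CGAL::Mesh_complex_3_in_triangulation_3<Tr>;
+using Mesh_criteria = CGAL::Mesh_criteria_3<Tr>;
+
+C3t3 mesh_closed_surface(const Polyhedron& surface)
+{
+  using namespace CGAL::parameters;
+  Mesh_domain domain(surface); // the input polyhedron must bound a volume
+  Mesh_criteria criteria(facet_angle = 25, facet_size = 0.05,
+                         cell_radius_edge_ratio = 3);
+  return CGAL::make_mesh_3<C3t3>(domain, criteria);
+}
+\end{minted}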
\subsection{Initial Performance Metrics}
\label{sec:WP1:CGAL:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+
+
+%This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+%
+%\begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+%\end{itemize}
+
+We now provide initial performance metrics for the two relevant discretization algorithms. These benchmarks were performed as part of the release process of the corresponding CGAL packages.
+
+%\subsubsection{Benchmark \#1}
+%\begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\end{itemize}
+
+
+\subsubsection{Benchmark \#1 - 3D Alpha Wrapping serial}
+
+
+The 3D alpha wrapping does not offer parallelization at its current stage of development. We provide in~\cref{WP1::CGAL::aw3} the complexity analysis from~\cite{portaneri_alpha_2022} on a single 3D model (fan model, available on GrabCAD), using a single thread.
+The plots show that the computational time correlates well with the output mesh complexity (number of triangles).
+
+
+\begin{figure}[htb]
+ \centering
+ \includegraphics[width=0.6\textwidth]{graphics/cgal/fan.png}
+ \caption{Output complexity and time against alpha/offset parameters.
+\textbf{Top:} Input fan model, and a few output meshes shown for increasing alpha parameter. \textbf{Middle top:} Complexity of the output mesh in number of triangle
+facets. \textbf{Middle bottom:} Computational time in ms. \textbf{Bottom:} Computational time against complexity of the output mesh. Each dot corresponds to one of the above combinations of alpha/offset parameters.}
+\label{WP1::CGAL::aw3}
+\end{figure}
+
+
+
+\subsubsection{Benchmark \#2 - 3D Mesh Generation parallel}
+
+
+The speed-up charts in~\cref{WP1::CGAL::mg3parallel} are generated using the parallel version of the 3D meshing algorithm and were obtained from the 3D mesh generation reference manual \cite{alliez_3d_2024}.
+The machine used is a PC running 64-bit Windows 7 with two 6-core Intel Xeon X5660 CPUs clocked at 2.80 GHz and 32~GB of RAM.
+The program was compiled with Microsoft Visual C++ 2012, in Release mode.
+The models used for this benchmark are publicly available in the CGAL Git repository, specifically in the `demo' folder.
+
+
+\begin{figure}[htb]
+ \centering
+ \includegraphics[width=0.6\textwidth]{graphics/cgal/refinement_speedup.png}
+ \caption{Facet refinement speed-up (left) and cell refinement speed-up (right), compared to the sequential version of the algorithm.}
+ \label{WP1::CGAL::mg3parallel}
+\end{figure}
+
\subsection{12-Month Roadmap}
\label{sec:WP1:CGAL:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+%In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+%\begin{itemize}
+% \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+% \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+% \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%\end{itemize}
+
+
-In~\cref{tab:WP1:CGAL:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+Our current efforts focus on developing a feature-preserving, reduced-complexity version of the 3D Alpha Wrapping algorithm.
+The goal is two-fold: on the one hand, it ensures that the domain's features (such as corners, cusps, or sharp creases) are accurately represented in the output; on the other hand, a feature-preserving mesh reduces the complexity (number of vertices) required to enclose the input domain, thus improving performance even for the serial version of the algorithm.
+In practice, this means that the feature-preserving 3D Alpha Wrapping will complement a parsimonious variant.
+In our context, parsimony translates into minimizing the number of vertices needed to accurately represent the input domain while keeping
+the geometric discrepancy (measured as the Hausdorff distance between the output surface and the input) bounded.
+Proper identification of the domain's features during meshing will allow us to directly connect these features over planar and sufficiently
+smooth regions without generating unnecessary vertices, thereby achieving a satisfactory balance between accuracy and simplicity.
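+
+For reference, the geometric discrepancy mentioned above is the standard symmetric Hausdorff distance between the output surface $S_{\mathrm{out}}$ and the input $S_{\mathrm{in}}$,
+\[
+  d_H(S_{\mathrm{out}}, S_{\mathrm{in}}) = \max\Big\{ \sup_{x \in S_{\mathrm{out}}} \inf_{y \in S_{\mathrm{in}}} \| x - y \|,\; \sup_{y \in S_{\mathrm{in}}} \inf_{x \in S_{\mathrm{out}}} \| x - y \| \Big\},
+\]
+which the parsimonious variant keeps below a prescribed bound while minimizing the number of output vertices.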
+
+
+% cite https://hal.science/hal-03380593/file/2021216131.pdf christos-todo
+Furthermore, more scalable parallelization schemes are currently under development in CGAL by GeometryFactory and IGN (Institut National de l'Information Géographique et Forestière), based on the distributed-memory paradigm and a generic framework. Following CGAL's generic approach, we expect to be able to apply them to existing algorithms such as the 3D Alpha Wrapping.
+Nevertheless, designing a distributed version of a mesh generation algorithm is a non-trivial task that requires addressing several significant technical challenges.
+During mesh generation, the domain evolves dynamically, making it difficult to maintain a balanced decomposition with an even number of elements across different subdomains.
+As a result, a careful load-balancing strategy is essential.
+Additionally, new vertices generated by separate threads must not coincide or be positioned too closely. Maintaining conflict-free areas between different subdomains may become a significant bottleneck, especially as the number of processors increases.
+
+
+In~\cref{tab:WP1:CGAL:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the present work package.
\begin{table}[h!]
\centering
@@ -139,13 +230,30 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Well-documented examples should be provided along with our developed algorithms, as they already exist for the 3D Mesh Generation and 3D Alpha Wrapping packages of CGAL. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & The codes and data will be publicly available and open-source. \\
+\rowcolor{white} B6 - Data Management & Benchmark scripts will be provided, relying on publicly available large databases of geometric models. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Challenges regarding distributed mesh generation involve dynamic load-balancing and scalability with an increasing number of threads. \\
\end{tabular}
}
}
\caption{WP1: CGAL plan with Respect to Relevant Bottlenecks}
\label{tab:WP1:CGAL:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
+
+
+%B10 Scientific Productivity : Provide scientists with tools to use exascale systems
+%productively, including program development, application execution, input
+%preparation, output collection, and result analysis.
+
+%B11 Reproducibility and Replicability of Computation : Ensure that research results are
+%reproducible and that data and codes are provided so others can re-obtain the
+%same results.
+
+%B6 Data Management : Develop software that handles massive amounts of data,
+%addressing both offensive I/O (e.g., data analysis and compression) and defensive
+%I/O (e.g., fault tolerance).
+
+%B7 Exascale Algorithms : Redesign algorithms to improve scalability by reducing
+%communication, avoiding or hiding synchronization, and enhancing computational
+%efficiency on accelerators.
diff --git a/software/cgal/cgal.tex b/software/cgal/cgal.tex
index 19673a0..86a00bb 100644
--- a/software/cgal/cgal.tex
+++ b/software/cgal/cgal.tex
@@ -15,7 +15,7 @@ \section{Software: CGAL}
Inria\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
-Inria CA\\
+Inria center at Universit\'e C\^ote d'Azur\\
\end{tabular} \\
\rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
christos.georgiadis@inria.fr\\
@@ -43,18 +43,30 @@ \section{Software: CGAL}
\subsection{Software summary}
\label{sec:CGAL:summary}
-Detailed overview not available.
+
+CGAL (Computational Geometry Algorithms Library) \cite{the_cgal_project_cgal_2024} is an open-source software project designed to provide
+numerically reliable software components (algorithms and geometric data structures) for use in 2D, 3D, or arbitrary dimensions.
+These components include convex hulls, triangulations, Boolean operations, intersection calculations, mesh generation,
+3D point cloud processing, and more. CGAL’s main design features, along with various components, are utilized in industrial
+robotics and digital engineering simulations. For industrial applications, CGAL offers reliable, interoperable components
+that save development time by eliminating the need to reinvent the wheel, allowing users to focus on the business specializations
+that deliver the most value.
+
\subsection{Purpose}
\label{sec:CGAL:purpose}
-Purpose not available.
+%Purpose not available.
+
+The purpose of CGAL is to offer developers tools for solving complex geometric problems in the form of a C++ templated library. Both low level geometric data structures and algorithms are provided, in 2D, 3D and arbitrary dimensions.
+CGAL is used in fields like CAD, robotics and scientific computing, offering components for tasks like mesh generation, spatial searching and geometry processing. CGAL is available under a dual licensing scheme. For integration into other open-source software,
+it is provided under LGPL or GPL licenses, depending on the components. For proprietary or commercial projects,
+licenses can be purchased from GeometryFactory, with options tailored for academic, research or industrial customers.
\subsection{Programming and Computational Environment}
\label{sec::CGAL:environment_capabilities}
-
The following table summarizes these aspects for CGAL, providing a view of its programming and computational capabilities.
\begin{table}[h!]
@@ -66,17 +78,16 @@ \subsection{Programming and Computational Environment}
{\fontsize{9}{11}\selectfont
\begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
- \rowcolor{white}Languages & \begin{tabular}{l}
-C++\\
+ \rowcolor{white} Languages & \begin{tabular}{l}
+C++, header-only by default\\
\end{tabular} & Programming languages and language standards supported by the software \\
- \rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
-Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
- \rowcolor{white}Data Formats & \begin{tabular}{l}
-None\\
-\end{tabular} & Data formats that the software can handle or produce.\\
- \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
-None\\
+ \rowcolor{numpexlightergray} Parallelism & Multithread
+ & Parallel computing methods and frameworks utilized by the software. CGAL requires a compiler supporting C++17 or later, and the Intel TBB library for multithreading.\\
+ \rowcolor{white}Data Formats &
+ & Data formats that the software can handle or produce. For 3D meshes, CGAL can output Medit, VTK, Avizo and Tetgen. The input can be provided in different forms: implicit or explicit. It can read surface meshes in OBJ or OFF formats.\\
+\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
+None at the moment\\
\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
\rowcolor{white}DevOps & \begin{tabular}{l}
Continuous Integration\\
@@ -87,44 +98,71 @@ \subsection{Programming and Computational Environment}
Spack\\
Ubuntu\\
\end{tabular} & Software packaging and distribution.\\
- \rowcolor{white}Testing & \begin{tabular}{l}
-Unit\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
- \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
- \rowcolor{white}Interfaces & \begin{tabular}{l}
-None\\
+ \rowcolor{white}Testing &
+
+ & Testing methodologies employed to ensure software quality and correctness. CGAL uses a custom-tailored test suite running on 22 platforms combining Windows, Linux (Debian, Fedora, Ubuntu), macOS and different compilers (MSVC, Darwin, clang, gcc).\\
+ \rowcolor{numpexlightergray}Containerization &
+ & Container technologies used to package and deploy the software. No dedicated container images are provided at the moment; on most operating systems, package managers (apt-get on Linux, Homebrew on macOS) offer CGAL and its essential third-party dependencies.\\
+ \rowcolor{white} Interfaces & \begin{tabular}{l}
+STL, Boost, MPFR (essential)\\
\end{tabular} & List of software CGAL has interfaces with.\\
+& & Optional third-party libraries are listed at \url{https://doc.cgal.org/latest/Manual/thirdparty.html}.\\
\bottomrule
\end{tabular}
}}
- \caption{CGAL programming and computational environment}
+ \caption{CGAL programming and computational environment.}
\end{table}
\subsection{Mathematics}
\label{sec:CGAL:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+Numerical robustness is a fundamental concern in geometric computing, even more so than in other types of numerical methods, due to the dual nature of many algorithms (combinatorial and continuous).
+In geometric algorithms, slight inaccuracies in numerical computations can lead to significant errors, such as incorrect topological configurations or degeneracies that disrupt the algorithm's logic.
+The CGAL library offers a flexible and powerful solution to this problem by following the exact computation paradigm, which leverages interval arithmetic as well as multiple precision arithmetic (\url{https://www.cgal.org/exact.html}). Such a paradigm enables users to avoid rounding errors and ensures robust algorithms.
+
+CGAL leverages generic programming, enabling the use of different components through a versatile C++ templated environment.
+This approach allows algorithms and data structures to be flexible and reusable across various geometric scenarios.
+For instance, a 2D convex hull algorithm in CGAL can be applied to an arbitrary 3D plane by utilizing appropriate templated parameters. Similarly, the mesh simplification component can operate on any mesh data structure as long as a few interface functions are provided.
+Such a templated design ensures that developers can extend or customize components easily to meet specific requirements.
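+
+As a minimal illustration of both points (a sketch based on the public CGAL interface, not on code developed within Exa-MA; the function name \texttt{hull\_of} is a placeholder), the 2D convex hull algorithm can be instantiated with an exact-predicates kernel so that orientation tests are evaluated exactly while coordinates remain in double precision:
+\begin{minted}{cpp}
+// Minimal sketch: the kernel passed as template parameter decides how
+// predicates are evaluated. Exact_predicates_inexact_constructions_kernel
+// filters predicates (interval arithmetic with an exact fallback).
+#include <CGAL/Exact_predicates_inexact_constructions_kernel.h>
+#include <CGAL/convex_hull_2.h>
+#include <iterator>
+#include <vector>
+
+using K     = CGAL::Exact_predicates_inexact_constructions_kernel;
+using Point = K::Point_2;
+
+std::vector<Point> hull_of(const std::vector<Point>& pts)
+{
+  std::vector<Point> hull;
+  // Combinatorially correct even for nearly collinear input points.
+  CGAL::convex_hull_2(pts.begin(), pts.end(), std::back_inserter(hull));
+  return hull;
+}
+\end{minted}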
\subsection{Relevant Publications}
\label{sec:CGAL:publications}
-Here is a list of relevant publications related to the software:
+The following publications are relevant for mesh generation components in the CGAL library:
+
+\begin{itemize}
+\item \textit{CGALmesh} \cite{jamin_cgalmesh_2015} describes the 3D mesh generation package of CGAL, which is based on Delaunay triangulation and refinement.
+\item \textit{Alpha Wrapping with an Offset} \cite{portaneri_alpha_2022} offers a solution for the generation of watertight and orientable meshes which strictly enclose arbitrary and possibly defect-laden inputs.
+\end{itemize}
\subsection{Acknowledgements}
\label{sec::CGAL:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
+%The software has been developed with the support of the following funding agencies and institutions:
+%
+%
+%Acknowledgements not available.
+
+%\url{https://www.cgal.org/partners.html}
+CGAL was originally funded by the European Union's information technologies programme Esprit, through Project 21957 - CGAL, with the project partners Utrecht University (The Netherlands), ETH Zurich (Switzerland), Freie Universitaet Berlin (Germany), Inria Sophia-Antipolis (France), Martin-Luther-University Halle-Wittenberg (Germany), Max-Planck-Institute Saarbruecken (Germany), RISC Linz (Austria) and Tel-Aviv University (Israel).
+Since then, the CGAL open-source project has been supported by several European research programmes:
+\begin{itemize}
+\item GALIA - Project 28155 - GALIA.
+\item ECG - Project IST-2000-26473 - ECG.
+\item ACS - Project IST-006413 - ACS
+\item Aim@Shape - Project IST NoE-506766
+\item GUDHI - FP7-IDEAS-ERC 339025
+\end{itemize}
-Acknowledgements not available.
+Commercial licenses to CGAL are provided by the GeometryFactory company, a spin-off from Inria.
diff --git a/software/composyx/WP3/WP3.tex b/software/composyx/WP3/WP3.tex
index e1d5e07..ef79291 100644
--- a/software/composyx/WP3/WP3.tex
+++ b/software/composyx/WP3/WP3.tex
@@ -54,9 +54,10 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} "singular value decomposition (SVD) and eigenvalue solver" & provide short description here \\
-\rowcolor{numpexlightergray} direct solver & provide short description here \\
-\rowcolor{white} krylov solver & provide short description here \\
+\rowcolor{white} "singular value decomposition (SVD) and eigenvalue solver" & Provide randomized EVD and SVD partial decomposition \\
+\rowcolor{numpexlightergray} direct solver & provide interface to MUMPS, PaStiX and qr\_mumps \\
+\rowcolor{white} krylov solver & provide interface to Fabulous that implement various subspace methods and their block-counterpart \\
+ \bottomrule
\end{tabular}
}
}
@@ -70,11 +71,11 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item \textbf{Parallel Environment:} MPI + threads, and MPI + StarPU (not yet fully assessed) for heterogeneous manycores.
+ \item \textbf{Computation Environment:} Distributed manycores (GENCI platforms, BSC).
+ %\item describe the parallel capabilities of the software
+ \item \textbf{Scalability:} weak scalability on up to $\approx$ 20~000 cores for the solution of a linear system with $\approx 10^9$ unknowns.
+ \item \textbf{Integration with Other Systems:} No integration into other Exa-MA software yet.
\end{itemize}
@@ -84,31 +85,20 @@ \subsection{Initial Performance Metrics}
This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{Overall Performance:} weak scalability on up to $\approx$ 20~000 cores for the solution of a linear system with $\approx 10^9$ unknowns.
+ \item \textbf{Input/Output Dataset:} Not applicable.
+ \item \textbf{open-data Access:} Benchmarks rely on a matrix generator distributed on the GitLab repository of the package.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+ \item \textbf{Future Improvements:} Perform more exhaustive experiments on heterogeneous nodes, that is, using the MPI + StarPU option.
\end{itemize}
-\subsubsection{Benchmark \#1}
+\subsubsection{Benchmark \#1: heterogeneous diffusion }
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+ \item \textbf{Description:} Solution of a 3D heterogeneous diffusion equation in a cube, which enables parallel generation of the benchmark.
+ \item \textbf{Benchmarking Tools Used:} Metrics are memory consumption and elapsed time to solution.
+ \item \textbf{Input/Output Dataset Description:} The outputs are processed internally; no external dataset is required.
+ \item \textbf{Results Summary:} Speedups.
+ \item \textbf{Challenges Identified:} Scalability at extreme scale.
\end{itemize}
\subsection{12-Month Roadmap}
@@ -116,18 +106,15 @@ \subsection{12-Month Roadmap}
In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+ \item \textbf{Data Improvements:} Use other matrix generators based on some of the packages developed within Exa-MA, such as FreeFEM++ and/or Feel++.
\item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Results Retention:} We will consider publishing the performance results produced by the CI on the GitLab repository of the package.
\end{itemize}
In~\cref{tab:WP3:Composyx:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
\begin{table}[h!]
\centering
-
-
-
\centering
{
\setlength{\parindent}{0pt}
@@ -139,13 +126,14 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Guix-HPC \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Guix-HPC \\
+\rowcolor{white} B6 - Data Management & not applicable \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Tune CPU and GPU features; possibly add numerical resiliency, even though we still believe that resiliency should be addressed in a holistic fashion, as advocated in~\cite{agullo_resiliency_2022}. \\
+
\end{tabular}
}
}
\caption{WP3: Composyx plan with Respect to Relevant Bottlenecks}
\label{tab:WP3:Composyx:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/composyx/composyx.tex b/software/composyx/composyx.tex
index 40c5b0e..d36728e 100644
--- a/software/composyx/composyx.tex
+++ b/software/composyx/composyx.tex
@@ -1,8 +1,6 @@
\section{Software: Composyx}
\label{sec:Composyx:software}
-
-
\begin{table}[h!]
\centering
{ \setlength{\parindent}{0pt}
@@ -41,13 +39,20 @@ \section{Software: Composyx}
\subsection{Software summary}
\label{sec:Composyx:summary}
-Detailed overview not available.
-
+Composyx (previously Maphys++) is a C++20 linear algebra library focused on composability. Its purpose is to allow the user to express a large panel of algorithms through a high-level interface, ranging from laptop prototypes to parallel computations on many-node supercomputers.
+Currently, it mostly implements domain decomposition methods, as described in~\cite{agullo_robust_2019}, using hybrid parallel implementations (MPI+Thread, MPI+StarPU) to address heterogeneous manycores.
+\begin{figure}
+ \centering
+ \includegraphics[width=0.8\textwidth]{graphics/composyx/composyx-solverstack.png}
+ %\includegraphics[width=0.8\textwidth]{graphics/composyx/composyx-solverstack.pdf}
+ \caption{Composyx dependencies}
+ \label{fig:composyx}
+ \end{figure}
\subsection{Purpose}
\label{sec:Composyx:purpose}
-Purpose not available.
+Composyx targets the solution of large sparse linear systems using preconditioned subspace methods. For that purpose, it relies on the Fabulous package, which implements various techniques including block variants for multiple right-hand sides~\cite{giraud_block_2022}.
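+
+As a schematic illustration (standard notation, not tied to the Composyx or Fabulous interfaces), the block variants handle $p$ right-hand sides at once,
+\[
+  A X = B, \qquad X = [x_1, \dots, x_p], \quad B = [b_1, \dots, b_p],
+\]
+so that a single block Krylov subspace is built for the $p$ right-hand sides instead of $p$ independent ones, which favors matrix-matrix (BLAS-3) kernels and allows spectral information to be shared across the right-hand sides.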
\subsection{Programming and Computational Environment}
\label{sec::Composyx:environment_capabilities}
@@ -66,38 +71,22 @@ \subsection{Programming and Computational Environment}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
\rowcolor{white}Languages & \begin{tabular}{l}
C\\
-C++\\
+C++20\\
Fortran\\
\end{tabular} & Programming languages and language standards supported by the software \\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
GPU\\
MPI\\
Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
- \rowcolor{white}Data Formats & \begin{tabular}{l}
-None\\
-\end{tabular} & Data formats that the software can handle or produce.\\
- \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
-None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
- \rowcolor{white}DevOps & \begin{tabular}{l}
-Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
- \rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-GUIX-HPC\\
-\end{tabular} & Software packaging and distribution.\\
- \rowcolor{white}Testing & \begin{tabular}{l}
-Verification\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
- \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-Singularity\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
- \rowcolor{white}Interfaces & \begin{tabular}{l}
-MUMPS\\
-PaStiX\\
-Scotch\\
-qr\_mumps\\
-\end{tabular} & List of software Composyx has interfaces with.\\
+\end{tabular} & Multithreading OpenMP and Posix, hybrid CPU-GPU with StarPU.\\
+%\rowcolor{white}Data Formats & \begin{tabular}{l} None\\ \end{tabular} & Data formats that the software can handle or produce.\\
+%\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l} None\\ \end{tabular} & \\
+\rowcolor{white}DevOps & \begin{tabular}{l} Continuous Integration\\ \end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+%
+\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l} GUIX-HPC\\ \end{tabular} & Software packaging and distribution.\\
+\rowcolor{white}Testing & \begin{tabular}{l} Verification\\ \end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l} Singularity\\ \end{tabular} & Container technologies used to package and deploy the software.\\
+\rowcolor{white}Interfaces & \begin{tabular}{l} MUMPS\\ PaStiX\\ Scotch\\ qr\_mumps\\ \end{tabular} & List of software Composyx has interfaces with.\\
\bottomrule
\end{tabular}
}}
@@ -106,27 +95,39 @@ \subsection{Programming and Computational Environment}
-\subsection{Mathematics}
-\label{sec:Composyx:mathematics}
-Mathematics not available.
+%\subsection{Mathematics}
+%\label{sec:Composyx:mathematics}
+%Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+%In this section, provide a summary the mathematics used in the software.
\subsection{Relevant Publications}
\label{sec:Composyx:publications}
Here is a list of relevant publications related to the software:
-
+\begin{description}
+ \item[\fullcite{agullo_robust_2019}]
+ The solution of large sparse linear systems is one of the most time consuming kernels in many numerical simulations. The domain decomposition community has developed many efficient and robust methods in the last decades. While many of these solvers fall into the abstract Schwarz (aS) framework, their robustness was originally demonstrated on a case-by-case basis. In this paper, we propose a bound for the condition number of all deflated aS methods provided that the coarse grid consists of the assembly of local components that contain the kernel of some local operators. We show that classical results from the literature on particular instances of aS methods can be retrieved from this bound. We then show that such a coarse grid correction can be explicitly obtained algebraically via generalized eigenproblems, leading to a condition number independent of the number of domains. This result can be readily applied to retrieve or improve the bounds previously obtained via generalized eigenproblems in the particular cases of Neumann-Neumann (NN), additive Schwarz (AS), and optimized Robin, but it also generalizes them when applied with approximate local solvers. Interestingly, the proposed methodology turns out to be a comparison of the considered particular aS method with generalized versions of both NN and AS for tackling the lower and upper part of the spectrum, respectively. We furthermore show that the application of the considered grid corrections in an additive fashion is robust in the AS case although it is not robust for aS methods in general. In particular, the proposed framework allows for ensuring the robustness of the AS method applied on the Schur complement, either with deflation or additively, and with the freedom of relying on an approximate local Schur complement. Numerical experiments illustrate these statements.
+ \item[\fullcite{agullo_soft_2020}]
+ The conjugate gradient (CG) method is the most widely used iterative scheme for the solution of large sparse systems of linear equations when the matrix is symmetric positive definite. Although more than 60 years old, it is still a serious candidate for extreme-scale computations on large computing platforms. On the technological side, the continuous shrinking of transistor geometry and the increasing complexity of these devices affect dramatically their sensitivity to natural radiation and thus diminish their reliability. One of the most common effects produced by natural radiation is the single event upset which consists in a bit-flip in a memory cell producing unexpected results at the application level. Consequently, future extreme-scale computing facilities will be more prone to errors of any kind, including bit-flips, during their calculations. These numerical and technological observations are the main motivations for this work, where we first investigate through extensive numerical experiments the sensitivity of CG to bit-flips in its main computationally intensive kernels, namely the matrix-vector product and the preconditioner application. We further propose numerical criteria to detect the occurrence of such soft errors and assess their robustness through extensive numerical experiments.
+ \item[\fullcite{giraud_block_2022}]
+ We are concerned with the iterative solution of linear systems with multiple right-hand sides available one group after another with possibly slowly varying left-hand sides. For such sequences of linear systems, we first develop a new block minimum norm residual approach that combines two main ingredients. The first component exploits ideas from GCRO-DR~\cite{parks_recycling_2006}, enabling us to recycle information from one solve to the next. The second component is the numerical mechanism for managing the partial convergence of the right-hand sides, referred to as inexact breakdown detection in IB-BGMRES~\cite{robbe_exact_2006}, that enables the monitoring of the rank deficiency in the residual space basis expanded blockwise. Next, for the class of block minimum norm residual approaches that relies on a block Arnoldi-like equality between the search space and the residual space (e.g., any block GMRES or block GCRO variants), we introduce new search space expansion policies defined on novel criteria to detect the partial convergence. These novel detection criteria are tuned to the selected stopping criterion and targeted convergence threshold to best cope with the selected normwise backward error stopping criterion, enabling us to monitor the computational effort while ensuring the final accuracy of each individual solution. Numerical experiments are reported to illustrate the numerical and computational features of both the new block Krylov solvers and the new search space block expansion polices.
+\end{description}
\subsection{Acknowledgements}
\label{sec::Composyx:acknowledgements}
The software has been developed with the support of the following funding agencies and institutions:
+\begin{itemize}
+ \item Inria,
+ \item H2020 Center of Excellence EoCoE-2 and 3,
+ \item H2020 PRACE-6IP,
+ \item DGA through the Hi-Box project,
+ \item Software development was performed using the PlaFRIM experimental testbed, supported by Inria, CNRS (LABRI and IMB), Université de Bordeaux, Bordeaux INP and Conseil R\'egional d’Aquitaine (see \url{https://www.plafrim.fr}), as well as on national GENCI platforms.
+\end{itemize}
-Acknowledgements not available.
-
diff --git a/software/feelpp/WP1/WP1-contact.tex b/software/feelpp/WP1/WP1-contact.tex
new file mode 100644
index 0000000..18bc422
--- /dev/null
+++ b/software/feelpp/WP1/WP1-contact.tex
@@ -0,0 +1,207 @@
+
+
+
+\subsubsection{Benchmark \#\counter{feelppWP1benchcounter}: Contact Mechanics}
+
+\paragraph{Description}
+This benchmark simulates the dynamic unilateral contact between an elastic bouncing
+ball and a rigid horizontal wall, presented in \cite{chouly_explicit_2018}.
+The full model, combining ray-tracing, Signorini contact mechanics, and the dynamics of elastic bodies,
+is presented in \cite{van_landeghem_motion_nodate}.
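+For reference, the unilateral (Signorini) contact conditions on the candidate contact boundary $\Gamma_C$ take the standard form
+\[
+  u_n \leq g, \qquad \sigma_n \leq 0, \qquad \sigma_n\,(u_n - g) = 0,
+\]
+where $u_n$ is the normal displacement, $g$ the gap to the rigid wall, and $\sigma_n$ the normal contact stress; we refer to \cite{chouly_explicit_2018} for the precise discrete formulation used here.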
+
+
+\paragraph{Benchmarking Tools Used}
+
+The simulations are conducted on the Gaya supercomputer. The execution time of the
+following tasks is monitored:
+
+\begin{inparaenum}[\it (i)]
+ \item Mesh: loading and initialization of the non-partitioned mesh,
+ \item Data Structures: initialization of data structures,
+ \item Ray-tracing: collision detection using ray-tracing,
+ \item Assembly: construction of the dynamic algebraic system,
+ \item Solve: solving the non-linear algebraic system, and
+ \item Post process: exporting the vectorial displacement field, the scalar contact displacement and the scalar contact pressure at each time iteration.
+\end{inparaenum}
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+ \item \textbf{Input Data:} As input, we consider the same mesh for all simulations,
+ using $P_1$ Lagrange elements for the vectorial unknown displacement field. The mesh
+ characteristics, namely the number of mesh elements and the number of degrees of freedom,
+ are provided in \Cref{tab:feelpp:mesh:contact}. Additionally, the input data includes the configuration files necessary to run the simulations.
+ \item \textbf{Output Data:} The output dataset includes the time evolution of
+ the displacement field of the elastic body, as well as the time evolution of
+ the contact displacement and pressure. In addition, the execution times for
+ the different tasks are stored.
+\end{itemize}
+
+
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties} & \multicolumn{1}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \hline
+ \rowcolor{numpexgray} {\color{white}\bf $h_\text{min}$} & {\color{white}\bf $h_\text{max}$} & {\color{white}\bf \# elements} & {\color{white}\bf $\vct{u}$} \\
+ \pgfmathprintnumber{0.19269262925729186} & \pgfmathprintnumber{0.46595156749445504} & \pgfmathprintnumber{21675942} & \pgfmathprintnumber{9208203} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{Characteristics of the mesh and the number of degrees of freedom for the vectorial displacement field $\vct{u}$ with $P_1$ discretization.}%
+ \label{tab:feelpp:mesh:contact}
+\end{table}
+
+\paragraph{Results Summary}
+
+The results for the computational time and relative computational time for the
+different tasks and varying numbers of processors are presented in~\Cref{fig:feelpp:wp1:contact:time}
+and~\Cref{fig:feelpp:wp1:contact:time-rel}. The bars show the results using $P_1$ Lagrange
+elements for the unknown displacement field.
+
+
+We observe that the resolution of the dynamic system accounts for the majority of
+the computational time, and its relative share increases with the number of cores.
+%The communication between nodes and synchronization points become predominant as the number of cores increases.
+As the number of cores increases, the absolute execution time of
+data structure initialization, ray-tracing, assembly, and post-processing decreases.
+The mesh loading time remains constant, as the mesh is not partitioned on input.
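+In the relative plots, the share of a task $k$ is computed as
+\[
+  t^{\mathrm{rel}}_k = 100\,\frac{t_k}{\sum_{j} t_j} \ [\%],
+\]
+where the sum runs over the monitored tasks (mesh, data structures, ray-tracing, assembly, solve, post-processing).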
+
+
+\pgfplotstableread{\currfiledir/data/contact-time.dat}\dataContact
+
+\begin{figure}
+ \centering
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Computational time [s]},
+ xtick={0,1,2,3,4}, xticklabels={32,64,128,256,384},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true, ymin=0,
+ bar width=7pt, ybar stacked,
+ %ymode=log,
+ % title={Computational time for the 3D case},
+ ]
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y] table [x=x, y=mesh] {\dataContact};
+ % \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y] table [x=x, y=data] {\dataContact};
+ % \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=black, draw=black, point meta=y] table [x=x, y expr={300*\thisrow{raytracing}} ] {\dataContact};
+ % \addlegendentry{Ray-tracing}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y] table [x=x, y=assembly ] {\dataContact};
+ % \addlegendentry{Assembly}
+
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y] table [x=x, y=solve] {\dataContact};
+ % \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y] table [x=x, y=postprocess] {\dataContact};
+ % \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Computational time for the contact mechanics testcase.}
+ \label{fig:feelpp:wp1:contact:time}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Relative computational time [\%]},
+ xtick={0,1,2,3,4}, xticklabels={32,64,128,256,384},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymin=0, ymax=100,
+ % title={Relative computational time for the 3D case},
+ ]
+
+ % Compute the relative time for each component by dividing by the total time
+ % using the correct column names from the initial plot
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{mesh}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{data}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=black, draw=black, point meta=y]
+ table [x=x, y expr={3000*\thisrow{raytracing}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Ray-tracing}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{assembly}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{solve}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{postprocess}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+
+ \caption{Relative computational time for the contact mechanics testcase.}
+ \label{fig:feelpp:wp1:contact:time-rel}
+\end{subfigure}
+\caption{Absolute (\Cref{fig:feelpp:wp1:contact:time}) and relative (\Cref{fig:feelpp:wp1:contact:time-rel}) computational time for the various tasks using $P_1$ Lagrange elements for the vectorial displacement field.}
+\label{fig:feelpp:wp1:contactbenchmark:time}
+\end{figure}
+
+
+
+\subsection{12-Month Roadmap}
+\label{sec:WP1:Feelpp:roadmap}
+
+For the next 12 months, we plan to focus on the following aspects of the benchmarking process:
+\begin{description}
+ \item[Data Improvements] Unify the input and output data format and structure to facilitate comparison and analysis. In particular, we wish to design a dataset architecture that holds all the information necessary for the benchmarking process, as well as reference output results when available. We have started an effort to continuously benchmark our software using ReFrame and to collect the information in a database, with a visualization tool available through a website.
+ \item[Methodology Application] Several aspects will be developed:
+ \begin{itemize}
+ \item include HdG and high-order methods in the current benchmarks; there was not enough time to include them in the current results.
+ \item task-based parallelism using the runtime environment \ac{specx}:
+ \begin{itemize}
+ \item add multithreading support in various steps of the computational pipeline.
+ \item enable distribution of some of the workload on GPU (e.g.\ the Ray Tracing, Assembly and Solve steps)
+ \end{itemize}
+ \item investigate the use of Kokkos to define portable and performant kernels.
+ \item improve ray-tracing parallel performance at large scale.
+ \item improve I/O performance at large scale, leveraging the HDF5 data format and the parallel I/O capabilities of the library.
+ \item improve partitioning and load balancing of the mesh at large scale.
+ \item improve parallel mesh adaptation at large scale using ParMMG.
+ \end{itemize}
+ \item[Results Retention] We use two data management platforms, Girder and Zenodo, to store the data and results of the benchmarks, following the methodology in~\cref{sec:methodology-intro}.
+\end{description}
+
+In~\Cref{tab:WP1:Feelpp:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
+
+\begin{table}[!ht]
+ \centering
+
+ {
+ \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {
+ \fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & Containerization and packaging are enabled as well as \ac{CI}/\ac{CD}. \ac{CB} will be enabled very soon.\\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Building upon data management and scientific productivity improvements to enable reproducibility. \\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & Improve I/O using HDF5 and MPI I/O, and possibly use frameworks from \ac{PC3}.\\
+\rowcolor{numpexlightergray} B6 - Data Management & Dataset creation and management are being improved to satisfy the methodology in~\cref{sec:methodology-intro}. Girder (\cref{sec:arch:girder:unistra}) and Zenodo (\cref{sec:arch:zenodo}) will be used to store our datasets and enable the FAIR principles.\\
+\rowcolor{white} B7 - Exascale Algorithms & Enable ray tracing, cG, HdG and spectral element methods on GPU; enable new partitioning strategies and load balancing; enable advanced profiling using Score-P and EzTrace; improve memory management; enable parallel-in-time strategies. \\
+\hline
+\end{tabular}
+ }
+ }
+ \caption{WP1: \Feelpp plan with Respect to Relevant Bottlenecks}
+ \label{tab:WP1:Feelpp:bottlenecks}
+\end{table}
diff --git a/software/feelpp/WP1/WP1-distance.tex b/software/feelpp/WP1/WP1-distance.tex
new file mode 100644
index 0000000..cdb43b3
--- /dev/null
+++ b/software/feelpp/WP1/WP1-distance.tex
@@ -0,0 +1,58 @@
+
+\subsubsection{Benchmark \#1: Compute Distance Function}
+
+\paragraph{Description}
+This benchmark evaluates two methods for computing the distance function inside a three-dimensional box:
+\begin{enumerate}
+ \item The \textbf{Level Set} method using the \textbf{Fast Marching Algorithm (FMA)}.
+ \item The \textbf{Ray Tracing} method.
+\end{enumerate}
+The objective is to compute the distance function at all vertices of a discretized box using both methods and verify whether they produce the same results.
+The problem is discretized using an unstructured grid, and performance is assessed on a multi-core CPU architecture.
+
+The benchmark aims to compare the efficiency, accuracy, and computational cost of both approaches in terms of distance calculation within the 3D domain.
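+As a brief mathematical reminder (generic formulation, not specific to the \Feelpp implementation), the distance function $\phi$ to a surface $\Gamma$ satisfies the eikonal equation
+\[
+  |\nabla \phi| = 1 \ \text{in } \Omega, \qquad \phi = 0 \ \text{on } \Gamma,
+\]
+which the Fast Marching Algorithm approximates on the mesh, while the ray-tracing approach evaluates the distance to the boundary geometrically at each vertex.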
+
+\paragraph{Benchmarking Tools Used}
+The following tools were used for performance profiling and analysis:
+\begin{itemize}
+\item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time.
+\end{itemize}
+
+The key metrics measured include execution time, accuracy, memory usage, and floating-point operations (FLOPS) for both methods.
+
+\paragraph{Input/Output Dataset Description}
+\begin{itemize}
+ \item \textbf{Input Data:} The input consists of a 3D uniform grid representing the box geometry, with approximately 1 million vertices. The level set function and ray tracing boundaries are initialized for the distance computation. The input data is stored in JSON format, and it can be accessed via DOI: \texttt{[Insert DOI]}.
+
+ \item \textbf{Output Data:} The output includes the computed distance function values at all vertices for both methods, stored in CSV format. Additionally, runtime performance logs and accuracy comparisons between the methods are included.
+
+ \item \textbf{Data Repository:} Input and output datasets, along with performance logs, are stored in a Zenodo repository and can be accessed via DOI: \texttt{[Insert DOI]}.
+\end{itemize}
+
+\paragraph{Results Summary}
+The performance comparison between the two methods is summarized as follows:
+
+RESULTS here.
+
+\paragraph{Challenges Identified}
+The following challenges were encountered during the benchmarking process:
+\begin{itemize}
+ \item \textbf{Ray Tracing Bottlenecks:}
+ \item \textbf{Parallelization Issues:}
+ \item \textbf{Memory Usage:}
+\end{itemize}
+
+Final analysis and perspectives here.
+
diff --git a/software/feelpp/WP1/WP1-heatfluid.tex b/software/feelpp/WP1/WP1-heatfluid.tex
new file mode 100644
index 0000000..d4101a1
--- /dev/null
+++ b/software/feelpp/WP1/WP1-heatfluid.tex
@@ -0,0 +1,300 @@
+
+
+
+
+\subsubsection{Benchmark \#\counter{feelppWP1benchcounter}: HeatFluid Coupling}
+\label{sec:WP1:Feelpp:benchmark4}
+
+\newcommand{\vct}[1]{\vec{#1}}
+\newcommand{\mat}[1]{\underline{\underline{#1}}}
+
+
+% \emph{enlever tous les détails, et laisser les références, expliciter la liste de maillage et la machine où est faite le bench, et la mise en donnée (paramètrisation)}
+
+% \emph{dans wp3: parler du préconditionner}
+
+
+\paragraph{Description}
+This benchmark models the steady aqueous humor (AH) flow in the posterior and anterior chambers of the human eyeball, coupled with the overall heat transfer, adapted from~\cite{ooi_simulation_2008,kilgour_operator_2021}.
+The full model description is available in~\cite{saigre_coupled_2024_abstract}.
+It is run with the \texttt{heatfluid} toolbox of \Feelpp.
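+Schematically, the coupled problem is of Boussinesq type: steady incompressible Navier--Stokes equations with a temperature-dependent buoyancy force, coupled to the advection--diffusion of the temperature,
+\[
+  \rho\,(\vct{u}\cdot\nabla)\vct{u} - \nabla\cdot\mat{\sigma}(\vct{u},p) = \rho\bigl(1-\beta(T-T_{\mathrm{ref}})\bigr)\vct{g}, \qquad \nabla\cdot\vct{u} = 0,
+\]
+\[
+  \rho C_p\,\vct{u}\cdot\nabla T - \nabla\cdot(k\nabla T) = 0;
+\]
+we refer to~\cite{saigre_coupled_2024_abstract} for the precise model, parameters, and boundary conditions.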
+
+
+\paragraph{Benchmarking Tools Used}
+
+The following tools were used for performance profiling and analysis:
+\begin{itemize}
+ \item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time.
+ \item \textbf{Gaya}: the benchmark was performed on the Gaya supercomputer (see \Cref{sec:arch:gaya}).
+\end{itemize}
+
+The metrics measured are the execution time to:
+\begin{inparaenum}[\it (i)]
+ \item load and initialize the mesh, which is already partitioned on disk,
+ \item initialize the data structures,
+ \item assemble the algebraic objects of the linear system,
+ \item solve the non-linear algebraic system, and
+ \item export the results.
+\end{inparaenum}
+
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+ \item \textbf{Input Data:} The input dataset consists of a family of 3D tetrahedral meshes generated through the process described in~\cite{chabannes_3d_2024}, and denoted \texttt{Mr0} to \texttt{Mr6}, with an increasing number of elements.
+ \Cref{tab:feelpp:wp1:coupled:mesh} presents the characteristics of these meshes.
+ The input data also provides the configuration files necessary to run the simulations.
+ \item \textbf{Output Data:} The output includes the computed temperature, velocity, and pressure fields for each mesh, stored in HDF5 format, as well as the time taken to perform each step of the simulation.
+ \item \textbf{Data Repository:} All input and output datasets are available in a Zenodo repository \cite{saigre_mesh_2024}, accessible through DOI: \href{https://doi.org/10.5281/ZENODO.13886143}{10.5281/ZENODO.13886143}.
+\end{itemize}
+
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ \multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties} & \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \hline
+ \rowcolor{numpexgray}{\color{white}\bf Tag} & {\color{white}\bf $h_\text{min}$} & {\color{white}\bf $h_\text{max}$} & {\color{white}\bf $h_\text{mean}$} & {\color{white}\bf \# elements} & {\color{white}\bf $T$} & {\color{white}\bf $\vct{u}$} & {\color{white}\bf $p$} \\
+ \texttt{Mr0} & \pgfmathprintnumber{1.247583e-04} & \pgfmathprintnumber{3.997611e-03} & \pgfmathprintnumber{9.227331e-04} & \pgfmathprintnumber{191939} & \pgfmathprintnumber{37470} & \pgfmathprintnumber{84966} & \pgfmathprintnumber{4615} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr1} & \pgfmathprintnumber{1.367312e-04} & \pgfmathprintnumber{3.634717e-03} & \pgfmathprintnumber{7.717604e-04} & \pgfmathprintnumber{282030} & \pgfmathprintnumber{51753} & \pgfmathprintnumber{116709} & \pgfmathprintnumber{6155} \\
+ \texttt{Mr2} & \pgfmathprintnumber{6.539683e-05} & \pgfmathprintnumber{1.599067e-03} & \pgfmathprintnumber{4.668270e-04} & \pgfmathprintnumber{746664} & \pgfmathprintnumber{131327} & \pgfmathprintnumber{589992} & \pgfmathprintnumber{28548} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr3} & \pgfmathprintnumber{3.294835e-05} & \pgfmathprintnumber{9.592658e-04} & \pgfmathprintnumber{4.166619e-04} & \pgfmathprintnumber{1403433} & \pgfmathprintnumber{241831} & \pgfmathprintnumber{707532} & \pgfmathprintnumber{34304} \\
+ \texttt{Mr4} & \pgfmathprintnumber{2.549458e-05} & \pgfmathprintnumber{5.293352e-04} & \pgfmathprintnumber{2.883913e-04} & \pgfmathprintnumber{6038645} & \pgfmathprintnumber{1027375} & \pgfmathprintnumber{1024008} & \pgfmathprintnumber{48534} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr5} & \pgfmathprintnumber{3.120124e-05} & \pgfmathprintnumber{1.501561e-04} & \pgfmathprintnumber{2.772105e-04} & \pgfmathprintnumber{43893359} & \pgfmathprintnumber{7374833} & \pgfmathprintnumber{4616967} & \pgfmathprintnumber{205342} \\
+ \texttt{Mr6} & \pgfmathprintnumber{2.820610e-05} & \pgfmathprintnumber{9.940551e-07} & \pgfmathprintnumber{1.835537e-04} & \pgfmathprintnumber{150630096} & \pgfmathprintnumber{25200452} & \pgfmathprintnumber{14671089} & \pgfmathprintnumber{636943} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{Characteristics of meshes used for the convergence study and number of degrees of freedom for temperature $T$, velocity $\vct{u}$, and pressure fields $p$, with the discretization $P_1\text{--}P_2P_1$.}%
+ \label{tab:feelpp:wp1:coupled:mesh}
+\end{table}
+
+
+\paragraph{Results Summary}
+
+The results of the benchmark are summarized in~\Cref{fig:feelpp:wp1:coupled:time} and~\Cref{fig:feelpp:wp1:coupled:time-rel},
+showing the computational time and relative computational time for each component of the simulation, respectively.
+The results are presented for the three largest meshes of the family, namely \texttt{Mr4}, \texttt{Mr5}, and \texttt{Mr6}.
+Note that for \texttt{Mr6}, the simulation did not complete on 1 node (128 cores) due to memory limitations.
+
+We observe that the resolution of the non-linear algebraic system is the most time-consuming part of the simulation, followed by the assembly of the linear system.
+Moreover, even though the relative time distribution remains globally similar as the number of cores increases, we note a decrease in the absolute time of the various components of the simulation, except for the post-processing part, which involves writing the results to disk.
+The assembly time remains significant compared to other parts of the simulation, with a noticeable increase in the time spent solving the non-linear system, which forms the largest portion of the computation.
+As the number of cores increases, we also observe an increase in the time dedicated to I/O operations, particularly in the post-processing phase, due to the larger volumes of data being written to disk.
+
+\iffalse
+\pgfplotstableread{\currfiledir/data/heatfluid-time-data.dat}\data
+\begin{figure}
+ \centering
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Computational time [s]},
+ xtick={0,1,2,3,4,5,6,7,8,9,10}, xticklabels={1,2,4,8,16,32,64,128,256,512,640},
+ legend style={at={(0.5,-0.15)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymode=log,
+ % title={Computational time for the 3D case},
+ ]
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y] table [x=x, y=initMesh] {\data};
+ \addlegendentry{Mesh}
+
+
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\data};
+ \addlegendentry{Data Structures}
+
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y] table [x=x, y=algebraic-nlsolve] {\data};
+ \addlegendentry{Solve}
+
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\data};
+ \addlegendentry{Assembly}
+
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y
+ ] table [x=x, y=exportResults] {\data};
+ \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Computational time for the coupled heat-fluid testcase, performed on Gaya with the mesh \texttt{Mr4}.}
+\end{figure}
+
+\begin{figure}
+ \centering
+
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Relative computational time [\%]},
+ xtick={0,1,2,3,4,5,6,7,8,9,10}, xticklabels={1,2,4,8,16,32,64,128,256,512,640},
+ legend style={at={(0.5,-0.15)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymin=0, ymax=100,
+ % title={Relative computational time for the 3D case},
+ ]
+
+ % Compute the relative time for each component by dividing by the total time
+ % using the correct column names from the initial plot
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\data};
+ \addlegendentry{Mesh}
+
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\data};
+ \addlegendentry{Data Structures}
+
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\data};
+ \addlegendentry{Solve}
+
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\data};
+ \addlegendentry{Assembly}
+
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\data};
+ \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+
+ \caption{Relative time spent in each component of the computation for the coupled heat-fluid testcase, performed on Gaya with the mesh \texttt{Mr4}.}
+\end{figure}
+\fi
+
+
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M4.dat}\dataMQuatre
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M5.dat}\dataMCinq
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M6.dat}\dataMSix
+
+\begin{figure}
+ \centering
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Computational time [s]},
+ xtick={0,1,2,3,4,5}, xticklabels={128,256,384,512,640,768},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true, ymin=0,
+ bar width=7pt, ybar stacked,
+ %ymode=log,
+ % title={Computational time for the 3D case},
+ ]
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y] table [x=x, y=initMesh] {\dataMQuatre};
+ % \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMQuatre};
+ % \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMQuatre};
+ % \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y] table [x=x, y=algebraic-nlsolve] {\dataMQuatre};
+ % \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y] table [x=x, y=exportResults] {\dataMQuatre};
+ % \addlegendentry{Post process}
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y, forget plot] table [x=x, y=initMesh] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y, forget plot] table [x=x, y=algebraic-nlsolve] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y, forget plot] table [x=x, y=exportResults] {\dataMCinq};
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y, forget plot] table [x=x, y=initMesh] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y, forget plot] table [x=x, y=algebraic-nlsolve] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y, forget plot] table [x=x, y=exportResults] {\dataMSix};
+
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Computational time for the coupled heat-fluid testcase.}
+ \label{fig:feelpp:wp1:coupled:time}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Relative computational time [\%]},
+ xtick={0,1,2,3,4,5}, xticklabels={128,256,384,512,640,768},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymin=0, ymax=100,
+ % title={Relative computational time for the 3D case},
+ ]
+
+ % Compute the relative time for each component by dividing by the total time
+ % using the correct column names from the initial plot
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Post process}
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \end{axis}
+ \end{tikzpicture}
+
+ \caption{Relative time spent in each component of the computation for the coupled heat-fluid testcase.}
+ \label{fig:feelpp:wp1:coupled:time-rel}
+\end{subfigure}
+\caption{Absolute (\Cref{fig:feelpp:wp1:coupled:time}) and relative (\Cref{fig:feelpp:wp1:coupled:time-rel}) computational time for the coupled heat-fluid testcase, performed on Gaya with the meshes \texttt{Mr4} (left), \texttt{Mr5} (middle), and \texttt{Mr6} (right).}
+\label{fig:feelpp:wp1:heatfluid:time}
+\end{figure}
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process: \textbf{??}
+\begin{itemize}
+ \item \textbf{Memory Usage:}
+ \item \textbf{Parallelization Inefficiencies:}
+ \item \textbf{Cache and Memory Bottlenecks:}
+\end{itemize}
+
diff --git a/software/feelpp/WP1/WP1-lap-elas.tex b/software/feelpp/WP1/WP1-lap-elas.tex
new file mode 100644
index 0000000..2a1327f
--- /dev/null
+++ b/software/feelpp/WP1/WP1-lap-elas.tex
@@ -0,0 +1,47 @@
+
+\subsubsection{Benchmark \#2: Assemble Stiffness and Linear Elasticity Matrix}
+
+\paragraph{Description}
+This benchmark evaluates the assembly of stiffness and linear elasticity finite element matrices in three dimensions using both continuous Galerkin (cG) and hybrid discontinuous Galerkin (hdG) methods with the \Feelpp toolboxes.
+The problem size consists of a mesh with tetrahedral elements, executed entirely on multi-core CPU architectures.
+The benchmark is designed to measure scalability, execution time, and computational efficiency across different material models, including isotropic and anisotropic materials.
+
+The objective of the benchmark is to assess performance in terms of assembly time, memory usage, and parallel efficiency for cG and hdG methods from low to high orders using CPU resources only.
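+For reference, the assembled bilinear forms are, in their continuous Galerkin form (the hdG variants assemble the corresponding local and trace-unknown contributions),
+\[
+  a_{\mathrm{stiff}}(u,v) = \int_\Omega \nabla u \cdot \nabla v \, dx,
+  \qquad
+  a_{\mathrm{elas}}(\mathbf{u},\mathbf{v}) = \int_\Omega \sigma(\mathbf{u}) : \varepsilon(\mathbf{v}) \, dx,
+\]
+with $\varepsilon(\mathbf{u}) = \frac{1}{2}\bigl(\nabla \mathbf{u} + \nabla \mathbf{u}^T\bigr)$ and $\sigma$ given by the (isotropic or anisotropic) constitutive law.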
+
+\paragraph{Benchmarking Tools Used}
+The following performance analysis tools were used:
+\begin{itemize}
+ \item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time and memory usage during the matrix assembly.
+\end{itemize}
+
+Metrics such as execution time, memory usage, and FLOPS were measured to compare the performance of the cG and hdG methods on CPU.
+
+\paragraph{Input/Output Dataset Description}
+\begin{itemize}
+ \item \textbf{Input Data:} The input dataset consists of a 3D tetrahedral mesh in Gmsh format, with approximately 1 million elements. Material properties are defined in JSON format, covering both isotropic and anisotropic materials.
+
+ \item \textbf{Output Data:} The output includes performance logs, execution times, and memory usage reports. Output results are replicable by using the same mesh and material properties.
+
+ \item \textbf{Data Repository:} All input and output datasets are available in a Zenodo repository, accessible through DOI: \texttt{[Insert DOI]}.
+\end{itemize}
+
+
+\paragraph{Results Summary}
+The benchmark results are summarized as follows:
+
+RESULTS here
+
+The results highlight that ... (ADD ANALYSIS)
+
+
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process:
+\begin{itemize}
+ \item \textbf{Memory Usage:}
+ \item \textbf{Parallelization Inefficiencies:}
+ \item \textbf{Cache and Memory Bottlenecks:}
+\end{itemize}
+
+add extra analysis and conclusion here.
diff --git a/software/feelpp/WP1/WP1-thermalbridge.tex b/software/feelpp/WP1/WP1-thermalbridge.tex
new file mode 100644
index 0000000..f6835ba
--- /dev/null
+++ b/software/feelpp/WP1/WP1-thermalbridge.tex
@@ -0,0 +1,474 @@
+\subsubsection{Benchmark \#\counter{feelppWP1benchcounter}: Elliptic Linear PDE: Thermal Bridges}
+\label{sec:WP1:Feelpp:benchmark:thermal_bridges}
+\paragraph{Description:} %Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+
+The benchmark known as ``thermal bridges'' is an example of an application that
+enables us to validate numerical simulation tools using \Feelpp. We have
+developed tests based on the ISO 10211:2017 standard
+\fullcite{noauthor_iso_2017}, which provides methodologies for evaluating
+thermal bridges in building construction.
+
+Thermal bridges are areas within a building envelope where heat
+flow is different compared to adjacent areas, often resulting in increased heat
+loss or unwanted condensation.
+The standard is intended to ensure that thermal
+bridge simulations are computed accurately. It provides reference values (and tolerances) for the
+temperature and heat flux at several locations of the geometry.
+
+At the mathematical level, this application requires finding the numerical
+solution of an elliptic linear PDE (i.e.\ the heat equation). We employ a
+finite element method based on continuous Lagrange finite elements of order 1, 2,
+and 3 (denoted by $P_1$, $P_2$, $P_3$), and we analyze the execution time of the
+main components of the simulation.
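+For reference, the steady heat equation solved here reads
+\[
+  -\nabla\cdot\bigl(k\,\nabla T\bigr) = 0 \ \text{in } \Omega,
+\]
+with the thermal conductivity $k$ piecewise constant over the materials and boundary conditions prescribed by the ISO 10211:2017 test cases.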
+
+\Cref{fig:wp1:feelpp:thermal_bridges:visualization} illustrates the geometry
+used, the 3D temperature field solution, and an example of mesh partitioning.
+
+
+\begin{figure}[h]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-solution.png}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-pid.png}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - temperature solution (left) and
+ partitioning example (right)}
+ \label{fig:wp1:feelpp:thermal_bridges:visualization}
+\end{figure}
+
+
+
+\paragraph{Benchmarking Tools Used}
+
+The benchmark was performed on the \textbf{Discoverer} supercomputer (see
+\Cref{sec:arch:eurohpc-ju}).
+The performance tools integrated into the \Feelpp-toolboxes framework were used to measure
+the execution time.
+Moreover, note that we used Apptainer with a \Feelpp SIF image based on the Ubuntu Noble OS.
+
+The metrics measured are the execution times of the main components of the simulation, namely:
+\begin{itemize}
+\item \textbf{Init}: load the mesh from the filesystem and initialize the heat toolbox (finite element context and algebraic data structures),
+\item \textbf{Assembly}: compute and assemble the matrix and right-hand side entries obtained with the finite element method,
+\item \textbf{Solve}: solve the linear system using a preconditioned GMRES,
+\item \textbf{PostProcess}: compute the validation measures (temperature at points and
+ heat flux) and export a visualization of the solution (EnsightGold format) to
+ the filesystem.
+\end{itemize}
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+\item \textbf{Input Data:}
+ \begin{itemize}
+ \item Meshes: We have generated three mesh levels, called M1, M2
+ and M3, stored in Gmsh format. Their statistics can be found in
+ \Cref{tab:wp1:feelpp:thermal_bridges:discr_stat}. We have also prepared for
+ each mesh level a collection of partitioned meshes,
+ stored in an in-house \Feelpp mesh format based on
+ JSON+HDF5 files.
+ The Gmsh meshes and the partitioned meshes can be found on our Girder
+ data management platform, in the \Feelpp collections.
+ \item Setup: We use the standard setup of the \Feelpp toolboxes, which corresponds to a cfg
+ file and a JSON file. These configuration files are available in the \Feelpp GitHub repository.
+ \item SIF image: feelpp:v0.111.0-preview.10-noble-sif (stored in the GitHub registry of \Feelpp)
+ \end{itemize}
+\item \textbf{Output Data:} The output includes the computed values of
+ validation measure in CSV files format, export visualization files (mesh, partitioning, temperature), and the time taken to perform each simulation step.
+\end{itemize}
+
+
+
+\begin{table}[h!]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}\multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties}
+ & \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \rowcolor{numpexgray} {\color{white}\bf Tag} & {\color{white}\bf \# points} & {\color{white}\bf \# edges} & {\color{white}\bf \# faces} & {\color{white}\bf \# elements} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} & {\color{white}\bf $P_3$} \\
+ \texttt{M1} & \pgfmathprintnumber{193654} & \pgfmathprintnumber{1299920} & \pgfmathprintnumber{2164759} & \pgfmathprintnumber{1058492} & \pgfmathprintnumber{193654} & \pgfmathprintnumber{1493574} & \pgfmathprintnumber{4958253} \\
+ \texttt{M2} & \pgfmathprintnumber{1401135} & \pgfmathprintnumber{9778744} & \pgfmathprintnumber{16566803} & \pgfmathprintnumber{8189193} & \pgfmathprintnumber{1401135} & \pgfmathprintnumber{11179879} & \pgfmathprintnumber{37525426} \\
+ \texttt{M3} & \pgfmathprintnumber{10572256} & \pgfmathprintnumber{75307308} & \pgfmathprintnumber{128722252} & \pgfmathprintnumber{63987199} & \pgfmathprintnumber{10572256} & \pgfmathprintnumber{85879564} & \pgfmathprintnumber{289909124} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{Thermal bridges benchmarks - Statistics on meshes and number of degrees of freedom with respect
+ to finite element approximation}
+ \label{tab:wp1:feelpp:thermal_bridges:discr_stat}
+\end{table}
+
+
+\paragraph{Results Summary}
+The benchmark results are summarized in
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M1},
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M2},
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M3} which correspond respectively to
+choice of the mesh M1, M2 and M3. Moreover, for each mesh, we have experimented with several
+finite element discretizations called $P_1$, $P_2$ and $P_3$.
+For each order of finite element approximation, we have selected a set of number
+of CPU cores.
+
+Firstly, we can clearly see that the Solve part is the most time-consuming; see the
+comments in \Cref{sec:WP3:Feelpp:benchmark:thermal_bridges}.
+Concerning the mesh M1, which is a coarse mesh, we note that the
+scalability is not good, especially at low order. This is simply because the problem is too small for so many HPC resources: MPI
+communication and I/O effects are non-negligible.
+For the mesh M2, results are better (but not ideal) up to around one thousand processing cores.
+Finally, the finest mesh, M3, illustrates the best scalability over this range of
+task counts. Except for the solve part, increasing the HPC resources remains efficient
+at this level.
+
+Each run illustrated here is also validated against the reference values of the ISO 10211:2017 standard.
+With these experiments, we have also observed some variability in the
+performance measures. Some aspects, such as the filesystem and network load, are not
+under our control, which can explain part of it. Also, the memory usage, and
+consequently the choice of the number of tasks per node, can be important and
+can change the performance significantly. The latter will be taken into account
+more carefully in the next campaign of these benchmarking tests.
+
+\newcommand{\barChart}[2][ybar]{
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ xlabel={Number of CPU core}, ylabel={Execution time [s]},
+ %xticklabels from table={#2}{nProc},
+ xtick=data,
+ xtick align=outside,
+ ymin=0,
+ %legend style={at={(1,1)}, anchor=north east},
+ %legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt,
+ #1
+ ]
+ \foreach [expand list=true] \thetuple in {#2} {
+ \pgfkeys{/mysettings/.cd,
+ table/.store in=\mytable,
+ column/.store in=\mycolumn,
+ shift/.store in=\myshift, shift/.default=0, shift,
+ legend/.store in=\mylegend,
+ color/.store in=\mycolor
+ }
+ \edef\temp{
+ \noexpand\pgfkeys{/mysettings/.cd, \expandafter\@firstofone\thetuple}
+ } \temp
+ %\def\toto{\expandafter\mytable}
+ \edef\temp{
+ \noexpand\addplot[ybar, bar width=0.2, fill=\mycolor, draw=black, point meta=y]
+ table [x expr=\noexpand\coordindex+\myshift, y=\mycolumn ] {\expandafter\noexpand\csname \mytable\endcsname};
+ %table [x=nProc, y=\mycolumn ] {\expandafter\noexpand\csname \mytable\endcsname};
+ } \temp
+ %table [x expr=\noexpand\coordindex, y=\mycolumn ] {#2};
+ \edef\temp{
+ \noexpand\addlegendentry{\mylegend}
+ } \temp
+ }
+ \end{axis}
+\end{tikzpicture}
+}
+
+
+\foreach [expand list=true] \meshId in {1,2,3} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P1_discoverer.csv}\dataPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P2_discoverer.csv}\dataPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P3_discoverer.csv}\dataPc
+
+ \begin{figure}
+ \centering
+ \def\plotSetup##1{
+ {table=##1,column=init,legend=Init,color=customdarkblue},
+ {table=##1,legend=Assembly,column=algebraic-assembly,color=customcyan},
+ {table=##1,legend=Solve,column=algebraic-solve,color=customorange},
+ {table=##1,legend=PostProcess,column=exportResults,color=custompurple}
+ }
+ \def\chartBarPlot##1##2{
+ \barChart[ybar,
+ xticklabels from table={##2}{nProc},
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+ \def\chartBarStackedPlot##1##2{
+ \barChart[ybar stacked,
+ xticklabels from table={##2}{nProc},
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPc}{\dataPc}
+ \caption{\texttt{M\meshId} - \texttt{$P_3$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPc}{\dataPc}
+ \caption{\texttt{M\meshId} - \texttt{$P_3$}}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - Execution time of main simulation components
+ - Mesh \texttt{M\meshId} - Discoverer supercomputer}
+ \label{fig:feelpp:wp1:thermal_bridges:performance_times_M\meshId}
+ \end{figure}
+
+}
+
+
+
+
+% columns/FunctionSpace/.style={string type},
+\begin{figure}
+ \centering
+ \captionsetup[subfigure]{justification=centering}
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/measures_all.csv}\dataTableMeasures
+ \def\myLineWidth{2pt}
+ \def\myLineStyleA{loosely dashdotdotted} %dashdotdotted
+ \def\myLineStyleB{dashed}
+ \def\myLineStyleC{solid}
+
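+  % Usage: \myAddPlot{<y column>}{<poly order>}{<plot style>}{<y scale>}.
+  % Plots column <y column> of \dataTableMeasures against the Mesh column, keeping only the rows
+  % whose PolyOrder equals <poly order> and scaling the y values by <y scale>.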
+ \def\myAddPlot#1#2#3#4{
+ \addplot[#3,every mark/.append style={solid},
+ % x filter/.expression={(\thisrow{PolyOrder} == 1 ? \pgfmathparse{\thisrow{Mesh}}\pgfmathresult :NaN)},
+ % x filter/.expression={ \thisrow{FunctionSpace} == \thisrow{FunctionSpace} ? \pgfmathresult
+ % x filter/.expression={(\thisrow{FunctionSpace} == P1 ? \pgfmathresult :NaN )},
+ x filter/.code={
+ \pgfmathparse{\thisrow{PolyOrder}==#2}
+ \ifnum0=\pgfmathresult
+ \pgfmathsetmacro{\newx}{nan}
+ \else
+ \pgfmathsetmacro{\newx}{\thisrow{Mesh}}
+ \fi
+ \pgfmathparse{\newx}
+ },
+ y filter/.expression={ #4*\pgfmathresult }
+ ] table [x=Mesh, y=#1] {\dataTableMeasures};
+ }
+
+ \def\myPlotOutputMeasures#1#2#3#4{
+ \resizebox{\textwidth}{0.6172\textwidth}{
+ \begin{tikzpicture}
+ \begin{axis}[
+ %width=\textwidth, height=1.2\textheight,
+ xtick=data,
+ xticklabel={M$\pgfmathprintnumber{\tick}$},
+ xmajorgrids=true,% xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ %xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Mesh levels}, ylabel={#4},
+ % legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ %legend style={at={(0.,1)}, anchor=north west,font=\small,legend
+ %columns=3},
+ legend style={at={(0.,1)}, anchor=south west,font=\small,legend columns=4},
+ %#2
+ ]
+
+ \myAddPlot{#1}{1}{color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth}{#2}
+ \addlegendentry{P1}
+ \myAddPlot{#1}{2}{color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth}{#2}
+ \addlegendentry{P2}
+ \myAddPlot{#1}{3}{color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth}{#2}
+ \addlegendentry{P3}
+ \addplot+[color=red,\myLineStyleB,mark=none,line width=\myLineWidth,every mark/.append style={solid}] coordinates {
+ (1,#3) (3,#3)
+ };
+ \addlegendentry{Ref}
+
+ \end{axis}
+ \end{tikzpicture}
+ }
+
+}
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \myPlotOutputMeasures{Normal_Heat_Flux_alpha}{1}{46.09}{Heat flow [W]}
+ \caption{Heat flow measured in $\alpha$ environment}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \myPlotOutputMeasures{Normal_Heat_Flux_beta}{1}{13.89}{Heat flow [W]}
+ \caption{Heat flow measured in $\beta$ environment}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \vspace*{0.03\textheight}
+ \myPlotOutputMeasures{Normal_Heat_Flux_gamma}{-1}{59.98}{Heat flow [W]} % warning inverse
+ \caption{Heat flow measured in $\gamma$ environment}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \vspace*{0.03\textheight}
+ \myPlotOutputMeasures{Statistics_temperature_alpha_min}{1}{11.32}{Temperature
+ [°C]}
+ \caption{Surface temperature min in $\alpha$ environment}
+ \end{subfigure}
+
+ \caption{Thermal bridges benchmarks - Convergence of validation measures compared to references values}
+ \label{fig:feelpp:wp1:thermal_bridges:measures_convergences}
+\end{figure}
+
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process:
+\begin{itemize}
+    \item \textbf{Memory Usage:} We need to monitor the memory
+    consumed during the simulation to avoid pathological behavior such as swapping.
+    \item \textbf{Parallelization Inefficiencies:} We need to understand and
+    improve performance when MPI communication and filesystem IO become dominant.
+ %\item \textbf{Cache and Memory Bottlenecks:}
+\end{itemize}
+
+To conclude, we have carried out HPC performance tests on the benchmark called thermal
+bridges. We have successfully executed several simulations on
+significant resources and demonstrated the validation of the \Feelpp framework in the
+elliptic PDE context. We have also validated the deployment of \Feelpp with
+container support. We now need to provide more refined measurements to detect and
+analyze the causes of performance degradation, and to compare with other software
+installations, such as Spack.
+
+
+
+\subsubsection{Benchmark \#2: Linear elasticity: NAFEMS LE10}
+
+\paragraph{Description}
+
+\begin{figure}[h]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-nafems-le10-solution-disp.png}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-nafems-le10-solution-vonmises.png}
+ \end{subfigure}
+  \caption{NAFEMS LE10 benchmark - displacement solution (left) and
+    von Mises yield criterion (right)}
+ \label{fig:wp1:feelpp:nafems-le10:visualization}
+\end{figure}
+
+\paragraph{Benchmarking Tools Used}
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{table}[h!]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}\multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties}
+ & \multicolumn{2}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \rowcolor{numpexgray} {\color{white}\bf Tag} & {\color{white}\bf \# points} & {\color{white}\bf \# edges} & {\color{white}\bf \# faces} & {\color{white}\bf \# elements} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} \\
+ \texttt{M2} & \pgfmathprintnumber{324257} & \pgfmathprintnumber{2247489} & \pgfmathprintnumber{3796235} & \pgfmathprintnumber{1873002} & \pgfmathprintnumber{972771} & \pgfmathprintnumber{7715238} \\
+ \texttt{M3} & \pgfmathprintnumber{2426377} & \pgfmathprintnumber{17230019} & \pgfmathprintnumber{29409215} & \pgfmathprintnumber{14605572} & \pgfmathprintnumber{7279131} & \pgfmathprintnumber{58969188} \\
+ \texttt{M4} & \pgfmathprintnumber{18801264} & \pgfmathprintnumber{135213828} & \pgfmathprintnumber{232036941} & \pgfmathprintnumber{115624376} & \pgfmathprintnumber{56403792} & \pgfmathprintnumber{462045276} \\
+ \hline
+ \end{tabular}
+ }}
+  \caption{NAFEMS LE10 benchmark - Statistics on meshes and number of degrees of freedom with respect
+ to finite element approximation}
+ \label{tab:wp1:feelpp:nafems-le10:discr_stat}
+\end{table}
+
+\paragraph{Results Summary}
+
+\foreach [expand list=true] \meshId in {2,3,4} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/nafems_le10_M\meshId_P1_discoverer.csv}\dataPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/nafems_le10_M\meshId_P2_discoverer.csv}\dataPb
+
+ \begin{figure}
+ \centering
+ \def\plotSetup##1{
+ {table=##1,column=init,legend=Init,color=customdarkblue},
+ {table=##1,legend=Assembly,column=algebraic-assembly,color=customcyan},
+ %{table=##1,legend=Solve,column=algebraic-solve,color=customorange},
+ {table=##1,legend=PostProcess,column=exportResults,color=custompurple}
+ }
+ \def\chartBarPlot##1##2##3{
+ \barChart[ybar,xticklabels from table={##2}{nProc},legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1},##3]{\plotSetup{##1}}
+ }
+ \def\chartBarStackedPlot##1##2{
+ \barChart[ybar stacked,xticklabels from table={##2}{nProc},
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPa}{\dataPa}{}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \ifnum4=\meshId
+ \chartBarPlot{dataPb}{\dataPb}{enlarge x limits=0.3}
+ \else
+ \chartBarPlot{dataPb}{\dataPb}{}
+ \fi
+
+
+
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+  \caption{NAFEMS LE10 benchmarks - Execution time of main simulation components
+ - Mesh \texttt{M\meshId} - Discoverer supercomputer}
+ \label{fig:feelpp:wp1:nafems-le10:performance_times_M\meshId}
+ \end{figure}
+
+}
diff --git a/software/feelpp/WP1/WP1-thermoelectric.tex b/software/feelpp/WP1/WP1-thermoelectric.tex
new file mode 100644
index 0000000..d671899
--- /dev/null
+++ b/software/feelpp/WP1/WP1-thermoelectric.tex
@@ -0,0 +1,7 @@
+\subsubsection{Benchmark \#3: Thermo-Electric Coupling}
+
+Thermo-electric coupling in a complex geometry.
+
+
+
+
diff --git a/software/feelpp/WP1/WP1.tex b/software/feelpp/WP1/WP1.tex
index c73e08d..dd11e80 100644
--- a/software/feelpp/WP1/WP1.tex
+++ b/software/feelpp/WP1/WP1.tex
@@ -1,7 +1,8 @@
-\section{Software: Feel++}
-\label{sec:WP1:Feel++:software}
+%!TEX root = ../../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
+\label{sec:WP1:Feelpp:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -10,7 +11,7 @@ \section{Software: Feel++}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-Feel++ Consortium\\
+\Feelpp{} Consortium\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CNRS\\
@@ -37,131 +38,1695 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
- \bottomrule
+\rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+ Christophe Prud'homme (UNISTRA)\\
+ Vincent Chabannes (UNISTRA)\\
+ Thomas Saigre (UNISTRA)\\
+ Céline Van Landeghem (UNISTRA)\\
+ Christophe Trophime (CNRS)\\
+\end{tabular}\\
+ \hline
\end{tabular}
}}
- \caption{WP1: Feel++ Information}
+ \caption{WP1: \Feelpp{} Information}
\end{table}
\subsection{Software Overview}
-\label{sec:WP1:Feel++:summary}
+\label{sec:WP1:Feelpp:summary}
-In~\cref{tab:WP1:Feel++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+\Feelpp is an open-source \Cpp{} library for solving partial differential equations (PDEs); it supports seamless MPI-based parallel computing and is designed to be highly modular and extensible.
+It implements a \ac{DSEL} for variational formulations, which allows users to define complex PDEs in a concise and readable manner directly in \Cpp{}, leveraging the power of modern \Cpp{} features.
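+As an illustration of the \ac{DSEL}, the listing below sketches how a Laplace-type variational formulation is written directly in \Cpp{}. This is a minimal sketch in the spirit of the \Feelpp quickstart Laplacian example; exact headers, option handling and boundary conditions may differ in the actual toolboxes used for the benchmarks.
+\begin{minted}[fontsize=\small]{cpp}
+#include <feel/feel.hpp>
+
+int main( int argc, char** argv )
+{
+    using namespace Feel;
+    Environment env( _argc=argc, _argv=argv );
+
+    // load a 3D simplicial mesh (filename taken from the command-line options)
+    auto mesh = loadMesh( _mesh=new Mesh<Simplex<3>> );
+    // continuous Lagrange finite element space of order 2
+    auto Vh = Pch<2>( mesh );
+    auto u = Vh->element();
+    auto v = Vh->element();
+
+    // bilinear and linear forms of the variational formulation
+    auto a = form2( _trial=Vh, _test=Vh );
+    a = integrate( _range=elements(mesh), _expr=gradt(u)*trans(grad(v)) );
+    auto l = form1( _test=Vh );
+    l = integrate( _range=elements(mesh), _expr=id(v) );
+
+    // homogeneous Dirichlet condition on the boundary, then solve
+    a += on( _range=boundaryfaces(mesh), _rhs=l, _element=u, _expr=cst(0.) );
+    a.solve( _rhs=l, _solution=u );
+}
+\end{minted}
+This expression-level syntax is what the toolboxes build upon; the library takes care of mesh partitioning, parallel assembly and the algebraic backends.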
-\begin{table}[h!]
+In~\Cref{tab:WP1:Feelpp:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} cG & provide short description here \\
-\rowcolor{numpexlightergray} dG/hdG & provide short description here \\
-\rowcolor{white} finite element & provide short description here \\
-\rowcolor{numpexlightergray} inhouse & provide short description here \\
-\rowcolor{white} interface & provide short description here \\
-\rowcolor{numpexlightergray} mesh adaptation & provide short description here \\
-\rowcolor{white} multiphysics coupling & provide short description here \\
-\rowcolor{numpexlightergray} multiscale coupling & provide short description here \\
-\rowcolor{white} parallel in time & provide short description here \\
-\rowcolor{numpexlightergray} spectral element & provide short description here \\
-\rowcolor{white} unstructured mesh & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} cG& continuous Galerkin of arbitrary order; conforming and non conforming interpolation operator; \ac{DSEL} for cG methods and variational formulations \\
+\rowcolor{numpexlightergray} dG/hdG & support dG and HdG methods in 1D, 2D and 3D of arbitrary order; support postprocessing for increased accuracy for HdG; Static condensation multithreaded; \ac{DSEL} for dG/HdG methods and variational formulations\\
+\rowcolor{white} finite element & $H^1$, $L^2$, $H^\mathrm{div}$ and $H^\mathrm{curl}$ finite elements of arbitrary order in 1D, 2D, 3D; \\
+\rowcolor{numpexlightergray} inhouse & efficient data structures for localisation: BVH and KD-trees \\
+\rowcolor{white} interface & interfaces with MMG/ParMMG; Gmsh; Eigen3 \\
+\rowcolor{numpexlightergray} mesh adaptation & use MMG and ParMMG for mesh adaptation; mesh quality indicators to trigger adaptation\\
+\rowcolor{white} multiphysics coupling & Support for function space cartesian products; \ac{DSEL} for variational formulations \\
+\rowcolor{numpexlightergray} multiscale coupling & multi-dimension coupling, \textit{e.g.} 0D-3D, 1D-3D or 2D-3D; support for solving coupled systems of PDEs and ODEs\\
+\rowcolor{white} parallel in time & space-time parallel implementation of original parareal algorithm \\
+\rowcolor{numpexlightergray} spectral element & high order methods on simplices and hypercubes; Gauss-Legendre, Gauss-Lobatto, Gauss-Radau, electrostatic and Fekete points; high order geometric transformation (up to order 4 using Gmsh); construction of $P_N \mathrm{iso} P_1$; $L^2$ orthonormal basis functions used as primal basis functions on hypercubes (Gauss-Legendre) and simplices (Dubiner) \\
+\rowcolor{white} unstructured mesh & Hypercubes and Simplices meshes in 1D, 2D and 3D as well as 1D meshes in 2D and 3D and 2D meshes in 3D; efficient localisation using kd-tree; \ac{DSEL} to manipulate geometric entity collections (points,edges,faces, facets and volumes)\\
+\hline
\end{tabular}
}
}
- \caption{WP1: Feel++ Features}
- \label{tab:WP1:Feel++:features}
+ \caption{WP1: \Feelpp Features}
+ \label{tab:WP1:Feelpp:features}
\end{table}
\subsection{Parallel Capabilities}
-\label{sec:WP1:Feel++:performances}
+\label{sec:WP1:Feelpp:performances}
+
+
+\begin{description}
+  \item[Parallel Programming Environment:] MPI, multithreading in HdG methods when static condensation is enabled, and task-based parallelism using Specx~\cite{cardosi_specx_2023}.
+  \item[Supercomputers:] Gaya (\Cref{sec:arch:gaya}) and Discoverer (\Cref{sec:arch:eurohpc-ju}).
+  \item[Parallel Capabilities:] All data structures are parallelized using MPI. We are currently working on adding GPU support for some of them. HDF5 and MPI-IO are used for I/O.
+  \item[Integration with Other Systems:] Regarding WP1, \Feelpp is interfaced with Specx, MMG/ParMMG and Gmsh.
+\end{description}
+
+% \begin{itemize}
+% \item describe the parallel programming environment : MPI and
+% \item describe the parallel computation environment: type of architecture and super computer used.
+% \item describe the parallel capabilities of the software
+% \item \textbf{Scalability:} Describe the general scalability properties of the software
+% \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+% \end{itemize}
+
+
+\subsection{Initial Performance Metrics}
+\label{sec:WP1:Feelpp:metrics}
+This section provides a summary of initial performance benchmarks performed in the context of WP1.
+It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results.
+The input data are publicly available: some already have a Zenodo DOI (\cref{sec:arch:zenodo}), while others are stored on Girder (\cref{sec:arch:girder:unistra}) or in the software repository.
+% \begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+% \end{itemize}
+%
+% % create latex counter
+% \newcounter{feelppWP1benchcounter}
+% % set the counter to 1
+% \setcounter{feelppWP1benchcounter}{1}
+% \subimport{../software/feelpp/WP1}{WP1-thermalbridge.tex}
+% % increment the counter
+% \stepcounter{feelppWP1benchcounter}
+% % get value of counter
+%
+%
+%
+% \subimport{../software/feelpp/WP1}{WP1-heatfluid.tex}
+% % increment the counter
+% \stepcounter{feelppWP1benchcounter}
+% \subimport{../software/feelpp/WP1}{WP1-contact.tex}
+% %\subimport{./}{WP1-lap-elas}
+
+
+\subsubsection{Benchmark \#1: Compute Distance Function}
+
+\paragraph{Description}
+This benchmark evaluates two methods for computing the distance function inside a three-dimensional box:
+\begin{enumerate}
+ \item The \textbf{Level Set} method using the \textbf{Fast Marching Algorithm (FMA)}.
+ \item The \textbf{Ray Tracing} method.
+\end{enumerate}
+The objective is to compute the distance function at all vertices of a discretized box using both methods and verify whether they produce the same results.
+The problem is discretized using an unstructured grid, and performance is assessed on a multi-core CPU architecture.
+
+The benchmark aims to compare the efficiency, accuracy, and computational cost of both approaches in terms of distance calculation within the 3D domain.
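+For reference, both methods approximate the same quantity: the distance from any point $x$ of the box $\Omega$ to a given surface $\Gamma$ (the box boundary or an embedded interface, depending on the setup). A standard way to state it, which the fast marching algorithm exploits through the eikonal equation, is
+\[
+  d(x) \;=\; \min_{y \in \Gamma} \lVert x - y \rVert ,
+  \qquad\text{equivalently}\qquad
+  \lvert \nabla d \rvert = 1 \ \text{in } \Omega, \quad d = 0 \ \text{on } \Gamma .
+\]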
+
+\paragraph{Benchmarking Tools Used}
+The following tools were used for performance profiling and analysis:
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+\item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time.
\end{itemize}
+The key metrics measured include execution time, accuracy, memory usage, and floating-point operations (FLOPS) for both methods.
-\subsection{Initial Performance Metrics}
-\label{sec:WP1:Feel++:metrics}
+\paragraph{Input/Output Dataset Description}
+\begin{itemize}
+ \item \textbf{Input Data:} The input consists of a 3D uniform grid representing the box geometry, with approximately 1 million vertices. The level set function and ray tracing boundaries are initialized for the distance computation. The input data is stored in JSON format, and it can be accessed via DOI: \texttt{[Insert DOI]}.
+
+ \item \textbf{Output Data:} The output includes the computed distance function values at all vertices for both methods, stored in HDF5 format and statistics (performance and errors) in CSV format. Additionally, runtime performance logs and accuracy comparisons between the methods are included.
+
+ \item \textbf{Data Repository:} Input and output datasets, along with performance logs, will be stored in a Zenodo repository~\cref{sec:arch:zenodo}
+\end{itemize}
+
+This benchmark will include CPU and GPU benchmarking results. They are not yet available but will be provided once the GPU implementation is finalized.
-This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+% \paragraph{Results Summary}
+% The performance comparison between the two methods is summarized as follows:
+%
+% RESULTS here.
+%
+% \paragraph{Challenges Identified}
+% The following challenges were encountered during the benchmarking process:
+% \begin{itemize}
+% \item \textbf{Ray Tracing Bottlenecks:}
+% \item \textbf{Parallelization Issues:}
+% \item \textbf{Memory Usage:}
+% \end{itemize}
+%
+% Final analysis and persectives here.
+%
+% \begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+% \end{itemize}
+\subsubsection{Benchmark \#2: Elliptic linear PDE: Thermal bridges}
+\label{sec:WP1:Feelpp:benchmark:thermal_bridges}
+\paragraph{Description:} %Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+
+The benchmark known as ``thermal bridges'' is an example of an application that
+enables us to validate numerical simulation tools using \Feelpp. We have
+developed tests based on the ISO 10211:2017 standard
+\fullcite{noauthor_iso_2017}, which provides methodologies for evaluating
+thermal bridges in building construction.
+
+Thermal bridges are areas within a building envelope where heat
+flow is different compared to adjacent areas, often resulting in increased heat
+loss or unwanted condensation.
+The standard is intended to ensure that thermal
+bridge simulations are computed accurately. It provides reference values (and tolerances) for the
+temperature and the heat flux at several locations of the geometry.
+
+At the mathematical level, this application requires computing the numerical
+solution of a linear elliptic PDE (namely the heat equation). We employ a
+finite element method based on continuous Lagrange finite elements of order 1, 2
+and 3 (denoted by $P_1$, $P_2$ and $P_3$), and we analyze the execution time of the
+main components of the simulation.
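+For reference, the discrete problem has the standard weak form of the steady heat equation, sketched here (the actual boundary conditions follow the ISO 10211:2017 setup, with exchange coefficients $h_e$ and ambient temperatures $T_e$ on each environment boundary $\Gamma_e$): find $T_h$ in the $P_k$ Lagrange space $V_h \subset H^1(\Omega)$ such that
+\[
+  \int_\Omega k \,\nabla T_h \cdot \nabla v \,\mathrm{d}x
+  \;+\; \sum_{e} \int_{\Gamma_e} h_e\, T_h\, v \,\mathrm{d}s
+  \;=\; \sum_{e} \int_{\Gamma_e} h_e\, T_e\, v \,\mathrm{d}s
+  \qquad \forall\, v \in V_h,
+\]
+where $k$ denotes the (piecewise constant) thermal conductivity of the materials.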
+
+\Cref{fig:wp1:feelpp:thermal_bridges:geometry} shows the geometry
+of this benchmark and the decomposition of the domain by material.% the 3D temperature field solution, and an example of mesh partitioning.
+
+\begin{figure}[!ht]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-geom.png}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-geom2.png}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - geometry and materials}
+ \label{fig:wp1:feelpp:thermal_bridges:geometry}
+\end{figure}
+
+
+\paragraph{Benchmarking Tools Used}
+
+The benchmark was performed on the \textbf{Discoverer} supercomputer (see
+\Cref{sec:arch:eurohpc-ju}).
+The performance tools integrated into the \Feelpp-toolboxes framework were used to measure
+the execution time.
+Moreover, note that we used Apptainer here, with a \Feelpp SIF image based on the Ubuntu Noble OS.
+
+The metrics measured are the execution times of the main components of the simulation, which we enumerate below:
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+\item \textbf{Init}: load the mesh from the filesystem and initialize the heat toolbox (finite element context and algebraic data structures)
+\item \textbf{Assembly}: compute and assemble the matrix and right-hand side entries obtained with the finite element method
+\item \textbf{Solve}: solve the linear system using a preconditioned GMRES. Results
+  are presented in \Cref{sec:WP3:Feelpp:benchmark:thermal_bridges}.
+\item \textbf{PostProcess}: compute the validation measures (temperature at points and
+  heat flux) and export the solution to the filesystem in a visualization format
+  (EnsightGold).
\end{itemize}
-\subsubsection{Benchmark \#1}
+\paragraph{Input/Output Dataset Description}
+
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\item \textbf{Input Data:}
+ \begin{itemize}
+  \item Meshes: We have generated three mesh levels called M1, M2
+    and M3, stored in Gmsh format. Their statistics can be found in
+    \Cref{tab:wp1:feelpp:thermal_bridges:discr_stat}. We have also prepared, for
+    each mesh level, a collection of partitioned meshes, using the in-house
+    \Feelpp mesh format based on JSON+HDF5 files.
+    The Gmsh meshes and the partitioned meshes are available on our Girder
+    data management platform, in the \Feelpp collections.
+  \item Setup: We use the standard setup of the \Feelpp toolboxes, i.e. a cfg
+    file and a JSON file. These configuration files are available in the \Feelpp GitHub repository.
+  \item SIF image: feelpp:v0.111.0-preview.10-noble-sif (stored in the GitHub registry of \Feelpp)
+ \end{itemize}
+\item \textbf{Output Data:} The output includes the computed values of the
+  validation measures in CSV format, exported visualization files (mesh, partitioning, temperature), and the time taken to perform each simulation step.
\end{itemize}
-\subsection{12-Month Roadmap}
-\label{sec:WP1:Feel++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}\multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties}
+ & \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \rowcolor{numpexgray} {\color{white}\bf Tag} & {\color{white}\bf \# points} & {\color{white}\bf \# edges} & {\color{white}\bf \# faces} & {\color{white}\bf \# elements} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} & {\color{white}\bf $P_3$} \\
+ \texttt{M1} & \pgfmathprintnumber{193654} & \pgfmathprintnumber{1299920} & \pgfmathprintnumber{2164759} & \pgfmathprintnumber{1058492} & \pgfmathprintnumber{193654} & \pgfmathprintnumber{1493574} & \pgfmathprintnumber{4958253} \\
+ \texttt{M2} & \pgfmathprintnumber{1401135} & \pgfmathprintnumber{9778744} & \pgfmathprintnumber{16566803} & \pgfmathprintnumber{8189193} & \pgfmathprintnumber{1401135} & \pgfmathprintnumber{11179879} & \pgfmathprintnumber{37525426} \\
+ \texttt{M3} & \pgfmathprintnumber{10572256} & \pgfmathprintnumber{75307308} & \pgfmathprintnumber{128722252} & \pgfmathprintnumber{63987199} & \pgfmathprintnumber{10572256} & \pgfmathprintnumber{85879564} & \pgfmathprintnumber{289909124} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{Thermal bridges benchmarks - Statistics on meshes and number of degrees of freedom with respect
+ to finite element approximation}
+ \label{tab:wp1:feelpp:thermal_bridges:discr_stat}
+\end{table}
+
+
+\paragraph{Results Summary}
+
+We start by showing in \cref{fig:wp1:feelpp:thermal_bridges:visualization} an example of the numerical solution and of the mesh
+partitioning obtained with the simulation pipeline. The partitioning is considered
+an offline process here, but it requires non-negligible time and memory; this
+will be described explicitly in future work. With
+\cref{fig:feelpp:wp1:thermal_bridges:measures_convergences}, we validate
+the simulation runs by comparing the computed measures to the reference values.
+
+\begin{figure}[!ht]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-solution.png}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-thermalbridges-pid.png}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - temperature solution (left) and
+ partitioning example (right)}
+ \label{fig:wp1:feelpp:thermal_bridges:visualization}
+\end{figure}
+
+
+The benchmark performance results are summarized in
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M1},
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M2} and
+\Cref{fig:feelpp:wp1:thermal_bridges:performance_times_M3}, which correspond respectively to
+the meshes M1, M2 and M3. Moreover, for each mesh, we have experimented with several
+finite element discretizations, namely $P_1$, $P_2$ and $P_3$.
+For each order of finite element approximation, we have selected a range of CPU core
+counts.
+%Firstly, we can see clearly that the Solve part is the most time-consuming. See comments in \Cref{sec:WP3:Feelpp:benchmark:thermal_bridges}.
+Concerning the coarse mesh M1, we note that the scalability is poor, especially
+for low orders, simply because the problem is too small for so many HPC resources: MPI
+communications and IO effects are non-negligible.
+For the meshes M2 and M3, results are better (but not ideal), and the limit of the scalability test is reached quickly.
+Finally, the finest mesh M3 shows the best scalability in this
+benchmarking experiment: we see a reduction in computational cost as the
+computational resources increase. However, since the execution times are already short,
+the scaling limit is reached rapidly.
+
+%Each run illustrated here is also validated thanks to the value of the ISO
+%10211:2017 standard.
+With these benchmarking experiments, we have also observed some variability in
+the performance measures. Aspects such as filesystem and network load are not
+under our control and can explain part of this variability (especially when the
+local computation times are small).
+%Also, the memory and
+%consequently the choice of the number of tasks per node can be important and
+%can change significantly the performance. This latter will be taken into account
+%more accurately in a next campaign of these benchmarking test.
+
+
+
+% Color cycle: customdarkblue,customcyan,customorange,custompurple
+\newcommand{\barChart}[2][ybar]{
+ \begin{tikzpicture}
+ \begin{axis}[
+ %width=\textwidth, height=0.6172\textwidth,
+ xlabel={Number of CPU core}, ylabel={Execution time [s]},
+ %xticklabels from table={#2}{nProc},
+ xtick=data,
+ xtick align=outside,
+ ymin=0,
+ %legend style={at={(1,1)}, anchor=north east},
+ %legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt,
+ #1
+ ]
+ \foreach [expand list=true] \thetuple in {#2} {
+ \pgfkeys{/mysettings/.cd,
+ table/.store in=\mytable,
+ column/.store in=\mycolumn,
+ shift/.store in=\myshift, shift/.default=0, shift,
+ legend/.store in=\mylegend,
+ color/.store in=\mycolor
+ }
+ \edef\temp{
+ \noexpand\pgfkeys{/mysettings/.cd, \expandafter\@firstofone\thetuple}
+ } \temp
+ %\def\toto{\expandafter\mytable}
+ \edef\temp{
+ \noexpand\addplot[ybar, bar width=0.2, fill=\mycolor, draw=black, point meta=y]
+ table [x expr=\noexpand\coordindex+\myshift, y=\mycolumn ] {\expandafter\noexpand\csname \mytable\endcsname};
+ %table [x=nProc, y=\mycolumn ] {\expandafter\noexpand\csname \mytable\endcsname};
+ } \temp
+ %table [x expr=\noexpand\coordindex, y=\mycolumn ] {#2};
+ \edef\temp{
+ \noexpand\addlegendentry{\mylegend}
+ } \temp
+ }
+ \end{axis}
+\end{tikzpicture}
+}
+
+
+% columns/FunctionSpace/.style={string type},
+\begin{figure}[h]
+ \centering
+ \captionsetup[subfigure]{justification=centering}
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_measures.csv}\dataTableMeasures
+ \def\myLineWidth{2pt}
+ \def\myLineStyleA{loosely dashdotdotted} %dashdotdotted
+ \def\myLineStyleB{dashed}
+ \def\myLineStyleC{solid}
+
+ \def\myAddPlot#1#2#3#4{
+ \addplot[#3,every mark/.append style={solid},
+ % x filter/.expression={(\thisrow{PolyOrder} == 1 ? \pgfmathparse{\thisrow{Mesh}}\pgfmathresult :NaN)},
+ % x filter/.expression={ \thisrow{FunctionSpace} == \thisrow{FunctionSpace} ? \pgfmathresult
+ % x filter/.expression={(\thisrow{FunctionSpace} == P1 ? \pgfmathresult :NaN )},
+ x filter/.code={
+ \pgfmathparse{\thisrow{PolyOrder}==#2}
+ \ifnum0=\pgfmathresult
+ \pgfmathsetmacro{\newx}{nan}
+ \else
+ \pgfmathsetmacro{\newx}{\thisrow{Mesh}}
+ \fi
+ \pgfmathparse{\newx}
+ },
+ y filter/.expression={ #4*\pgfmathresult }
+ ] table [x=Mesh, y=#1] {\dataTableMeasures};
+}
+
+\def\myPlotOutputMeasures#1#2#3#4{
+ \resizebox{\textwidth}{0.6172\textwidth}{
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ % width=\textwidth, height=1.2\textheight,
+ xtick=data,
+ xticklabel={M$\pgfmathprintnumber{\tick}$},
+ xmajorgrids=true,% xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ % xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Mesh levels}, ylabel={#4},
+ % legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ % legend style={at={(0.,1)}, anchor=north west,font=\small,legend
+ % columns=3},
+ legend style={at={(0.,1)}, anchor=south west,font=\small,legend columns=4},
+ % #2
+ ]
+
+ \myAddPlot{#1}{1}{color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth}{#2}
+ \addlegendentry{P1}
+ \myAddPlot{#1}{2}{color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth}{#2}
+ \addlegendentry{P2}
+ \myAddPlot{#1}{3}{color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth}{#2}
+ \addlegendentry{P3}
+ \addplot+[color=red,\myLineStyleB,mark=none,line width=\myLineWidth,every mark/.append style={solid}] coordinates {
+ (1,#3) (3,#3)
+ };
+ \addlegendentry{Ref}
+
+ \end{axis}
+ \end{tikzpicture}
+ }
+
+}
+
+\begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \myPlotOutputMeasures{Normal_Heat_Flux_alpha}{1}{46.09}{Heat flow [W]}
+ \caption{Heat flow measured in $\alpha$ environment}
+\end{subfigure}
+\hfill
+\begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \myPlotOutputMeasures{Normal_Heat_Flux_beta}{1}{13.89}{Heat flow [W]}
+ \caption{Heat flow measured in $\beta$ environment}
+\end{subfigure}
+\hfill
+\begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \vspace*{0.03\textheight}
+ \myPlotOutputMeasures{Normal_Heat_Flux_gamma}{-1}{59.98}{Heat flow [W]} % warning inverse
+ \caption{Heat flow measured in $\gamma$ environment}
+\end{subfigure}
+\hfill
+\begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \vspace*{0.03\textheight}
+ \myPlotOutputMeasures{Statistics_temperature_alpha_min}{1}{11.32}{Temperature
+ [°C]}
+ \caption{Surface temperature min in $\alpha$ environment}
+\end{subfigure}
+
+\caption{Thermal bridges benchmarks - Convergence of validation measures compared to references values}
+\label{fig:feelpp:wp1:thermal_bridges:measures_convergences}
+\end{figure}
+
+
+\foreach [expand list=true] \meshId in {1,2,3} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P1_discoverer.csv}\dataPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P2_discoverer.csv}\dataPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/thermalbridges_M\meshId_P3_discoverer.csv}\dataPc
+
+ \begin{figure}
+ \centering
+ \def\plotSetup##1{
+ {table=##1,column=init,legend=Init,color=customdarkblue},
+ {table=##1,legend=Assembly,column=algebraic-assembly,color=customcyan},
+ %{table=##1,legend=Solve,column=algebraic-solve,color=customorange},
+ {table=##1,legend=PostProcess,column=exportResults,color=customorange}
+ }
+ \def\chartBarPlot##1##2{
+ \barChart[ybar,
+ width=\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+ \def\chartBarStackedPlot##1##2{
+ \barChart[ybar stacked,
+ width=\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \vspace*{0.04\textwidth}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \vspace*{0.04\textwidth}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPc}{\dataPc}
+ \caption{\texttt{M\meshId} - \texttt{$P_3$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPc}{\dataPc}
+ \caption{\texttt{M\meshId} - \texttt{$P_3$}}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - Execution time of main simulation components
+ - Mesh \texttt{M\meshId} - Discoverer supercomputer}
+ \label{fig:feelpp:wp1:thermal_bridges:performance_times_M\meshId}
+ \end{figure}
+
+}
+
+
+
+
+
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Memory Usage:} Reduce the memory
+ footprint.
+    \item \textbf{Parallelization Inefficiencies:} Understand and
+    improve performance when MPI communication and filesystem IO become dominant.
+ %\item \textbf{Cache and Memory Bottlenecks:}
\end{itemize}
-In~\cref{tab:WP1:Feel++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+To conclude, we have carried out HPC performance tests on the benchmark called thermal
+bridges. We have successfully executed several simulations on
+significant resources and demonstrated the validation of the \Feelpp framework in the
+elliptic PDE context. We have also validated the deployment of \Feelpp with
+container support. We now need to provide more refined measurements to detect and
+analyze the causes of performance degradation, and to compare with other software
+installations, such as Spack.
+
+
+
+
+\subsubsection{Benchmark \#3: Linear elasticity: NAFEMS LE10}
+\label{sec:WP1:Feelpp:benchmark:nafems-le10}
+
+\paragraph{Description}
+The NAFEMS LE10 benchmark is a reference test designed to assess the
+capabilities of finite element analysis (FEA) software for modeling bending
+structures. This benchmark is part of the NAFEMS benchmark suite, commonly used to
+verify the accuracy and reliability of numerical simulation codes in mechanical
+and civil engineering.
+
+The LE10 benchmark tests the ability of software to solve a plate bending
+problem in the context of structural analysis. The goal is to compare the results
+obtained by simulation software with analytical or theoretical results, as well
+as with other calculation methods.
+
+The benchmark consists of analyzing a plate in flexion, subjected to a given
+load and fixed along some edges. The underlying physical model is linear
+elasticity. At the numerical level, we use the continuous Galerkin method
+with Lagrange finite elements of order 1 and 2. The validation of the NAFEMS LE10
+benchmark requires computing the normal stress in the $y$ direction at a given point of the geometry.
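+For reference, the displacement $u_h$ is obtained from the usual weak form of linear elasticity, sketched here (the load and clamping conditions are those prescribed by the LE10 specification): find $u_h$ in the $P_k$ Lagrange space $V_h \subset [H^1(\Omega)]^3$ incorporating the clamping conditions and such that
+\[
+  \int_\Omega \sigma(u_h) : \varepsilon(v) \,\mathrm{d}x
+  \;=\; \int_{\Gamma_N} g \cdot v \,\mathrm{d}s
+  \qquad \forall\, v \in V_h,
+  \qquad\text{with}\quad
+  \sigma(u) = \lambda\,\mathrm{tr}\big(\varepsilon(u)\big)\, I + 2\mu\,\varepsilon(u),
+  \quad \varepsilon(u) = \frac{1}{2}\big(\nabla u + \nabla u^{T}\big),
+\]
+where $(\lambda,\mu)$ are the Lamé coefficients and $g$ is the prescribed surface load on $\Gamma_N$; the validation quantity is the normal stress $\sigma_{yy}$ evaluated at the reference point.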
+
+
+\paragraph{Benchmarking Tools Used}
+The benchmark was performed on the \textbf{Discoverer} supercomputer (see
+\Cref{sec:arch:eurohpc-ju}).
+The performance tools integrated into the \Feelpp-toolboxes framework were used to measure
+the execution time.
+Moreover, note that we used Apptainer here, with a \Feelpp SIF image based on the Ubuntu Noble OS.
+
+The metrics measured are the execution times of the main components of the simulation, which we enumerate below:
+\begin{itemize}
+\item \textbf{Init}: load the mesh from the filesystem and initialize the solid toolbox (finite element context and algebraic data structures)
+\item \textbf{Assembly}: compute and assemble the matrix and right-hand side entries obtained with the finite element method
+\item \textbf{Solve}: solve the linear system using a preconditioned GMRES. Results
+  are presented in \Cref{sec:WP3:Feelpp:benchmark:nafems-le10}.
+\item \textbf{PostProcess}: compute the validation measures (normal stress at a point)
+  and export the solution to the filesystem in a visualization format (EnsightGold),
+  together with other fields of interest such as the von Mises yield criterion, the
+  Tresca criterion and the principal stresses.
+\end{itemize}
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+\item \textbf{Input Data:}
+ \begin{itemize}
+  \item Meshes: We have generated three mesh levels called M2, M3
+    and M4, stored in Gmsh format. Their statistics can be found in
+    \Cref{tab:wp1:feelpp:nafems-le10:discr_stat}. We have also prepared, for
+    each mesh level, a collection of partitioned meshes, using the in-house
+    \Feelpp mesh format based on JSON+HDF5 files.
+    The Gmsh meshes and the partitioned meshes are available on our Girder
+    data management platform, in the \Feelpp collections.
+  \item Setup: We use the standard setup of the \Feelpp toolboxes, i.e. a cfg
+    file and a JSON file. These configuration files are available in the \Feelpp GitHub repository.
+  \item SIF image: feelpp:v0.111.0-preview.10-noble-sif (stored in the GitHub registry of \Feelpp)
+ \end{itemize}
+\item \textbf{Output Data:} The output includes the computed values of the
+  validation measures in CSV format, exported visualization files (mesh,
+  partitioning, displacement, ...), and the time taken to perform each simulation step.
+\end{itemize}
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}\multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties}
+ & \multicolumn{2}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \rowcolor{numpexgray} {\color{white}\bf Tag} & {\color{white}\bf \# points} & {\color{white}\bf \# edges} & {\color{white}\bf \# faces} & {\color{white}\bf \# elements} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} \\
+ \texttt{M2} & \pgfmathprintnumber{324257} & \pgfmathprintnumber{2247489} & \pgfmathprintnumber{3796235} & \pgfmathprintnumber{1873002} & \pgfmathprintnumber{972771} & \pgfmathprintnumber{7715238} \\
+ \texttt{M3} & \pgfmathprintnumber{2426377} & \pgfmathprintnumber{17230019} & \pgfmathprintnumber{29409215} & \pgfmathprintnumber{14605572} & \pgfmathprintnumber{7279131} & \pgfmathprintnumber{58969188} \\
+ \texttt{M4} & \pgfmathprintnumber{18801264} & \pgfmathprintnumber{135213828} & \pgfmathprintnumber{232036941} & \pgfmathprintnumber{115624376} & \pgfmathprintnumber{56403792} & \pgfmathprintnumber{462045276} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{NAFEMS LE10 benchmark - Statistics on meshes and number of degrees of freedom with respect
+ to finite element approximation}
+ \label{tab:wp1:feelpp:nafems-le10:discr_stat}
+\end{table}
+
+\paragraph{Results Summary}
+We start by showing \Cref{fig:wp1:feelpp:nafems-le10:results:visualization}, which
+illustrates the quality of the numerical results of these in silico experiments. The
+displacement field and some quantities of interest are shown in
+\cref{fig:feelpp:wp1:nafems-le10:results:visualization:displacement,fig:feelpp:wp1:nafems-le10:results:visualization:vonmises,fig:feelpp:wp1:nafems-le10:results:visualization:wrap-tresca}.
+Moreover, \cref{fig:feelpp:wp1:nafems-le10:results:measures_convergences} shows the
+validity and quality of the solution for each discretization.
+
+In terms of performance, we have plotted the execution times related to the WP1 context. These results, illustrating the \textbf{Init}, \textbf{Assembly} and \textbf{PostProcess} components, can be found in
+\cref{fig:feelpp:wp1:nafems-le10:results:performance_times_M2,fig:feelpp:wp1:nafems-le10:results:performance_times_M3,fig:feelpp:wp1:nafems-le10:results:performance_times_M4}.
+The behavior is very good: we observe good scalability, up to the usual limit reached
+when the problem size becomes too small. We notice some jumps in the
+\textbf{Init} part for large numbers of tasks. We think this is due to
+filesystem load, in particular because we load the mesh file from non-fast
+storage: outputs are written to fast storage, but the input datasets are not, because of
+reduced quota limits on this fast disk.
+
+\begin{figure}[!ht]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-nafems-le10-solution-disp.png}
+ \caption{Displacement}
+ \label{fig:feelpp:wp1:nafems-le10:results:visualization:displacement}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-nafems-le10-solution-vonmises.png}
+ \caption{von Mises yield criterion}
+ \label{fig:feelpp:wp1:nafems-le10:results:visualization:vonmises}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-nafems-le10-solution-tresca-wrap.png}
+ \caption{Tresca criterion on the deformed domain (magnified by $1000$)}
+ \label{fig:feelpp:wp1:nafems-le10:results:visualization:wrap-tresca}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+
+ \centering
+ \captionsetup[subfigure]{justification=centering}
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/nafems_le10_measures.csv}\dataTableMeasures
+ \def\myLineWidth{2pt}
+ \def\myLineStyleA{loosely dashdotdotted} %dashdotdotted
+ \def\myLineStyleB{dashed}
+ \def\myLineStyleC{solid}
+
+ \def\myAddPlot#1#2#3#4{
+ \addplot[#3,every mark/.append style={solid},
+ % x filter/.expression={(\thisrow{PolyOrder} == 1 ? \pgfmathparse{\thisrow{Mesh}}\pgfmathresult :NaN)},
+ % x filter/.expression={ \thisrow{FunctionSpace} == \thisrow{FunctionSpace} ? \pgfmathresult
+ % x filter/.expression={(\thisrow{FunctionSpace} == P1 ? \pgfmathresult :NaN )},
+ x filter/.code={
+ \pgfmathparse{\thisrow{PolyOrder}==#2}
+ \ifnum0=\pgfmathresult
+ \pgfmathsetmacro{\newx}{nan}
+ \else
+ \pgfmathsetmacro{\newx}{\thisrow{Mesh}}
+ \fi
+ \pgfmathparse{\newx}
+ },
+ y filter/.expression={ #4*\pgfmathresult }
+ ] table [x=Mesh, y=#1] {\dataTableMeasures};
+ }
+
+ \def\myPlotOutputMeasures#1#2#3#4{
+ % \resizebox{\textwidth}{0.6172\textwidth}{
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ xtick=data,
+ xticklabel={M$\pgfmathprintnumber{\tick}$},
+ xmajorgrids=true,% xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ % xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Mesh levels}, ylabel={#4},
+ % legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ % legend style={at={(0.,1)}, anchor=north west,font=\small,legend
+ % columns=3},
+ legend style={at={(1.,1)}, anchor=south east,font=\small,legend columns=4},
+ % #2
+ ]
+
+ \myAddPlot{#1}{1}{color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth}{#2}
+ \addlegendentry{P1}
+ \myAddPlot{#1}{2}{color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth}{#2}
+ \addlegendentry{P2}
+ % \myAddPlot{#1}{3}{color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth}{#2}
+ % \addlegendentry{P3}
+ \addplot+[color=red,\myLineStyleB,mark=none,line width=\myLineWidth,every mark/.append style={solid}] coordinates {
+ (2,#3) (4,#3)
+ };
+ \addlegendentry{Ref}
+
+ \end{axis}
+ \end{tikzpicture}
+ % }
+
+ }
+
+ \myPlotOutputMeasures{Points_pointD_expr_sigma_yy}{1}{5.38e6}{Normal stress ($\sigma_{yy}$) [Pa]}
+
+ \caption{NAFEMS LE10 benchmarks - Output measures}
+ \label{fig:feelpp:wp1:nafems-le10:results:measures_convergences}
+\end{subfigure}
+
+\caption{NAFEMS LE10 benchmark - numerical solutions and convergence of validation measures compared to reference value}
+\label{fig:wp1:feelpp:nafems-le10:results:visualization}
+\end{figure}
+
+
+
+
+
+
+
+
+\foreach [expand list=true] \meshId in {2,3,4} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/nafems_le10_M\meshId_P1_discoverer.csv}\dataPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/nafems_le10_M\meshId_P2_discoverer.csv}\dataPb
+
+ \begin{figure}
+ \centering
+ \def\plotSetup##1{
+ %Color cycle: customdarkblue,customcyan,customorange,custompurple
+ {table=##1,column=init,legend=Init,color=customdarkblue},
+ {table=##1,legend=Assembly,column=algebraic-assembly,color=customcyan},
+ %{table=##1,legend=Solve,column=algebraic-solve,color=customorange},
+ {table=##1,legend=PostProcess,column=exportResults,color=customorange}
+ }
+ \def\chartBarPlot##1##2##3{
+ \barChart[ybar,
+ width=0.95\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ x tick label style={ rotate=-45 },
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1},
+ ##3]{\plotSetup{##1}}
+ }
+ \def\chartBarStackedPlot##1##2{
+ \barChart[ybar stacked,
+ width=0.95\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ x tick label style={ rotate=-45 },
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarPlot{dataPa}{\dataPa}{}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPa}{\dataPa}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \vspace*{0.04\textwidth}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \ifnum4=\meshId
+ \chartBarPlot{dataPb}{\dataPb}{enlarge x limits=0.3}
+ \else
+ \chartBarPlot{dataPb}{\dataPb}{}
+ \fi
+
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \chartBarStackedPlot{dataPb}{\dataPb}
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \caption{NAFEMS LE10 benchmarks - Execution time of main simulation components
+ - Mesh \texttt{M\meshId} - Discoverer supercomputer}
+ \label{fig:feelpp:wp1:nafems-le10:results:performance_times_M\meshId}
+ \end{figure}
+
+}
+
+\paragraph{Challenges Identified}
+
+\begin{itemize}
+\item HPC experiments with other kinds of discretizations:
+ \begin{itemize}
+ \item Displacement/pressure formulation
+ \item HDG method
+ \item Nonlinear models
+ \end{itemize}
+\item Go to extreme HPC scale: partitioning and I/O issues
+\end{itemize}
+
+\iffalse
+\subsubsection{Benchmark \#2: Assemble Stiffness and Linear Elasticity Matrix}
+
+\paragraph{Description}
+This benchmark evaluates the assembly of stiffness and linear elasticity finite element matrices in three dimensions using both continuous Galerkin (cG) and hybrid discontinuous Galerkin (hdG) methods using the \Feelpp toolboxes.
+The problem size consists of a mesh with tetrahedral elements, executed entirely on multi-core CPU architectures.
+The benchmark is designed to measure scalability, execution time, and computational efficiency across different material models, including isotropic and anisotropic materials.
+
+The objective of the benchmark is to assess performance in terms of assembly time, memory usage, and parallel efficiency for cG and hdG methods from low to high orders using CPU resources only.
+
+\paragraph{Benchmarking Tools Used}
+The following performance analysis tools were used:
+\begin{itemize}
+ \item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time and memory usage during the matrix assembly.
+\end{itemize}
+
+Metrics such as execution time, memory usage, and FLOPS were measured to compare the performance of the cG and hdG methods on CPU.
+
+\paragraph{Input/Output Dataset Description}
+\begin{itemize}
+ \item \textbf{Input Data:} The input dataset consists of a 3D tetrahedral mesh generated using the Gmsh format, with approximately 1 million elements. Material properties are defined in JSON format, covering both isotropic and anisotropic materials.
+
+ \item \textbf{Output Data:} The output includes performance logs, execution times, and memory usage reports. Output results are replicable by using the same mesh and material properties.
+
+ \item \textbf{Data Repository:} All input and output datasets are available in a Zenodo repository, accessible through DOI: \texttt{[Insert DOI]}.
+\end{itemize}
+
+
+\paragraph{Results Summary}
+The benchmark results are summarized as follows:
+
+RESULTS here
+
+The results highlight that ... (ADD ANALYSIS)
+
+
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process:
+\begin{itemize}
+ \item \textbf{Memory Usage:}
+ \item \textbf{Parallelization Inefficiencies:}
+ \item \textbf{Cache and Memory Bottlenecks:}
+\end{itemize}
+
+add extra analysis and conclusion here.
+\fi
+
+\subsubsection{Benchmark \#4: Thermo-Electric Coupling}
+\label{sec:WP1:Feelpp:benchmark:hl-31}
+
+%Thermo Electric coupling in a complex geometry.
+
+\paragraph{Description}
+
+This benchmark models the temperature field and electric current distribution in a high-field resistive magnet of the Laboratoire National des Champs Magnétiques Intenses. The magnet consists of a set of 14 copper-alloy cylindrical tubes connected two by two in series by rings. In each tube, the current path is defined by two helical cuts of 0.2 mm width. The rings are machined to let water flow between the tubes
+in a channel of 0.8 mm. The magnet is operated at 12 MW with an imposed total current of 31 kA. The water flow rate in the magnet is about 140 l/s. The water cooling of the magnet is modeled using Robin boundary conditions, with parameters derived from classical correlations in thermo-hydraulics.
+A more detailed description of the full model is available in \cite{daver2016,Hild2020}. The
+model is run with the \texttt{thermoelectric} \Feelpp toolbox.
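+
+As an indicative sketch only (the complete formulation, notations, and coefficient values are those of \cite{daver2016,Hild2020} and are not reproduced here), the coupling solved by the toolbox is of the form
+\begin{align*}
+  \nabla \cdot \bigl(\sigma(T)\, \nabla V\bigr) &= 0 && \text{(conservation of the electric current)},\\
+  -\nabla \cdot \bigl(k(T)\, \nabla T\bigr) &= \sigma(T)\, \lvert \nabla V \rvert^2 && \text{(heat balance with Joule losses)},\\
+  -k(T)\, \nabla T \cdot \vec{n} &= h\,(T - T_w) && \text{(Robin condition on the water-cooled surfaces)},
+\end{align*}
+where $V$ is the electric potential, $T$ the temperature, $\sigma$ and $k$ the electric and thermal conductivities, and $h$, $T_w$ the heat-exchange coefficient and cooling-water temperature provided by the thermo-hydraulic correlations.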
+
+The geometry used in this performance benchmark is illustrated in
+\Cref{fig:wp1:feelpp:hl-31:visualization-geometry}. It is a complex domain
+composed of a large number of components, some of which are very thin.
+
+\paragraph{Benchmarking Tools Used}
+The benchmark was performed on \textbf{Gaya} supercomputer (see \Cref{sec:arch:gaya}) and \textbf{Discoverer} supercomputer (see
+\Cref{sec:arch:eurohpc-ju}).
+The performance tools integrated into the \Feelpp-toolboxes framework were used to measure
+the execution time.
+Moreover, note that two different \Feelpp installations were used:
+\begin{itemize}
+\item \textbf{Gaya}: native application from the Ubuntu Jammy packages.
+\item \textbf{Discoverer}: Apptainer with a \Feelpp SIF image based on Ubuntu
+  Noble.
+\end{itemize}
+Note: the \Feelpp version is identical, but the dependencies (such as PETSc)
+differ, being more recent with Noble.
+
+The metrics measured are the execution times of the main components of the simulation, enumerated below:
+\begin{itemize}
+\item \textbf{Init}: load the mesh from the filesystem and initialize the toolbox (finite element context and algebraic data structures)
+\item \textbf{Assembly}: compute and assemble the matrix and right-hand-side values obtained with the finite element method
+\item \textbf{Solve}: solve the linear system using a preconditioned GMRES. Results
+  are presented in \Cref{sec:WP3:Feelpp:benchmark:hl-31}.
+\item \textbf{PostProcess}: export the solution and other fields of interest, such as the current density and the electric field, to the
+  filesystem in a visualization format (Ensight Gold).
+\end{itemize}
+
+\begin{figure}[!ht]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-geo.png}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-geo-zoom.png}
+ \end{subfigure}
+ \caption{HL-31 benchmarks - geometry}
+ \label{fig:wp1:feelpp:hl-31:visualization-geometry}
+\end{figure}
+
+
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+\item \textbf{Input Data:}
+ \begin{itemize}
+  \item Meshes: we have generated three mesh levels, called M1, M2,
+    and M3, stored in Gmsh format. Their statistics can be found in
+    \Cref{tab:wp1:feelpp:hl-31:discr_stat}. We have also prepared, for
+    each mesh level, a collection of partitioned meshes.
+    The format used is an in-house \Feelpp mesh format based on
+    JSON+HDF5 files.
+    The Gmsh meshes and the partitioned meshes can be found on our Girder
+    data management platform, in the \Feelpp collections.
+  \item Setup: we use the standard setup of the \Feelpp toolboxes, consisting of a cfg
+    file and a JSON file. These configuration files are available in the \Feelpp GitHub repository.
+  \item[] \Feelpp distributions:
+    \begin{itemize}
+    \item SIF image (Apptainer): feelpp:v0.111.0-preview.10-noble-sif (stored in the GitHub registry of \Feelpp)
+    \item Ubuntu Jammy package (native): feelpp:v0.111.0-preview.10
+    \end{itemize}
+ \end{itemize}
+\item \textbf{Output Data:} The output includes exported visualization files (mesh,
+  partitioning, temperature, electric potential, current density, electric
+  field, ...), the time taken to perform each simulation step, and
+  some integral quantities such as the total power dissipated by the magnet.
+\item \textbf{Data Repository:} All input datasets are available in a Unistra Girder repository (collection HiFiMagnet, HL-31).
+\end{itemize}
+
+
\begin{table}[h!]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}\multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties}
+ & \multicolumn{2}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of dof (heat)}
+ & \multicolumn{2}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of dof (electric)} \\
+ \rowcolor{numpexgray} {\color{white}\bf Tag} & {\color{white}\bf \# points} & {\color{white}\bf \# edges} & {\color{white}\bf \# faces} & {\color{white}\bf \# elements} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$} \\
+ \texttt{M1} & \pgfmathprintnumber{4306880} & \pgfmathprintnumber{27913534} & \pgfmathprintnumber{46008143} & \pgfmathprintnumber{22401676} & \pgfmathprintnumber{4306880} & \pgfmathprintnumber{32220414} & \pgfmathprintnumber{4302090} & \pgfmathprintnumber{29456526} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{HL-31 benchmark - Statistics on meshes and number of degrees of freedom with respect
+ to finite element approximation}
+ \label{tab:wp1:feelpp:hl-31:discr_stat}
+\end{table}
+
+\paragraph{Results Summary}
+
+\Cref{fig:wp1:feelpp:hl-31:results:visualization-solution} shows some
+examples of 3D visualizations of fields obtained by solving the
+thermoelectric problem: the temperature solution and other quantities
+evaluated in the \textbf{PostProcess} phase, such as the current density, electric
+potential, and electric field.
+
+The performance analysis has been carried out on two supercomputers, Gaya and
+Discoverer. On each of them, we have run the benchmark with the discretizations $P_1$
+and $P_2$. Regarding the \Feelpp distribution, we use native Ubuntu Jammy packages on
+Gaya and Apptainer (with Ubuntu Noble) on Discoverer. The results on these machines
+are illustrated in
+\cref{fig:feelpp:wp1:hl-31:results:performance_times_M1_gaya,fig:feelpp:wp1:hl-31:results:performance_times_M1_discoverer},
+respectively. The measures presented are the execution times of the simulation
+components related to WP1, namely \textbf{Init}, \textbf{Assembly}, and
+\textbf{PostProcess}. For the \textbf{Solve} component, we refer to
+\cref{sec:WP3:Feelpp:benchmark:hl-31}.
+
+In each case analyzed, we observe very good scalability, in the sense that the
+execution time is significantly reduced as the number of tasks increases. The
+\textbf{Init} phase is the most time-consuming, and its lower bound remains quite high
+compared to the other components. This part includes reading the mesh from disk, which can explain
+the occasional bumps observed when many HPC resources are requested.
+
+To conclude, we successfully ran this benchmark case, and we are
+ready to move up to larger problems and more complex modeling.
+
+
+
+
+
+
+
+\begin{figure}[!ht]
+ \centering
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-temperature.png}
+ \caption{Temperature}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-current_density.png}
+ \caption{Current density}
+ \end{subfigure}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-potential_density_streamines.png}
+ \caption{Electric potential and current density on streamlines of electric field}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-HL-31-temperature-streamlines.png}
+ \caption{Temperature on streamlines of electric field}
+ \end{subfigure}
+ \caption{HL-31 benchmarks - solutions}
+ \label{fig:wp1:feelpp:hl-31:results:visualization-solution}
+\end{figure}
+
+
+\foreach [expand list=true] \supercomputerFile/\supercomputerName in {gaya/Gaya,discoverer/Discoverer} {
+
+ \def\meshId{1}
+%\foreach [expand list=true] \meshId in {1} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/HL-31_M\meshId_P1_\supercomputerFile.csv}\dataPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/data/HL-31_M\meshId_P2_\supercomputerFile.csv}\dataPb
+
+ \begin{figure}
+ \centering
+ \def\plotSetup##1{
+ {table=##1,column=init,legend=Init,color=customdarkblue},
+ {table=##1,legend=Assembly,column=algebraic-assembly,color=customcyan},
+ % {table=##1,legend=Solve,column=algebraic-solve,color=customorange},
+ {table=##1,legend=PostProcess,column=exportResults,color=customorange}
+ }
+ \def\chartBarPlot##1##2{
+ \barChart[ybar, width=0.97\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ x tick label style={ rotate=-45 },
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+ \def\chartBarStackedPlot##1##2{
+ \barChart[ybar stacked, width=0.97\textwidth, height=0.6172\textwidth,
+ xticklabels from table={##2}{nProc},
+ x tick label style={ rotate=-45 },
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup{##1}}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ %\resizebox{\textwidth}{0.6172\textwidth}{
+ \chartBarPlot{dataPa}{\dataPa}
+ %}
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ %\resizebox{\textwidth}{0.6172\textwidth}{
+ \chartBarStackedPlot{dataPa}{\dataPa}
+ % }
+ \caption{\texttt{M\meshId} - \texttt{$P_1$}}
+ \end{subfigure}
+ \hfill
+ \vspace*{0.04\textwidth}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ %\resizebox{\textwidth}{0.6172\textwidth}{
+ \chartBarPlot{dataPb}{\dataPb}
+ % }
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ %\resizebox{\textwidth}{0.6172\textwidth}{
+ \chartBarStackedPlot{dataPb}{\dataPb}
+ % }
+ \caption{\texttt{M\meshId} - \texttt{$P_2$}}
+ \end{subfigure}
+ \caption{HL-31 benchmarks - Execution time of main simulation components
+ - \supercomputerName \ supercomputer}
+ \label{fig:feelpp:wp1:hl-31:results:performance_times_M\meshId_\supercomputerFile}
+ \end{figure}
+
+%}
+}
+
+
+\paragraph{Challenges Identified}
+
+\begin{itemize}
+\item HPC experiments with other kinds of discretizations:
+ \begin{itemize}
+ \item Modelling: nonlinear, Maxwell, ...
+ \item HDG method
+ \item Geometry complexity
+ \end{itemize}
+\item Go to extreme HPC scale: partitioning and I/O issues
+\end{itemize}
+
+
+
+
+\subsubsection{Benchmark \#5: HeatFluid Coupling}
+\label{sec:WP1:Feelpp:benchmark4}
+
+\newcommand{\vct}[1]{\vec{#1}}
+\newcommand{\mat}[1]{\underline{\underline{#1}}}
+
+
+% \emph{remove all the details and keep the references; make explicit the list of meshes and the machine on which the benchmark is run, and the data setup (parameterization)}
+
+% \emph{in wp3: discuss the preconditioner}
+
+
+\paragraph{Description}
+This benchmark models the steady aqueous humor (AH) flow in the posterior and anterior chambers of the human eyeball, coupled with the overall heat transfer, adapted from~\cite{ooi_simulation_2008,kilgour_operator_2021}.
+The full model description is available in~\cite{saigre_coupled_2024_abstract}.
+It is run with the \texttt{heatfluid} toolbox of \Feelpp.
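+
+For orientation (the complete model, boundary conditions, and parameter values are given in \cite{saigre_coupled_2024_abstract}; the system below is only an indicative sketch assuming a Boussinesq-type buoyancy coupling), the steady coupled problem has the form
+\begin{align*}
+  \rho\,(\vct{u} \cdot \nabla)\,\vct{u} - \nabla \cdot \bigl(\mu\, \nabla \vct{u}\bigr) + \nabla p &= \rho\,\beta\,(T - T_0)\,\vct{g}, \qquad \nabla \cdot \vct{u} = 0,\\
+  \rho C_p\, \vct{u} \cdot \nabla T - \nabla \cdot \bigl(k\, \nabla T\bigr) &= 0,
+\end{align*}
+where $\vct{u}$, $p$, and $T$ are the velocity, pressure, and temperature, and $\rho$, $\mu$, $k$, $C_p$, $\beta$, $T_0$, and $\vct{g}$ denote the fluid and thermal parameters.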
+
+
+\paragraph{Benchmarking Tools Used}
+
+The following tools were used for performance profiling and analysis:
+\begin{itemize}
+ \item \textbf{\Feelpp}: the performance tools integrated into the \Feelpp framework were used to measure the execution time.
+ \item \textbf{Gaya}: the benchmark was performed on the Gaya supercomputer (see \Cref{sec:arch:gaya}).
+\end{itemize}
+
+The metrics measured are the execution time to:
+\begin{inparaenum}[\it (i)]
+ \item load and initialize the mesh that is already partitioned on the disk,
+ \item initialize the data structures,
+    \item assemble the algebraic objects of the linear system,
+ \item solve the non-linear algebraic system, and
+ \item export the results.
+\end{inparaenum}
+
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+ \item \textbf{Input Data:} The input dataset consists of a family of 3D tetrahedral meshes generated through the process described in~\cite{chabannes_3d_2024}, and denoted \texttt{Mr0} to \texttt{Mr6}, with an increasing number of elements.
+ \Cref{tab:feelpp:wp1:coupled:mesh} presents the characteristics of these meshes.
+    The mesh \texttt{Mr4} is presented in \Cref{fig:feelpp:wp1:hfheat}, among the results.
+ The input data also provides the configuration files necessary to run the simulations.
+ \item \textbf{Output Data:} The output includes the computed temperature, velocity, and pressure fields for each mesh, stored in HDF5 format, as well as the time taken to perform each step of the simulation.
+ \item \textbf{Data Repository:} All input and output datasets are available in a Zenodo repository \cite{saigre_mesh_2024}, accessible through DOI: \href{https://doi.org/10.5281/ZENODO.13886143}{10.5281/ZENODO.13886143}.
+\end{itemize}
+
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ \multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties} & \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \hline
+ \rowcolor{numpexgray}{\color{white}\bf Tag} & {\color{white}\bf $h_\text{min}$} & {\color{white}\bf $h_\text{max}$} & {\color{white}\bf $h_\text{mean}$} & {\color{white}\bf \# elements} & {\color{white}\bf $T$} & {\color{white}\bf $\vct{u}$} & {\color{white}\bf $p$} \\
+ \texttt{Mr0} & \pgfmathprintnumber{1.247583e-04} & \pgfmathprintnumber{3.997611e-03} & \pgfmathprintnumber{9.227331e-04} & \pgfmathprintnumber{191939} & \pgfmathprintnumber{37470} & \pgfmathprintnumber{84966} & \pgfmathprintnumber{4615} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr1} & \pgfmathprintnumber{1.367312e-04} & \pgfmathprintnumber{3.634717e-03} & \pgfmathprintnumber{7.717604e-04} & \pgfmathprintnumber{282030} & \pgfmathprintnumber{51753} & \pgfmathprintnumber{116709} & \pgfmathprintnumber{6155} \\
+ \texttt{Mr2} & \pgfmathprintnumber{6.539683e-05} & \pgfmathprintnumber{1.599067e-03} & \pgfmathprintnumber{4.668270e-04} & \pgfmathprintnumber{746664} & \pgfmathprintnumber{131327} & \pgfmathprintnumber{589992} & \pgfmathprintnumber{28548} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr3} & \pgfmathprintnumber{3.294835e-05} & \pgfmathprintnumber{9.592658e-04} & \pgfmathprintnumber{4.166619e-04} & \pgfmathprintnumber{1403433} & \pgfmathprintnumber{241831} & \pgfmathprintnumber{707532} & \pgfmathprintnumber{34304} \\
+ \texttt{Mr4} & \pgfmathprintnumber{2.549458e-05} & \pgfmathprintnumber{5.293352e-04} & \pgfmathprintnumber{2.883913e-04} & \pgfmathprintnumber{6038645} & \pgfmathprintnumber{1027375} & \pgfmathprintnumber{1024008} & \pgfmathprintnumber{48534} \\
+ \rowcolor{numpexlightergray}
+ \texttt{Mr5} & \pgfmathprintnumber{3.120124e-05} & \pgfmathprintnumber{1.501561e-04} & \pgfmathprintnumber{2.772105e-04} & \pgfmathprintnumber{43893359} & \pgfmathprintnumber{7374833} & \pgfmathprintnumber{4616967} & \pgfmathprintnumber{205342} \\
+ \texttt{Mr6} & \pgfmathprintnumber{2.820610e-05} & \pgfmathprintnumber{9.940551e-07} & \pgfmathprintnumber{1.835537e-04} & \pgfmathprintnumber{150630096} & \pgfmathprintnumber{25200452} & \pgfmathprintnumber{14671089} & \pgfmathprintnumber{636943} \\
+ \hline
+ \end{tabular}
+ }}
+    \caption{Characteristics of the meshes used for the convergence study and number of degrees of freedom for the temperature $T$, velocity $\vct{u}$, and pressure $p$ fields, with the $P_1\text{--}P_2P_1$ discretization.}%
+ \label{tab:feelpp:wp1:coupled:mesh}
+\end{table}
+
+
+\paragraph{Results Summary}
+
+We show in \Cref{fig:feelpp:wp1:hfheat} the distribution of the computed temperature over the eyeball in the standing position on a vertical cut.
+The temperature is higher at the back part of the eye, as this part is inside the human body, while the front part, where heat exchanges with the ambient air are present, is colder.
+The temperature distribution is consistent with previous findings from literature \cite{ooi_simulation_2008,wang_fluid_2016}.
+%
+Moreover, we present in \Cref{fig:feelpp:wp1:hf} the results of the simulation for various postural orientations of the eye, namely standing, prone, and supine positions.
+The most striking result is the difference in the flow patterns: the flow follows the gravitational force and is more pronounced in the standing position (\Cref{fig:feelpp:wp1:hf:standing}).
+Moreover, we notice the formation of Krukenberg's spindle and recirculation phenomena in the anterior chamber, as observed in the literature \cite{kilgour_operator_2021,wang_fluid_2016}.
+%
+These results have been obtained with the mesh \texttt{Mr4} and the $P_1\text{--}P_2P_1$ discretization.
+
+
+
+
+
+\makeatletter
+\newcommand\addplotgraphicsnatural[2][]{%
+ \begingroup
+ % set options in this local group (will be lost afterwards):
+ \pgfqkeys{/pgfplots/plot graphics}{#1}%
+ % measure the natural size of the graphics:
+ \setbox0=\hbox{\includegraphics{#2}}%
+ %
+ % compute the required unit vector ratio:
+ \pgfmathparse{\wd0/(\pgfkeysvalueof{/pgfplots/plot graphics/xmax} - \pgfkeysvalueof{/pgfplots/plot graphics/xmin})}%
+ \let\xunit=\pgfmathresult
+ \pgfmathparse{\ht0/(\pgfkeysvalueof{/pgfplots/plot graphics/ymax} - \pgfkeysvalueof{/pgfplots/plot graphics/ymin})}%
+ \let\yunit=\pgfmathresult
+ %
+ % configure pgfplots to use it.
+ % The \xdef expands all macros except those prefixed by '\noexpand'
+ % and assigns the result to a global macro named '\marshal'.
+ \xdef\marshal{%
+ \noexpand\pgfplotsset{unit vector ratio={\xunit\space \yunit}}%
+ }%
+ \endgroup
+ %
+ % use our macro here:
+ \marshal
+ %
+ \addplot graphics[#1] {#2};
+}
+\makeatother
+
+\begin{figure}[!ht]
+ \centering
+ \input{graphics/feelpp/feelpp-benchmark-heatfluid-resheat.tex}
+ \caption{Distribution of the computed temperature over the eyeball in the standing position, on a vertical cut. Mesh discretization is also presented.}
+ \label{fig:feelpp:wp1:hfheat}
+
+\end{figure}
+
+
+\begin{figure}[!ht]
+
+ \def\subfigwidth{1.4\columnwidth}
+ \begin{subfigure}[c]{0.32\textwidth}
\centering
-
-
+ \input{graphics/feelpp/feelpp-benchmark-heatfluid-standing.tex}
+ \vspace{-1\baselineskip}
+ \caption{Standing position.}
+ \label{fig:feelpp:wp1:hf:standing}
+ \end{subfigure}
+ %
+ \begin{subfigure}[c]{0.32\textwidth}
+ \input{graphics/feelpp/feelpp-benchmark-heatfluid-prone.tex}
+ \vspace{-1\baselineskip}
+ \caption{Prone position.}
+ \label{fig:feelpp:wp1:hf:prone}
+ \end{subfigure}
+ %
+ \begin{subfigure}[c]{0.32\textwidth}
+ \input{graphics/feelpp/feelpp-benchmark-heatfluid-supine.tex}
+ \vspace{-1\baselineskip}
+ \caption{Supine position.}
+ \label{fig:feelpp:wp1:hf:supine}
+ \end{subfigure}
+
+ \caption{Results of simulation for various postural orientations of the eye. Streamlines are colored according to the pressure, and the arrows show the fluid velocity magnitude.}
+ \label{fig:feelpp:wp1:hf}
+\end{figure}
+
+
+The results of the benchmark are summarized in~\Cref{fig:feelpp:wp1:coupled:time} and~\Cref{fig:feelpp:wp1:coupled:time-rel},
+showing the computational time and the relative computational time for each component of the simulation, respectively.
+The results are presented for the three biggest meshes of the family, namely \texttt{Mr4}, \texttt{Mr5}, and \texttt{Mr6}.
+Note that for \texttt{Mr6}, the simulation did not complete on 1 node (128 cores) due to memory limitations.
+
+We observe that the resolution of the non-linear algebraic system is the most time-consuming part of the simulation, followed by the assembly of the linear system.
+Moreover, even though the relative time distribution is globally similar when the number of cores is increased, we note a decrease in the absolute time for the various components of the simulation, except for the Post process part, which involves writing the results to disk.
+The assembly time remains significant compared to the other parts of the simulation, with a noticeable increase in the time spent solving the non-linear system, which forms the largest portion of the computation.
+As the number of cores increases, we also observe a proportional increase in the time dedicated to I/O operations, particularly in the Post process phase, due to the larger volumes of data being written to disk.
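+
+For clarity, the relative time reported in \Cref{fig:feelpp:wp1:coupled:time-rel} for a given component is simply its share of the total measured time, i.e.
+\begin{equation*}
+  t_c^{\mathrm{rel}} = 100 \times \frac{t_c}{t_{\text{Mesh}} + t_{\text{Data Structures}} + t_{\text{Assembly}} + t_{\text{Solve}} + t_{\text{Post process}}} \quad [\%],
+\end{equation*}
+where $t_c$ is the measured time of component $c$; this is the quantity evaluated from the timing tables when building the stacked bars.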
+
+
+
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M4.dat}\dataMQuatre
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M5.dat}\dataMCinq
+\pgfplotstableread{\currfiledir/data/heatfluid-time-M6.dat}\dataMSix
+
+\begin{figure}
\centering
- {
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Computational time [s]},
+ xtick={0,1,2,3,4,5}, xticklabels={128,256,384,512,640,768},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true, ymin=0,
+ bar width=7pt, ybar stacked,
+ %ymode=log,
+ % title={Computational time for the 3D case},
+ ]
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y] table [x=x, y=initMesh] {\dataMQuatre};
+ % \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMQuatre};
+ % \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMQuatre};
+ % \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y] table [x=x, y=algebraic-nlsolve] {\dataMQuatre};
+ % \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y] table [x=x, y=exportResults] {\dataMQuatre};
+ % \addlegendentry{Post process}
+
+ % customdarkblue customcyan customorange custompurple customgreen
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y, forget plot] table [x=x, y=initMesh] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y, forget plot] table [x=x, y=algebraic-nlsolve] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y, forget plot] table [x=x, y=exportResults] {\dataMCinq};
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y, forget plot] table [x=x, y=initMesh] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{init}-\thisrow{initMesh}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y, forget plot] table [x=x, y expr=\thisrow{algebraic-newton-initial-guess}+\thisrow{algebraic-jacobian}+\thisrow{algebraic-residual}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y, forget plot] table [x=x, y=algebraic-nlsolve] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y, forget plot] table [x=x, y=exportResults] {\dataMSix};
+
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Computational time for the coupled heat-fluid testcase.}
+ \label{fig:feelpp:wp1:coupled:time}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Relative computational time [\%]},
+ xtick={0,1,2,3,4,5}, xticklabels={128,256,384,512,640,768},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymin=0, ymax=100,
+ % title={Relative computational time for the 3D case},
+ ]
+
+ % Compute the relative time for each component by dividing by the total time
+ % using the correct column names from the initial plot
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMQuatre};
+ \addlegendentry{Post process}
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMCinq};
+
+ \resetstackedplots
+
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{initMesh}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{init}-\thisrow{initMesh})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*(\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual})/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{algebraic-nlsolve}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{exportResults}/(\thisrow{initMesh} + (\thisrow{init}-\thisrow{initMesh}) + \thisrow{algebraic-nlsolve} + (\thisrow{algebraic-newton-initial-guess} + \thisrow{algebraic-jacobian} + \thisrow{algebraic-residual}) + \thisrow{exportResults})}] {\dataMSix};
+ \end{axis}
+ \end{tikzpicture}
+
+ \caption{Relative time spent in each component of the computation for the coupled heat-fluid testcase.}
+ \label{fig:feelpp:wp1:coupled:time-rel}
+\end{subfigure}
+\caption{Absolute (\Cref{fig:feelpp:wp1:coupled:time}) and relative (\Cref{fig:feelpp:wp1:coupled:time-rel}) computational time for the coupled heat-fluid testcase, performed on Gaya with the meshes \texttt{Mr4} (left), \texttt{Mr5} (middle), and \texttt{Mr6} (right).}
+\label{fig:feelpp:wp1:heatfluid:time}
+\end{figure}
+
+
+\paragraph{Challenges Identified}
+Several challenges were encountered during the benchmarking process:
+\begin{itemize}
+    \item \textbf{Memory Usage:} Memory usage should be better monitored, especially when creating the solver and preconditioner objects.
+    \item \textbf{Parallelization Inefficiencies:} We need to test larger configurations, in terms of mesh size and number of cores, as well as higher polynomial orders, to identify potential parallelization and I/O bottlenecks.
+\end{itemize}
+
+
+
+\subsubsection{Benchmark \#6: Contact Mechanics}
+
+\paragraph{Description}
+This benchmark simulates the dynamic unilateral contact between an elastic bouncing
+ball and a rigid horizontal wall, presented in \cite{chouly_explicit_2018}. The full model,
+combining ray-tracing, Signorini contact mechanics, and the dynamics of elastic bodies,
+is presented in \cite{van_landeghem_motion_nodate}.
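+
+For reference (the full discretization and time-stepping details are those of \cite{chouly_explicit_2018,van_landeghem_motion_nodate}; only the standard Signorini conditions are recalled here as a sketch), the unilateral contact constraint on the candidate contact boundary reads
+\begin{equation*}
+  u_n \le g, \qquad \sigma_n(\vct{u}) \le 0, \qquad \sigma_n(\vct{u})\,\bigl(u_n - g\bigr) = 0,
+\end{equation*}
+where $u_n$ is the normal displacement, $g$ the initial gap to the rigid wall, and $\sigma_n$ the normal contact stress.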
+
+
+\paragraph{Benchmarking Tools Used}
+
+The simulations are conducted on the Gaya supercomputer. The execution time of the
+following tasks is monitored:
+
+\begin{inparaenum}[\it (i)]
+ \item Mesh: loading and initialization of the non-partitioned mesh,
+ \item Data Structures: initialization of data structures,
+ \item Ray-tracing: collision detection using ray-tracing,
+ \item Assembly: construction of the dynamic algebraic system,
+ \item Solve: solving the non-linear algebraic system, and
+ \item Post process: exporting the vectorial displacement field, the scalar contact displacement and the scalar contact pressure at each time iteration.
+\end{inparaenum}
+
+\paragraph{Input/Output Dataset Description}
+
+\begin{itemize}
+ \item \textbf{Input Data:} As input, we consider the same mesh for all simulations,
+ using $P_1$ Lagrange elements for the vectorial unknown displacement field. The mesh
+    characteristics, namely the number of mesh elements and the number of degrees of freedom,
+ are provided in \Cref{tab:feelpp:mesh:contact}. Additionally, the input data includes the configuration files necessary to run the simulations.
+ \item \textbf{Output Data:} The output dataset includes the time evolution of
+ the displacement field of the elastic body, as well as the time evolution of
+ the contact displacement and pressure. In addition, the execution times for
+ the different tasks are stored.
+\end{itemize}
+
+
+
+\begin{table}[!ht]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Mesh properties} & \multicolumn{1}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Number of degrees of freedom} \\
+ \hline
+ \rowcolor{numpexgray} {\color{white}\bf $h_\text{min}$} & {\color{white}\bf $h_\text{max}$} & {\color{white}\bf \# elements} & {\color{white}\bf $\vct{u}$} \\
+ \pgfmathprintnumber{0.19269262925729186} & \pgfmathprintnumber{0.46595156749445504} & \pgfmathprintnumber{21675942} & \pgfmathprintnumber{9208203} \\
+ \hline
+ \end{tabular}
+ }}
+ \caption{Characteristics of the mesh and the number of degrees of freedom for the vectorial displacement field $\vct{u}$ with $P_1$ discretization.}%
+ \label{tab:feelpp:mesh:contact}
+\end{table}
+
+\paragraph{Results Summary}
+
+\begin{figure}[!ht]
+ \centering
+
+ \includegraphics[width=\textwidth]{graphics/feelpp/feelpp-benchmark-contact-temperature.png}
+ \caption{The contact between a sphere and a rigid wall. The colors represent the distance from the highest point of the sphere to the wall.}
+ \label{fig:wp1:feelpp:contact:visualization-solution}
+\end{figure}
+
+
+The contact displacement is shown in~\Cref{fig:wp1:feelpp:contact:visualization-solution}.
+The figure on the left shows the reference domain, while the figure on the right depicts
+the contact between the sphere and the rigid wall. The distance from the highest point of the sphere,
+in the outward normal direction, to the wall is represented by the color bar.
+The results of this simulation are consistent with those found in the literature \cite{chouly_explicit_2018}.
+
+The computational time and the relative computational time for the
+different tasks and varying numbers of processors are presented in~\Cref{fig:feelpp:wp1:contact:time}
+and~\Cref{fig:feelpp:wp1:contact:time-rel}, respectively. The bars show the results using $P_1$ Lagrange
+elements for the unknown displacement field.
+
+
+We observe that the resolution of the dynamic system constitutes the majority of
+the computational time, and its relative share increases with the number of cores,
+as communication between nodes and synchronization points become predominant.
+As the number of cores increases, the absolute execution time related to
+data structure initialization, ray-tracing, assembly, and post-processing decreases.
+The mesh loading time remains constant, as the mesh is not partitioned at input.
+
+
+\pgfplotstableread{\currfiledir/data/contact-time.dat}\dataContact
+
+\begin{figure}
+ \centering
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Computational time [s]},
+ xtick={0,1,2,3,4}, xticklabels={32,64,128,256,384},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true, ymin=0,
+ bar width=7pt, ybar stacked,
+ %ymode=log,
+ % title={Computational time for the 3D case},
+ ]
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y] table [x=x, y=mesh] {\dataContact};
+ % \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y] table [x=x, y=data] {\dataContact};
+ % \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customyellow, draw=black, point meta=y] table [x=x, y expr={300*\thisrow{raytracing}} ] {\dataContact};
+        % \addlegendentry{Ray-tracing}
+        \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y] table [x=x, y=assembly ] {\dataContact};
+        % \addlegendentry{Assembly}
+
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y] table [x=x, y=solve] {\dataContact};
+ % \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y] table [x=x, y=postprocess] {\dataContact};
+ % \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Computational time for the contact mechanics testcase.}
+ \label{fig:feelpp:wp1:contact:time}
+ \end{subfigure}
+ \hfill
+ \begin{subfigure}[b]{\textwidth}
+ \begin{tikzpicture}
+ \begin{axis}[
+ width=\textwidth, height=8cm,
+ xlabel={Nproc}, ylabel={Relative computational time [\%]},
+ xtick={0,1,2,3,4}, xticklabels={32,64,128,256,384},
+ legend style={at={(0.5,-0.18)}, anchor=north, legend columns=-1},
+ ymajorgrids=true, yminorgrids=true,
+ bar width=7pt, ybar stacked,
+ ymin=0, ymax=100,
+ % title={Relative computational time for the 3D case},
+ ]
+
+ % Compute the relative time for each component by dividing by the total time
+ % using the correct column names from the initial plot
+ \addplot+[ybar, bar width=0.2, fill=customdarkblue, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{mesh}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Mesh}
+ \addplot+[ybar, bar width=0.2, fill=customgreen, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{data}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Data Structures}
+ \addplot+[ybar, bar width=0.2, fill=customyellow, draw=black, point meta=y]
+            table [x=x, y expr={30000*\thisrow{raytracing}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact}; % 100 x (300*raytracing)/total, consistent with the absolute-time plot
+ \addlegendentry{Ray-tracing}
+ \addplot+[ybar, bar width=0.2, fill=customcyan, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{assembly}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Assembly}
+ \addplot+[ybar, bar width=0.2, fill=custompurple, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{solve}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Solve}
+ \addplot+[ybar, bar width=0.2, fill=customorange, draw=black, point meta=y]
+ table [x=x, y expr={100*\thisrow{postprocess}/(\thisrow{mesh} + \thisrow{data} + 300*\thisrow{raytracing} + \thisrow{assembly} + \thisrow{solve} + \thisrow{postprocess} )}] {\dataContact};
+ \addlegendentry{Post process}
+
+ \end{axis}
+ \end{tikzpicture}
+
+ \caption{Relative computational time for the contact mechanics testcase.}
+ \label{fig:feelpp:wp1:contact:time-rel}
+\end{subfigure}
+\caption{Absolute (\Cref{fig:feelpp:wp1:contact:time}) and relative (\Cref{fig:feelpp:wp1:contact:time-rel}) computational time for the various tasks using $P_1$ Lagrange elements for the vectorial displacement field.}
+\label{fig:feelpp:wp1:contactbenchmark:time}
+\end{figure}
+
+
+
+\subsection{12-Month Roadmap}
+\label{sec:WP1:Feelpp:roadmap}
+
+For the next 12 months, we plan to focus on the following aspects of the benchmarking process:
+\begin{description}
+  \item[Data Improvements] Unify the input and output data formats and structure to facilitate comparison and analysis. In particular, we wish to design a dataset architecture that holds all the information necessary for the benchmarking process, as well as reference output results if any. We have started an effort to continuously benchmark our software using ReFrame and to collect the information in a database, including a visualisation tool through a website.
+  \item[Methodology Application] Several aspects will be developed:
+  \begin{itemize}
+    \item Include HDG methods and high-order methods in the current benchmarks; we did not have time to include them in the current results.
+    \item Task-based parallelism using the runtime environment \ac{specx}:
+    \begin{itemize}
+      \item add multithreading support in various steps of the computational pipeline.
+      \item enable offloading of some workloads to GPU (e.g.\ the ray-tracing, assembly, and solve steps).
+    \end{itemize}
+    \item Investigate the use of Kokkos to define portable and performant kernels.
+    \item Improve ray-tracing parallel performance at large scale.
+    \item Improve I/O performance at large scale, leveraging the HDF5 data format and the parallel I/O capabilities of the library.
+    \item Improve partitioning and load balancing of the mesh at large scale.
+    \item Improve parallel mesh adaptation at large scale using ParMMG.
+  \end{itemize}
+  \item[Results Retention] We use two data management platforms, Girder and Zenodo, to store the data and results of the benchmarks, following the methodology in~\cref{sec:methodology-intro}.
+\end{description}
+
+In~\Cref{tab:WP1:Feelpp:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
+
+\begin{table}[!ht]
+ \centering
+
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & Containerization and packaging are enabled, as well as \ac{CI}/\ac{CD}. \ac{CB} will be enabled soon.\\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Building upon the data management and scientific productivity improvements to enable reproducibility. \\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & Improve I/O using HDF5 and MPI I/O, and possibly use frameworks from \ac{PC3}. \\
\rowcolor{numpexlightergray} B2 - Interconnect Technology & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B6 - Data Management & Dataset creation and management are being improved to satisfy the methodology in~\cref{sec:methodology-intro}. Girder (\cref{sec:arch:girder:unistra}) and Zenodo (\cref{sec:arch:zenodo}) will be used to store our datasets and enable FAIR principles. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Enable ray tracing, cG, HDG, and spectral element methods on GPU; enable new partitioning strategies and load balancing. \\
+\hline
\end{tabular}
}
}
- \caption{WP1: Feel++ plan with Respect to Relevant Bottlenecks}
- \label{tab:WP1:Feel++:bottlenecks}
-\end{table}
\ No newline at end of file
+ \caption{WP1: \Feelpp plan with Respect to Relevant Bottlenecks}
+ \label{tab:WP1:Feelpp:bottlenecks}
+\end{table}
diff --git a/software/feelpp/WP1/data/HL-31_M1_P1_discoverer.csv b/software/feelpp/WP1/data/HL-31_M1_P1_discoverer.csv
new file mode 100644
index 0000000..b9a75fb
--- /dev/null
+++ b/software/feelpp/WP1/data/HL-31_M1_P1_discoverer.csv
@@ -0,0 +1,12 @@
+nProc,createMesh,createExporters,init,solve,exportResults,algebraic-assembly,algebraic-solve,heat.ksp-niter,electric.ksp-niter
+16,2.3728302000e+01,1.7034010000e-02,6.2636398700e+01,2.1547218200e+01,2.9203061500e+01,1.1367113680e+01,1.0073588880e+01,40,20
+32,1.3293367300e+01,1.1205420000e-03,3.1834099100e+01,9.7530698100e+00,1.6075089900e+01,6.6176559900e+00,3.0888409900e+00,39,22
+64,7.3622678400e+00,1.3734230000e-03,1.7945598800e+01,5.7233874300e+00,8.6970483400e+00,3.6214691510e+00,2.0727737500e+00,41,20
+128,5.7261796900e+00,1.6420810000e-03,1.3183568600e+01,4.3502196900e+00,5.4145803100e+00,2.7453495020e+00,1.5871246380e+00,40,20
+256,3.0576672600e+00,4.1978340000e-03,8.8526158600e+00,4.0333140300e+00,4.1646587600e+00,1.6646723300e+00,2.3548291500e+00,38,21
+384,2.5559617100e+00,1.6911228000e-02,7.6901085000e+00,5.1166267400e+00,3.5430812400e+00,1.3701739410e+00,3.7364459500e+00,35,19
+512,2.1886948800e+00,1.6403512000e-02,7.3290956500e+00,7.1364235100e+00,3.2503395400e+00,1.2450340920e+00,5.8721426500e+00,43,19
+640,2.0769965100e+00,1.6688991000e-02,7.5082110900e+00,1.4670810100e+01,3.5011321800e+00,1.7185719970e+00,1.2556511200e+01,39,19
+768,2.0926878400e+00,2.4820687000e-02,9.8314281800e+00,6.7370503000e+01,3.6395013600e+00,1.9818025860e+00,6.5124912200e+01,37,19
+896,2.7648481100e+00,1.4618539000e-02,1.1048141000e+01,9.0812721400e+01,3.9984162000e+00,2.5888323910e+00,8.7745634500e+01,41,19
+1024,2.0710182400e+00,5.5474440000e-03,4.1303655000e+01,1.9131923500e+01,4.0783385600e+00,2.0956100850e+00,1.6915041630e+01,40,19
diff --git a/software/feelpp/WP1/data/HL-31_M1_P1_gaya.csv b/software/feelpp/WP1/data/HL-31_M1_P1_gaya.csv
new file mode 100644
index 0000000..086a6d4
--- /dev/null
+++ b/software/feelpp/WP1/data/HL-31_M1_P1_gaya.csv
@@ -0,0 +1,8 @@
+nProc,createMesh,createExporters,init,solve,exportResults,algebraic-assembly,algebraic-solve,heat.ksp-niter,electric.ksp-niter
+4,7.8874580400e+01,1.9476150000e-03,1.9824665800e+02,7.3784552600e+01,1.0522864800e+02,3.8918477760e+01,3.4622390700e+01,27,19
+8,4.3281780500e+01,2.7652300000e-03,1.0510149100e+02,4.6480319500e+01,5.4425087200e+01,2.2248771380e+01,2.4102312810e+01,24,16
+16,2.5983501800e+01,1.2950870000e-03,6.1675336900e+01,2.5623008900e+01,3.2234827900e+01,1.1701536510e+01,1.3843015900e+01,25,19
+32,1.5387380200e+01,1.8431960000e-03,3.5630527700e+01,1.6334269800e+01,1.9533166100e+01,6.5640037700e+00,9.6667968800e+00,27,19
+64,1.0981516300e+01,1.2421180000e-03,2.3090160300e+01,1.1137873300e+01,1.5782601100e+01,4.7082960800e+00,6.3957056700e+00,23,17
+128,8.4185287800e+00,2.2854730000e-03,4.9458590800e+01,5.1966084600e+00,1.5423843700e+01,2.4543095620e+00,2.7250683500e+00,27,18
+256,3.4244702100e+00,6.8803710000e-03,8.5744426700e+00,2.7639874000e+00,1.1136088100e+01,1.4758021030e+00,1.2773671600e+00,23,19
diff --git a/software/feelpp/WP1/data/HL-31_M1_P2_discoverer.csv b/software/feelpp/WP1/data/HL-31_M1_P2_discoverer.csv
new file mode 100644
index 0000000..0c54714
--- /dev/null
+++ b/software/feelpp/WP1/data/HL-31_M1_P2_discoverer.csv
@@ -0,0 +1,12 @@
+nProc,createMesh,createExporters,init,solve,exportResults,algebraic-assembly,algebraic-solve,heat.ksp-niter,electric.ksp-niter
+32,1.2455821000e+01,2.7753236000e-01,7.9082478700e+01,1.4059603600e+02,3.3334927100e+01,4.2091728830e+01,9.7414624900e+01,65,46
+64,7.5020320900e+00,2.5459540000e-03,3.9820673800e+01,5.1679813500e+01,1.6653618800e+01,2.5223069730e+01,2.6291791300e+01,58,39
+128,5.6276066500e+00,4.8854100000e-03,2.4808922200e+01,3.0263716000e+01,9.4215373000e+00,1.1524433640e+01,1.8655076840e+01,62,47
+256,3.5804048800e+00,8.6840250000e-03,1.7088210600e+01,2.3290676200e+01,5.8797107700e+00,6.8549821700e+00,1.6389009620e+01,66,60
+512,2.3087583900e+00,1.2793275000e-02,4.3913934800e+01,4.8564209100e+01,4.8038315000e+00,4.0947834800e+00,4.4434286700e+01,71,104
+640,2.3271627900e+00,1.7020085000e-02,1.1818792200e+01,3.7032727400e+01,5.1447940000e+00,4.2494515600e+00,3.2623971500e+01,68,62
+768,2.3024858500e+00,2.2473998000e-02,1.2023239400e+01,8.3730319900e+01,4.4990733800e+00,3.8853559900e+00,7.9606708900e+01,68,54
+896,2.3069415500e+00,1.5629392000e-02,1.4328854600e+01,1.0584844300e+02,5.0414187000e+00,4.0841940900e+00,1.0103615240e+02,73,45
+1024,2.0894351200e+00,1.7431044000e-02,1.0847259600e+01,3.0417341900e+01,5.2733886000e+00,3.7679367000e+00,2.6431497100e+01,72,71
+1152,2.9698662800e+00,1.5972996000e-02,1.0700660400e+01,3.8691532100e+01,4.8811864000e+00,2.4723275060e+00,3.6199670200e+01,69,83
+1280,2.8596463600e+00,2.2429604000e-02,1.0543590900e+01,3.5659172500e+01,5.2365601500e+00,2.3609524630e+00,3.3264351200e+01,71,69
diff --git a/software/feelpp/WP1/data/HL-31_M1_P2_gaya.csv b/software/feelpp/WP1/data/HL-31_M1_P2_gaya.csv
new file mode 100644
index 0000000..861f965
--- /dev/null
+++ b/software/feelpp/WP1/data/HL-31_M1_P2_gaya.csv
@@ -0,0 +1,8 @@
+nProc,createMesh,createExporters,init,solve,exportResults,algebraic-assembly,algebraic-solve,heat.ksp-niter,electric.ksp-niter
+4,7.8591992200e+01,2.2857320000e-03,4.5822833600e+02,9.4456183800e+02,2.2483643600e+02,2.3456447970e+02,7.0823706500e+02,46,50
+8,4.0865207300e+01,2.6806740000e-03,2.2773034000e+02,4.7871985700e+02,1.0745015900e+02,1.1799206270e+02,3.5979184000e+02,42,35
+16,2.3984927900e+01,2.7669960000e-03,1.3368841900e+02,3.6315871000e+02,5.5282467400e+01,5.9106416000e+01,3.0349014400e+02,55,43
+32,1.4052001300e+01,4.3585650000e-03,7.6762278200e+01,2.0670682000e+02,3.3535965300e+01,3.5466906200e+01,1.7087956530e+02,51,50
+64,7.9775138000e+00,2.8970700000e-03,4.2285763500e+01,1.0908232800e+02,1.9546610700e+01,1.8141998560e+01,9.0442308300e+01,46,38
+128,6.0672697500e+00,1.2050651000e-02,2.7728789800e+01,9.6987331100e+01,1.4764709900e+01,1.2391013870e+01,8.4156269600e+01,47,44
+256,3.4963983100e+00,4.6900840000e-03,1.4998552500e+01,4.7387031700e+01,1.3436832600e+01,6.2649840700e+00,4.1064858200e+01,46,57
diff --git a/software/feelpp/WP1/data/SAVEthermalbrides_M3_P3_discoverer.csv b/software/feelpp/WP1/data/SAVEthermalbrides_M3_P3_discoverer.csv
new file mode 100644
index 0000000..2af682e
--- /dev/null
+++ b/software/feelpp/WP1/data/SAVEthermalbrides_M3_P3_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init
+640,4.35685000e-04,5.58447588e+00,4.76816963e+00,2.53127272e-01,6.41800390e+00,9.21354859e+00,2.84210000e-05,3.03802441e+01
+768,4.81104000e-04,5.52890593e+00,4.92008748e+00,2.69175045e-01,5.11537824e+00,1.02396251e+01,4.47710000e-05,3.08069648e+01
+896,4.89096000e-04,5.30403448e+00,4.13190226e+00,4.52869998e-01,4.74457136e+00,1.05872785e+01,4.47310000e-05,3.02593697e+01
+1024,5.47685000e-04,4.71450200e+00,3.69833453e+00,2.32863468e-01,4.17638190e+00,1.01722287e+01,4.95500000e-05,2.69447249e+01
+1152,6.47416000e-04,4.88651734e+00,3.31738166e+00,2.07439174e-01,4.33771967e+00,8.82286228e+00,2.98700000e-05,2.50427414e+01
+1280,6.14950000e-04,4.70194963e+00,3.30265719e+00,4.04115688e-01,3.97464095e+00,9.04132852e+00,5.17300000e-05,2.43631665e+01
diff --git a/software/feelpp/WP1/data/contact-time.dat b/software/feelpp/WP1/data/contact-time.dat
new file mode 100644
index 0000000..7ba589b
--- /dev/null
+++ b/software/feelpp/WP1/data/contact-time.dat
@@ -0,0 +1,6 @@
+nProc mesh data raytracing solve assembly postprocess x
+32 1220 3515.78 7.99 21700. 5790.6 5750. 0
+64 1350 1843.63 5.55 14800 3480.2 3320 1
+128 1370. 871.68 3.15 7680. 1702.8 1910. 2
+256 1370. 412.777 3.98 9320. 908.73 1270. 3
+384 1410. 275.982 3.91 22500. 674.2 1300. 4
\ No newline at end of file
diff --git a/software/feelpp/WP1/data/heatfluid-time-M4.dat b/software/feelpp/WP1/data/heatfluid-time-M4.dat
new file mode 100644
index 0000000..deccb3d
--- /dev/null
+++ b/software/feelpp/WP1/data/heatfluid-time-M4.dat
@@ -0,0 +1,7 @@
+nProc initMesh createExporters graph matrixVector algebraicOthers init snes-niter algebraic-newton-initial-guess algebraic-jacobian algebraic-residual algebraic-nlsolve solve exportResults x
+128 2.8832947000e+00 1.0644970000e-01 9.9484240600e-01 1.6842853100e-01 1.2037600000e-04 1.7471086000e+01 5 4.3297252000e-02 9.1714625100e+00 4.8804535300e+00 6.3989943000e+01 6.3993202300e+01 1.2241222200e+01 -0.25
+384 9.5509137600e+00 5.9602071000e-02 5.1536020800e-01 4.3629427400e-01 2.6969800000e-04 1.1108840000e+01 5 3.2903182000e-02 1.1810600100e+01 2.4709951700e+00 7.6292128300e+01 7.6294369000e+01 1.0763801300e+01 0.75
+256 4.0199597000e+00 5.6903196000e-02 6.1569145800e-01 2.4874063300e-01 5.0875000000e-05 5.4578067000e+00 5 1.9060319000e-02 6.8808480000e+00 3.0906525800e+00 6.4367011500e+01 6.4369924900e+01 1.0828510400e+01 1.75
+512 7.4688850500e+00 7.2650993000e-02 6.9411369800e-01 3.5568968400e-01 1.1956500000e-04 8.9794582800e+00 5 4.3061246000e-02 9.5882259200e+00 1.9191909300e+00 5.1791150300e+01 5.1792671000e+01 1.0877965600e+01 2.75
+640 7.0479418600e+00 5.9969436000e-02 5.6765102600e-01 3.1525860200e-01 1.5104400000e-04 8.3393920300e+00 5 3.0913026000e-02 8.6573001800e+00 2.0891155700e+00 8.8454711600e+01 8.8457407900e+01 1.2455707800e+01 3.75
+768 8.0307731900e+00 7.6782754000e-02 5.9018891500e-01 3.8684149500e-01 1.4354100000e-04 9.8496158500e+00 5 3.9585666000e-02 8.3931794300e+00 1.9199747200e+00 4.9572183900e+01 4.9573873800e+01 1.1068545900e+01 4.75
diff --git a/software/feelpp/WP1/data/heatfluid-time-M5.dat b/software/feelpp/WP1/data/heatfluid-time-M5.dat
new file mode 100644
index 0000000..cec1ad5
--- /dev/null
+++ b/software/feelpp/WP1/data/heatfluid-time-M5.dat
@@ -0,0 +1,7 @@
+nProc initMesh createExporters graph matrixVector algebraicOthers init snes-niter algebraic-newton-initial-guess algebraic-jacobian algebraic-residual algebraic-nlsolve solve exportResults x
+128 1.6901888800e+01 4.8112315600e-01 6.6536187000e+00 9.1356602200e-01 1.8077000000e-04 3.8295172700e+01 5 2.0242436100e-01 4.1269404600e+01 2.6058905500e+01 3.6447994500e+02 3.6449279600e+02 2.1924630200e+01 0
+256 1.4436836500e+01 2.6806578400e-01 3.8955033600e+00 6.0126180000e-01 1.2471500000e-04 2.1330322400e+01 5 1.1096730900e-01 2.4549018200e+01 1.3800923100e+01 2.1158960500e+02 2.1159825300e+02 2.5315904700e+01 1
+384 2.1326006600e+01 2.0718716900e-01 4.4426718500e+00 2.5515440100e-01 1.2409300000e-04 2.8518670000e+01 5 3.0346600300e-01 3.2310070900e+01 1.3049917200e+01 2.1383577600e+02 2.1385366000e+02 2.4672972100e+01 2
+512 1.7362467600e+01 1.7490193400e-01 3.2365561300e+00 2.7541745900e+00 9.4237000000e-05 2.5315741600e+01 5 2.2441782200e-01 2.1963850900e+01 8.7331257700e+00 1.5701628600e+02 1.5702909600e+02 1.9934319800e+01 3
+640 1.9842207100e+01 1.5117178500e-01 3.2774784300e+00 4.2753928200e-01 1.0536900000e-04 2.5004794000e+01 5 1.2840409300e-01 1.9123906500e+01 6.9897268400e+00 1.4568245000e+02 1.4568809700e+02 2.2308801000e+01 4
+768 2.0365526900e+01 1.3675956000e-01 1.9612419900e+00 3.7731505700e+00 1.0084000000e-04 2.8312983700e+01 5 1.0795119500e-01 1.7320950700e+01 5.6777846000e+00 2.1185942000e+02 2.1187875500e+02 2.3274643200e+01 5
diff --git a/software/feelpp/WP1/data/heatfluid-time-M6.dat b/software/feelpp/WP1/data/heatfluid-time-M6.dat
new file mode 100644
index 0000000..bd97e20
--- /dev/null
+++ b/software/feelpp/WP1/data/heatfluid-time-M6.dat
@@ -0,0 +1,6 @@
+nProc initMesh createExporters graph matrixVector algebraicOthers init snes-niter algebraic-newton-initial-guess algebraic-jacobian algebraic-residual algebraic-nlsolve solve exportResults x
+256 3.3756093300e+01 7.8704357600e-01 1.2590613800e+01 9.5433492300e+00 1.8840500000e-04 6.5152153500e+01 5 3.8085563300e-01 8.0535251500e+01 4.6039300100e+01 8.3368675700e+02 8.3372569000e+02 4.1074661000e+01 1.25
+384 3.6073115400e+01 5.6072191600e-01 1.1190462700e+01 3.7238490500e+00 1.1392400000e-04 5.8099560600e+01 5 7.9997573400e-01 7.4873448800e+01 4.4395690200e+01 7.2040107500e+02 7.2064617500e+02 3.0834449600e+01 2.25
+512 4.2544497200e+01 4.3171370900e-01 8.9824307100e+00 3.7693287000e+00 5.1868000000e-05 6.1054836500e+01 5 6.1626864900e-01 5.1449094000e+01 2.7586192100e+01 4.9607653000e+02 4.9636843200e+02 2.7637326100e+01 3.25
+640 3.0702302400e+01 3.7315054400e-01 1.5698607300e+01 1.4859880300e+00 1.3502400000e-04 5.2930040500e+01 5 9.7664598300e-01 5.1473427400e+01 2.3280806400e+01 5.3034898700e+02 5.3057908900e+02 2.7740535300e+01 4.25
+768 4.8526928700e+01 4.0554984200e-01 9.6392203400e+00 8.2554347800e+00 9.8385000000e-05 7.1442201000e+01 5 8.3075885400e-01 4.5313700300e+01 1.9944762900e+01 3.9629933600e+02 3.9640462900e+02 4.3398295700e+01 5.25
diff --git a/software/feelpp/WP1/data/heatfluid-time-data.dat b/software/feelpp/WP1/data/heatfluid-time-data.dat
new file mode 100644
index 0000000..ccf774f
--- /dev/null
+++ b/software/feelpp/WP1/data/heatfluid-time-data.dat
@@ -0,0 +1,12 @@
+x nProc snes-niter algebraic-newton-initial-guess algebraic-jacobian algebraic-residual algebraic-nlsolve solve initMesh createExporters graph matrixVector algebraicOthers init exportResults
+0 1 5 8.5572370300e-01 2.8415422600e+03 1.2688440400e+03 1.2932284300e+04 1.2932695500e+04 4.8181386400e+01 4.4126717600e+00 2.2754197200e+02 2.1171819200e+01 3.6198000000e-05 3.6993006800e+02 1.0432355300e+02
+1 2 5 1.1789050900e+00 1.6289574600e+03 6.3663896800e+02 7.2005328500e+03 7.2008993700e+03 3.6329771000e+01 2.4044482600e+00 4.1184161000e+02 1.3790979500e+01 4.6767999900e-05 5.1411211300e+02 9.5245434700e+01
+2 4 5 6.5231492100e-01 8.0029810100e+02 3.1614092700e+02 4.4523783900e+03 4.4526153500e+03 2.3077745500e+01 1.2222734900e+00 1.8157338400e+02 7.2932276300e+00 3.2842000000e-05 2.3763589000e+02 4.7234174000e+01
+3 8 5 3.5690192400e-01 3.9603225500e+02 1.6635196200e+02 3.5116281400e+03 3.5117330300e+03 1.2802605900e+01 6.2928266300e-01 9.3319453100e+01 4.3768954500e+00 3.7641000000e-05 1.2364950800e+02 1.9705699100e+01
+4 16 5 2.1656393800e-01 2.3159759400e+02 9.7366045800e+01 1.8922871100e+03 1.8923486300e+03 7.7180356300e+00 3.6955078700e-01 5.2242086600e+01 2.5493710500e+00 4.0867000000e-05 6.9905182300e+01 1.1726561100e+01
+5 32 5 1.3602161100e-01 1.3968685400e+02 5.3033316500e+01 1.1018268700e+03 1.1018676100e+03 5.1272856900e+00 2.0731300300e-01 2.4397212400e+01 1.3997793800e+00 4.7650000000e-05 3.4910967600e+01 6.8253637600e+00
+6 64 5 3.8599662500e-01 9.3636244900e+01 3.3196533200e+01 8.6537564000e+02 8.6556683900e+02 5.6824842000e+00 1.2720604500e-01 1.3362342500e+01 1.3812117800e+00 1.0072000000e-04 2.3811849800e+01 7.1705104300e+00
+7 128 5 1.3308907600e-01 5.4782262400e+01 2.1508311000e+01 4.0141182800e+02 4.0148978200e+02 8.0578149100e+00 7.8796834000e-02 5.7696591000e+00 6.2952572200e-01 3.7090000000e-05 1.6555693600e+01 5.0762797000e+00
+8 256 5 3.4228249000e-02 3.3897457300e+01 9.4967658500e+00 2.8289624400e+02 2.8290105000e+02 4.6922366900e+00 6.8024454000e-02 3.5754825700e+00 7.5132016800e-01 1.4197700000e-04 9.7799898400e+00 7.2673011400e+00
+9 512 5 2.5240257500e-01 2.1580793900e+01 5.9640738400e+00 1.6194049800e+02 1.6194828600e+02 7.0178650400e+00 5.9698545000e-02 2.7453203900e+00 6.8205018200e-01 9.3886000000e-05 1.1263374300e+01 9.7943683600e+00
+10 640 5 2.9664943500e-01 2.0589903900e+01 5.1002338700e+00 1.2667439800e+02 1.2667974000e+02 7.6339394200e+00 5.6030996000e-02 2.8737287900e+00 1.8356951000e-01 1.5342900000e-04 1.1318939900e+01 9.0236068400e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M2_P1_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M2_P1_discoverer.csv
new file mode 100644
index 0000000..54a5297
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M2_P1_discoverer.csv
@@ -0,0 +1,9 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+32,1.8556195000e+00,2.1691816900e+00,8.5843856700e-01,6.0890490000e-03,1.4208177000e+00,3.0663807800e-01,4.2270000000e-05,2.3668106700e+00,21,3.3128381000e-02,2.1773797600e+00,1.6992519800e+00,3.8806420200e+00,2.3838979000e+00
+64,1.2346050600e-01,1.7146231400e+00,5.4994092400e-01,6.0723610000e-03,7.2015888500e-01,4.6902053800e-01,4.8000000000e-05,1.6353411300e+00,31,2.5759751000e-02,1.1782501500e+00,1.6547593300e+00,2.8354415500e+00,1.8646870100e+00
+128,1.2800464200e-01,2.4388941800e+00,2.1662567200e-01,2.3180342000e-02,3.5912660800e-01,1.7458234700e-01,6.0431000000e-05,8.4548941400e-01,29,5.2050269000e-02,7.6910187800e-01,8.5149549000e-01,1.6223382000e+00,1.1978515900e+00
+256,1.3629191700e-01,8.0985942200e-01,1.1610069000e-01,1.1489339000e-02,1.2763518900e-01,2.8710282400e-01,6.6351000000e-05,6.8485181700e-01,51,5.6251013000e-02,4.8296558100e-01,4.5526199900e+00,5.0366287300e+00,1.0990297300e+00
+384,1.3702706600e-01,1.4111965100e+00,9.2622580000e-02,1.5782682000e-02,9.7430185000e-02,2.9773488100e-01,3.8770000000e-05,7.0678710600e-01,46,7.4294029000e-02,8.1514142700e-01,8.9481060500e+00,9.7649459700e+00,1.5933848100e+00
+512,1.5056493200e-01,1.0905762800e+00,7.9329924000e-02,2.5169130000e-02,5.1734442000e-02,3.6555002600e-01,2.4840000000e-05,8.3097828200e-01,26,4.9725410000e-02,6.6695741000e-01,1.0538847400e+01,1.1210192800e+01,1.1223221500e+00
+640,1.5191931500e-01,1.4573100700e+00,7.8014727000e-02,1.7475387000e-02,5.2399938000e-02,6.0096290900e-01,2.8260000000e-05,3.8170437100e+00,23,6.6848734000e-02,1.5388090700e+00,2.7310616700e+01,2.8852909800e+01,1.1649082400e+00
+768,1.4175688600e-01,2.1709473300e+00,2.9787320100e-01,7.1946650000e-03,4.2693876000e-02,1.5367586400e+00,3.7040000000e-05,4.9182127700e+00,30,6.8759402000e-02,2.0640759900e+00,4.6258648800e+01,4.8328484800e+01,1.4531332400e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M2_P2_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M2_P2_discoverer.csv
new file mode 100644
index 0000000..3df3cfa
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M2_P2_discoverer.csv
@@ -0,0 +1,11 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+32,5.8937407000e-02,1.9795969100e+00,2.6038611200e+00,7.9177910000e-03,6.0694592300e+00,4.9735968300e-01,2.8350000000e-05,8.3117501600e+00,34,8.0689829000e-02,4.8810899900e+01,2.4190379300e+01,7.3031109500e+01,4.3218249800e+00
+64,8.1828908000e-02,1.4375282300e+00,1.3478358800e+00,7.2066950000e-03,3.7827086600e+00,8.0363445100e-01,2.5120000000e-05,5.5164392000e+00,71,5.0042030000e-03,2.4414097800e+01,2.1641430800e+01,4.6072906900e+01,2.3770363900e+00
+128,1.9207525400e+00,1.1851704300e+00,7.6257814200e-01,8.7581380000e-03,3.4713744800e+00,7.1839568900e-01,4.1250000000e-05,4.7782212600e+00,84,3.0447950000e-03,1.2636476200e+01,2.1137914900e+01,3.3786309000e+01,1.6046937200e+00
+256,1.7041918800e-01,1.0359222300e+00,3.5591844200e-01,6.9170580000e-03,1.9046328300e+00,5.4191692600e-01,3.0320000000e-05,3.4735082000e+00,80,2.7489830000e-03,8.0936293000e+00,2.4904583800e+01,3.3006526900e+01,1.5209435800e+00
+384,1.6971800500e-01,1.1213571500e+00,2.1161278600e-01,1.0233208000e-02,1.2038030500e+00,6.8500948400e-01,3.9030000000e-05,2.2430616500e+00,45,7.3709364000e-02,5.2685995600e+00,6.2869227200e+01,6.8153608500e+01,1.4581177800e+00
+512,1.6410750800e-01,8.7829955000e-01,1.9967970400e-01,7.2548760000e-03,8.1537149300e-01,5.9241425200e-01,4.0981000000e-05,2.4513330600e+00,75,2.6816880000e-03,4.3374806100e+00,6.9232955600e+01,7.3583431200e+01,1.5486165900e+00
+640,1.3410789900e-01,1.4191775600e+00,3.1457810900e-01,7.1127260000e-03,5.8712386600e-01,6.1403180000e-01,4.0851000000e-05,1.6179725100e+00,78,7.0312875000e-02,4.1830810700e+00,1.1441350000e+02,1.1861861500e+02,1.6623654400e+00
+768,1.5560008500e-01,2.1580834800e+00,2.9517841800e-01,1.1660863000e-02,4.8203675500e-01,5.4692605200e-01,2.4360000000e-05,2.3519118000e+00,41,7.1669711000e-02,3.7130164900e+00,5.2941921300e+01,5.6666433700e+01,1.6372959400e+00
+896,1.6502368300e-01,1.6571246200e+00,1.0965512800e-01,2.0532524900e-01,6.1028952700e-01,1.6258780600e+00,3.6460000000e-05,6.7096871100e+00,92,2.3407840000e-03,4.7990373200e+00,1.0617016700e+02,1.1098720400e+02,1.8913002600e+00
+1024,1.9346360200e-01,3.2645134400e+00,2.9375887700e-01,6.0876100000e-03,6.0336565600e-01,4.9100426300e-01,3.0560000000e-05,1.7999212200e+00,276,2.4510660000e-03,3.7197465600e+00,2.2461480600e+02,2.2857638200e+02,2.0995010800e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M3_P1_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M3_P1_discoverer.csv
new file mode 100644
index 0000000..4f094a2
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M3_P1_discoverer.csv
@@ -0,0 +1,9 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+32,1.8745728800e+00,1.3200178700e+01,7.4470275500e+00,2.4657180000e-03,8.1241101900e+00,7.7837317800e-01,3.6081000000e-05,1.3980270400e+01,28,6.6344822000e-02,1.8048767200e+01,1.1913598200e+01,2.9989745500e+01,2.1926488800e+01
+64,1.1951601300e-01,8.5632852600e+00,4.5662954400e+00,1.2653979000e-02,4.6075941800e+00,1.0634174300e+00,2.4810000000e-05,8.6374826200e+00,45,3.6010500000e-02,8.0459886800e+00,1.0450533500e+01,1.8511850400e+01,1.2825416600e+01
+128,1.2815985200e-01,4.5493131400e+00,2.3786513700e+00,1.4378498000e-02,2.6344373100e+00,1.2338423100e+00,3.0080000000e-05,5.2827016200e+00,51,6.4875723000e-02,4.8233937500e+00,9.4301481200e+00,1.4261485700e+01,7.1491306900e+00
+256,1.3333034000e-01,3.0304684700e+00,1.0854564000e+00,7.6540020000e-03,1.3628997600e+00,1.1163836300e+00,3.7991000000e-05,3.2701459100e+00,138,6.3393418000e-02,2.5550122900e+00,1.2457415500e+01,1.5016890900e+01,3.5485211000e+00
+384,1.3328535900e-01,3.0364074300e+00,8.9322719200e-01,1.4258701000e-02,1.8119843300e+00,2.5218360700e-01,4.0330000000e-05,2.7427978500e+00,144,5.9958457000e-02,1.8955342800e+00,1.2355469200e+02,1.2546504800e+02,2.9372366300e+00
+512,1.4410826900e-01,2.4638455400e+00,5.1135506900e-01,2.2654824000e-02,6.9607630900e-01,1.2407755500e+00,3.6990000000e-05,2.5933578400e+00,402,5.8307090000e-02,2.2333666700e+00,1.0313971800e+02,1.0537559300e+02,2.5545929200e+00
+640,1.5164555700e-01,3.1485982400e+00,4.0441396900e-01,1.4165557000e-02,5.4218599000e-01,3.1356539300e+00,4.5760000000e-05,8.2791370700e+00,35,5.3576484000e-02,3.0087665600e+00,6.4772798300e+01,6.7792699700e+01,2.6438750100e+00
+768,1.3515641800e-01,3.2077867900e+00,5.4019082800e-01,2.2015071000e-02,4.4918455200e-01,2.1341318500e+00,3.8760000000e-05,6.2210101600e+00,55,7.5360254000e-02,3.5456821500e+00,1.0098862300e+02,1.0455095500e+02,3.0369320800e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M3_P2_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M3_P2_discoverer.csv
new file mode 100644
index 0000000..4b0f3e1
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M3_P2_discoverer.csv
@@ -0,0 +1,9 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+128,8.6816647000e-02,4.6157450200e+00,4.7646293800e+00,3.7731383000e-02,1.2966953500e+01,4.6212927900e+00,2.4800000000e-05,2.1662169400e+01,161,7.3643580000e-03,1.0474051400e+02,1.7424551000e+02,2.7905393300e+02,9.5486099400e+00
+256,1.2926185200e-01,2.7640957000e+00,3.6654818800e+00,1.5001359100e-01,6.4406031600e+00,3.9486830200e+00,2.7490000000e-05,1.3852408100e+01,87,5.6963640000e-03,4.9379161100e+01,6.3272895000e+01,1.1270249100e+02,5.3771690000e+00
+384,1.2049718900e-01,2.2113202300e+00,2.4295438800e+00,2.9784988000e-02,6.9852365700e+00,1.8480250500e+00,3.9081000000e-05,1.3754348300e+01,93,5.6532140000e-03,3.7866528900e+01,1.0453142200e+02,1.4245142400e+02,4.1095803700e+00
+512,1.6631967000e-01,2.1691510800e+00,1.8957464600e+00,6.6833950000e-03,4.3367075500e+00,3.5254252600e+00,3.3521000000e-05,1.2517375200e+01,92,3.6212340000e-03,2.6562169400e+01,9.9642700700e+01,1.2643257900e+02,3.7604334700e+00
+640,1.7732212800e-01,3.1222252900e+00,1.7230171900e+00,1.3206499000e-02,4.2911841400e+00,3.8906723100e+00,3.7830000000e-05,1.2272320600e+01,81,5.6289340000e-03,2.3236055300e+01,1.1102504200e+02,1.3453076300e+02,3.5720874600e+00
+768,2.5847833600e-01,2.4924648800e+00,1.5269299000e+00,1.3258001000e-02,4.2076031200e+00,3.7963667800e+00,2.1550100000e-04,1.3164302100e+01,86,7.5941335000e-02,2.0348047700e+01,1.5143364200e+02,1.7202368700e+02,3.3717554800e+00
+896,2.5567414800e-01,3.9748980100e+00,1.3589351600e+00,1.0189598000e-02,4.4726149600e+00,4.2201519600e+00,4.8790000000e-05,1.4896059600e+01,112,2.9624020000e-03,1.8918223700e+01,2.1034286800e+02,2.2933571700e+02,3.7797210800e+00
+1024,2.0311140700e+00,3.1551660500e+00,8.4240517000e-01,1.0863576000e-02,2.3410697300e+00,3.5687818900e+00,4.8240000000e-05,1.0069711900e+01,86,4.1347220000e-03,1.6688890600e+01,1.3171129900e+02,1.4864833800e+02,3.4201569600e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M4_P1_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M4_P1_discoverer.csv
new file mode 100644
index 0000000..7fe2dd6
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M4_P1_discoverer.csv
@@ -0,0 +1,9 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+128,9.6279478000e-02,2.8820064800e+01,1.8193596800e+01,3.2421518000e-02,1.6965697900e+01,8.6485837700e+00,2.6640000000e-05,3.8513056700e+01,135,9.3231266000e-02,3.7015316000e+01,9.0484162400e+01,1.2757307300e+02,5.0188873800e+01
+256,1.2395340300e-01,1.6228678900e+01,9.6308657700e+00,5.2513230000e-03,1.0692933800e+01,7.6312700900e+00,2.6561000000e-05,2.7143719000e+01,57,4.5293871000e-02,1.8924205900e+01,4.1222451200e+01,6.0379793500e+01,3.0504437700e+01
+384,1.2827552300e-01,1.3683930000e+01,6.4444964100e+00,4.7191710000e-03,8.7226668300e+00,6.1202867500e+00,3.7360000000e-05,2.1363098300e+01,53,3.3107548000e-02,1.3695119600e+01,5.1431535300e+01,6.5166269600e+01,1.8463204400e+01
+512,1.6469245200e-01,1.1024196800e+01,4.8989297000e+00,3.4955550000e-03,5.3259504500e+00,8.0882610100e+00,6.0750000000e-05,1.8387373900e+01,117,3.9696712000e-02,1.0009862000e+01,7.8041252400e+01,8.8071527700e+01,1.4840403500e+01
+640,1.8826824800e-01,1.0689215200e+01,4.0630901000e+00,1.1276203000e-02,4.6882797500e+00,8.3899037700e+00,5.2530000000e-05,1.8613012400e+01,94,2.5880149000e-02,8.9001318600e+00,8.7996767600e+01,9.6930622200e+01,1.2814077300e+01
+768,2.1724550700e-01,1.0894441400e+01,3.5965513200e+00,3.9153860000e-03,4.5305656200e+00,8.5577106200e+00,5.0161000000e-05,1.9916289600e+01,58,7.6979798000e-02,9.3416014900e+00,8.9501233900e+01,9.8862401900e+01,1.1852656100e+01
+896,2.5839061600e-01,1.0908971100e+01,3.3479258500e+00,3.5126262000e-02,3.6668367100e+00,1.1360694500e+01,3.7540000000e-05,2.1443345400e+01,67,6.6813972000e-02,8.9445576700e+00,1.3444810100e+02,1.4386693700e+02,1.0720939200e+01
+1024,2.0741639300e+00,7.0634826600e+00,2.7372365300e+00,5.5299430000e-03,6.5641335500e+00,5.9978043100e+00,7.6510000000e-05,1.6733212100e+01,43,6.5048639000e-02,6.5950253800e+00,7.0149409500e+01,7.6982710300e+01,9.0816159000e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_M4_P2_discoverer.csv b/software/feelpp/WP1/data/nafems_le10_M4_P2_discoverer.csv
new file mode 100644
index 0000000..fdb4956
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_M4_P2_discoverer.csv
@@ -0,0 +1,4 @@
+nProc,initMaterialProperties,createMesh,createSpaces,createExporters,graph,matrixVector,algebraicOthers,init,ksp-niter,createMeshTmp,algebraic-assembly,algebraic-solve,solve,exportResults
+640,1.2311826900e-01,8.4239921200e+00,7.2887457800e+00,1.4224630000e-01,2.2840039900e+01,2.0748258300e+01,5.0440000000e-05,5.3412150700e+01,224,7.3471080000e-03,1.6713317400e+02,4.7572166500e+02,6.4314809200e+02,1.7187455200e+01
+1280,1.8708817600e+00,7.6973103100e+00,5.4351054800e+00,3.0086889000e-02,1.7447132000e+01,1.6386006400e+01,3.9680000000e-05,3.9991299000e+01,221,7.6882770000e-03,8.1355820100e+01,4.0338340100e+02,4.8503596000e+02,1.0852226900e+01
+2560,2.0409103700e-01,7.6495102200e+00,3.6205971600e+00,3.1664237000e-02,6.3043839600e+00,2.4136187900e+01,4.2050000000e-05,3.7481427100e+01,110,7.4581640000e-03,4.7617726200e+01,2.7724742500e+02,3.2535120200e+02,9.8493631300e+00
diff --git a/software/feelpp/WP1/data/nafems_le10_measures.csv b/software/feelpp/WP1/data/nafems_le10_measures.csv
new file mode 100644
index 0000000..47e6607
--- /dev/null
+++ b/software/feelpp/WP1/data/nafems_le10_measures.csv
@@ -0,0 +1,7 @@
+Mesh,FunctionSpace,PolyOrder,Points_pointD_expr_sigma_yy
+2,1,1,5.4113157547869030e+06
+2,2,2,5.3895200534619587e+06
+3,1,1,5.4339263575702887e+06
+3,2,2,5.3340391751274057e+06
+4,1,1,5.3772021437434787e+06
+4,2,2,5.3288496626412356e+06
diff --git a/software/feelpp/WP1/data/thermalbridges_M1_P1_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M1_P1_discoverer.csv
new file mode 100644
index 0000000..a701a87
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M1_P1_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+128,1.2221492700e-01,6.4238565900e-01,3.6209233900e-01,1.4458699000e-02,2.8710427000e-02,1.6634969500e-01,3.8690000000e-05,1.7349843500e+00,26,1.8110100400e-01,1.6613907700e-01,3.4788501700e-01,4.3714443000e-01
+256,1.2188734100e-01,6.0933554400e-01,3.5646670400e-01,2.8591517000e-02,1.3059219000e-02,1.9658095800e-01,2.6620000000e-05,1.7128140300e+00,27,1.4065624800e-01,1.0622557300e+00,1.2033874000e+00,5.3369217400e-01
+384,1.2437320600e-01,1.1830958100e+00,3.6205980200e-01,2.8347095000e-02,6.0365400000e-03,2.3218801700e-01,2.9640000000e-05,2.3419577800e+00,26,1.4211633500e-01,2.2203140900e+00,2.3628567800e+00,6.6711084200e-01
+512,1.2767371800e-01,7.8678474100e-01,3.5101679900e-01,3.9970829000e-02,7.2165000000e-03,2.5982411000e-01,2.5400000000e-05,2.1693036200e+00,27,1.5817408500e-01,2.9846153300e+00,3.1432860800e+00,9.1241879800e-01
+640,1.2444258600e-01,1.1941529300e+00,3.6527525700e-01,2.5397164200e-01,1.5870359000e-02,4.1670653600e-01,2.9171000000e-05,3.2150332700e+00,27,1.6181011200e-01,5.9518735500e+00,6.1184470600e+00,1.4622789000e+00
+768,1.2181068100e-01,1.0041638400e+00,5.8097881600e-01,6.3973814000e-02,5.4210090000e-03,1.3031381700e+00,3.8400000000e-05,4.2390728400e+00,27,3.4788091100e-01,3.2163671600e+01,3.2518294900e+01,1.5505736000e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M1_P2_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M1_P2_discoverer.csv
new file mode 100644
index 0000000..242b79a
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M1_P2_discoverer.csv
@@ -0,0 +1,6 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+768,2.0043945700e-01,1.1550174700e+00,5.0253279600e-01,6.2502142000e-02,2.1611302000e-02,1.4874120600e+00,3.4240000000e-05,1.1076939400e+01,27,5.4320486400e-01,2.4804235300e+01,2.5355764200e+01,1.7623572400e+00
+896,2.0443952700e-01,2.1696371700e+00,5.0451176400e-01,7.6830519000e-02,2.4296902800e-01,2.9201949700e+00,2.5970000000e-05,7.1910087400e+00,27,5.1984655500e-01,4.0220041100e+01,4.0945942000e+01,1.7758572400e+00
+1024,2.1300895700e-01,2.2501775000e+00,5.0940512800e-01,3.2543927400e-01,2.8915860000e-02,5.3961488200e-01,2.9170000000e-05,4.5474326000e+00,27,5.1770846700e-01,1.2049986500e+01,1.2777072900e+01,2.2439947000e+00
+1152,1.9801998700e-01,3.1553701500e+00,3.0040711100e-01,5.2593783700e-01,1.0873629000e-02,3.8018388500e-01,2.5940000000e-05,5.2566829000e+00,28,2.9549906300e-01,2.0276598900e+01,2.0572906000e+01,2.6340414500e+00
+1280,2.0617758400e-01,2.6577875100e+00,5.1298211700e-01,1.3591860900e+00,4.9517851000e-02,3.5371247800e-01,3.0560000000e-05,6.8186289000e+00,28,2.8321354300e-01,2.0927082900e+01,2.1212091800e+01,2.6108234200e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M1_P3_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M1_P3_discoverer.csv
new file mode 100644
index 0000000..4be521b
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M1_P3_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+640,4.3467500000e-04,9.1846142800e-01,5.1316202400e-01,4.1455845000e-02,1.8578729100e-01,2.9361011000e-01,2.8140000000e-05,2.7074524900e+00,43,7.6340030500e-01,3.6384693700e+01,3.7174548100e+01,1.2062629200e+00
+768,5.7615500000e-04,9.6099116900e-01,4.9969733500e-01,2.6611345700e-01,8.0908235000e-02,2.1621960600e+00,3.3200000000e-05,4.9757196300e+00,42,6.7516046300e-01,2.3635402500e+01,2.4317840600e+01,1.2899842900e+00
+896,4.5941400000e-04,1.2435942700e+00,4.9607431000e-01,2.6093499000e-01,8.0056954000e-02,2.1032333500e+00,4.4090000000e-05,4.8733336800e+00,40,8.3975956800e-01,5.2366497600e+01,5.3262096200e+01,2.2019590300e+00
+1024,5.3071400000e-04,1.2889648100e+00,4.8964810900e-01,7.4947960000e-02,8.3408202000e-02,3.7149283100e-01,4.5340000000e-05,3.3181940600e+00,42,8.2659177400e-01,2.5856958300e+01,2.6766810000e+01,1.7153465100e+00
+1152,3.6828300000e-04,1.3693329400e+00,2.9069442500e-01,8.1627258000e-02,5.6752445000e-02,3.5405923600e-01,3.0561000000e-05,4.0900474400e+00,40,6.0157326200e-01,2.6602871700e+01,2.7271097100e+01,2.2375110000e+00
+1280,4.9841300000e-04,2.8397818000e+00,7.4133889200e-01,1.6133377500e+00,1.1517500500e-01,5.2547799300e-01,2.9170000000e-05,7.6578209200e+00,42,5.4890687100e-01,2.9098227500e+01,2.9648814000e+01,3.1170897300e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M2_P1_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M2_P1_discoverer.csv
new file mode 100644
index 0000000..b4b4834
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M2_P1_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+128,1.2159143700e-01,2.2760594300e+00,6.6093750300e-01,9.0708642000e-02,3.4608272000e-01,1.8654925700e-01,6.4321000000e-05,4.5097836700e+00,19,3.4542091800e-01,3.1062773200e-01,6.5809906600e-01,9.1677828400e-01
+256,1.2362867300e-01,1.5033889700e+00,4.7840572400e-01,6.1903482000e-02,1.1813589600e-01,3.6211038400e-01,3.0160000000e-05,3.1819345100e+00,19,2.3496012900e-01,2.4063613500e+00,2.6423985300e+00,7.9927894000e-01
+384,1.2211505500e-01,1.6291026900e+00,4.4793072600e-01,4.8317818000e-02,5.6980198000e-02,4.2714177400e-01,3.7960000000e-05,3.2220218200e+00,22,2.0508143200e-01,3.6282911400e+00,3.8342432800e+00,1.0686290300e+00
+512,1.2187394200e-01,2.2635685700e+00,4.1478562600e-01,5.3356314000e-02,2.5043320000e-01,2.6027529400e-01,3.1690000000e-05,3.8966749900e+00,21,3.5986217300e-01,5.5182120600e+00,5.8837084000e+00,1.3494170700e+00
+640,1.2642750300e-01,1.9455363300e+00,4.1126546000e-01,7.9367204000e-02,3.1596224700e-01,4.3888142100e-01,3.2750000000e-05,3.8324187700e+00,20,1.9581928000e-01,1.2200409100e+01,1.2402038700e+01,2.6269755700e+00
+768,1.4389478900e-01,1.0956197500e+00,6.1133474500e-01,6.9087759000e-02,1.6696681000e-01,1.2751007000e+00,2.7630000000e-05,4.3774152800e+00,21,3.9826991600e-01,3.3974133100e+01,3.4382698200e+01,2.0416071600e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M2_P2_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M2_P2_discoverer.csv
new file mode 100644
index 0000000..4a93f63
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M2_P2_discoverer.csv
@@ -0,0 +1,6 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+768,2.0940107700e-01,1.7468972300e+00,6.3741777200e-01,7.0368758000e-02,4.9982277100e-01,2.3364909700e+00,3.0320000000e-05,1.3356731400e+01,26,7.8452580500e-01,4.2701669000e+01,4.3502398700e+01,3.0766955900e+00
+896,2.0327590100e-01,2.2733472200e+00,6.0093497200e-01,8.3659303000e-02,3.4125211300e-01,3.1683314300e+00,3.5990000000e-05,8.1242850600e+00,27,6.5273073600e-01,6.1099692800e+01,6.1763975700e+01,2.8927137700e+00
+1024,2.0639198500e-01,1.5559529000e+00,5.8796726100e-01,1.2753675900e-01,6.2144358100e-01,5.1158001100e-01,3.1430000000e-05,4.3586877300e+00,27,6.0579255200e-01,3.1060104500e+01,3.1686632000e+01,2.6679515700e+00
+1152,2.0125288600e-01,1.6127071000e+00,5.8981169000e-01,3.6499914300e-01,1.0163956400e-01,9.1146615400e-01,3.5731000000e-05,4.7368318100e+00,26,3.9584179100e-01,2.8388142900e+01,2.8817305400e+01,3.3033876900e+00
+1280,2.1090202000e-01,2.5263821100e+00,5.5441298900e-01,3.1961478700e-01,5.3980467700e-01,4.7696878600e-01,2.9421000000e-05,5.4792223500e+00,27,3.7619856600e-01,3.1222273500e+01,3.1601417600e+01,3.0258266900e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M2_P3_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M2_P3_discoverer.csv
new file mode 100644
index 0000000..2800291
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M2_P3_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+640,4.4491400000e-04,2.1732103400e+00,9.5973800400e-01,5.0494403000e-02,1.3898614700e+00,1.6806650500e+00,3.8390000000e-05,7.0443629000e+00,44,1.7689990200e+00,4.6284469300e+01,4.8075974900e+01,3.0016306400e+00
+768,5.8934500000e-04,2.3982832000e+00,8.4805869500e-01,6.2068321000e-02,1.4264228900e+00,1.5009238400e+00,3.7330000000e-05,7.3144058000e+00,43,1.6017413700e+00,6.5836430200e+01,6.7476190000e+01,2.4725984200e+00
+896,5.4982500000e-04,2.3019797500e+00,7.8467612700e-01,1.0806108400e-01,1.3219004700e+00,2.8279014200e+00,3.4260000000e-05,8.8945250700e+00,43,1.5163562100e+00,6.5948378600e+01,6.7488718800e+01,3.0133387900e+00
+1024,4.2770400000e-04,1.3678673100e+00,7.8509388100e-01,7.0090768000e-02,1.5034407900e+00,8.6866450500e-01,4.2680000000e-05,5.7423838800e+00,43,1.2486861400e+00,4.4307065700e+01,4.5585569000e+01,3.0055180400e+00
+1152,6.2707500000e-04,2.3136228700e+00,4.9890980500e-01,1.1373103300e-01,6.6522563600e-01,1.9390517100e+00,4.7851000000e-05,6.8503261400e+00,42,1.1097118700e+00,4.1907769300e+01,4.3222282300e+01,2.9576937600e+00
+1280,4.6471500000e-04,1.6861913100e+00,7.2101455400e-01,3.2862174100e-01,1.3394583200e+00,9.0421259600e-01,4.4621000000e-05,6.2617915200e+00,43,9.7346972000e-01,4.7209356900e+01,4.8194864000e+01,3.5538268600e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M3_P1_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M3_P1_discoverer.csv
new file mode 100644
index 0000000..7f8a377
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M3_P1_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+128,5.8801534000e-02,1.5388099300e+01,5.7377812100e+00,1.4296828500e+00,1.8650969500e+00,1.7588750400e+00,3.1750000000e-05,3.1983697400e+01,19,1.9035248800e+00,2.6079812500e+00,4.5220725600e+00,4.7289494200e+00
+256,1.2193992900e-01,8.8420221800e+00,2.7906854300e+00,6.5528861400e-01,1.1538221300e+00,1.9316459800e+00,3.2131000000e-05,1.8510334600e+01,19,1.1398652900e+00,3.9852895500e+00,5.1761948500e+00,3.3930818600e+00
+384,1.2431615200e-01,7.2772168200e+00,1.8251861100e+00,3.8501902800e-01,6.6694985700e-01,2.0310720500e+00,2.9461000000e-05,1.4385343400e+01,20,8.6147464300e-01,8.4534463700e+00,9.3189661600e+00,2.3292885400e+00
+512,1.2102471700e-01,6.0220637600e+00,1.3149637600e+00,2.7702819600e-01,5.0972619800e-01,2.0172791000e+00,4.0870000000e-05,1.1988063100e+01,19,7.6746064800e-01,7.5281271700e+00,8.2988521000e+00,2.2445924800e+00
+640,1.2441643500e-01,4.9730741600e+00,1.0620161100e+00,2.2246039800e-01,5.7559515700e-01,1.8785659100e+00,3.7950000000e-05,1.0304962100e+01,19,6.5551864800e-01,1.1008448600e+01,1.1670536100e+01,2.5452425200e+00
+768,1.2252347100e-01,4.7038850200e+00,1.1016825900e+00,1.9703714600e-01,3.1311057000e-01,3.0561556300e+00,3.9380000000e-05,1.1092531600e+01,19,5.9093735700e-01,2.2219551500e+01,2.2812773000e+01,2.5381560900e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M3_P2_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M3_P2_discoverer.csv
new file mode 100644
index 0000000..92e738d
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M3_P2_discoverer.csv
@@ -0,0 +1,6 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+768,1.9892015100e-01,4.6479819500e+00,2.0025882300e+00,1.2048745100e+00,1.9820755100e+00,5.7985732600e+00,4.2221000000e-05,2.4477765200e+01,28,2.0059030000e+00,5.1952086100e+01,5.3979470600e+01,4.4939586700e+00
+896,1.9479082700e-01,4.5595567400e+00,1.9092774600e+00,1.7055611800e-01,1.7471430900e+00,6.5590641700e+00,3.1780000000e-05,1.7699028500e+01,30,1.8941306200e+00,7.0019888300e+01,7.1941408800e+01,4.6561315600e+00
+1024,2.2258726900e-01,4.4410605500e+00,1.6938962200e+00,4.4033279500e-01,1.5125901400e+00,6.2620516600e+00,3.8051000000e-05,1.6418235900e+01,30,1.5359937900e+00,4.2653670200e+01,4.4434091300e+01,4.5638793200e+00
+1152,2.0465769600e-01,4.3289426200e+00,1.5569707600e+00,6.0937193400e-01,1.4556434300e+00,5.4774946700e+00,3.5320000000e-05,1.5484748300e+01,33,1.4987614900e+00,4.6546006700e+01,4.8289549300e+01,4.7113590800e+00
+1280,2.0423572500e-01,4.1475541400e+00,1.4661314900e+00,1.4183412700e+00,1.2767295000e+00,5.5131345700e+00,3.4420000000e-05,1.6088636000e+01,29,1.3687894200e+00,4.6890151300e+01,4.8473130300e+01,4.7990259900e+00
diff --git a/software/feelpp/WP1/data/thermalbridges_M3_P3_discoverer.csv b/software/feelpp/WP1/data/thermalbridges_M3_P3_discoverer.csv
new file mode 100644
index 0000000..c5ddb94
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_M3_P3_discoverer.csv
@@ -0,0 +1,7 @@
+nProc,initMaterialProperties,initMesh,initFunctionSpaces,initPostProcess,graph,matrixVector,algebraicOthers,init,ksp-niter,algebraic-assembly,algebraic-solve,solve,exportResults
+640,5.6811600000e-04,5.8829813400e+00,4.8046282100e+00,1.9629473400e-01,6.3687823500e+00,9.7428641300e+00,5.1471000000e-05,3.0892399500e+01,52,7.5115468900e+00,5.8914157300e+01,6.6507393600e+01,5.6644518600e+00
+768,4.3859300000e-04,5.4567335000e+00,5.0132249100e+00,1.9594279700e-01,5.0668731000e+00,1.0737904100e+01,4.7240000000e-05,3.1185553800e+01,52,6.5558912300e+00,8.7634080800e+01,9.4245232100e+01,1.1437644300e+01
+896,3.9494300000e-04,5.5881810400e+00,4.3274205200e+00,2.2052755600e-01,4.6531367300e+00,1.0712614100e+01,4.8940000000e-05,3.0379003100e+01,55,5.4856500900e+00,1.0935850000e+02,1.1511664200e+02,6.0332953100e+00
+1024,5.4768500000e-04,4.7145020000e+00,3.6983345300e+00,2.3286346800e-01,4.1763819000e+00,1.0172228700e+01,4.9550000000e-05,2.6944724900e+01,53,4.6801205900e+00,6.6260307400e+01,7.1393493100e+01,0.0000000000e+00
+1152,6.4518900000e-04,4.9777687400e+00,3.4706974500e+00,4.3221852600e-01,4.1446092800e+00,9.3423767500e+00,5.0030000000e-05,2.5769581900e+01,56,4.5478110200e+00,6.8228675100e+01,7.3056887300e+01,5.7206711400e+00
+1280,6.1495000000e-04,4.7019496300e+00,3.3026571900e+00,4.0411568800e-01,3.9746409500e+00,9.0413285200e+00,5.1730000000e-05,2.4363166500e+01,53,4.1604424600e+00,6.9384043000e+01,7.3820710100e+01,2.9530584300e+01
diff --git a/software/feelpp/WP1/data/thermalbridges_measures.csv b/software/feelpp/WP1/data/thermalbridges_measures.csv
new file mode 100644
index 0000000..b0c71a1
--- /dev/null
+++ b/software/feelpp/WP1/data/thermalbridges_measures.csv
@@ -0,0 +1,10 @@
+Mesh,FunctionSpace,PolyOrder,Normal_Heat_Flux_alpha,Normal_Heat_Flux_beta,Normal_Heat_Flux_gamma,Points_alpha_max_field_temperature,Points_alpha_min_field_temperature,Points_beta_max_field_temperature,Points_beta_min_field_temperature,Statistics_temperature_alpha_max,Statistics_temperature_alpha_min,Statistics_temperature_beta_max,Statistics_temperature_beta_min
+1,1,1,4.3664487331567940e+01,1.3234932512920267e+01,-5.7485182388700139e+01,1.7902222158287383e+01,1.1304867204491277e+01,1.6842788433387280e+01,1.1082223535205555e+01,1.7902169501965489e+01,1.1371956387817605e+01,1.6842741345767166e+01,1.1112161991304177e+01
+1,2,2,4.5920349299502369e+01,1.3860839175855748e+01,-5.9806133448075506e+01,1.7901891073029908e+01,1.1321657526014533e+01,1.6842480672480740e+01,1.1112744631863260e+01,1.7901884718433479e+01,1.1396092677993355e+01,1.6842475256850463e+01,1.1143124253065611e+01
+1,3,3,4.6057445284140442e+01,1.3894513993257846e+01,-5.9932960345290191e+01,1.7901922351104183e+01,1.1321569384690871e+01,1.6842508560988644e+01,1.1114398838330514e+01,1.7901915877032060e+01,1.1396995559840580e+01,1.6842502832766989e+01,1.1143633112408436e+01
+2,1,1,4.4799717162042043e+01,1.3536694138922140e+01,-5.8623721773288175e+01,1.7901960933620234e+01,1.1316057276332904e+01,1.6842559410775841e+01,1.1102053394597020e+01,1.7901947772540517e+01,1.1352070586155065e+01,1.6842547950221018e+01,1.1120965153494268e+01
+2,2,2,4.6023155452834246e+01,1.3870834660787196e+01,-5.9901921733220760e+01,1.7901920397938451e+01,1.1321516669751611e+01,1.6842506098598843e+01,1.1112662135288588e+01,1.7901918798251351e+01,1.1359716866255161e+01,1.6842504741471132e+01,1.1130881826387943e+01
+2,3,3,4.6082178422527484e+01,1.3890641287101072e+01,-5.9966446141339631e+01,1.7901933390180247e+01,1.1321800155485697e+01,1.6842517834979930e+01,1.1114019895592506e+01,1.7901931776794832e+01,1.1359728626976997e+01,1.6842516438673741e+01,1.1130496826523419e+01
+3,1,1,4.5404732072008386e+01,1.3697310338161632e+01,-5.9247837752897944e+01,1.7901926646209294e+01,1.1315266440423787e+01,1.6842514869191447e+01,1.1110162730392528e+01,1.7901922868030713e+01,1.1334933610620965e+01,1.6842511882689585e+01,1.1119212975395500e+01
+3,2,2,4.6064679339355280e+01,1.3880897977609173e+01,-5.9949624528822426e+01,1.7901933036973887e+01,1.1320371543754488e+01,1.6842517415665746e+01,1.1114439036646937e+01,1.7901932634720737e+01,1.1340271948667160e+01,1.6842517067408572e+01,1.1122516688150915e+01
+3,3,3,4.6090709601683052e+01,1.3893796991275430e+01,-5.9981056917149459e+01,1.7901937306746408e+01,1.1321319392070992e+01,1.6842521026539728e+01,1.1114939976301162e+01,1.7901936894298874e+01,1.1340481302633263e+01,1.6842520632463181e+01,1.1121713423681232e+01
diff --git a/software/feelpp/WP2/WP2.tex b/software/feelpp/WP2/WP2.tex
index 3903137..d11cd2f 100644
--- a/software/feelpp/WP2/WP2.tex
+++ b/software/feelpp/WP2/WP2.tex
@@ -1,7 +1,8 @@
-\section{Software: Feel++}
-\label{sec:WP2:Feel++:software}
+%!TEX root = ../../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
+\label{sec:WP2:Feelpp:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -10,7 +11,7 @@ \section{Software: Feel++}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-Feel++ Consortium\\
+\Feelpp Consortium\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CNRS\\
@@ -37,126 +38,216 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
- \bottomrule
+\rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+ Christophe Prud'homme (UNISTRA)\\
+ Vincent Chabannes (UNISTRA)\\
+ Thomas Saigre (UNISTRA)\\
+\end{tabular}\\
+ \hline
\end{tabular}
}}
- \caption{WP2: Feel++ Information}
+ \caption{WP2: \Feelpp Information}
\end{table}
\subsection{Software Overview}
-\label{sec:WP2:Feel++:summary}
+\label{sec:WP2:Feelpp:summary}
-In~\cref{tab:WP2:Feel++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+In~\cref{tab:WP2:Feelpp:features}, we provide a summary of the software features relevant to the work package; they are briefly discussed below.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} "ROM-DA: GEIM PBDW & provide short description here \\
-\rowcolor{numpexlightergray} "ROM: RB & provide short description here \\
-\rowcolor{white} ..." & provide short description here \\
-\rowcolor{numpexlightergray} ..." & provide short description here \\
-\rowcolor{white} POD & provide short description here \\
-\rowcolor{numpexlightergray} ROM: NIRB & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} ROM-DA: GEIM PBDW & \Feelpp supports the data-driven reduced basis methods PBDW and GEIM, which often work in combination\\
+\rowcolor{numpexlightergray} ROM: RB, CRB, RB-GREEDY, POD, EIM, SCM, $\min$-$\theta$ & \Feelpp supports efficient reduced basis approximation thanks to an affine decomposition that can be computed by hand or via EIM. Several constructions are possible: Reduced Basis from sampling, Greedy, and POD. A Certified Reduced Basis can be built. The online code is generated, as well as a database and associated metadata. \\
+\rowcolor{white} ROM: NIRB & Non-intrusive reduced basis methods~\cite{CRMATH_2009__347_7-8_435_0} are available as a Python layer \\
+\hline
\end{tabular}
}
}
- \caption{WP2: Feel++ Features}
- \label{tab:WP2:Feel++:features}
+ \caption{WP2: \Feelpp Features}
+ \label{tab:WP2:Feelpp:features}
\end{table}
\subsection{Parallel Capabilities}
-\label{sec:WP2:Feel++:performances}
-
+\label{sec:WP2:Feelpp:performances}
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+\begin{description}
+ \item[Parallel Programming Environment:] The model order reduction methods benefit from the parallel capabilities of \Feelpp. MPI is used to distribute the load of the offline computations. The online stage can use multithreading.
+ \item[Super computer:] The reduced basis methods have been run on Gaya (\Cref{sec:arch:gaya}).
+ \item[Parallel Capabilities:] The offline stage benefits from the parallel capabilities of \Feelpp.
+ \item[Scalability:] The reduced basis methods exhibit scalability similar to that reported for WP1 (\Cref{sec:WP1:Feelpp:software}).
+ \item[Integration with Other Systems:] The reduced basis methods are often used in combination with OpenTURNS and OpenMODELICA/Dymola.
+\end{description}
\subsection{Initial Performance Metrics}
-\label{sec:WP2:Feel++:metrics}
+\label{sec:WP2:Feelpp:metrics}
This section provides a summary of initial performance benchmarks performed in the context of WP2. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+\begin{description}
+ \item[Overall Performance] The \Feelpp reduced order methods benefit from the parallel capabilities of \Feelpp.
+ \item[Input/Output Dataset] The dataset for the \Feelpp model order reduction methods is composed of the finite element configuration for the offline stage and a JSON description of the parameter domain, the parameter variability, and the outputs of interest.
+ \item[open-data Access] \Feelpp uses JSON as the metadata format and an in-house open format for the online database. The current format requires documentation.
+ \item[Challenges] The reduced basis methods imply a complex workflow with many steps that need to be carefully configured for efficient computation. In particular, building the reduced basis functions requires robust finite element solves as well as a posteriori error bounds or estimates, which means developing robust algebraic solution strategies.
+ \item[Future Improvements] Performance monitoring of all the steps of the reduced order methods, documentation of the online-stage data formats, and large-scale benchmarking.
+\end{description}
-\subsection{12-Month Roadmap}
-\label{sec:WP2:Feel++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-In~\cref{tab:WP2:Feel++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+\subsubsection{Benchmark \#1: Heat transfer in the eye using \ac{RB} and \ac{NIRB}}
+
+\paragraph{Description:} \cite{saigre_model_2024} presents a 3D mathematical model to simulate heat transfer in the human eye, addressing the challenges of experimental analysis. A sensitivity analysis explores the effects of various parameters on heat distribution in ocular tissues. To improve computational efficiency, we applied a fast model reduction technique with certified error bounds using the \ac{CRB}~\cite{prudhomme_reliable_2002}. Our results align with experimental and numerical findings, highlighting the importance of blood flow and environmental factors.
+
+In~\cite{saigre_mathematical_2024}, we extend the reduced order model to include non-intrusive reduced basis methods~\cite{CRMATH_2009__347_7-8_435_0}.
+This second method consists of solving the high-fidelity model on a coarse mesh and using interpolation together with a reduced basis to obtain an approximation of the solution on a fine mesh.
+A rectification post-processing step is then applied to the reduced solution to improve its accuracy.
+
+The benchmark assesses the software's performance in simulating heat transfer in the eye using these two methods.
+
+\paragraph{Benchmarking Tools Used:} We used \Feelpp to monitor the time spent in the various sections of the code, including the reduced basis generation and the non-intrusive reduced basis method. To assess the performance of the \ac{RB} method, we measured the following quantities:
+\begin{inparaenum}[\it (i)]
+ \item the time to compute the high-fidelity solution $T^\text{fem}(\mu)$, and
+ \item the time to perform the online stage of the RBM, \emph{i.e.} to compute the reduced solution $T^{\text{rbm}, N}(\mu)$ and the associated error bound $\Delta_N(\mu)$, both recalled schematically below.
+\end{inparaenum}
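+
+Schematically, and with notation introduced here only for illustration rather than taken verbatim from~\cite{prudhomme_reliable_2002}, the online stage expands the reduced solution on a reduced basis $(\zeta_n)_{n=1}^{N}$ and certifies it with a residual-based error bound,
+% $(\zeta_n)$, $c_n$, $r$, $X'$ and $\alpha_{\mathrm{LB}}$ are illustrative notation, not the exact notation of the cited reference
+\[
+  T^{\text{rbm},N}(\mu) = \sum_{n=1}^{N} c_n(\mu)\,\zeta_n,
+  \qquad
+  \Delta_N(\mu) = \frac{\| r(\,\cdot\,;\mu)\|_{X'}}{\alpha_{\mathrm{LB}}(\mu)},
+\]
+where $r(\,\cdot\,;\mu)$ is the residual associated with the reduced solution, $X'$ a suitable dual norm, and $\alpha_{\mathrm{LB}}(\mu)$ a computable lower bound of the stability (coercivity) constant.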
+
+Then, for the \ac{NIRB} method, we measured the time to compute the following quantities, related as sketched below:
+\begin{inparaenum}[\it (i)]
+ \item the fine snapshots $T_h^\mathcal{N}$, \emph{i.e.} the high-fidelity solution on the fine mesh,
+ \item the coarse snapshots $T_H^\mathcal{N}$, \emph{i.e.} a high-fidelity solution on the coarse mesh,
+ \item the NIRB approximation without the rectification post-process $T_{Hh}^{N,\text{NoRect}}$, and
+ \item the NIRB approximation with the rectification post-process $T_{Hh}^{N,\text{Rect}}$. This last quantity is the final reduced solution.
+\end{inparaenum}
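+
+Schematically, with notation that is illustrative and introduced only for this summary, let $(\xi_i)_{i=1}^{N}$ be an $L^2$-orthonormalized reduced basis built from the fine snapshots, and let the coarse solution be interpolated on the fine mesh; the two NIRB approximations then read
+% $(\xi_i)$ and the rectification matrix $R$ are introduced here for illustration; see \cite{CRMATH_2009__347_7-8_435_0} and \cite{saigre_mathematical_2024} for the precise formulation
+\[
+  T_{Hh}^{N,\text{NoRect}}(\mu) = \sum_{i=1}^{N} \bigl(T_H^{\mathcal N}(\mu),\xi_i\bigr)\,\xi_i,
+  \qquad
+  T_{Hh}^{N,\text{Rect}}(\mu) = \sum_{i=1}^{N}\sum_{j=1}^{N} R_{ij}\,\bigl(T_H^{\mathcal N}(\mu),\xi_j\bigr)\,\xi_i,
+\]
+where $R$ is the rectification matrix computed offline from the coefficients of the coarse and fine snapshots.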
+
+\paragraph{Input/Output Dataset Description}
-\begin{table}[h!]
+\begin{description}
+ \item[Input Data:] The dataset for the \Feelpp model order reduction methods is composed of the finite element configuration for the offline stage and a JSON description of the parameter domain, the parameter variability, and the outputs of interest.
+ \item[Output Data:] The output data is composed of a code performing the online stage of the methods, as well as a database associated with the reduced basis.
+ \item[Data Repository:] The source code to generate the data for the two methods is available in the \Feelpp repository~\cite{prudhomme_feelppfeelpp_2024}.
+\end{description}
+
+\paragraph{Results Summary:}
+
+\Cref{tab:feelpp:wp2:eye:rbm} displays a comparative analysis of execution times for solving the heat transfer problem.
+We first discuss the execution times for the high-fidelity solution, encompassing both $P_1$ and $P_2$ finite-element discretizations.
+The measured time, denoted as $t_\text{exec}$, includes assembling and solving the problem.
+In addition, we evaluate the execution time of the online phase of our certified reduced basis model.
+It corresponds to the time spent by the application to compute the reduced solution $T^{\text{rbm},N}$, as well as the error bound $\Delta_N$, for a reduced basis of size $N=10$.
+This comparison highlights a significant reduction in the time required to assemble and solve the problem using the reduced basis approach.
+Importantly, this efficiency does not compromise accuracy; the results from the reduced basis model are effectively sharp with respect to the high-fidelity model.
+More detailed insights about these results are available in \cite{saigre_model_2024}.
+
+We present in \Cref{tab:feelpp:wp2:eye:nirb} the execution time and associated speedup of the NIRB method for the bioheat transfer problem in the eye using a $P_2$ discretization.
+%
+The gain of this second method is less pronounced than that of the RBM, but it still provides a significant improvement in computational time.
+We remark that the problem scales well with the number of processors, as evidenced by the speedups obtained for the computed quantities.
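+
+For reference, the speedups reported in \Cref{tab:feelpp:wp2:eye:nirb} are consistent with being measured against the fine-snapshot time on \texttt{np = 12}, a convention inferred from the reported figures:
+% $S_q$ and $t(\cdot,\cdot)$ are notation introduced here for illustration
+\[
+  S_q(\texttt{np}) = \frac{t\bigl(T_h^{\mathcal N},\ \texttt{np = 12}\bigr)}{t\bigl(q,\ \texttt{np}\bigr)}
+\]
+for each computed quantity $q$.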
+
+
+
+\begin{table}
\centering
-
-
+ \begin{subtable}[b]{\textwidth}
\centering
- {
+ {\setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ & \multicolumn{3}{c}{\color{white}\bf Finite element resolution} & {\color{white}\bf Reduced model} \\
+ \rowcolor{numpexgray}
+ & \multicolumn{3}{c}{\color{white}\bf $T^\text{fem}(\mu)$} & {\color{white}\bf $T^{\text{rbm}, N}(\mu), \Delta_N(\mu)$} \\
+ \rowcolor{numpexgray}
+ & {\color{white}\bf $P_1$} & {\color{white}\bf $P_2$ (\texttt{np=1})} & {\color{white}\bf $P_2$ (\texttt{np=12})} & \\
+ \rowcolor{numpexlightergray}
+ Problem size & $\mathcal{N} = 207~845$ & \multicolumn{2}{c|}{$\mathcal{N} = 1~580~932$} & $N = 10$ \\
+ $t_\text{exec}$ & \qty{5.534}{\second} & \qty{62.432}{\second} & \qty{10.76}{\second} & \qty{2.88e-04}{\second}\\
+ \rowcolor{numpexlightergray}
+ speed-up & 11.69 & -- & 5.80 & \qty{2.17e5}{}\\
+ \hline
+ \end{tabular}
+ }
+ \caption{Execution times of the finite element model for both $P_1$ and $P_2$ discretizations, compared with the computation time of the reduced solution and error bound.}
+ \label{tab:feelpp:wp2:eye:rbm}
+ \end{subtable}
+
+
+ \begin{subtable}[b]{\textwidth}
+ \centering
+ \resizebox{\textwidth}{!}{\setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ \begin{tabular}{!{\color{numpexgray}\vrule}rl!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}
+ \multicolumn{2}{c}{\color{white}\bf Element computed} & \multicolumn{3}{c}{\color{white}\bf Computational time} & \multicolumn{3}{c}{\color{white}\bf Speedup}\\
+ \rowcolor{numpexgray}
+ \multicolumn{2}{c}{} & \color{white}\bf \texttt{np = 12} & \color{white}\bf \texttt{np = 64} & \color{white}\bf \texttt{np = 128} & \color{white}\bf \texttt{np = 12} & \color{white}\bf \texttt{np = 64} & \color{white}\bf \texttt{np = 128} \\
+ \rowcolor{numpexlightergray}
+        Fine snapshots & $T_h^\mathcal{N}$ & \qty{13.6}{\second} & \qty{4.79}{\second} & \qty{2.38}{\second} & -- & 2.84 & 5.71 \\
+        Coarse snapshots & $T_H^\mathcal{N}$ & \qty{2.77}{\second} & \qty{1.10}{\second} & \qty{0.859}{\second} & 4.90 & 12.36 & 15.83 \\
+ \rowcolor{numpexlightergray}
+ NIRB w/o rectification & $T_{Hh}^{N,\text{NoRect}}$ & \qty{3.190}{\second} & \qty{1.309}{\second} & \qty{1.0687}{\second} & 4.26 & 10.39 & 12.73 \\
+ NIRB w/ rectification & $T_{Hh}^{N,\text{Rect}}$ & \qty{3.203}{\second} & \qty{1.311}{\second} & \qty{1.0691}{\second} & 4.25 & 10.37 & 12.72 \\
+ \hline
+ \end{tabular}}
+        \caption{Execution time and associated speedup of the NIRB method for the 3D eye model with $P_2$ discretization, averaged over 64 parameters.}
+ \label{tab:feelpp:wp2:eye:nirb}
+ \end{subtable}
+ \caption{Execution times and speedup for the heat transfer problem in the eye using \ac{RB} and \ac{NIRB}.}
+ \label{tab:feelpp:wp2:eye}
+\end{table}
+
+
+
+\paragraph{Challenges Identified:}
+
+\begin{description}
+    \item[Memory Usage] Memory monitoring is necessary to better map the test case onto the target architectures.
+    \item[Parallelization] The different offline stages need to be better profiled with respect to performance and I/O. Many scalar products are computed in the offline stage; we need to check very large configurations and possibly consider communication-avoiding strategies.
+\end{description}
+
+
+\subsection{12-Month Roadmap}
+\label{sec:WP2:Feelpp:roadmap}
+
+In this section, we describe the roadmap for improving benchmarks and addressing the challenges identified.
+It mainly follows the same roadmap as WP1, see~\Cref{sec:WP1:Feelpp:roadmap}.
+
+In~\cref{tab:WP2:Feelpp:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
+
+\begin{table}[!ht]
+ \centering
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & provide short description here \\
-\rowcolor{numpexlightergray} B2 - Interconnect Technology & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B2 - Interconnect Technology & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{white} B6 - Data Management & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & large-scale simulations for \ac{RB} and \ac{NIRB}, monitoring reduce and broadcast operations; implement non-linear compressive RB \\
+\hline
\end{tabular}
}
}
- \caption{WP2: Feel++ plan with Respect to Relevant Bottlenecks}
- \label{tab:WP2:Feel++:bottlenecks}
+ \caption{WP2: \Feelpp plan with Respect to Relevant Bottlenecks}
+ \label{tab:WP2:Feelpp:bottlenecks}
\end{table}
\ No newline at end of file
diff --git a/software/feelpp/WP3/WP3.tex b/software/feelpp/WP3/WP3.tex
index cdbd211..cbb5ea0 100644
--- a/software/feelpp/WP3/WP3.tex
+++ b/software/feelpp/WP3/WP3.tex
@@ -1,7 +1,8 @@
-\section{Software: Feel++}
-\label{sec:WP3:Feel++:software}
+%!TEX root = ../../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
+\label{sec:WP3:Feelpp:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -10,7 +11,7 @@ \section{Software: Feel++}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-Feel++ Consortium\\
+\Feelpp Consortium\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CNRS\\
@@ -37,57 +38,62 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
- \bottomrule
+\rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+ Christophe Prud'homme (UNISTRA)\\
+ Vincent Chabannes (UNISTRA)\\
+ Thomas Saigre (UNISTRA)\\
+\end{tabular}\\
+ \hline
\end{tabular}
}}
- \caption{WP3: Feel++ Information}
+ \caption{WP3: \Feelpp Information}
\end{table}
\subsection{Software Overview}
-\label{sec:WP3:Feel++:summary}
+\label{sec:WP3:Feelpp:summary}
-In~\cref{tab:WP3:Feel++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+In~\cref{tab:WP3:Feelpp:features} we provide the summary of the \Feelpp features relevant to the work package which are briefly discussed.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} algebraic multiphysics coupling & provide short description here \\
-\rowcolor{numpexlightergray} domain decomposition methods & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} algebraic multiphysics coupling & In \Feelpp, algebraic multiphysics coupling is achieved through the fieldsplit PETSc preconditioner. This allows the construction of block preconditioners, leveraging both PETSc’s built-in preconditioners and in-house preconditioners designed for specific applications, such as computational fluid dynamics (CFD) and magnetostatic problems. These preconditioners enhance the solver's performance for coupled systems by efficiently handling the interaction between different physical fields. \\
+\rowcolor{numpexlightergray} domain decomposition methods & \Feelpp provides support for domain decomposition methods, such as hp-mortar methods. These methods utilize specialized preconditioners that exploit the structure of mortar elements to improve solver efficiency. The hp-mortar approach allows for flexible discretization and is particularly useful for non-conforming meshes and multi-domain problems, enhancing both scalability and accuracy in large-scale simulations. \\
+\hline
\end{tabular}
}
}
- \caption{WP3: Feel++ Features}
- \label{tab:WP3:Feel++:features}
+ \caption{WP3: \Feelpp Features}
+ \label{tab:WP3:Feelpp:features}
\end{table}
\subsection{Parallel Capabilities}
-\label{sec:WP3:Feel++:performances}
-
+\label{sec:WP3:Feelpp:performances}
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+\begin{description}
+  \item[Parallel Environment] MPI.
+  \item[Architectures] CPU only: Gaya~\cref{sec:arch:gaya} and almost all JU systems~\cref{sec:arch:eurohpc-ju} except MareNostrum 5 and Deucalion.
+  \item[Scalability] Only speedups are measured.
+\end{description}
\subsection{Initial Performance Metrics}
-\label{sec:WP3:Feel++:metrics}
+\label{sec:WP3:Feelpp:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results.
+All output data are available in this repository; the input data are publicly available in the \Feelpp repository, in Girder~\cref{sec:arch:girder:unistra}, and, for a few cases, in Zenodo~\cref{sec:arch:zenodo}.
+\iffalse
\begin{itemize}
\item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
\item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
@@ -102,57 +108,666 @@ \subsection{Initial Performance Metrics}
\item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
\end{itemize}
-\subsubsection{Benchmark \#1}
+\subsubsection{Benchmark \#1: Laplacian and Linear Elasticity}
+\paragraph{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+
+\paragraph{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+
+\paragraph{Input/Output Dataset Description:}
+\paragraph{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+
+\paragraph{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+
+\paragraph{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+
+\paragraph{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+
+\paragraph{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\fi
+
+\subsubsection{Benchmark \#1: Elliptic linear PDE : Thermal bridges}
+\label{sec:WP3:Feelpp:benchmark:thermal_bridges}
+\paragraph{Description:} % Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+The benchmark description can be found in
+\Cref{sec:WP1:Feelpp:benchmark:thermal_bridges}.
+
+The goal of this benchmark in WP3 is to analyze the scalability of a linear solver
+available in \Feelpp, which wraps the PETSc solvers.
+In this benchmark, we use a GMRES solver preconditioned by the algebraic
+multigrid GAMG. This kind of preconditioner is generally well suited to
+elliptic PDEs.
+
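+For illustration, the following minimal sketch shows how such a solver (GMRES
+preconditioned by GAMG) can be configured through petsc4py. It is illustrative
+only and not the actual \Feelpp setup, which forwards equivalent options to
+PETSc internally; the matrix \texttt{A}, the right-hand side \texttt{b} and the
+tolerance are placeholders for the assembled finite element system.
+\begin{verbatim}
+# Illustrative petsc4py sketch: GMRES preconditioned by the GAMG algebraic
+# multigrid, as used for this elliptic problem.  A and b are assumed to be
+# assembled elsewhere; the tolerance is an arbitrary example value.
+from petsc4py import PETSc
+
+def solve_elliptic(A: PETSc.Mat, b: PETSc.Vec) -> PETSc.Vec:
+    opts = PETSc.Options()
+    opts["ksp_type"] = "gmres"   # Krylov method
+    opts["pc_type"] = "gamg"     # algebraic multigrid preconditioner
+    opts["ksp_rtol"] = 1e-8      # illustrative relative tolerance
+
+    ksp = PETSc.KSP().create(comm=A.getComm())
+    ksp.setOperators(A)
+    ksp.setFromOptions()
+
+    x = A.createVecRight()
+    ksp.solve(b, x)
+    # The two metrics reported in this benchmark:
+    print("solve time: see -log_view; iterations:", ksp.getIterationNumber())
+    return x
+\end{verbatim}
+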
+
+\paragraph{Benchmarking Tools Used:} %List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+See \Cref{sec:WP1:Feelpp:benchmark:thermal_bridges}
+
+\paragraph{Input/Output Dataset Description:}
+See \Cref{sec:WP1:Feelpp:benchmark:thermal_bridges}
+
+\paragraph{Results Summary:}% Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+
+The metrics computed in this experiment are the execution time of the linear solver and the number of iterations of the GMRES algorithm.
+\Cref{fig:feelpp:wp3:thermal_bridges:performance_measure_splitted}
+illustrates the results obtained: each subfigure shows the impact of the
+mesh level for a given finite element approximation. \Cref{fig:feelpp:wp3:thermal_bridges:performance_measure_all}
+gathers all results in the same charts, which makes it possible to assess the
+combined impact of the mesh and of the finite element choice.
+
+The speed-up is not ideal, particularly for the coarse mesh. The
+reasons are not yet well understood, since the number of iterations remains
+almost constant with respect to the number of CPU cores; this needs to be
+investigated in more detail. Regarding the number of iterations, the
+results are good, with an essentially constant behavior with respect to the number of
+CPU cores. We can also see the impact of the mesh level and of the finite element
+approximation order. As expected, the number of iterations increases with the
+number of degrees of freedom, but only moderately. However, we notice in the
+$P_1$ case that the mesh M1 requires more iterations than the others; some
+phenomena may not be well captured by this approximation, which is the coarsest one.
+
+
+
+\begin{figure}
+ \centering
+
+ \foreach [expand list=true] \polyId in {1,2,3} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M1_P\polyId_discoverer.csv}\dataMa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M2_P\polyId_discoverer.csv}\dataMb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M3_P\polyId_discoverer.csv}\dataMc
+
+ \def\plotSetup{
+ {table=dataMa,column=algebraic-solve,legend=M1,color=customdarkblue},
+ {table=dataMb,column=algebraic-solve,legend=M2,color=customcyan},
+ {table=dataMc,column=algebraic-solve,legend=M3,color=customorange}
+ }
+
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \barChart[ybar,xticklabels from table={\dataMa}{nProc},
+ width=\textwidth, height=0.6172\textwidth,
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup}
+ \caption{Execution time - $P_{\polyId}$}
+ \end{subfigure}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores}, ylabel={Number of iterations},
+ legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ ]
+ \addplot[color=customdarkblue,mark=o,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMa};
+ \addlegendentry{M1}
+ \addplot[color=customcyan,mark=triangle,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMb};
+ \addlegendentry{M2}
+ \addplot[color=customorange,mark=square,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMc};
+ \addlegendentry{M3}
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Number of iterations of GMRES - $P_{\polyId}$}
+ \end{subfigure}
+
+ \vspace*{0.04\textwidth}
+ }
+ \caption{Thermal bridges benchmarks - Performance measures of algebraic solver
+    (Mesh comparison) - GMRES preconditioned by GAMG - Discoverer supercomputer}
+ \label{fig:feelpp:wp3:thermal_bridges:performance_measure_splitted}
+\end{figure}
+
+
+
+\begin{figure}
+ \centering
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M1_P1_discoverer.csv}\dataMaPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M1_P2_discoverer.csv}\dataMaPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M1_P3_discoverer.csv}\dataMaPc
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M2_P1_discoverer.csv}\dataMbPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M2_P2_discoverer.csv}\dataMbPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M2_P3_discoverer.csv}\dataMbPc
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M3_P1_discoverer.csv}\dataMcPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M3_P2_discoverer.csv}\dataMcPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/thermalbridges_M3_P3_discoverer.csv}\dataMcPc
+
+ \def\chartLinePlot#1#2{
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \def\myLineStyleA{loosely dashdotdotted} %dashdotdotted
+ \def\myLineStyleB{dashed}
+ \def\myLineStyleC{solid}
+ %\def\myMarkStyle{every mark/.append style={solid}}
+ %\edef\myMarkStyle{\noexpand{every mark/.append style={solid}}}
+
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores},% ylabel={Number of iterations},
+ % legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ legend style={at={(0.,1)}, anchor=north west,font=\small,legend columns=3},
+ #2
+ ]
+ \addplot[color=customdarkblue,\myLineStyleA,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMaPa};
+ \addlegendentry{M1-P1}
+ \addplot[color=customdarkblue,\myLineStyleB,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMaPb};
+ \addlegendentry{M1-P2}
+ \addplot[color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMaPc};
+ \addlegendentry{M1-P3}
+ \addplot[color=customcyan,\myLineStyleA,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPa};
+ \addlegendentry{M2-P1}
+ \addplot[color=customcyan,\myLineStyleB,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPb};
+ \addlegendentry{M2-P2}
+ \addplot[color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPc};
+ \addlegendentry{M2-P3}
+ \addplot[color=customorange,\myLineStyleA,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPa};
+ \addlegendentry{M3-P1}
+ \addplot[color=customorange,\myLineStyleB,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPb};
+ \addlegendentry{M3-P2}
+ \addplot[color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPc};
+ \addlegendentry{M3-P3}
+ \end{axis}
+ \end{tikzpicture}
+ }
+
+ \begin{subfigure}[c]{\textwidth}
+ \centering
+ \chartLinePlot{ksp-niter}{ylabel={Number of iterations}}
+ \caption{Number of iterations of GMRES}
+ \end{subfigure}
+ \begin{subfigure}[c]{\textwidth}
+ \centering
+ \chartLinePlot{algebraic-solve}{ylabel={Execution time [s]}}
+ \caption{Execution time}
+ \end{subfigure}
+ \caption{Thermal bridges benchmarks - Performance measures of algebraic solver
+    (Mesh and polynomial order comparison) - GMRES preconditioned by GAMG - Discoverer supercomputer}
+ \label{fig:feelpp:wp3:thermal_bridges:performance_measure_all}
+\end{figure}
+
+
+\paragraph{Challenges Identified:} %Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\item Improve the HPC scalability at large/extreme scale.
+\item Reduce the memory footprint.
\end{itemize}
-\subsection{12-Month Roadmap}
-\label{sec:WP3:Feel++:roadmap}
+\subsubsection{Benchmark \#2: Linear elasticity : NAFEMS LE10}
+\label{sec:WP3:Feelpp:benchmark:nafems-le10}
+
+\paragraph{Description:} % Briefly describe the benchmark case, including the
+ % problem size, target architecture (e.g., CPU, GPU),
+ % and the input data. Mention the specific goals of the
+ % benchmark (e.g., testing scalability, energy
+ % efficiency).
+The benchmark description can be found in
+\cref{sec:WP1:Feelpp:benchmark:nafems-le10}.
+
+In this benchmark, we are interested in analyzing the performance of the algebraic
+solvers. For this linear elasticity problem, we use the algebraic multigrid
+preconditioner GAMG (PETSc) within a GMRES iterative method.
+In addition, we use the Chebyshev iterative method, preconditioned by Jacobi,
+as the smoother of the multigrid.
+
+Finally, providing near-null-space vectors is generally crucial for the good
+performance of the multigrid solver, particularly when the kernel of the
+operator contains more than the constants. For this elasticity problem, these
+near-null-space vectors are the rigid body modes, which can be built from the
+coordinates of the grid nodes, or from the degrees of freedom for high-order approximations.
+
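+The following minimal petsc4py sketch illustrates this configuration: rigid body
+modes built from the grid coordinates are attached as the near-null space of the
+elasticity operator, and a Chebyshev/Jacobi smoother is selected on the GAMG
+levels. It is illustrative only and not the actual \Feelpp setup; \texttt{A},
+\texttt{b} and \texttt{coords} are placeholders assumed to be provided by the
+finite element assembly.
+\begin{verbatim}
+# Illustrative petsc4py sketch: GMRES + GAMG for linear elasticity, with
+# rigid body modes as the near-null space and a Chebyshev/Jacobi smoother.
+# A, b and coords (vertex coordinates as a Vec with block size = dimension)
+# are assumed to come from the finite element assembly.
+from petsc4py import PETSc
+
+def solve_elasticity(A: PETSc.Mat, b: PETSc.Vec, coords: PETSc.Vec) -> PETSc.Vec:
+    # Rigid body modes (translations and rotations) from the coordinates;
+    # GAMG uses them to build suitable coarse spaces.
+    rbm = PETSc.NullSpace().createRigidBody(coords)
+    A.setNearNullSpace(rbm)
+
+    opts = PETSc.Options()
+    opts["ksp_type"] = "gmres"
+    opts["pc_type"] = "gamg"
+    # Smoother on each multigrid level: Chebyshev preconditioned by Jacobi.
+    opts["mg_levels_ksp_type"] = "chebyshev"
+    opts["mg_levels_pc_type"] = "jacobi"
+
+    ksp = PETSc.KSP().create(comm=A.getComm())
+    ksp.setOperators(A)
+    ksp.setFromOptions()
+    x = A.createVecRight()
+    ksp.solve(b, x)
+    return x
+\end{verbatim}
+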
+
+\paragraph{Benchmarking Tools Used:} % List the tools used for performance
+ % analysis, such as Extrae, Score-P, TAU,
+ % Vampir, or Nsight, and specify what metrics
+ % were measured (e.g., execution time, FLOPS,
+ % energy consumption).
+See \cref{sec:WP1:Feelpp:benchmark:nafems-le10}.
+
+\paragraph{Input/Output Dataset Description:}
+See \cref{sec:WP1:Feelpp:benchmark:nafems-le10}.
+
+\paragraph{Results Summary:} % Include a summary of key metrics (execution time,
+ % memory usage, FLOPS) and their comparison across
+ % architectures (e.g., CPU, GPU).
+
+We have plotted the performance results and the number of iterations in
+\cref{fig:feelpp:wp3:nafems-le10:performance_measures_M2_M3} for the meshes M2 and
+M3, and in
+\cref{fig:feelpp:wp3:nafems-le10:performance_measures_M4} for the mesh M4. For
+each, we have tested the Lagrange finite elements $P_1$ and $P_2$.
+
+The conclusion we can draw is that the solver does not behave very well at
+large HPC scale. In particular, the quality of the solver deteriorates, with a
+large number of iterations. However, we can see that increasing the number of
+degrees of freedom improves the strong scalability, and that the extreme case M4-P2
+behaves well. The variability of the number of iterations, and its high level,
+should be studied further to better understand these results.
+
+Finally, \cref{fig:feelpp:wp3:nafems-le10:performance_measures_all} merges all
+results in order to show the impact of the mesh, the finite element choice, and
+the HPC resources.
+
+\begin{figure}
+ \centering
+
+ \foreach [expand list=true] \polyId/\shiftData in {1/0,2/2} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M2_P\polyId_discoverer.csv}\dataMb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M3_P\polyId_discoverer.csv}\dataMc
+ %\pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M4_P\polyId_discoverer.csv}\dataMd
+
+ \def\plotSetup{
+ {table=dataMb,column=algebraic-solve,legend=M2,color=customdarkblue},
+ {table=dataMc,column=algebraic-solve,legend=M3,color=customcyan,shift=\shiftData}
+ }
+
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \barChart[
+ ybar, width=\textwidth, height=0.6172\textwidth,
+ xticklabels from table={\dataMb}{nProc},
+ x tick label style={ rotate=-45 },
+ legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup}
+ \caption{Execution time - $P_{\polyId}$}
+ \end{subfigure}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \begin{axis}[
+ width=0.8\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores}, ylabel={Number of iterations},
+ legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ ]
+ \addplot[color=customdarkblue,mark=o,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMb};
+ \addlegendentry{M2}
+ \addplot[color=customcyan,mark=triangle,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMc};
+ \addlegendentry{M3}
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Number of iterations of GMRES - $P_{\polyId}$}
+ \end{subfigure}
+ \vspace*{0.04\textwidth}
+ }
+
+ \caption{NAFEMS LE10 benchmarks - Performance measures of algebraic solver
+    (Mesh comparison M2 and M3) - GMRES preconditioned by GAMG - Discoverer supercomputer}
+ \label{fig:feelpp:wp3:nafems-le10:performance_measures_M2_M3}
+\end{figure}
+
+
+
+
+\begin{figure}
+ \centering
+ \foreach [expand list=true] \polyId/\shiftData in {1,2} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M4_P\polyId_discoverer.csv}\dataMd
+ \def\plotSetup{
+ {table=dataMd,column=algebraic-solve,legend=M4,color=customorange}
+ }
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \resizebox{\textwidth}{0.6172\textwidth}{
+ \barChart[ybar,
+ xticklabels from table={\dataMd}{nProc},
+ x tick label style={ rotate=-45 }
+ ]{\plotSetup}
+ }
+ \caption{Execution time - $P_{\polyId}$}
+ \end{subfigure}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \resizebox{\textwidth}{0.6172\textwidth}{
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores}, ylabel={Number of iterations},
+ %legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ ]
+ \addplot[color=customorange,mark=o,line width=\myLineWidth] table [x=nProc, y=ksp-niter] {\dataMd};
+ %\addlegendentry{M4}
+ \end{axis}
+ \end{tikzpicture}
+ }
+ \caption{Number of iterations of GMRES - $P_{\polyId}$}
+ \end{subfigure}
+ }
+
+ \caption{NAFEMS LE10 - Performance measures of algebraic solver
+    (Mesh M4) - GMRES preconditioned by GAMG - Discoverer supercomputer}
+ \label{fig:feelpp:wp3:nafems-le10:performance_measures_M4}
+\end{figure}
+
+
+
+\begin{figure}
+ \centering
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M2_P1_discoverer.csv}\dataMbPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M2_P2_discoverer.csv}\dataMbPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M3_P1_discoverer.csv}\dataMcPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M3_P2_discoverer.csv}\dataMcPb
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M4_P1_discoverer.csv}\dataMdPa
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/nafems_le10_M4_P2_discoverer.csv}\dataMdPb
+
+ \def\chartLinePlot#1#2{
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \def\myLineStyleA{loosely dashdotdotted} %dashdotdotted
+ \def\myLineStyleB{dashed}
+ \def\myLineStyleC{solid}
+ %\def\myMarkStyle{every mark/.append style={solid}}
+ %\edef\myMarkStyle{\noexpand{every mark/.append style={solid}}}
+
+ %\begin{axis}
+ \begin{semilogxaxis}
+ [
+ width=\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ %xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands
+ %separator={},precision=0]{\pgfmathresult}},
+ xticklabel={\pgfmathparse{exp(\tick)}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores},% ylabel={Number of iterations},
+ %xmode=log,
+ %log ticks with fixed point,
+ %x filter/.code=\pgfmathparse{##1 + 6.90775527898214},
+ % legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ %legend style={at={(1.,1)}, anchor=north east,font=\small,legend columns=2},
+ #2
+ ]
+ \addplot[color=customdarkblue,\myLineStyleA,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPa};
+ \addlegendentry{M2-P1}
+ \addplot[color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPb};
+ \addlegendentry{M2-P2}
+ %\addplot[color=customdarkblue,\myLineStyleC,mark=o,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMaPc};
+ %\addlegendentry{M1-P3}
+ \addplot[color=customcyan,\myLineStyleA,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPa};
+ \addlegendentry{M3-P1}
+ \addplot[color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPb};
+ \addlegendentry{M3-P2}
+ % \addplot[color=customcyan,\myLineStyleC,mark=triangle,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMbPc};
+ % \addlegendentry{M2-P3}
+ \addplot[color=customorange,\myLineStyleA,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMdPa};
+ \addlegendentry{M4-P1}
+ \addplot[color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMdPb};
+ \addlegendentry{M4-P2}
+ % \addplot[color=customorange,\myLineStyleC,mark=square,line width=\myLineWidth,every mark/.append style={solid}] table [x=nProc, y=#1] {\dataMcPc};
+ % \addlegendentry{M3-P3}
+ %\end{axis}
+ \end{semilogxaxis}
+ \end{tikzpicture}
+ }
+
+ \begin{subfigure}[c]{\textwidth}
+ \centering
+ \chartLinePlot{ksp-niter}{
+ ylabel={Number of iterations},
+ legend style={at={(1.,1)}, anchor=north east,font=\small,legend columns=2}
+ }
+ \caption{Number of iterations of GMRES}
+ \end{subfigure}
+ \begin{subfigure}[c]{\textwidth}
+ \centering
+ \chartLinePlot{algebraic-solve}{
+ ylabel={Execution time [s]},
+ legend style={at={(0.,1)}, anchor=north west,font=\small,legend columns=2}
+ }
+ \caption{Execution time}
+ \end{subfigure}
+ \caption{NAFEMS LE10 - Performance measures of algebraic solver
+    (Mesh and polynomial order comparison) - GMRES preconditioned by GAMG - Discoverer supercomputer}
+ \label{fig:feelpp:wp3:nafems-le10:performance_measures_all}
+\end{figure}
+
+
+\paragraph{Challenges Identified:} %Describe any bottlenecks encountered (e.g.,
+ %memory usage, parallelization inefficiencies)
+ %and how they impacted the benchmark.
+
+\begin{itemize}
+\item Tune the GAMG preconditioner.
+\item Improve the HPC scalability at large/extreme scale.
+\item Reduce the memory footprint.
+\end{itemize}
+
+
+\subsubsection{Benchmark \#3: Thermo-Electric Coupling}
+\label{sec:WP3:Feelpp:benchmark:hl-31}
+
+\paragraph{Description:} % Briefly describe the benchmark case, including the
+ % problem size, target architecture (e.g., CPU, GPU),
+ % and the input data. Mention the specific goals of the
+ % benchmark (e.g., testing scalability, energy
+ % efficiency).
+The benchmark description can be found in
+\cref{sec:WP1:Feelpp:benchmark:hl-31}.
+
+The thermoelectric model is initially assumed to be linear; the non-linear case (full coupling) will be analyzed in a future study.
+For this purpose, the PDE of the electrical model is considered to be
+temperature independent. The solution strategy therefore consists in solving
+the electrical problem first and then the thermal problem (which includes the
+contribution of the previous solution). This decoupled approach allows
+us to concentrate on the algebraic solvers of each physics.
+
+To solve the linear systems, we use a preconditioned GMRES algorithm for each
+of them. The preconditioner used is the algebraic multigrid GAMG (PETSc). The
+setup is identical for both physics except for the smoother:
+\begin{itemize}
+\item \textbf{Heat}: KSPCHEBYSHEV + JACOBI
+\item \textbf{Electric}: KSPRICHARDSON + SOR
+\end{itemize}
+
+Note: the choice of the smoother for the electric problem seems important to ensure
+convergence (or even the mere stability of the iterative method). This will be
+studied in more detail in future work.
+
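+For illustration, the sketch below mirrors this decoupled strategy with
+petsc4py: one GMRES+GAMG solver per physics, differing only in the multigrid
+smoother. It is not the actual \Feelpp configuration; the matrices,
+right-hand sides and the option prefixes \texttt{electric\_} and
+\texttt{heat\_} are assumptions used for the example.
+\begin{verbatim}
+# Illustrative petsc4py sketch of the decoupled thermo-electric solve:
+# electric problem first, then heat, each with GMRES + GAMG and its own
+# smoother (Richardson/SOR for electric, Chebyshev/Jacobi for heat).
+# The matrices and right-hand sides are assumed to be assembled elsewhere.
+from petsc4py import PETSc
+
+def make_solver(A, prefix, smoother_ksp, smoother_pc):
+    opts = PETSc.Options()
+    opts[prefix + "ksp_type"] = "gmres"
+    opts[prefix + "pc_type"] = "gamg"
+    opts[prefix + "mg_levels_ksp_type"] = smoother_ksp
+    opts[prefix + "mg_levels_pc_type"] = smoother_pc
+    ksp = PETSc.KSP().create(comm=A.getComm())
+    ksp.setOptionsPrefix(prefix)
+    ksp.setOperators(A)
+    ksp.setFromOptions()
+    return ksp
+
+def solve_thermoelectric(A_el, b_el, A_heat, b_heat):
+    ksp_el = make_solver(A_el, "electric_", "richardson", "sor")
+    ksp_heat = make_solver(A_heat, "heat_", "chebyshev", "jacobi")
+    v = A_el.createVecRight()
+    ksp_el.solve(b_el, v)       # electric potential first
+    # (in the real model, b_heat is then updated with the Joule heating
+    #  term built from v before the thermal solve)
+    T = A_heat.createVecRight()
+    ksp_heat.solve(b_heat, T)   # then temperature
+    return v, T
+\end{verbatim}
+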
+\paragraph{Benchmarking Tools Used:} % List the tools used for performance
+ % analysis, such as Extrae, Score-P, TAU,
+ % Vampir, or Nsight, and specify what metrics
+ % were measured (e.g., execution time, FLOPS,
+ % energy consumption).
+See \cref{sec:WP1:Feelpp:benchmark:hl-31}.
+
+\paragraph{Input/Output Dataset Description:}
+See \cref{sec:WP1:Feelpp:benchmark:hl-31}.
+
+\paragraph{Results Summary:} % Include a summary of key metrics (execution time,
+ % memory usage, FLOPS) and their comparison across
+ % architectures (e.g., CPU, GPU).
+
+The results are presented in \cref{fig:feelpp:wp3:hl-3:performance_measure_gaya}
+and \cref{fig:feelpp:wp3:hl-3:performance_measure_discoverer}. They correspond
+respectively to the execution time of the algebraic solve on the Gaya and Discoverer supercomputers.
+On both machines, we obtain good strong scalability up to 256 CPU cores.
+The results are not as good for a larger distribution: the execution
+time is no longer reduced, with some bumps that we do not understand for now.
+However, for a large number of tests, the computation time remains reasonable, although there is a limit.
+
+In terms of the number of iterations, the results are very good and encouraging:
+there is little impact from the number of CPU cores (except for
+a few runs of the electric problem).
+
+
+
+\foreach [expand list=true] \supercomputerFile/\supercomputerName in {gaya/Gaya,discoverer/Discoverer}
+{
+\begin{figure}
+ \centering
+
+ \foreach [expand list=true] \polyId in {1,2} {
+
+ \pgfplotstableread[col sep=comma]{\currfiledir/../WP1/data/HL-31_M1_P\polyId_\supercomputerFile.csv}\dataMa
+
+ \def\plotSetup{
+ {table=dataMa,column=algebraic-solve,legend=M1,color=customdarkblue}
+ }
+
+
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \barChart[ybar, width=\textwidth, height=0.6172\textwidth,
+ xticklabels from table={\dataMa}{nProc},
+ x tick label style={ rotate=-45 },
+ every axis legend/.code={\let\addlegendentry\relax}, %ignore legend locally
+ %legend style={at={(0.5,1)}, anchor=south,font=\tiny,legend columns=-1}
+ ]{\plotSetup}
+ \caption{Execution time - $P_{\polyId}$}
+ \end{subfigure}
+ \begin{subfigure}[c]{0.49\textwidth}
+ \centering
+ \begin{tikzpicture}
+ \def\myLineWidth{2pt}
+ \begin{axis}[
+ width=\textwidth, height=0.6172\textwidth,
+ % xtick=data,
+ xmajorgrids=true, xminorgrids=false, minor x tick num=3,
+ ymajorgrids=true, %yminorgrids=true,
+ minor y tick num=2,
+ % xticklabel={\pgfmathparse{100*\tick}\pgfmathprintnumber[precision=0]{\pgfmathresult}\%},
+ xticklabel={\pgfmathparse{\tick}\pgfmathprintnumber[fixed,set thousands separator={},precision=0]{\pgfmathresult}},
+ xlabel={Number of CPU cores}, ylabel={Number of iterations},
+ legend style={at={(0.5,1)}, anchor=south,font=\small,legend columns=3}
+ ]
+ \addplot[color=customdarkblue,mark=o,line width=\myLineWidth] table [x=nProc, y=heat.ksp-niter] {\dataMa};
+ \addlegendentry{heat}
+ \addplot[color=customcyan,mark=triangle,line width=\myLineWidth] table [x=nProc, y=electric.ksp-niter] {\dataMa};
+ \addlegendentry{electric}
+ \end{axis}
+ \end{tikzpicture}
+ \caption{Number of iterations of GMRES - $P_{\polyId}$}
+ \end{subfigure}
+ \vspace*{0.04\textwidth}
+ }
+
+ \caption{HL-31 benchmarks - Performance measures of algebraic solver
+    (Mesh comparison) - GMRES preconditioned by GAMG - \supercomputerName \ supercomputer}
+ \label{fig:feelpp:wp3:hl-3:performance_measure_\supercomputerFile}
+\end{figure}
+}
+
+\paragraph{Challenges Identified:} % Describe any bottlenecks encountered (e.g.,
+ % memory usage, parallelization inefficiencies)
+ % and how they impacted the benchmark.
+
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+\item Tune the GAMG preconditioner.
+\item Improve the HPC scalability at large/extreme scale.
+\item Reduce the memory footprint.
\end{itemize}
-In~\cref{tab:WP3:Feel++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+\iffalse
+\subsubsection{Benchmark \#4: CFD FDA Benchmark}
+
+\fullcite{chabannes_high_2017}
+\paragraph{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+
+\paragraph{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+
+\paragraph{Input/Output Dataset Description:}
+\paragraph{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+
+\paragraph{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+
+\paragraph{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+
+\paragraph{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
-\begin{table}[h!]
+\paragraph{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+
+\fi
+
+\subsubsection{Benchmark \#4: Conjuguate heat transfer}
+\label{sec:WP3:Feelpp:benchmark4}
+
+
+The benchmark has been described earlier in this document, in \Cref{sec:WP1:Feelpp:benchmark4}; it simulates the aqueous humor flow in the human eye, coupled with heat transfer.
+The strategy employed to simulate the non-linear model implemented in the \texttt{heatfluid} toolbox of \Feelpp consists of a fixed-point iteration.
+Without efficient preconditioning, iterative solvers do not converge when solving this system.
+Direct methods are only usable for small problems: otherwise, memory and computational costs can exceed the available resources, making them unsuitable for larger problems.
+
+Precisely, we employ a \emph{fieldsplit}\index{fieldsplit}, or PDE-based, strategy, allowing us to define what we could call ``sub-preconditioners'' for each system block.
+We split the system into two blocks according to the fluid and heat unknowns.
+The fluid block is solved using a Schur complement approach~\cite{elman_finite_2014}, while the heat block is solved using a few iterations of GAMG.
+%
+A more in-depth description of the preconditioner can be found in~\cite{saigre_coupled_2024_paper}.
+
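+The following petsc4py sketch illustrates the structure of this block
+preconditioner: an outer fieldsplit into fluid and heat blocks, a Schur
+complement on the fluid block and a few multigrid sweeps on the heat block. It
+is a minimal sketch and not the actual \texttt{heatfluid} configuration; the
+index sets \texttt{is\_fluid} and \texttt{is\_heat} identifying the unknowns of
+each block are assumed to be provided by the discretization, and the inner
+velocity/pressure split of the fluid block is omitted.
+\begin{verbatim}
+# Illustrative petsc4py sketch of a fieldsplit ("PDE-based") preconditioner
+# with a fluid block handled by a Schur-complement split and a heat block
+# handled by a few sweeps of algebraic multigrid.  is_fluid and is_heat are
+# index sets assumed to be provided by the discretization.
+from petsc4py import PETSc
+
+def make_heatfluid_solver(A, is_fluid, is_heat):
+    ksp = PETSc.KSP().create(comm=A.getComm())
+    ksp.setOperators(A)
+    ksp.setType("gmres")
+
+    pc = ksp.getPC()
+    pc.setType("fieldsplit")
+    pc.setFieldSplitIS(("fluid", is_fluid), ("heat", is_heat))
+
+    opts = PETSc.Options()
+    # Fluid block: itself a (velocity, pressure) saddle point, handled with
+    # a Schur-complement fieldsplit (the inner velocity/pressure fields and
+    # the Schur preconditioner would be defined here as well).
+    opts["fieldsplit_fluid_pc_type"] = "fieldsplit"
+    opts["fieldsplit_fluid_pc_fieldsplit_type"] = "schur"
+    # Heat block: a few Richardson iterations preconditioned by GAMG.
+    opts["fieldsplit_heat_ksp_type"] = "richardson"
+    opts["fieldsplit_heat_ksp_max_it"] = 3
+    opts["fieldsplit_heat_pc_type"] = "gamg"
+
+    ksp.setFromOptions()
+    return ksp
+\end{verbatim}
+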
+
+\subsection{12-Month Roadmap}
+\label{sec:WP3:Feelpp:roadmap}
+
+In this section, we describe the roadmap for improving benchmarks and addressing the challenges identified.
+It mainly follows the same roadmap as WP1, see~\Cref{sec:WP1:Feelpp:roadmap}.
+%% \begin{itemize}
+%% \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+%% \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+%% \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%% \end{itemize}
+
+In~\cref{tab:WP3:Feelpp:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
+
+\begin{table}[!ht]
\centering
-
-
+
+
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & provide short description here \\
-\rowcolor{numpexlightergray} B2 - Interconnect Technology & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity &see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & see~\cref{tab:WP1:Feelpp:bottlenecks}\\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B2 - Interconnect Technology & see~\cref{tab:WP1:Feelpp:bottlenecks}\\
+\rowcolor{white} B6 - Data Management & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & enable HPDDM and better preconditioner configurations at large scale (e.g., algebraic multigrid); enable algebraic saddle-point preconditioners from WP3; enable hp-mortar and compression strategies; update with the latest PETSc improvements and start using the PETSc GPU API\\
+\hline
\end{tabular}
}
}
- \caption{WP3: Feel++ plan with Respect to Relevant Bottlenecks}
- \label{tab:WP3:Feel++:bottlenecks}
-\end{table}
\ No newline at end of file
+ \caption{WP3: \Feelpp plan with Respect to Relevant Bottlenecks}
+ \label{tab:WP3:Feelpp:bottlenecks}
+\end{table}
diff --git a/software/feelpp/WP4/WP4.tex b/software/feelpp/WP4/WP4.tex
index ee5e70f..d55fc93 100644
--- a/software/feelpp/WP4/WP4.tex
+++ b/software/feelpp/WP4/WP4.tex
@@ -1,7 +1,8 @@
-\section{Software: Feel++}
+%!TEX root = ../../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
\label{sec:WP4:Feel++:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -37,6 +38,10 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
+\rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+ Christophe Prud'homme (UNISTRA)\\
+ Vincent Chabannes (UNISTRA)
+ \end{tabular}\\
\bottomrule
\end{tabular}
}}
@@ -48,19 +53,19 @@ \subsection{Software Overview}
In~\cref{tab:WP4:Feel++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} stochastic data assimilation: ensemble & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} stochastic and deterministic data assimilation: ensemble Kalman filter & \Feelpp supports ensemble runs and the ensemble Kalman filter (EnKF)~\cite{asch_data_2016}\\
\end{tabular}
}
}
@@ -73,82 +78,74 @@ \subsection{Parallel Capabilities}
\label{sec:WP4:Feel++:performances}
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
-
+\begin{description}
+  \item[Programming Environment] MPI.
+  \item[Computation Environment] CPU only.
+  \item[Parallel Capabilities] The data assimilation algorithms built on top of the \Feelpp library are parallelized using MPI.
+  \item[Scalability] Ensemble runs are parallelized using MPI, and the ensemble Kalman filter is expected to scale to a large number of cores.
+  \item[Integration with Other Systems] OpenTURNS is often coupled with \Feelpp for uncertainty quantification.
+\end{description}
\subsection{Initial Performance Metrics}
\label{sec:WP4:Feel++:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP4. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+% This section provides a summary of initial performance benchmarks performed in the context of WP4. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+
+%\begin{description}
+% \item[Overall Performance] Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item[Input/Output Dataset] Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item[open-data Access] Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item[Challenges] Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item[Future Improvements] Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+%\end{description}
+
+There are no benchmarks defined yet for data assimilation and inverse problems that can be shared in this deliverable.
+The work is ongoing and will be reported in the next deliverable.
\subsection{12-Month Roadmap}
\label{sec:WP4:Feel++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+In this section, we describe the roadmap for improving benchmarks and addressing the challenges identified.
+It mainly follows the same roadmap as WP1, see~\Cref{sec:WP1:Feelpp:roadmap}.
+
+
+%In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+%\begin{description}
+% \item[Data Improvements] Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+% \item[Methodology Application] Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+% \item[Results Retention] Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%\end{description}
In~\cref{tab:WP4:Feel++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
-
-
+
+
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & provide short description here \\
-\rowcolor{numpexlightergray} B2 - Interconnect Technology & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & see~\cref{tab:WP1:Feelpp:bottlenecks}\\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B2 - Interconnect Technology & see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{white} B6 - Data Management &see~\cref{tab:WP1:Feelpp:bottlenecks} \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Enable verified benchmarking of the deterministic/stochastic EnKF and, more generally, of ensemble runs\\
\end{tabular}
}
}
diff --git a/software/feelpp/WP5/WP5.tex b/software/feelpp/WP5/WP5.tex
index 49b406d..6c2f67f 100644
--- a/software/feelpp/WP5/WP5.tex
+++ b/software/feelpp/WP5/WP5.tex
@@ -1,7 +1,8 @@
-\section{Software: Feel++}
-\label{sec:WP5:Feel++:software}
+%!TEX root = ../../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
+\label{sec:WP5:Feelpp:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -10,7 +11,7 @@ \section{Software: Feel++}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-Feel++ Consortium\\
+\Feelpp Consortium\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CNRS\\
@@ -37,121 +38,121 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
+\rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+Christophe Prud'homme (UNISTRA)\\
+Vincent Chabannes (UNISTRA)\\
+Lucas Palazzolo (INRIA)\\
+Yannick Privat (UL)
+\end{tabular}\\
\bottomrule
\end{tabular}
}}
- \caption{WP5: Feel++ Information}
+ \caption{WP5: \Feelpp Information}
\end{table}
\subsection{Software Overview}
-\label{sec:WP5:Feel++:summary}
+\label{sec:WP5:Feelpp:summary}
-In~\cref{tab:WP5:Feel++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+In~\cref{tab:WP5:Feelpp:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} Iterative methods & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} Iterative methods: Shape Optimization & Iterative methods for shape optimization based on the work \cite{feppon_f_null_2020} have been implemented in \url{https://github.com/feelpp/feelpp-shapo} using \Feelpp toolboxes.\\
\end{tabular}
}
}
- \caption{WP5: Feel++ Features}
- \label{tab:WP5:Feel++:features}
+ \caption{WP5: \Feelpp Features}
+ \label{tab:WP5:Feelpp:features}
\end{table}
\subsection{Parallel Capabilities}
-\label{sec:WP5:Feel++:performances}
+\label{sec:WP5:Feelpp:performances}
+
+The parallel properties are detailed in~\cref{sec:WP1:Feelpp:performances}, since almost the entire procedure depends on \Feelpp and its parallel capabilities.
+However, mesh adaptation depends on MMG/ParMMG.
+The advanced combination of toolboxes and the communication between them is a key feature for shape optimization.
+\subsection{Initial Performance Metrics}
+\label{sec:WP5:Feelpp:metrics}
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+We have developed the benchmark described in~\cref{sec:WP5:Feelpp:benchmark1}; initial results are available in the repository~\cite{Palazollo_Feel_Shape_Optimization} and the report~\cite{palazzolo2023shape}. However, we are still working on dataset management, performance monitoring, and verification and validation benchmarks.
+We will provide a more detailed report in the next version of this deliverable.
+\iffalse
-\subsection{Initial Performance Metrics}
-\label{sec:WP5:Feel++:metrics}
This section provides a summary of initial performance benchmarks performed in the context of WP5. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
\begin{itemize}
\item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{Input/Output Dataset:} a json file containing the description of the shape optimization problem and a json and partition mesh file for each toolbox used in the optimization problem.
+ \item \textbf{open-data Access:} the dataset is available at \url{https://github.com/feelpp/feelpp-shapo}
+ \item \textbf{Challenges:} Robust scalable solution strategies for the different steps of the shape optimization problem. I/O and memory management are also critical.
+ \item \textbf{Future Improvements:} Implement more efficient solution strategies for the shape optimization problem. Optimize I/O and memory management for large-scale simulations.
\end{itemize}
+\fi
+
+\subsubsection{Benchmark \#1 - Shape Optimization under Stokes Flow}
+\label{sec:WP5:Feelpp:benchmark1}
+
+\paragraph{Description:} In \cite{palazzolo2023shape}, geometric shape optimization has been performed using Céa's method to compute the shape gradient \cite{cea_conception_1986}. The study employs iterative methods such as gradient descent and null-space methods \cite{feppon_f_null_2020}. The benchmark focuses on optimizing the shape of 2D and 3D objects under Stokes flow using these methods. Notably, in the 2D case, the optimized shape resembles a rugby ball \cite{pironneau_optimum_1974}. We use this test case because we have a criterion for the reference solution of the shape optimization problem. We will investigate other problems in the future.
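+Schematically, the problem reads as follows; this is a sketch assuming the classical dissipated-energy (drag) objective at fixed volume, the precise functional, constraints and numerical treatment being those of \cite{palazzolo2023shape}:
+\[
+  \min_{\omega} \; J(\omega) = 2\nu \int_{\Omega \setminus \overline{\omega}} \varepsilon(u) : \varepsilon(u) \, dx
+  \quad \mbox{subject to} \quad |\omega| = V_0,
+\]
+where $\omega$ is the shape of the object, $\Omega \setminus \overline{\omega}$ the surrounding fluid domain, $u$ the Stokes velocity field and $\varepsilon(u) = \frac{1}{2}(\nabla u + \nabla u^{T})$ the strain rate tensor.
+Céa's method is used to identify, via the primal and adjoint problems, a shape-gradient density $\mathcal{G}$ such that $J'(\omega)(\theta) = \int_{\partial\omega} \mathcal{G}\, \theta \cdot n \, ds$; the shape is then updated as $\omega_{k+1} = (\mathrm{Id} + t_k \theta_k)(\omega_k)$, with the descent direction $\theta_k$ obtained from a regularized extension of $-\mathcal{G} n$ (gradient descent) or from the null-space method of \cite{feppon_f_null_2020} when constraints are present.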
+
+\paragraph{Benchmarking Tools Used:} The optimization problems are tackled using various \Feelpp toolboxes designed for solving the associated PDEs of the primal, adjoint, and extension problems.
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
\subsection{12-Month Roadmap}
-\label{sec:WP5:Feel++:roadmap}
+\label{sec:WP5:Feelpp:roadmap}
+
+In this section, we describe the roadmap for improving benchmarks and addressing the challenges identified.
+It mainly follows the same roadmap as WP1, see~\cref{sec:WP1:Feelpp:roadmap}.
+\iffalse
In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+\begin{description}
+ \item[Data Improvements] Proceed the same way as \cref{sec:WP1:Feelpp:metrics} to improve the dataset management.
+ \item[Methodology Application] Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+ \item[Results Retention] Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+\end{description}
+\fi
-In~\cref{tab:WP5:Feel++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+In~\cref{tab:WP5:Feelpp:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
-
-
+
+
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & provide short description here \\
-\rowcolor{numpexlightergray} B2 - Interconnect Technology & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & Same as~\cref{sec:WP1:Feelpp:roadmap}\\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Same as~\cref{sec:WP1:Feelpp:roadmap} \\
+\rowcolor{white} B12 - Pre/Post Processing and In-Situ Processing & Same as~\cref{sec:WP1:Feelpp:roadmap}; reduce I/O load and enable checkpoint restart strategies \\
+\rowcolor{numpexlightergray} B2 - Interconnect Technology & Same as~\cref{sec:WP1:Feelpp:roadmap} \\
+\rowcolor{white} B6 - Data Management & Same as~\cref{sec:WP1:Feelpp:roadmap} \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Enable efficient shape optimization benchmarking; define new benchmarks with reference solutions; work on multiphysics shape optimization \\
\end{tabular}
}
}
- \caption{WP5: Feel++ plan with Respect to Relevant Bottlenecks}
- \label{tab:WP5:Feel++:bottlenecks}
+ \caption{WP5: \Feelpp plan with Respect to Relevant Bottlenecks}
+ \label{tab:WP5:Feelpp:bottlenecks}
\end{table}
\ No newline at end of file
diff --git a/software/feelpp/WP7/WP7.tex b/software/feelpp/WP7/WP7.tex
index 0d85d86..3052edf 100644
--- a/software/feelpp/WP7/WP7.tex
+++ b/software/feelpp/WP7/WP7.tex
@@ -1,7 +1,8 @@
+%!TEX root = ../../../exa-ma-d7.1.tex
\section{Software: Feel++}
\label{sec:WP7:Feel++:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -40,18 +41,18 @@ \subsection{Software Overview}
Provide a brief overview of the software with respect to WP7.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
\rowcolor{white} Health & provide short description here \\
\rowcolor{numpexlightergray} Environment & provide short description here \\
\rowcolor{white} Energy & provide short description here \\
diff --git a/software/feelpp/feelpp.tex b/software/feelpp/feelpp.tex
index b7e4f67..82170c9 100644
--- a/software/feelpp/feelpp.tex
+++ b/software/feelpp/feelpp.tex
@@ -1,9 +1,8 @@
-\section{Software: Feel++}
-\label{sec:Feel++:software}
+%!TEX root = ../../exa-ma-d7.1.tex
+\section{Software: \texorpdfstring{\Feelpp}{Feel++}}
+\label{sec:Feelpp:software}
-
-
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -12,7 +11,7 @@ \section{Software: Feel++}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-Feel++ Consortium\\
+\Feelpp Consortium\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CNRS\\
@@ -39,119 +38,315 @@ \section{Software: Feel++}
B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
- \bottomrule
+ \hline
\end{tabular}
}}
- \caption{Feel++ Information}
+ \caption{\Feelpp Information}
\end{table}
\subsection{Software summary}
-\label{sec:Feel++:summary}
-Detailed overview not available.
+\label{sec:Feelpp:summary}
+
+\Feelpp is a powerful framework designed to solve problems based on ordinary differential equations (ODEs) and partial differential equations (PDEs).
+It leverages modern \Cpp{} standards, including \Cpp{17} and \Cpp{20}, and also provides a Python layer through Pybind11.
+The suite is particularly effective for parallel computing, with seamless integration of parallelism through default communicators, including support for ensemble runs.
+
+
+
+\Feelpp provides several specialized toolboxes to address various fields of numerical simulation:
+\begin{inparaenum}[(i)]
+ \item fluid mechanics,
+ \item solid mechanics,
+ \item heat transfer and conjugate heat transfer,
+ \item fluid structure interaction,
+ \item electrostatics and magnetostatics,
+ \item thermoelectric,
+ \item level set and multifluid.
+\end{inparaenum}
+Each toolbox is tailored to specific types of problems, ensuring high performance and accuracy.
+
+\Feelpp is used in a variety of fields, particularly in health, industry, and physics. Below are some examples of applications:
+\begin{inparaenum}[(i)]
+ \item \textbf{Health (Brain)}: Applications related to brain modeling.
+ \item \textbf{Health (Tumor Cells)}: Simulation of tumor cell behavior.
+ \item \textbf{Industry (ROM, UQ)}: Reduced-order modeling and uncertainty quantification in industrial settings.
+ \item \textbf{Automotive (CFD, ROM)}: Computational fluid dynamics and reduced-order models in automotive engineering.
+ \item \textbf{Physics (High Field Magnets)}: Simulations of high field magnets.
+ \item \textbf{Health (Rheology)}: Modeling blood rheology.
+ \item \textbf{Health (Eye/Brain)}: Coupling between the eye and the brain in medical simulations.
+\end{inparaenum}
+
+The \Feelpp core library provides a wide range of numerical methods for solving PDEs, with support for finite elements in various Sobolev spaces with scalar, vector and matrix values. For example:
+\begin{inparaenum}[(i)]
+ \item \( L^2 \), \( \mathbf{L}^2 \), \( \mathbb{L}^2 \)
+ \item \( H^1 \), \( \mathbf{H}^1 \), \( \mathbb{H}^1 \)
+ \item \( \mathbf{H}(\text{div}) \), \( \mathbf{H}(\text{curl}) \)
+\end{inparaenum}
+
+\Feelpp supports a wide variety of element types and value types, including single, double, and quad precision, as well as complex numbers. The supported problem domains include 0D to 3D, with or without time dependence.
+
+Here is a typical example of solving a Laplace equation in \Feelpp:
+\begin{listing}[ht]
+ \caption{Sample \Feelpp code for solving a Laplace equation on an arbitrary domain.}
+\begin{minted}[
+ linenos, % Line numbers
+ fontsize=\scriptsize, % Reduce font size
+ bgcolor=bgcolor, % Slightly gray background
+ frame=lines, % Delimiters around the code
+ framesep=2mm, % Space between code and frame
+ rulecolor=\color{gray}, % Color of the frame
+ breaklines % Allow line breaks in long lines
+ ]{cpp}
+auto Vh = Pch<4>(mesh, markedelements(mesh, expr("<...>")));
+auto u = Vh->element(), v = Vh->element(g, "g");
+auto l = form1(_test = Vh);
+l = integrate(_range = elements(support(Vh)), _expr = f * id(v));
+l += integrate(_range = markedfaces(support(Vh), "Robin"), _expr = -r_2 * id(v));
+l += integrate(_range = markedfaces(support(Vh), "Neumann"), _expr = -un * id(v));
+
+auto a = form2(_trial = Vh, _test = Vh);
+a = integrate(_range = elements(support(Vh)), _expr = inner(k * gradt(u), grad(v)));
+a += integrate(_range = markedfaces(support(Vh), "Robin"), _expr = r_1 * idt(u) * id(v));
+a += on(_range = markedfaces(support(Vh), "Dirichlet"), _rhs = l, _element = u, _expr = g);
+a.solve(_rhs = l, _solution = u);
+\end{minted}
+\end{listing}
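+
+In the listing, \texttt{form1} builds the right-hand side and \texttt{form2} the left-hand side of the discrete problem; schematically, it solves the following variational problem (a sketch deduced from the listing itself, where $\Gamma_D$, $\Gamma_R$ and $\Gamma_N$ denote the faces marked \texttt{Dirichlet}, \texttt{Robin} and \texttt{Neumann}, and $k$, $f$, $r_1$, $r_2$, $u_n$, $g$ are the coefficients appearing in the code): find $u$ with $u = g$ on $\Gamma_D$ such that, for all test functions $v$ vanishing on $\Gamma_D$,
+\[
+  \int_{\Omega} k \, \nabla u \cdot \nabla v \, dx + \int_{\Gamma_R} r_1 \, u \, v \, ds
+  = \int_{\Omega} f \, v \, dx - \int_{\Gamma_R} r_2 \, v \, ds - \int_{\Gamma_N} u_n \, v \, ds .
+\]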
+
+The suite supports multiple linear algebra libraries, including PETSc, SLEPc, Eigen, and Boost::ublas, and is designed to run efficiently in both sequential and parallel environments.
+
+
+
+\Feelpp is a versatile and powerful tool for solving PDEs and ODEs in a variety of scientific and industrial applications.
+With support for high-performance computing and a broad range of numerical methods, it can be used to build tailored applications for research, development, and teaching.
\subsection{Purpose}
-\label{sec:Feel++:purpose}
-Purpose not available.
+\label{sec:Feelpp:purpose}
+\Feelpp aims to provide a flexible and efficient environment for conducting finite element analysis in multiple scientific domains.
+It allows for the rapid prototyping of numerical models and is built with parallel computing in mind to scale on modern HPC architectures.
\subsection{Programming and Computational Environment}
-\label{sec::Feel++:environment_capabilities}
+\label{sec::Feelpp:environment_capabilities}
+\Cref{tab:Feelpp:environment_capabilities} summarizes these aspects for \Feelpp, providing a view of its programming and computational capabilities.
-The following table summarizes these aspects for Feel++, providing a view of its programming and computational capabilities.
+%% \begin{table}[!ht]
+%% %% \centering
+%% {
+%% \setlength{\parindent}{0pt}
+%% \def\arraystretch{1.25}
+%% \arrayrulecolor{numpexgray}
+%% {\fontsize{9}{11}\selectfont
+%% \begin{longtable}{lp{.3\textwidth}p{.5\textwidth}}
+%% \rowcolor{gray}\textbf{\color{white}Category} & \textbf{\color{white}Details} & \textbf{\color{white}Description} \\
+%% \hline
+%% \endfirsthead % End of the first head (the header for the first page)
+%%
+%% \hline
+%% \rowcolor{gray}\textbf{Category} & \textbf{Details} & \textbf{Description} \\
+%% \hline
+%% \endhead % End of the head (for all other pages)
+%%
+%% \hline
+%% \endfoot % Footer for all but the last page
+%%
+%% \hline
+%% \endlastfoot % Footer for the last page
+%%
+%% \rowcolor{white}Languages & \begin{tabular}{l}
+%% C++\\
+%% C++17\\
+%% C++20\\
+%% Python\\
+%% \end{tabular} & \Feelpp is primarily developed in C++ and supports modern C++ standards, including C++17 and C++20, which provide enhanced performance, safety features, and modern programming paradigms for high-performance computing applications. This allows \Feelpp to leverage advanced language features such as constexpr, parallel algorithms, and enhanced lambda expressions, improving both code maintainability and computational efficiency. Additionally, \Feelpp integrates with Python, enabling scripting capabilities and facilitating ease of use for rapid prototyping, automation, and integration with scientific workflows. This dual-language support provides flexibility for both performance-critical tasks and user-friendly interfaces. \\
+%% \rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
+%% MPI\\
+%% Parallelism - C++\\
+%% Task based\\
+%% \end{tabular} & \Feelpp offers robust support for parallel computing, making it suitable for high-performance simulations. It utilizes MPI (Message Passing Interface) for distributed memory parallelism, enabling efficient communication between processes across different nodes in HPC environments. Additionally, \Feelpp implements parallelism directly in C++, allowing fine-grained control over threading and parallel execution within shared memory systems. The framework also supports task-based parallelism thanks to specx, enabling the efficient execution of independent computational tasks, improving load balancing, and optimizing resource usage across heterogeneous computing architectures. This parallelism support ensures scalability and high performance in complex multiphysics simulations.\\
+%% \rowcolor{white}Data Formats & \begin{tabular}{l}
+%% Data-management system\\
+%% Ensight\\
+%% Gmsh and associated formats\\
+%% HDF5\\
+%% JSON\\
+%% VTK\\
+%% YAML\\
+%% in-house format\\
+%% \end{tabular} & \Feelpp supports data management, including remote data management systems such as Girder and GitHub. It handles a variety of input and output formats, including Ensight, Gmsh, HDF5, JSON, VTK, YAML, and an in-house format. This flexibility enables \Feelpp to align with the \textbf{FAIR} principles (Findable, Accessible, Interoperable, and Reusable), ensuring efficient data sharing and reuse across different platforms and tools.\\
+%% \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
+%% Checkpoint restart\\
+%% \end{tabular} & \Feelpp provides resilience support through checkpoint restart functionality. This allows simulations to save their state at specific points, enabling recovery from failures without restarting from the beginning. Binary in-house formats are used for fast restarts, ensuring minimal downtime and efficient continuation of long-running computations, especially on large-scale HPC systems.\\
+%% \rowcolor{white}DevOps & \begin{tabular}{l}
+%% Continuous Benchmarking\\
+%% Continuous Delivery\\
+%% Continuous Integration\\
+%% \end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+%% \rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
+%% Debian\\
+%% Fedora\\
+%% Spack\\
+%% Ubuntu\\
+%% \end{tabular} & \Feelpp is available in various packaging formats to ensure broad compatibility across different platforms. It is packaged for popular Linux distributions and HPC environments, with specific mirrors and repositories:\\
+%% \rowcolor{white}Testing & \begin{tabular}{l}
+%% Unit\\
+%% Validation\\
+%% Verification\\
+%% \end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+%% \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
+%% Docker\\
+%% Singularity\\
+%% \end{tabular} & Container technologies used to package and deploy the software.\\
+%% \rowcolor{white}Interfaces & \begin{tabular}{l}
+%% Dymola/OpenModelica/FMU\\
+%% HPdomain decomposition methods\\
+%% MMG/ParMMG\\
+%% OpenTurns\\
+%% PETSc\\
+%% Salome\\
+%% \end{tabular} & List of software \Feelpp has interfaces with.\\
+%% \bottomrule
+%% \end{longtable}
+%% }}
+%% \caption{\Feelpp programming and computational environment}
+%% \end{table}
+
+{\fontsize{9}{11}\selectfont
+\begin{longtable}{lp{0.3\textwidth}p{0.5\textwidth}}
+ \caption{\Feelpp programming and computational environment}\label{tab:Feelpp:environment_capabilities} \\
+ \rowcolor{gray}\textbf{\color{white}Category} & \textbf{\color{white}Details} & \textbf{\color{white}Description} \\
+ \hline
+ \endfirsthead % End of the first head (the header for the first page)
+
+ \hline
+ \rowcolor{gray}\textbf{\color{white}Category} & \textbf{\color{white}Details} & \textbf{\color{white}Description} \\
+ \hline
+ \endhead % End of the head (for all other pages)
+
+ \hline
+ \endfoot % Footer for all but the last page
+
+ \hline
+ \endlastfoot % Footer for the last page
-\begin{table}[h!]
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
\rowcolor{white}Languages & \begin{tabular}{l}
-C++\\
-C++17\\
-C++20\\
-Python\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+ \Cpp{}\\
+ \Cpp{17}\\
+ \Cpp{20}\\
+ Python\\
+ \end{tabular} & \Feelpp is primarily developed in \Cpp{} and supports modern \Cpp{} standards, including \Cpp{17} and \Cpp{20}, which provide enhanced performance and safety features for high-performance computing applications. It also integrates with Python for scripting capabilities. \\
+
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
-MPI\\
-Parallelism - C++\\
-Task based\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+ MPI\\
+ Parallelism - \Cpp{}\\
+ Task based\\
+ \end{tabular} & \Feelpp offers robust support for parallel computing, utilizing MPI for distributed memory parallelism and implementing task-based parallelism to improve load balancing and resource usage. \\
+
\rowcolor{white}Data Formats & \begin{tabular}{l}
-Data-management system\\
-Ensight\\
-Gmsh and associated formats\\
-HDF5\\
-JSON\\
-VTK\\
-VTK\\
-YAML\\
-in-house format\\
-\end{tabular} & Data formats that the software can handle or produce.\\
- \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
-Checkpoint restart\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
- \rowcolor{white}DevOps & \begin{tabular}{l}
-Continuous Benchmarking\\
-Continuous Delivery\\
-Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
- \rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-Debian\\
-Fedora\\
-Spack\\
-Ubuntu\\
-\end{tabular} & Software packaging and distribution.\\
+ Data-management system\\
+ Ensight\\
+ Gmsh and associated formats\\
+ HDF5\\
+ JSON\\
+ VTK\\
+ YAML\\
+ in-house format\\
+ \end{tabular} & \Feelpp supports various data formats for efficient data sharing and reuse, aligning with the FAIR principles. \\
+
+ \rowcolor{numpexlightergray}Resilience & Checkpoint restart & \Feelpp provides resilience support through checkpoint restart functionality. This allows simulations to save their state at specific points, enabling recovery from failures without restarting from the beginning. Binary in-house formats are used for fast restarts, ensuring minimal downtime and efficient continuation of long-running computations, especially on large-scale HPC systems.\\
+
+ \rowcolor{white}DevOps & \begin{tabular}{l} Continuous Benchmarking\\
+ Continuous Delivery\\
+ Continuous Integration\\
+ \end{tabular} & \Feelpp is developed on GitHub and uses GitHub Actions workflows to ensure the quality of the developments. The main branch is protected and only reviewed and approved pull requests can be merged. The CI not only rebuilds \Feelpp but also runs the tests. Upon merge, a packaging workflow is triggered to generate Debian-based packages as well as Docker and Apptainer images. Once Apptainer images are available, benchmarks are triggered on supercomputers to check performance, see~\cref{fig:feelpp-ci} and~\cref{fig:feelpp-cb} for a graphical representation.\\
+
+ \rowcolor{numpexlightergray}Packaging & Debian, Fedora, Spack, Ubuntu & Debian-based systems: Available via the APT repository at \href{https://apt.feelpp.org}{apt.feelpp.org}. Detailed installation instructions can be found in the documentation: \href{https://docs.feelpp.org/user/latest/install/index.html}{\Feelpp Installation Guide}.
+ Fedora: Distributed via Docker images, simplifying deployment in containerized environments.
+ Spack: Supported by Spack for high-performance computing environments, with a mirror available at \href{https://ghcr.io/feelpp/spack}{ghcr.io/feelpp/spack}.
+ Ubuntu: Also available through the same APT repository for Debian-based systems.\\
+
\rowcolor{white}Testing & \begin{tabular}{l}
-Unit\\
-Validation\\
-Verification\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
- \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-Docker\\
-Singularity\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
- \rowcolor{white}Interfaces & \begin{tabular}{l}
-Dymola/OpenModelica/FMU\\
-HPdomain decomposition methods\\
-MMG/ParMMG\\
-OpenTurns\\
-PETSc\\
-Salome\\
-\end{tabular} & List of software Feel++ has interfaces with.\\
- \bottomrule
- \end{tabular}
- }}
- \caption{Feel++ programming and computational environment}
-\end{table}
+ Unit\\
+ Validation\\
+ Verification\\
+ \end{tabular} & \Feelpp includes a suite of nearly 1,000 tests executed via ctest. These tests correspond to applications run both sequentially and in parallel. In most cases, applications are tested in both modes; only on very rare occasions are they run exclusively in either sequential or parallel mode. Each application contains numerous tests that perform unit testing or verify expected mathematical properties. To detect excessive execution time regressions, timeouts are configured for the tests. Finally, some tests are mini-applications or benchmarks that perform verification or validation against reference results.\\
+ \rowcolor{numpexlightergray}Containerization & Docker, Singularity & \Feelpp supports Docker and Apptainer/Singularity containers built either from source or from Debian-based packages. Both types of container images are stored on \url{https://ghcr.io/feelpp/feelpp}. Images whose tags end with \texttt{-sif} are Apptainer images; the others are Docker images. \\
+ \rowcolor{white}Interfaces &\begin{tabular}{l}
+ Dymola/OpenModelica/FMU\\
+ hpddm\\
+ MMG/ParMMG\\
+ OpenTurns\\
+ PETSc\\
+ Salome\\
+ CGAL\\
+ Scimba
+ \end{tabular} & \Feelpp can easily be interfaced with various software thanks to the use of standard open formats and a flexible \Cpp{} design.\\
-\subsection{Mathematics}
-\label{sec:Feel++:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+ \end{longtable}
+ }
-\subsection{Relevant Publications}
-\label{sec:Feel++:publications}
-Here is a list of relevant publications related to the software:
+ \Cref{fig:feelpp-ci} shows the \Feelpp Continuous Integration workflow using GitHub Actions.
+ \begin{figure}
+ \centering
+ \includegraphics[width=0.8\textwidth]{graphics/feelpp/feelpp-ci-workflow.png}
+ \caption{\Feelpp Continuous Integration Workflow using GitHub Actions}
+ \label{fig:feelpp-ci}
+ \end{figure}
+ \Cref{fig:feelpp-cb} shows the \Feelpp Continuous Benchmarking workflow using GitHub Actions and ReFrame, which is quite close to the methodology described in~\cref{sec:methodology-regression-reframe}.
+ \begin{figure}
+ \centering
+ \includegraphics[width=0.8\textwidth]{graphics/feelpp/feelpp-cb-workflow.png}
+ \caption{\Feelpp Continuous Benchmarking Workflow using GitHub Actions and ReFrame}
+ \label{fig:feelpp-cb}
+ \end{figure}
-\subsection{Acknowledgements}
-\label{sec::Feel++:acknowledgements}
+\subsection{Mathematics}
+\label{sec:Feelpp:mathematics}
+\Feelpp is based on the \ac{FEM}, which is used for solving partial differential equations (PDEs) in complex geometries.
+It leverages advanced numerical techniques to ensure accuracy and scalability, including adaptive mesh refinement, domain decomposition, and error estimation.
-The software has been developed with the support of the following funding agencies and institutions:
+\Cref{fig:Feelpp:components} shows the main components of \Feelpp, including the finite element method, mesh generation, and solvers, as well as the components that will be tested in the benchmarks of this deliverable.
+\begin{figure}
+ \centering
+ \input{graphics/feelpp/feelpp-components.tex}
+ \caption{Main dependencies of \Feelpp{} (bottom rows), its main components, as well as the flagship applications (top row).}
+ \label{fig:Feelpp:components}
+\end{figure}
-Acknowledgements not available.
+\subsection{Relevant Publications}
+\label{sec:Feelpp:publications}
+Relevant publications include:
+\begin{description}
+ \item[\fullcite{prudhomme_feel_2012}] presents a detailed exploration of \Feelpp's design, focusing on its embedded variational language as a \ac{DSEL} and abstraction mechanisms that simplify the implementation of complex numerical methods. The framework is demonstrated with examples involving non-overlapping domain decomposition techniques and fictitious domain methods.
+ \item[\fullcite{prudhomme_feelppfeelpp_2024}] is the latest preview release of the \Feelpp software. The documentation is available online at \url{https://docs.feelpp.org}.
+ \item[\fullcite{van_landeghem_mathematical_2024}] The paper \emph{Mathematical and computational framework for moving and colliding rigid bodies in a Newtonian fluid} (2024) presents a mathematical model and computational strategy to simulate the dynamics of moving and colliding rigid bodies within a Newtonian fluid. This work focuses on accurately modeling fluid-structure interactions as a step towards simulating micro-swimmers.
+ \item[\fullcite{saigre_model_2024}] The paper \emph{Model Order Reduction and Sensitivity Analysis for Complex Heat Transfer Simulations Inside the Human Eyeball} presents advanced computational methods to simulate heat transfer processes within the human eye. The authors use reduced-order modeling to improve the efficiency and accuracy of these simulations, contributing to better insights in biomedical engineering through advanced analysis, in particular sensitivity analysis.
+ \item[\fullcite{prudhomme_ktirio_2024}] The paper \emph{Ktirio Urban Building: A Computational Framework for City Energy Simulations Enhanced by CI/CD Innovations on EuroHPC Systems} (2024) presents a high-performance computational framework designed to support the European Union's Horizon 2050 initiative by improving energy consumption in buildings at city scale and beyond. The framework, known as Ktirio Urban Building (KUB), leverages Continuous Integration/Continuous Deployment (CI/CD) innovations to streamline large-scale city energy simulations on EuroHPC JU supercomputers.
+ \item[\fullcite{saigre_coupled_2024_abstract}] The paper \emph{A coupled fluid-dynamics-heat transfer model for 3D simulations of the aqueous humor flow in the human eye} presents a coupled fluid-dynamics-heat transfer model to simulate the flow of aqueous humor in the human eye, providing insights into the impact of the postural orientation of the patient on the fluid dynamics and heat transfer processes within the eye. A full-length paper based on this extended abstract is in preparation \cite{saigre_coupled_2024_paper}.
+ \item[\fullcite{palazzolo2023shape}] The master's thesis \emph{Shape Optimization for Rigid Objects in a Stokes Flow} (2023) presents a comprehensive study of shape optimization for rigid objects within a Stokes flow. The research focuses on determining the optimal shapes of 2D and 3D objects using iterative methods, including gradient descent and null-space techniques. The implementation of this work is publicly available at \url{https://feelpp.github.io/feelpp-shapo/shapo/index.html}.
+\end{description}
+\subsection{Acknowledgements}
+\label{sec::Feelpp:acknowledgements}
+The software has been developed with the support of the following funding agencies and institutions through various research projects:
+\begin{itemize}
+ \item Université de Strasbourg
+ \item Cemosis
+ \item CNRS
+ \item ANR
+ \item Région Grand Est
+ \item AMIES
+ \item European Commission
+ \item EuroHPC JU
+\end{itemize}
diff --git a/software/freefempp/WP1/WP1.tex b/software/freefempp/WP1/WP1.tex
index f71a88c..7d28d5f 100644
--- a/software/freefempp/WP1/WP1.tex
+++ b/software/freefempp/WP1/WP1.tex
@@ -43,6 +43,11 @@ \section{Software: Freefem++}
\subsection{Software Overview}
\label{sec:WP1:Freefem++:summary}
+FreeFEM can solve partial differential equations (PDEs) with the Finite Element Method (FEM) in 1D, 2D and 3D, as well as with the Boundary Element Method (BEM) in 2D and 3D. It handles 1D (curve) meshes as well as 2D and 3D (surface and volume) simplicial meshes.\\
+It ships with an internal 2D unstructured mesh generator called \textit{bamg} and uses in-house tools for 2D anisotropic mesh adaptation. For 3D anisotropic mesh adaptation, FreeFEM is interfaced with the tetgen, MMG and ParMMG libraries and uses Mshmet for metric computation. MMG can handle 2D meshes as well as 3D surface and volume meshes and also has level-set discretization (curve and surface) capabilities.\\
+
+FreeFEM implements many standard finite elements for discrete versions of functions in $H^1$, $H(\text{div})$, $H(\text{curl})$ and $L^2$, as well as more exotic finite elements such as non-conforming Lagrange, Hsieh-Clough-Tocher (HCT) and Brezzi–Douglas–Marini (BDM) elements. In addition, it is possible for the user to add their own finite elements to the language by defining a C++ plugin.\\
+
In~\cref{tab:WP1:Freefem++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
\begin{table}[h!]
@@ -57,11 +62,11 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} cG & provide short description here \\
-\rowcolor{numpexlightergray} dG/hdG & provide short description here \\
-\rowcolor{white} mesh adaptation & provide short description here \\
-\rowcolor{numpexlightergray} multiphysics coupling & provide short description here \\
-\rowcolor{white} unstructured mesh & provide short description here \\
+\rowcolor{white} cG & Continuous Lagrange finite element spaces up to degree 4. Raviart-Thomas or Nedelec edge elements up to third order. \\
+\rowcolor{numpexlightergray} dG/hdG & Discontinuous Galerkin methods (NIPG, NCG, DG) up to degree 4. \\
+\rowcolor{white} mesh adaptation & FreeFEM uses in-house tools for 2D mesh generation and mesh adaptation. It is interfaced with tetgen, MMG and ParMMG for 3D mesh adaptation. \\
+\rowcolor{numpexlightergray} multiphysics coupling & The DSL supports coupled variational formulations with cartesian product spaces involving different meshes or mesh types (surface / volume coupling). \\
+\rowcolor{white} unstructured mesh & FreeFEM works with simplicial unstructured meshes (segments in 1D, triangles in 2D and 3D surface, tetrahedra in 3D). \\
\end{tabular}
}
}
@@ -73,58 +78,103 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP1:Freefem++:performances}
+\textbf{Mesh generation} Currently, the initial mesh generation step is not parallel; the \textit{bamg} 2D mesh generator, the 3D primitives (ellipses, cuboids) and mesh extrusion from a 2D mesh are all implemented sequentially. For 3D anisotropic mesh adaptation, FreeFEM is interfaced with tetgen, MMG and ParMMG; only the ParMMG library is parallel.
+
+\textbf{Mesh partitioning} The next step of a FreeFEM simulation is to distribute the mesh among the MPI processes. This is done using either a user-defined partitioning or an automatic graph partitioner such as SCOTCH, METIS or ParMETIS. Only ParMETIS has parallel capabilities. For large scale problems, avoiding a large global mesh can be achieved by starting from a coarser description of the geometry using a coarser initial mesh, partitioning and distributing the coarse mesh, and finally splitting (element-wise) the local subdomain meshes in parallel to reach the desired level of refinement. Of course this is not always applicable depending on the complexity of the geometry or the nature of the input.
+
+\textbf{Assembly} After the simulation mesh has been partitioned, it is distributed among the MPI processes as a (either non-overlapping or overlapping) decomposition into subdomains, with each MPI process in charge of a local subdomain mesh. The neighbor-to-neighbor mapping is computed locally in each MPI process using redundant information from the partitioned global mesh. Then, the linear system corresponding to the finite element discretization of the problem can be assembled in a distributed way, each MPI process assembling the local matrix corresponding to its subdomain independently. The distributed linear system can then be passed to the solver backend such as HPDDM, PETSc or the in-house ffddm (FreeFEM DD Methods) library, keeping track of the neighbor-to-neighbor mapping of the degrees of freedom.
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+ \item \textbf{Parallel Environment:} MPI
+ \item \textbf{Architectures:} CPU only; local computing clusters, French supercomputers (Irene, Jean Zay, ...)
+
+ \item \textbf{Scalability:} The initial mesh generation step is sequential. Mesh partitioning is done using the parallel graph partitioner ParMETIS. Parallel adaptive mesh refinement relies on the scalability of the ParMMG library. The distributed assembly of the linear systems is done locally.
+\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP1:Freefem++:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+In this section we present the current state of the software concerning mesh generation and mesh adaptation as well as discretization of the variational problem to assemble the linear system. In particular, we assess the performance and scalability in terms of computing time of the initial mesh construction and mesh partitioning steps as well as the assembly process to construct the linear system.\\
+
+As already mentioned, the initial setup is relying on the construction of a global mesh which is then partitioned into subdomains and distributed among MPI processes, which becomes impractical for large problems. A workaround consists in starting from a coarser description of the geometry using a coarser initial mesh, partitioning and distributing the coarse mesh, and finally splitting (element-wise) the local subdomain meshes in parallel to reach the desired level of refinement. The drawback is that it is not always applicable depending on the complexity of the geometry or the nature of the input.\\
+
+In the general case, the partitioning of the global mesh is done using the sequential SCOTCH or METIS libraries, or the parallel ParMETIS library, aiming at keeping a good load balancing while minimizing interfaces. We observe that ParMETIS usually does not scale well past a few dozen cores.\\
+
+After the mesh has been partitioned and distributed among the MPI processes, the linear system can be assembled in a distributed way, each process assembling the local matrix corresponding to its subdomain independently; thus, the assembly step naturally exhibits good scalability properties. The distributed linear system can then be passed through the solver backend.\\
+
+The benchmark presented in this section illustrates the performance and scalability properties of these steps (mesh generation, mesh partitioning, assembly, solution of the linear system) for a 3D linear isotropic heterogeneous elasticity problem discretized with $P_1$, $P_2$ and $P_3$ elements for a sequence of increasingly finer meshes.\\
+The benchmark consists of a FreeFEM script publicly available in the distribution. The output of the script includes the computing time of the different steps of the simulation for a given polynomial order and mesh size parameter that the user can specify via command-line parameters.
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+ \item \textbf{Overall Performance:} The initial mesh generation and mesh partitioning steps of a simulation are not scalable. For non-trivial geometries, this can become a bottleneck even on a few thousand cores depending on the type of discretization and the performance of the solver; after the mesh has been distributed among the MPI processes, the assembly step is trivially parallel and exhibits good scalability depending on the load-balancing properties of the initial partitioning.
+ \item \textbf{Input/Output Datasets:} Currently, the dataset for each benchmark consists of the FreeFEM script used to define and solve the corresponding problem. The scripts are open access and available in the FreeFEM distribution.
+ \item \textbf{Challenges:}
+ \begin{itemize}
+ \item definition of the geometry of the problem: relying on the construction of a global mesh which is then partitioned into subdomains and distributed among MPI processes is impractical for large problems.
+ \item currently, the benchmark datasets consist only of the FreeFEM script generating and solving the problem. When applicable, the benchmarks should also provide the input mesh generated by FreeFEM to allow for easier comparisons and improve reproducibility.
+ \item benchmarks currently do not provide a way to validate the results of the simulation.
+ \end{itemize}
+ \item \textbf{Future Improvements:}
\begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
+ \item construct the initial mesh in a distributed way; parallelize the mesh building primitives as well as mesh extrusion using \textit{buildlayers}.
+ \item provide the generated mesh in a suitable format for each benchmark.
+ \item provide validation quantities as output for each benchmark, such as relevant measured quantities of the simulation, verification of convergence order, ...
\end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
\end{itemize}
-\subsubsection{Benchmark \#1}
+\subsubsection{Benchmark \#1: 3D linear elasticity}
+
+This benchmark consists of solving the 3D linear isotropic heterogeneous static elasticity equations discretized with vector $P_1$, $P_2$ and $P_3$ elements for a sequence of increasingly finer meshes. The computational domain is a hollow cylinder of inner radius 0.8, outer radius 1 and height 3. Gravity is $g = 9.81$, density is $\rho = 7700$. The cylinder is composed of 10 alternating vertical layers of two materials with different mechanical properties $(E_1, \nu_1) = (10^7, 0.45)$ and $(E_2, \nu_2) = (10^9, 0.35)$. An essential boundary condition of null displacement is imposed on the bottom end of the cylinder, while a downwards vertical force $f = 5 \times 10^5$ is applied on the top boundary with a natural boundary condition.\\
+The problem is discretized with either $P_1$, $P_2$ or $P_3$ vector elements. The mesh size is inversely proportional to an input parameter ${n}_h$.\\
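+The variational problem assembled by the script is, schematically (a sketch using the standard weak form of linear elasticity with Lamé coefficients $\mu = E/(2(1+\nu))$ and $\lambda = E\nu/((1+\nu)(1-2\nu))$ taken piecewise constant over the material layers; the exact sign and loading conventions are those of the script): find the displacement $\mathbf{u}$ with $\mathbf{u} = 0$ on the bottom boundary $\Gamma_b$ such that, for all test functions $\mathbf{v}$ vanishing on $\Gamma_b$,
+\[
+  \int_{\Omega} 2\mu \, \varepsilon(\mathbf{u}) : \varepsilon(\mathbf{v}) + \lambda \, (\nabla \cdot \mathbf{u})(\nabla \cdot \mathbf{v}) \, dx
+  = \int_{\Omega} \rho \, \mathbf{g} \cdot \mathbf{v} \, dx + \int_{\Gamma_t} \mathbf{f} \cdot \mathbf{v} \, ds ,
+\]
+where $\varepsilon(\mathbf{u}) = \frac{1}{2}(\nabla \mathbf{u} + \nabla \mathbf{u}^{T})$, $\Gamma_t$ is the top boundary, $\mathbf{g}$ the gravity acceleration and $\mathbf{f}$ the applied vertical load.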
+
+The benchmark consists of a FreeFEM script publicly available in the distribution, which can be found at~\url{https://github.com/FreeFem/FreeFem-sources/blob/develop/examples/hpddm/elasticity-3d-cylinder-PETSc.edp}. The script outputs the computing time of the different steps of the simulation (mesh generation, mesh partitioning, assembly, solution of the linear system) for a given mesh size parameter ${n}_h$ and polynomial order that the user can specify as command-line parameters.\\
+
+The mesh is built by first building the 2D base of the cylinder using the \textit{bamg} internal mesh generator and then extruding it using the \textit{buildlayers} command. The mesh is then partitioned using the automatic graph partitioner ParMETIS. For the solver, we use GMRES with a relative tolerance of $10^{-5}$ preconditioned by a two-level GenEO preconditioner built through the PETSc interface of HPDDM. The GenEO coarse space is built by taking the first 20 GenEO eigenvectors in each subdomain.
+
+\begin{table}[h!]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{7}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}{\vrule width 2pt}}c!{\color{numpexgray}{\vrule width 2pt}}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}{\vrule width 2pt}}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}{\vrule width 2pt}}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}{\vrule width 2pt}}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}{\color{white}\bf ${n}_h$} & \color{numpexgray}{\color{white}\bf \# cores} & \multicolumn{3}{c!{\color{numpexgray}{\vrule width 2pt}}}{\color{white}\bf Mesh} & \multicolumn{3}{c!{\color{numpexgray}{\vrule width 2pt}}}{\color{white}\bf $P_1$} & \multicolumn{3}{c!{\color{numpexgray}{\vrule width 2pt}}}{\color{white}\bf $P_2$} & \multicolumn{3}{c!{\color{numpexgray}\vrule}}{\color{white}\bf $P_3$}\\
+ \rowcolor{numpexgray} & & {\color{white}\bf \# elts} & {\color{white}\bf build} & {\color{white}\bf part.} & {\color{white}\bf \#dof} & {\color{white}\bf assmb.} & {\color{white}\bf solve} & {\color{white}\bf \#dof} & {\color{white}\bf assmb.} & {\color{white}\bf solve} & {\color{white}\bf \#dof} & {\color{white}\bf assmb.} & {\color{white}\bf solve}\\
+ \texttt{2} & \texttt{24} & \pgfmathprintnumber{55920} & \texttt{0.04} s & \pgfmathprintnumber{0.3313851398} s & \pgfmathprintnumber{38499} & \pgfmathprintnumber{0.1087517946} s & \pgfmathprintnumber{0.8205061646} s & \pgfmathprintnumber{265356} & \pgfmathprintnumber{2.286347792} s & \pgfmathprintnumber{11.71424775} s & \pgfmathprintnumber{848331} & \pgfmathprintnumber{28.18209619} s & \pgfmathprintnumber{88.76374739} s\\
+ \rowcolor{numpexlightergray}
+ \texttt{4} & \texttt{192} & \pgfmathprintnumber{459840} & \pgfmathprintnumber{0.661449048} s & \pgfmathprintnumber{0.7736278304} s & \pgfmathprintnumber{271674} & \pgfmathprintnumber{0.1316522944} s & \pgfmathprintnumber{1.774703749} s & \pgfmathprintnumber{2005416} & \pgfmathprintnumber{2.746920468} s & \pgfmathprintnumber{23.16441227} s & \pgfmathprintnumber{6580746} & \pgfmathprintnumber{33.79095437} s & \pgfmathprintnumber{205.7214154} s\\
+ \texttt{6} & \texttt{648} & \pgfmathprintnumber{1568880} & \pgfmathprintnumber{2.543351217} s & \pgfmathprintnumber{2.415714336} s & \pgfmathprintnumber{878097} & \pgfmathprintnumber{0.1541521545} s & \pgfmathprintnumber{1.627837831} s & \pgfmathprintnumber{6648708} & \pgfmathprintnumber{3.229656544} s & \pgfmathprintnumber{33.43725633} s & \pgfmathprintnumber{22018473} & \pgfmathprintnumber{39.81431927} s & \pgfmathprintnumber{293.7350389} s\\
+ \rowcolor{numpexlightergray}
+ \texttt{8} & \texttt{1536} & \pgfmathprintnumber{3611520} & \pgfmathprintnumber{5.995033506} s & \pgfmathprintnumber{5.663275026} s & \pgfmathprintnumber{1971606} & \pgfmathprintnumber{0.161011985} s & \pgfmathprintnumber{3.006030941} s & \pgfmathprintnumber{15107544} & \pgfmathprintnumber{3.35254444} s & \pgfmathprintnumber{36.51872591} s & \pgfmathprintnumber{50242374} & \pgfmathprintnumber{41.56813132} s & \pgfmathprintnumber{325.3649314} s\\
+ \texttt{10} & \texttt{3000} & \pgfmathprintnumber{7587600} & \pgfmathprintnumber{13.06281585} s & \pgfmathprintnumber{10.68638843} s & \pgfmathprintnumber{4053969} & \pgfmathprintnumber{0.1880340362} s & \pgfmathprintnumber{4.405469723} s & \pgfmathprintnumber{31388676} & \pgfmathprintnumber{3.87673609} s & \pgfmathprintnumber{43.42991529} s & \pgfmathprintnumber{104766921} & \pgfmathprintnumber{47.57492979} s & \pgfmathprintnumber{403.4183248} s\\
+ \end{tabular}
+ }}
+ \caption{Weak scaling experiment for 3D heterogeneous linear elasticity with decreasing mesh size. For each mesh size we report the number of cores, the number of mesh elements, the computing time for mesh construction and mesh partitioning, as well as the number of degrees of freedom and the computing time of the assembly and solution steps for $P_1, P_2$ and $P_3$ discretizations. All computing times are reported in seconds.}
+ \label{tab:elasticity}
+\end{table}
+
+In~\cref{tab:elasticity} we report a weak scaling experiment, where the number of CPU cores scales with ${n}_h^3$ so as to keep the local problem sizes constant. We report the computing time of the different steps of the simulation: mesh construction, mesh partitioning, assembly and solution of the linear system for $P_1, P_2$ and $P_3$ discretizations. The number of mesh elements varies from $\pgfmathprintnumber{55920}$ to $7.6$ million, with a corresponding increase of the number of CPU cores from 24 to 3000. For the finest mesh, the number of unknowns reaches 4 million for $P_1$, 31 million for $P_2$ and 105 million for $P_3$.
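+Concretely, the core counts reported in~\cref{tab:elasticity} follow
+\[
+  \mbox{number of cores} = 24 \left( \frac{{n}_h}{2} \right)^{3},
+\]
+which gives 24, 192, 648, 1536 and 3000 cores for ${n}_h = 2, 4, 6, 8, 10$, keeping the load per core roughly constant at about 2400 mesh elements per core.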
+
+\textbf{Hardware and software setting:} The experiment is done on the Skylake partition of the Irene supercomputer. FreeFEM is compiled using the Intel 20.0.4 compiler suite.
+
+The results can be summarized as follows:
+
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\item As we can see, the mesh generation and partitioning steps do not scale: going from a mesh of $\pgfmathprintnumber{55920}$ elements to $7.6$ million elements, the combined time increases from $0.37$s to $23.75$s. Note that here we purposefully do not rely on the construction of a coarse mesh which would be refined (by splitting all edges) in parallel after partitioning to reach the desired level of refinement, so as to illustrate the performance of these initial steps in the most general case.
+\item In terms of computing time, the solution step dominates the assembly step for all discretization orders, by a factor ranging from 3 ($n_h = 2$, $P_3$) to 23 ($n_h = 10$, $P_1$). The computing times of both the assembly and solution steps increase by a factor of roughly 10 going from $P_1$ to $P_2$, and again from $P_2$ to $P_3$.
+\item Going from $n_h = 2$ to $n_h = 10$, the assembly time increases by around 70\% for all three discretization orders, reaching $0.19$s for $P_1$, $3.88$s for $P_2$ and $47.57$s for $P_3$. On the other hand, as expected, the solution step exhibits poorer scalability; it increases by a factor of around 4, reaching $4.41$s for $P_1$, $43.43$s for $P_2$ and $403.42$s for $P_3$.
+\item In terms of relative bottlenecks, we can see that the poor scalability of the mesh generation and partitioning steps leads to it quickly dominating the total simulation time for $P_1$ discretization, accounting for $84\%$ of the total time for $n_h = 10$. On the other hand, it accounts for much less when considering higher order discretization, reaching only $5\%$ of the total time for $n_h = 10$ using $P_3$ elements.
\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP1:Freefem++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+As we have seen, relying on the construction of a global mesh which is then partitioned into subdomains and distributed among MPI processes using ParMetis is not scalable and can become impractical for large problems. We will work on how to construct the initial mesh in a distributed way, first parallelizing the mesh building primitives as well as mesh extrusion using \textit{buildlayers}.
+
+Currently, FreeFEM relies on a set of high-level macros written in the FreeFEM language that the user has to call for the initial mesh partitioning and decomposition. In combination with the point above concerning distributed mesh generation, we will define parallel data structures for handling distributed meshes and finite element spaces. The goal is to allow the user to go from a sequential script to a parallel script in an almost completely transparent way and with minimal changes to the script, e.g. by only changing the type of the mesh variable from 'mesh' to 'Dmesh'.\\
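+As a purely illustrative sketch of the intended user experience (\texttt{Dmesh} is the planned distributed mesh type and does not exist yet; its name and syntax may change):
+\begin{verbatim}
+// today: sequential FreeFEM script
+mesh Th = square(100, 100);
+fespace Vh(Th, P1);
+
+// envisioned: the same script runs in parallel after a one-word change
+// (Dmesh is the planned distributed mesh type, not yet available)
+Dmesh Th = square(100, 100);
+fespace Vh(Th, P1);
+\end{verbatim}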
+
+For now, FreeFEM benchmarks consist of FreeFEM scripts available in the distribution. In an effort to ensure reproducibility, we will unify all FreeFEM benchmarks in a separate repository with appropriate documentation and instructions for long-term usability. For WP1 benchmarks, we will provide the generated mesh in a suitable format for each benchmark. In addition, we will provide validation quantities as output for each benchmark, such as relevant measured quantities of the simulation or verification of the convergence order.\\
In~\cref{tab:WP1:Freefem++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -144,10 +194,10 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Improve initial parallel mesh generation and partitioning for standard primitives; define internal distributed data structures for meshes and finite element spaces to further simplify the conversion from a sequential script to a parallel one. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Unify and document FreeFEM benchmarks in a separate repository. Provide validation quantities for each benchmark. \\
+\rowcolor{white} B6 - Data Management & Provide the generated mesh in a suitable format for relevant benchmarks. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Work towards eliminating the mesh generation and partitioning steps as bottlenecks for exascale, at least for simple cases. \\
\end{tabular}
}
}
diff --git a/software/freefempp/WP3/WP3.tex b/software/freefempp/WP3/WP3.tex
index 77ad1ab..8a5e93a 100644
--- a/software/freefempp/WP3/WP3.tex
+++ b/software/freefempp/WP3/WP3.tex
@@ -43,7 +43,7 @@ \section{Software: Freefem++}
\subsection{Software Overview}
\label{sec:WP3:Freefem++:summary}
-In~\cref{tab:WP3:Freefem++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+In~\cref{tab:WP3:Freefem++:features} we provide a summary of the FreeFEM features relevant to the work package which are briefly discussed.
\begin{table}[h!]
\centering
@@ -57,9 +57,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} algebraic multiphysics coupling & provide short description here \\
-\rowcolor{numpexlightergray} domain decomposition methods & provide short description here \\
-\rowcolor{white} reuse of Krylov subspaces for multiple right-hand sides & provide short description here \\
+\rowcolor{white} algebraic multiphysics coupling & PETSc can be used within FreeFEM for the solution of algebraic multiphysics coupling problems. Moreover, the user has easy access to additional relevant information about the problem (mesh, PDE, elementary matrices), which makes it possible to design and test more efficient coupling algorithms informed by the physics of the problem.\\
+\rowcolor{numpexlightergray} domain decomposition methods & extensive interface to the HPDDM library, accessible both via its own FreeFEM interface and through the PETSc interface of FreeFEM. Most of the parallel examples in the FreeFEM distribution use HPDDM through PETSc for the solution of the linear systems. FreeFEM also provides ffddm, a set of high-level FreeFEM scripts implementing domain decomposition methods, for easy prototyping.\\
+\rowcolor{white} reuse of Krylov subspaces for multiple right-hand sides & thanks to its interface to HPDDM, FreeFEM can take advantage of advanced Krylov subspace methods implementing blocking and recycling strategies for the efficient solution of linear systems with multiple right-hand sides.\\
\end{tabular}
}
}
@@ -71,58 +71,106 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP3:Freefem++:performances}
+FreeFEM uses either direct or iterative solvers for the solution of the linear systems. In both cases, it is interfaced with high-performance CPU-based linear solver libraries using MPI for parallelism. For direct methods, it is interfaced with the multifrontal solver MUMPS. For large scale computations, FreeFEM relies on its extensive interface to the PETSc library. In particular, it can use the high-performance domain decomposition library HPDDM as the linear solver backend, which is available both through PETSc and through its own FreeFEM interface. After the input simulation mesh has been partitioned and distributed among the MPI processes, the linear system is naturally assembled and passed to the solver backend in a distributed way; a minimal sketch of this workflow is given after the list below.\\
+More details about scalability are given in the next section and in~\cref{sec:WP3:HPDDM:software}.
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+ \item \textbf{Parallel Environment:} MPI
+ \item \textbf{Architectures:} CPU only; local computing clusters, French supercomputers (Irene, Jean Zay, ...)
+
+ \item \textbf{Scalability:} relies on high-performance solver libraries such as HPDDM and PETSc. Scalability is measured in terms of the number of iterations of the iterative solver and the computing time of both the setup and solution steps.
+\end{itemize}
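+To illustrate this workflow, here is a minimal sketch, in the spirit of the parallel examples shipped with the distribution, of how a problem is typically distributed and solved through the PETSc interface. The \texttt{buildDmesh} and \texttt{createMat} macros come from \texttt{macro\_ddm.idp} and may differ across versions, and the solver options are indicative only:
+\begin{verbatim}
+load "PETSc"                         // PETSc interface plugin
+macro dimension()2// EOM             // space dimension, needed by macro_ddm.idp
+include "macro_ddm.idp"              // distribution macros (buildDmesh, createMat)
+
+mesh Th = square(100, 100);          // global mesh (still built sequentially)
+buildDmesh(Th)                       // partition and distribute Th among MPI processes
+Mat A;
+createMat(Th, A, P1)                 // distributed matrix attached to the P1 space
+fespace Vh(Th, P1);
+varf vLap(u, v) = int2d(Th)(dx(u)*dx(v) + dy(u)*dy(v))
+                + int2d(Th)(v) + on(1, 2, 3, 4, u = 0);
+A = vLap(Vh, Vh);                    // local assembly on each subdomain
+real[int] rhs = vLap(0, Vh);
+set(A, sparams = "-ksp_type gmres -pc_type hpddm");  // options forwarded to PETSc
+Vh u;
+u[] = A^-1 * rhs;                    // distributed solve
+\end{verbatim}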
\subsection{Initial Performance Metrics}
\label{sec:WP3:Freefem++:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+Here we present the current state of the software concerning the performance of the solution step for problems involving various physics. For large scale problems, the main concern is the efficiency of the preconditioner, both in terms of number of iterations and parallel efficiency. For problems with provably robust preconditioners, such as SPD problems preconditioned by a multi-level GenEO domain-decomposition method, the scalability is very good up to several thousand CPU cores, where the increase in coarse space size starts to hinder parallel efficiency (see~\cref{sec:WP3:HPDDM:software}). On the other hand, there are physics for which designing provably robust and efficient solvers is still an open challenge.\\
+
+The first benchmark presented in this section illustrates this difficulty in the context of high-frequency electromagnetic wave propagation problems. We consider a very simple setting: the scattering of a point source in a homogeneous domain. The benchmark solves the time-harmonic second-order Maxwell's equations in a cube, where the source is a Gaussian point source in the center of the cube, with lowest-order absorbing boundary conditions on all 6 faces.\\
+The benchmark consists of a FreeFEM script publicly available in the distribution. The script outputs the number of iterations and the computing time for a specific frequency that the user can specify as a command-line parameter. The performance can be evaluated through the PETSc output logs.\\
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+ \item \textbf{Overall Performance:} FreeFEM relies on high-performance solver libraries such as HPDDM and PETSc for the efficient solution of large-scale linear systems. State-of-the-art multilevel domain-decomposition solvers have been shown to be robust and scalable up to several thousand CPU cores (see~\cref{sec:WP3:HPDDM:software}). However, designing robust and efficient solvers for difficult physics, such as high-frequency wave propagation problems, remains an open challenge. FreeFEM is an ideal framework to tackle these open problems: it allows for a quick and easy description and setup of test cases involving various physics, and it is developed hand-in-hand with our research in scientific computing and parallel solvers. The FreeFEM/ffddm/HPDDM/PETSc ecosystem greatly facilitates the development of new methods, from research and prototyping all the way to HPC implementation.
+ \item \textbf{Input/Output Datasets:} Currently, the dataset for each benchmark consists of the FreeFEM script used to define and solve the corresponding problem. The scripts are open access and available in the FreeFEM distribution.
+ \item \textbf{Challenges:}
+ \begin{itemize}
+ \item definition of the geometry of the problem: relying on the construction of a global mesh which is then partitioned into subdomains and distributed among MPI processes is impractical for large problems. This is discussed in more detail in~\cref{sec:WP1:Freefem++:software}.
+ \item for difficult physics such as high-frequency wave propagation, designing robust and efficient preconditioners is still an open challenge. In particular, in the context of multi-level domain decomposition methods, the construction of efficient coarse spaces of reasonable size can be challenging.
+ \item currently, the benchmark datasets consist only of the FreeFEM script generating and solving the problem. We should consider providing the (possibly very large) linear system to solve as a benchmark input, allowing comparison with other solver libraries and software. The price is then the loss of information about the physics of the problem, effectively discarding non-algebraic solvers. Existing formats, such as the PETSc matrix export formats, should be investigated.
+ \end{itemize}
+ \item \textbf{Future Improvements:}
\begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
+ \item construct the initial mesh in a distributed way; parallelize the mesh building primitives as well as mesh extrusion using \textit{buildlayers}.
+ \item continuing our research in domain decomposition methods, design efficient and robust coarse spaces for difficult problems such as high-frequency wave propagation.
+ \item come up with a standardized and efficient way of providing the linear system as input data for benchmarking.
\end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
\end{itemize}
-\subsubsection{Benchmark \#1}
+\subsubsection{Benchmark \#1: Time-harmonic Maxwell equations at high frequency}
+
+This benchmark consists of solving the time-harmonic second-order Maxwell's equations in the unit cube, where the source is a Gaussian point source in the center of the cube, with lowest-order absorbing boundary conditions on all 6 faces. The complex amplitude of the electric field $\mathbf{E}(\boldsymbol{x})$ is the solution of
+
+\begin{equation}
+ \begin{cases}
+\nabla\times(\nabla\times \mathbf{E})- k^2 \mathbf{E} = \mathbf{f}, & \text{in } \Omega,\\
+\nabla\times \mathbf{E}\times \textbf{n}+ \mathrm{i} k \textbf{n} \times (\mathbf{E}\times \textbf{n}) = 0 & \text{on } \partial\Omega,\\
+ \end{cases}
+\end{equation}
+
+where $k = 2\pi f$, $f \in \mathbb{R}^*_+$ is the frequency, $\Omega$ is the unit cube, $\partial\Omega$ denotes the boundary faces of $\Omega$, $\mathbf{n}(\boldsymbol{x})$ is the outward unit normal vector to $\partial\Omega$, and the source term is $\mathbf{f}(\boldsymbol{x}) = [0,0,e^{-50 k [(x-0.5)^2 + (y-0.5)^2 + (z-0.5)^2]}]$.\\
+The problem is discretized with lowest-order Nédélec edge finite elements. The unit cube is discretized with a regular tetrahedral mesh (each elementary cube is cut into 6 tetrahedra along a preferred diagonal) with a constant mesh size corresponding to 10 points per wavelength $\lambda = \frac{1}{f}$.
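+As a sanity check on the problem sizes reported below, the mesh size is $h = \lambda/10 = \frac{1}{10f}$, i.e. $n = 10f$ elementary cubes per direction and $6\,n^3$ tetrahedra. The lowest-order Nédélec unknowns are attached to the mesh edges, whose number for this structured mesh is
+\[
+ 3n(n+1)^2 + 3n^2(n+1) + n^3 \approx 7n^3 ,
+\]
+which for $f=8$ (i.e. $n=80$) gives the $\pgfmathprintnumber{3641840}$ degrees of freedom of the first row of~\cref{tab:maxwell}.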
+
+
+The benchmark consists of a FreeFEM script publicly available in the distribution at~\url{https://github.com/FreeFem/FreeFem-sources/blob/master/examples/hpddm/maxwell-mg-3d-PETSc-complex.edp}. The script outputs the number of iterations and the computing time for a specific frequency $f$ that the user can specify as a command-line parameter. The performance can be evaluated through the PETSc output logs. The number of iterations of the Krylov solver, the FLOPS and the execution time are reported for increasing frequencies.\\
+
+The preconditioner used in this benchmark is a nested two-grid optimized overlapping Schwarz domain-decomposition preconditioner. The construction of the preconditioner is done using PETSc. A thorough description of the method can be found in~\cite{bonazzoli_domain_2019}. The mesh is partitioned using the automatic graph partitioner \textit{ParMETIS}. The GMRES relative tolerance is set to $10^{-5}$. Each subdomain is assigned to one MPI process. In the preconditioner, the inner relative tolerance of the nested coarse solve is set to $10^{-1}$.
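+For orientation, the one-level building block of such a method is the Optimized Restricted Additive Schwarz (ORAS) operator, which reads schematically
+\[
+ M^{-1}_{\mathrm{ORAS}} = \sum_{i=1}^{N} R_i^{T} D_i B_i^{-1} R_i ,
+\]
+where $R_i$ restricts to the $i$-th overlapping subdomain, $D_i$ is a partition-of-unity matrix and $B_i$ is the local matrix with impedance (absorbing) transmission conditions. The two-grid variant used here additionally applies a coarse correction on a coarser mesh, itself solved inexactly (hence the inner tolerance of $10^{-1}$ above); this block notation is generic, and we refer to~\cite{bonazzoli_domain_2019} for the precise definition.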
+
+\begin{table}[h!]
+ \centering
+ { \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}c!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}{\color{white}\bf f} & {\color{white}\bf \# dofs} & {\color{white}\bf \# cores} & {\color{white}\bf \# iter.} & \multicolumn{5}{c!{\color{numpexgray}\vrule}}{\color{white}\bf Computing time (s)} \\
+ \rowcolor{numpexgray} & & & & {\color{white}\bf mesh part.} & {\color{white}\bf assembly} & {\color{white}\bf prec. setup} & {\color{white}\bf solve} & {\color{white}\bf VTK export} \\
+ \texttt{8} & \pgfmathprintnumber{3641840} & \pgfmathprintnumber{48} & \texttt{7} & \pgfmathprintnumber{5.974709965} & \pgfmathprintnumber{4.332314352} & \pgfmathprintnumber{1.2046e+01} & \pgfmathprintnumber{2.4177e+01} & \pgfmathprintnumber{1.945695103} \\
+ \rowcolor{numpexlightergray}
+ \texttt{12} & \pgfmathprintnumber{12225960} & \pgfmathprintnumber{162} & \texttt{9} & \pgfmathprintnumber{8.98338665} & \pgfmathprintnumber{4.556623172} & \pgfmathprintnumber{1.2622e+01} & \pgfmathprintnumber{3.1155e+01} & \pgfmathprintnumber{2.038234012}\\
+ \texttt{16} & \pgfmathprintnumber{28902880} & \pgfmathprintnumber{384} & \texttt{12} & \pgfmathprintnumber{12.99739596} & \pgfmathprintnumber{4.857938825} & \pgfmathprintnumber{1.3344e+01} & \pgfmathprintnumber{3.9628e+01} & \pgfmathprintnumber{2.140030511} \\
+ \rowcolor{numpexlightergray}
+ \texttt{20} & \pgfmathprintnumber{56360600} & \pgfmathprintnumber{750} & \texttt{18} & \pgfmathprintnumber{17.22267813} & \pgfmathprintnumber{5.45332545} & \pgfmathprintnumber{1.3444e+01} & \pgfmathprintnumber{5.5973e+01} & \pgfmathprintnumber{2.164726065}\\
+ \texttt{24} & \pgfmathprintnumber{97287120} & \pgfmathprintnumber{1296} & \texttt{25}& \pgfmathprintnumber{20.58727531} & \pgfmathprintnumber{5.664089589} & \pgfmathprintnumber{1.3940e+01} & \pgfmathprintnumber{7.6634e+01} & \pgfmathprintnumber{2.235102781}\\
+ \rowcolor{numpexlightergray}
+ \texttt{28} & \pgfmathprintnumber{154370440} & \pgfmathprintnumber{2058} & \texttt{35} & \pgfmathprintnumber{29.6948684} & \pgfmathprintnumber{5.643965845} & \pgfmathprintnumber{1.3856e+01} & \pgfmathprintnumber{1.0889e+02} & \pgfmathprintnumber{2.260791771}\\
+ \texttt{32} & \pgfmathprintnumber{230298560} & \pgfmathprintnumber{3072} & \texttt{45} & \pgfmathprintnumber{42.58705387} & \pgfmathprintnumber{5.691343932} & \pgfmathprintnumber{1.4165e+01} & \pgfmathprintnumber{1.4772e+02} & \pgfmathprintnumber{2.523501058}\\
+ \end{tabular}
+ }}
+ \caption{Weak scaling experiment with increasing frequency from $f=8$ to $f=32$. For each frequency we report the number of degrees of freedom, the number of cores, the number of GMRES iterations, and the computing time (in seconds) of the different steps of the simulation: mesh partitioning, assembly of the linear system, construction of the preconditioner, solution of the linear system, and VTK export of the solution.}
+ \label{tab:maxwell}
+\end{table}
+
+In~\cref{tab:maxwell} we report a weak scaling experiment, where the number of CPU cores scales with $f^3$ so as to keep the local problem sizes constant. We report the computing time of the different steps of the simulation: mesh partitioning, assembly of the linear system, construction of the preconditioner, solution of the linear system, VTK export of the solution. The frequency varies from 8 to 32, with a corresponding increase of the number of unknowns from 3.6 million to 230 million. The number of cores increases from 48 to 3072.
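+The core counts in~\cref{tab:maxwell} again follow a cubic law,
+\[
+ \#\mathrm{cores} = 6\,\left(\tfrac{f}{4}\right)^3 \qquad (6\cdot 2^3 = 48,\;\dots,\;6\cdot 8^3 = 3072),
+\]
+so that the local problem size stays at roughly $75\,000$ unknowns per core throughout the experiment.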
+
+\textbf{Hardware and software setting:} The experiment is done on the Skylake partition of the Irene supercomputer. FreeFEM is compiled using the Intel 20.0.4 compiler suite.
+
+The results can be summarized as follows:
+
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\item The parallel assembly of the linear system, the setup of the preconditioner and the export to VTK are all local computations and operations done independently on each subdomain. The local matrices are assembled on the local meshes of the subdomains, and the construction of the preconditioner mainly consists in performing the LU factorization of the two local matrices (corresponding to the fine and coarse levels) using the direct solver \textit{MUMPS}. It is then natural that the computing time corresponding to these steps stays approximately constant throughout the weak scaling experiment.
+\item As discussed in~\cref{sec:WP1:Freefem++:software}, the cost of the initial partitioning of the global mesh using \textit{ParMETIS} increases with the problem size. This can become a bottleneck for large scale simulations. Moreover, without relying on an initial coarsening of the global mesh, the memory cost of holding the global mesh becomes intractable. This is evidenced here, where going beyond $f=32$ produces an out-of-memory error.
+\item Even though the setup of the preconditioner is cheap and scalable, the number of iterations increases with frequency, from 7 iterations for $f=8$ to 45 iterations for $f=32$. Correspondingly, the computing time of the solution step increases from 24 to 148 seconds. The increase is slightly more than linear with respect to frequency.
\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP3:Freefem++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+See~\cref{sec:WP1:Freefem++:roadmap} for the plans regarding the definition of the initial geometry and mesh in a distributed setting for large scale problems.\\
+
+Currently, FreeFEM relies on a set of high-level macros written in the FreeFEM language that the user has to call for the initial mesh partitioning and decomposition. In combination with the point above concerning distributed mesh generation, we will define parallel data structures for handling distributed meshes and finite element spaces. The goal is to allow the user to go from a sequential script to a parallel script in an almost completely transparent way and with minimal changes to the script, e.g. by only changing the type of the mesh variable from 'mesh' to 'Dmesh'.\\
+
+For now, FreeFEM benchmarks consist of FreeFEM scripts available in the distribution. In an effort to ensure reproducibility, we will unify all FreeFEM benchmarks in a separate repository with appropriate documentation and instructions for long-term usability.\\
In~\cref{tab:WP3:Freefem++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -142,10 +190,10 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Improve initial parallel mesh generation and partitioning for standard primitives; define internal distributed data structures for meshes and finite element spaces to further simplify the conversion from a sequential script to a parallel one.\\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Unify and document FreeFEM benchmarks in a separate repository. \\
+\rowcolor{white} B6 - Data Management & Define/choose a standard matrix input format for solver benchmarks. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Design fast and robust domain-decomposition preconditioners based on efficient and reasonably-sized coarse spaces for difficult physics such as high-frequency wave propagation. \\
\end{tabular}
}
}
diff --git a/software/freefempp/WP5/WP5.tex b/software/freefempp/WP5/WP5.tex
index a7b5fa7..70c6624 100644
--- a/software/freefempp/WP5/WP5.tex
+++ b/software/freefempp/WP5/WP5.tex
@@ -43,6 +43,8 @@ \section{Software: Freefem++}
\subsection{Software Overview}
\label{sec:WP5:Freefem++:summary}
+FreeFEM interfaces with various optimization libraries for large scale problems. IPOPT is a software library for nonlinear constrained optimization; it implements a primal-dual interior point method with a filter line-search. The CMA-ES library implements stochastic strategies for the optimization of nonlinear, non-convex ``black-box'' functions. FreeFEM is also interfaced with the Toolkit for Advanced Optimization (TAO) from the PETSc library.\\
+
In~\cref{tab:WP5:Freefem++:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
\begin{table}[h!]
@@ -57,7 +59,7 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} Iterative methods & provide short description here \\
+\rowcolor{white} Iterative methods & FreeFEM is interfaced with several optimization libraries such as IPOPT, CMA-ES and TAO from PETSc.\\
\end{tabular}
}
}
@@ -69,84 +71,57 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP5:Freefem++:performances}
+IPOPT is a sequential library. FreeFEM provides an MPI-parallel interface to CMA-ES, and TAO relies on the parallel capabilities of the PETSc library.
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item \textbf{Parallel Environment:} MPI
+
+ \item \textbf{Architectures:} CPU only; local computing clusters, French supercomputers (Irene, Jean Zay, ...)
+
+ \item \textbf{Scalability:} relies on optimization libraries such as IPOPT, CMA-ES and TAO from PETSc. Scalability will be measured in terms of the number of iterations of the optimization algorithm and the computing time.
\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP5:Freefem++:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP5. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+In the near future, the plan is to provide results comparing the performance of the different methods available in FreeFEM for the constrained minimal-volume optimization problem described below.
+
+\subsubsection{Benchmark \#1: Minimal volume}
+
+The Plateau problem is named after the Belgian natural scientist J. Plateau, who carried out numerous experiments with soap films, realizing a large variety of minimal surfaces.\\
+In this well-known problem, the aim is to find a minimal surface or volume whose boundary is fixed. We restrict ourselves to closed and $C^2$ surfaces, parametrized by means of the graph of a given function $u:\bar{\Omega} \rightarrow \mathbb{R}$ (with $\Omega\subset\mathbb{R}^d$, where $d=2$ or $3$) belonging to the admissible set
+\[
+ S(\Omega) = \left\{ u:\bar{\Omega} \rightarrow \mathbb{R},\quad u_{\vert\Omega} \in C^d(\Omega,\mathbb{R}),\quad u_{\vert\partial\Omega} \in C^1(\partial\Omega,\mathbb{R}) \right\},
+\]
+so that an admissible surface/volume is given by the parametric surface
+$$
+X(\boldsymbol{x}) = (\boldsymbol{x},u(\boldsymbol{x})), \quad \boldsymbol{x} \in \bar{\Omega} .
+$$
+The cost functional is
+\[
+ J(u) = \int_{\Omega} (1+\vert\nabla u\vert^2)^\frac{1}{2}\,d\boldsymbol{x} .
+\]
+Given a function $\gamma\in C^1(\partial\Omega,\mathbb{R})$ along the boundary of $\Omega$, we seek an optimal solution of the Plateau optimization problem
+\begin{align}
+ \nonumber
+ \min_{u \in S(\Omega)} J(u) \mbox{ such that } u_{\vert\partial\Omega} = \gamma .
+\end{align}
+This problem can be solved by finding the critical points of the functional $J$ by means of an iterative procedure, e.g. combining a fixed-point method and a Newton method.\\
+
+Such an iterative procedure requires the computation of the gradient, and if possible, the Hessian. Here, the first derivative of the cost functional $J$ is given by
+\begin{align}
+ \nonumber
+ DJ(u)v = \int_{\Omega} \frac{\nabla u\cdot\nabla v}{(1+\vert\nabla u\vert^2)^{\frac{1}{2}}} \,d\boldsymbol{x} \quad \forall v\in S(\Omega)
+\end{align}
+and its second order derivative is given by
+\begin{align}
+ \nonumber
+ D^2J(u)(v,w) = \int_{\Omega} \frac{\nabla v\cdot\nabla w}{(1+\vert\nabla u\vert^2)^{\frac{1}{2}}} -\frac{(\nabla u \cdot \nabla v)(\nabla u \cdot \nabla w)}{(1+\vert\nabla u\vert^2)^{\frac{3}{2}}} \,d\boldsymbol{x} \quad \forall v,w \in S(\Omega).
+\end{align}
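+For completeness, the expression of $DJ$ above is simply the directional (Gateaux) derivative of $J$, obtained by the chain rule:
+\begin{align}
+ \nonumber
+ DJ(u)v &= \frac{d}{dt} J(u+tv)\Big\vert_{t=0} = \int_{\Omega} \frac{d}{dt}\left(1+\vert\nabla u + t\nabla v\vert^2\right)^{\frac{1}{2}}\Big\vert_{t=0} \,d\boldsymbol{x}\\
+ \nonumber
+ &= \int_{\Omega} \frac{\nabla u\cdot\nabla v}{(1+\vert\nabla u\vert^2)^{\frac{1}{2}}}\,d\boldsymbol{x}.
+\end{align}
+A Newton step for the critical-point equation (a generic sketch; the script referenced below may use a different variant) then reads: given an iterate $u_n$ with $u_{n\vert\partial\Omega} = \gamma$, find the correction $\delta u$ vanishing on $\partial\Omega$ such that
+\begin{align}
+ \nonumber
+ D^2J(u_n)(\delta u, w) = -DJ(u_n)w \quad \forall w\in S(\Omega) \text{ with } w_{\vert\partial\Omega} = 0,
+\end{align}
+and set $u_{n+1} = u_n + \delta u$.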
+As an example, we can choose $\Omega$ as the unit square and $\gamma(x,y) = \cos(\pi x)\cos(\pi y)$. This problem is solved using TAO from PETSc in a FreeFEM script available in the distribution at~\url{https://github.com/FreeFem/FreeFem-sources/blob/master/examples/hpddm/minimal-surface-Tao-2d-PETSc.edp}.
\subsection{12-Month Roadmap}
\label{sec:WP5:Freefem++:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
-In~\cref{tab:WP5:Freefem++:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
-
-\begin{table}[h!]
- \centering
-
-
-
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {
- \fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
-\end{tabular}
- }
- }
- \caption{WP5: Freefem++ plan with Respect to Relevant Bottlenecks}
- \label{tab:WP5:Freefem++:bottlenecks}
-\end{table}
\ No newline at end of file
+The plan is to provide results comparing the performance of the different methods available in FreeFEM for the constrained minimal-volume optimization problem described above. In particular, the performance and scalability of the TAO solvers will be assessed, both in terms of the number of iterations and the computing time. The script will be made publicly available.
\ No newline at end of file
diff --git a/software/freefempp/freefempp.tex b/software/freefempp/freefempp.tex
index 272297f..f3cd83c 100644
--- a/software/freefempp/freefempp.tex
+++ b/software/freefempp/freefempp.tex
@@ -44,19 +44,53 @@ \section{Software: Freefem++}
\subsection{Software summary}
\label{sec:Freefem++:summary}
-Detailed overview not available.
+FreeFEM is free and open-source software developed at Laboratoire Jacques-Louis Lions and designed for one- to three-dimensional multi-physics simulations. It has been continuously developed since 1992 and has an active community of users; the community forum averages around 25 discussion topics per month. The FreeFEM source code is hosted on GitHub, with the repository having (at the time of writing) 764 stars and 189 forks, and around 1500 binary downloads per month. FreeFEM leverages the strength of the team in numerical analysis and scientific computing, with finite element methods (FEM), boundary element methods (BEM) and domain decomposition (DD) methods implemented to a high level. It works on Linux, macOS and Windows and can run on architectures ranging from cell phones to national supercomputers. \\
+FreeFEM embeds a high-level user-friendly Domain Specific Language (DSL), which allows users with minimal programming knowledge to describe and solve their problems with a few lines of code. Its main objective is to provide easy access to distributed parallel simulations and solvers while hiding technical programming difficulties from the user. FreeFEM is written in C++ for speed, and features easy custom extensions through dynamically-loaded plugins.\\
+On the one hand, FreeFEM's DSL allows users to easily define and manipulate high-level notions such as variational forms, meshes, and FEM or BEM spaces; on the other hand, the user can also access the underlying data structures and linear algebra (i.e., FreeFEM is not a ``black box'' that cannot be opened). It allows rapid multiphysics prototyping; the resulting scripts are close to their mathematical counterparts. The abstraction of the DSL reduces the gap between mathematical objects and their numerical and practical implementation. As an illustration, below is a sample script setting up and solving a 2D acoustic scattering problem:
+\begin{figure}[h!]
+\begin{minipage}{.45\textwidth}
+\begin{center}
+Code\\[1em]
+
+\includegraphics[height=6cm]{graphics/freefempp/samplecode.png}\end{center}
+\end{minipage}
+\hfill
+\begin{minipage}{.45\textwidth}
+\begin{center}
+Output\\[1em]
+
+\includegraphics[height=6cm]{graphics/freefempp/sampleplot.pdf}
+\end{center}
+\end{minipage}
+\caption{Sample FreeFEM code solving a 2D acoustic scattering problem by a rectangular cavity.}
+\end{figure}
+
+The DSL allows the end user to easily implement their own physics modules using the provided FreeFEM language. Numerous physics are pre-built: incompressible Navier-Stokes (using the P1-P2 Taylor-Hood element), Lamé equations (linear elasticity), Neo-Hookean and Mooney-Rivlin models (nonlinear elasticity), thermal diffusion, thermal convection, thermal radiation, magnetostatics, electrostatics, fluid-structure interaction (FSI), etc.\\
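+To give a textual flavour of the DSL in addition to the sample above, a minimal (illustrative, not taken from the distribution) Poisson example reads:
+\begin{verbatim}
+// minimal sketch: Poisson problem with homogeneous Dirichlet conditions
+mesh Th = square(50, 50);                   // triangulation of the unit square
+fespace Vh(Th, P1);                         // P1 Lagrange finite element space
+Vh u, v;                                    // unknown and test function
+func f = 1.;                                // right-hand side
+solve Poisson(u, v)
+    = int2d(Th)(dx(u)*dx(v) + dy(u)*dy(v)) // bilinear form
+    - int2d(Th)(f*v)                        // linear form
+    + on(1, 2, 3, 4, u = 0);                // Dirichlet condition on all four sides
+plot(u, wait = true);
+\end{verbatim}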
+FreeFEM supports 1D (curvilinear), 2D (surface) and 3D simplicial meshes. It embeds its own 2D internal mesher and is compatible with powerful open-source mesh and visualization software such as Tetgen, Gmsh, Mmg, ParMmg and ParaView.\\
+
+FreeFEM is interfaced with state-of-the-art parallel solvers and libraries such as
+
+\begin{itemize}
+\item the HPDDM library, which contains optimized C++ MPI-parallel implementations of the latest domain-decomposition methods,
+\item the HTOOL library, a high-performance parallel library implementing domain-decomposition solvers and hierarchical matrix (H-matrix)
+compression for BEM operators using hybrid OpenMP-MPI parallelism,
+\item the PETSc library (Portable, Extensible Toolkit for Scientific Computation), giving access to a wide variety of solvers in one easy-to-use toolbox.
+\end{itemize}
+
+Being developed hand-in-hand with the team's research in scientific computing, FreeFEM benefits from the fast integration of the latest state-of-the-art research developments, in particular regarding fast and robust parallel linear solvers. This allows the software to remain at the forefront of the scientific community and to contribute to important applications in various fields such as health, medical imaging, earth science, climatology, computational fluid dynamics, quantum mechanics and finance.
\subsection{Purpose}
\label{sec:Freefem++:purpose}
-Purpose not available.
+
+FreeFEM aims to bring scientific computing to everyone, allowing the user to easily set up and perform multiphysics numerical simulations while enjoying the performance and scalability of state-of-the-art robust parallel solvers.
\subsection{Programming and Computational Environment}
\label{sec::Freefem++:environment_capabilities}
-The following table summarizes these aspects for Freefem++, providing a view of its programming and computational capabilities.
+Table~\ref{tab:Freefem++:env} summarizes these aspects for Freefem++, providing a view of its programming and computational capabilities.
\begin{table}[h!]
\centering
@@ -69,68 +103,102 @@ \subsection{Programming and Computational Environment}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
\rowcolor{white}Languages & \begin{tabular}{l}
C++\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+\end{tabular} & FreeFEM is written in C++ for speed and allows the user to easily add features in the language through dynamically-loaded C++ plugins. FreeFEM mainly uses the C++14 standard. \\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
MPI\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+\end{tabular} & FreeFEM relies on MPI for distributed memory parallelism. FreeFEM provides a rich and robust interface to state of the art MPI-parallel solver libraries such as PETSc and HPDDM. The user can also define and manage their own distributed parallel data structures thanks to FreeFEM's interface to MPI in the DSL.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
Gmsh and associated formats\\
HDF5\\
VTK\\
in-house format\\
-\end{tabular} & Data formats that the software can handle or produce.\\
+\end{tabular} & FreeFEM supports various standard mesh and data formats for I/O.\\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
+\end{tabular} & FreeFEM does not currently implement resilience functionalities such as fault tolerance or recovery mechanisms.\\
\rowcolor{white}DevOps & \begin{tabular}{l}
Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+\end{tabular} & FreeFEM is maintained on GitHub. We are in the process of switching to GitHub Actions workflows for Continuous Integration. FreeFEM uses a standard development workflow based on develop and feature branches; pull requests are reviewed before being merged into the develop branch, and the develop branch is merged into master for each software release.\\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-Debian\\
+Ubuntu\\
+Spack\\
+GUIX-HPC\\
\end{tabular} & Software packaging and distribution.\\
\rowcolor{white}Testing & \begin{tabular}{l}
Unit\\
Validation\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+\end{tabular} & FreeFEM includes a test suite of more than 600 examples extensively testing and validating the features of the software, both in sequential and parallel settings.\\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
Docker\\
\end{tabular} & Container technologies used to package and deploy the software.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
-HPdomain decomposition methods\\
+tetgen\\
MMG/ParMMG\\
MUMPS\\
+HPDDM\\
PETSc\\
+HTOOL\\
Scotch\\
-\end{tabular} & List of software Freefem++ has interfaces with.\\
+Metis/ParMetis\\
+IPOPT\\
+
+\end{tabular} & FreeFEM interfaces with various software and libraries for mesh generation and adaptation (tetgen/MMG/ParMMG), partitioning (Scotch/Metis/ParMetis), low-rank compression (HTOOL), solvers (MUMPS/HPDDM/PETSc), optimization (IPOPT), ... \\
\bottomrule
\end{tabular}
}}
\caption{Freefem++ programming and computational environment}
+\label{tab:Freefem++:env}
\end{table}
\subsection{Mathematics}
\label{sec:Freefem++:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+FreeFEM implements both finite element methods (FEM) and boundary element methods (BEM) for the solution of partial differential equations (PDEs) in complex geometries. It uses various mathematically sound numerical algorithms to enhance the accuracy and speed of the simulations, from metric-based adaptive mesh refinement to provably robust multilevel domain decomposition solvers. It allows for the joint use of real and complex variables.
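+As a small illustration of the metric-based adaptivity mentioned above (an indicative sketch using the built-in 2D \texttt{adaptmesh} function; the field and tolerance are arbitrary):
+\begin{verbatim}
+// adapt a 2D mesh to the Hessian-based metric of a given field
+mesh Th = square(20, 20);
+fespace Vh(Th, P1);
+Vh u = sin(10*x)*cos(10*y);          // field driving the adaptation
+Th = adaptmesh(Th, u, err = 1e-2);   // rebuild Th; FE functions on Th are re-interpolated
+plot(Th, wait = true);
+\end{verbatim}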
\subsection{Relevant Publications}
\label{sec:Freefem++:publications}
-Here is a list of relevant publications related to the software:
+\begin{description}
+\item[\fullcite{hecht_new_2012}] This is a short presentation of the capabilities of the software. The documentation is available online at \url{https://doc.freefem.org}.
+\item[\fullcite{bernardi_mathematics_2024}] This self-contained book provides a thorough theoretical study of finite element methods for solving incompressible Navier-Stokes equations. It focuses on efficient and widely used finite element methods that are well adapted to large-scale simulations. In this revised and expanded edition of Girault and Raviart's 1986 textbook Finite Element Methods for Navier-Stokes Equations, readers will find rigorous proof of stability and convergence, analysis of practical algorithms, and a stand-alone chapter on finite element methods that is applicable to a large range of PDEs. The book also covers a variety of numerical algorithms used in the computer implementation of Navier-Stokes equations and numerical experiments.
-\subsection{Acknowledgements}
-\label{sec::Freefem++:acknowledgements}
+\item[\fullcite{dolean_introduction_2015}] The purpose of this book is to offer an overview of the most popular domain decomposition methods for partial differential equations (PDEs). The authors present all popular algorithms, both at the PDE level and at the discrete level in terms of matrices, along with systematic FreeFEM scripts for sequential implementation as well as some parallel scripts.
+
+\item[\fullcite{nataf_geneo_2024}] This paper introduces an adaptive element-based domain decomposition (DD) method for solving saddle point problems defined as a block two by two matrix. The algorithm does not require any knowledge of the constrained space. The design of the adaptive coarse space extends the GenEO theory to saddle point problems. Numerical results on three dimensional elasticity problems for steel-rubber structures are shown for up to one billion degrees of freedom.
+
+\item[\fullcite{zhu_89-line_2021}] This paper presents an 89-line code for nonlinear topology optimization written in FreeFEM based on the popular SIMP (solid isotropic material with penalization) method. Excluding thirteen lines which are used for explanation, only 76 lines are needed for the initialization of the design parameters, nonlinear finite element analysis, sensitivity calculation, and updated design variables. Different design problems can be solved by modifying several lines in the proposed program.
+
+\item[\fullcite{hecht_pde-constrained_2024}] This book is aimed at students and researchers who want to learn how to efficiently solve constrained optimization problems involving partial differential equations (PDE) using the FreeFEM software.
+
+\item[\fullcite{tournier_numerical_2017}] This paper deals with microwave tomography for brain stroke imaging using state-of-the-art numerical modeling and massively parallel computing. It includes the accurate modeling of a whole microwave measurement system. The inverse problem is solved by a gradient-based L-BFGS minimization algorithm. The successive solution of the direct problem in the optimization loop is accelerated using an Optimized Restricted Additive Schwarz (ORAS) preconditioner. This work was awarded the Joseph Fourier-Bull prize in 2015.
-The software has been developed with the support of the following funding agencies and institutions:
+\item[\fullcite{sadaka_parallel_2020}] This work presents a FreeFEM Toolbox for the parallel computing of two- or three-dimensional liquid-solid phase-change systems involving natural convection. Parallel 2D and 3D computations of benchmark cases of increasing difficulty are presented: natural convection of air, natural convection of water, melting or solidification of a phase-change material, water freezing. For each case, careful validations are provided and the performance of the code is assessed.
+\item[\fullcite{tournier_three-dimensional_2022}] In seismic imaging, efficient frequency-domain full-waveform inversion (FWI) of long-offset node data can be designed with a few discrete frequencies, which lead to modest data volumes to be managed during the inversion process. This requires the solution of large and sparse linear indefinite systems for each frequency with multiple right-hand sides (RHSs). Here we investigate Optimized Restricted Additive Schwarz (ORAS) preconditioners with Robin or Perfectly Matched Layer (PML) interface conditions. Multiple sources are processed in groups with a pseudo-block method. The accuracy, computational cost and scalability of the solver are assessed against several realistic benchmarks.
+\item[\fullcite{li_three-dimensional_2022}] This paper presents a new framework for the two- and three-dimensional topology optimization of the weakly-coupled fluid-structure system. The proposed design methodology uses a reaction-diffusion equation for updating the level-set function based on the topological sensitivity. The performance of the methodology is demonstrated by solving three different optimization problems: compliance, power dissipation, and fluid-structure interaction. For comparison and for assessing the various techniques, the designs are benchmarked against state-of-the-art works followed by showcasing a variety of practical engineering design examples.
+\item[\fullcite{dapogny_geometrical_2018}] This paper presents simple and robust numerical methods for two-dimensional geometrical shape optimization problems, in the context of viscous flows driven by the stationary Navier-Stokes equations at low Reynolds number. Several pedagogical examples are discussed. The corresponding program is written in the FreeFem++ environment, and it is freely available. Its chief features are carefully presented, so that it can easily be handled and elaborated upon to deal with different, or more complex physical situations.
-Acknowledgements not available.
+\item[\fullcite{golse_radiative_2023}] To study the temperature in a gas subjected to electromagnetic radiations, one may use the Radiative Transfer equations coupled with the Navier-Stokes equations. The problem has 7 dimensions but can be simplified to a small number of integro-differential equations in 3 dimensions. This work presents the method and its numerical implementation using a Hierarchical matrix compression scheme, using FreeFEM and HTOOL. Applications to the temperature in the French Chamonix valley are presented with and without snow or cloud and with a variable absorption coefficient. The software is precise enough to assert temperature differences due to increased absorption in the vibrational frequency subrange of greenhouse gases.
+
+\item[\fullcite{sadaka_finite_2024}] This work presents a FreeFEM finite element toolbox for the computation of Bogoliubov-de Gennes modes used to assess the linear stability of stationary solutions of the Gross-Pitaevskii (GP) equation. Applications concern one (single GP equation) or two-component (a system of coupled GP equations) Bose-Einstein condensates in one, two and three dimensions of space. Programs are validated through comparisons with known theoretical results for simple cases and numerical results reported in the literature.
+\end{description}
+
+\subsection{Acknowledgements}
+\label{sec::Freefem++:acknowledgements}
+The software has been developed with the support of the following funding agencies, institutions and private companies:
+\begin{itemize}
+\item Sorbonne Université
+\item University of Rouen
+\item INRIA
+\item ANR
+\item Airthium company
+\item GENCI for HPC resources
+\end{itemize}
\ No newline at end of file
diff --git a/software/hawen/WP1/WP1.tex b/software/hawen/WP1/WP1.tex
index 1e3c081..f9db235 100644
--- a/software/hawen/WP1/WP1.tex
+++ b/software/hawen/WP1/WP1.tex
@@ -40,6 +40,25 @@ \section{Software: Hawen}
\subsection{Software Overview}
\label{sec:WP1:Hawen:summary}
+\hawen~uses the Hybridizable Discontinuous Galerkin (HDG) method
+for the discretization of time-harmonic wave problems such
+as \cref{eq:hawen:viscoacoustic,eq:hawen:viscoelastic}.
+The method relies on a static condensation of the DG method, and we
+refer to, e.g., \cite{Faucher2020adjoint,pham_numerical_2024}
+and the references therein regarding the implementation in acoustics and
+elasticity, respectively.
+The specificity of the HDG method is to assemble a global linear system
+only in terms of the degrees of freedom (dofs) on the faces of the discretized
+mesh cells. By avoiding the interior dofs, it reduces the computational
+cost through a smaller global linear system, e.g., \cite{Faucher2020adjoint}.
+However, this benefit is only obtained for sufficiently high polynomial
+degrees, hence requiring robust quadrature methods and a flexible representation
+of the model on the mesh.
+\hawen~uses $p$-adaptivity to adapt the polynomial order on each cell
+depending on the wavelength.
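+
+To sketch the idea of static condensation (generic notation, not the exact
+operators used in \hawen), the interior unknowns $u_I$ are eliminated cell
+by cell in favor of the face unknowns $u_F$:
+\begin{equation*}
+\begin{pmatrix} A_{II} & A_{IF}\\ A_{FI} & A_{FF}\end{pmatrix}
+\begin{pmatrix} u_I \\ u_F\end{pmatrix}
+=
+\begin{pmatrix} f_I \\ f_F\end{pmatrix}
+\quad\Longrightarrow\quad
+\big(A_{FF}-A_{FI}A_{II}^{-1}A_{IF}\big)\,u_F
+\,=\, f_F - A_{FI}A_{II}^{-1}f_I\,,
+\end{equation*}
+so that the global system only couples the face unknowns, while $u_I$ is
+recovered locally afterwards.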
+
+
+
In~\cref{tab:WP1:Hawen:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
\begin{table}[h!]
@@ -54,7 +73,11 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} dG/hdG & provide short description here \\
+\rowcolor{white} dG/hdG & \hawen~uses the HDG method for the discretization; it allows
+                          for local $p$-adaptivity, such that each cell of the discretized
+                          domain can use a different polynomial order.
+                          \hawen~solves 1D, 2D and 3D problems using unstructured
+                          simplex meshes.\\
\rowcolor{numpexlightergray} multiphysics coupling & provide short description here \\
\end{tabular}
}
@@ -67,59 +90,166 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP1:Hawen:performances}
+\hawen~uses MPI and OpenMP parallelism. The HDG method is particularly
+well suited to parallelism (like other methods in the DG family), since
+each cell of the mesh can be treated independently in parallel.
+\hawen~has been used on several supercomputers, including GENCI Adastra
+on its Genoa partition for CPU parallelism.
+\hawen~is linked with the MUMPS library, a multifrontal direct solver
+for sparse linear systems.
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+%\begin{itemize}
+% \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
+% \item describe the parallel computation environment: type of architecture and super computer used.
+% \item describe the parallel capabilities of the software
+% \item \textbf{Scalability:} Describe the general scalability properties of the software
+% \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+%\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP1:Hawen:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+The benchmarks of this WP investigate the overall performance
+of \hawen~for large-scale time-harmonic wave problems. We will
+use an acoustic and an elastic test case to evaluate the
+difference in cost between the two models of wave propagation.
+The computational cost is primarily evaluated by the memory footprint
+of the matrix factorization, the number of operations it requires,
+and the overall computational time.
+
+%This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+%
+%\begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+%\end{itemize}
+
+\subsubsection{Benchmark \#1: Visco-acoustic time-harmonic wave propagation}
+\label{subsec:WP1:Hawen:benchmark1}
+
+\paragraph{Description}
+This benchmark considers the propagation of time-harmonic waves in a large-scale
+geophysical configuration. We use the SEAM model for this benchmark, which is of
+size \num{35}$\times$\num{45}$\times$\num{35}\si{\km\cubed}.
+It gives us the wave-speed and density models, respectively $\sqrt{\kappa/\rho}$
+and $\rho$, to solve the acoustic wave problem \cref{eq:hawen:viscoacoustic}.
+We illustrate the wave-speed model in \cref{figure:hawen:seam-model}.
+This model is challenging as it is relatively large (depending on the frequency
+we simulate, see below) and includes high-contrast objects.
+For instance, the maximal value of the wave-speed is about 2.5 times higher than
+the minimal value, and such high-contrast objects have to be carefully discretized.
+
+
+The objective is to evaluate the performance when we increase the simulation frequency,
+to see the maximal frequency attainable at a reasonable cost with \hawen.
+Low frequencies typically cost less as the wavelength is larger, and the discretization
+can be relaxed (in terms of the number of cells in the mesh and/or polynomial orders).
+However, when increasing the frequency, the computational cost increases, and this
+progression of cost vs. frequency will be investigated.
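+As a rough order-of-magnitude estimate (assuming a fixed number of
+discretization points per wavelength $n_\lambda$ and minimal wave speed
+$c_{\min}$), the mesh size must follow $h \simeq c_{\min}/(n_\lambda f)$,
+so that the number of cells in a domain of extent $L$ grows in three
+dimensions like
+\begin{equation*}
+N_{\mathrm{cells}} \,\sim\, \left(\dfrac{L\, n_\lambda\, f}{c_{\min}}\right)^{3},
+\end{equation*}
+which is why the progression of cost with frequency is a central question
+of this benchmark.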
+
+Another objective is to design the most efficient setup for the HDG discretization
+of \hawen. In particular, it is known that HDG is more efficient for high-order
+polynomials, and this aspect must be enforced for efficient simulations. This benchmark
+relies on CPU parallelism.
+
+
+\paragraph{Benchmarking Tools Used}
+To evaluate the performance of the simulations, we will use the memory consumption,
+separated into the part used by the linear algebra (with
+MUMPS) and the part used by the other operations (e.g., matrix storage).
+We also evaluate the execution time, with a similar separation to assess the
+performance of the computational blocks separately.
+For the linear algebra operations with MUMPS, we will also investigate the number
+of flops required to factorize the matrix.
+
+\paragraph{Input/Output Dataset Description} The inputs correspond to the
+physical models that feed the wave equation, such as the wave-speed model illustrated
+in \cref{figure:hawen:seam-model}. These are available via the SEAM
+corporation\footnote{\url{https://wiki.seg.org/wiki/SEAM}}.
+
+\paragraph{Results Summary}
+We will provide the computational statistics as a function of frequency, and
+a comparison of HDG discretizations (different meshes and polynomial orders)
+to identify the best options.
+
+\paragraph{Challenges Identified}
+There are two main challenges: first, the memory footprint of the matrix factorization;
+second, the choice of high-order polynomials to use with the HDG method to maximize its
+efficiency.
+
+
+%
+%\begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\end{itemize}
+
+% ----------------------
+\begin{figure}[ht!]\centering
+\includegraphics[scale=0.65]{graphics/hawen/skeleton_3D_seam}
+\caption{SEAM wave-speed model of size \num{35}$\times$\num{45}$\times$\num{35}\si{\km\cubed}
+ used for the benchmarks.}
+\label{figure:hawen:seam-model}
+\end{figure}
+% ----------------------
+
+
+\subsubsection{Benchmark \#2: Visco-elastic time-harmonic wave propagation}
+\label{subsec:WP1:Hawen:benchmark2}
+
+The second benchmark is the continuation of the previous one where, instead
+of considering the acoustic problem \cref{eq:hawen:viscoacoustic}, we now
+solve the viscoelastic problem of equation~\cref{eq:hawen:viscoelastic}.
+Elasticity is computationally much more demanding as, instead of working with
+a scalar unknown as in the acoustic case, we now have a vector unknown.
+The configuration of this setup is the same as above, and we also wish to
+compare the performance between acoustic and elastic wave propagation.
+For instance, we wish to identify, for a given computational cost, what
+is the maximal frequency attainable in elastic wave propagation compared to
+the acoustic case.
+The strategy for the benchmark is the same as above, emphasizing the use of
+high-order polynomials for HDG in \hawen.
+
\subsection{12-Month Roadmap}
\label{sec:WP1:Hawen:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+
+We wish to standardize the use of high-order polynomials, which
+is critical for the HDG method. By high order, we mean at least
+order 7 in three dimensions, so that the static condensation used
+by the method becomes more beneficial; this in turn requires
+robust and efficient quadrature rules.
+In addition, we wish to improve input/output data management by including more options
+to save the results of \hawen; for instance, this can include the HDF5 format,
+which is not currently supported.
+Eventually, the configuration of the benchmark and the results obtained will
+be made available in open access for reproducibility.
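+
+As an illustration of what such an output option could look like (a minimal
+sketch only, with a hypothetical dataset name and layout; this is not the
+format used by \hawen), a wavefield array can be written to an HDF5 file
+through the HDF5 C interface:
+\begin{verbatim}
+#include <hdf5.h>
+
+/* Minimal sketch: write a wavefield of n values to an HDF5 file. */
+int write_field_hdf5(const char *path, const double *field, hsize_t n)
+{
+  hid_t file  = H5Fcreate(path, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+  hid_t space = H5Screate_simple(1, &n, NULL);      /* 1-D dataspace */
+  hid_t dset  = H5Dcreate2(file, "/wavefield", H5T_NATIVE_DOUBLE, space,
+                           H5P_DEFAULT, H5P_DEFAULT, H5P_DEFAULT);
+  herr_t st   = H5Dwrite(dset, H5T_NATIVE_DOUBLE, H5S_ALL, H5S_ALL,
+                         H5P_DEFAULT, field);       /* write the array */
+  H5Dclose(dset); H5Sclose(space); H5Fclose(file);
+  return st < 0 ? -1 : 0;
+}
+\end{verbatim}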
+%In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+%\begin{itemize}
+% \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+% \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+% \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%\end{itemize}
In~\cref{tab:WP1:Hawen:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
\begin{table}[h!]
@@ -138,13 +268,13 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Improve high-order polynomial operations for the HDG discretization. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Each benchmark will have a separate repository with the input files and associated results to reproduce the simulations. \\
+\rowcolor{white} B6 - Data Management & Provide the meshes used for the benchmark and the associated results in standard format such as vtk and hdf5. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Control the memory footprint. \\
\end{tabular}
}
}
\caption{WP1: Hawen plan with Respect to Relevant Bottlenecks}
\label{tab:WP1:Hawen:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/hawen/WP3/WP3.tex b/software/hawen/WP3/WP3.tex
index b26b8a7..48cd2e7 100644
--- a/software/hawen/WP3/WP3.tex
+++ b/software/hawen/WP3/WP3.tex
@@ -40,7 +40,30 @@ \section{Software: Hawen}
\subsection{Software Overview}
\label{sec:WP3:Hawen:summary}
-In~\cref{tab:WP3:Hawen:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+By considering time-harmonic wave problems, the discretization in
+\hawen~results in a large sparse linear system to solve. In this
+work package, we investigate the efficient resolution of the linear system arising
+from the discretization of wave problems. In particular, \hawen~currently
+relies on the direct solver MUMPS for the resolution of the linear system.
+MUMPS is a direct solver based on LU factorization; its main advantage
+is to enable the resolution for multiple right-hand sides, which is a
+necessity in the context of inversion where several sources have to
+be simulated (e.g., possibly several thousand in Earth imaging).
+In addition, the matrix factorization can be reused in some operations
+of inversion, typically for the gradient computation.
+
+The direct linear solver MUMPS proceeds in three steps: 1) the
+analysis phase, 2) the factorization, and 3) the solve.
+In addition to the multiple right-hand side feature, which acts
+at the solve phase, we wish to investigate how the specificity
+of the HDG method
+(working with the degrees of freedom on the faces of the discretized cells only)
+used by \hawen~can also serve in the analysis phase to simplify
+the graph that is created by MUMPS.
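+
+For illustration only, a minimal sketch of this three-step workflow through
+the MUMPS C interface is given below; \hawen~itself is written in Fortran
+and, for time-harmonic problems, would use the complex-arithmetic variant of
+MUMPS, so the real-valued matrix and right-hand sides here are placeholders.
+\begin{verbatim}
+#include <mpi.h>
+#include "dmumps_c.h"   /* real double precision; complex variants exist */
+
+#define USE_COMM_WORLD -987654  /* MUMPS convention for MPI_COMM_WORLD */
+
+/* Solve A x = b for 'nrhs' right-hand sides stored column-wise in 'rhs'.
+   The matrix is given in coordinate format (irn, jcn, a); MPI_Init must
+   have been called beforehand. */
+void solve_with_mumps(int n, MUMPS_INT8 nnz, MUMPS_INT *irn, MUMPS_INT *jcn,
+                      double *a, double *rhs, int nrhs)
+{
+  DMUMPS_STRUC_C id;
+  id.comm_fortran = USE_COMM_WORLD;
+  id.par = 1; id.sym = 0;       /* host participates, unsymmetric matrix */
+  id.job = -1; dmumps_c(&id);   /* initialize the MUMPS instance */
+
+  id.n = n; id.nnz = nnz;       /* centralized input, coordinate format */
+  id.irn = irn; id.jcn = jcn; id.a = a;
+
+  id.job = 1; dmumps_c(&id);    /* 1) analysis: ordering, symbolic phase */
+  id.job = 2; dmumps_c(&id);    /* 2) numerical factorization */
+
+  id.nrhs = nrhs; id.lrhs = n;  /* multiple right-hand sides at once */
+  id.rhs = rhs;                 /* overwritten by the solutions */
+  id.job = 3; dmumps_c(&id);    /* 3) solve, reusing the factorization */
+
+  /* statistics (flops, memory) are returned in id.infog / id.rinfog */
+  id.job = -2; dmumps_c(&id);   /* release internal data */
+}
+\end{verbatim}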
+
+
+In~\cref{tab:WP3:Hawen:features} we provide a summary of the software
+features relevant to the work package which are briefly discussed.
\begin{table}[h!]
\centering
@@ -54,7 +77,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} "reuse of Krylov subspaces for multiple right-hand sides" & provide short description here \\
+\rowcolor{white} "reuse of Krylov subspaces for multiple right-hand sides" &
+We currently use direct solver MUMPS to solve for multiple right-hand side and
+which to investigate the performance with extremely large number ( $>\num{10000}$). \\
\end{tabular}
}
}
@@ -66,59 +91,114 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP3:Hawen:performances}
+\hawen~uses MPI and OpenMP parallelism. The HDG method is particularly
+well suited to parallelism (like other methods in the DG family), since
+each cell of the mesh can be treated independently in parallel.
+\hawen~has been used on several supercomputers, including GENCI Adastra
+on its Genoa partition for CPU parallelism.
+\hawen~is linked with the MUMPS library, a multifrontal direct solver
+for sparse linear systems.
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+
+%\begin{itemize}
+% \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
+% \item describe the parallel computation environment: type of architecture and super computer used.
+% \item describe the parallel capabilities of the software
+% \item \textbf{Scalability:} Describe the general scalability properties of the software
+% \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+%\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP3:Hawen:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+
+The benchmarks of this WP are strongly related to the
+ones of WP1 described in \cref{sec:WP1:Hawen:metrics}.
+Therefore, we consider large-scale time-harmonic wave
+problems for acoustic and elastic propagation.
+In this WP, we emphasize the treatment of many
+right-hand sides.
+
+
+%This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+%
+%\begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+%\end{itemize}
+
+\subsubsection{Benchmark \#1: Visco-acoustic time-harmonic wave propagation}
+\label{subsec:WP3:Hawen:benchmark1}
+
+This benchmark follows the configuration of the one
+established in WP1 in \cref{subsec:WP1:Hawen:benchmark1} and
+considers the propagation of acoustic waves in a large-scale
+3D medium.
+However, contrary to WP1, we now emphasize the operations
+of linear algebra to solve the linear system resulting from
+the HDG discretization in \hawen.
+We highlight two main objectives:
+\begin{enumerate}
+\item Study the specificity of the HDG method to speed up the
+    analysis phase of the direct solver, in particular with
+    the ``analysis by block'' feature of the direct solver
+    MUMPS.
+\item Study the performance when a large number of right-hand
+    sides has to be solved (see the sketch after this list).
+\end{enumerate}
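+
+As a sketch of how a very large number of sources could be processed after a
+single factorization (illustrative bookkeeping only, not the actual driver of
+\hawen), the right-hand sides can be chunked into blocks and the MUMPS solve
+phase called once per block:
+\begin{verbatim}
+#include "dmumps_c.h"
+
+/* Solve for 'nsources' right-hand sides in blocks of size 'block', reusing
+   an existing factorization held in 'id' (analysis and factorization done).
+   'rhs' holds all columns of length id.n, stored contiguously column-wise;
+   the solutions overwrite the corresponding columns. */
+void solve_in_blocks(DMUMPS_STRUC_C *id, double *rhs, int nsources, int block)
+{
+  for (int first = 0; first < nsources; first += block) {
+    int nb = nsources - first;
+    if (nb > block) nb = block;           /* last block may be smaller */
+    id->nrhs = nb;
+    id->lrhs = id->n;
+    id->rhs  = rhs + (size_t)first * (size_t)id->n;
+    id->job  = 3;                         /* solve phase only */
+    dmumps_c(id);
+  }
+}
+\end{verbatim}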
+
+
+%\begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\end{itemize}
+
+\subsubsection{Benchmark \#2: Visco-elastic time-harmonic wave propagation}
+\label{subsec:WP3:Hawen:benchmark2}
+
+This benchmark follows the previous one but in the context of
+elastic wave propagation. The configuration is
+established in WP1 in \cref{subsec:WP1:Hawen:benchmark2}.
\subsection{12-Month Roadmap}
\label{sec:WP3:Hawen:roadmap}
-
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
+%
+%In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+%\begin{itemize}
+% \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+% \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+% \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%\end{itemize}
+
+We wish to investigate the efficiency of the linear algebra operations
+when working with particularly high polynomial orders, as established
+in \cref{sec:WP1:Hawen:software}.
+We emphasize the memory footprint of the matrix factorization,
+which is the main bottleneck in time-harmonic wave simulations using a
+direct solver.
+We will rely on the direct solver MUMPS to investigate the cost and benefit
+of its latest developments to reduce the computational cost. However, one
+must find a good compromise between the reduction of the memory footprint
+(e.g., with block low-rank compression or mixed precision) and the accuracy
+of the solution.
In~\cref{tab:WP3:Hawen:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
\begin{table}[h!]
@@ -137,13 +217,20 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity &
+Investigate the efficiency of the linear solvers with high-order polynomials
+required by HDG discretization (i.e., order higher than or equal to 7). \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation
+& The benchmarks will be made available on online repository. \\
+\rowcolor{white} B6 - Data Management
+& Input parameters will be made available to generate the matrix but, due to its size, the matrix
+  itself will not be stored.\\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms &
+Control the memory footprint of the solver and use latest developments of direct solver
+MUMPS to reduce it. Investigate how the compression affects the accuracy of the solutions.\\
\end{tabular}
}
}
\caption{WP3: Hawen plan with Respect to Relevant Bottlenecks}
\label{tab:WP3:Hawen:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/hawen/WP4/WP4.tex b/software/hawen/WP4/WP4.tex
index eed0bdd..48a3c55 100644
--- a/software/hawen/WP4/WP4.tex
+++ b/software/hawen/WP4/WP4.tex
@@ -40,8 +40,38 @@ \section{Software: Hawen}
\subsection{Software Overview}
\label{sec:WP4:Hawen:summary}
+\hawen~solves quantitative inverse wave problems by following
+an iterative minimization approach, \cite{Faucher2020adjoint,faucher_hawen_2021}.
+A sketch of the iterative procedure is depicted
+in \cref{figure:hawen:wp4:inversion}; it is typically a
+Newton-type nonlinear optimization approach, \cite{Virieux2009}.
+This procedure heavily relies on numerical simulations of
+wave propagation, which are compared with the data at each
+iteration. Therefore, an efficient modeling solver is required;
+it is investigated for \hawen~in WP1, \cref{sec:WP1:Hawen:software}.
+In addition, investigations regarding the choice of the acquisition
+(to limit the number of sources to use) and of an efficient criterion
+to evaluate the discrepancy between the data and the simulations
+are necessary to reduce the computational cost.
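+
+Schematically (generic notation, not the exact algorithm implemented in
+\hawen), one iteration of the procedure of \cref{figure:hawen:wp4:inversion}
+updates the model parameters $m_k$ as
+\begin{equation*}
+m_{k+1} \,=\, m_k \,-\, \alpha_k\, H_k^{-1}\,\nabla J(m_k)\,,
+\end{equation*}
+where $J$ is the misfit between the recorded data and the simulations,
+$\alpha_k$ is a step length, and $H_k$ an approximation of the Hessian
+(e.g., the identity for gradient descent, or a quasi-Newton approximation);
+each evaluation of $J$ and of its gradient requires wave-propagation
+simulations for all sources.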
+
In~\cref{tab:WP4:Hawen:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+\begin{figure}[ht!]\centering
+\includegraphics[scale=1.00]{graphics/hawen/haven_inversion}
+\caption{Sketch of the quantitative inverse wave problems based upon
+ iterative minimization in \hawen. %, extracted from \cite{faucher_hawen_2021}.
+   \textbf{a)} Acquisition stage: probing waves are
+ recorded by devices positioned on a portion
+ of the domain, typically near the boundary.
+ \textbf{b)} The reconstruction algorithm starts from initial
+ model parameters and compares simulations of wave
+ propagation with the data,
+ then iteratively updates those properties using
+ a Newton-based algorithm.}
+\label{figure:hawen:wp4:inversion}
+\end{figure}
+
+
\begin{table}[h!]
\centering
{
@@ -67,59 +97,136 @@ \subsection{Parallel Capabilities}
\label{sec:WP4:Hawen:performances}
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
+\hawen~uses MPI and OpenMP parallelism. The HDG method is particularly
+well suited to parallelism (like other methods in the DG family), since
+each cell of the mesh can be treated independently in parallel.
+\hawen~has been used on several supercomputers, including GENCI Adastra
+on its Genoa partition for CPU parallelism.
+\hawen~is linked with the MUMPS library, a multifrontal direct solver
+for sparse linear systems.
+MUMPS allows solving for multiple right-hand sides, hence reducing the
+computational cost of having many sources in the acquisition during
+inversion.
+
+%\begin{itemize}
+% \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
+% \item describe the parallel computation environment: type of architecture and super computer used.
+% \item describe the parallel capabilities of the software
+% \item \textbf{Scalability:} Describe the general scalability properties of the software
+% \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+%\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP4:Hawen:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP4. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+
+The performance of inversion can be evaluated with different
+criteria.
+It is typically a compromise between the accuracy (resolution)
+of the final reconstruction and the computational cost required
+to reach it.
+The number of iterations and the complexity of the computations involved
+at each of them are also criteria for numerical efficiency.
+We also wish to investigate target-oriented inversion, to reconstruct
+some part of the model without having to simulate everything, hence
+reducing the computational cost.
+
+
+%This section provides a summary of initial performance benchmarks performed in the context of WP4. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+%
+%\begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+%\end{itemize}
+
+\subsubsection{Benchmark \#1: Visco-acoustic time-harmonic wave propagation}
+\label{subsec:WP4:Hawen:benchmark1}
+
+
+\paragraph{Description}
+The configuration for the inversion investigated in this benchmark
+is based upon the related benchmarks of \hawen~of WP1 and WP3,
+respectively \cref{subsec:WP1:Hawen:benchmark1,subsec:WP3:Hawen:benchmark1}.
+Here we use the acoustic case, and our target model is the one
+given in \cref{figure:hawen:seam-model}.
+For the reconstruction, we will take a smooth representation of the
+model, and vary the degree of smoothness to make the reconstruction
+harder.
+
+\paragraph{Input/Output Dataset Description}
+The input data for inversion consist of the recordings
+of waves near the boundary for different positions of the
+sources, as illustrated in panel a)
+of \cref{figure:hawen:wp4:inversion}. We will use
+synthetically generated waves and add noise to our dataset.
+In addition, we can provide different acquisition setups,
+i.e., varying the number of sources and the number of data points.
+
+
+\paragraph{Results Summary}
+The accuracy of the reconstruction will be evaluated depending
+on the number of iterations required. We will also try to use
+target-oriented inversion to limit the computational cost by
+only inverting part of the domain.
+In this case, a well-designed discrepancy criterion can be
+employed to fully exploit the dataset, cf. our previous works
+\cite{Faucher2019FRgWIGeo,Faucher2020DAS}.
+
+%\begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\end{itemize}
+
+
+\subsubsection{Benchmark \#2: Visco-elastic time-harmonic wave propagation}
+\label{subsec:WP4:Hawen:benchmark2}
+
+This benchmark will be similar to the above but considers an elastic medium,
+hence drastically increasing the computational cost. For inversion, an
+additional difficulty is that more model parameters need to be reconstructed,
+which increases the uncertainties and difficulties.
\subsection{12-Month Roadmap}
\label{sec:WP4:Hawen:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
+We wish to combine the results of WP1 and WP3,
+\cref{sec:WP1:Hawen:software,sec:WP3:Hawen:software},
+to obtain the most efficient configuration for inversion.
+This includes the use of high-order polynomials which, in turn,
+requires sufficient flexibility in the representation
+of the model on the discretized domain to obtain adequate
+resolution. In addition, the choice of configuration for the
+underlying optimization method (e.g., the choice of misfit function)
+is investigated.
+Once the numerical setup is designed, everything will be
+made available on a dedicated repository for reproducibility,
+see \cref{tab:WP4:Hawen:bottlenecks}.
-In~\cref{tab:WP4:Hawen:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+%In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+%\begin{itemize}
+% \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
+% \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+% \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+%\end{itemize}
+% In~\cref{tab:WP4:Hawen:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
\begin{table}[h!]
\centering
@@ -137,13 +244,21 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity
+ & Use flexible representation of the unknowns to maintain
+ the resolution in inversion, while still using discretization
+ based on large cells to benefit from the HDG method.
+ \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation
+ & Benchmarks are made available via online repositories.\\
+\rowcolor{white} B6 - Data Management
+ & Datasets will be available within the repository. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms
+ & Enable different methods to carry out the
+ iterative optimization. \\
\end{tabular}
}
}
\caption{WP4: Hawen plan with Respect to Relevant Bottlenecks}
\label{tab:WP4:Hawen:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/hawen/hawen.tex b/software/hawen/hawen.tex
index 1b9a119..e91de39 100644
--- a/software/hawen/hawen.tex
+++ b/software/hawen/hawen.tex
@@ -39,21 +39,36 @@ \section{Software: Hawen}
\caption{Hawen Information}
\end{table}
+% ---------------------------------------------
+\newcommand{\hawen}{\textsc{Hawen}}
\subsection{Software summary}
\label{sec:Hawen:summary}
-Detailed overview not available.
+% ---------------------------------------------
+
+The software \hawen~(\url{https://ffaucher.gitlab.io/hawen-website/})
+considers the time-harmonic modeling of mechanical waves, and the
+associated quantitative inverse wave problems.
+The code uses the Hybridizable Discontinuous Galerkin (HDG) method
+for the discretization.
+It relies on a nonlinear iterative minimization algorithm to solve
+the quantitative inverse wave problem.
+The code is written in \texttt{Fortran90} and uses \texttt{MPI}
+and \texttt{OpenMP} parallelism.
\subsection{Purpose}
\label{sec:Hawen:purpose}
-Purpose not available.
+
+\hawen~solves large-scale inverse wave problems in the frequency domain,
+with an emphasis on applications in the context of Earth imaging and helioseismology.
+
\subsection{Programming and Computational Environment}
\label{sec::Hawen:environment_capabilities}
-The following table summarizes these aspects for Hawen, providing a view of its programming and computational capabilities.
+The following \cref{table:hawen-environment} summarizes these aspects for \hawen, providing a view of its programming and computational capabilities.
\begin{table}[h!]
\centering
@@ -66,16 +81,21 @@ \subsection{Programming and Computational Environment}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
\rowcolor{white}Languages & \begin{tabular}{l}
Fortran\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+\end{tabular} &
+\hawen~is developed in Fortran90. \\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
MPI\\
Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+\end{tabular} & \hawen relies onto MPI and multithreading for parallelism.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
Gmsh and associated formats\\
VTK\\
-in-house format\\
-\end{tabular} & Data formats that the software can handle or produce.\\
+in-house binary format\\
+\end{tabular} & Data formats that the software can handle or produce.
+Gmsh format can be used for input mesh.
+VTK or binary format can be used to save the results such as the
+wave propagation solutions.
+\\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
None\\
\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
@@ -86,33 +106,122 @@ \subsection{Programming and Computational Environment}
None\\
\end{tabular} & Software packaging and distribution.\\
\rowcolor{white}Testing & \begin{tabular}{l}
-None\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+Analytic solutions\\
+\end{tabular} & Testing methodologies employed to ensure software quality and correctness:
+Analytic solutions for the propagation of acoustic and elastic waves, respectively
+\cref{eq:hawen:viscoacoustic,eq:hawen:viscoelastic} exist and are compared
+with the numerical resolutions. For instance we refer to \cite{pham_numerical_2024} for
+the validation of the linear elasticity propagator.
+\\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
None\\
\end{tabular} & Container technologies used to package and deploy the software.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
-MUMPS\\
-\end{tabular} & List of software Hawen has interfaces with.\\
+MUMPS\\
+\end{tabular} & List of software Hawen has interfaces with.
+ MUMPS is used to solve the linear system resulting from
+ the numerical discretization. Metis is used to partition
+ the mesh. ARPACK and PARPACK are optional dependencies
+ used for eigenproblems.\\
\bottomrule
\end{tabular}
}}
\caption{Hawen programming and computational environment}
+ \label{table:hawen-environment}
\end{table}
\subsection{Mathematics}
\label{sec:Hawen:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+\paragraph{Forward problem} \hawen~allows for the modeling of time-harmonic
+mechanical waves for different types of media.
+By using the HDG method for the discretization, we work with first-order systems.
+In the case of visco-acoustic wave propagation, the forward problem consists in
+finding the scalar pressure field $p$ and vector velocity field $\boldsymbol{v}$
+such that
+\begin{subequations}\label{eq:hawen:viscoacoustic}
+\begin{empheq}[left={\empheqlbrace}]{align}
+ &-\mathrm{i}\omega \rho \boldsymbol{v} \,+\, \nabla p \,=\, 0 \,, \\
+ &-\dfrac{\mathrm{i}\omega}{\kappa} p \,+\, \nabla\cdot\boldsymbol{v} \,=\, f \,,
+\end{empheq}\end{subequations}
+where $\omega$ is the angular frequency, and the medium is characterized
+by the density $\rho$ and the bulk modulus $\kappa$. The source term is
+$f$.
+We further refer to \cite{Faucher2020adjoint,Faucher2023viscoacoustic}
+for more details.
+The code allows for different types of boundary conditions, such as
+imposing the Dirichlet trace, the Neumann trace, or a Robin-type condition
+(e.g., for a radiation condition).
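+For instance (with the time convention of \cref{eq:hawen:viscoacoustic}
+and up to sign conventions), a first-order Robin-type radiation condition
+on the outer boundary couples the two unknowns through
+\begin{equation*}
+p \,-\, \rho\,c\;\boldsymbol{v}\cdot\boldsymbol{n} \,=\, 0
+\qquad\text{on } \partial\Omega\,,\qquad c=\sqrt{\kappa/\rho}\,,
+\end{equation*}
+which lets outgoing plane waves leave the computational domain with
+(ideally) no reflection.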
+
+
+In the case of linear visco-elastic wave propagation, the forward problem consists in
+finding the displacement field $\boldsymbol{u}$ and stress tensor $\boldsymbol{\sigma}$
+solutions to, cf., e.g., \cite{pham_numerical_2024},
+\begin{subequations}\label{eq:hawen:viscoelastic}
+\begin{empheq}[left={\empheqlbrace}]{align}
+ &-\omega^2\rho\boldsymbol{u} \,-\, \nabla\cdot\boldsymbol{\sigma}\,=\,\boldsymbol{f} \,, \\
+ & \boldsymbol{\sigma} \,=\, \boldsymbol{C} \, \big(\nabla\boldsymbol{u} \,+\, (\nabla\boldsymbol{u})^t\big) \,.
+\end{empheq}\end{subequations}
+Here, the medium is characterized by the density
+$\rho$ and the stiffness tensor $\boldsymbol{C}$,
+we refer to \cite{pham_numerical_2024}
+for more details related to HDG implementation.
+
+In the context of helioseismology, we consider the scalar-wave propagation
+which comes from a simplification of Galbrun's equation,
+cf.~\cite{Pham2020Siam,pham_assembling_2024}. In three dimensions, the
+simplest form corresponds to finding the solution $u$ to
+\begin{equation} \label{eq:hawen:helio-scalar}
+  \Big( \,-\Delta \,-\,\dfrac{\omega^2}{c^2} \,+\, q \Big) \, u \,=\, f\,,
+\end{equation}
+with potential $q$, sound-speed $c$ and source term $f$.
+In addition, \hawen~solves the problem under the assumption of
+spherical symmetry and we refer to \cite{Pham2020Siam} for more
+details.
+Several choices of radiation boundary conditions have been
+implemented and tested, cf.~\cite{Pham2019radiationBC,Pham2020Siam}.
+Recently, more complete equations to model the Sun have
+been implemented in \hawen, see \cite{pham_assembling_2024}.
+
+\paragraph{Inversion}
+\hawen~solves the inverse problems for the reconstruction of the
+model parameters from the measurements of waves. For instance,
+considering the acoustic wave equation \cref{eq:hawen:viscoacoustic},
+it consists in finding the density $\rho$ and bulk modulus $\kappa$
+from the observations of the pressure $p$ and/or velocity field $\boldsymbol{v}$
+at a discrete set of positions.
+\hawen~employs a nonlinear iterative minimization algorithm for the
+reconstruction of the properties, as illustrated in
+\cite{faucher_hawen_2021,Faucher2019FRgWIGeo,Faucher2020DAS,Faucher2023viscoacoustic}.
+This approach is typically referred to as \emph{Full Waveform Inversion}
+in the context of seismic imaging, cf.~\cite{Virieux2009} for a review.
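+
+In its simplest least-squares form (generic notation for illustration),
+the misfit minimized by the iterative algorithm reads
+\begin{equation*}
+\min_{\rho,\,\kappa}\; J(\rho,\kappa) \,=\,
+\frac{1}{2}\sum_{s}\big\|\mathcal{R}\,p_s(\rho,\kappa)\,-\,d_s\big\|^{2}\,,
+\end{equation*}
+where the sum runs over the sources $s$, $p_s$ solves
+\cref{eq:hawen:viscoacoustic} for source $s$, $\mathcal{R}$ restricts the
+wavefield to the receiver positions, and $d_s$ denotes the corresponding
+measurements; alternative discrepancy criteria are considered in, e.g.,
+\cite{Faucher2019FRgWIGeo,Faucher2020DAS}.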
\subsection{Relevant Publications}
\label{sec:Hawen:publications}
Here is a list of relevant publications related to the software:
+\begin{itemize}
+\item \cite{faucher_hawen_2021}: Reference of the software in the journal of open-source software;
+\item \cite{Faucher2020adjoint}:
+ Mathematical details of the adjoint-state method in the framework
+ of Hybridizable Discontinuous Galerkin discretization method.
+ It provides the computational steps for the implementation of the
+ inverse problem.
+\item \cite{pham_numerical_2024}: Details of the numerical implementation
+ of the HDG method for anisotropic elasticity.
+\item \cite{Faucher2019FRgWIGeo,Faucher2020DAS}:
+ Use of the software in the context of seismic imaging.
+\item \cite{Pham2020Siam,pham_assembling_2024}:
+ Use of the software in the context of helioseismology.
+\item \cite{Faucher2023viscoacoustic}:
+ Use of the software in the context of viscoacoustics ultrasound imaging.
+\item \cite{Liu2024,Benitez2024}:
+ Use of the software in the context of data science and
+ for benchmarks.
+\end{itemize}
\subsection{Acknowledgements}
@@ -120,9 +229,18 @@ \subsection{Acknowledgements}
The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-Acknowledgements not available.
+\begin{itemize}
+ \item Since 2021, F. Faucher has been part of the team Makutu of INRIA Bordeaux, at the
+ University of Pau and Pays de l'Adour.
+ \item 2024--2029, F. Faucher acknowledges support of the European Research Council
+ with ERC-StG Project INCORWAVE -- grant 101116288.
+ \item 2024--2027, \hawen~is used in the framework of the ANR-DFG project BUTTERFLY,
+ grant number ANR-23-CE46-0009-01.
+ \item 2023--2024, \hawen~is used in the GENCI Grand Challenges
+ on the supercomputer Adastra’s GENOA partition
+ with project gda2306.
+ \item 2019--2021, F. Faucher acknowledges funding by the Austrian Science Fund (FWF)
+ under the Lise Meitner grant allocation M2791-N.
+\end{itemize}
diff --git a/software/hpddm/WP3/WP3.tex b/software/hpddm/WP3/WP3.tex
index 1bdec53..de55281 100644
--- a/software/hpddm/WP3/WP3.tex
+++ b/software/hpddm/WP3/WP3.tex
@@ -55,13 +55,12 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} "reuse of Krylov subspaces for multiple right-hand sides" & provide short description here \\
-\rowcolor{numpexlightergray} domain decomposition methods & provide short description here \\
-\rowcolor{white} interface & provide short description here \\
-\rowcolor{numpexlightergray} low-rank & provide short description here \\
-\rowcolor{white} multi-precision & provide short description here \\
-\rowcolor{numpexlightergray} randomization & provide short description here \\
-\rowcolor{white} tensor calculus & provide short description here \\
+\rowcolor{white} reuse of Krylov subspaces for multiple right-hand sides & By reusing information from previously computed Krylov subspaces, such as the basis vectors, the computational cost is reduced, and convergence can be faster. This is particularly useful in applications such as time-dependent simulations and parametric studies. \\
+\rowcolor{numpexlightergray} domain decomposition methods & Numerical techniques used to solve large-scale problems, typically arising from the discretization of partial differential equations (PDEs), by dividing the problem domain into smaller subdomains. Each subdomain is solved independently, often in parallel, and then the solutions are combined to obtain the global solution.\\
+\rowcolor{white} interface & This ensures that various Exa-MA libraries can work together efficiently, sharing data and results. It typically involves standardized APIs, data formats, and compatibility protocols to enable seamless integration.\\
+\rowcolor{numpexlightergray} low-rank & Approximation of large, complex matrices with simpler matrices that have lower ranks, while preserving their most important features. \\
+\rowcolor{white} multi-precision & Performing numerical calculations with variable precision, beyond the standard single or double precision typically used in computing. It allows for higher or lower levels of precision depending on the needs of the problem, enabling more accurate results for sensitive computations or faster performance when lower precision is sufficient. \\
+\rowcolor{numpexlightergray} randomization & Random sampling techniques to approximate solutions to large-scale matrix problems more efficiently. By leveraging randomness, algorithms can reduce the computational cost and memory requirements of tasks such as matrix decompositions. \\
\end{tabular}
}
}
@@ -75,55 +74,50 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item MPI, OpenMP, and CUDA.
+ \item Most French supercomputers from GENCI, Fugaku@RIKEN, and other supercomputers worldwide (mostly when used through PETSc).
+ \item The library relies on already-distributed inputs, e.g., a matrix; it cannot be used to perform the data decomposition itself.
+ \item \textbf{Scalability:} multi-level domain decomposition methods are highly scalable numerically with a very aggressive coarsening factor (compared to, e.g., multigrid methods). Special care must be taken to implement these efficiently, for example to ensure proper load-balancing.
+ \item \textbf{Integration with Other Systems:} through its interfaces to PETSc and FreeFEM, the software is readily integrated into other higher-level libraries used within the Exa-MA framework.
\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP3:HPDDM:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+ \item \textbf{Overall Performance:} The scalability of the library in its current state has been reported on numerous occasions. As part of the Exa-MA framework, we will provide a standalone benchmark, detailed next.
+ \item \textbf{Input/Output Dataset:}
\begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
+ \item Input matrix and right-hand side in PETSc binary format.
+ \item Output solution vector in PETSc binary format.
\end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{open-data Access:} The files will be made openly accessible, though they are not currently.
+ \item \textbf{Challenges:} Scalable input/output for large process counts, proper parameters for multi-level preconditioning.
+ \item \textbf{Future Improvements:} Setting suitable default parameters for building efficient multi-level domain decomposition preconditioners.
\end{itemize}
\subsubsection{Benchmark \#1}
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+ \item \textbf{Description:} A discrete fracture network (DFN) is a modeling approach used to represent the geometry and connectivity of fractures in rock masses. It treats fractures as discrete features within a larger rock domain, often simulating the flow of fluids or transport of particles through these fractures. This requires the solution of large symmetric positive definite systems, with hundreds of millions of unknowns.
+ \item \textbf{Benchmarking Tools Used:} PETSc \verb!-log_view! will be used to measure execution time and FLOPS (see the illustrative sketch after this list).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
\end{itemize}
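+
+As a hedged illustration of how such a run could be driven (assuming petsc4py
+and a PETSc build configured with HPDDM support; file names are placeholders,
+and additional options or an auxiliary local matrix may be needed to activate
+the multi-level coarse correction), a minimal Python sketch reads:
+\begin{verbatim}
+# Load a matrix and right-hand side in PETSc binary format and solve
+# with the HPDDM Krylov solver and preconditioner (sketch only).
+import sys
+import petsc4py
+petsc4py.init(sys.argv)        # forward options such as -log_view to PETSc
+from petsc4py import PETSc
+
+A = PETSc.Mat().load(PETSc.Viewer().createBinary("A.dat", "r"))
+b = PETSc.Vec().load(PETSc.Viewer().createBinary("b.dat", "r"))
+x = b.duplicate()
+
+ksp = PETSc.KSP().create(A.getComm())
+ksp.setOperators(A)
+ksp.setType("hpddm")           # KSPHPDDM
+ksp.getPC().setType("hpddm")   # PCHPDDM domain decomposition preconditioner
+ksp.setFromOptions()
+ksp.solve(b, x)
+\end{verbatim}
+Launched under MPI with \verb!-log_view!, such a script directly produces the
+execution time and FLOPS measurements mentioned above.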
\subsection{12-Month Roadmap}
\label{sec:WP3:HPDDM:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Data Improvements:} Modify \verb!PCView_HPDDM()! to avoid writing three files per subdomain and instead write a single file for all subdomains, with the data grouped into a single \verb!Mat! and a single \verb!IS!. This will greatly reduce the number of input/output operations.
+ \item \textbf{Methodology Application:} Make sure that HPDDM can handle matrices stored on device using cuSPARSE.
+ \item \textbf{Results Retention:} Online on GitHub in the NumPEx organization.
\end{itemize}
In~\cref{tab:WP3:HPDDM:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -144,13 +138,13 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & provide suitable default parameters to make preconditioners easier to use \\
+ \rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & ensure randomized algorithms within the preconditioner are sharp enough to not hinder the global (outer) convergence \\
+\rowcolor{white} B6 - Data Management & keep the volume of data reasonable and make sure the inputs and outputs can be reused among different architectures \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & multi-level preconditioning and efficient porting of subdomain solvers on accelerators \\
\end{tabular}
}
}
\caption{WP3: HPDDM plan with Respect to Relevant Bottlenecks}
\label{tab:WP3:HPDDM:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/hpddm/hpddm.tex b/software/hpddm/hpddm.tex
index 6a8c537..ddc6b03 100644
--- a/software/hpddm/hpddm.tex
+++ b/software/hpddm/hpddm.tex
@@ -24,7 +24,7 @@ \section{Software: HPDDM}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU or GPU\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods}{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods} \\
+ \rowcolor{white}\textbf{Repository} & \href{https://github.com/hpddm/hpddm}{https://github.com/hpddm/hpddm} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS:: LGPL v*\\
\end{tabular} \\
@@ -42,13 +42,21 @@ \section{Software: HPDDM}
\subsection{Software summary}
\label{sec:HPDDM:summary}
-Detailed overview not available.
-
+HPDDM (High-Performance unified framework for Domain Decomposition Methods) is a library of
+advanced Krylov methods and multilevel domain decomposition methods, used as preconditioners, designed for high-performance computing.
+The available Krylov methods include CG, block CG, GMRES, and block GMRES.
+The available domain decomposition methods include one- and multi-level restricted additive Schwarz (RAS) methods.
+The efficiency of these methods has been shown on elliptic PDEs such as the scalar diffusion equation, the linear elasticity equations,
+or the Helmholtz equation for wave propagation.
+The library can be used through various scientific computing software:
+PETSc, SLEPc, FreeFEM, Feel++, or HTOOL.
\subsection{Purpose}
\label{sec:HPDDM:purpose}
-Purpose not available.
+
+HPDDM aims at providing the scientific community with a set of robust and efficient domain decomposition methods
+to solve linear systems in parallel.
\subsection{Programming and Computational Environment}
\label{sec::HPDDM:environment_capabilities}
@@ -70,38 +78,40 @@ \subsection{Programming and Computational Environment}
C++\\
Fortran\\
Python\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+\end{tabular} & The software is header-only in C++ but there are bindings to other languages and libraries.\\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
GPU\\
MPI\\
Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+\end{tabular} & Subdomains are typically bound to an MPI process, which can then exploit OpenMP or CUDA.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
in-house format\\
-\end{tabular} & Data formats that the software can handle or produce.\\
+\end{tabular} & The software mostly relies on the data formats of calling libraries, e.g., PETSc.\\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
+\end{tabular} & Currently no recovery mechanisms.\\
\rowcolor{white}DevOps & \begin{tabular}{l}
Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+\end{tabular} & GitHub Actions is used for running a test suite with MPI and CUDA examples. Codecov is used for the code coverage. \\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
None\\
-\end{tabular} & Software packaging and distribution.\\
+\end{tabular} & Available in Spack, MacPorts, PETSc, SLEPc, and FreeFEM.\\
\rowcolor{white}Testing & \begin{tabular}{l}
Unit\\
Verification\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+\end{tabular} & Regression tests with respect to number of iterations, time to solution, and other preconditioner metrics.\\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
+\end{tabular} & Not applicable.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
Feel++\\
Freefem++\\
MUMPS\\
PETSc\\
PaStiX\\
-\end{tabular} & List of software HPDDM has interfaces with.\\
+HTOOL\\
+\\
+\end{tabular} & The library can use PaStiX and MUMPS. The library is used by Feel++, Freefem++, PETSc, and HTOOL, all through the C++ API.\\
\bottomrule
\end{tabular}
}}
@@ -112,25 +122,29 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:HPDDM:mathematics}
-Mathematics not available.
-
-In this section, provide a summary the mathematics used in the software.
-
+\fullcite{dolean_introduction_2015} provides the mathematical foundation of the library, most importantly when it comes to the definition of parameter-robust preconditioners for symmetric positive definite systems.
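+For reference, the classical one- and two-level additive Schwarz
+preconditioners analyzed in that monograph can be written schematically as
+(with $R_i$ the restriction to the $i$-th overlapping subdomain, $R_0$ the
+restriction to the coarse space, and $A_i = R_i A R_i^T$):
+\begin{equation*}
+  M^{-1}_{1} \,=\, \sum_{i=1}^{N} R_i^T A_i^{-1} R_i\,,
+  \qquad
+  M^{-1}_{2} \,=\, R_0^T A_0^{-1} R_0 \,+\, \sum_{i=1}^{N} R_i^T A_i^{-1} R_i\,.
+\end{equation*}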
\subsection{Relevant Publications}
\label{sec:HPDDM:publications}
Here is a list of relevant publications related to the software:
-
+\begin{itemize}
+\item \fullcite{jolivet_scalable_2013} presents a two-level domain decomposition preconditioner tested to solve elliptic problems in parallel with up to 22 billion unknowns in 2D and 2 billion unknowns in 3D.
+\item \fullcite{jolivet_block_2016} presents block iterative methods and recycling strategies to solve linear systems with multiple right-hand sides and millions of unknowns while preserving good scalability.
+\item \fullcite{jolivet_ksphpddm_2021} presents KSPHPDDM (Krylov methods) and PCHPDDM (preconditioners) available in PETSc, when configured with HPDDM support.
+\end{itemize}
+
\subsection{Acknowledgements}
\label{sec::HPDDM:acknowledgements}
The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-Acknowledgements not available.
-
-
+\begin{itemize}
+\item Sorbonne Université
+\item Institut de Recherche en Informatique de Toulouse
+\item INRIA
+\item ANR
+\item Eidgenössische Technische Hochschule Zürich
+\item Université Grenoble Alpes
+\end{itemize}
diff --git a/software/hpdomain-decomposition-methods/WP3/WP3.tex b/software/hpdomain-decomposition-methods/WP3/WP3.tex
deleted file mode 100644
index aac95b5..0000000
--- a/software/hpdomain-decomposition-methods/WP3/WP3.tex
+++ /dev/null
@@ -1,97 +0,0 @@
-\section{Software: HPdomain decomposition methods}
-\label{sec:WP3:HPdomain decomposition methods:software}
-
-\begin{table}[h!]
- \centering
- { \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
- \rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-None\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
-Inria PARIS\\
-Sorbonne U\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-pierre@joliv.et\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
-CPU\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods}{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods} \\
- \rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-OSS:: LGPL v*\\
-\end{tabular} \\
- \bottomrule
- \end{tabular}
- }}
- \caption{WP3: HPdomain decomposition methods Information}
-\end{table}
-
-\subsection{Software Overview}
-\label{sec:WP3:HPdomain decomposition methods:summary}
-
-Provide a brief overview of the software with respect to WP3.
-
-\begin{table}[h!]
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {
- \fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} domain decomposition methods & provide short description here \\
-\rowcolor{numpexlightergray} multi-precision & provide short description here \\
-\rowcolor{white} tensor calculus & provide short description here \\
-\rowcolor{numpexlightergray} "reuse of Krylov subspaces for multiple right-hand sides" & provide short description here \\
-\rowcolor{white} randomization & provide short description here \\
-\rowcolor{numpexlightergray} low-rank & provide short description here \\
-\rowcolor{white} interface & provide short description here \\
-\end{tabular}
- }
- }
- \caption{WP3: HPdomain decomposition methods Features}
-\end{table}
-
-
-\subsection{Parallel Capabilities}
-\label{sec:WP3:HPdomain decomposition methods:performances}
-
-
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries and middleware in the Exa-MA framework.
-\end{itemize}
-
-\subsection{Initial Performance Metrics}
-\label{sec:WP3:HPdomain decomposition methods:metrics}
-
-In this section, provide a summary of the initial performance metrics of the software with respect to WP3.
-You can list one or more benchmarks and their associated results, the challenges , bottlenecks and the expectations for future versions of the software.
-
-
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item describe the benchmark
- \item \textbf{Benchmarking Tools Used:} Describe the tools used for parallel benchmarking and the metrics mesured
- \item \textbf{Results Summary:} Results summary not available.
- \item \textbf{Challenges Identified:} No challenges identified.
-\end{itemize}
-
-\subsection{12 months roadmap}
-\label{sec:WP3:HPdomain decomposition methods:roadmap}
-
-Describe the planned improvements for the software in the context of WP3 and Exa-MA for the year to come that will be adressed in the next version of this deliverable.
\ No newline at end of file
diff --git a/software/hpdomain-decomposition-methods/hpdomain-decomposition-methods.tex b/software/hpdomain-decomposition-methods/hpdomain-decomposition-methods.tex
deleted file mode 100644
index fbb0396..0000000
--- a/software/hpdomain-decomposition-methods/hpdomain-decomposition-methods.tex
+++ /dev/null
@@ -1,129 +0,0 @@
-\section{Software: HPdomain decomposition methods}
-\label{sec:HPdomain decomposition methods:software}
-
-
-
-\begin{table}[h!]
- \centering
- { \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
- \rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-None\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
-Inria PARIS\\
-Sorbonne U\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-pierre@joliv.et\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
-CPU\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods}{https://github.com/hpdomain decomposition methods/hpdomain decomposition methods} \\
- \rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-OSS:: LGPL v*\\
-\end{tabular} \\
- \bottomrule
- \end{tabular}
- }}
- \caption{HPdomain decomposition methods Information}
-\end{table}
-
-\subsection{Software summary}
-\label{sec:HPdomain decomposition methods:summary}
-Detailed overview not available.
-
-
-
-\subsection{Purpose}
-\label{sec:HPdomain decomposition methods:purpose}
-Purpose not available.
-
-\subsection{Programming and Computational Environment}
-\label{sec::HPdomain decomposition methods:environment_capabilities}
-
-
-The following table summarizes these aspects for HPdomain decomposition methods, providing a view of its programming and computational capabilities.
-
-\begin{table}[h!]
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
- \rowcolor{white}Languages & \begin{tabular}{l}
-C\\
-C++\\
-Fortran\\
-Python\\
-\end{tabular} & Programming languages and language standards supported by the software \\
- \rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
-MPI\\
-Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
- \rowcolor{white}Data Formats & \begin{tabular}{l}
-in-house format\\
-\end{tabular} & Data formats that the software can handle or produce.\\
- \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
-None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
- \rowcolor{white}DevOps & \begin{tabular}{l}
-Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
- \rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-None\\
-\end{tabular} & Software packaging and distribution.\\
- \rowcolor{white}Testing & \begin{tabular}{l}
-Unit\\
-Verification\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
- \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
- \rowcolor{white}Interfaces & \begin{tabular}{l}
-Feel++\\
-Freefem++\\
-MUMPS\\
-PETSc\\
-PaStIX\\
-\end{tabular} & List of software HPdomain decomposition methods has interfaces with.\\
- \bottomrule
- \end{tabular}
- }}
- \caption{HPdomain decomposition methods programming and computational environment}
-\end{table}
-
-
-
-\subsection{Mathematics}
-\label{sec:HPdomain decomposition methods:mathematics}
-Mathematics not available.
-
-In this section, provide a summary the mathematics used in the software.
-
-
-\subsection{Relevant Publications}
-\label{sec:HPdomain decomposition methods:publications}
-
-Here is a list of relevant publications related to the software:
-
-
-\subsection{Acknowledgements}
-\label{sec::HPdomain decomposition methods:acknowledgements}
-
-The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-
-Acknowledgements not available.
-
-
diff --git a/software/manta/WP3/WP3.tex b/software/manta/WP3/WP3.tex
index ecb0116..fdde490 100644
--- a/software/manta/WP3/WP3.tex
+++ b/software/manta/WP3/WP3.tex
@@ -10,7 +10,7 @@ \section{Software: MANTA}
\begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
\rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-CEA + consortium in development (see EUROPLEXUS)\\
+CEA + consortium in development\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
CEA\\
@@ -21,9 +21,11 @@ \section{Software: MANTA}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU Only\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{None}{None} \\
+ \rowcolor{white}\textbf{Repository} & \begin{tabular}{l}
+In progress\\
+\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-None\\
+In progress\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
B10 - Scientific Productivity\\
@@ -54,7 +56,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} multiphysics coupling & provide short description here \\
+\rowcolor{white} direct solver & MUMPS through PETSc \\
+\rowcolor{numpexlightergray} Krylov solver & many solvers through PETSc, including multigrid and fieldsplit for saddle-point problems \\
+\rowcolor{white} multiphysics coupling & MANTA is intended to be coupled with other software (a fluid solver, for instance) through the ICoCo standard (\href{https://github.com/cea-trust-platform/icoco-coupling}{https://github.com/cea-trust-platform/icoco-coupling}), as a reference component whose performance is to be preserved in the partitioned coupling \\
\end{tabular}
}
}
@@ -68,55 +72,38 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item MANTA is intended to run in full MPI (CPU) mode for the first tasks in Exa-MA. Perspectives for GPU and hybrid CPU-GPU modes are expected as a second step.
+ \item The parallel computation environments are personal computers, clusters, and supercomputers, e.g., Adastra (CINES), Jean-Zay (IDRIS), and Topaze (CCRT).
+ \item \textbf{Integration with Other Systems:} MANTA strongly integrates with the linear algebra library PETSc developed and improved in the Exa-MA framework. It thus serves for the practical evaluation of the overall performance gain provided by the project in representative physical cases of interest in the field of Computational Mechanics.
\end{itemize}
\subsection{Initial Performance Metrics}
\label{sec:WP3:MANTA:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+Building the reference metrics for MANTA, in good connection with the work scheduled within WP3, is a work in progress, with the actual tests to be selected and packaged during the next year. They are intended to emphasize the link between MANTA and PETSc in the field of computational mechanics, with the following strategy:
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{Test(s) with explicit time integration:} They will test the \textit{assembly} steps of the problem and the data transfer between processes within PETSc, which must be separated from the solver performance for a correct understanding of the global performance results in very complex cases.
+ \item \textbf{Test(s) with implicit time integration:} They will test the actual performance of the PETSc solver, with two classes of problem of great interest:
+ \begin{itemize}
+ \item \textbf{Positive Definite Problem(s):} The focus will be given here to very large problems with poor conditioning of the system, for instance due to different materials in the model with several orders of magnitude between their respective mechanical properties. Extended analyses in this situation are currently carried out at CEA, in terms of both numerical performance and robustness of the solution. Some of these tests have been performed in full MPI mode over more than 15,000 cores, with scalability results yet to be fully packaged and documented.
+ \item \textbf{Saddle-point Problem(s):} For MANTA, these will mostly arise from large-scale contact problems and will test the capabilities of PETSc to handle such problems when the dual unknowns (i.e., contact forces seen as Lagrange multipliers) are not uniformly distributed in the model.
+ \end{itemize}
\end{itemize}
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP3:MANTA:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+The 12-Month Roadmap for MANTA is mainly dedicated to the selection of the relevant tests following the guidelines of the previous section and their implementation in the methodology proposed in \cref{sec:methodology-types}, in terms of scalability analysis for CPU in a first step.\\
+\\
+Practically:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Data availability:} Data for the selected benchmarks will be made available in a format readable with open-source software only, namely the Salome environment (\href{https://www.salome-platform.org/}{https://www.salome-platform.org/}).
+ \item \textbf{Software availability: } An open version of the software will be made available for the benchmarks, with a possible limitation of the available features to cope with the license environment of MANTA, but providing all the necessary content for reliable and wide-range performance analyses.
+ \item \textbf{Methodology Application:} Implementation of the benchmarking methodology in terms of scalability measurements, with a particular focus on the selection of the relevant output data and associated tolerance for the reproducibility guarantee.
+ \item \textbf{Results Retention:} Benchmark results will be stored and made available with the suitable level of metadata and documentation through a dedicated repository proposed by the Exa-MA project.
\end{itemize}
In~\cref{tab:WP3:MANTA:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -137,10 +124,10 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Accelerate the access to large scale numerical results to enhance knowledge and speed-up engineering operations in the field of computational mechanics. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Ensure the reliability of the results for large scale and complex simulation independently from the computer and the selected run-mode (CPU initially, GPU and hybrid as perspectives for the future). \\
+\rowcolor{white} B6 - Data Management & This bottleneck is indirectly addressed for MANTA through the availability of the benchmark datasets and the collection of the results. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Benefit from the latest improvement in linear algebra libraries, quantify the practical gains on representative applications and track/characterize the remaining bottlenecks for the global performance. Use MANTA as a reference component for the analysis of the performance of partitioned coupling at exascale.\\
\end{tabular}
}
}
diff --git a/software/manta/manta.tex b/software/manta/manta.tex
index 1635e6d..1d9652b 100644
--- a/software/manta/manta.tex
+++ b/software/manta/manta.tex
@@ -25,7 +25,7 @@ \section{Software: MANTA}
\end{tabular} \\
\rowcolor{white}\textbf{Repository} & \href{None}{None} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-None\\
+GPL-V3 (we may switch to LGPL)\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
B10 - Scientific Productivity\\
@@ -41,19 +41,56 @@ \section{Software: MANTA}
\subsection{Software summary}
\label{sec:MANTA:summary}
-Detailed overview not available.
+MANTA (Mechanical Numerical Toolbox for advanced Application) is an open-source
+effort from the French Alternative Energies and Atomic Energy Commission (CEA) to develop a
+multiphysics solver for quasi-static and fast-transient simulations of fluids and solids. MANTA aims
+to replace the 40-year-old Cast3M and Europlexus solvers and to provide broader physical modeling
+capabilities using up-to-date technologies. It is intended for use in a massively parallel computation context.
+MANTA's functionalities are built over a very generic ``core layer'' which should be able to deal
+with any set of PDEs and any ``mesh-based'' numerical method. Thus, although it is developed mainly in the
+field of mechanics, it can easily be used for other physics.
\subsection{Purpose}
\label{sec:MANTA:purpose}
-Purpose not available.
+
+The project has been designed to meet the following objectives:
+\begin{itemize}
+ \item Being able to simulate complex industrial systems: this implies great flexibility to handle the complexity of an industrial system in a single calculation.
+ \item High performance computing.
+ \item "Automatic parallelism": new functionalities should be developed without bothering about parallelism.
+ \item Provide a clean, simple and stable Application Programming Interface (API) in C++ and python.
+ \item Generic and flexible to be used by researchers in other fields of numerical methods than mechanics.
+ \item Quality assurance, robustness and reliability compatible with safety-critical studies in the
+nuclear industry.
+ \item Maintainability over decades.
+\end{itemize}
+
+MANTA targets two main kinds of users:
+\begin{itemize}
+ \item The mechanical engineers or researchers who exploit the output of numerical simulations
+to design or analyze physical systems of interest. For such users, MANTA provides
+a so-called end-user layer which offers a clean and easy API (both in C++ and Python).
+Most numerical details are hidden by default. Importantly, this API
+is meant to be very stable over time.
+ \item The researchers in the field of numerical methods who would like to implement and test
+various algorithms. The MANTA so-called core layer provides a generic and flexible way to
+implement a new unstructured-mesh-based numerical method dealing with a given set of
+Partial Differential Equations (PDEs).
+\end{itemize}
\subsection{Programming and Computational Environment}
\label{sec::MANTA:environment_capabilities}
+MANTA is developed using a very standard ``feature-branch'' collaborative workflow. The source code is hosted on \texttt{gitlab.com} (at this time in a private project, but soon in a public one).
+The code is built using CMake and uses Spack to manage its dependencies. A very modular CMake architecture, borrowed from VTK, has been developed: the source code is organized as a set of interdependent ``modules'' which can be enabled or disabled by the user through a configuration file. Enabling a module triggers the enabling of the modules on which it depends, the corresponding external third parties, sets of tests, etc.
+The code is tested in several ``test configurations'' (compiler suite, Linux distribution, compilation options, ...) inside Docker images whose definitions are stored in the source repository. From a quality assurance point of view, we should be able to recompile and retest any commit of the code in the same way as it has been validated by the CI process.
+The code is mainly developed in C++20 (at this time; we will follow new C++ standards in the future), and can handle some functions in Fortran.
-The following table summarizes these aspects for MANTA, providing a view of its programming and computational capabilities.
+MANTA can be run on any Linux machine, from a personal laptop to a large computation cluster. At this time it is not planned to port it to Windows. The code is currently MPI-only (only OpenMPI is supported in the build process at this time), but work on performance portability will start soon. MANTA can be used in several ways:
+\begin{itemize}
+ \item By cloning the repository and compiling it. In this case, we provide scripts to synchronize everything from a local machine connected to the internet to a computation cluster with no or limited internet access.
+ \item We provide an automatically updated Apptainer image in which MANTA is already built.
+ \item We are working on a Spack recipe that we will soon push to Spack's package database.
+\end{itemize}
\begin{table}[h!]
\centering
@@ -83,17 +120,28 @@ \subsection{Programming and Computational Environment}
Continuous Integration\\
\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-None\\
+Apptainer image, spack recipe\\
\end{tabular} & Software packaging and distribution.\\
\rowcolor{white}Testing & \begin{tabular}{l}
-Unit\\
+Non-regression\\
Verification\\
+Validation\\
+Use of several\\
+``test configurations''\\
+inside Docker images
\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-None\\
+Apptainer\\
\end{tabular} & Container technologies used to package and deploy the software.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
-None\\
+Most significant 3rd parties:\\
+PETSc, SLEPc\\
+moab\\
+zoltan\\
+eigen\\
+mfront/mgis\\
+vtk\\
+MEDcoupling\\
+nanobind
\end{tabular} & List of software MANTA has interfaces with.\\
\bottomrule
\end{tabular}
@@ -105,25 +153,62 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:MANTA:mathematics}
-Mathematics not available.
-
-In this section, provide a summary the mathematics used in the software.
+MANTA's "core layer" is fully generic to handle PDEs and related numerical methods which can be formalized as the assembling of distributed linear systems through the spatial integration on meshes, and their resolution. Some auxiliary linear systems can be attached to a linear system to introduce dual unknowns allowing to impose some constraints on the primal unknowns. We ends up with a saddle-point linear system of the form:
+
+{
+\newcommand{\mmm}[1]{\boldsymbol{#1}}
+\renewcommand{\v}[1]{\boldsymbol{#1}}
+\renewcommand{\t}[1]{\underline{#1}}
+\renewcommand{\d}[1]{\, \mathrm{d}#1}
+\begin{equation}
+ \begin{bmatrix}
+ \mmm{A}&\mmm{C}^T_1&\hdots&\mmm{C}^T_q \\
+ \mmm{C}^T_1&&& \\
+ \vdots &&\mmm{0}& \\
+ \mmm{C}^T_q &&&\\
+ \end{bmatrix}
+ \begin{bmatrix}
+ \v{X} \\
+ \v{\lambda}_1 \\
+ \vdots\\
+ \v{\lambda}_q \\
+ \end{bmatrix}
+ =
+ \begin{bmatrix}
+ \v{B} \\
+ \v{D}_1 \\
+ \vdots\\
+ \v{D}_q \\
+ \end{bmatrix}
+\end{equation}
+
+
+MANTA internally provides different methods to ``eliminate'' the dual unknowns $\v{\lambda}_p$ of any auxiliary linear system $p$: $\mmm{A}$ and $\v{B}$ are modified so that one obtains the exact or an approximate (depending on the method) solution $\v{X}$ of the problem, but with the unknowns $\v{\lambda}_p$ removed.
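+For instance, with a single auxiliary system ($q=1$) and $\mmm{A}$ invertible,
+one classical exact elimination is the Schur-complement solve (a standard
+identity, recalled here for illustration):
+\begin{equation*}
+ \v{\lambda}_1 \,=\, \big(\mmm{C}_1\mmm{A}^{-1}\mmm{C}_1^T\big)^{-1}\big(\mmm{C}_1\mmm{A}^{-1}\v{B} \,-\, \v{D}_1\big)\,,
+ \qquad
+ \v{X} \,=\, \mmm{A}^{-1}\big(\v{B} \,-\, \mmm{C}_1^T\v{\lambda}_1\big)\,.
+\end{equation*}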
+
+The linear systems are assembled by spatial integration over some sets of mesh entities in a very classical way:
+\begin{equation}
+ \mmm{M}=\sum_i \mathcal{A}_i \int_{E_i} \mmm{m}(\t{x}) \d{\t{x}}
+\end{equation}
+Here $\mathcal{A}_i$ is an ``assembling operator'' which maps local degrees of freedom of the entity $E_i$ to degrees of freedom of the problem, and $\mmm{m}$ is the integrand function whose value is a dense matrix. The integral evaluation is approximated by means of quadrature rules and mappings to reference cells:
+\begin{equation}
+    \mmm{M}=\sum_i \mathcal{A}_i \sum_{j} w_j\, \mmm{m}\big(\t{\phi}_i(\t{\xi}_j)\big)\, \big|\det \nabla\t{\phi}_i(\t{\xi}_j)\big| \,, \quad \text{where } \t{x} = \t{\phi}_i(\t{\xi}) \in E_i\,.
+\end{equation}
+The definition of the integrand $\mmm{m}$ and the assembling operator $\mathcal{A}_i$ are the two main entry points in the generic algorithm by which the new functionalities are developed.
+
+}
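+
+As a simplified illustration of this generic assembly pattern (a sketch only,
+not MANTA's C++ core layer; 1D piecewise-linear elements and a two-point Gauss
+rule are assumed), the mass matrix of a 1D mesh could be assembled as:
+\begin{verbatim}
+# Quadrature-based assembly of a 1D P1 mass matrix (illustration only).
+import numpy as np
+
+def assemble_mass(nodes):
+    n = len(nodes)
+    M = np.zeros((n, n))
+    xi = np.array([-1.0, 1.0]) / np.sqrt(3.0)  # reference quadrature points
+    w = np.array([1.0, 1.0])                   # quadrature weights
+    shape = lambda s: np.array([(1 - s) / 2, (1 + s) / 2])  # P1 shapes
+    for i in range(n - 1):                     # loop over entities E_i
+        h = nodes[i + 1] - nodes[i]            # Jacobian of phi_i is h/2
+        m_loc = sum(w[j] * np.outer(shape(xi[j]), shape(xi[j])) * h / 2
+                    for j in range(2))         # integrand times |det|
+        dofs = [i, i + 1]                      # assembling operator A_i
+        M[np.ix_(dofs, dofs)] += m_loc
+    return M
+\end{verbatim}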
\subsection{Relevant Publications}
\label{sec:MANTA:publications}
-
-Here is a list of relevant publications related to the software:
+\begin{itemize}
+ \item \fullcite{jamond_manta_2022}. MANTA : un code HPC généraliste pour la simulation de problèmes complexes en mécanique.
+ \item \fullcite{jamond_manta_2024}. MANTA: an industrial-strength open-source high performance explicit and implicit multi-physics solver.
+\end{itemize}
\subsection{Acknowledgements}
\label{sec::MANTA:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-
-Acknowledgements not available.
+MANTA is developed and funded by CEA.
diff --git a/software/pbb/WP5/WP5.tex b/software/pbb/WP5/WP5.tex
index 64968bb..707d0dd 100644
--- a/software/pbb/WP5/WP5.tex
+++ b/software/pbb/WP5/WP5.tex
@@ -21,14 +21,17 @@ \section{Software: pBB}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU or GPU\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://gitlab.inria.fr/jgmys/permutationbb}{https://gitlab.inria.fr/jgmys/permutationbb} \\
+ \rowcolor{white}\textbf{Repository} & \begin{tabular}{l}
+ \href{https://gitlab.inria.fr/jgmys/permutationbb}{https://gitlab.inria.fr/jgmys/permutationbb}\\
+ \href{https://github.com/Guillaume-Helbecque/P3D-DFS}{https://github.com/Guillaume-Helbecque/P3D-DFS} \\
+ \end{tabular} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS: Cecill-*\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
B10 - Scientific Productivity\\
B11 - Reproducibility and Replicability of Computation\\
-B6 - Data Management\\
+%B6 - Data Management\\
B7 - Exascale Algorithms\\
\end{tabular} \\
\bottomrule
@@ -40,7 +43,9 @@ \section{Software: pBB}
\subsection{Software Overview}
\label{sec:WP5:pBB:summary}
-pBB is initially an implementation of a massively parallel Branch\&Bound algorithm for the exact resolution of permutation-based optimization problems, like Permutation Flow-shop Scheduling (see https://gitlab.inria.fr/jgmys/permutationbb). pBB is designed using the bare-metal MPI+X approach. First, pBB has been extended to improve its genericity w.r.t optimization problems than can be solved, going beyond the permutation ones. A new data structure named P3D-DFS is proposed for that purpose. In addition, a PGAS-guided design approach is used to improve its software productivity-awareness (see https://github.com/Guillaume-Helbecque). The Chapel language is used for the implementation of pBB meeting these genericity and productivity objectives.
+pBB is initially an implementation of a massively parallel Branch-and-Bound (B\&B) algorithm for the exact resolution of permutation-based optimization problems, like Permutation Flow-shop Scheduling (see \url{https://gitlab.inria.fr/jgmys/permutationbb}). pBB is designed using the bare-metal MPI+X approach.
+First, pBB has been extended to improve its genericity w.r.t.\ the optimization problems that can be solved, going beyond permutation ones, e.g., to knapsack problems. A new data structure named distBag-DFS is proposed for that purpose.
+In addition, a PGAS-guided design approach is used to improve its software productivity awareness (see \url{https://github.com/Guillaume-Helbecque/P3D-DFS}). The Chapel language is used for this implementation of pBB, meeting these genericity and productivity objectives.
In~\cref{tab:WP5:pBB:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
@@ -56,7 +61,7 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} Iterative methods & provide short description here \\
+\rowcolor{white} Branch-and-Bound algorithms & Branch-and-Bound is a general-purpose algorithm for solving optimization problems. It systematically explores the solution space using four key operators: branching, bounding, selection, and pruning. Branching divides the problem into smaller subproblems. Bounding calculates bounds to estimate the potential of subproblems. Selection chooses the most promising subproblems for further exploration. Pruning discards subproblems that cannot yield a better solution than the current best (see the illustrative sketch after this table). \\
\end{tabular}
}
}
@@ -64,12 +69,11 @@ \subsection{Software Overview}
\label{tab:WP5:pBB:features}
\end{table}
-
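+To make the Branch-and-Bound operators summarized
+in~\cref{tab:WP5:pBB:features} concrete, a minimal serial Python sketch is
+given below (an illustration only, not pBB itself, which is a massively
+parallel MPI+X / Chapel implementation):
+\begin{verbatim}
+# Generic serial Branch-and-Bound loop (illustration only, not pBB).
+# branch(n): child subproblems; lower_bound(n): bounding operator;
+# evaluate(n): cost if n is a complete solution, None otherwise.
+def branch_and_bound(root, branch, lower_bound, evaluate):
+    best_cost, best_sol = float("inf"), None
+    pool = [root]                           # pool of open subproblems
+    while pool:
+        node = pool.pop()                   # selection (depth-first)
+        if lower_bound(node) >= best_cost:  # pruning
+            continue
+        cost = evaluate(node)
+        if cost is not None:                # complete solution reached
+            if cost < best_cost:
+                best_cost, best_sol = cost, node
+        else:
+            pool.extend(branch(node))       # branching
+    return best_sol, best_cost
+\end{verbatim}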
\subsection{Parallel Capabilities}
\label{sec:WP5:pBB:performances}
\begin{itemize}
- \item pBB includes several parallel implementations at different parallel levels: multi-core, GPU and multi-GPU, (distributed) cluster. For the MPI+X approach, several parallel environments are used including MPI, OpenMP and PThreads, Cuda and HIP.
+ \item pBB includes several parallel implementations at different parallel levels: multi-core, GPU and multi-GPU, (distributed) cluster. For the MPI+X approach, several parallel environments are used including MPI, OpenMP and PThreads, CUDA and HIP. For the PGAS approach, all parallel levels are managed using the Chapel programming language.
\item pBB has been used to solve optimization problems on Grid'5000, Meluxina (Luxembourg) and LUMI (Finland) EuroHPC supercomputers.
%\item
\item \textbf{Scalability:} pBB is scalable at the intra-node level (multi-core) as well as inter-node level (distributed). However, its scalability w.r.t GPUs has to be improved.
@@ -79,44 +83,56 @@ \subsection{Parallel Capabilities}
\subsection{Initial Performance Metrics}
\label{sec:WP5:pBB:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP5. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+This section provides a summary of initial performance benchmarks performed in the context of WP5. %It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+ \item \textbf{Overall Performance:}
+ pBB has brought significant advances in the field of combinatorial optimization, specifically targeting makespan minimization in permutation Flowshop Scheduling, a notoriously hard problem. Leveraging up to 384 NVIDIA V100 GPUs, pBB made it possible to solve 11 previously unsolved benchmark instances from Taillard's 1993 benchmarks. By utilizing the computational power of peta-scale high-performance computing platforms, the study demonstrates how parallel search techniques can efficiently traverse highly irregular search trees on distributed systems.
+ On large-scale CPU-based systems, pBB also demonstrates significant performance, as 50\% strong-scaling efficiency is achieved using 400 compute nodes (51,200 CPU cores).
+ \item \textbf{Input/Output Dataset:} %Provide a detailed description of the dataset used for the benchmark, including:
\begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
+ \item Optimization problems to be solved are defined by an objective function ${f: \mathcal{X} \rightarrow \mathbb{R}}$ to be minimized, where $\mathcal{X}$ is the search space incorporating a set of constraints and $\hat{x} \in \arg\min_{x \in \mathcal{X}} f(x)$ denotes the optimum.
+ \item The output is mainly the solutions found (i.e., vectors of integers) and their quality, along with some execution statistics (e.g., size of the explored tree).
+ \item Our main dataset consists of the well-known permutation Flowshop Scheduling instances introduced in: Taillard, E., Benchmarks for basic scheduling problems, \textit{European Journal of Operational Research}, 64, 2, 278-285, 1993. \url{https://doi.org/10.1016/0377-2217(93)90182-M}. Instances are generated using the generator described in the reference.
+ %\item DOI or permanent link for accessing the dataset.
\end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{open-data Access:} Links are provided in the previous section.
+ \item \textbf{Challenges:}
+ The main challenges of the benchmarking process are twofold:
+ \begin{itemize}
+ \item Scalability of parallel optimization algorithms: analyzing and improving the speedups obtained with the various parallel models that can be designed.
+ \item Characteristics of the target optimization problem: studying the problem characteristics (e.g., dimension, cost of the objective function) that affect the performance of the algorithms.
+ \end{itemize}
+ \item \textbf{Future Improvements:}
+ The future improvements will concern the following aspects:
+ \begin{itemize}
+ \item Fault-tolerance aspects need to be investigated further.
+ \item In addition to standard benchmarks of optimization problems, it will be interesting to consider real-life complex applications.
+ \end{itemize}
\end{itemize}
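+
+For reference, the scalability figures quoted above rely on the standard definitions of relative speed-up and strong scaling efficiency (these are not specific to pBB): given a reference execution on $p_0$ processing units with wall-clock time $T(p_0)$, the speed-up and efficiency on $p$ units are
+\begin{equation*}
+  S(p) = \frac{T(p_0)}{T(p)}, \qquad
+  E(p) = \frac{p_0\,S(p)}{p} = \frac{p_0\,T(p_0)}{p\,T(p)},
+\end{equation*}
+so that a strong scaling efficiency of 50\% on 400 compute nodes means that the run is about $0.5 \times 400/p_0$ times faster than the reference run on $p_0$ nodes.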
\subsubsection{Benchmark \#1}
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+ \item \textbf{Description:} Our main benchmark consists of the well-known permutation Flowshop Scheduling instances introduced in Taillard, E., Benchmarks for basic scheduling problems, \textit{European Journal of Operational Research}, 64, 2, 278-285, 1993. \url{https://doi.org/10.1016/0377-2217(93)90182-M}. It includes many instance sizes, ranging from 5 machines $\times$ 20 jobs to 20 machines $\times$ 500 jobs. This benchmark is mainly used for testing performance scalability.
+ \item \textbf{Benchmarking Tools Used:} The execution time and speed-up metrics are mainly used in our experiments.
\item \textbf{Input/Output Dataset Description:}
\begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+ \item \textbf{Input Data:} Instances are generated using the generator described in the above reference.
+ \item \textbf{Output Data:} The output data contain statistics to check correctness (e.g., size of the explored tree, optimum found) as well as the total execution time of the algorithm.
+ %\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
\end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+ %\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+ %\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP5:pBB:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+In this section, we describe the roadmap for improving benchmarks and addressing the challenges identified. %This should include:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Data Improvements:} No major improvements to input/output data management are required: all datasets are accessible and reproducibility is ensured through open-data initiatives.
+ %\item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
+ \item \textbf{Results Retention:} Benchmark results are maintained in publicly accessible repositories with appropriate metadata and documentation, ensuring long-term usability. All obtained results are open access and published in HAL, in conferences and in journals.
\end{itemize}
In~\cref{tab:WP5:pBB:bottlenecks}, we briefly discuss the bottleneck roadmap associated with the software and relevant to the work package.
@@ -138,9 +154,9 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Provide validation quantities for each benchmark. \\
+%\rowcolor{white} B6 - Data Management & provide short description here \\
+\rowcolor{white} B7 - Exascale Algorithms & Fault-tolerance aspects will be investigated further. \\
\end{tabular}
}
}
diff --git a/software/pbb/pbb.tex b/software/pbb/pbb.tex
index 23ecd52..904bc19 100644
--- a/software/pbb/pbb.tex
+++ b/software/pbb/pbb.tex
@@ -21,13 +21,16 @@ \section{Software: pBB}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU or GPU\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://gitlab.inria.fr/jgmys/permutationbb}{https://gitlab.inria.fr/jgmys/permutationbb} \\
+ \rowcolor{white}\textbf{Repository} &
+ \begin{tabular}{l}
+ \href{https://gitlab.inria.fr/jgmys/permutationbb}{https://gitlab.inria.fr/jgmys/permutationbb}\\
+ \href{https://github.com/Guillaume-Helbecque/P3D-DFS}{https://github.com/Guillaume-Helbecque/P3D-DFS} \\
+ \end{tabular} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS: Cecill-*\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
B10 - Scientific Productivity\\
-Productivity is first related to the different parallel levels of supercomputers including the inter-node, intra-node and GPU levels. In pBB, productivity is ensured using the PGAS-based design approach. The Chapel language is used for the implementation of PGAS. In this latter, the different parallel levels are unified. Second, pBB provides a data structure (P3D-DFS) that can be used in a productive way for different optimization problems.
B11 - Reproducibility and Replicability of Computation\\
%B6 - Data Management\\
B7 - Exascale Algorithms\\
@@ -40,18 +43,26 @@ \section{Software: pBB}
\subsection{Software summary}
\label{sec:pBB:summary}
-%Detailed overview not available.
-pBB is initially an implementation of a massively parallel Branch\&Bound algorithm for the exact resolution of permutation-based optimization problems, like Permutation Flow-shop Scheduling (see https://gitlab.inria.fr/jgmys/permutationbb). pBB is designed using the bare-metal MPI+X approach. First, pBB has been extended to improve its genericity w.r.t optimization problems than can be solved, going beyond the permutation ones. A new data structure named distBag-DFS is proposed for that purpose. In addition, a PGAS-guided design approach is used to improve its software productivity-awareness (see https://github.com/Guillaume-Helbecque). The Chapel language is used for the implementation of pBB meeting these genericity and productivity objectives.
+pBB is initially an implementation of a massively parallel Branch-and-Bound (B\&B) algorithm for the exact solution of permutation-based optimization problems, like permutation Flowshop Scheduling (see \url{https://gitlab.inria.fr/jgmys/permutationbb}). pBB is designed using the bare-metal MPI+X approach.
+First, pBB has been extended to improve its genericity w.r.t.\ the optimization problems that can be solved, going beyond permutation ones (e.g., Knapsack problems). A new data structure named distBag-DFS is proposed for that purpose.
+In addition, a PGAS-guided design approach is used to improve its software productivity-awareness (see \url{https://github.com/Guillaume-Helbecque/P3D-DFS}). The Chapel language is used for this implementation of pBB, meeting these genericity and productivity objectives.
\subsection{Purpose}
\label{sec:pBB:purpose}
-Purpose not available.
+
+The purpose of pBB is to provide an efficient, scalable, and general framework for the exact solution of optimization problems using B\&B. Three main properties characterize pBB:
+\begin{itemize}
+ \item \textbf{Generalization}: our goal is to build a framework which is applicable to various optimization problems (e.g., Flow-shop Scheduling, Knapsack problems).
+
+ \item \textbf{Massively parallel}: a transparent and efficient parallel implementation of the algorithms on various architectures (e.g., multicore CPUs, GPUs, clusters) is carried out. The main challenge is the parallelization of the tree-search component of the framework; many parallel tree-search algorithms can be considered (a schematic view of this tree search is sketched after this list).
+
+ \item \textbf{Productivity}: pBB enhances productivity by unifying inter-node, intra-node, and GPU levels through a PGAS-based design approach. This simplifies the development and maintenance of parallel algorithms, allowing developers to focus on high-level design while ensuring efficient execution across diverse architectures.
+\end{itemize}
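+
+To fix ideas, the following is a minimal, purely illustrative sketch of the serial depth-first tree search underlying B\&B; it does not reflect pBB's actual API or data structures, and the \texttt{Node}, \texttt{branch}, \texttt{evaluate} and \texttt{is\_leaf} names are hypothetical. In pBB, the pool of nodes is distributed (e.g., through the distBag-DFS structure in the Chapel version) and explored by many workers with work stealing, but the prune/branch/update cycle below remains the core of the method.
+
+\begin{listing}[ht]
+\begin{minted}[
+    linenos, % Line numbers
+    fontsize=\scriptsize, % Reduce font size
+    bgcolor=bgcolor, % Slightly gray background
+    frame=lines, % Delimiters around the code
+    framesep=2mm, % Space between code and frame
+    rulecolor=\color{gray}, % Color of the frame
+    breaklines % Allow line breaks in long lines
+  ]{cpp}
+#include <algorithm>
+#include <deque>
+#include <functional>
+#include <limits>
+#include <utility>
+#include <vector>
+
+// Hypothetical node type: a partial solution (e.g., a prefix of a permutation).
+struct Node {
+    std::vector<int> prefix;   // decisions taken so far
+    double lower_bound = 0.0;  // bound computed when the node is created
+};
+
+// Generic serial depth-first B&B loop (illustrative only, not pBB's API).
+// - branch(node)   : children of a node
+// - evaluate(node) : cost of a complete solution
+// - is_leaf(node)  : true if the node is a complete solution
+double branch_and_bound(Node root,
+                        const std::function<std::vector<Node>(const Node&)>& branch,
+                        const std::function<double(const Node&)>& evaluate,
+                        const std::function<bool(const Node&)>& is_leaf)
+{
+    double best = std::numeric_limits<double>::infinity(); // incumbent
+    std::deque<Node> pool{root};                            // DFS pool (LIFO)
+
+    while (!pool.empty()) {
+        Node node = pool.back();
+        pool.pop_back();
+
+        if (node.lower_bound >= best) continue;    // prune
+
+        if (is_leaf(node)) {
+            best = std::min(best, evaluate(node)); // update incumbent
+        } else {
+            for (auto& child : branch(node)) {     // expand
+                if (child.lower_bound < best) pool.push_back(std::move(child));
+            }
+        }
+    }
+    return best;
+}
+\end{minted}
+\end{listing}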
\subsection{Programming and Computational Environment}
\label{sec::pBB:environment_capabilities}
-
The following table summarizes these aspects for pBB, providing a view of its programming and computational capabilities.
\begin{table}[h!]
@@ -66,34 +77,34 @@ \subsection{Programming and Computational Environment}
\rowcolor{white}Languages & \begin{tabular}{l}
C++\\
Chapel\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+\end{tabular} & pBB was first implemented in C++ and then extended with a productivity-aware implementation based on the PGAS Chapel language. \\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
Chapel\\
CUDA and HIP\\
MPI\\
OpenMP and PThreads\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+\end{tabular} & The C++ version uses OpenMP and PThreads for shared-memory parallelism, MPI for message-passing, and CUDA and HIP for GPU computing. The Chapel version unifies all these parallel levels within one programming language. \\
\rowcolor{white}Data Formats & \begin{tabular}{l}
None\\
-\end{tabular} & Data formats that the software can handle or produce.\\
+\end{tabular} & None \\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
Checkpoint restart\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
+\end{tabular} & pBB employs a checkpoint-and-restart mechanism for fault tolerance and recovery.\\
\rowcolor{white}DevOps & \begin{tabular}{l}
None\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+\end{tabular} & None \\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
None\\
-\end{tabular} & Software packaging and distribution.\\
+\end{tabular} & None \\
\rowcolor{white}Testing & \begin{tabular}{l}
None\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+\end{tabular} & None \\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
+\end{tabular} & None \\
\rowcolor{white}Interfaces & \begin{tabular}{l}
None\\
-\end{tabular} & List of software pBB has interfaces with.\\
+\end{tabular} & None \\
\bottomrule
\end{tabular}
}}
@@ -102,24 +113,23 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:pBB:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+Combinatorial Optimization Problems (COP) consist in finding an object within a finite (or countably infinite) set which is optimal according to a given criterion. Formally, a COP can be defined as a couple $(X, f)$, where $X$ is the search space and $f:X\rightarrow \mathbb{R}$ the objective function to be minimized or maximized. Constraints that must be fulfilled by a feasible solution $x\in X$ can be incorporated in the definition of the search space $X$ or the objective function $f$. The objective function $f$ takes its values in a totally ordered set, usually the set of real numbers or integers. The value $f(x)$ measures the cost (e.g., quality, time, benefit) of solution $x\in X$. The goal is to find one or multiple solution(s) $x^*\in X$ that are feasible and satisfy $f(x^*)\leq f(x), \forall x\in X$ in the case of minimization, or $f(x^*)\geq f(x), \forall x\in X$ in the case of maximization.
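+
+As a concrete example, consider the permutation Flowshop Scheduling Problem targeted by the Taillard instances used as benchmarks (stated here in its standard form): $n$ jobs must be processed in the same order on $m$ machines, with processing time $p_{i,j}$ for job $i$ on machine $j$. A solution is a permutation $\pi$ of the jobs, and the completion times obey
+\begin{equation*}
+  C_{\pi(1),1} = p_{\pi(1),1}, \qquad
+  C_{\pi(i),1} = C_{\pi(i-1),1} + p_{\pi(i),1}, \qquad
+  C_{\pi(1),j} = C_{\pi(1),j-1} + p_{\pi(1),j},
+\end{equation*}
+\begin{equation*}
+  C_{\pi(i),j} = \max\bigl(C_{\pi(i-1),j},\, C_{\pi(i),j-1}\bigr) + p_{\pi(i),j}, \qquad 2 \leq i \leq n, \; 2 \leq j \leq m,
+\end{equation*}
+so that $X$ is the set of the $n!$ permutations and the objective to minimize is the makespan $f(\pi) = C_{\max}(\pi) = C_{\pi(n),m}$.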
\subsection{Relevant Publications}
\label{sec:pBB:publications}
Here is a list of relevant publications related to the software:
-Guillaume Helbecque, Tiago Carneiro, Nouredine Melab, Jan Gmys, Pascal Bouvry. PGAS Data Structure for Unbalanced Tree-Based Algorithms at Scale. Computational Science – ICCS 2024. 2024. https://doi.org/10.1007/978-3-031-63759-9\_13
+\begin{itemize}
+ \item \fullcite{gmys_exactly_2022} This publication presents significant advancements in the field of combinatorial optimization, specifically targeting makespan minimization in permutation Flowshop Scheduling, a notoriously hard problem. Leveraging a GPU-accelerated B\&B approach, pBB enabled the solution of 11 previously unsolved benchmark instances from Taillard's 1993 benchmarks. By utilizing the computational power of peta-scale high-performance computing platforms, the study demonstrates how parallel search techniques can efficiently traverse highly irregular search trees on distributed systems, providing key insights for optimization researchers focused on leveraging GPUs and multicore processors for large-scale problem-solving.
-Helbecque G, Gmys J, Melab N, Carneiro T, Bouvry P. Parallel distributed productivity-aware tree-search using Chapel. Concurrency Computat Pract Exper. 2023; 35(27):e7874. https://doi.org/10.1002/cpe.7874
+ \item \fullcite{franco_pgas_2024} This publication focuses on advancing the design of parallel algorithms for modern supercomputers by introducing a PGAS-based data structure and a Work-Stealing mechanism. These innovations target unbalanced tree exploration using depth-first search, a common challenge in parallel tree-based algorithms. Implemented in Chapel, the proposed solution demonstrates promising scalability in single-node backtracking experiments using the Unbalanced Tree Search benchmark. Additionally, large-scale experiments using B\&B for Flowshop Scheduling reveal significant strong scaling efficiency on 400 compute nodes (51,200 CPU cores), making it relevant for researchers interested in optimizing parallel workloads on distributed systems.
+
+ \item \fullcite{helbecque_parallel_2023} This publication explores the design and implementation of pBB, using the PGAS Chapel programming language, particularly suited for exascale computing. It exploits the PGAS data structure of the previous publication. Experimental results, using up to 4,096 CPU cores, compare Chapel's implementation to OpenMP (intra-node) and MPI+X (inter-node) counterparts, highlighting competitive performance in both shared and distributed memory settings. The study underscores Chapel as a viable alternative to traditional parallel programming models for exascale-aware applications.
+\end{itemize}
\subsection{Acknowledgements}
\label{sec::pBB:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
-
-Acknowledgements not available.
-
-
+The software has been developed with the support of the following funding agencies and institutions: Université de Lille, Inria Lille, ANR, and FNR.
\ No newline at end of file
diff --git a/software/samourai/WP1/WP1.tex b/software/samourai/WP1/WP1.tex
deleted file mode 100644
index 6fb178a..0000000
--- a/software/samourai/WP1/WP1.tex
+++ /dev/null
@@ -1,46 +0,0 @@
-\section{Software: Samourai}
-\label{sec:WP1:Samourai:software}
-
-\begin{itemize}
- \item \textbf{Contact Email(s):} loic.gouarin@polytechnique.edu
- \item \textbf{Supported Architecture(s):} CPU
- \item \textbf{Repository Link:} \href{https://github.com/hpc-maths/samurai}{https://github.com/hpc-maths/samurai}
-\end{itemize}
-
-\subsection{Software Overview}
-\label{sec:WP1:Samourai:summary}
-
-Provide a brief overview of the software with respect to WP1.
-
-\subsection{Parallel Capabilities}
-\label{sec:WP1:Samourai:performances}
-
-
-\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries and middleware in the Exa-MA framework.
-\end{itemize}
-
-\subsection{Initial Performance Metrics}
-\label{sec:WP1:Samourai:metrics}
-
-In this section, provide a summary of the initial performance metrics of the software with respect to WP1.
-You can list one or more benchmarks and their associated results, the challenges , bottlenecks and the expectations for future versions of the software.
-
-
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item describe the benchmark
- \item \textbf{Benchmarking Tools Used:} Describe the tools used for parallel benchmarking and the metrics mesured
- \item \textbf{Results Summary:} Results summary not available.
- \item \textbf{Challenges Identified:} No challenges identified.
-\end{itemize}
-
-\subsection{12 months roadmap}
-\label{sec:WP1:Samourai:roadmap}
-
-Describe the planned improvements for the software in the context of WP1 and Exa-MA for the year to come that will be adressed in the next version of this deliverable.
\ No newline at end of file
diff --git a/software/samourai/samourai.tex b/software/samourai/samourai.tex
deleted file mode 100644
index b22fd9b..0000000
--- a/software/samourai/samourai.tex
+++ /dev/null
@@ -1,37 +0,0 @@
-\section{Software: Samourai}
-\label{sec:Samourai:software}
-
-
-
-\begin{itemize}
- \item \textbf{Contact Email(s):} loic.gouarin@polytechnique.edu
- \item \textbf{Supported Architecture(s):} CPU
- \item \textbf{Repository Link:} \href{https://github.com/hpc-maths/samurai}{https://github.com/hpc-maths/samurai}
-\end{itemize}
-
-\subsection{Software summary}
-\label{sec:Samourai:summary}
-Detailed overview not available.
-
-
-
-\subsection{Purpose}
-\label{sec:Samourai:purpose}
-Purpose not available.
-
-
-
-\subsection{Mathematics}
-\label{sec:Samourai:mathematics}
-Mathematics not available.
-
-
-\subsection{Relevant Publications}
-\label{sec:Samourai:publications}
-
-\subsection{Acknowledgements}
-\label{sec::Samourai:acknowledgements}
-
-Acknowledgements not available.
-
-
diff --git a/software/samurai/WP1/WP1.tex b/software/samurai/WP1/WP1.tex
index bf2175a..a4365af 100644
--- a/software/samurai/WP1/WP1.tex
+++ b/software/samurai/WP1/WP1.tex
@@ -41,21 +41,32 @@ \section{Software: Samurai}
\subsection{Software Overview}
\label{sec:WP1:Samurai:summary}
-In~\cref{tab:WP1:Samurai:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+samurai aims to provide an adaptive mesh library for flexible numerical simulations that makes it easy to test new methods. The interval structure coupled with a set algebra allows meshes to be manipulated efficiently, making inter- and intra-grid computation kernels easier to write.
+
+Using this structure to store Cartesian meshes, samurai makes it possible to implement a range of spatial schemes such as finite volume, lattice Boltzmann, finite difference and discontinuous Galerkin methods. The aim is to be able to easily test new numerical methods on adaptive meshes in a way that is transparent to the user: the user focuses on solving the problem and samurai takes care of managing the mesh.
+
+A third layer is currently being added to address various specific fields of application we are working on (combustion, two-phase flows, plasma discharge, lithium battery simulation, etc.) with several institutional and industrial partners.
\begin{table}[h!]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} mesh adaptation & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+ \rowcolor{white} mesh adaptation & AMR with heuristic criteria and multiresolution based on wavelets \\
+ \rowcolor{numpexlightergray} load balancing & Space filling curve (Hilbert or Morton) or diffusion algorithm \\
+ \rowcolor{white} sparse Cartesian mesh & A new data structure based on intervals and an algebra of sets \\
+ \rowcolor{numpexlightergray} grid operators & Provide several operators to make prediction or projection of a field \\
+ \rowcolor{white} numerical schemes & Provide numerical schemes such as finite volume schemes \\
+
+
\end{tabular}
}
}
@@ -67,81 +78,74 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP1:Samurai:performances}
+samurai uses MPI and OpenMP parallelism. Container abstractions are used to connect different tensor libraries such as xtensor or Eigen. This preliminary work will also make it easy to plug in the Kokkos library.
+When we use mesh adaptation methods, we do so in a dynamic context: in other words, the mesh evolves over time. There are therefore two metrics to take into account if we want an effective, scalable solution:
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+\item What is the cost of mesh adaptation compared with calculating the refined solution everywhere?
+\item How can we ensure that the load always remains well balanced between the processes?
\end{itemize}
+samurai offers two types of load balancing: the best known is the use of a space-filling curve (Hilbert and Morton are implemented); the other is a diffusion algorithm. The most complicated aspect here is adapting these approaches to the interval data structure. This is work in progress, but it is important if we want to achieve good scalability for the target applications.
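+
+As an illustration of the space-filling-curve approach, the following is a generic sketch (not samurai's actual implementation) of the 2D Morton (Z-order) index, obtained by interleaving the bits of the integer cell coordinates; cells sorted along this index can then be split into contiguous, balanced chunks, one per process.
+
+\begin{listing}[ht]
+\begin{minted}[
+    linenos, % Line numbers
+    fontsize=\scriptsize, % Reduce font size
+    bgcolor=bgcolor, % Slightly gray background
+    frame=lines, % Delimiters around the code
+    framesep=2mm, % Space between code and frame
+    rulecolor=\color{gray}, % Color of the frame
+    breaklines % Allow line breaks in long lines
+  ]{cpp}
+#include <cstdint>
+
+// Generic illustration, not samurai's implementation.
+// Spread the lower 16 bits of x so that they occupy the even bit positions.
+static std::uint32_t part1by1(std::uint32_t x)
+{
+    x &= 0x0000FFFFu;
+    x = (x | (x << 8)) & 0x00FF00FFu;
+    x = (x | (x << 4)) & 0x0F0F0F0Fu;
+    x = (x | (x << 2)) & 0x33333333u;
+    x = (x | (x << 1)) & 0x55555555u;
+    return x;
+}
+
+// 2D Morton (Z-order) index of the cell with integer coordinates (i, j).
+std::uint32_t morton2d(std::uint32_t i, std::uint32_t j)
+{
+    return part1by1(i) | (part1by1(j) << 1);
+}
+\end{minted}
+\end{listing}
+
+Sorting the leaves of the adapted mesh along such a curve preserves spatial locality, so contiguous chunks of the curve form reasonably compact subdomains; as stated above, the difficulty is to apply this directly to the interval data structure rather than to individual cells.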
\subsection{Initial Performance Metrics}
\label{sec:WP1:Samurai:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP1. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+The benchmarks in this WP study the overall performance of samurai in terms of the cost of mesh adaptation compared with the calculation of the refined solution everywhere, and also in terms of the calculation times associated with the numerical schemes. We first propose to carry out a comparative study with equivalent open source software such as \href{https://github.com/AMReX-Codes/amrex}{AMReX}, \href{https://github.com/vanreeslab/murphy}{Murphy}, \href{https://github.com/Dyablo-HPC/Dyablo}{Dyablo}, \href{https://github.com/paralab/Dendro-5.01}{Dendro}. A second study will focus on two practical applications.
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
+\subsubsection{Benchmark \#1: AMR software performance comparison}
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+\paragraph{Description}
+
+There are a number of open source software packages that offer adaptive mesh refinement methods. However, it is difficult to find a benchmark for testing their effectiveness on simple problems. We therefore propose to carry out a comparative study between samurai and a list of software that will be finalized when the benchmark is set up. This will be made public via a GitHub repository so that anyone can re-launch the study.
+
+The aim will be to compare a range of metrics: memory footprint of the mesh, ease of writing computation kernels, sequential computation time, parallel computation time, etc. on various simple problems.
+
+\paragraph{Benchmarking Tools Used}
+To evaluate the performance of the different test cases, we will use the TAU tool to measure the execution time and memory usage of the software.
+
+\paragraph{Input/Output Dataset Description}
+The input dataset will be a simple list of test cases which can be executed by all the chosen software. The output dataset will be a list of metrics (execution time, memory usage, scalability, ...) exported in a JSON-like format and easily represented graphically.
+
+\paragraph{Results Summary}
+This benchmark will allow us to compare the performance of samurai with other software. The results will be made public via a GitHub repository and will be presented using tools such as \href{https://github.com/airspeed-velocity/asv}{airspeed velocity}, as used by \href{https://pv.github.io/scipy-bench/}{scipy}.
+
+\paragraph{Challenges Identified}
+There is currently no benchmark that provides an overview of the performance of adaptive meshing software. The establishment of this benchmark should provide a better understanding of the impact of the data structure used (patch-based, cell-based or interval-based) depending on the use cases, and provide simple test cases for all selected software that can be easily enriched by the community.
+
+
+\subsubsection{Benchmark \#2: Plasma discharge simulation}
+
+The first application benchmark we are working on is related to the simulation of plasma discharges, with and without magnetic field, including the description of sheaths at the boundaries through a fluid model (Euler--Poisson system of PDEs in 2D and 3D). Such simulations are very hard to conduct in multiple dimensions due to the multi-scale character of the physics (small Debye length, small electron mass ratio, temperature ratios) and require carefully designed numerical schemes (asymptotic preserving with respect to the various small parameters) with high stability properties: IMEX schemes with the cost of explicit schemes, developed in the PhD thesis of L. Reboul within the samurai code. Such schemes allow fine mesh adaptation in the neighborhood of the boundaries where the sheath is present, while allowing large cells in the electroneutral zone; they are of paramount importance to conduct efficient fluid simulations, making them competitive in terms of computational time with respect to PIC methods, without the noise the latter involve. The main objective of this benchmark is to demonstrate that, without robust numerical schemes and multiresolution as the adaptation method, such simulations are impossible or much harder to perform in 2D and 3D using classical adaptive mesh methods. The second objective is to demonstrate that fluid simulation can be competitive with PIC methods in terms of computational time.
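+
+For reference, a generic electrostatic Euler--Poisson model of the type alluded to above reads, for each charged species $s$ with number density $n_s$, velocity $u_s$, pressure $p_s$, charge $q_s$ and mass $m_s$ (this is only an illustrative form, not necessarily the exact model used in this benchmark):
+\begin{equation*}
+  \partial_t n_s + \nabla\cdot(n_s u_s) = 0, \qquad
+  \partial_t (n_s u_s) + \nabla\cdot(n_s u_s \otimes u_s) + \frac{1}{m_s}\nabla p_s = \frac{q_s}{m_s}\, n_s E,
+\end{equation*}
+\begin{equation*}
+  E = -\nabla\phi, \qquad -\varepsilon_0\,\Delta\phi = \sum_s q_s n_s,
+\end{equation*}
+where the Debye length (hidden in the Poisson equation) and the electron-to-ion mass ratio are the small parameters that make the system stiff and motivate asymptotic-preserving IMEX schemes.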
+
+\subsubsection{Benchmark \#3: Simulation of the hydrogen risk}
+
+The second benchmark we are currently setting up is the simulation of the hydrogen risk, namely the direct numerical simulation of a hydrogen flame with deflagration-to-detonation transition, an old problem in the theory and simulation of combustion, with detailed transport and complex chemistry in the compressible Navier-Stokes equations. We aim at comparing our simulation tool, samurai, with the existing software AMReX, where error control on the solution cannot be guaranteed. A new numerical strategy based on a mixed operator splitting / IMEX scheme has been designed in order to reach the computational efficiency of full operator splitting techniques for simple chemistry (\cite{duarte_adaptive_nodate}, \cite{lecointre_hydrogen_nodate}), while allowing optimal parallel capabilities. The simulation configuration is a 2D and then 3D channel with obstacles, with potentially a different mesh level for the density and velocity fields compared to the temperature and species mesh, and with a verification on a series of cases that have been obtained with other codes that do not have distributed parallel capability (Dryads). The project is conducted in close collaboration with CEA and ONERA.
\subsection{12-Month Roadmap}
\label{sec:WP1:Samurai:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
-In~\cref{tab:WP1:Samurai:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+The contribution of the new data structure based on intervals and set algebra means that mesh adaptation methods can be approached differently from traditional methods. However, there is still work to be done if samurai is to become a reference software package for mesh refinement methods. The studies carried out as part of these benchmarks should confirm that the data structure compresses the mesh efficiently (no more cells than necessary) while providing optimal vectorization performance thanks to its memory contiguity. It will also be necessary to ensure that the load balancing methods used are efficient and scalable.
\begin{table}[h!]
\centering
-
-
+
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
+
+\rowcolor{white} B10 - Scientific Productivity & Confirm the efficiency of the interval-based data structure. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Provide a relevant and reproducible study of AMR software performance. \\
+\rowcolor{white} B6 - Data Management & Offer a comparison format that can be easily represented graphically and freely accessible. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Allow to select different load balancing algorithms and test their efficiency. \\
\end{tabular}
}
}
diff --git a/software/samurai/samurai.tex b/software/samurai/samurai.tex
index c8d4a24..f8f02d8 100644
--- a/software/samurai/samurai.tex
+++ b/software/samurai/samurai.tex
@@ -42,13 +42,75 @@ \section{Software: Samurai}
\subsection{Software summary}
\label{sec:Samurai:summary}
-Detailed overview not available.
+samurai is an open source software package written in modern C++ (C++17 and soon C++20), enabling the representation of sparse Cartesian meshes with different levels of resolution in a compressed way, using an interval representation. Resolution refers to Cartesian cells of the same size. A set algebra is provided to handle the operations between these meshes: intersections, unions, differences and translations.
+It is then possible to attach scalar and vector fields to these meshes and perform operations on these fields. Access operators facilitate field manipulation according to resolution levels and coordinates.
+This data structure can then be used to implement spatial and temporal schemes. \cref{fig:Samurai:architecture} shows the different layers involved in samurai; some of them are currently being implemented.
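+
+As a purely illustrative example of the interval representation (the actual encoding in samurai is richer): if, on a given row of a given level, the active cells have $x$-indices $\{-2,-1,0,1,6,7,8\}$, they are stored as the two intervals $[-2,2[\,\cup\,[6,9[$ rather than as seven individual cells, and the set algebra acts directly on such unions of intervals, e.g.
+\begin{equation*}
+  \bigl([-2,2[\,\cup\,[6,9[\bigr) \cap [0,7[ \;=\; [0,2[\,\cup\,[6,7[ .
+\end{equation*}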
+
+\begin{figure}[h!]
+ \centering
+ \includegraphics[width=0.8\textwidth]{graphics/samurai/samurai.png}
+ \caption{Samurai architecture}
+ \label{fig:Samurai:architecture}
+\end{figure}
+
+\href{https://github.com/hpc-maths/ponio}{ponio} is an open source software package developed by the HPC@Maths team at CMAP (\'Ecole polytechnique). The aim of ponio is to provide a set of time integration schemes for solving a whole collection of ODEs and PDEs. The simplest is the combination of an operator separation strategy and a method of lines involving various classical time integrators such as Runge-Kutta methods or optimized ones (RADAU5, ROCK4), as well as operator splitting methods and IMEX schemes; the long-term objective is also to be able to tackle innovative adaptive code coupling techniques through an interface, as well as classes of time-space coupled schemes (Lax-Wendroff, OSMP, time-space coupled IMEX with good asymptotic-preserving and stability properties...).
+
+The design principles of samurai are the following:
+\begin{enumerate}
+ \item Compress the mesh according to the level-wise spatial connectivity along each Cartesian axis.
+ \item Achieve fast look-up of a cell in the structure, especially of its parents and neighbors. This is particularly useful when utilizing numerical schemes such as Finite Volumes, \emph{etc.}, on the hybrid mesh.
+ \item Maximize the memory contiguity of the stored data to allow for caching and vectorization (contrarily to the $z$-curve).
+ \item Facilitate inter-level operations which are common in many numerical techniques (\emph{e.g.} multiresolution).
+ \item Allow for a time evolution of the hybrid mesh (\emph{via} AMR or multiresolution) efficiently.
+ \item Give the possibility of writing numerical schemes in a transparent way, as if one were on a uniform mesh.
+\end{enumerate}
+
+To give an overview of the compression capabilities of samurai, \cref{tab:Samurai:compression} shows the number of cells and the memory needed to represent the Cartesian mesh defined by the simple-2d example found in the p4est library (\cite{burstedde_p4est_2011}) and illustrated in~\cref{fig:Samurai:simple2d}.
+
+\begin{figure}[h!]
+ \centering
+ \includegraphics[width=0.8\textwidth]{graphics/samurai/p4est_3.png}
+ \caption{simple-2d test from p4est library}
+ \label{fig:Samurai:simple2d}
+\end{figure}
+
+\begin{table}[h!]
+ \centering
+ {
+ \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {
+ \fontsize{9}{11}\selectfont
+ \begin{tabular}{llllll}
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Level} & {\rule{0pt}{2.5ex}\color{white}\bf Num. of cells } &{\rule{0pt}{2.5ex}\color{white}\bf p4est} & {\rule{0pt}{2.5ex}\color{white}\bf samurai (leaves) } &{\rule{0pt}{2.5ex}\color{white}\bf samurai (all)} & {\rule{0pt}{2.5ex}\color{white}\bf ratio }\\
+
+ \rowcolor{white} 9 & 66379 & 2.57 Mb & 33.68 Kb & 121 Kb & 21.24 \\
+ \rowcolor{numpexlightergray} 10 & 263767 & 10.25 Mb & 66.64 Kb & 236.8 Kb & 43.28 \\
+ \rowcolor{white} 11 & 1051747 & 40.96 Mb & 132.36 Kb & 467.24 Kb & 87.66 \\
+ \rowcolor{numpexlightergray} 12 & 4200559 & 163.75 Mb & 263.6 Kb & 927 Kb & 176.64 \\
+ \rowcolor{white} 13 & 16789627 & 654.86 Mb & 525.9 Kb & 1.85 Mb & 353.98 \\
+ \rowcolor{numpexlightergray} 14 & 67133575 & 2.61 Gb & 1.05 Mb & 3.68 Mb & 709.24 \\
+\end{tabular}
+ }
+ }
+ \caption{WP1: Compression rate between samurai and p4est meshes}
+ \label{tab:Samurai:compression}
+\end{table}
\subsection{Purpose}
\label{sec:Samurai:purpose}
-Purpose not available.
+
+Based on this new data structure, samurai's objective is to be able to easily describe AMR mesh adaptation methods with a heuristic refinement criterion, or multiresolution methods based on a wavelet basis decomposition (\cite{cohen_fully_2003}). Multiresolution, although more complicated to implement, offers greater robustness than AMR methods, since the refinement criterion is based solely on the calculation of a detail derived from the wavelet decomposition. It therefore provides finer control over the error between the fully refined solution and the adapted solution, whatever the physical problem studied. Most available software packages are based on AMR methods (AMReX \cite{zhang_amrex_2021}, Dyablo \cite{delorme_novel_nodate} and others \cite{dubey_survey_2014-1}). Only a few of them are based on multiresolution methods (Murphy \cite{gillis_murphy---scalable_2022}, Wabbit \cite{krah_wavelet_2022}), and they use a cell-based structure.
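+
+Schematically, in standard multiresolution notation (following \cite{cohen_fully_2003} rather than samurai-specific conventions), the detail attached to a cell is the difference between its actual value and the value predicted from the next coarser level, and a cell is kept in the adapted mesh only when this detail is significant:
+\begin{equation*}
+  d_{\ell,k} = u_{\ell,k} - \hat{u}_{\ell,k}, \qquad
+  \text{cell } (\ell,k) \text{ is kept if } |d_{\ell,k}| \geq \epsilon_\ell = 2^{d(\ell - L)}\,\epsilon,
+\end{equation*}
+where $\hat{u}_{\ell,k}$ is the prediction computed from level $\ell-1$, $d$ is the space dimension, $L$ the finest level and $\epsilon$ a user-defined threshold.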
+
+Block-based AMR methods have good memory contiguity thanks to their patch-based hierarchical data structure. This makes it possible to exploit the vectorization capabilities of modern processors. However, to be effective, the patches need to be large enough, so more cells may be refined than strictly necessary.
+
+Cell-based AMR methods lose this memory contiguity, as the mesh is now flattened. A tree-like data structure is therefore required. To restore good arithmetic intensity, it is customary to place cell blocks in the tree leaves. Here again, we refine more than is necessary.
+
+In comparison, the samurai data structure maintains memory contiguity in one direction by using intervals, as block-based AMR methods do. This ensures that the vectorization capabilities of modern processors can still be exploited. Moreover, the data structure allows refinement only where necessary: no more cells are refined than needed, while good arithmetic intensity is maintained. samurai therefore combines the advantages of the two previous data structures.
\subsection{Programming and Computational Environment}
\label{sec::Samurai:environment_capabilities}
@@ -66,39 +128,39 @@ \subsection{Programming and Computational Environment}
\begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
\rowcolor{white}Languages & \begin{tabular}{l}
-C++\\
-C++14\\
C++17\\
-\end{tabular} & Programming languages and language standards supported by the software \\
+\end{tabular} & samurai is developed in modern C++. The source code is written in C++17 but some new implementations will use concepts and ranges provided by C++20. \\
\rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
MPI\\
Multithread\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
+\end{tabular} & samurai is parallelized using MPI and OpenMP. Kokkos will also be used in future versions to provide GPU support.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
HDF5\\
-\end{tabular} & Data formats that the software can handle or produce.\\
+\end{tabular} & Output is in HDF5 format using the open source software HighFive. Adios2 is another solution being considered for future versions.\\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
+\end{tabular} & There are no fault tolerance and recovery mechanisms used by samurai.\\
\rowcolor{white}DevOps & \begin{tabular}{l}
Continuous Delivery\\
Continuous Integration\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
+\end{tabular} & samurai uses GitHub Actions to perform automatic tasks: documentation build, CI, new releases, etc. \\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
Other\\
-\end{tabular} & Software packaging and distribution.\\
+\end{tabular} & samurai is available on conda and conan.\\
\rowcolor{white}Testing & \begin{tabular}{l}
Functional\\
Unit\\
Validation\\
Verification\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
+\end{tabular} & samurai has unit tests and regression tests. All the examples provided by samurai in its demos directory are verified at every change of the code.\\
\rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
+\end{tabular} & No container technologies are used to package and deploy the software. We use conda-forge to build the conda package.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
PETSc\\
-\end{tabular} & List of software Samurai has interfaces with.\\
+xtensor\\
+Eigen\\
+\end{tabular} & xtensor and Eigen are used to store the unknowns of the equations and to perform lazy evaluation. Matrix assembly and linear system solves are performed using PETSc.\\
\bottomrule
\end{tabular}
}}
@@ -109,25 +171,161 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:Samurai:mathematics}
-Mathematics not available.
-In this section, provide a summary the mathematics used in the software.
+samurai provides a set of operators for working with Cartesian grids of varying resolutions. Two distinct categories of operators can be identified:
+\begin{itemize}
+\item Prediction operators are employed to calculate the value of a field on a fine grid based on the value of a field on a coarse grid.
+\item Projection operators are used to determine the value of a field on a coarse grid by using the value of a field on a fine grid.
+\end{itemize}
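+
+In one dimension and for a cell-average (finite volume) representation, these operators take the following classical form (given here to illustrate the principle rather than samurai's exact formulas): the projection is the mean of the two children,
+\begin{equation*}
+  u_{\ell,k} = \tfrac{1}{2}\bigl(u_{\ell+1,2k} + u_{\ell+1,2k+1}\bigr),
+\end{equation*}
+while the first-order prediction adds a correction built from the coarse neighbors,
+\begin{equation*}
+  \hat{u}_{\ell+1,2k} = u_{\ell,k} + \tfrac{1}{8}\bigl(u_{\ell,k-1} - u_{\ell,k+1}\bigr), \qquad
+  \hat{u}_{\ell+1,2k+1} = u_{\ell,k} - \tfrac{1}{8}\bigl(u_{\ell,k-1} - u_{\ell,k+1}\bigr),
+\end{equation*}
+where level $\ell+1$ is finer than level $\ell$ and $\hat{u}$ denotes predicted values; higher-order predictions simply use wider coarse stencils.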
-\subsection{Relevant Publications}
-\label{sec:Samurai:publications}
+The aforementioned operators are currently defined for control volumes up to order 11. The size of the associated stencils is automatically accounted for by samurai. In the near future, the same operators will be defined for finite difference and discontinuous Galerkin methods.
-Here is a list of relevant publications related to the software:
+It is thus possible to reconstruct the solution at any desired resolution level. This enables the solution to be recovered at the finest level, as well as calculations to be performed involving two different AMR meshes. To illustrate this, consider a case where one mesh is used to solve the Navier-Stokes equations, while another is employed to simulate the propagation of a pollutant via an advection equation whose velocity is the one provided by the Navier-Stokes solution. The first mesh is adapted using the velocity field, whereas the second mesh is adapted using the pollutant concentration (\cite{nguessan_high_2021}).
+In addition to these resolution-level operators, samurai provides an API for defining finite-volume operators that can be used for both explicit and implicit methods. The FVM operators currently available in samurai are
-\subsection{Acknowledgements}
-\label{sec::Samurai:acknowledgements}
+\begin{itemize}
+ \item linear homogeneous operator
+ \item linear heterogeneous operator
+ \item non-linear operator
+\end{itemize}
-The software has been developed with the support of the following funding agencies and institutions:
+In the following, we illustrate the use of samurai with a simple example of a linear homogeneous operator: a scalar Laplacian.
+Since we have
+\begin{equation*}
+\int_V \Delta u = \int_{\partial V} \nabla u\cdot \mathbf{n},
+\end{equation*}
+the flux function to implement is a discrete version of $\nabla u\cdot \mathbf{n}$.
+Here, we choose a first-order approximation of the normal gradient, requiring a stencil of two cells.
+This is enough to write the static configuration:
-Acknowledgements not available.
+\begin{listing}[ht]
+\begin{minted}[
+ linenos, % Line numbers
+ fontsize=\scriptsize, % Reduce font size
+ bgcolor=bgcolor, % Slightly gray background
+ frame=lines, % Delimiters around the code
+ framesep=2mm, % Space between code and frame
+ rulecolor=\color{gray}, % Color of the frame
+ breaklines % Allow line breaks in long lines
+ ]{cpp}
+auto u = samurai::make_field<1>("u", mesh); // scalar field
+using cfg = samurai::FluxConfig; // input_field_type
+\end{minted}
+\end{listing}
+
+Now, denoting by $V_L$ (left) and $V_R$ (right) the stencil cells and $F$ their interface, the discrete flux from $V_L$ to $V_R$ writes
+
+\begin{equation*}
+ \mathcal{F}_h(u_h)_{|F} := \frac{u_R-u_L}{h},
+\end{equation*}
+
+where $u_L$ and $u_R$ are the finite volume approximations of $u$ in the respective cells, and $h$ is the cell length.
+%Referring to formula :eq:`linear_comb`, the coefficients in the linear combination of $(u_L, u_R)$ correspond to $(-1/h, 1/h)$.
+The flux function then writes:
+
+\begin{listing}[ht]
+ \begin{minted}[
+ linenos, % Line numbers
+ fontsize=\scriptsize, % Reduce font size
+ bgcolor=bgcolor, % Slightly gray background
+ frame=lines, % Delimiters around the code
+ framesep=2mm, % Space between code and frame
+ rulecolor=\color{gray}, % Color of the frame
+ breaklines % Allow line breaks in long lines
+ ]{cpp}
+samurai::FluxDefinition gradient([](double h)
+{
+ static constexpr std::size_t L = 0; // left
+ static constexpr std::size_t R = 1; // right
+
+ samurai::FluxStencilCoeffs c;
+ c[L] = -1/h;
+ c[R] = 1/h;
+ return c;
+});
+\end{minted}
+\end{listing}
+
+First of all, remark that we have declared only one flux function for all directions.
+We could have written as many functions as directions:
+they would have been identical, except that we would have replaced the name of the constants
+\verb!L=0, R=1! with \verb!B=0, T=1! (bottom, top) and \verb!B=0, F=1! (back, front) to better reflect the actual direction currently managed.
+The indexes 0 and 1 actually refer to the configured stencil.
+In this case, no particular stencil has been defined, so the default ones are used: in the x-direction of a 3D space,
+it is \verb!{{0,0,0}, {1,0,0}}!, i.e. the current cell at index 0 (which we call \verb!L!) and its right neighbor at index 1 (which we call \verb!R!).
+
+Finally, the operator must be constructed from the flux definition by the instruction
+
+\begin{listing}[ht]
+ \begin{minted}[
+ linenos, % Line numbers
+ fontsize=\scriptsize, % Reduce font size
+ bgcolor=bgcolor, % Slightly gray background
+ frame=lines, % Delimiters around the code
+ framesep=2mm, % Space between code and frame
+ rulecolor=\color{gray}, % Color of the frame
+ breaklines % Allow line breaks in long lines
+ ]{cpp}
+auto laplacian = samurai::make_flux_based_scheme(gradient);
+\end{minted}
+\end{listing}
+
+samurai uses lazy evaluation to obtain concise and readable expressions. The following code snippet shows how to apply the operator to a field in order to solve the heat equation either explicitly (forward Euler) or implicitly (backward Euler time scheme):
+
+\begin{listing}[ht]
+ \begin{minted}[
+ linenos, % Line numbers
+ fontsize=\scriptsize, % Reduce font size
+ bgcolor=bgcolor, % Slightly gray background
+ frame=lines, % Delimiters around the code
+ framesep=2mm, % Space between code and frame
+ rulecolor=\color{gray}, % Color of the frame
+ breaklines % Allow line breaks in long lines
+ ]{cpp}
+auto unp1 = samurai::make_field<1>("unp1", mesh);
+if (explicit_scheme)
+{
+ unp1 = u - dt * laplacian(u);
+}
+else
+{
+ auto back_euler = id + dt * laplacian;
+ samurai::petsc::solve(back_euler, unp1, u); // solves the linear equation [Id + dt*Diff](unp1) = u
+}
+\end{minted}
+\end{listing}
+
+It can be observed that the implicit case is constructed and solved using PETSc.
+
+These operators can then be used in adapted mesh refinement methods.
+
+Other spatial discretization methods will soon be proposed in samurai, such as finite differences and discontinuous Galerkin methods based on the same operator definition approach.
+
+\subsection{Relevant Publications}
+\label{sec:Samurai:publications}
+
+Here is a list of relevant publications related to the software:
+
+\begin{itemize}
+ \item \cite{bellotti_multidimensional_2022}: this article explains how to use the adaptive multiresolution (MR) approach based on wavelets with lattice Boltzmann methods.
+ \item \cite{bellotti_multiresolution-based_2022}: in this article, an error analysis is proposed. To validate it, we conduct a series of test cases for various schemes, for scalar conservation laws and for systems of conservation laws, where solutions exhibit shocks and local mesh adaptation is especially relevant. The theoretical estimates are recovered while a reduced memory footprint is observed.
+\end{itemize}
+
+\subsection{Acknowledgements}
+\label{sec::Samurai:acknowledgements}
+The software has been developed with the support of the following funding agencies and institutions:
+\begin{itemize}
+ \item \'Ecole polytechnique
+ \item CNRS
+ \item CIEDS
+ \end{itemize}
diff --git a/software/scimba/WP2/WP2.tex b/software/scimba/WP2/WP2.tex
index 0c7c6ab..f07ea14 100644
--- a/software/scimba/WP2/WP2.tex
+++ b/software/scimba/WP2/WP2.tex
@@ -4,37 +4,47 @@ \section{Software: Scimba}
\begin{table}[h!]
\centering
{ \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
- \rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-INRIA\\
-UNISTRA\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
-Unistra\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-emmanuel.franck@inria.fr\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
-CPU or GPU\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://gitlab.inria.fr/scimba/scimba}{https://gitlab.inria.fr/scimba/scimba} \\
- \rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-None\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
-B10 - Scientific Productivity\\
-B11 - Reproducibility and Replicability of Computation\\
-B6 - Data Management\\
-B7 - Exascale Algorithms\\
-\end{tabular} \\
- \bottomrule
- \end{tabular}
- }}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+
+ \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
+ \rowcolor{white}\textbf{Consortium}
+ & \begin{tabular}{l}
+ INRIA \\
+ UNISTRA \\
+ \end{tabular} \\
+ \rowcolor{numpexlightergray}\textbf{Exa-MA Partners}
+ & \begin{tabular}{l}
+ Unistra \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Contact Emails}
+ & \begin{tabular}{l}
+ emmanuel.franck@inria.fr \\
+ victor.michel-dansac@inria.fr \\
+ \end{tabular} \\
+ \rowcolor{numpexlightergray}\textbf{Supported Architectures}
+ & \begin{tabular}{l}
+ CPU or GPU \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Repository}
+ & \href{https://gitlab.inria.fr/scimba/scimba}{https://gitlab.inria.fr/scimba/scimba} \\
+ \rowcolor{numpexlightergray}\textbf{License}
+ & \begin{tabular}{l}
+ MIT \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Bottlenecks roadmap}
+ & \begin{tabular}{l}
+ B10 - Scientific Productivity \\
+ B11 - Reproducibility and Replicability of Computation \\
+ B6 - Data Management \\
+ B7 - Exascale Algorithms \\
+ \end{tabular} \\
+ \bottomrule
+ \end{tabular}
+ }}
\caption{WP2: Scimba Information}
\end{table}
@@ -45,19 +55,28 @@ \subsection{Software Overview}
\begin{table}[h!]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} NN/Autoencoder & provide short description here \\
-\rowcolor{numpexlightergray} PINN & provide short description here \\
-\end{tabular}
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Short Description } \\
+ \rowcolor{white} PINNs
+ & PINNs: Physics-Informed Neural Networks \\
+ \rowcolor{numpexlightergray} PINOs
+ & PINOs: Physics-Informed Neural Operators \\
+ \rowcolor{white} Neural Galerkin
+ & implementation of the Neural Galerkin method \\
+ \rowcolor{numpexlightergray} ML architectures
+ & implementation of machine learning architectures
+ (MLPs, PointNets, \dots) \\
+ \rowcolor{white} classical methods
+ & ongoing implementation of a fully differentiable
+ discontinuous Galerkin (DG) method on multi-dimensional Cartesian meshes \\
+ \end{tabular}
}
}
\caption{WP2: Scimba Features}
@@ -68,84 +87,110 @@ \subsection{Software Overview}
\subsection{Parallel Capabilities}
\label{sec:WP2:Scimba:performances}
-
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item Parallel programming environment:
+ automatic parallelization provided by \texttt{PyTorch}
+ (e.g. GPU parallelization with CUDA or shared-memory CPU parallelization with OpenMP);
+ a minimal device-selection sketch is given after this list.
+ \item Parallel computation environment: computing servers of IRMA (Unistra), described in Appendix~\ref{sec:app:architectures}.
+ \item This software is designed to be used on a single node at the moment.
+ Parallelization on multiple nodes is on the roadmap.
+ \item \textbf{Scalability:}
+ In theory, the software can be scaled to multiple nodes, but this has not been tested yet.
+ \item \textbf{Integration with Other Systems:}
+ Integration with Feel++ is underway:
+ solution data produced with Feel++ has successfully been
+ used to train neural networks in Scimba.
\end{itemize}
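+As a minimal illustration (not Scimba's API; the layer size and thread count below are placeholder values), selecting the execution device in \texttt{PyTorch} can be sketched as follows:
+\begin{minted}{python}
+import torch
+
+# Use a CUDA GPU when available, otherwise shared-memory CPU threads.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.set_num_threads(8)  # illustrative CPU thread count
+
+model = torch.nn.Linear(16, 1).to(device)
+x = torch.randn(1024, 16, device=device)
+y = model(x)  # the same code runs on the CPU or on a single GPU
+\end{minted}
+Multi-node (multi-GPU) execution, as noted above, is not covered by this single-device pattern and is part of the roadmap.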
-\subsection{Initial Performance Metrics}
-\label{sec:WP2:Scimba:metrics}
-
-This section provides a summary of initial performance benchmarks performed in the context of WP2. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+% \subsection{Initial Performance Metrics}
+% \label{sec:WP2:Scimba:metrics}
+
+% This section provides a summary of initial performance benchmarks performed in the context of WP2. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+
+% \begin{itemize}
+% \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+% \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+% \begin{itemize}
+% \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
+% \item Output dataset format and key results.
+% \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
+% \item DOI or permanent link for accessing the dataset.
+% \end{itemize}
+% \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
+% \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+% \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+% \end{itemize}
+
+% \subsubsection{Benchmark \#1}
+% \begin{itemize}
+% \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
+% \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
+% \item \textbf{Input/Output Dataset Description:}
+% \begin{itemize}
+% \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+% \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+% \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+% \end{itemize}
+% \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+% \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+% \end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP2:Scimba:roadmap}
In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Data Improvements:} As a physics-informed machine learning framework, the input data of Scimba will consist of small \texttt{json} (or \texttt{yaml}, etc.) files containing the parameters of the PDEs to solve and of the neural networks solving them. Output files, in \texttt{PyTorch}'s \texttt{.pth} format, contain the trained neural networks; figures showing approximate solutions are also produced. No input data management is available at the moment, but it is planned for the next 12 months. Output data is saved locally, but not clearly managed yet.
+ \item \textbf{Methodology Application:} No clear benchmark is available yet, but the software is being tested on a variety of problems, from simple ODEs to more complex PDEs; around 50 examples are currently available, though without systematic benchmarking. The roadmap includes the implementation of a benchmark suite, with a variety of problems and datasets, to test the software's performance and scalability.
+ \item \textbf{Results Retention:} The roadmap includes the implementation of a data management system, with versioning and metadata, to ensure reproducibility and long-term usability. Since the code is available on GitLab, versioning of the software itself already allows replicating the results.
+ \item \textbf{Performance Results:} At the moment, the framework is parallelized on CPUs (shared-memory) and single GPU nodes, thanks to \texttt{PyTorch}'s intrinsic parallelization capabilities.
+ The roadmap includes multi-GPU support and performance optimization.
+ Namely, we will measure performance (using e.g. elapsed time) and mathematical accuracy (using e.g. the $L^2$ error between the approximate solution and a reference one); a minimal sketch of such an error computation is given after this list.
\end{itemize}
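+A minimal sketch of such an accuracy metric (the relative $L^2$ error between an approximate solution and a reference one, both assumed to be sampled at the same points; the data below is synthetic and purely illustrative) could be:
+\begin{minted}{python}
+import numpy as np
+
+def relative_l2_error(u_approx, u_ref):
+    # Relative L2 error between approximate and reference solutions
+    # sampled at the same points.
+    u_approx, u_ref = np.asarray(u_approx), np.asarray(u_ref)
+    return np.linalg.norm(u_approx - u_ref) / np.linalg.norm(u_ref)
+
+# Illustrative usage with synthetic data.
+x = np.linspace(0.0, 1.0, 101)
+u_ref = np.sin(np.pi * x)
+u_approx = u_ref + 1e-3 * np.random.randn(x.size)
+print(relative_l2_error(u_approx, u_ref))
+\end{minted}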
In~\cref{tab:WP2:Scimba:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
\begin{table}[h!]
\centering
-
-
-
- \centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
-\end{tabular}
+
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Short Description } \\
+
+ \rowcolor{white}
+ B10 - Scientific Productivity
+ & At the moment, each example needs to be run manually.
+ The roadmap includes a way to automatically
+ run relevant examples, based on input data.
+ Future developments will include continuous benchmarking,
+ in the form of a CI/CD pipeline.
+ Some unit tests are already available,
+ but their generalization is part of the roadmap. \\
+ \rowcolor{numpexlightergray}
+ B11 - Reproducibility and Replicability of Computation
+ & At the moment, input data is hard-coded in examples.
+ The roadmap includes a way to manage input data
+ (e.g. in \texttt{json} files)
+ and ensure reproducibility. \\
+ \rowcolor{white}
+ B6 - Data Management
+ & Data is saved locally, but not managed.
+ The roadmap includes a data management system,
+ with versioning and metadata. \\
+ \rowcolor{numpexlightergray}
+ B7 - Exascale Algorithms
+ & The roadmap includes multi-GPU support
+ and performance optimization. \\
+ \end{tabular}
}
}
\caption{WP2: Scimba plan with Respect to Relevant Bottlenecks}
\label{tab:WP2:Scimba:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/scimba/scimba.tex b/software/scimba/scimba.tex
index c7dfc5a..55867c8 100644
--- a/software/scimba/scimba.tex
+++ b/software/scimba/scimba.tex
@@ -6,121 +6,205 @@ \section{Software: Scimba}
\begin{table}[h!]
\centering
{ \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field} & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
- \rowcolor{white}\textbf{Consortium} & \begin{tabular}{l}
-INRIA\\
-UNISTRA\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Exa-MA Partners} & \begin{tabular}{l}
-Unistra\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-emmanuel.franck@inria.fr\\
-\end{tabular} \\
- \rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
-CPU or GPU\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://gitlab.inria.fr/scimba/scimba}{https://gitlab.inria.fr/scimba/scimba} \\
- \rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-None\\
-\end{tabular} \\
- \rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
-B10 - Scientific Productivity\\
-B11 - Reproducibility and Replicability of Computation\\
-B6 - Data Management\\
-B7 - Exascale Algorithms\\
-\end{tabular} \\
- \bottomrule
- \end{tabular}
- }}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{!{\color{numpexgray}\vrule}p{.4\textwidth}!{\color{numpexgray}\vrule}p{.6\textwidth}!{\color{numpexgray}\vrule}}
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Field}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Details} \\
+ \rowcolor{white}\textbf{Consortium}
+ & \begin{tabular}{l}
+ INRIA \\
+ UNISTRA \\
+ \end{tabular} \\
+ \rowcolor{numpexlightergray}\textbf{Exa-MA Partners}
+ & \begin{tabular}{l}
+ Unistra \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Contact Emails}
+ & \begin{tabular}{l}
+ emmanuel.franck@inria.fr \\
+ victor.michel-dansac@inria.fr \\
+ \end{tabular} \\
+ \rowcolor{numpexlightergray}\textbf{Supported Architectures}
+ & \begin{tabular}{l}
+ CPU or GPU \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Repository}
+ & \href{https://gitlab.inria.fr/scimba/scimba}{https://gitlab.inria.fr/scimba/scimba} \\
+ \rowcolor{numpexlightergray}\textbf{License}
+ & \begin{tabular}{l}
+ MIT \\
+ \end{tabular} \\
+ \rowcolor{white}\textbf{Bottlenecks roadmap}
+ & \begin{tabular}{l}
+ B10 - Scientific Productivity \\
+ B11 - Reproducibility and Replicability of Computation \\
+ B6 - Data Management \\
+ B7 - Exascale Algorithms \\
+ \end{tabular} \\
+ \bottomrule
+ \end{tabular}
+ }}
\caption{Scimba Information}
\end{table}
\subsection{Software summary}
\label{sec:Scimba:summary}
-Detailed overview not available.
-
+Scimba\footnote{\url{https://gitlab.inria.fr/scimba/scimba}} is a software package that provides a framework for solving partial differential equations using physics-informed learning. It is mainly developed by Inria researchers (E. Franck and V. Michel-Dansac), with support from CNRS and the University of Strasbourg.
\subsection{Purpose}
\label{sec:Scimba:purpose}
-Purpose not available.
+
+The purpose of Scimba is to provide a readily-available
+library for physics-informed machine learning.
+It has two main objectives:
+\begin{itemize}
+ \item to be used as a tool for solving PDEs
+ with differentiable classical or ML-based methods,
+ before exploiting their numerical solutions
+ and differentiable properties;
+ \item to provide an easy framework in which users
+ may develop new differentiable numerical methods
+ (ML-based or classical).
+\end{itemize}
+As such, it aims at being easy to install and use,
+while providing existing ML-based and classical methods
+(see Section~\ref{sec:WP2:Scimba:summary}).
+It also aims at providing good performance,
+using the intrinsic parallelizability of
+ML-based methods and libraries.
\subsection{Programming and Computational Environment}
\label{sec::Scimba:environment_capabilities}
-
The following table summarizes these aspects for Scimba, providing a view of its programming and computational capabilities.
\begin{table}[h!]
\centering
{
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {\fontsize{9}{11}\selectfont
- \begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category} & {\rule{0pt}{2.5ex}\color{white}\bf Details} & {\rule{0pt}{2.5ex}\color{white}\bf Description}\\
- \rowcolor{white}Languages & \begin{tabular}{l}
-Python\\
-\end{tabular} & Programming languages and language standards supported by the software \\
- \rowcolor{numpexlightergray}Parallelism & \begin{tabular}{l}
-GPU\\
-\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
- \rowcolor{white}Data Formats & \begin{tabular}{l}
-None\\
-\end{tabular} & Data formats that the software can handle or produce.\\
- \rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
-None\\
-\end{tabular} & Fault tolerance and recovery mechanisms employed by the software.\\
- \rowcolor{white}DevOps & \begin{tabular}{l}
-None\\
-\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
- \rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-None\\
-\end{tabular} & Software packaging and distribution.\\
- \rowcolor{white}Testing & \begin{tabular}{l}
-None\\
-\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
- \rowcolor{numpexlightergray}Containerization & \begin{tabular}{l}
-None\\
-\end{tabular} & Container technologies used to package and deploy the software.\\
- \rowcolor{white}Interfaces & \begin{tabular}{l}
-pytorch\\
-\end{tabular} & List of software Scimba has interfaces with.\\
- \bottomrule
- \end{tabular}
- }}
+ \setlength{\parindent}{0pt}
+ \def\arraystretch{1.25}
+ \arrayrulecolor{numpexgray}
+ {\fontsize{9}{11}\selectfont
+ \begin{tabular}{lp{.3\textwidth}p{.5\textwidth}}
+ \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Category}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Details}
+ & {\rule{0pt}{2.5ex}\color{white}\bf Description} \\
+ \rowcolor{white}Languages
+ & \begin{tabular}{l}
+ Python \\
+ \end{tabular}
+ & Scimba is written in \texttt{Python}, using the \texttt{pytorch} library for ML and differentiable computing. \\
+ \rowcolor{numpexlightergray}Parallelism
+ & \begin{tabular}{l}
+ GPU, CPU (shared) \\
+ \end{tabular}
+ & Scimba takes advantage of the intrinsic parallelization capabilities of \texttt{pytorch} (shared-memory CPU and GPU, by default). \\
+ \rowcolor{white}Data Formats
+ & input: none;
+ output: \texttt{.pth} \& image files
+ & Scimba outputs \texttt{.pth} files containing neural network weights, as well as image files representing the solutions of the equations. \\
+ \rowcolor{numpexlightergray}Resilience
+ & \begin{tabular}{l}
+ None \\
+ \end{tabular}
+ & None \\
+ \rowcolor{white}DevOps
+ & \begin{tabular}{l}
+ None \\
+ \end{tabular}
+ & None \\
+ \rowcolor{numpexlightergray}Packaging
+ & \begin{tabular}{l}
+ pip, GitLab \\
+ \end{tabular}
+ & Scimba is packaged on pip (\texttt{pip install scimba}) and available on GitLab (\url{https://gitlab.inria.fr/scimba/scimba}). \\
+ \rowcolor{white}Testing
+ & \begin{tabular}{l}
+ pytest \\
+ \end{tabular}
+ & Tests are provided for core functionalities, available by running \texttt{pytest}. \\
+ \rowcolor{numpexlightergray}Containerization
+ & \begin{tabular}{l}
+ None \\
+ \end{tabular}
+ & None \\
+ \rowcolor{white}Interfaces
+ & \begin{tabular}{l}
+ \texttt{Feel++} \\
+ \end{tabular}
+ & Scimba interfaces with Feel++: solution data produced with Feel++ has successfully been
+ used to train neural networks in Scimba. \\
+ \bottomrule
+ \end{tabular}
+ }}
\caption{Scimba programming and computational environment}
\end{table}
-
\subsection{Mathematics}
\label{sec:Scimba:mathematics}
-Mathematics not available.
-
-In this section, provide a summary the mathematics used in the software.
-
-
-\subsection{Relevant Publications}
-\label{sec:Scimba:publications}
-
-Here is a list of relevant publications related to the software:
+% Mathematics not available.
+
+% In this section, provide a summary the mathematics used in the software.
+
+This software mainly uses physics-informed
+learning~\cite{karniadakis_physics-informed_2021}
+to solve partial differential equations.
+More specifically, it solves partial differential equations of the form
+\begin{equation*}
+ \begin{cases}
+ \mathcal{D}(u, x, t; \mu) = 0,
+ & \text{in } \Omega \times (0,T) \times \mathbb{M}, \\
+ \mathcal{B}(u, x, t; \mu) = 0,
+ & \text{on } \partial \Omega \times (0,T) \times \mathbb{M}, \\
+ u(x, 0; \mu) = u_0(x; \mu),
+ & \text{on } \Omega \times \mathbb{M}, \\
+ \end{cases}
+\end{equation*}
+with $\mathcal{D}$ a differential operator, $\mathcal{B}$ a boundary condition operator,
+$u_0$ an initial condition, $\Omega$ the spatial domain,
+$T$ the final time, and $\mathbb{M}$ the parameter space.
+The solution $u$ is approximated with a variety of physics-informed neural networks,
+which are trained to minimize the residuals of the PDE,
+of the boundary conditions and of the initial condition.
+An example of a physics-informed loss function is:
+\begin{equation*}
+ \begin{aligned}
+ \mathcal{L} & =
+ \int_\Omega \int_0^T \int_{\mathbb{M}}
+ ||\mathcal{D}(u, x, t; \mu)||^2 \,
+ \mathrm{d}\mu \, \mathrm{d}t \, \mathrm{d}x \\
+ & +
+ \int_{\partial\Omega} \int_0^T \int_{\mathbb{M}}
+ ||\mathcal{B}(u, x, t; \mu)||^2 \,
+ \mathrm{d}\mu \, \mathrm{d}t \, \mathrm{d}x
+ +
+ \int_{\Omega} \int_{\mathbb{M}}
+ ||u(x, 0; \mu) - u_0(x;\mu)||^2 \,
+ \mathrm{d}\mu \, \mathrm{d}x.
+ \end{aligned}
+\end{equation*}
+Other data-driven methods
+(e.g. DeepONets~\cite{lu_learning_2021})
+or numerical methods
+(e.g. Neural Galerkin~\cite{bruna_neural_2024})
+are also available in Scimba.
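+As an illustration only (this is not Scimba's API; the network, the sampling of collocation points and the toy problem $-u''(x) = f(x)$ on $(0,1)$ with homogeneous Dirichlet conditions are assumptions made for the example), a minimal \texttt{PyTorch} sketch of such a physics-informed residual loss could read:
+\begin{minted}{python}
+import math
+import torch
+
+# Hypothetical fully-connected network approximating u(x).
+model = torch.nn.Sequential(
+    torch.nn.Linear(1, 32), torch.nn.Tanh(),
+    torch.nn.Linear(32, 32), torch.nn.Tanh(),
+    torch.nn.Linear(32, 1),
+)
+
+def source(x):
+    # Source term of the toy PDE -u''(x) = f(x), with u(0) = u(1) = 0.
+    return math.pi ** 2 * torch.sin(math.pi * x)
+
+def pinn_loss(n_colloc=128):
+    # Interior residual D(u, x) at random collocation points.
+    x = torch.rand(n_colloc, 1, requires_grad=True)
+    u = model(x)
+    du = torch.autograd.grad(u, x, torch.ones_like(u), create_graph=True)[0]
+    d2u = torch.autograd.grad(du, x, torch.ones_like(du), create_graph=True)[0]
+    residual = -d2u - source(x)
+    # Boundary residual B(u, x) at x = 0 and x = 1.
+    boundary = model(torch.tensor([[0.0], [1.0]]))
+    return (residual ** 2).mean() + (boundary ** 2).mean()
+
+optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
+for _ in range(1000):
+    optimizer.zero_grad()
+    loss = pinn_loss()
+    loss.backward()
+    optimizer.step()
+\end{minted}
+The general parametric form above additionally involves the time $t$ and the parameters $\mu$ as network inputs; the sketch omits them for brevity.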
+
+% \subsection{Relevant Publications}
+% \label{sec:Scimba:publications}
+
+% Here is a list of relevant publications related to the software:
\subsection{Acknowledgements}
\label{sec::Scimba:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-
-Acknowledgements not available.
-
+Scimba has mainly been developed by Inria research scientists.
+Other contributors include students and researchers
+from the University of Strasbourg and CEA Cadarache.
+For the moment, no specific funding has been dedicated to Scimba.
+Exa-MA has supported the internships of Marie Sengler (Strasbourg) and Daria Hrebenshchykova (Sophia-Antipolis), who both worked with Scimba.
diff --git a/software/trust-platform/WP3/WP3.tex b/software/trust-platform/WP3/WP3.tex
index c174a63..cf0ffe6 100644
--- a/software/trust-platform/WP3/WP3.tex
+++ b/software/trust-platform/WP3/WP3.tex
@@ -57,9 +57,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} direct solver & provide short description here \\
-\rowcolor{numpexlightergray} krylov solver & provide short description here \\
-\rowcolor{white} multiphysics coupling & provide short description here \\
+\rowcolor{white} direct solver & MUMPS on CPU and STRUMPACK on GPU are used in TRUST \\
+\rowcolor{numpexlightergray} krylov solver & PETSc on CPU, AmgX or PETSc on Nvidia GPU, rocALUTION or PETSc on AMD GPU are used in TRUST \\
+\rowcolor{white} multiphysics coupling & TRUST is intended to be coupled with other software (a structural solver, for instance) through the ICoCo standard (\href{https://github.com/cea-trust-platform/icoco-coupling}{https://github.com/cea-trust-platform/icoco-coupling}), as a reference component whose performance is to be preserved in the partitioned coupling \\
\end{tabular}
}
}
@@ -73,55 +73,49 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item The parallel programming model is MPI and a mix of Kokkos and OpenMP-target directives for GPU computing
+ \item The parallel computation environments are personal computers, clusters, and supercomputers, e.g. Adastra (CINES), Jean-Zay (IDRIS), Topaze (CCRT)
+ \item The code can run on several cores of one or several CPU nodes. A GPU version of the code (limited to some physical modules) may run on one or several GPUs, on one or several nodes.
+ \item \textbf{Scalability:} In a weak scaling test, the parallel efficiency on CPU nodes (128 AMD Rome cores/node) declines from $\sim$90\% on 4 nodes and $\sim$80\% on 32 nodes to $\sim$60\% on 256 nodes. The parallel efficiency on GPU nodes (4 Nvidia V100 GPUs/node) declines faster in the same weak scaling test, from $\sim$60\% on 4 nodes and $\sim$35\% on 32 nodes to $\sim$20\% on 256 nodes. A minimal sketch of how such efficiencies are computed from elapsed times is given after this list.
+ \item \textbf{Integration with Other Systems:} TRUST integrates with the linear algebra libraries developed and improved in the Exa-MA framework, especially those interfaced through PETSc. It thus serves for the practical evaluation of the overall performance gain provided by the project in representative physical cases of interest.
\end{itemize}
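+For reference, the weak-scaling efficiencies quoted above are the ratio of the elapsed time at the reference node count to the elapsed time at $N$ nodes, the work per node being kept constant. A minimal sketch, with purely illustrative timings (not actual TRUST measurements):
+\begin{minted}{python}
+# Weak scaling: the problem size grows with the node count, so the ideal
+# elapsed time is constant and the efficiency is t_ref / t_n.
+def weak_scaling_efficiency(timings, ref_nodes):
+    t_ref = timings[ref_nodes]
+    return {nodes: t_ref / t for nodes, t in timings.items()}
+
+# Illustrative elapsed times in seconds (not actual TRUST measurements).
+timings = {1: 100.0, 4: 111.0, 32: 125.0, 256: 167.0}
+print(weak_scaling_efficiency(timings, ref_nodes=1))
+# {1: 1.0, 4: ~0.90, 32: 0.80, 256: ~0.60}
+\end{minted}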
\subsection{Initial Performance Metrics}
\label{sec:WP3:TRUST Platform:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP3. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
+Establishing reference metrics for TRUST to serve as basic data for the methodology proposed in Chapter~\ref{chap:methodology} is an ongoing process that will continue during the upcoming year. However, some solid elements are available concerning the relevant tests to be implemented, emphasizing the relation between TRUST and the linear algebra libraries supported in WP3, and PETSc in particular:
\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item \textbf{Weak scaling test for GPU: } using a Krylov solver with an algebraic multigrid preconditioner (e.g. Hypre or AmgX) through PETSc on 2 to 1024 GPUs
+ \item \textbf{Weak scaling test for CPU: } using a Krylov solver with an algebraic multigrid preconditioner (e.g. Hypre) through PETSc on 8 to 32768 CPUs
+ \item \textbf{Strong scaling for CPU: } using the MUMPS direct solver through PETSc on 1 to 128 cores
\end{itemize}
-\subsubsection{Benchmark \#1}
+Current observations indicate that:
+
\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+ \item \textbf{Weak scaling test for GPU: } the scalability can be improved, in particular with:
+ \begin{itemize}
+ \item the fine-tuning of the already tested libraries, or the choice and integration of new libraries, with a particular focus on the convergence rate,
+ \item the improvement of the robustness of the MPI communications between GPUs.
+ \end{itemize}
+ \item\textbf{Weak scaling test for CPU: } the scalability already reaches satisfactory levels, but some improvement can yet be expected from the advanced management of MPI communications with a focus on collective communications.
+ \item \textbf{Strong scaling for CPU: } these tests are of great interest for component applications (i.e. complex physics, smaller models) that can be involved in multiphysics and partitioned coupling.
\end{itemize}
+Tests for performance evaluation of the hybrid CPU-GPU run modes will be considered in a second step.
+
\subsection{12-Month Roadmap}
\label{sec:WP3:TRUST Platform:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
+The 12-month roadmap for the TRUST platform is mainly dedicated to implementing, within the methodology proposed in section \ref{sec:methodology-types}, the tests identified in the previous section, in terms of scalability analysis for CPU or GPU in a first step, and of hybrid CPU-GPU benchmarking in a second step (post-12-month perspective).\\
+\\
+Practically:
\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
+ \item \textbf{Data availability:} Data for the selected benchmarks will be made available in a format readable with open-source software only, namely from the Salome environment (\href{https://www.salome-platform.org/}{https://www.salome-platform.org/}).
+ \item \textbf{Methodology Application:} Implementation of the benchmarking methodology in terms of scalability measurements, with a particular focus on the selection of the relevant output data and associated tolerance for the reproducibility guarantee.
+ \item \textbf{Results Retention:} Benchmark results will be stored and made available with the suitable level of metadata and documentation through a dedicated repository proposed by the Exa-MA project.
\end{itemize}
In~\cref{tab:WP3:TRUST Platform:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -142,13 +136,13 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & provide short description here \\
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & provide short description here \\
-\rowcolor{white} B6 - Data Management & provide short description here \\
-\rowcolor{numpexlightergray} B7 - Exascale Algorithms & provide short description here \\
+\rowcolor{white} B10 - Scientific Productivity & Accelerate access to large-scale numerical results to enhance knowledge and speed up engineering operations in the field of computational fluid mechanics. \\
+\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation & Ensure the reliability of the results for large-scale and complex simulations, independently of the computer and the selected run mode (CPU, GPU, hybrid). \\
+\rowcolor{white} B6 - Data Management & This bottleneck is indirectly addressed for TRUST through the availability of the benchmark datasets and the collection of the results. \\
+\rowcolor{numpexlightergray} B7 - Exascale Algorithms & Benefit from the latest improvements in linear algebra libraries, quantify the practical gains on representative applications and track/characterize the remaining bottlenecks for the global performance. Use TRUST as a reference component for the analysis of the performance of partitioned coupling at exascale.\\
\end{tabular}
}
}
\caption{WP3: TRUST Platform plan with Respect to Relevant Bottlenecks}
\label{tab:WP3:TRUST Platform:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
diff --git a/software/trust-platform/trust-platform.tex b/software/trust-platform/trust-platform.tex
index 61e2b3f..5d0810b 100644
--- a/software/trust-platform/trust-platform.tex
+++ b/software/trust-platform/trust-platform.tex
@@ -44,13 +44,13 @@ \section{Software: TRUST Platform}
\subsection{Software summary}
\label{sec:TRUST Platform:summary}
-Detailed overview not available.
-
+TRUST is a High Performance Computing (HPC) platform developed by the CEA since 1993. Initially designed for nuclear applications, TRUST has evolved to tackle a range of thermohydraulic challenges, from one-phase to multi-phase flows. It offers various numerical methods and supports different mesh types for efficient computation on diverse computing platforms, including high-performance computers. Recently, efforts have been made to integrate GPU computing libraries such as AmgX, rocALUTION, and Kokkos, aiming for a hybrid CPU/GPU code with better performance portability. The software is open source (BSD license) and available on GitHub.
\subsection{Purpose}
\label{sec:TRUST Platform:purpose}
-Purpose not available.
+
+TRUST can be used as standalone generic simulation software, or serve as a kernel for several independent private projects. As all the base classes and numerical methods are available in TRUST, building an application for a specific domain becomes easier by basing the project on the platform. This is known as Build an Application Linked to TRUST Kernel (BALTIK). Such a specialized application allows, for example, extending the TRUST functionalities to low-scale turbulent multi-phase simulations (the TrioCFD code), component-scale simulations (e.g. the 3D module of the CATHARE code), or even non-nuclear applications, like battery and fuel-cell (PEMFC) simulations (and many other CEA internal codes).
\subsection{Programming and Computational Environment}
\label{sec::TRUST Platform:environment_capabilities}
@@ -109,25 +109,32 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:TRUST Platform:mathematics}
-Mathematics not available.
+Solving a TRUST problem requires selecting a discretization, which allows the code to pass the treated equations from a continuous to a discretized form.
+Four discretizations are available:
+
+\begin{itemize}
+\item \textbf{Finite Volume Difference (VDF) discretization}: It is the simplest and most performant discretization of the TRUST platform. This discretization is compatible with conforming meshes with hexahedral elements. As its name states, VDF is a conservative finite volume scheme of Marker-and-Cell (MAC) type. The discretization of each term of the equation is performed by integrating over a control volume. The diffusion gradient terms are approximated by a linear difference equation. All scalars are stored at the center of each control volume, except the velocity field, which is defined on a staggered mesh. A one-dimensional illustration of this integration principle is given after this list.
-In this section, provide a summary the mathematics used in the software.
+\item \textbf{Finite Element Volume (VEF) discretization}: It is used when the mesh is conforming but with tetrahedral elements (triangles in 2D). This numerical scheme combines finite volumes and finite elements to integrate, in conservative form, all conservation equations over the control volumes of the calculation domain. As in the classical Crouzeix–Raviart element, both vector and scalar quantities are located at the centers of the faces. The pressure, however, is located at the vertices and at the center of gravity of a tetrahedral element (in 3D; triangles in 2D). This discretization leads to very good pressure/velocity coupling and has a very dense divergence-free basis. With this staggered mesh arrangement, the unknowns, i.e. the vector and scalar values, are expressed using non-conforming linear shape functions (P1-nonconforming). The shape function for the pressure is constant at the center of the element (P0) and linear at the vertices (P1).
+\item \textbf{PolyMAC-series discretization}: It is a series of Marker-and-Cell (MAC) schemes that can handle any type of mesh (non-conforming, non-orthogonal, polyhedral, …). The numerical description of these schemes is quite complex and depends on the employed version.
+
+\item \textbf{Finite Element (EF) discretization}: It implements a classical finite element method.
+\end{itemize}
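+As a one-dimensional illustration of the finite volume principle mentioned for VDF (a generic diffusion term, not a specific TRUST equation), integrating $\partial_x(\lambda\,\partial_x T)$ over a control volume $[x_{i-1/2}, x_{i+1/2}]$ of size $\Delta x$ and approximating the gradients by linear differences gives
+\begin{equation*}
+ \int_{x_{i-1/2}}^{x_{i+1/2}} \partial_x\left(\lambda\,\partial_x T\right)\mathrm{d}x
+ = \left(\lambda\,\partial_x T\right)_{i+1/2} - \left(\lambda\,\partial_x T\right)_{i-1/2}
+ \approx \lambda\,\frac{T_{i+1} - T_i}{\Delta x} - \lambda\,\frac{T_i - T_{i-1}}{\Delta x},
+\end{equation*}
+with the scalar unknowns stored at the cell centers, consistently with the MAC arrangement described above.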
\subsection{Relevant Publications}
\label{sec:TRUST Platform:publications}
Here is a list of relevant publications related to the software:
+\begin{itemize}
+ \item \fullcite{calvin_object-oriented_2002}. This is the original paper on the Trio\_U platform, the previous name of TRUST, describing the C++ design choices and the parallel capabilities of the code.
+ \item \fullcite{saikali_highly_2019}. It describes the first TRUST simulation (DNS) using 50K cores, in the context of the safety assessment of systems using hydrogen.
+ \item \fullcite{angeli_wall-resolved_2022}. Large Eddy Simulation of a pressurized thermal shock in a PWR downcomer: a numerical study with TrioCFD, a TRUST-based CFD application.
+\end{itemize}
\subsection{Acknowledgements}
\label{sec::TRUST Platform:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
-
-
-
-
-Acknowledgements not available.
-
+The software has been developed by the CEA (Commissariat à l'énergie atomique et aux énergies alternatives).
diff --git a/software/uranie/WP2/WP2.tex b/software/uranie/WP2/WP2.tex
index a840328..566cf77 100644
--- a/software/uranie/WP2/WP2.tex
+++ b/software/uranie/WP2/WP2.tex
@@ -23,7 +23,7 @@ \section{Software: Uranie}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU Only\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://sourceforge.net/projects/uranie/}{https://sourceforge.net/projects/uranie/} \\
+ \rowcolor{white}\textbf{Repository} & \href{https://uranie.cea.fr}{https://uranie.cea.fr} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS:: LGPL v*\\
\end{tabular} \\
@@ -38,8 +38,14 @@ \section{Software: Uranie}
\subsection{Software Overview}
\label{sec:WP2:Uranie:summary}
-
-In~\cref{tab:WP2:Uranie:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+The Uranie platform is based on ROOT and consequently inherits many ROOT characteristics, such as:
+\begin{itemize}
+ \item interactive C++ interpreter (Cling) based on LLVM and Clang
+ \item Python interface (PyROOT)
+ \item SQL database access
+\end{itemize}
+Uranie is organized in different modules, each devoted to a specific task in the Uncertainty Quantification (UQ) framework.
+The different surrogate modeling techniques available in the Modeler module are listed in Table~\ref{tab:WP2:Uranie:features}.
\begin{table}[h!]
\centering
@@ -53,7 +59,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} NN/Autoencoder & provide short description here \\
+\rowcolor{white} Polynomial chaos expansion & The basic idea of polynomial chaos expansion is that any square-integrable function can be written as $f(x) = \sum_{\alpha} f_{\alpha} \Psi_{\alpha}(x)$, where $\{f_{\alpha}\}$ are the PC coefficients and $\{\Psi_{\alpha}\}$ is the orthogonal polynomial basis. $\alpha$ is a multi-index whose dimension is equal to the dimension of the vector $x$ and whose $L^1$ norm $\lVert \alpha \rVert_1$ is the degree of the resulting polynomial. \\
+\rowcolor{numpexlightergray} Artificial Neural Networks & The artificial neural networks built within Uranie need input from \texttt{OPT++} and can also benefit from the computational power of graphics processing units (GPUs) if available. Their implementation is done through the \texttt{TANNModeler} Uranie class. \\
+\rowcolor{white} Kriging & First developed for geostatistical needs, the kriging method, named after D. Krige and also called Gaussian Process (GP) regression, is another way to construct a surrogate model of a deterministic function. Interesting features of GP regression are that it provides an uncertainty estimate along with its prediction and that it interpolates the training data, which is very useful for surrogate modeling of deterministic functions.
\end{tabular}
}
}
@@ -67,79 +75,16 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
-
-
-\subsection{Initial Performance Metrics}
-\label{sec:WP2:Uranie:metrics}
-
-This section provides a summary of initial performance benchmarks performed in the context of WP2. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
+ \item The parallel programming environment of URANIE leverages MPI, PTHREAD
+ and CUDA to exploit the full potential of parallel computing.
+ On our platform, MPI is used to distribute simulations across different nodes,
+ ensuring efficient scalability and enabling the platform to handle complex, large-scale computations. It is mainly used for launching external codes.
+ CUDA, on the other hand, exploits GPU capabilities in URANIE's artificial neural networks.
+ URANIE also uses LibSSH to launch codes on different clusters (in the TLauncher module).
+
+ \item The parallel computation environment of our platform is built on an HPC architecture designed to maximize computational power and efficiency
+ using both distributed and shared memory parallelism. URANIE is used on CEA/TGCC supercomputers such as IRESNE.
+
+ \item URANIE allows performing simulations in parallel for uncertainty quantification.
+ \item \textbf{Scalability:} Scalability is straightforward because the software distributes independent simulations: if resources are added, they are devoted to running new simulations. A minimal sketch of this pattern is given after this list.
\end{itemize}
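+As an illustration of this pattern only (this is not URANIE's launcher API; \texttt{run\_simulation} and the design of experiments below are placeholders), distributing independent simulations over MPI ranks with \texttt{mpi4py} could be sketched as:
+\begin{minted}{python}
+from mpi4py import MPI
+
+def run_simulation(sample):
+    # Placeholder for launching the external code on one input sample.
+    return sum(sample)
+
+comm = MPI.COMM_WORLD
+rank, size = comm.Get_rank(), comm.Get_size()
+
+# Design of experiments: one input vector per simulation to run.
+samples = [[0.1 * i, 0.2 * i] for i in range(64)]
+
+# Each rank processes its own slice of the samples (round-robin).
+local_results = [run_simulation(s) for s in samples[rank::size]]
+
+# Gather the partial results on rank 0 for post-processing.
+all_results = comm.gather(local_results, root=0)
+if rank == 0:
+    flat = [r for chunk in all_results for r in chunk]
+    print(len(flat), "simulations completed")
+\end{minted}
+In URANIE itself, this distribution and the launching of the external code are handled by the launching facilities (e.g. the TLauncher module) rather than written by hand.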
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
-
-\subsection{12-Month Roadmap}
-\label{sec:WP2:Uranie:roadmap}
-
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
-In~\cref{tab:WP2:Uranie:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
-
-\begin{table}[h!]
- \centering
-
-
-
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {
- \fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} None & provide short description here \\
-\end{tabular}
- }
- }
- \caption{WP2: Uranie plan with Respect to Relevant Bottlenecks}
- \label{tab:WP2:Uranie:bottlenecks}
-\end{table}
\ No newline at end of file
diff --git a/software/uranie/WP5/WP5.tex b/software/uranie/WP5/WP5.tex
index 73e8a72..48193d7 100644
--- a/software/uranie/WP5/WP5.tex
+++ b/software/uranie/WP5/WP5.tex
@@ -23,7 +23,7 @@ \section{Software: Uranie}
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU Only\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://sourceforge.net/projects/uranie/}{https://sourceforge.net/projects/uranie/} \\
+ \rowcolor{white}\textbf{Repository} & \href{https://uranie.cea.fr}{https://uranie.cea.fr} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS:: LGPL v*\\
\end{tabular} \\
@@ -39,7 +39,14 @@ \section{Software: Uranie}
\subsection{Software Overview}
\label{sec:WP5:Uranie:summary}
-In~\cref{tab:WP5:Uranie:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+The Uranie platform is based on ROOT and consequently inherits many ROOT characteristics, such as:
+\begin{itemize}
+ \item interactive C++ interpreter (Cling) based on LLVM and Clang
+ \item Python interface (PyROOT)
+ \item SQL database access
+\end{itemize}
+Uranie is organized in different modules, each devoted to a specific task in the Uncertainty Quantification (UQ) framework.
+The modules devoted to optimization are listed in Table~\ref{tab:WP5:Uranie:features}.
\begin{table}[h!]
\centering
@@ -53,9 +60,9 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} "Iterative methods" & provide short description here \\
-\rowcolor{numpexlightergray} "Metaheuristics" & provide short description here \\
-\rowcolor{white} "robust optimisation" & provide short description here \\
+\rowcolor{white} Optimizer and Reoptimizer modules & The Optimizer and Reoptimizer libraries are dedicated to optimisation and model calibration. Model calibration consists in setting up the degrees of freedom of a model so that future simulations will optimally fit an experimental database. Optimisation is a complex procedure, and several techniques are available to perform single-criterion or multi-criteria optimisation, with and without constraints. \\
+\rowcolor{numpexlightergray} Optimization with surrogate model module & The MetaModelOptim library is dedicated to optimisation techniques coupling the generation of surrogate models (in particular kriging) with evolutionary algorithms to obtain an EGO-like approach. \\
+\rowcolor{white} Calibration module & The Calibration library is a dedicated module used to obtain the best estimates of some of the parameters of a specific model under consideration. It provides different techniques, each relying on its own hypotheses on the model, but all of these methods need data to perform the calibration.
\end{tabular}
}
}
@@ -69,79 +76,18 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+ \item The parallel programming environment of URANIE leverages MPI, PTHREAD
+ and CUDA to exploit the full potential of parallel computing.
+ On our platform, MPI is used to distribute simulations across different nodes,
+ ensuring efficient scalability and enabling the platform to handle complex, large-scale computations. It is mainly used for launching external codes.
+ CUDA, on the other hand, exploits GPU capabilities in URANIE's artificial neural networks.
+ URANIE also uses LibSSH to launch codes on different clusters (in the TLauncher module).
+
+ \item The parallel computation environment of our platform is built on an HPC architecture designed to maximize computational power and efficiency
+ using both distributed and shared memory parallelism. URANIE is used on CEA/TGCC supercomputers such as IRESNE.
+
+ \item URANIE allows performing simulations in parallel for uncertainty quantification.
+ \item \textbf{Scalability:} Scalability is straightforward because the software distributes independent simulations: if resources are added, they are devoted to running new simulations.
\end{itemize}
-\subsection{Initial Performance Metrics}
-\label{sec:WP5:Uranie:metrics}
-
-This section provides a summary of initial performance benchmarks performed in the context of WP5. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
-
-\subsection{12-Month Roadmap}
-\label{sec:WP5:Uranie:roadmap}
-
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
-In~\cref{tab:WP5:Uranie:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
-
-\begin{table}[h!]
- \centering
-
-
-
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {
- \fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} None & provide short description here \\
-\end{tabular}
- }
- }
- \caption{WP5: Uranie plan with Respect to Relevant Bottlenecks}
- \label{tab:WP5:Uranie:bottlenecks}
-\end{table}
\ No newline at end of file
diff --git a/software/uranie/WP6/WP6.tex b/software/uranie/WP6/WP6.tex
index cbc8dbe..54208e5 100644
--- a/software/uranie/WP6/WP6.tex
+++ b/software/uranie/WP6/WP6.tex
@@ -22,13 +22,14 @@ \section{Software: Uranie}
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU Only\\
+GPU
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://sourceforge.net/projects/uranie/}{https://sourceforge.net/projects/uranie/} \\
+ \rowcolor{white}\textbf{Repository} & \href{https://uranie.cea.fr}{https://uranie.cea.fr} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
OSS:: LGPL v*\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
-None\\
+Coupling with parallelized software \\
\end{tabular} \\
\bottomrule
\end{tabular}
@@ -39,7 +40,14 @@ \section{Software: Uranie}
\subsection{Software Overview}
\label{sec:WP6:Uranie:summary}
-In~\cref{tab:WP6:Uranie:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
+The Uranie platform is based on ROOT and consequently inherits many ROOT characteristics, such as:
+\begin{itemize}
+ \item interactive C++ interpreter (Cling) based on LLVM and Clang
+ \item Python interface (PyROOT)
+ \item SQL database access
+\end{itemize}
+Uranie is organized into different modules, each devoted to a specific task in the Uncertainty Quantification (UQ) framework.
+The modules devoted to UQ are listed in~\cref{tab:WP6:Uranie:features}.
\begin{table}[h!]
\centering
@@ -53,9 +61,10 @@ \subsection{Software Overview}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} robust optimisation & provide short description here \\
-\rowcolor{numpexlightergray} sensitivity analysis & provide short description here \\
-\rowcolor{white} uncertainty propagation & provide short description here \\
+\rowcolor{white} DataServer module & The DataServer module is the heart of the Uranie platform, and contains the DataServer class. It contains all the necessary information on the variables in a problem (names, units, probability distributions, data files, etc.) and enables basic statistical operations to be performed. \\
+\rowcolor{numpexlightergray} Sampler module & This module contains sampling algorithms, such as Monte Carlo or Latin Hypercube Sampling. \\
+\rowcolor{white} Launcher \& Relauncher modules & The Launcher and Relauncher modules apply an analytical function (Python or C++), an external simulation code, or any combination of these to the contents of a DataServer. The contents of the DataServer can result from a design of experiments generated using one of the sampling techniques, or can be loaded from an external source (ASCII file, SQL database, etc.). Calculations can easily be launched on a cluster or parallelized using different paradigms: fork, shared memory (threads), distributed memory (MPI), etc. \\
+\rowcolor{numpexlightergray} Sensitivity module & The Sensitivity module is used to perform a sensitivity analysis of one of the output responses with respect to the input factors. Estimators of Sobol' indices and HSIC indices are implemented in this module.
\end{tabular}
}
}
@@ -69,63 +78,29 @@ \subsection{Parallel Capabilities}
\begin{itemize}
- \item describe the parallel programming environment : MPI, OpenMP, CUDA, OpenACC, etc.
- \item describe the parallel computation environment: type of architecture and super computer used.
- \item describe the parallel capabilities of the software
- \item \textbf{Scalability:} Describe the general scalability properties of the software
- \item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
-\end{itemize}
-
-
-\subsection{Initial Performance Metrics}
-\label{sec:WP6:Uranie:metrics}
-
-This section provides a summary of initial performance benchmarks performed in the context of WP6. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
-\begin{itemize}
- \item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
- \item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
- \begin{itemize}
- \item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
- \item Output dataset format and key results.
- \item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
- \item DOI or permanent link for accessing the dataset.
- \end{itemize}
- \item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
- \item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
- \item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-\end{itemize}
-
-\subsubsection{Benchmark \#1}
-\begin{itemize}
- \item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
- \item \textbf{Benchmarking Tools Used:} List the tools used for performance analysis, such as Extrae, Score-P, TAU, Vampir, or Nsight, and specify what metrics were measured (e.g., execution time, FLOPS, energy consumption).
- \item \textbf{Input/Output Dataset Description:}
- \begin{itemize}
- \item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
- \item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
- \item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
- \end{itemize}
- \item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
- \item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+ \item The parallel programming environment of URANIE leverages MPI, PTHREAD
+ and CUDA to exploit the full potential of parallel computing.
+ On our platform, MPI is used to distribute simulations across different nodes,
+ ensuring efficient scalability and enabling the platform to handle complex, large-scale computations. It is mainly used to launch external codes.
+ CUDA, on the other hand, exploits GPU capabilities in URANIE's Artificial Neural Network module.
+ URANIE also uses LibSSH to launch codes on different clusters (via the TLauncher module).
+
+ \item The parallel computation environment of our platform is built on an HPC architecture designed to maximize computational power and efficiency
+ using both distributed- and shared-memory parallelism. URANIE is used on CEA/TGCC supercomputers.
+
+ \item URANIE allows performing simulations in parallel for uncertainty quantification.
+ \item \textbf{Scalability:} Scalability is essentially constant because the software simply distributes independent simulations: if resources are added, they are devoted to running additional simulations.
\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP6:Uranie:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
-\begin{itemize}
- \item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
- \item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
- \item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-\end{itemize}
-
-In~\cref{tab:WP6:Uranie:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
+In the context of exascale computation, Uranie will have to perform uncertainty quantification on more complex simulation software that is heavily parallelized, for instance with MPI or OpenMP, and run on supercomputers. For the specific case of uncertainty propagation, ensemble runs of a simulation code have to be performed, which can be tricky when the code itself is parallelized. Memory storage will also be challenging in the exascale era, and ``on the fly'' handling of the output data generated by the simulation software has to be performed by Uranie. In the next deliverable, an adaptation of the Relauncher module using the ICoCo API (\url{https://github.com/cea-trust-platform/icoco-coupling}) will be proposed and illustrated on an uncertainty propagation task using the TRUST software, with on-the-fly processing of the generated data using the TRUST Python API.
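+
+To illustrate what ``on the fly'' handling of ensemble outputs can look like in practice (independently of the Relauncher/ICoCo adaptation announced above), the hedged sketch below accumulates running statistics with Welford's algorithm so that the full per-run output fields never have to be stored simultaneously; \texttt{run\_simulation} is a placeholder, not a TRUST or Uranie call.
+\begin{minted}{python}
+# Conceptual sketch: streaming (on-the-fly) statistics over an ensemble of runs,
+# so that per-run output fields are processed as they arrive and then discarded.
+import numpy as np
+
+def run_simulation(x):
+    """Placeholder for an external simulation returning a large output field."""
+    rng = np.random.default_rng(int(1e6 * x.sum()) % 2**32)
+    return rng.normal(loc=x.sum(), scale=1.0, size=10_000)
+
+count = 0
+mean = m2 = 0.0  # promoted to arrays on the first update
+
+for x in np.random.default_rng(42).uniform(0.0, 1.0, size=(200, 4)):
+    field = run_simulation(x)
+    count += 1
+    delta = field - mean              # Welford's update, applied field-wise
+    mean = mean + delta / count
+    m2 = m2 + delta * (field - mean)
+
+variance = m2 / (count - 1)
+print("field-wise mean/variance shapes:", np.shape(mean), np.shape(variance))
+\end{minted}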
\begin{table}[h!]
\centering
-
-
+
+
\centering
{
@@ -135,13 +110,14 @@ \subsection{12-Month Roadmap}
{
\fontsize{9}{11}\selectfont
\begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
+
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} None & provide short description here \\
+
+\rowcolor{white} Coupling with parallelized software & Nowadays more and more simulation codes are parallelized, sometimes with shared memory. Moreover, multiphysics simulations imply chained or coupled simulation codes. Uranie has to be capable of performing UQ studies on such simulation software. \\
\end{tabular}
}
}
\caption{WP6: Uranie plan with Respect to Relevant Bottlenecks}
\label{tab:WP6:Uranie:bottlenecks}
-\end{table}
\ No newline at end of file
+\end{table}
+
diff --git a/software/uranie/uranie.tex b/software/uranie/uranie.tex
index 34a7fc6..f30aa86 100644
--- a/software/uranie/uranie.tex
+++ b/software/uranie/uranie.tex
@@ -23,11 +23,11 @@ \section{Software: Uranie}
rudy.chocat@cea.fr\\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
-CPU Only\\
+CPU, GPU\\
\end{tabular} \\
- \rowcolor{white}\textbf{Repository} & \href{https://sourceforge.net/projects/uranie/}{https://sourceforge.net/projects/uranie/} \\
+ \rowcolor{white}\textbf{Repository} & \href{https://uranie.cea.fr}{https://uranie.cea.fr} \\
\rowcolor{numpexlightergray}\textbf{License} & \begin{tabular}{l}
-OSS:: LGPL v*\\
+OSS:: LGPL \\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
None\\
@@ -40,20 +40,21 @@ \section{Software: Uranie}
\subsection{Software summary}
\label{sec:Uranie:summary}
-Detailed overview not available.
+The Uranie platform is based on ROOT and consequently inherits many ROOT characteristics, such as:
+\begin{itemize}
+ \item Interactive C++ interpreter (Cling) based on LLVM and Clang
+ \item Python interface (PyROOT)
+ \item SQL database access
+\end{itemize}
\subsection{Purpose}
\label{sec:Uranie:purpose}
-Purpose not available.
-\subsection{Programming and Computational Environment}
-\label{sec::Uranie:environment_capabilities}
+Uranie (the version under discussion here being v4.9.0) is a software platform dedicated to studies of uncertainty propagation, sensitivity analysis, surrogate model generation and calibration, based on ROOT (the corresponding version being v6.32.00). The motivation for the development of Uranie is the VVUQ (Verification, Validation and Uncertainty Quantification) approach for building numerical models of real physical phenomena of interest. Uranie is developed such that it interfaces well with CEA internal numerical simulation software.
-The following table summarizes these aspects for Uranie, providing a view of its programming and computational capabilities.
-
\begin{table}[h!]
\centering
{
@@ -73,8 +74,7 @@ \subsection{Programming and Computational Environment}
Multithread\\
\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
-ASCII\\
-JSON\\
+SALOME format\\
ROOT\\
SQL\\
\end{tabular} & Data formats that the software can handle or produce.\\
@@ -96,7 +96,8 @@ \subsection{Programming and Computational Environment}
None\\
\end{tabular} & Container technologies used to package and deploy the software.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
-Salome\\
+Salome \\
+Trust \\
\end{tabular} & List of software Uranie has interfaces with.\\
\bottomrule
\end{tabular}
@@ -108,15 +109,19 @@ \subsection{Programming and Computational Environment}
\subsection{Mathematics}
\label{sec:Uranie:mathematics}
-Mathematics not available.
-
-In this section, provide a summary the mathematics used in the software.
+The mathematics used in Uranie are related to:
+\begin{itemize}
+ \item Surrogate modeling \& machine learning techniques: Gaussian process regression, neural networks, polynomial chaos expansion
+ \item Optimization: EGO-like algorithms and ant-colony-based metaheuristics
+ \item Calibration: latent parameter estimation \& Bayesian algorithms
+ \item Sensitivity analysis: HSIC indices estimation and Sobol' indices (an illustrative sketch is given after this list).
+\end{itemize}
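+
+As an illustration of the sampling and sensitivity-analysis ingredients listed above (and not of Uranie's own classes), the sketch below draws a Latin Hypercube design with SciPy and estimates first-order Sobol' indices of the Ishigami test function with a simple pick-and-freeze Monte Carlo estimator.
+\begin{minted}{python}
+# Conceptual sketch (not Uranie's API): LHS design + first-order Sobol' indices
+# of the Ishigami function via a pick-and-freeze estimator.
+import numpy as np
+from scipy.stats import qmc
+
+def ishigami(x, a=7.0, b=0.1):
+    return np.sin(x[:, 0]) + a * np.sin(x[:, 1])**2 + b * x[:, 2]**4 * np.sin(x[:, 0])
+
+d, n = 3, 2**14
+lo, hi = -np.pi, np.pi
+
+# Two independent designs A and B on [-pi, pi]^3, drawn with Latin Hypercube Sampling.
+sampler = qmc.LatinHypercube(d=d, seed=1)
+A = qmc.scale(sampler.random(n), [lo] * d, [hi] * d)
+B = qmc.scale(sampler.random(n), [lo] * d, [hi] * d)
+
+fA, fB = ishigami(A), ishigami(B)
+var = np.var(np.concatenate([fA, fB]), ddof=1)
+
+for i in range(d):
+    ABi = A.copy()
+    ABi[:, i] = B[:, i]                              # freeze all inputs except x_i
+    S_i = np.mean(fB * (ishigami(ABi) - fA)) / var   # Saltelli (2010) estimator
+    print(f"S_{i + 1} ~ {S_i:.3f}")
+\end{minted}
+The expected values for the Ishigami function are roughly $S_1 \approx 0.31$, $S_2 \approx 0.44$ and $S_3 \approx 0$, which the estimator recovers for a sufficiently large design.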
\subsection{Relevant Publications}
\label{sec:Uranie:publications}
-Here is a list of relevant publications related to the software:
+Here is the relevant publication to cite for Uranie:
\begin{itemize}
\item \fullcite{blanchard_uranie_2019}
@@ -126,10 +131,7 @@ \subsection{Relevant Publications}
\subsection{Acknowledgements}
\label{sec::Uranie:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
-
-
+The software has been developed at \href{https://www.cea.fr/}{CEA} and funded by the SIMU research program.
-Acknowledgements not available.
diff --git a/software/zellij/WP5/WP5.tex b/software/zellij/WP5/WP5.tex
index 6f6fd3f..6c13a12 100644
--- a/software/zellij/WP5/WP5.tex
+++ b/software/zellij/WP5/WP5.tex
@@ -16,7 +16,7 @@ \section{Software: Zellij}
Inria Lille\\
\end{tabular} \\
\rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-el-ghazali.talbi@univ-lille.fr\\
+el-ghazali.talbi@univ-lille.fr, thomas.firmin@univ-lille.fr \\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU Only \\
@@ -26,10 +26,9 @@ \section{Software: Zellij}
OSS: Cecill-*\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
-B10 - Scientific Productivity\\
-B11 - Reproducibility and Replicability of Computation\\
-B6 - Data Management\\
B7 - Exascale Algorithms\\
+B9 - Resilience, robustness and accuracy \\
+B10 - Scientific Productivity\\
\end{tabular} \\
\bottomrule
\end{tabular}
@@ -40,46 +39,18 @@ \section{Software: Zellij}
\subsection{Software Overview}
\label{sec:WP5:Zellij:summary}
-In~\cref{tab:WP5:Zellij:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
-
-\begin{table}[h!]
- \centering
- {
- \setlength{\parindent}{0pt}
- \def\arraystretch{1.25}
- \arrayrulecolor{numpexgray}
- {
- \fontsize{9}{11}\selectfont
- \begin{tabular}{!{\color{numpexgray}\vrule}p{.25\linewidth}!{\color{numpexgray}\vrule}p{.6885\linewidth}!{\color{numpexgray}\vrule}}
-
- \rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Features} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-
-\rowcolor{white} Iterative methods & A {\it fractal-based decomposition algorithm} is organized around five search components: fractal geometry, tree search, scoring, exploration and exploitation. It decomposes the search space via a given \textit{fractal geometrical object}. {\it Tree search} allows a dynamic and hierarchical fractal decomposition of the search space. A \textit{scoring} search component allows to balance the exploration and exploitation of the search space by selecting the promising fractals. \\
-
-\rowcolor{numpexlightergray} Metaheuristics (e.g. gradient, evolutionary algorithms) & The {\it exploration} could be done in a passive way (e.g. Markov chain Monte Carlo (MCMC) sampling, low discrepancy sequences) or in an active way (e.g. metaheuristics such as evolutionary algorithms and swarm intelligence, surrogate-based optimization). The challenge here is to find efficient sampling algorithms for diverse and complex fractals such as polytopes. When a fractal reaches the maximum depth of the partition tree, an {\it exploitation} algorithm can be applied to it. The exploitation phase has not to be constrained within a fractal. The only bounds will be ones from the initial search space so that the exploitation can move freely toward a local or global optimum. One can use local search strategies such as gradient-based algorithms or simulated annealing.
-\end{tabular}
- }
- }
- \caption{WP5: Zellij Features}
- \label{tab:WP5:Zellij:features}
-\end{table}
+Zellij is a {\it fractal-based decomposition algorithm} for solving optimization problems. Zellij is organized around five search components: fractal geometry, tree search, scoring, exploration and exploitation. It decomposes the search space via a given \textit{fractal geometrical object}. {\it Tree search} allows a dynamic and hierarchical fractal decomposition of the search space. A \textit{scoring} search component balances the exploration and exploitation of the search space by selecting the most promising fractals. \\
+The {\it exploration} can be done in a passive way (e.g. Markov chain Monte Carlo (MCMC) sampling, low-discrepancy sequences) or in an active way (e.g. metaheuristics such as evolutionary algorithms and swarm intelligence, or surrogate-based optimization). The challenge here is to find efficient sampling algorithms for diverse and complex fractals such as polytopes. When a fractal reaches the maximum depth of the partition tree, an {\it exploitation} algorithm can be applied to it. The exploitation phase does not have to be constrained within a fractal; the only bounds are those of the initial search space, so that the exploitation can move freely toward a local or global optimum. One can use local search strategies such as gradient-based algorithms or simulated annealing.
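+
+A minimal, self-contained sketch of this decomposition principle (hyperrectangles as the fractal object, the best sampled value as the score, random sampling as exploration, and a best-first tree search) is given below; it illustrates the general idea only and is not Zellij's actual interface.
+\begin{minted}{python}
+# Conceptual sketch (not Zellij's API): fractal-style decomposition of a box
+# search space with a best-first tree search and sampling-based exploration.
+import heapq
+import numpy as np
+
+def objective(x):                       # toy continuous objective (sphere function)
+    return float(np.sum(x**2))
+
+def score(lower, upper, rng, n=16):
+    """Explore a hyperrectangle by sampling; its score is the best value found."""
+    pts = rng.uniform(lower, upper, size=(n, lower.size))
+    vals = np.array([objective(p) for p in pts])
+    j = int(np.argmin(vals))
+    return vals[j], pts[j]
+
+def decompose(lower, upper):
+    """Split the box into two children along its longest dimension."""
+    k = int(np.argmax(upper - lower))
+    mid = 0.5 * (lower[k] + upper[k])
+    u1, l2 = upper.copy(), lower.copy()
+    u1[k], l2[k] = mid, mid
+    return [(lower, u1), (l2, upper)]
+
+rng = np.random.default_rng(0)
+lower, upper = -5.0 * np.ones(5), 5.0 * np.ones(5)
+best_val, best_x = score(lower, upper, rng)
+frontier = [(best_val, 0, 0, lower, upper)]     # (score, depth, tiebreak, box)
+tiebreak, max_depth = 1, 8
+
+while frontier:
+    _, depth, _, lo, up = heapq.heappop(frontier)   # most promising box first
+    if depth >= max_depth:
+        continue                        # an exploitation phase could start here
+    for child_lo, child_up in decompose(lo, up):
+        val, x = score(child_lo, child_up, rng)
+        if val < best_val:
+            best_val, best_x = val, x
+        heapq.heappush(frontier, (val, depth + 1, tiebreak, child_lo, child_up))
+        tiebreak += 1
+
+print("best value found:", best_val, "at", np.round(best_x, 3))
+\end{minted}
+In the full algorithm, the scoring component is used to select promising fractals rather than eventually expanding every box as this short sketch does.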
\subsection{Parallel Capabilities}
\label{sec:WP5:Zellij:performances}
-\begin{itemize}
- \item describe the parallel programming environment : Some parallel models of fractal optimization algorithms have been implemented using MPI.
-
- In short term perspective, We are currently investigating the use of GPUs to implement some parallel models of fractal-based optimization algorithms using Kokkos parallel environment.
+Some parallel models of fractal optimization algorithms have been implemented using MPI. In the short term, we are investigating the use of GPUs to implement some parallel models of fractal-based optimization algorithms using the Kokkos parallel environment.
- In medium term perspective, hybrid parallel computing will be considered using MPI and Kokkos. The main goal is to target massively parallel clusters of CPU-GPUs.
-
- \item describe the parallel computation environment: type of architecture and super computer used.
+In the medium term, hybrid parallel computing combining MPI and Kokkos will be considered. The main goal is to target massively parallel clusters of CPUs and GPUs.
- The MPI version has been deployed on multi-nodes and multi-CPUs distributed environments (e.g. Grid'5000). Some experiments were conducted on 66 Intel Xeon Gold 5220 CPUs of 18 cores, with 96 GiB of RAM each. Each CPU is mounted on a Dell PowerEdge R640 rack, and each are on a single node. Nodes are connected on a DELL POWERSWITCH Z9264F-ON, via Ethernet configured at a 25Gbps rate, with SR-IOV activated.
-
- \item describe the parallel capabilities of the software
+The MPI version has been deployed on multi-node and multi-CPU distributed environments (e.g. Grid'5000). Some experiments were conducted on 66 Intel Xeon Gold 5220 CPUs with 18 cores and 96 GiB of RAM each. Each CPU is mounted on a Dell PowerEdge R640 rack, each on a single node. Nodes are connected through a DELL POWERSWITCH Z9264F-ON via Ethernet configured at 25 Gbps, with SR-IOV activated. \\
Three parallel designs focusing on different levels of a metaheuristic can be designed (an iteration-level sketch is given after this list):
\begin{itemize}
@@ -88,86 +59,54 @@ \subsection{Parallel Capabilities}
\item Solution-level: The evaluation of a single solution is parallelized. We are not concerned by this parallelization level, as it depends on the target optimization problem. This parallel level can be relevant in the case where the objective function is expensive in terms of computation time.
\end{itemize}
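+
+The iteration-level model mentioned in the list above amounts to evaluating the candidate solutions of one iteration concurrently. The hedged sketch below illustrates this master/worker pattern with Python's standard process pool; it is illustrative only and does not reflect Zellij's implementation.
+\begin{minted}{python}
+# Conceptual sketch: iteration-level parallelism, i.e. evaluating the candidate
+# solutions of one iteration of a metaheuristic concurrently (master/worker style).
+from concurrent.futures import ProcessPoolExecutor
+import numpy as np
+
+def objective(x):
+    # Stand-in for an expensive black-box objective function.
+    return float(np.sum(x**2))
+
+def one_iteration(population, executor):
+    """Evaluate a whole population in parallel and return its best candidate."""
+    values = list(executor.map(objective, population))
+    best = int(np.argmin(values))
+    return population[best], values[best]
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    best_x, best_v = None, np.inf
+    with ProcessPoolExecutor(max_workers=4) as executor:
+        for _ in range(10):              # ten iterations of a toy random search
+            population = rng.normal(scale=2.0, size=(32, 10))
+            x, v = one_iteration(population, executor)
+            if v < best_v:
+                best_x, best_v = x, v
+    print("best value after 10 iterations:", best_v)
+\end{minted}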
- \item \textbf{Scalability:} Describe the general scalability properties of the software
-
-The algorithm level and solution-level are limited in terms of scalability. Using the iteration-level in conjunction with the two other models, one can generate highly scalable optimization algorithms.
-
-The efficiency of the parallel models depends also on the cost of the objective function and the dimension of the optimization problem. Less scalability is need for easy optimization problems where the objective function is not expensive and/or the dimen,sion is small.
-
-\item \textbf{Integration with Other Systems:} Describe how the software integrates with other numerical libraries in the Exa-MA framework.
+The algorithm-level and solution-level models are limited in terms of scalability. Using the iteration-level model in conjunction with the two other models, one can generate highly scalable optimization algorithms. \\
-The framework can been integrated with many librairies. For AutoML experiments, it will be integrated with PyTorch and BoTorch.
-
-\end{itemize}
+The efficiency of the parallel models also depends on the cost of the objective function and the dimension of the optimization problem. Less scalability is needed for easy optimization problems, where the objective function is not expensive and/or the dimension is small. The framework can be integrated with many libraries. For AutoML experiments, it will be integrated with PyTorch and BoTorch.
\subsection{Initial Performance Metrics}
\label{sec:WP5:Zellij:metrics}
-This section provides a summary of initial performance benchmarks performed in the context of WP5. It ensures reproducibility by detailing input/output datasets, benchmarking tools, and the results. All data should be publicly available, ideally with a DOI for future reference.
-
\begin{itemize}
-\item \textbf{Overall Performance:} Summarize the software's computational performance, energy efficiency, and scalability results across different architectures (e.g., CPU, GPU, hybrid systems).
+\item \textbf{Overall Performance:}
-Preliminary computational experiments show the importance of the asynchronous issues for high-dimension and/or stochastic optimization problems. This work also shows the impact of the
-parallel implementation to the various search components such
-as tree search, fractal object, exploration, and exploitation.
+Preliminary computational experiments show the importance of asynchronous issues for high-dimensional and/or stochastic optimization problems. This work also shows the impact of the parallel implementation on the various search components, such as tree search, fractal object, exploration, and exploitation. \\
-The obtained speedups were sublinear, except for
-one function of the AutoML benchmark, where we observed
-a superlinear speedup. Different behaviors were obtained according to the configuration of the search components and
-the computation time of the objective function. We showed
+The obtained speedups were sublinear, except for one function of the AutoML benchmark, where we observed a superlinear speedup. Different behaviors were obtained according to the configuration of the search components and the computation time of the objective function. We showed
that the number of processes and their distribution should be carefully selected according to the problem’s dimensionality and/or the computation time of the objective function.
-The genericity and flexibility of Zellij, allows the design
-and the experimentation of new efficient search components
-for a better scalability to handle exascale optimization. Future experiments on massively parallel architectures will allow analyzing the limitations of the developed master/workers models and hence introduce more distributed parallel models.
+The genericity and flexibility of Zellij allow the design and experimentation of new efficient search components for better scalability to handle exascale optimization. Future experiments on massively parallel architectures will allow analyzing the limitations of the developed master/worker models and hence introduce more distributed parallel models. \\
-Another important perspective is the solving of real-life
-complex optimization problems such as the hyperparameters
-optimization and the neural architecture search of deep neural networks. Those high-impact optimization problems are
-characterized by expensive and stochastic objective functions
+Another important perspective is solving real-life complex optimization problems such as hyperparameter optimization and neural architecture search for deep neural networks. Those high-impact optimization problems are characterized by expensive and stochastic objective functions
(i.e., learning procedures) and a huge number of variables (i.e., thousands to millions).
-
-\item \textbf{Input/Output Dataset:} Provide a detailed description of the dataset used for the benchmark, including:
+\item \textbf{Input/Output Dataset:}
\begin{itemize}
-\item Input dataset size, structure, and format (e.g., CSV, HDF5, NetCDF).
-
-Optimization problems to solve are defined as functions in a program (e.g. Python or C++): ${f: \mathcal{S} \subset \mathbb{R}^n \rightarrow \mathbb{R}}$, where $\hat{x}$ is the global optima, $f$ the objective function, and $\mathcal{S}$ a compact set made of inequalities (e.g. upper and lower bounds of the search space).
-
-\item Output dataset format and key results.
+\item Input dataset: Optimization problems to solve are defined as functions in a program (e.g. Python or C++): ${f: \mathcal{S} \subset \mathbb{R}^n \rightarrow \mathbb{R}}$, where $f$ is the objective function, $\mathcal{S}$ a compact set defined by inequalities (e.g. upper and lower bounds of the search space), and $\hat{x} \in \mathcal{S}$ the global optimum.
-The output is mainly the solutions found (i.e vectors of continuous values) and their quality. Some plots related to convergence can also be obtained.
+\item Output dataset: The output is mainly the solutions found (i.e. vectors of continuous values) and their quality. Some plots related to convergence can also be obtained.
-\item Location of the dataset (e.g., GitHub repository, institutional repository, or open access platform).
-
-In recent years, there have been significant developments in the field of optimization, with new algorithms being proposed to solve challenging problems. As a result, it has become increasingly important to update traditional testing criteria to evaluate the performance of these optimization algorithms.
+\item Location of the dataset: In recent years, there have been significant developments in the field of optimization, with new algorithms being proposed to solve challenging problems. As a result, it has become increasingly important to update traditional testing criteria to evaluate the performance of these optimization algorithms.
Therefore, three main standard libraries are used:
\begin{itemize}
-\item CEC2020 benchmark: Single Objective Bound Constrained Numerical Optimization. See URL https://www.kaggle.com/code/kooaslansefat/cec-2022-benchmark.
-\item The blackbox optimization benchmarking (bbob) test suite. See URL https://numbbo.github.io/coco/testsuites/bbob.
-\item HPOBench: A collection of Reproducible Multi-Fidelity Benchmark Problems for hyperparameter optimization of machine learning models. See URL https://github.com/automl/HPOBench.
+\item CEC2020 benchmark: Single Objective Bound Constrained Numerical Optimization. \\
+See \url{https://www.kaggle.com/code/kooaslansefat/cec-2022-benchmark}
+\item The blackbox optimization benchmarking (bbob) test suite. \\
+See \url{https://numbbo.github.io/coco/testsuites/bbob}
+\item HPOBench: A collection of Reproducible Multi-Fidelity Benchmark Problems for hyperparameter optimization of machine learning models. \\
+See \url{https://github.com/automl/HPOBench}
\end{itemize}
-
-\item DOI or permanent link for accessing the dataset.
\end{itemize}
-\item \textbf{open-data Access:} Indicate whether the datasets used for the benchmark are open access, and provide a DOI or a direct link for download. Where applicable, highlight any licensing constraints.
-
-All the datasets used are open access. The links are provided in the previous section.
-
-\item \textbf{Challenges:} Identify any significant bottlenecks or challenges observed during the benchmarking process, including data handling and computational performance.
+\item \textbf{open-data Access:} All the datasets used are open access. The links are provided in the previous section.
-The main challenges of the benchmarking process are twofold:
+\item \textbf{Challenges:} The main challenges of the benchmarking process are twofold:
\begin{itemize}
\item Scalability of parallel decomposition-based optimization algorithms: it consists in analyzing and improving the obtained speedups of the various parallel models that can be designed.
\item Characteristics of the target optimization problem: it consists in studying the characteristics of the optimization problems (e.g. dimension, cost of the objective function) that affect the performance of the algorithms
\end{itemize}
-\item \textbf{Future Improvements:} Outline areas for optimization, including dataset handling, memory usage, or algorithmic efficiency, to address identified challenges.
-
-The future improvements will concern the following aspects:
+\item \textbf{Future Improvements:} The future improvements will concern the following aspects:
\begin{itemize}
\item Parallel models of algorithms: it will be interesting to use the three parallel models of decomposition-based optimization algorithms in conjunction. Those hybrid models will improve the scalability of the algorithms to target Exascale architectures.
@@ -183,9 +122,7 @@ \subsection{Initial Performance Metrics}
\subsubsection{Benchmark \#1}
\begin{itemize}
-\item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
-
-The first family of benchmarks consists in well-known benchmarks in global optimization community, such as CEC2020, SOCO2011 and BBOB.
+\item \textbf{Description:} The first family of benchmarks consists in well-known benchmarks in global optimization community, such as CEC2020, SOCO2011 and BBOB.
\item \textbf{Benchmarking Tools Used:}
@@ -196,16 +133,13 @@ \subsubsection{Benchmark \#1}
\item \textbf{Input/Output Dataset Description:}
\begin{itemize}
-\item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+\item \textbf{Input Data:} the input data describe the optimization problem to solve (e.g. a continuous function) and its associated data.
-\item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
-
-\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
-\end{itemize}
+\item \textbf{Output Data:} the results detail the best solution found for the problem.
-\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+\item \textbf{Results Summary:} solving academic optimization problems (i.e. with inexpensive objective functions), even high-dimensional ones, is not an issue in terms of execution time or other resource usage. However, solving AutoML optimization problems, e.g. neural architecture search for deep neural networks or hyperparameter optimization of large language models (LLMs), takes a lot of time (e.g. several days). Such experiments need access to at least petaflop-scale architectures.
-\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+\item \textbf{Challenges Identified:} the main challenge is the scalability of the designed parallel models. Moreover, load balancing and fault tolerance strategies have to be designed to support large-scale heterogeneous parallel architectures.
\item \textbf{Benchmarking Tools Used:}
@@ -213,62 +147,48 @@ \subsubsection{Benchmark \#1}
Those benchmarks are made of more than 24 functions with peculiar properties such as separability, ill-conditioning, multi-modality and weak structured multi-modality. Each function has many different instances, and all are available for many dimension sizes, $d \in \{ 2,3,5,10,20,40, ... \}$.
\end{itemize}
-The following metrics have been used in our experiments: measured are: execution time, budget, quality of solutions, speedup).
-\begin{itemize}
-\item \textbf{Input/Output Dataset Description:}
+The following metrics have been used in our experiments: execution time, budget, quality of solutions, and speedup. Convergence plots are also provided to compare the behavior of different optimization algorithms.
+%\begin{itemize}
-\item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+%\item \textbf{Input/Output Dataset Description:}
-\item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+%\item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
-\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+%\item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
-\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+%\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
-\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
-\end{itemize}
+%\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+
+%\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\end{itemize}
\subsubsection{Benchmark \#2}
-\begin{itemize}
-\item \textbf{Description:} Briefly describe the benchmark case, including the problem size, target architecture (e.g., CPU, GPU), and the input data. Mention the specific goals of the benchmark (e.g., testing scalability, energy efficiency).
-\end{itemize}
The second family of benchmarks consists in well-known benchmarks from machine learning community (AutoML).
+
The main characteristics of those problems are their black-box and expensive objective functions.
-Automated Machine Learning (AutoML) consists in automating
-the application of machine-learning models to realworld
-applications. The goal of this approach is to design
-better machine learning models. Because, we are interested
-in applying this methodology to the HyperParameter Optimization
-(HPO) of expensive deep neural networks, we
-have selected three benchmarks from HPOBench. We
+Automated Machine Learning (AutoML) consists in automating the application of machine-learning models to real-world applications. The goal of this approach is to design better machine learning models. Because we are interested in applying this methodology to the HyperParameter Optimization
+(HPO) of expensive deep neural networks, we have selected three benchmarks from HPOBench. We
performed a HPO on 3 functions:
\begin{itemize}
-\item Random Forest (RF): 4 hyperparameters (n = 4) are
-optimized, with a budget of 20000 calls.
-\item Multi-layer Neural Network (NN): 5 hyperparameters
-(n = 5) are optimized, with a budget of 6000 calls.
-\item Histogram-Based Gradient Boosting (HBGB): 4 hyperparameters
-(n = 4) are optimized, with a budget of 6000 calls.
+\item Random Forest (RF): 4 hyperparameters (n = 4) are optimized, with a budget of 20000 calls.
+\item Multi-layer Neural Network (NN): 5 hyperparameters (n = 5) are optimized, with a budget of 6000 calls.
+\item Histogram-Based Gradient Boosting (HBGB): 4 hyperparameters (n = 4) are optimized, with a budget of 6000 calls.
\end{itemize}
The budget was defined according to their computation time. These models were trained and tested on a given task of OpenML (supervised classification on car). We have selected these three models for their simplicity. They do not require any GPU, and the computation time is low compared to expensive DNN benchmarks, but higher compared to low-cost academic functions usually used to
validate performances of continuous optimization algorithms.
-The following metrics have been used in our experiments: measured are: execution time, budget, quality of ML models, speedup).
+The following metrics have been used in our experiments: execution time, budget, quality of ML models, speedup, and convergence plots.
\begin{itemize}
-\item \textbf{Input/Output Dataset Description:}
-
-
-\item \textbf{Input Data:} Describe the input dataset (size, format, data type) and provide a DOI or link to access it.
+\item \textbf{Input/Output Dataset Description:} HPOBench is available at \url{https://www.automl.org/hpo-overview/hpo-benchmarks/hpobench/}.
\end{itemize}
-The HPBbench is available under the URL: \url{https://www.automl.org/hpo-overview/hpo-benchmarks/hpobench/}
-
HPOBench aims at solving the aforementioned issues with the following contributions:
\begin{itemize}
@@ -277,36 +197,26 @@ \subsubsection{Benchmark \#2}
\item Surrogate and Tabular benchmarks provide a cheap way of evaluating the target algorithms.
\end{itemize}
-HPOBench currently contains more than 100 multi-fidelity benchmark problems with various properties: numerical and categorical configuration space, different difficulties, and complexities. Furthermore, HPOBench also provides the result of several popular HPO packages to make them easier to be compared with the new HPO algorithms. For more information, please check our HPOBench GitHub repository/
+HPOBench currently contains more than 100 multi-fidelity benchmark problems with various properties: numerical and categorical configuration spaces, and different difficulties and complexities. Furthermore, HPOBench also provides the results of several popular HPO packages to make comparisons with new HPO algorithms easier. For more information, please check the HPOBench GitHub repository.
-\begin{itemize}
-\item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
+%\begin{itemize}
+%\item \textbf{Output Data:} Specify the structure of the results (e.g., memory usage, runtime logs) and how they can be accessed or replicated.
-\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
+%\item \textbf{Data Repository:} Indicate where the data is stored (e.g., Zenodo, institutional repository) and provide a DOI or URL for accessing the data.
-\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
+%\item \textbf{Results Summary:} Include a summary of key metrics (execution time, memory usage, FLOPS) and their comparison across architectures (e.g., CPU, GPU).
-\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
+%\item \textbf{Challenges Identified:} Describe any bottlenecks encountered (e.g., memory usage, parallelization inefficiencies) and how they impacted the benchmark.
\end{itemize}
\subsection{12-Month Roadmap}
\label{sec:WP5:Zellij:roadmap}
-In this section, describe the roadmap for improving benchmarks and addressing the challenges identified. This should include:
\begin{itemize}
-\item \textbf{Data Improvements:} Plans for improving input/output data management, including making datasets more accessible and ensuring reproducibility through open-data initiatives.
-
-There are no issues concerning the improvements of input/output data management. All datasets are accessible and reproducibility is ensured through open-data initiatives.
-
-\item \textbf{Methodology Application:} Implementation of the benchmarking methodology proposed in this deliverable to streamline reproducibility and dataset management.
-
-All obtained results will be published in HAL, conferences and journals.
-
-\item \textbf{Results Retention:} Plans to maintain benchmark results in a publicly accessible repository with appropriate metadata and documentation, ensuring long-term usability.
-
-All obtained results will be published in HAL, conferences and journals.
+\item \textbf{Data Improvements:} There are no issues concerning the improvement of input/output data management. All datasets are accessible, and reproducibility is ensured through open-data initiatives.
+\item \textbf{Methodology Application and Results Retention:} All obtained results are open access and are published in HAL, conferences and journals.
\end{itemize}
In~\cref{tab:WP5:Zellij:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
@@ -325,20 +235,11 @@ \subsection{12-Month Roadmap}
\rowcolor{numpexgray}{\rule{0pt}{2.5ex}\color{white}\bf Bottlenecks} & {\rule{0pt}{2.5ex}\color{white}\bf Short Description }\\
-\rowcolor{white} B10 - Scientific Productivity & The obtained results in terms of parallel algorithm design and implementation and their application to big optimization problems will be published in conferences and journals in the field of optimization and parallel computing.
-
- \\
-
-\rowcolor{numpexlightergray} B11 - Reproducibility and Replicability of Computation &
-
-All the developed software will be available in open source in the software library zellij. The benchmarks used are standard data from optimization and machine learning communities. \\
-
-\rowcolor{white} B6 - Data Management & There is no important issues regarding data management in current studies.
-
- \\
-
\rowcolor{numpexlightergray} B7 - Exascale Algorithms & We are currently investigating the parallel implementation of the algorithms on hybrid inter-node architectures combining GPU and CPU using the MPI and Kokkos environments. In addition to the Grid'5000 platform, we will target larger-scale machines such as Jean Zay. \\
+\rowcolor{white} B9 - Resilience, robustness and accuracy & Some load balancing and fault tolerance strategies have to be designed for large Exascale heterogeneous architectures. \\
+
+ \rowcolor{numpexlightergray} B10 - Scientific Productivity & The obtained results in terms of parallel algorithm design and implementation and their application to big optimization problems will be published in conferences and journals in the field of optimization and parallel computing. \\
\end{tabular}
}
}
diff --git a/software/zellij/zellij.tex b/software/zellij/zellij.tex
index 0770193..c8de6b8 100644
--- a/software/zellij/zellij.tex
+++ b/software/zellij/zellij.tex
@@ -16,7 +16,7 @@ \section{Software: Zellij}
Inria Lille\\
\end{tabular} \\
\rowcolor{white}\textbf{Contact Emails} & \begin{tabular}{l}
-el-ghazali.talbi@univ-lille.fr\\
+el-ghazali.talbi@univ-lille.fr, thomas.firmin@univ-lille.fr \\
\end{tabular} \\
\rowcolor{numpexlightergray}\textbf{Supported Architectures} & \begin{tabular}{l}
CPU only \\
@@ -26,15 +26,17 @@ \section{Software: Zellij}
OSS: Cecill-*\\
\end{tabular} \\
\rowcolor{white}\textbf{Bottlenecks roadmap} & \begin{tabular}{l}
-B10 - Scientific Productivity\\
-B11 - Reproducibility and Replicability of Computation \\
-B6 - Data Management\\
B7 - Exascale Algorithms\\
+B9 - Resilience, robustness, accuracy \\
+B10 - Scientific productivity \\
\end{tabular} \\
- \bottomrule
+ \rowcolor{numpexlightergray}\textbf{Contributors} & \begin{tabular}{l}
+Thomas Firmin, El-Ghazali Talbi \\
+\end{tabular} \\
+\bottomrule
\end{tabular}
}}
- \caption{Zellij Information}
+ \caption{WP5}
\end{table}
\subsection{Software summary}
@@ -85,7 +87,7 @@ \subsection{Programming and Computational Environment}
MPI\\
\end{tabular} & Parallel computing methods and frameworks utilized by the software.\\
\rowcolor{white}Data Formats & \begin{tabular}{l}
-None \\
+In-house format \\
\end{tabular} & Data formats that the software can handle or produce.\\
\rowcolor{numpexlightergray}Resilience & \begin{tabular}{l}
Checkpoint restart\\
@@ -94,13 +96,13 @@ \subsection{Programming and Computational Environment}
None \\
\end{tabular} & Outlines the development and operational practices including continuous integration, containerization, and testing methodologies. \\
\rowcolor{numpexlightergray}Packaging & \begin{tabular}{l}
-None \\
+GitHub \\
\end{tabular} & Software packaging and distribution.\\
- \rowcolor{white}Testing & \begin{tabular}{l}
-None\\
+ \rowcolor{white} Testing & \begin{tabular}{l}
+Unit verification \\
\end{tabular} & Testing methodologies employed to ensure software quality and correctness.\\
\rowcolor{numpexlightergray} Containerization & \begin{tabular}{l}
-None\\
+None \\
\end{tabular} & Container technologies used to package and deploy the software.\\
\rowcolor{white}Interfaces & \begin{tabular}{l}
None\\
diff --git a/templates/desc-software.tex b/templates/desc-software.tex
index 95f4b43..c90c3f9 100644
--- a/templates/desc-software.tex
+++ b/templates/desc-software.tex
@@ -3,7 +3,7 @@ \section{Software: \VAR{software.name}}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -40,7 +40,7 @@ \subsection{Programming and Computational Environment}
The following table summarizes these aspects for \VAR{software.name}, providing a view of its programming and computational capabilities.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{
\setlength{\parindent}{0pt}
@@ -82,7 +82,7 @@ \subsection{Relevant Publications}
\subsection{Acknowledgements}
\label{sec::\VAR{software.name}:acknowledgements}
-The software has been developed with the support of the following funding agencies and institutions:
+The software has been developed with the support of the following funding agencies and institutions:
diff --git a/templates/wp-software.tex b/templates/wp-software.tex
index c621bcc..eebd9d0 100644
--- a/templates/wp-software.tex
+++ b/templates/wp-software.tex
@@ -1,7 +1,7 @@
\section{Software: \VAR{software.name}}
\label{sec:\VAR{wp}:\VAR{software.name}:software}
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
{ \setlength{\parindent}{0pt}
\def\arraystretch{1.25}
@@ -27,9 +27,9 @@ \subsection{Software Overview}
In~\cref{tab:\VAR{wp}:\VAR{software.name}:features} we provide a summary of the software features relevant to the work package which are briefly discussed.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}
@@ -101,13 +101,13 @@ \subsection{12-Month Roadmap}
In~\cref{tab:\VAR{wp}:\VAR{software.name}:bottlenecks}, we briefly discuss the bottleneck roadmap associated to the software and relevant to the work package.
-\begin{table}[h!]
+\begin{table}[!ht]
\centering
-
-
+
+
\centering
- {
+ {
\setlength{\parindent}{0pt}
\def\arraystretch{1.25}
\arrayrulecolor{numpexgray}