diff --git a/.codespell-ignore b/.codespell-ignore new file mode 100644 index 00000000..79f47edb --- /dev/null +++ b/.codespell-ignore @@ -0,0 +1,6 @@ +Fram +dropse +figurestyle +hist +namd +rouge diff --git a/.github/workflows/test_and_build.yml b/.github/workflows/test_and_build.yml index c7e3bd56..fa90883a 100644 --- a/.github/workflows/test_and_build.yml +++ b/.github/workflows/test_and_build.yml @@ -8,20 +8,26 @@ on: - gh-pages jobs: - spellcheck: + lint_common_files: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: 3.7 - name: Install codespell run: | - pip3 install codespell + sudo apt-get install codespell - name: Check spelling run: | make spellcheck + - name: Check top-level Markdown syntax + uses: DavidAnson/markdownlint-cli2-action@v9 + - name: Check episode Markdown syntax + uses: DavidAnson/markdownlint-cli2-action@v9 + with: + globs: _episodes/*.md + - name: Check extra Markdown syntax + uses: DavidAnson/markdownlint-cli2-action@v9 + with: + globs: _extras/*.md check_lesson_and_build_default: runs-on: ubuntu-latest @@ -29,7 +35,8 @@ jobs: - uses: actions/checkout@v2 - uses: ruby/setup-ruby@v1 with: - ruby-version: '2.7' + ruby-version: '3.0.4' + bundler-cache: true - name: Install basic requirements run: | # Need this library for nokogiri @@ -38,9 +45,6 @@ jobs: bundle config set path '.vendor/bundle' bundle config build.nokogiri --use-system-libraries bundle install - - name: "Lint episode markdown" - run: | - find _episodes -name \*.md -exec bundle exec mdl -r MD001,MD003,MD005,MD006,MD007,MD008,MD009,MD010,MD011,MD012,MD015,MD016,MD017,MD018,MD019,MD020,MD021,MD022,MD023,MD025,MD035,MD036,MD037,MD038,MD039,MD046 {} \; - name: "Check lesson for warnings" run: | make lesson-check-all @@ -56,18 +60,21 @@ jobs: strategy: matrix: HPC_JEKYLL_CONFIG: + - Birmingham_Baskerville_slurm - ComputeCanada_Graham_slurm - - EPCC_Cirrus_pbs + - EPCC_Cirrus_slurm + - HPCC_MagicCastle_slurm + - Magic_Castle_EESSI_slurm - NIST_CTCMS_slurm - Norway_SIGMA2_SAGA_slurm - UCL_Myriad_sge - - Magic_Castle_EESSI_slurm - BSU_Borah_slurm steps: - uses: actions/checkout@v2 - uses: ruby/setup-ruby@v1 with: - ruby-version: '2.7' + ruby-version: '3.0.4' + bundler-cache: true - name: Install basic requirements run: | # Need this library for nokogiri @@ -79,6 +86,7 @@ jobs: - name: Check build ${{matrix.HPC_JEKYLL_CONFIG}} run: | make --always-make site HPC_JEKYLL_CONFIG=_includes/snippets_library/${{matrix.HPC_JEKYLL_CONFIG}}/_config_options.yml - - name: "Lint snippet markdown" - run: | - find _includes/snippets_library/${{matrix.HPC_JEKYLL_CONFIG}} -name \*.snip -exec bundle exec mdl -r MD001,MD003,MD005,MD006,MD007,MD008,MD009,MD010,MD011,MD012,MD015,MD016,MD017,MD018,MD019,MD020,MD021,MD022,MD023,MD025,MD035,MD036,MD037,MD038,MD039,MD046 {} \; + - name: Lint snippet markdown + uses: DavidAnson/markdownlint-cli2-action@v9 + with: + globs: _includes/snippets_library/${{matrix.HPC_JEKYLL_CONFIG}}/**/*.snip diff --git a/.markdownlint-cli2.yaml b/.markdownlint-cli2.yaml new file mode 100644 index 00000000..077a30a8 --- /dev/null +++ b/.markdownlint-cli2.yaml @@ -0,0 +1,73 @@ +# markdownlint-cli2: +# - Example: +# - Rules: +--- +config: + MD001: false # heading-increment + MD002: false # mdl rule undefined + MD003: true # heading-style + MD004: true # ul-style + MD005: true # list-indent + MD006: false # mdl rule undefined + MD007: true # ul-indent + MD008: false # mdl rule undefined + MD009: true # no-trailing-spaces 
+ MD010: true # no-hard-tabs + MD011: true # no-reversed-links + MD012: true # no-multiple-blanks + MD013: false # line-length + MD014: false # commands-show-output + MD015: false # mdl rule undefined + MD016: false # mdl rule undefined + MD017: false # mdl rule undefined + MD018: true # no-missing-space-atx + MD019: true # no-multiple-space-atx + MD020: true # no-missing-space-closed-atx + MD021: true # no-multiple-space-closed-atx + MD022: true # blanks-around-headings + MD023: true # heading-start-left + MD024: false # no-duplicate-heading + MD025: true # single-h1 + MD026: false # no-trailing-punctuation + MD027: false # no-multiple-space-blockquote + MD028: false # no-blanks-blockquote + MD029: true # ol-prefix + MD030: true # list-marker-space + MD031: false # blanks-around-fences + MD032: true # blanks-around-lists + MD033: false # no-inline-html + MD034: true # no-bare-urls + MD035: true # hr-style + MD036: true # no-emphasis-as-heading + MD037: true # no-space-in-emphasis + MD038: true # no-space-in-code + MD039: true # no-space-in-links + MD040: false # fenced-code-language + MD041: false # first-line-h1 + MD042: true # no-empty-links + MD043: false # required-headings + MD044: true # proper-names + MD045: true # no-alt-text + MD046: true # code-block-style + MD047: true # single-trailing-newline + MD048: true # code-fence-style + MD049: false # emphasis-style + MD050: false # strong-style + MD051: true # link-fragments + MD052: true # reference-links-images + MD053: false # link-image-reference-definitions + MD054: false # link-image-style + MD055: true # table-pipe-style + MD056: true # table-column-count + +ignores: + - .vendor + - .bundle + - _extras/figures.md # really HTML + - _includes/links.md # just a list of links + - "**/*.html" + - "**/*.sh" + - "**/*.yaml" + - "**/*.yml" + +# noProgress: true diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7843ea6f..466e7156 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -130,4 +130,3 @@ which everyone is welcome to join. You can also [reach us by email][email]. [swc-issues]: https://github.com/issues?q=user%3Aswcarpentry [swc-lessons]: https://software-carpentry.org/lessons/ [swc-site]: https://software-carpentry.org/ - diff --git a/Gemfile b/Gemfile index 5714bad6..cc53459d 100644 --- a/Gemfile +++ b/Gemfile @@ -1,4 +1,18 @@ -source "https://rubygems.org" -gem "github-pages", group: :jekyll_plugins +# frozen_string_literal: true + +source 'https://rubygems.org' + +git_source(:github) { |repo_name| "https://github.com/#{repo_name}" } + +# Synchronize with https://pages.github.com/versions +ruby '3.3.4' + +gem 'github-pages', group: :jekyll_plugins + +if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('3.0.0') + gem 'webrick', '>= 1.6.1' +end + gem "kramdown-parser-gfm" + gem "mdl" diff --git a/LICENSE.md b/LICENSE.md index edbad2a1..46deb500 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,9 +1,7 @@ --- -title: "Licenses" +title: "License: CC BY 4.0" --- -# CC BY 4.0 - ## Instructional Material All High Performance Computing Carpentry instructional material is diff --git a/Makefile b/Makefile index f451874e..d8add2b7 100644 --- a/Makefile +++ b/Makefile @@ -154,7 +154,7 @@ lesson-fixme : ## spellcheck spellcheck: - codespell --skip="assets,*.svg,.vendor" --quiet-level=2 -L "dropse,figurestyle,hist,namd,rouge" + codespell --skip=".bundle,.vendor,assets,*.svg" --quiet-level=2 --ignore-words=".codespell-ignore" ## ## IV. 
Auxililary (plumbing) commands diff --git a/_config.yml b/_config.yml index 5e499c49..c0732327 100644 --- a/_config.yml +++ b/_config.yml @@ -6,14 +6,14 @@ #------------------------------------------------------------ # Cluster host and scheduler options: the defaults come from -# Graham at Compute Canada, running Slurm. Other options can -# be found in the library of snippets, +# the HPC Carpentry tutorial cluster, running Slurm. Other +# options can be found in the library of snippets, # `_includes/snippets_library`. To use one, replace options # below with those in `_config_options.yml` from the # library. E.g, to customise for Cirrus at EPCC, running -# PBS, we could replace the options below with those from +# Slurm, we could replace the options below with those from # -# _includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml # # If your cluster is not represented in the library, please # copy an existing folder, rename it, and customize for your @@ -23,7 +23,7 @@ snippets: "/snippets_library/BSU_Borah_slurm" local: - prompt: "[user@laptop ~]$" + prompt: "[you@laptop:~]$" bash_shebang: "#!/usr/bin/env bash" remote: @@ -35,6 +35,7 @@ remote: homedir: "/bsuhome" user: "yourUsername" prompt: "[yourUsername@borah-login ~]$" + module_python3: "python/3.9.7" bash_shebang: "#!/usr/bin/env bash" sched: @@ -60,6 +61,7 @@ sched: info: "sinfo" comment: "#SBATCH" hist: "sacct" + hist_filter: "" episode_order: - 10-hpc-intro @@ -67,9 +69,9 @@ episode_order: - 12-cluster - 13-scheduler - 01-break - - 14-modules - - 15-transferring-files - - 18-responsibility + - 15-modules + - 16-transferring-files + - 19-responsibility #------------------------------------------------------------ # Values for this lesson @@ -79,7 +81,7 @@ episode_order: # "swc": Software Carpentry # "dc": Data Carpentry # "lc": Library Carpentry -# "cp": Carpentries (e.g., instructor traning) +# "cp": Carpentries (e.g., instructor training) carpentry: "incubator" # Overall title for pages. diff --git a/_episodes/11-connecting.md b/_episodes/11-connecting.md index 16809969..16e386f3 100644 --- a/_episodes/11-connecting.md +++ b/_episodes/11-connecting.md @@ -19,17 +19,17 @@ keypoints: ## Secure Connections The first step in using a cluster is to establish a connection from our laptop -to the cluster. When we are sitting at a computer, we have come to expect a -visual display with icons, widgets, and perhaps some windows or applications: -a _graphical user interface_, or GUI. Since computer clusters are remote -resources that we connect to over slow or intermittent interfaces (WiFi -and VPNs especially), it is more practical to use a _command-line interface_, -or CLI, to send commands as plain-text. If a command returns output, it is -printed as plain text as well. The commands we run today will not open a window +to the cluster. When we are sitting at a computer, we have come to expect a +visual display with icons, widgets, and perhaps some windows or applications: +a _graphical user interface_, or GUI. Since computer clusters are remote +resources that we connect to over slow or intermittent interfaces (WiFi +and VPNs especially), it is more practical to use a _command-line interface_, +or CLI, to send commands as plain-text. If a command returns output, it is +printed as plain text as well. The commands we run today will not open a window to show graphical results. 
-If you have already taken The Carpentries' courses on the UNIX Shell or -Version Control, you have used the CLI on your _local machine_ extensively. +If you have already taken The Carpentries' courses on the UNIX Shell or +Version Control, you have used the CLI on your _local machine_ extensively. The only leap to be made here is to open a CLI on a _remote machine_, while taking some precautions so that other folks on the network can't see (or change) the commands you're running or the results the remote machine sends @@ -70,12 +70,12 @@ the password prompt are not displayed on the screen. Normal output will resume once you press `Enter`. You may have noticed that the prompt changed when you logged into the remote -system using the terminal. This change is important because it can help you -distinguish on which system the commands you type will be run when you pass -them into the terminal. This change is also a small complication that we will -need to navigate throughout the workshop. Exactly what is displayed as the -prompt (which conventionally ends in `$`) in the terminal when it is connected -to the local system and the remote system will typically be different for +system using the terminal. This change is important because it can help you +distinguish on which system the commands you type will be run when you pass +them into the terminal. This change is also a small complication that we will +need to navigate throughout the workshop. Exactly what is displayed as the +prompt (which conventionally ends in `$`) in the terminal when it is connected +to the local system and the remote system will typically be different for every user. We still need to indicate which system we are entering commands on though so we will adopt the following convention: @@ -87,7 +87,7 @@ on though so we will adopt the following convention: ## Changing Your Password -When your account is created, Research Computing assigns you a password. The +When your account is created, Research Computing assigns you a password. The first thing you should do upon logging in is change it! You can change your password by entering the `passwd` command as shown below: @@ -105,12 +105,12 @@ the new password, and finally confirmation of the new password. > When prompted, enter a strong password that you will remember. There are two > common approaches to this: > -> 1. Create a memorable passphrase with some punctuation, mixed-case and -> number-for-letter substitutions, 32 characters or longer. Please note +> 1. Create a memorable passphrase with some punctuation, mixed-case and +> number-for-letter substitutions, 32 characters or longer. Please note > that passwords are case sensitive. > 2. Use a password manager and its built-in password generator with all > character classes, 25 characters or longer. [KeePass][keepass] and -> [BitWarden][bitwarden] are two good options. This is also a good option +> [BitWarden][bitwarden] are two good options. This is also a good option > for storing passwords. {: .callout} @@ -157,8 +157,8 @@ scratch ``` {: .output} -The system administrators have configured your home directory with a link -(a shortcut) to a scratch space reserved for you. You can also include +The system administrators have configured your home directory with a link +(a shortcut) to a scratch space reserved for you. 
You can also include hidden files in your directory listing: ``` diff --git a/_episodes/12-cluster.md b/_episodes/12-cluster.md index 82d640ef..15af47d7 100644 --- a/_episodes/12-cluster.md +++ b/_episodes/12-cluster.md @@ -92,23 +92,48 @@ system files and change as you install new software or upgrade your OS. > * __Home__ -- a _network filesystem_, data stored here is available > throughout the HPC system, and is backed up periodically; however, users > are limited on how much they can store. -> * __Scratch__ -- also a _network filesystem_, which has more space available -> than the Home directory, but it is not backed up, and should not be used +> * __Scratch__ -- also a _network filesystem_, which has more space available +> than the Home directory, but it is not backed up, and should not be used > for long term storage. {: .callout} +You can also explore the available filesystems using `df` to show **d**isk +**f**ree space. The `-h` flag renders the sizes in a human-friendly format, +i.e., GB instead of B. The **t**ype flag `-T` shows what kind of filesystem +each resource is. + +``` +{{ site.remote.prompt }} df -Th +``` +{: .language-bash} + +> ## Different results from `df` +> +> * The local filesystems (ext, tmp, xfs, zfs) will depend on whether +> you're on the same login node (or compute node, later on). +> * Networked filesystems (beegfs, cifs, gpfs, nfs, pvfs) will be similar +> -- but may include {{ site.remote.user }}, depending on how it +> is [mounted][mount]. +{: .discussion} + +> ## Shared Filesystems +> +> This is an important point to remember: files saved on one node +> (computer) are often available everywhere on the cluster! +{: .callout} + ## Nodes Recall that the individual computers that compose a cluster are called _nodes_. -On a cluster, there are different types of nodes for different types of tasks. -The node where you are right now is called the _login node_. A login node +On a cluster, there are different types of nodes for different types of tasks. +The node where you are right now is called the _login node_. A login node serves as the access point to the cluster _for all users_. As a gateway, the login node should not be used for time-consuming or -resource-intensive tasks as consuming the cpu or memory of the login node -would slow down the cluster for everyone! It is well suited for uploading -and downloading files, minor software setup, and submitting jobs to the -scheduler. Generally speaking, in these lessons, we will avoid running +resource-intensive tasks as consuming the cpu or memory of the login node +would slow down the cluster for everyone! It is well suited for uploading +and downloading files, minor software setup, and submitting jobs to the +scheduler. Generally speaking, in these lessons, we will avoid running jobs on the login node. Who else is logged in to the login node? @@ -208,60 +233,12 @@ connect to a shared, remote fileserver or cluster of servers. > {: .solution} {: .challenge} -> ## Explore the Login Node -> -> Now compare the resources of your computer with those of the login node. 
-> -> > ## Solution -> > -> > ``` -> > {{ site.local.prompt }} ssh {{ site.remote.user }}@{{ site.remote.login }} -> > {{ site.remote.prompt }} nproc --all -> > {{ site.remote.prompt }} free -h -> > ``` -> > {: .language-bash} -> > -> > You can get more information about the processors using `lscpu`, -> > and a lot of detail about the memory by reading the file `/proc/meminfo`: -> > -> > ``` -> > {{ site.remote.prompt }} less /proc/meminfo -> > ``` -> > {: .language-bash} -> > -> > You can also explore the available filesystems using `df` to show **d**isk -> > **f**ree space. The `-h` flag renders the sizes in a human-friendly format, -> > i.e., GB instead of B. The **t**ype flag `-T` shows what kind of filesystem -> > each resource is. -> > -> > ``` -> > {{ site.remote.prompt }} df -Th -> > ``` -> > {: .language-bash} -> > -> > > ## Different results from `df` -> > > -> > > * The local filesystems (ext, tmp, xfs, zfs) will depend on whether -> > > you're on the same login node (or compute node, later on). -> > > * Networked filesystems (beegfs, cifs, gpfs, nfs, pvfs) will be similar -> > > -- but may include {{ site.remote.user }}, depending on how it -> > > is [mounted][mount]. -> > {: .discussion} -> > -> > > ## Shared Filesystems -> > > -> > > This is an important point to remember: files saved on one node -> > > (computer) are often available everywhere on the cluster! -> > {: .callout} -> {: .solution} -{: .challenge} - {% include {{ site.snippets }}/cluster/specific-node-info.snip %} -> ## Compare Your Computer, the Login Node and the Compute Node +> ## Compare Your Computer and the Compute Node > > Compare your laptop's number of processors and memory with the numbers you -> see on the cluster login node and compute node. What implications do +> see on the cluster compute node. What implications do > you think the differences might have on running your research work on the > different systems and nodes? > diff --git a/_episodes/13-scheduler.md b/_episodes/13-scheduler.md index 44c3e70f..f3373903 100644 --- a/_episodes/13-scheduler.md +++ b/_episodes/13-scheduler.md @@ -184,7 +184,6 @@ Submit the job and monitor its status: Fantastic, we've successfully changed the name of our job! - ### Resource Requests What about more important changes, such as the number of cores and memory for @@ -239,8 +238,6 @@ later episode of this lesson. > {: .solution} {: .challenge} -{% include {{ site.snippets }}/scheduler/print-sched-variables.snip %} - Resource requests are typically binding. If you exceed them, your job will be killed. Let's use wall time as an example. We will request 1 minute of wall time and attempt to run a job for two minutes. @@ -342,4 +339,4 @@ When you are done with the interactive job, type `exit` or ctrl + {% include links.md %} [fshs]: https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard -[hisat]: https://ccb.jhu.edu/software/hisat2/index.shtml +[hisat]: https://daehwankimlab.github.io/hisat2/ diff --git a/_episodes/14-environment-variables.md b/_episodes/14-environment-variables.md new file mode 100644 index 00000000..b13cc5ca --- /dev/null +++ b/_episodes/14-environment-variables.md @@ -0,0 +1,258 @@ +--- +title: Environment Variables +teaching: 10 +exercises: 5 +questions: +- "How are variables set and accessed in the Unix shell?" +- "How can I use variables to change how a program runs?" 
+objectives: +- "Understand how variables are implemented in the shell" +- "Read the value of an existing variable" +- "Create new variables and change their values" +- "Change the behaviour of a program using an environment variable" +- "Explain how the shell uses the `PATH` variable to search for executables" +keypoints: +- "Shell variables are by default treated as strings" +- "Variables are assigned using \"`=`\" and recalled using the variable's name prefixed by \"`$`\"" +- "Use \"`export`\" to make an variable available to other programs" +- "The `PATH` variable defines the shell's search path" +--- + +> ## Episode provenance +> +> This episode has been remixed from the +> [Shell Extras episode on Shell Variables](https://github.com/carpentries-incubator/shell-extras/blob/gh-pages/_episodes/08-environment-variables.md) +> and the [HPC Shell episode on scripts](https://github.com/hpc-carpentry/hpc-shell/blob/gh-pages/_episodes/05-scripts.md) +{: .callout} + +The shell is just a program, and like other programs, it has variables. +Those variables control its execution, +so by changing their values +you can change how the shell behaves (and with a little more effort how other +programs behave). + +Variables +are a great way of saving information under a name you can access later. In +programming languages like Python and R, variables can store pretty much +anything you can think of. In the shell, they usually just store text. The best +way to understand how they work is to see them in action. + +Let's start by running the command `set` and looking at some of the variables +in a typical shell session: + +~~~ +$ set +~~~ +{: .language-bash} + +~~~ +COMPUTERNAME=TURING +HOME=/home/vlad +HOSTNAME=TURING +HOSTTYPE=i686 +NUMBER_OF_PROCESSORS=4 +PATH=/Users/vlad/bin:/usr/local/git/bin:/usr/bin:/bin:/usr/sbin:/sbin:/usr/local/bin +PWD=/home/vlad +UID=1000 +USERNAME=vlad +... +~~~ +{: .output} + +As you can see, there are quite a few — in fact, +four or five times more than what's shown here. +And yes, using `set` to *show* things might seem a little strange, +even for Unix, but if you don't give it any arguments, +it might as well show you things you *could* set. + +Every variable has a name. +All shell variables' values are strings, +even those (like `UID`) that look like numbers. +It's up to programs to convert these strings to other types when necessary. +For example, if a program wanted to find out how many processors the computer +had, it would convert the value of the `NUMBER_OF_PROCESSORS` variable from a +string to an integer. + +## Showing the Value of a Variable + +Let's show the value of the variable `HOME`: + +~~~ +$ echo HOME +~~~ +{: .language-bash} + +~~~ +HOME +~~~ +{: .output} + +That just prints "HOME", which isn't what we wanted +(though it is what we actually asked for). +Let's try this instead: + +~~~ +$ echo $HOME +~~~ +{: .language-bash} + +~~~ +/home/vlad +~~~ +{: .output} + +The dollar sign tells the shell that we want the *value* of the variable +rather than its name. +This works just like wildcards: +the shell does the replacement *before* running the program we've asked for. +Thanks to this expansion, what we actually run is `echo /home/vlad`, +which displays the right thing. 
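+
+Because the substitution happens before `echo` runs, quoting matters as well.
+As a minimal check (the exact value printed is whatever `HOME` holds on your
+system), compare double quotes, which allow expansion, with single quotes,
+which suppress it:
+
+~~~
+$ echo "my home is $HOME"
+$ echo 'my home is $HOME'
+~~~
+{: .language-bash}
+
+~~~
+my home is /home/vlad
+my home is $HOME
+~~~
+{: .output}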
+ +## Creating and Changing Variables + +Creating a variable is easy — we just assign a value to a name using "=" +(we just have to remember that the syntax requires that there are _no_ spaces +around the `=`!): + +~~~ +$ SECRET_IDENTITY=Dracula +$ echo $SECRET_IDENTITY +~~~ +{: .language-bash} + +~~~ +Dracula +~~~ +{: .output} + +To change the value, just assign a new one: + +~~~ +$ SECRET_IDENTITY=Camilla +$ echo $SECRET_IDENTITY +~~~ +{: .language-bash} + +~~~ +Camilla +~~~ +{: .output} + +## Environment variables + +When we ran the `set` command we saw there were a lot of variables whose names +were in upper case. That's because, by convention, variables that are also +available to use by _other_ programs are given upper-case names. Such variables +are called _environment variables_ as they are shell variables that are defined +for the current shell and are inherited by any child shells or processes. + +To create an environment variable you need to `export` a shell variable. For +example, to make our `SECRET_IDENTITY` available to other programs that we call +from our shell we can do: + +~~~ +$ SECRET_IDENTITY=Camilla +$ export SECRET_IDENTITY +~~~ +{: .language-bash} + +You can also create and export the variable in a single step: + +~~~ +$ export SECRET_IDENTITY=Camilla +~~~ +{: .language-bash} + +> ## Using environment variables to change program behaviour +> +> Set a shell variable `TIME_STYLE` to have a value of `iso` and check this +> value using the `echo` command. +> +> Now, run the command `ls` with the option `-l` (which gives a long format). +> +> `export` the variable and rerun the `ls -l` command. Do you notice any +> difference? +> +> > ## Solution +> > +> > The `TIME_STYLE` variable is not _seen_ by `ls` until is exported, at which +> > point it is used by `ls` to decide what date format to use when presenting +> > the timestamp of files. +> > +> {: .solution} +{: .challenge} + +You can see the complete set of environment variables in your current shell +session with the command `env` (which returns a subset of what the command +`set` gave us). **The complete set of environment variables is called +your _runtime environment_ and can affect the behaviour of the programs you +run**. + +{% include {{ site.snippets }}/scheduler/print-sched-variables.snip %} + +To remove a variable or environment variable you can use the `unset` command, +for example: + +~~~ +$ unset SECRET_IDENTITY +~~~ +{: .language-bash} + +## The `PATH` Environment Variable + +Similarly, some environment variables (like `PATH`) store lists of values. +In this case, the convention is to use a colon ':' as a separator. +If a program wants the individual elements of such a list, +it's the program's responsibility to split the variable's string value into +pieces. + +Let's have a closer look at that `PATH` variable. +Its value defines the shell's search path for executables, +i.e., the list of directories that the shell looks in for runnable programs +when you type in a program name without specifying what directory it is in. + +For example, when we type a command like `analyze`, +the shell needs to decide whether to run `./analyze` or `/bin/analyze`. +The rule it uses is simple: +the shell checks each directory in the `PATH` variable in turn, +looking for a program with the requested name in that directory. +As soon as it finds a match, it stops searching and runs the program. 
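+
+You can check this on your own machine -- a quick sketch; `analyze` is just
+the hypothetical program used in this example, so substitute a command you
+actually have, such as `python3`:
+
+~~~
+$ echo $PATH | tr ':' '\n'
+$ type analyze
+~~~
+{: .language-bash}
+
+The first command splits the search path onto one directory per line, and
+`type` reports which copy of a program the shell would run (or tells you that
+it cannot be found).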
+ +To show how this works, +here are the components of `PATH` listed one per line: + +~~~ +/Users/vlad/bin +/usr/local/git/bin +/usr/bin +/bin +/usr/sbin +/sbin +/usr/local/bin +~~~ +{: .output} + +On our computer, +there are actually three programs called `analyze` +in three different directories: +`/bin/analyze`, +`/usr/local/bin/analyze`, +and `/users/vlad/analyze`. +Since the shell searches the directories in the order they're listed in `PATH`, +it finds `/bin/analyze` first and runs that. +Notice that it will *never* find the program `/users/vlad/analyze` +unless we type in the full path to the program, +since the directory `/users/vlad` isn't in `PATH`. + +This means that I can have executables in lots of different places as long as +I remember that I need to to update my `PATH` so that my shell can find them. + +What if I want to run two different versions of the same program? +Since they share the same name, if I add them both to my `PATH` the first one +found will always win. +In the next episode we'll learn how to use helper tools to help us manage our +runtime environment to make that possible without us needing to do a lot of +bookkeeping on what the value of `PATH` (and other important environment +variables) is or should be. + +{% include links.md %} diff --git a/_episodes/14-modules.md b/_episodes/15-modules.md similarity index 91% rename from _episodes/14-modules.md rename to _episodes/15-modules.md index d40a0ee0..9947ff34 100644 --- a/_episodes/14-modules.md +++ b/_episodes/15-modules.md @@ -27,16 +27,16 @@ understand the reasoning behind this approach. The three biggest factors are: Software incompatibility is a major headache for programmers. Sometimes the presence (or absence) of a software package will break others that depend on -it. Two of the most famous examples are Python 2 and 3 and C compiler versions. +it. Two well known examples are Python and C compiler versions. Python 3 famously provides a `python` command that conflicts with that provided by Python 2. Software compiled against a newer version of the C libraries and -then used when they are not present will result in a nasty `'GLIBCXX_3.4.20' -not found` error, for instance. +then run on a machine that has older C libraries installed will result in a +nasty `'GLIBCXX_3.4.20' not found` error. Software versioning is another common issue. A team might depend on a certain package version for their research project - if the software version was to change (for instance, if a package was updated), it might affect their results. -Having access to multiple software versions allow a set of researchers to +Having access to multiple software versions allows a set of researchers to prevent software versioning issues from affecting their results. Dependencies are where a particular software package (or even a particular @@ -89,10 +89,7 @@ message telling you so ``` {: .language-bash} -``` -No Modulefiles Currently Loaded. -``` -{: .output} +{% include {{ site.snippets }}/modules/default-modules.snip %} ## Loading and Unloading Software @@ -173,11 +170,11 @@ software is loaded. Let's examine the output of `module avail` more closely. ``` -{{ site.remote.prompt }} module avail +{{ site.remote.prompt }} module avail gcc ``` {: .language-bash} -{% include {{ site.snippets }}/modules/available-modules.snip %} +{% include {{ site.snippets }}/modules/available-modules-gcc.snip %} {% include {{ site.snippets }}/modules/wrong-gcc-version.snip %} @@ -198,8 +195,11 @@ Let's examine the output of `module avail` more closely. 
> > > > ``` > > {{ site.remote.bash_shebang }} +> > {{ site.sched.comment }} {{ site.sched.flag.partition }}{% if site.sched.flag.qos %} +> > {{ site.sched.comment }} {{ site.sched.flag.qos }} +> > {% endif %}{{ site.sched.comment }} {{ site.sched.flag.time }} 00:00:30 > > -> > module load python3 +> > module load {{ site.remote.module_python3 }} > > > > python3 --version > > ``` diff --git a/_episodes/15-transferring-files.md b/_episodes/15-transferring-files.md deleted file mode 100644 index c37eaa3b..00000000 --- a/_episodes/15-transferring-files.md +++ /dev/null @@ -1,377 +0,0 @@ ---- -title: "Transferring files with remote computers" -teaching: 15 -exercises: 15 -questions: -- "How do I transfer files to (and from) the cluster?" -objectives: -- "Transfer files to and from a computing cluster." -keypoints: -- "`wget` and `curl -O` download a file from the internet." -- "`scp` and `rsync` transfer files to and from your computer." -- "You can use an SFTP client like FileZilla to transfer files through a GUI." ---- - -Performing work on a remote computer is not very useful if we cannot get files -to or from the cluster. There are several options for transferring data between -computing resources using CLI and GUI utilities, a few of which we will cover. - -## Download Files From the Internet - -One of the most straightforward ways to download files is to use either `curl` -or `wget`. Any file that can be downloaded in your web browser through a direct -link can be downloaded using `curl -O` or `wget`. This is a quick way to -download datasets or source code. - -The syntax for these commands is: `curl -O https://some/link/to/a/file` -and `wget https://some/link/to/a/file`. Try it out by downloading -some material we'll use later on, from a terminal on your local machine. - -``` -{{ site.local.prompt }} curl -O {{ site.url }}{{ site.baseurl }}/files/hpc-intro-data.tar.gz -``` -{: .language-bash} -or -``` -{{ site.local.prompt }} wget {{ site.url }}{{ site.baseurl }}/files/hpc-intro-data.tar.gz -``` -{: .language-bash} - - -> ## `tar.gz`? -> -> This is an archive file format, just like `.zip`, commonly used and supported -> by default on Linux, which is the operating system the majority of HPC -> cluster machines run. You may also see the extension `.tgz`, which is exactly -> the same. We'll talk more about "tarballs," since "tar-dot-g-z" is a -> mouthful, later on. -{: .discussion} - -## Transferring Single Files and Folders With `scp` - -To copy a single file to or from the cluster, we can use `scp` ("secure copy"). -The syntax can be a little complex for new users, but we'll break it down. -The `scp` command is a relative of the `ssh` command we used to -access the system, and can use the same public-key authentication -mechanism. - -To _upload to_ another computer: - -``` -{{ site.local.prompt }} scp path/to/local/file.txt {{ site.remote.user }}@{{ site.remote.login }}:/path/on/{{ site.remote.name }} -``` -{: .language-bash} - -To _download from_ another computer: - -``` -{{ site.local.prompt }} scp {{ site.remote.user }}@{{ site.remote.login }}:/path/on/{{ site.remote.name }}/file.txt path/to/local/ -``` -{: .language-bash} - -Note that everything after the `:` is relative to our home directory on the -remote computer. We can leave it at that if we don't care where the file goes. 
- -``` -{{ site.local.prompt }} scp local-file.txt {{ site.remote.user }}@{{ site.remote.login }}: -``` -{: .language-bash} - -> ## Upload a File -> -> Copy the file you just downloaded from the Internet to your home directory on -> {{ site.remote.name }}. -> -> > ## Solution -> > -> > ``` -> > {{ site.local.prompt }} scp hpc-intro-data.tar.gz {{ site.remote.user }}@{{ site.remote.login }}:~/ -> > ``` -> > {: .language-bash} -> {: .solution} -{: .challenge} - -To copy a whole directory, we add the `-r` flag, for "**r**ecursive": copy the -item specified, and every item below it, and every item below those... until it -reaches the bottom of the directory tree rooted at the folder name you -provided. - -``` -{{ site.local.prompt }} scp -r some-local-folder {{ site.remote.user }}@{{ site.remote.login }}:target-directory/ -``` -{: .language-bash} - -> ## Caution -> -> For a large directory -- either in size or number of files -- -> copying with `-r` can take a long time to complete. -{: .callout} - -## What's in a `/`? - -When using `scp`, you may have noticed that a `:` _always_ follows the remote -computer name; sometimes a `/` follows that, and sometimes not, and sometimes -there's a final `/`. On Linux computers, `/` is the ___root___ directory, the -location where the entire filesystem (and others attached to it) is anchored. A -path starting with a `/` is called _absolute_, since there can be nothing above -the root `/`. A path that does not start with `/` is called _relative_, since -it is not anchored to the root. - -If you want to upload a file to a location inside your home directory -- -which is often the case -- then you don't need a leading `/`. After the -`:`, start writing the sequence of folders that lead to the final storage -location for the file or, as mentioned above, provide nothing if your home -directory _is_ the destination. - -A trailing slash on the target directory is optional, and has no effect for -`scp -r`, but is important in other commands, like `rsync`. - -> ## A Note on `rsync` -> -> As you gain experience with transferring files, you may find the `scp` -> command limiting. The [rsync][rsync] utility provides -> advanced features for file transfer and is typically faster compared to both -> `scp` and `sftp` (see below). It is especially useful for transferring large -> and/or many files and creating synced backup folders. -> -> The syntax is similar to `scp`. To transfer _to_ another computer with -> commonly used options: -> -> ``` -> {{ site.local.prompt }} rsync -avzP path/to/local/file.txt {{ site.remote.user }}@{{ site.remote.login }}:directory/path/on/{{ site.remote.name }}/ -> ``` -> {: .language-bash} -> -> The options are: -> * `a` (archive) to preserve file timestamps and permissions among other things -> * `v` (verbose) to get verbose output to help monitor the transfer -> * `z` (compression) to compress the file during transit to reduce size and -> transfer time -> * `P` (partial/progress) to preserve partially transferred files in case -> of an interruption and also displays the progress of the transfer. -> -> To recursively copy a directory, we can use the same options: -> -> ``` -> {{ site.local.prompt }} rsync -avzP path/to/local/dir {{ site.remote.user }}@{{ site.remote.login }}:directory/path/on/{{ site.remote.name }}/ -> ``` -> {: .language-bash} -> -> As written, this will place the local directory and its contents under the -> specified directory on the remote system. 
If the trailing slash is omitted on -> the destination, a new directory corresponding to the transferred directory -> ('dir' in the example) will not be created, and the contents of the source -> directory will be copied directly into the destination directory. -> -> The `a` (archive) option implies recursion. -> -> To download a file, we simply change the source and destination: -> -> ``` -> {{ site.local.prompt }} rsync -avzP {{ site.remote.user }}@{{ site.remote.login }}:path/on/{{ site.remote.name }}/file.txt path/to/local/ -> ``` -> {: .language-bash} -{: .callout} - -## Transferring Files Interactively with FileZilla - -FileZilla is a cross-platform client for downloading and uploading files to and -from a remote computer. It is absolutely fool-proof and always works quite -well. It uses the `sftp` protocol. You can read more about using the `sftp` -protocol in the command line in the -[lesson discussion]({{ site.baseurl }}{% link _extras/discuss.md %}). - -Download and install the FileZilla client from . -After installing and opening the program, you should end up with a window with -a file browser of your local system on the left hand side of the screen. When -you connect to the cluster, your cluster files will appear on the right hand -side. - -To connect to the cluster, we'll just need to enter our credentials at the top -of the screen: - -* Host: `sftp://{{ site.remote.login }}` -* User: Your cluster username -* Password: Your cluster password -* Port: (leave blank to use the default port) - -Hit "Quickconnect" to connect. You should see your remote files appear on the -right hand side of the screen. You can drag-and-drop files between the left -(local) and right (remote) sides of the screen to transfer files. - -Finally, if you need to move large files (typically larger than a gigabyte) -from one remote computer to another remote computer, SSH in to the computer -hosting the files and use `scp` or `rsync` to transfer over to the other. This -will be more efficient than using FileZilla (or related applications) that -would copy from the source to your local machine, then to the destination -machine. - -## Archiving Files - -One of the biggest challenges we often face when transferring data between -remote HPC systems is that of large numbers of files. There is an overhead to -transferring each individual file and when we are transferring large numbers of -files these overheads combine to slow down our transfers to a large degree. - -The solution to this problem is to _archive_ multiple files into smaller -numbers of larger files before we transfer the data to improve our transfer -efficiency. Sometimes we will combine archiving with _compression_ to reduce -the amount of data we have to transfer and so speed up the transfer. - -The most common archiving command you will use on a (Linux) HPC cluster is -`tar`. `tar` can be used to combine files into a single archive file and, -optionally, compress it. - -Let's start with the file we downloaded from the lesson site, -`hpc-lesson-data.tar.gz`. The "gz" part stands for _gzip_, which is a -compression library. Reading this file name, it appears somebody took a folder -named "hpc-lesson-data," wrapped up all its contents in a single file with -`tar`, then compressed that archive with `gzip` to save space. Let's check -using `tar` with the `-t` flag, which prints the "**t**able of contents" -without unpacking the file, specified by `-f `, on the remote -computer. 
Note that you can concatenate the two flags, instead of writing -`-t -f` separately. - -``` -{{ site.local.prompt }} ssh {{ site.remote.user }}@{{ site.remote.login }} -{{ site.remote.prompt }} tar -tf hpc-lesson-data.tar.gz -hpc-intro-data/ -hpc-intro-data/north-pacific-gyre/ -hpc-intro-data/north-pacific-gyre/NENE01971Z.txt -hpc-intro-data/north-pacific-gyre/goostats -hpc-intro-data/north-pacific-gyre/goodiff -hpc-intro-data/north-pacific-gyre/NENE02040B.txt -hpc-intro-data/north-pacific-gyre/NENE01978B.txt -hpc-intro-data/north-pacific-gyre/NENE02043B.txt -hpc-intro-data/north-pacific-gyre/NENE02018B.txt -hpc-intro-data/north-pacific-gyre/NENE01843A.txt -hpc-intro-data/north-pacific-gyre/NENE01978A.txt -hpc-intro-data/north-pacific-gyre/NENE01751B.txt -hpc-intro-data/north-pacific-gyre/NENE01736A.txt -hpc-intro-data/north-pacific-gyre/NENE01812A.txt -hpc-intro-data/north-pacific-gyre/NENE02043A.txt -hpc-intro-data/north-pacific-gyre/NENE01729B.txt -hpc-intro-data/north-pacific-gyre/NENE02040A.txt -hpc-intro-data/north-pacific-gyre/NENE01843B.txt -hpc-intro-data/north-pacific-gyre/NENE01751A.txt -hpc-intro-data/north-pacific-gyre/NENE01729A.txt -hpc-intro-data/north-pacific-gyre/NENE02040Z.txt -``` -{: .language-bash} - -This shows a folder containing another folder, which contains a bunch of files. -If you've taken The Carpentries' Shell lesson recently, these might look -familiar. Let's see about that compression, using `du` for "**d**isk -**u**sage". - -``` -{{ site.remote.prompt }} du -sh hpc-lesson-data.tar.gz -36K hpc-intro-data.tar.gz -``` -{: .language-bash} - -> ## Files Occupy at Least One "Block" -> -> If the filesystem block size is larger than 36 KB, you'll see a larger -> number: files cannot be smaller than one block. -{: .callout} - -Now let's unpack the archive. We'll run `tar` with a few common flags: - -* `-x` to e**x**tract the archive -* `-v` for **v**erbose output -* `-z` for g**z**ip compression -* `-f` for the file to be unpacked - -When it's done, check the directory size with `du` and compare. - -> ## Extract the Archive -> -> Using the four flags above, unpack the lesson data using `tar`. -> Then, check the size of the whole unpacked directory using `du`. -> -> Hint: `tar` lets you concatenate flags. 
-> -> > ## Commands -> > -> > ``` -> > {{ site.remote.prompt }} tar -xvzf hpc-lesson-data.tar.gz -> > ``` -> > {: .language-bash} -> > -> > ``` -> > hpc-intro-data/ -> > hpc-intro-data/north-pacific-gyre/ -> > hpc-intro-data/north-pacific-gyre/NENE01971Z.txt -> > hpc-intro-data/north-pacific-gyre/goostats -> > hpc-intro-data/north-pacific-gyre/goodiff -> > hpc-intro-data/north-pacific-gyre/NENE02040B.txt -> > hpc-intro-data/north-pacific-gyre/NENE01978B.txt -> > hpc-intro-data/north-pacific-gyre/NENE02043B.txt -> > hpc-intro-data/north-pacific-gyre/NENE02018B.txt -> > hpc-intro-data/north-pacific-gyre/NENE01843A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01978A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01751B.txt -> > hpc-intro-data/north-pacific-gyre/NENE01736A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01812A.txt -> > hpc-intro-data/north-pacific-gyre/NENE02043A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01729B.txt -> > hpc-intro-data/north-pacific-gyre/NENE02040A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01843B.txt -> > hpc-intro-data/north-pacific-gyre/NENE01751A.txt -> > hpc-intro-data/north-pacific-gyre/NENE01729A.txt -> > hpc-intro-data/north-pacific-gyre/NENE02040Z.txt -> > ``` -> > {: .output} -> > -> > Note that we did not type out `-x -v -z -f`, thanks to the flag -> > concatenation, though the command works identically either way. -> > -> > ``` -> > {{ site.remote.prompt }} du -sh hpc-lesson-data -> > 144K hpc-intro-data -> > ``` -> > {: .language-bash} -> {: .solution} -> -> > ## Was the Data Compressed? -> > -> > Text files compress nicely: the "tarball" is one-quarter the total size of -> > the raw data! -> {: .discussion} -{: .challenge} - -If you want to reverse the process -- compressing raw data instead of -extracting it -- set a `c` flag instead of `x`, set the archive filename, -then provide a directory to compress: - -``` -{{ site.local.prompt }} tar -cvzf compressed_data.tar.gz hpc-intro-data -``` -{: .language-bash} - -> ## Working with Windows -> -> When you transfer text files to from a Windows system to a Unix system (Mac, -> Linux, BSD, Solaris, etc.) this can cause problems. Windows encodes its files -> slightly different than Unix, and adds an extra character to every line. -> -> On a Unix system, every line in a file ends with a `\n` (newline). On -> Windows, every line in a file ends with a `\r\n` (carriage return + newline). -> This causes problems sometimes. -> -> Though most modern programming languages and software handles this correctly, -> in some rare instances, you may run into an issue. The solution is to convert -> a file from Windows to Unix encoding with the `dos2unix` command. -> -> You can identify if a file has Windows line endings with `cat -A filename`. A -> file with Windows line endings will have `^M$` at the end of every line. A -> file with Unix line endings will have `$` at the end of a line. -> -> To convert the file, just run `dos2unix filename`. (Conversely, to convert -> back to Windows format, you can run `unix2dos filename`.) -{: .callout} - -{% include links.md %} - -[rsync]: https://rsync.samba.org/ diff --git a/_episodes/16-parallel.md b/_episodes/16-parallel.md deleted file mode 100644 index 53b1db4f..00000000 --- a/_episodes/16-parallel.md +++ /dev/null @@ -1,668 +0,0 @@ ---- -title: "Running a parallel job" -teaching: 30 -exercises: 60 -questions: -- "How do we execute a task in parallel?" -- "What benefits arise from parallel execution?" -- "What are the limits of gains from execution in parallel?" 
-objectives: -- "Construct a program that can execute in parallel." -- "Prepare a job submission script for the parallel executable." -- "Launch jobs with parallel execution." -- "Record and summarize the timing and accuracy of jobs." -- "Describe the relationship between job parallelism and performance." -keypoints: -- "Parallel programming allows applications to take advantage of - parallel hardware; serial code will not 'just work.'" -- "Distributed memory parallelism is a common case, using the Message - Passing Interface (MPI)." -- "The queuing system facilitates executing parallel tasks." -- "Performance improvements from parallel execution do not scale linearly." ---- - -We now have the tools we need to run a multi-processor job. This is a very -important aspect of HPC systems, as parallelism is one of the primary tools -we have to improve the performance of computational tasks. - -Our example implements a stochastic algorithm for estimating the value of -π, the ratio of the circumference to the diameter of a circle. -The program generates a large number of random points on a 1×1 square -centered on (½,½), and checks how many of these points fall -inside the unit circle. -On average, π/4 of the randomly-selected points should fall in the -circle, so π can be estimated from 4*f*, where _f_ is the observed -fraction of points that fall in the circle. -Because each sample is independent, this algorithm is easily implemented -in parallel. - -{% include figure.html url="" caption="" max-width="40%" - file="/fig/pi.png" - alt="Algorithm for computing pi through random sampling" %} - -## A Serial Solution to the Problem - -We start from a Python script using concepts taught in Software Carpentry's -[Programming with Python][inflammation] workshops. -We want to allow the user to specify how many random points should be used -to calculate π through a command-line parameter. -This script will only use a single CPU for its entire run, so it's classified -as a serial process. - -Let's write a Python program, `pi.py`, to estimate π for us. -Start by importing the `numpy` module for calculating the results, -and the `sys` module to process command-line parameters: - -``` -import numpy as np -import sys -``` -{: .language-python} - -We define a Python function `inside_circle` that accepts a single parameter -for the number of random points used to calculate π. -See [Programming with Python: Creating Functions][python-func] -for a review of Python functions. -It randomly samples points with both _x_ and _y_ on the half-open interval -[0, 1). -It then computes their distances from the origin (i.e., radii), and returns -how many of those distances were less than or equal to 1.0. -All of this is done using _vectors_ of double-precision (64-bit) -floating-point values. - -``` -def inside_circle(total_count): - x = np.random.uniform(size=total_count) - y = np.random.uniform(size=total_count) - radii = np.sqrt(x * x + y * y) - count = len(radii[np.where(radii<=1.0)]) - return count -``` -{: .language-python} - -Next, we create a main function to call the `inside_circle` function and -calculate π from its returned result. -See [Programming with Python: Command-Line Programs][cmd-line] -for a review of `main` functions and parsing command-line parameters. 
- -``` -def main(): - n_samples = int(sys.argv[1]) - counts = inside_circle(n_samples) - my_pi = 4.0 * counts / n_samples - print(my_pi) - -if __name__ == '__main__': - main() -``` -{: .language-python} - -If we run the Python script locally with a command-line parameter, as in -`python pi-serial.py 1024`, we should see the script print its estimate of -π: - -``` -{{ site.local.prompt }} python pi-serial.py 1024 -3.10546875 -``` -{: .language-bash} - -> ## Random Number Generation -> -> In the preceding code, random numbers are conveniently generated using the -> built-in capabilities of NumPy. In general, random-number generation is -> difficult to do well, it's easy to accidentally introduce correlations into -> the generated sequence. -> -> * Discuss why generating high quality random numbers might be difficult. -> * Is the quality of random numbers generated sufficient for estimating π -> in this implementation? -> -> > ## Solution -> > -> > * Computers are deterministic and produce pseudo random numbers using -> > an algorithm. The choice of algorithm and its parameters determines -> > how random the generated numbers are. Pseudo random number generation -> > algorithms usually produce a sequence numbers taking the previous output -> > as an input for generating the next number. At some point the sequence of -> > pseudo random numbers will repeat, so care is required to make sure the -> > repetition period is long and that the generated numbers have statistical -> > properties similar to those of true random numbers. -> > * Yes. -> {: .solution } -{: .discussion } - -## Measuring Performance of the Serial Solution - -The stochastic method used to estimate π should converge on the true -value as the number of random points increases. -But as the number of points increases, creating the variables `x`, `y`, and -`radii` requires more time and more memory. -Eventually, the memory required may exceed what's available on our local -laptop or desktop, or the time required may be too long to meet a deadline. -So we'd like to take some measurements of how much memory and time the script -requires, and later take the same measurements after creating a parallel -version of the script to see the benefits of parallelizing the calculations -required. - -### Estimating Memory Requirements - -Since the largest variables in the script are `x`, `y`, and `radii`, each -containing `n_samples` points, we'll modify the script to report their -total memory required. -Each point in `x`, `y`, or `radii` is stored as a NumPy `float64`, we can -use NumPy's [`dtype`][np-dtype] function to calculate the size of a `float64`. - -Replace the `print(my_pi)` line with the following: - -``` -size_of_float = np.dtype(np.float64).itemsize -memory_required = 3 * n_samples * size_of_float / (1024**3) -print("Pi: {}, memory: {} GiB".format(my_pi, memory_required)) -``` -{: .language-python} - -The first line calculates the bytes of memory required for a single -64-bit floating point number using the `dtype` function. -The second line estimates the total amount of memory required to store three -variables containing `n_samples` `float64` values, converting the value into -units of [gibibytes][units]. -The third line prints both the estimate of π and the estimated amount of -memory used by the script. 
- -The updated Python script is: - -``` -import numpy as np -import sys - -def inside_circle(total_count): - x = np.random.uniform(size=total_count) - y = np.random.uniform(size=total_count) - radii = np.sqrt(x * x + y * y) - count = len(radii[np.where(radii<=1.0)]) - return count - -def main(): - n_samples = int(sys.argv[1]) - counts = inside_circle(n_samples) - my_pi = 4.0 * counts / n_samples - size_of_float = np.dtype(np.float64).itemsize - memory_required = 3 * n_samples * size_of_float / (1024**3) - print("Pi: {}, memory: {} GiB".format(my_pi, memory_required)) - -if __name__ == '__main__': - main() -``` -{: .language-python} - -Run the script again with a few different values for the number of samples, -and see how the memory required changes: - -``` -{{ site.local.prompt }} python pi-serial.py 1000 -Pi: 3.144, memory: 2.2351741790771484e-05 GiB -{{ site.local.prompt }} python pi-serial.py 2000 -Pi: 3.18, memory: 4.470348358154297e-05 GiB -{{ site.local.prompt }} python pi-serial.py 1000000 -Pi: 3.140944, memory: 0.022351741790771484 GiB -{{ site.local.prompt }} python pi-serial.py 100000000 -Pi: 3.14182724, memory: 2.2351741790771484 GiB -``` -{: .language-bash } - -Here we can see that the estimated amount of memory required scales linearly -with the number of samples used. -In practice, there is some memory required for other parts of the script, -but the `x`, `y`, and `radii` variables are by far the largest influence -on the total amount of memory required. - -### Estimating Calculation Time - -Most of the calculations required to estimate π are in the -`inside_circle` function: - -1. Generating `n_samples` random values for `x` and `y`. -1. Calculating `n_samples` values of `radii` from `x` and `y`. -1. Counting how many values in `radii` are under 1.0. - -There's also one multiplication operation and one division operation required -to convert the `counts` value to the final estimate of π in the main -function. - -A simple way to measure the calculation time is to use Python's `datetime` -module to store the computer's current date and time before and after the -calculations, and calculate the difference between those times. 
- -To add the time measurement to the script, add the following line below the -`import sys` line: - -``` -import datetime -``` -{: .language-python} - -Then, add the following line immediately above the line calculating `counts`: - -``` -start_time = datetime.datetime.now() -``` -{: .language-python} - -Add the following two lines immediately below the line calculating `counts`: - -``` -end_time = datetime.datetime.now() -elapsed_time = (end_time - start_time).total_seconds() -``` -{: .language-python} - -And finally, modify the `print` statement with the following: - -``` -print("Pi: {}, memory: {} GiB, time: {} s".format(my_pi, memory_required, - elapsed_time)) -``` -{: .language-python} - -The final Python script for the serial solution is: - -``` -import numpy as np -import sys -import datetime - -def inside_circle(total_count): - x = np.random.uniform(size=total_count) - y = np.random.uniform(size=total_count) - radii = np.sqrt(x * x + y * y) - count = len(radii[np.where(radii<=1.0)]) - return count - -def main(): - n_samples = int(sys.argv[1]) - start_time = datetime.datetime.now() - counts = inside_circle(n_samples) - my_pi = 4.0 * counts / n_samples - end_time = datetime.datetime.now() - elapsed_time = (end_time - start_time).total_seconds() - size_of_float = np.dtype(np.float64).itemsize - memory_required = 3 * n_samples * size_of_float / (1024**3) - print("Pi: {}, memory: {} GiB, time: {} s".format(my_pi, memory_required, - elapsed_time)) - -if __name__ == '__main__': - main() -``` -{: .language-python} - -Run the script again with a few different values for the number of samples, -and see how the solution time changes: - -``` -{{ site.local.prompt }} python pi-serial.py 1000000 -Pi: 3.139612, memory: 0.022351741790771484 GiB, time: 0.034872 s -{{ site.local.prompt }} python pi-serial.py 10000000 -Pi: 3.1425492, memory: 0.22351741790771484 GiB, time: 0.351212 s -{{ site.local.prompt }} python pi-serial.py 100000000 -Pi: 3.14146608, memory: 2.2351741790771484 GiB, time: 3.735195 s -``` -{: .language-bash } - -Here we can see that the amount of time required scales approximately linearly -with the number of samples used. -There could be some variation in additional runs of the script with the same -number of samples, since the elapsed time is affected by other programs -running on the computer at the same time. -But if the script is the most computationally-intensive process running at the -time, its calculations are the largest influence on the elapsed time. - -Now that we've developed our initial script to estimate π, we can see -that as we increase the number of samples: - -1. The estimate of π tends to become more accurate. -1. The amount of memory required scales approximately linearly. -1. The amount of time to calculate scales approximately linearly. - -In general, achieving a better estimate of π requires a greater number of -points. -Take a closer look at `inside_circle`: should we expect to get high accuracy -on a single machine? - -Probably not. -The function allocates three arrays of size _N_ equal to the number of points -belonging to this process. -Using 64-bit floating point numbers, the memory footprint of these arrays can -get quite large. -Each 100,000,000 points sampled consumes 2.24 GiB of memory. -Sampling 400,000,000 points consumes 8.94 GiB of memory, -and if your machine has less RAM than that, it will grind to a halt. -If you have 16 GiB installed, you won't quite make it to 750,000,000 points. 
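-
-These numbers are easy to reproduce. Here is a back-of-the-envelope check --
-a sketch that ignores NumPy and interpreter overhead, so real usage will be
-slightly higher:
-
-```
-bytes_per_value = 8                        # NumPy float64
-for n_samples in (100_000_000, 400_000_000):
-    gib = 3 * n_samples * bytes_per_value / 1024**3
-    print(n_samples, round(gib, 2))        # 2.24 GiB and 8.94 GiB
-
-# Largest sample count that fits in 16 GiB of RAM:
-print(int(16 * 1024**3 / (3 * bytes_per_value)))   # about 716 million
-```
-{: .language-python}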
- -## Running the Serial Job on a Compute Node - -Create a submission file, requesting one task on a single node and enough -memory to prevent the job from running out of memory: - -``` -{{ site.remote.prompt }} nano serial-pi.sh -{{ site.remote.prompt }} cat serial-pi.sh -``` -{: .language-bash} - -{% include {{ site.snippets }}/parallel/one-task-with-memory-jobscript.snip %} - -Then submit your job. We will use the batch file to set the options, -rather than the command line. - -``` -{{ site.remote.prompt }} {{ site.sched.submit.name }} serial-pi.sh -``` -{: .language-bash} - -As before, use the status commands to check when your job runs. -Use `ls` to locate the output file, and examine it. Is it what you expected? - -* How good is the value for π? -* How much memory did it need? -* How long did the job take to run? - -Modify the job script to increase both the number of samples and the amount -of memory requested (perhaps by a factor of 2, then by a factor of 10), -and resubmit the job each time. - -* How good is the value for π? -* How much memory did it need? -* How long did the job take to run? - -Even with sufficient memory for necessary variables, -a script could require enormous amounts of time to calculate on a single CPU. -To reduce the amount of time required, -we need to modify the script to use multiple CPUs for the calculations. -In the largest problem scales, -we could use multiple CPUs in multiple compute nodes, -distributing the memory requirements across all the nodes used to -calculate the solution. - -## Running the Parallel Job - -We will run an example that uses the Message Passing Interface (MPI) for -parallelism -- this is a common tool on HPC systems. - -> ## What is MPI? -> -> The Message Passing Interface is a set of tools which allow multiple parallel -> jobs to communicate with each other. -> Typically, a single executable is run multiple times, possibly on different -> machines, and the MPI tools are used to inform each instance of the -> executable about how many instances there are, which instance it is. -> MPI also provides tools to allow communication and coordination between -> instances. -> An MPI instance typically has its own copy of all the local variables. -{: .callout} - -While MPI jobs can generally be run as stand-alone executables, in order for -them to run in parallel they must use an MPI _run-time system_, which is a -specific implementation of the MPI _standard_. -To do this, they should be started via a command such as `mpiexec` (or -`mpirun`, or `srun`, etc. depending on the MPI run-time you need to use), -which will ensure that the appropriate run-time support for parallelism is -included. - -> ## MPI Runtime Arguments -> -> On their own, commands such as `mpiexec` can take many arguments specifying -> how many machines will participate in the execution, -> and you might need these if you would like to run an MPI program on your -> laptop (for example). -> In the context of a queuing system, however, it is frequently the case that -> we do not need to specify this information as the MPI run-time will have been -> configured to obtain it from the queuing system, -> by examining the environment variables set when the job is launched. -{: .callout} - -> ## What Changes Are Needed for an MPI Version of the π Calculator? -> -> First, we need to import the `MPI` object from the Python module `mpi4py` by -> adding an `from mpi4py import MPI` line immediately below the `import -> datetime` line. 
-> -> Second, we need to modify the "main" function to perform the overhead and -> accounting work required to: -> -> * subdivide the total number of points to be sampled, -> * _partition_ the total workload among the various parallel processors -> available, -> * have each parallel process report the results of its workload back -> to the "rank 0" process, -> which does the final calculations and prints out the result. -> -> The modifications to the serial script demonstrate four important concepts: -> -> * COMM_WORLD: the default MPI Communicator, providing a channel for all the -> processes involved in this `mpiexec` to exchange information with one -> another. -> * Scatter: A collective operation in which an array of data on one MPI rank -> is divided up, with separate portions being sent out to the partner ranks. -> Each partner rank receives data from the matching index of the host array. -> * Gather: The inverse of scatter. One rank populates a local array, -> with the array element at each index assigned the value provided by the -> corresponding partner rank -- including the host's own value. -> * Conditional Output: since every rank is running the _same code_, the -> partitioning, the final calculations, and the `print` statement are -> wrapped in a conditional so that only one rank performs these operations. -{: .discussion} - -We add the lines: - -``` -comm = MPI.COMM_WORLD -cpus = comm.Get_size() -rank = comm.Get_rank() -``` -{: .language-python} - -immediately before the `n_samples` line to set up the MPI environment for -each process. - -We replace the `start_time` and `counts` lines with the lines: - -``` -if rank == 0: - start_time = datetime.datetime.now() - partitions = [ int(n_samples / cpus) ] * cpus - counts = [ int(0) ] * cpus -else: - partitions = None - counts = None -``` -{: .language-python} - -This ensures that only the rank 0 process measures times and coordinates -the work to be distributed to all the ranks, while the other ranks -get placeholder values for the `partitions` and `counts` variables. - -Immediately below these lines, let's - -* distribute the work among the ranks with MPI `scatter`, -* call the `inside_circle` function so each rank can perform its share - of the work, -* collect each rank's results into a `counts` variable on rank 0 using MPI - `gather`. - -by adding the following three lines: - -``` -partition_item = comm.scatter(partitions, root=0) -count_item = inside_circle(partition_item) -counts = comm.gather(count_item, root=0) -``` -{: .language-python} - -Illustrations of these steps are shown below. 
- ---- - -Setup the MPI environment and initialize local variables -- including the -vector containing the number of points to generate on each parallel processor: - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/initialize.png" - alt="MPI initialize" %} - -Distribute the number of points from the originating vector to all the parallel -processors: - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/scatter.png" - alt="MPI scatter" %} - -Perform the computation in parallel: - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/compute.png" - alt="MPI compute" %} - -Retrieve counts from all the parallel processes: - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/gather.png" - alt="MPI gather" %} - -Print out the report: - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/finalize.png" - alt="MPI finalize" %} - ---- - -Finally, we'll ensure the `my_pi` through `print` lines only run on rank 0. -Otherwise, every parallel processor will print its local value, -and the report will become hopelessly garbled: - -``` -if rank == 0: - my_pi = 4.0 * sum(counts) / sum(partitions) - end_time = datetime.datetime.now() - elapsed_time = (end_time - start_time).total_seconds() - size_of_float = np.dtype(np.float64).itemsize - memory_required = 3 * sum(partitions) * size_of_float / (1024**3) - print("Pi: {}, memory: {} GiB, time: {} s".format(my_pi, memory_required, - elapsed_time)) -``` -{: .language-python} - -A fully commented version of the final MPI parallel python code is available: -[pi-mpi.py]({{ site.url }}{{ site.baseurl }}/files/pi-mpi.py). - -Our purpose here is to exercise the parallel workflow of the cluster, not to -optimize the program to minimize its memory footprint. -Rather than push our local machines to the breaking point (or, worse, the login -node), let's give it to a cluster node with more resources. - -Create a submission file, requesting more than one task on a single node: - -``` -{{ site.remote.prompt }} nano parallel-pi.sh -{{ site.remote.prompt }} cat parallel-pi.sh -``` -{: .language-bash} - -{% include {{ site.snippets }}/parallel/four-tasks-jobscript.snip %} - -Then submit your job. We will use the batch file to set the options, -rather than the command line. - -``` -{{ site.remote.prompt }} {{ site.sched.submit.name }} parallel-pi.sh -``` -{: .language-bash} - -As before, use the status commands to check when your job runs. -Use `ls` to locate the output file, and examine it. -Is it what you expected? - -* How good is the value for π? -* How much memory did it need? -* How much faster was this run than the serial run with 100000000 points? - -Modify the job script to increase both the number of samples and the amount -of memory requested (perhaps by a factor of 2, then by a factor of 10), -and resubmit the job each time. -You can also increase the number of CPUs. - -* How good is the value for π? -* How much memory did it need? -* How long did the job take to run? - -## How Much Does MPI Improve Performance? - -In theory, by dividing up the π calculations among _n_ MPI processes, -we should see run times reduce by a factor of _n_. -In practice, some time is required to start the additional MPI processes, -for the MPI processes to communicate and coordinate, and some types of -calculations may only be able to run effectively on a single CPU. 
- -Additionally, if the MPI processes operate on different physical CPUs -in the computer, or across multiple compute nodes, additional time is -required for communication compared to all processes operating on a -single CPU. - -[Amdahl's Law][amdahl] is one way of predicting improvements in execution time -for a __fixed__ parallel workload. If a workload needs 20 hours to complete on -a single core, and one hour of that time is spent on tasks that cannot be -parallelized, only the remaining 19 hours could be parallelized. Even if an -infinite number of cores were used for the parallel parts of the workload, the -total run time cannot be less than one hour. - -In practice, it's common to evaluate the parallelism of an MPI program by - -* running the program across a range of CPU counts, -* recording the execution time on each run, -* comparing each execution time to the time when using a single CPU. - -The speedup factor _S_ is calculated as the single-CPU execution time divided -by the multi-CPU execution time. -For a laptop with 8 cores, the graph of speedup factor versus number of cores -used shows relatively consistent improvement when using 2, 4, or 8 cores, but -using additional cores shows a diminishing return. - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/laptop-mpi_Speedup_factor.png" - alt="MPI speedup factors on an 8-core laptop" %} - -For a set of HPC nodes containing 28 cores each, the graph of speedup factor -versus number of cores shows consistent improvements up through three nodes -and 84 cores, but __worse__ performance when adding a fourth node with an -additional 28 cores. -This is due to the amount of communication and coordination required among -the MPI processes requiring more time than is gained by reducing the amount -of work each MPI process has to complete. This communication overhead is not -included in Amdahl's Law. - -{% include figure.html url="" caption="" max-width="50%" - file="/fig/hpc-mpi_Speedup_factor.png" - alt="MPI speedup factors on an 8-core laptop" %} - -In practice, MPI speedup factors are influenced by: - -* CPU design, -* the communication network between compute nodes, -* the MPI library implementations, and -* the details of the MPI program itself. - -In an HPC environment, we try to reduce the execution time for all types of -jobs, and MPI is an extremely common way to combine dozens, hundreds, or -thousands of CPUs into solving a single problem. To learn more about -parallelization, see the [parallel novice lesson][parallel-novice] lesson. - -{% include links.md %} - -[amdahl]: https://en.wikipedia.org/wiki/Amdahl's_law -[cmd-line]: https://swcarpentry.github.io/python-novice-inflammation/12-cmdline/index.html -[inflammation]: https://swcarpentry.github.io/python-novice-inflammation/ -[np-dtype]: https://numpy.org/doc/stable/reference/generated/numpy.dtype.html -[parallel-novice]: http://www.hpc-carpentry.org/hpc-parallel-novice/ -[python-func]: https://swcarpentry.github.io/python-novice-inflammation/08-func/index.html -[units]: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units diff --git a/_episodes/16-transferring-files.md b/_episodes/16-transferring-files.md new file mode 100644 index 00000000..f0d6074f --- /dev/null +++ b/_episodes/16-transferring-files.md @@ -0,0 +1,464 @@ +--- +title: "Transferring files with remote computers" +teaching: 15 +exercises: 15 +questions: +- "How do I transfer files to (and from) the cluster?" +objectives: +- "Transfer files to and from a computing cluster." 
+keypoints: +- "`wget` and `curl -O` download a file from the internet." +- "`scp` and `rsync` transfer files to and from your computer." +- "You can use an SFTP client like FileZilla to transfer files through a GUI." +--- + +Performing work on a remote computer is not very useful if we cannot get files +to or from the cluster. There are several options for transferring data between +computing resources using CLI and GUI utilities, a few of which we will cover. + +## Download Lesson Files From the Internet + +One of the most straightforward ways to download files is to use either `curl` +or `wget`. One of these is usually installed in most Linux shells, on Mac OS +terminal and in GitBash. Any file that can be downloaded in your web browser +through a direct link can be downloaded using `curl` or `wget`. This is a +quick way to download datasets or source code. The syntax for these commands is + +* `wget [-O new_name] https://some/link/to/a/file` +* `curl [-o new_name] https://some/link/to/a/file` + +Try it out by downloading some material we'll use later on, from a terminal on +your local machine, using the URL of the current codebase: + + + +> ## Download the "Tarball" +> +> The word "tarball" in the above URL refers to a compressed archive format +> commonly used on Linux, which is the operating system the majority of HPC +> cluster machines run. +> A tarball is a lot like a `.zip` file. +> The actual file extension is `.tar.gz`, which reflects the two-stage process +> used to create the file: +> the files or folders are merged into a single file using `tar`, which is then +> compressed using `gzip`, so the file extension is "tar-dot-g-z." +> That's a mouthful, so people often say "the _xyz_ tarball" instead. +> +> You may also see the extension `.tgz`, which is just an abbreviation of +> `.tar.gz`. +> +> By default, `curl` and `wget` download files to the same name as the URL: +> in this case, `main`. +> Use one of the above commands to save the tarball as `amdahl.tar.gz`. +> +> > ## `wget` and `curl` Commands +> > +> > ``` +> > {{ site.local.prompt }} wget -O amdahl.tar.gz https://github.com/hpc-carpentry/amdahl/tarball/main +> > # or +> > {{ site.local.prompt }} curl -o amdahl.tar.gz -L https://github.com/hpc-carpentry/amdahl/tarball/main +> > ``` +> > {: .language-bash} +> > The `-L` option to `curl` tells it to follow URL redirects (which `wget` does by default). +> {: .solution} +{: .challenge} + +After downloading the file, use `ls` to see it in your working directory: + +``` +{{ site.local.prompt }} ls +``` +{: .language-bash} + +## Archiving Files + +One of the biggest challenges we often face when transferring data between +remote HPC systems is that of large numbers of files. There is an overhead to +transferring each individual file and when we are transferring large numbers of +files these overheads combine to slow down our transfers to a large degree. + +The solution to this problem is to _archive_ multiple files into smaller +numbers of larger files before we transfer the data to improve our transfer +efficiency. +Sometimes we will combine archiving with _compression_ to reduce the amount of +data we have to transfer and so speed up the transfer. +The most common archiving command you will use on a (Linux) HPC cluster is +`tar`. + +`tar` can be used to combine files and folders into a single archive file and, +optionally, compress the result. +Let's look at the file we downloaded from the lesson site, `amdahl.tar.gz`. 
+ +The `.gz` part stands for _gzip_, which is a compression library. +It's common (but not necessary!) that this kind of file can be interpreted by +reading its name: it appears somebody took files and folders relating to +something called "amdahl," wrapped them all up into a single file with `tar`, +then compressed that archive with `gzip` to save space. + +Let's see if that is the case, _without_ unpacking the file. +`tar` prints the "**t**able of contents" with the `-t` flag, for the file +specified with the `-f` flag followed by the filename. +Note that you can concatenate the two flags: writing `-t -f` is interchangeable +with writing `-tf` together. +However, the argument following `-f` must be a filename, so writing `-ft` will +_not_ work. + +``` +{{ site.local.prompt }} tar -tf amdahl.tar.gz +hpc-carpentry-amdahl-46c9b4b/ +hpc-carpentry-amdahl-46c9b4b/.github/ +hpc-carpentry-amdahl-46c9b4b/.github/workflows/ +hpc-carpentry-amdahl-46c9b4b/.github/workflows/python-publish.yml +hpc-carpentry-amdahl-46c9b4b/.gitignore +hpc-carpentry-amdahl-46c9b4b/LICENSE +hpc-carpentry-amdahl-46c9b4b/README.md +hpc-carpentry-amdahl-46c9b4b/amdahl/ +hpc-carpentry-amdahl-46c9b4b/amdahl/__init__.py +hpc-carpentry-amdahl-46c9b4b/amdahl/__main__.py +hpc-carpentry-amdahl-46c9b4b/amdahl/amdahl.py +hpc-carpentry-amdahl-46c9b4b/requirements.txt +hpc-carpentry-amdahl-46c9b4b/setup.py +``` +{: .language-bash} + +This example output shows a folder which contains a few files, where `46c9b4b` +is an 8-character [git][git-swc] commit hash that will change when the source +material is updated. + +Now let's unpack the archive. We'll run `tar` with a few common flags: + +* `-x` to e**x**tract the archive +* `-v` for **v**erbose output +* `-z` for g**z**ip compression +* `-f «tarball»` for the file to be unpacked + +> ## Extract the Archive +> +> Using the flags above, unpack the source code tarball into a new +> directory named "amdahl" using `tar`. +> +> ``` +> {{ site.local.prompt }} tar -xvzf amdahl.tar.gz +> ``` +> {: .language-bash} +> +> ``` +> hpc-carpentry-amdahl-46c9b4b/ +> hpc-carpentry-amdahl-46c9b4b/.github/ +> hpc-carpentry-amdahl-46c9b4b/.github/workflows/ +> hpc-carpentry-amdahl-46c9b4b/.github/workflows/python-publish.yml +> hpc-carpentry-amdahl-46c9b4b/.gitignore +> hpc-carpentry-amdahl-46c9b4b/LICENSE +> hpc-carpentry-amdahl-46c9b4b/README.md +> hpc-carpentry-amdahl-46c9b4b/amdahl/ +> hpc-carpentry-amdahl-46c9b4b/amdahl/__init__.py +> hpc-carpentry-amdahl-46c9b4b/amdahl/__main__.py +> hpc-carpentry-amdahl-46c9b4b/amdahl/amdahl.py +> hpc-carpentry-amdahl-46c9b4b/requirements.txt +> hpc-carpentry-amdahl-46c9b4b/setup.py +> ``` +> {: .output} +> +> Note that we did not need to type out `-x -v -z -f`, thanks to flag +> concatenation, though the command works identically either way -- +> so long as the concatenated list ends with `f`, because the next string +> must specify the name of the file to extract. +{: .discussion} + +The folder has an unfortunate name, so let's change that to something more +convenient. + +``` +{{ site.local.prompt }} mv hpc-carpentry-amdahl-46c9b4b amdahl +``` +{: .language-bash} + +Check the size of the extracted directory and compare to the compressed +file size, using `du` for "**d**isk **u**sage". + +``` +{{ site.local.prompt }} du -sh amdahl.tar.gz +8.0K amdahl.tar.gz +{{ site.local.prompt }} du -sh amdahl +48K amdahl +``` +{: .language-bash} + +Text files (including Python source code) compress nicely: +the "tarball" is one-sixth the total size of the raw data! 
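If you ever want to script this step, the same listing and extraction can be done from Python with the standard-library `tarfile` module. A small sketch, assuming `amdahl.tar.gz` is in the current directory:

```
import tarfile

# list the contents without unpacking, like `tar -tf`
with tarfile.open("amdahl.tar.gz", "r:gz") as tar:
    for name in tar.getnames():
        print(name)
    # tar.extractall()  # uncomment to unpack, like `tar -xzf`
```
{: .language-python}

The shell `tar` command remains the everyday tool on a cluster; this is just handy when a transfer or unpacking step needs to live inside a larger Python workflow.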
+ +If you want to reverse the process -- compressing raw data instead of +extracting it -- set a `c` flag instead of `x`, set the archive filename, +then provide a directory to compress: + +``` +{{ site.local.prompt }} tar -cvzf compressed_code.tar.gz amdahl +``` +{: .language-bash} +``` +amdahl/ +amdahl/.github/ +amdahl/.github/workflows/ +amdahl/.github/workflows/python-publish.yml +amdahl/.gitignore +amdahl/LICENSE +amdahl/README.md +amdahl/amdahl/ +amdahl/amdahl/__init__.py +amdahl/amdahl/__main__.py +amdahl/amdahl/amdahl.py +amdahl/requirements.txt +amdahl/setup.py +``` +{: .output} + +If you give `amdahl.tar.gz` as the filename in the above command, `tar` will +update the existing tarball with any changes you made to the files. +That would mean adding the new `amdahl` folder to the _existing_ folder +(`hpc-carpentry-amdahl-46c9b4b`) inside the tarball, doubling the size of the +archive! + +> ## Working with Windows +> +> When you transfer text files from a Windows system to a Unix system (Mac, +> Linux, BSD, Solaris, etc.) this can cause problems. Windows encodes its files +> slightly different than Unix, and adds an extra character to every line. +> +> On a Unix system, every line in a file ends with a `\n` (newline). On +> Windows, every line in a file ends with a `\r\n` (carriage return + newline). +> This causes problems sometimes. +> +> Though most modern programming languages and software handles this correctly, +> in some rare instances, you may run into an issue. The solution is to convert +> a file from Windows to Unix encoding with the `dos2unix` command. +> +> You can identify if a file has Windows line endings with `cat -A filename`. A +> file with Windows line endings will have `^M$` at the end of every line. A +> file with Unix line endings will have `$` at the end of a line. +> +> To convert the file, just run `dos2unix filename`. (Conversely, to convert +> back to Windows format, you can run `unix2dos filename`.) +{: .callout} + +## Transferring Single Files and Folders With `scp` + +To copy a single file to or from the cluster, we can use `scp` ("secure copy"). +The syntax can be a little complex for new users, but we'll break it down. +The `scp` command is a relative of the `ssh` command we used to +access the system, and can use the same public-key authentication +mechanism. + +To _upload to_ another computer, the template command is + +``` +{{ site.local.prompt }} scp local_file {{ site.remote.user }}@{{ site.remote.login }}:remote_destination +``` +{: .language-bash} + +in which `@` and `:` are field separators and `remote_destination` is a path +relative to your remote home directory, or a new filename if you wish to change +it, or both a relative path _and_ a new filename. +If you don't have a specific folder in mind you can omit the +`remote_destination` and the file will be copied to your home directory on the +remote computer (with its original name). +If you include a `remote_destination`, note that `scp` interprets this the same +way `cp` does when making local copies: +if it exists and is a folder, the file is copied inside the folder; if it +exists and is a file, the file is overwritten with the contents of +`local_file`; if it does not exist, it is assumed to be a destination filename +for `local_file`. + +Upload the lesson material to your remote home directory like so: + +``` +{{ site.local.prompt }} scp amdahl.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +``` +{: .language-bash} + +> ## Why Not Download on {{ site.remote.name }} Directly? 
+> +> Most computer clusters are protected from the open internet by a _firewall_. +> For enhanced security, some are configured to allow traffic _inbound_, but +> not _outbound_. +> This means that an authenticated user can send a file to a cluster machine, +> but a cluster machine cannot retrieve files from a user's machine or the +> open Internet. +> +> Try downloading the file directly. Note that it may well fail, and that's +> OK! +> +> > ## Commands +> > +> > ``` +> > {{ site.local.prompt }} ssh {{ site.remote.user }}@{{ site.remote.login }} +> > {{ site.remote.prompt }} wget -O amdahl.tar.gz https://github.com/hpc-carpentry/amdahl/tarball/main +> > # or +> > {{ site.remote.prompt }} curl -o amdahl.tar.gz https://github.com/hpc-carpentry/amdahl/tarball/main +> > ``` +> > {: .language-bash} +> {: .solution} +> +> Did it work? If not, what does the terminal output tell you about what +> happened? +{: .challenge} + +## Transferring a Directory + +To transfer an entire directory, we add the `-r` flag for "**r**ecursive": +copy the item specified, and every item below it, and every item below those... +until it reaches the bottom of the directory tree rooted at the folder name you +provided. + +``` +{{ site.local.prompt }} scp -r amdahl {{ site.remote.user }}@{{ site.remote.login }}: +``` +{: .language-bash} + +> ## Caution +> +> For a large directory -- either in size or number of files -- +> copying with `-r` can take a long time to complete. +{: .callout} + +When using `scp`, you may have noticed that a `:` _always_ follows the remote +computer name. +A string _after_ the `:` specifies the remote directory you wish to transfer +the file or folder to, including a new name if you wish to rename the remote +material. +If you leave this field blank, `scp` defaults to your home directory and the +name of the local material to be transferred. + +On Linux computers, `/` is the separator in file or directory paths. +A path starting with a `/` is called _absolute_, since there can be nothing +above the root `/`. +A path that does not start with `/` is called _relative_, since it is not +anchored to the root. + +If you want to upload a file to a location inside your home directory -- +which is often the case -- then you don't need a _leading_ `/`. After the `:`, +you can type the destination path relative to your home directory. +If your home directory _is_ the destination, you can leave the destination +field blank, or type `~` -- the shorthand for your home directory -- for +completeness. + +With `scp`, a trailing slash on the target directory is optional, and has no effect. +A trailing slash on a source directory is important for other commands, like `rsync`. + +> ## A Note on `rsync` +> +> As you gain experience with transferring files, you may find the `scp` +> command limiting. The [rsync] utility provides +> advanced features for file transfer and is typically faster compared to both +> `scp` and `sftp` (see below). It is especially useful for transferring large +> and/or many files and for synchronizing folder contents between computers. +> +> The syntax is similar to `scp`. 
To transfer _to_ another computer with +> commonly used options: +> +> ``` +> {{ site.local.prompt }} rsync -avP amdahl.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +> ``` +> {: .language-bash} +> +> The options are: +> +> * `-a` (**a**rchive) to preserve file timestamps, permissions, and folders, +> among other things; implies recursion +> * `-v` (**v**erbose) to get verbose output to help monitor the transfer +> * `-P` (partial/progress) to preserve partially transferred files in case +> of an interruption and also displays the progress of the transfer. +> +> To recursively copy a directory, we can use the same options: +> +> ``` +> {{ site.local.prompt }} rsync -avP amdahl {{ site.remote.user }}@{{ site.remote.login }}:~/ +> ``` +> {: .language-bash} +> +> As written, this will place the local directory and its contents under your +> home directory on the remote system. If a trailing slash is added to the +> source, a new directory corresponding to the transferred directory +> will not be created, and the contents of the source directory will be +> copied directly into the destination directory. +> +> To download a file, we simply change the source and destination: +> +> ``` +> {{ site.local.prompt }} rsync -avP {{ site.remote.user }}@{{ site.remote.login }}:amdahl ./ +> ``` +> {: .language-bash} +{: .callout} + +File transfers using both `scp` and `rsync` use SSH to encrypt data sent through +the network. So, if you can connect via SSH, you will be able to transfer +files. By default, SSH uses network port 22. If a custom SSH port is in use, +you will have to specify it using the appropriate flag, often `-p`, `-P`, or +`--port`. Check `--help` or the `man` page if you're unsure. + +> ## Change the Rsync Port +> +> Say we have to connect `rsync` through port 768 instead of 22. How would we +> modify this command? +> +> ``` +> {{ site.local.prompt }} rsync amdahl.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +> ``` +> {: .language-bash} +> +> _Hint:_ check the `man` page or "help" for `rsync`. +> +> > ## Solution +> > +> > ``` +> > {{ site.local.prompt }} man rsync +> > {{ site.local.prompt }} rsync --help | grep port +> > --port=PORT specify double-colon alternate port number +> > See http://rsync.samba.org/ for updates, bug reports, and answers +> > {{ site.local.prompt }} rsync --port=768 amdahl.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +> > ``` +> > {: .language-bash} +> > +> > (Note that this command will fail, as the correct port in this case is the +> > default: 22.) +> {: .solution} +{: .challenge} + +## Transferring Files Interactively with FileZilla + +FileZilla is a cross-platform client for downloading and uploading files to and +from a remote computer. It is absolutely fool-proof and always works quite +well. It uses the `sftp` protocol. You can read more about using the `sftp` +protocol in the command line in the +[lesson discussion]({{ site.baseurl }}{% link _extras/discuss.md %}). + +Download and install the FileZilla client from . +After installing and opening the program, you should end up with a window with +a file browser of your local system on the left hand side of the screen. When +you connect to the cluster, your cluster files will appear on the right hand +side. 
+ +To connect to the cluster, we'll just need to enter our credentials at the top +of the screen: + +* Host: `sftp://{{ site.remote.login }}` +* User: Your cluster username +* Password: Your cluster password +* Port: (leave blank to use the default port) + +Hit "Quickconnect" to connect. You should see your remote files appear on the +right hand side of the screen. You can drag-and-drop files between the left +(local) and right (remote) sides of the screen to transfer files. + +{% include {{ site.snippets }}/transferring-files/filezilla-ssh-tunnel-instructions.snip %} + +Finally, if you need to move large files (typically larger than a gigabyte) +from one remote computer to another remote computer, SSH in to the computer +hosting the files and use `scp` or `rsync` to transfer over to the other. This +will be more efficient than using FileZilla (or related applications) that +would copy from the source to your local machine, then to the destination +machine. + +{% include links.md %} + +[git-swc]: https://swcarpentry.github.io/git-novice/ +[rsync]: https://rsync.samba.org/ diff --git a/_episodes/17-parallel.md b/_episodes/17-parallel.md new file mode 100644 index 00000000..dc5635e2 --- /dev/null +++ b/_episodes/17-parallel.md @@ -0,0 +1,470 @@ +--- +title: "Running a parallel job" +teaching: 30 +exercises: 60 +questions: +- "How do we execute a task in parallel?" +- "What benefits arise from parallel execution?" +- "What are the limits of gains from execution in parallel?" +objectives: +- "Install a Python package using `pip`" +- "Prepare a job submission script for the parallel executable." +- "Launch jobs with parallel execution." +- "Record and summarize the timing and accuracy of jobs." +- "Describe the relationship between job parallelism and performance." +keypoints: +- "Parallel programming allows applications to take advantage of + parallel hardware." +- "The queuing system facilitates executing parallel tasks." +- "Performance improvements from parallel execution do not scale linearly." +--- + +We now have the tools we need to run a multi-processor job. This is a very +important aspect of HPC systems, as parallelism is one of the primary tools +we have to improve the performance of computational tasks. + +If you disconnected, log back in to the cluster. + +``` +{{ site.local.prompt }} ssh {{ site.remote.user }}@{{ site.remote.login }} +``` +{: .language-bash} + +## Install the Amdahl Program + +With the Amdahl source code on the cluster, we can install it, which will +provide access to the `amdahl` executable. +Move into the extracted directory, then use the Package Installer for Python, +or `pip`, to install it in your ("user") home directory: + +``` +{{ site.remote.prompt }} cd amdahl +{{ site.remote.prompt }} python3 -m pip install --user . +``` +{: .language-bash} + +> ## Amdahl is Python Code +> +> The Amdahl program is written in Python, and installing or using it requires +> locating the `python3` executable on the login node. +> If it can't be found, try listing available modules using `module avail`, +> load the appropriate one, and try the command again. +{: .callout} + +### MPI for Python + +The Amdahl code has one dependency: __mpi4py__. +If it hasn't already been installed on the cluster, `pip` will attempt to +collect mpi4py from the Internet and install it for you. +If this fails due to a one-way firewall, you must retrieve mpi4py on your +local machine and upload it, just as we did for Amdahl. 
+ +> ## Retrieve and Upload `mpi4py` +> +> If installing Amdahl failed because mpi4py could not be installed, +> retrieve the tarball from +> then `rsync` it to the cluster, extract, and install: +> +> ``` +> {{ site.local.prompt }} wget -O mpi4py.tar.gz https://github.com/mpi4py/mpi4py/releases/download/3.1.4/mpi4py-3.1.4.tar.gz +> {{ site.local.prompt }} scp mpi4py.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +> # or +> {{ site.local.prompt }} rsync -avP mpi4py.tar.gz {{ site.remote.user }}@{{ site.remote.login }}: +> ``` +> {: .language-bash} +> +> ``` +> {{ site.local.prompt }} ssh {{ site.remote.user }}@{{ site.remote.login }} +> {{ site.remote.prompt }} tar -xvzf mpi4py.tar.gz # extract the archive +> {{ site.remote.prompt }} mv mpi4py* mpi4py # rename the directory +> {{ site.remote.prompt }} cd mpi4py +> {{ site.remote.prompt }} python3 -m pip install --user . +> {{ site.remote.prompt }} cd ../amdahl +> {{ site.remote.prompt }} python3 -m pip install --user . +> ``` +> {: .language-bash} +{: .discussion} + +> ## If `pip` Raises a Warning... +> +> `pip` may warn that your user package binaries are not in your PATH. +> +> ``` +> WARNING: The script amdahl is installed in "${HOME}/.local/bin" which is +> not on PATH. Consider adding this directory to PATH or, if you prefer to +> suppress this warning, use --no-warn-script-location. +> ``` +> {: .warning} +> +> To check whether this warning is a problem, use `which` to search for the +> `amdahl` program: +> +> ``` +> {{ site.remote.prompt }} which amdahl +> ``` +> {: .language-bash} +> +> If the command returns no output, displaying a new prompt, it means the file +> `amdahl` has not been found. You must update the environment variable named +> `PATH` to include the missing folder. +> Edit your shell configuration file as follows, then log off the cluster and +> back on again so it takes effect. +> +> ``` +> {{ site.remote.prompt }} nano ~/.bashrc +> {{ site.remote.prompt }} tail ~/.bashrc +> ``` +> {: .language-bash} +> ``` +> export PATH=${PATH}:${HOME}/.local/bin +> ``` +> {: .output} +> +> After logging back in to {{ site.remote.login }}, `which` should be able to +> find `amdahl` without difficulties. +> If you had to load a Python module, load it again. +{: .discussion} + +## Help! + +Many command-line programs include a "help" message. Try it with `amdahl`: + +``` +{{ site.remote.prompt }} amdahl --help +``` +{: .language-bash} + +``` +usage: amdahl [-h] [-p [PARALLEL_PROPORTION]] [-w [WORK_SECONDS]] [-t] [-e] [-j [JITTER_PROPORTION]] + +optional arguments: + -h, --help show this help message and exit + -p [PARALLEL_PROPORTION], --parallel-proportion [PARALLEL_PROPORTION] + Parallel proportion: a float between 0 and 1 + -w [WORK_SECONDS], --work-seconds [WORK_SECONDS] + Total seconds of workload: an integer greater than 0 + -t, --terse Format output as a machine-readable object for easier analysis + -e, --exact Exactly match requested timing by disabling random jitter + -j [JITTER_PROPORTION], --jitter-proportion [JITTER_PROPORTION] + Random jitter: a float between -1 and +1 +``` +{: .output} + +This message doesn't tell us much about what the program _does_, but it does +tell us the important flags we might want to use when launching it. + +## Running the Job on a Compute Node + +Create a submission file, requesting one task on a single node, then launch it. 
+ +``` +{{ site.remote.prompt }} nano serial-job.sh +{{ site.remote.prompt }} cat serial-job.sh +``` +{: .language-bash} + +{% include {{ site.snippets }}/parallel/one-task-jobscript.snip %} + +``` +{{ site.remote.prompt }} {{ site.sched.submit.name }} serial-job.sh +``` +{: .language-bash} + +As before, use the {{ site.sched.name }} status commands to check whether your job +is running and when it ends: + +``` +{{ site.remote.prompt }} {{ site.sched.status }} {{ site.sched.flag.user }} +``` +{: .language-bash} + +Use `ls` to locate the output file. The `-t` flag sorts in +reverse-chronological order: newest first. What was the output? + +> ## Read the Job Output +> +> The cluster output should be written to a file in the folder you launched the +> job from. For example, +> +> ``` +> {{ site.remote.prompt }} ls -t +> ``` +> {: .language-bash} +> ``` +> slurm-347087.out serial-job.sh amdahl README.md LICENSE.txt +> ``` +> {: .output} +> ``` +> {{ site.remote.prompt }} cat slurm-347087.out +> ``` +> {: .language-bash} +> ``` +> Doing 30.000 seconds of 'work' on 1 processor, +> which should take 30.000 seconds with 0.850 parallel proportion of the workload. +> +> Hello, World! I am process 0 of 1 on {{ site.remote.node }}. I will do all the serial 'work' for 4.500 seconds. +> Hello, World! I am process 0 of 1 on {{ site.remote.node }}. I will do parallel 'work' for 25.500 seconds. +> +> Total execution time (according to rank 0): 30.033 seconds +> ``` +> {: .output} +{: .solution} + +As we saw before, two of the `amdahl` program flags set the amount of work and +the proportion of that work that is parallel in nature. Based on the output, we +can see that the code uses a default of 30 seconds of work that is 85% +parallel. The program ran for just over 30 seconds in total, and if we run the +numbers, it is true that 15% of it was marked 'serial' and 85% was 'parallel'. + +Since we only gave the job one CPU, this job wasn't really parallel: the same +processor performed the 'serial' work for 4.5 seconds, then the 'parallel' part +for 25.5 seconds, and no time was saved. The cluster can do better, if we ask. + +## Running the Parallel Job + +The `amdahl` program uses the Message Passing Interface (MPI) for parallelism +-- this is a common tool on HPC systems. + +> ## What is MPI? +> +> The Message Passing Interface is a set of tools which allow multiple tasks +> running simultaneously to communicate with each other. +> Typically, a single executable is run multiple times, possibly on different +> machines, and the MPI tools are used to inform each instance of the +> executable about its sibling processes, and which instance it is. +> MPI also provides tools to allow communication between instances to +> coordinate work, exchange information about elements of the task, or to +> transfer data. +> An MPI instance typically has its own copy of all the local variables. +{: .callout} + +While MPI-aware executables can generally be run as stand-alone programs, in +order for them to run in parallel they must use an MPI _run-time environment_, +which is a specific implementation of the MPI _standard_. +To activate the MPI environment, the program should be started via a command +such as `mpiexec` (or `mpirun`, or `srun`, etc. depending on the MPI run-time +you need to use), which will ensure that the appropriate run-time support for +parallelism is included. 
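To make this concrete, a minimal MPI-aware Python program (using `mpi4py`, which `amdahl` itself depends on) might look like the sketch below. This is only an illustration, not part of the `amdahl` package:

```
from mpi4py import MPI

comm = MPI.COMM_WORLD   # communicator covering every launched copy of the program
rank = comm.Get_rank()  # which copy am I?
size = comm.Get_size()  # how many copies are running in total?

print("Hello from rank {} of {}".format(rank, size))
```
{: .language-python}

Saved as (say) `hello_mpi.py` and launched with `mpiexec -n 4 python3 hello_mpi.py`, every rank runs the same script but reports a different rank number.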
+ +> ## MPI Runtime Arguments +> +> On their own, commands such as `mpiexec` can take many arguments specifying +> how many machines will participate in the execution, +> and you might need these if you would like to run an MPI program on your +> own (for example, on your laptop). +> In the context of a queuing system, however, it is frequently the case that +> MPI run-time will obtain the necessary parameters from the queuing system, +> by examining the environment variables set when the job is launched. +{: .callout} + +Let's modify the job script to request more cores and use the MPI run-time. + +```bash +{{ site.remote.prompt }} cp serial-job.sh parallel-job.sh +{{ site.remote.prompt }} nano parallel-job.sh +{{ site.remote.prompt }} cat parallel-job.sh +``` +{: .language-bash} + +{% include {{ site.snippets }}/parallel/four-tasks-jobscript.snip %} + +Then submit your job. Note that the submission command has not really changed +from how we submitted the serial job: all the parallel settings are in the +batch file rather than the command line. + +``` +{{ site.remote.prompt }} {{ site.sched.submit.name }} parallel-job.sh +``` +{: .language-bash} + +As before, use the status commands to check when your job runs. + +``` +{{ site.remote.prompt }} ls -t +``` +{: .language-bash} +``` +slurm-347178.out parallel-job.sh slurm-347087.out serial-job.sh amdahl README.md LICENSE.txt +``` +{: .output} +``` +{{ site.remote.prompt }} cat slurm-347178.out +``` +{: .language-bash} +``` +Doing 30.000 seconds of 'work' on 4 processors, +which should take 10.875 seconds with 0.850 parallel proportion of the workload. + + Hello, World! I am process 0 of 4 on {{ site.remote.node }}. I will do all the serial 'work' for 4.500 seconds. + Hello, World! I am process 2 of 4 on {{ site.remote.node }}. I will do parallel 'work' for 6.375 seconds. + Hello, World! I am process 1 of 4 on {{ site.remote.node }}. I will do parallel 'work' for 6.375 seconds. + Hello, World! I am process 3 of 4 on {{ site.remote.node }}. I will do parallel 'work' for 6.375 seconds. + Hello, World! I am process 0 of 4 on {{ site.remote.node }}. I will do parallel 'work' for 6.375 seconds. + +Total execution time (according to rank 0): 10.888 seconds +``` +{: .output} + +> ## Is it 4× faster? +> +> The parallel job received 4× more processors than the serial job: +> does that mean it finished in ¼ the time? +> +> > ## Solution +> > +> > The parallel job did take _less_ time: 11 seconds is better than 30! +> > But it is only a 2.7× improvement, not 4×. +> > +> > Look at the job output: +> > +> > * While "process 0" did serial work, processes 1 through 3 did their +> > parallel work. +> > * While process 0 caught up on its parallel work, +> > the rest did nothing at all. +> > +> > Process 0 always has to finish its serial task before it can start on the +> > parallel work. This sets a lower limit on the amount of time this job will +> > take, no matter how many cores you throw at it. +> > +> > This is the basic principle behind [Amdahl's Law][amdahl], which is one way +> > of predicting improvements in execution time for a __fixed__ workload that +> > can be subdivided and run in parallel to some extent. +> {: .solution} +{: .challenge} + +## How Much Does Parallel Execution Improve Performance? + +In theory, dividing up a perfectly parallel calculation among _n_ MPI processes +should produce a decrease in total run time by a factor of _n_. 
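As a rough sketch of the ceiling that Amdahl's Law places on this ideal, the expected speedup for a partly parallel workload can be computed directly. Here we use the 85% parallel proportion reported by `amdahl`; the function below is an illustration, not part of the package:

```
def amdahl_speedup(n, p=0.85):
    """Ideal speedup on n processors for a workload with parallel fraction p."""
    return 1.0 / ((1.0 - p) + p / n)

for n in (1, 4, 8):
    print("{} processors: at best {:.2f}x faster".format(n, amdahl_speedup(n)))

# As n grows, the speedup approaches 1 / (1 - p), about 6.67x for p = 0.85.
```
{: .language-python}

These ideal values line up closely with the measured speedups tabulated later in this episode.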
+As we have just seen, real programs need some time for the MPI processes to +communicate and coordinate, and some types of calculations can't be subdivided: +they only run effectively on a single CPU. + +Additionally, if the MPI processes operate on different physical CPUs in the +computer, or across multiple compute nodes, even more time is required for +communication than it takes when all processes operate on a single CPU. + +In practice, it's common to evaluate the parallelism of an MPI program by + +* running the program across a range of CPU counts, +* recording the execution time on each run, +* comparing each execution time to the time when using a single CPU. + +Since "more is better" -- improvement is easier to interpret from increases in +some quantity than decreases -- comparisons are made using the speedup factor +_S_, which is calculated as the single-CPU execution time divided by the multi-CPU +execution time. For a perfectly parallel program, a plot of the speedup _S_ +versus the number of CPUs _n_ would give a straight line, _S_ = _n_. + +Let's run one more job, so we can see how close to a straight line our `amdahl` +code gets. + +```bash +{{ site.remote.prompt }} nano parallel-job.sh +{{ site.remote.prompt }} cat parallel-job.sh +``` + +{% include {{ site.snippets }}/parallel/eight-tasks-jobscript.snip %} + +Then submit your job. Note that the submission command has not really changed +from how we submitted the serial job: all the parallel settings are in the +batch file rather than the command line. + +``` +{{ site.remote.prompt }} {{ site.sched.submit.name }} parallel-job.sh +``` +{: .language-bash} + +As before, use the status commands to check when your job runs. + +``` +{{ site.remote.prompt }} ls -t +``` +{: .language-bash} +``` +slurm-347271.out parallel-job.sh slurm-347178.out slurm-347087.out serial-job.sh amdahl README.md LICENSE.txt +``` +{: .output} +``` +{{ site.remote.prompt }} cat slurm-347178.out +``` +{: .language-bash} +``` +which should take 7.688 seconds with 0.850 parallel proportion of the workload. + + Hello, World! I am process 4 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 0 of 8 on {{ site.remote.node }}. I will do all the serial 'work' for 4.500 seconds. + Hello, World! I am process 2 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 1 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 3 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 5 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 6 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 7 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + Hello, World! I am process 0 of 8 on {{ site.remote.node }}. I will do parallel 'work' for 3.188 seconds. + +Total execution time (according to rank 0): 7.697 seconds +``` +{: .output} + +> ## Non-Linear Output +> +> When we ran the job with 4 parallel workers, the serial job wrote its output +> first, then the parallel processes wrote their output, with process 0 coming +> in first and last. 
+> +> With 8 workers, this is not the case: since the parallel workers take less +> time than the serial work, it is hard to say which process will write its +> output first, except that it will _not_ be process 0! +{: .discussion} + +Now, let's summarize the amount of time it took each job to run: + +| Number of CPUs | Runtime (sec) | +| --- | --- | +| 1 | 30.033 | +| 4 | 10.888 | +| 8 | 7.697 | + +Then, use the first row to compute speedups _S_, using Python as a command-line calculator: + +``` +{{ site.remote.prompt }} for n in 30.033 10.888 7.697; do python3 -c "print(30.033 / $n)"; done +``` +{: .language-bash} + +| Number of CPUs | Speedup | Ideal | +| --- | --- | --- | +| 1 | 1.0 | 1 | +| 4 | 2.75 | 4 | +| 8 | 3.90 | 8 | + +The job output files have been telling us that this program is performing 85% +of its work in parallel, leaving 15% to run in serial. This seems reasonably +high, but our quick study of speedup shows that in order to get a 4× speedup, +we have to use 8 or 9 processors in parallel. In real programs, the speedup +factor is influenced by + +* CPU design +* communication network between compute nodes +* MPI library implementations +* details of the MPI program itself + +Using Amdahl's Law, you can prove that with this program, it is _impossible_ +to reach 8× speedup, no matter how many processors you have on hand. Details of +that analysis, with results to back it up, are left for the next class in the +HPC Carpentry workshop, _HPC Workflows_. + +In an HPC environment, we try to reduce the execution time for all types of +jobs, and MPI is an extremely common way to combine dozens, hundreds, or +thousands of CPUs into solving a single problem. To learn more about +parallelization, see the [parallel novice lesson][parallel-novice] lesson. + +{% include links.md %} + +[amdahl]: https://en.wikipedia.org/wiki/Amdahl's_law +[cmd-line]: https://swcarpentry.github.io/python-novice-inflammation/12-cmdline/index.html +[inflammation]: https://swcarpentry.github.io/python-novice-inflammation/ +[np-dtype]: https://numpy.org/doc/stable/reference/generated/numpy.dtype.html +[parallel-novice]: http://www.hpc-carpentry.org/hpc-parallel-novice/ +[python-func]: https://swcarpentry.github.io/python-novice-inflammation/08-func/index.html +[units]: https://en.wikipedia.org/wiki/Byte#Multiple-byte_units diff --git a/_episodes/17-resources.md b/_episodes/18-resources.md similarity index 87% rename from _episodes/17-resources.md rename to _episodes/18-resources.md index 493a66ab..ffa4ed9a 100644 --- a/_episodes/17-resources.md +++ b/_episodes/18-resources.md @@ -47,9 +47,9 @@ finish and free up the resources needed to match what you asked for. ## Stats -Since we already submitted `pi.py` to run on the cluster, we can query the +Since we already submitted `amdahl` to run on the cluster, we can query the scheduler to see how long our job took and what resources were used. We will -use `{{ site.sched.hist }}` to get statistics about `parallel-pi.sh`. +use `{{ site.sched.hist }}` to get statistics about `parallel-job.sh`. ``` {{ site.remote.prompt }} {{ site.sched.hist }} @@ -58,11 +58,13 @@ use `{{ site.sched.hist }}` to get statistics about `parallel-pi.sh`. {% include {{ site.snippets }}/resources/account-history.snip %} -This shows all the jobs we ran recently (note that there are multiple entries -per job). To get info about a specific job, we change command slightly. +This shows all the jobs we ran today (note that there are multiple entries per +job). 
+To get info about a specific job (for example, 347087), we change command +slightly. ``` -{{ site.remote.prompt }} {{ site.sched.hist }} {{ site.sched.flag.histdetail }} 1965 +{{ site.remote.prompt }} {{ site.sched.hist }} {{ site.sched.flag.histdetail }} 347087 ``` {: .language-bash} @@ -72,7 +74,7 @@ information to `less` to make it easier to view (use the left and right arrow keys to scroll through fields). ``` -{{ site.remote.prompt }} {{ site.sched.hist }} {{ site.sched.flag.histdetail }} +{{ site.remote.prompt }} {{ site.sched.hist }} {{ site.sched.flag.histdetail }} 347087 | less -S ``` {: .language-bash} @@ -87,12 +89,12 @@ keys to scroll through fields). ## Improving Resource Requests -From the job history, we see that `pi.py` jobs finished executing in +From the job history, we see that `amdahl` jobs finished executing in at most a few minutes, once dispatched. The time estimate we provided in the job script was far too long! This makes it harder for the queuing system to accurately estimate when resources will become free for other jobs. Practically, this means that the queuing system waits -to dispatch our `pi.py` job until the full requested time slot opens, +to dispatch our `amdahl` job until the full requested time slot opens, instead of "sneaking it in" a much shorter window where the job could actually finish. Specifying the expected runtime in the submission script more accurately will help alleviate cluster congestion and may @@ -100,7 +102,7 @@ get your job dispatched earlier. > ## Narrow the Time Estimate > -> Edit `parallel_pi.sh` to set a better time estimate. How close can +> Edit `parallel_job.sh` to set a better time estimate. How close can > you get? > > Hint: use `{{ site.sched.flag.time }}`. diff --git a/_episodes/18-responsibility.md b/_episodes/19-responsibility.md similarity index 100% rename from _episodes/18-responsibility.md rename to _episodes/19-responsibility.md diff --git a/_extras/discuss.md b/_extras/discuss.md index 352f424b..228f78a1 100644 --- a/_extras/discuss.md +++ b/_extras/discuss.md @@ -82,20 +82,19 @@ individual to individual! > > - Vary the number of threads used per process > - Reduce the number of cores used per node -> - Allow the calculation to use Symmetric Mutithreading (SMT) if enabled +> - Allow the calculation to use Symmetric Multithreading (SMT) if enabled > > Please ask for more information on these options from a helper! {: .challenge} > ## Running Many Serial BLAST+ Analyses in Parallel > -> [BLAST+]( -https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) +> [BLAST+](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) > finds regions of similarity between biological sequences. The program > compares nucleotide or protein sequences to sequence databases and calculates > the statistical significance. > -> In this exercise, you should use what you have learnt so far to set up a way +> In this exercise, you should use what you have learned so far to set up a way > to run multiple serial BLAST+ analyses in parallel. There are many different > ways to do this that can be used on their own or in combination. Some ideas > include: @@ -107,14 +106,12 @@ https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Do > > We have prepared an example dataset that has 100 sequences to analyse > (actually this is 10 sequences repeated 10 times). 
This set is based on the -> [BLAST GNU Parallel example]( -https://github.com/LangilleLab/microbiome_helper/wiki/Quick-Introduction-to-GNU-Parallel) +> [BLAST GNU Parallel example](https://github.com/LangilleLab/microbiome_helper/wiki/Quick-Introduction-to-GNU-Parallel) > > This exercise involves: > > - Downloading and expanding the dataset to the HPC system from: -> [{{ site.url }}{{site.baseurl }}/files/parallel_example.tar.gz]( -> {{ site.url }}{{site.baseurl }}/files/parallel_example.tar.gz) +> [{{ site.url }}{{site.baseurl }}/files/parallel_example.tar.gz]({{ site.url }}{{site.baseurl}}/files/parallel_example.tar.gz) > - Writing a job submission script to run a single analysis using the `blast` > module and the following command: > diff --git a/_extras/guide.md b/_extras/guide.md index 80d4f28e..1f952e9c 100644 --- a/_extras/guide.md +++ b/_extras/guide.md @@ -66,8 +66,7 @@ is a great way to contribute. : *to be defined* [Supercomputer](https://en.wikipedia.org/wiki/Supercomputer) -: ["... a major scientific instrument ..."]( -https://www.hpcnotes.com/2015/10/essential-analogies-for-hpc-advocate.html) +: ["... a major scientific instrument ..."](https://www.hpcnotes.com/2015/10/essential-analogies-for-hpc-advocate.html) [Workstation](https://en.wikipedia.org/wiki/Workstation) : *to be defined* diff --git a/_extras/learner-prerequisites.md b/_extras/learner-prerequisites.md index 0ec2a5fa..ba5752d9 100644 --- a/_extras/learner-prerequisites.md +++ b/_extras/learner-prerequisites.md @@ -14,8 +14,7 @@ bring all prerequisites to the course. ## Pre-Workshop Survey For a motivation of this survey type, see Greg Wilson's template in [Teaching -Tech Together]( -https://teachtogether.tech/en/index.html#s:checklists-preassess). +Tech Together](https://teachtogether.tech/en/index.html#s:checklists-preassess). 
### Shell diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules-gcc.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules-gcc.snip new file mode 100644 index 00000000..e3a47902 --- /dev/null +++ b/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules-gcc.snip @@ -0,0 +1,8 @@ +``` +---------------------------- /cm/local/modulefiles ---------------------------- +gcc/9.2.0 + +--------------------------- /cm/shared/modulefiles ---------------------------- +gcc/7.5.0 gcc/8.2.0 gcc/10.2.0 +``` +{: .output} diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules.snip index 7ae2ec26..833a8a43 100644 --- a/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules.snip +++ b/_includes/snippets_library/BSU_Borah_slurm/modules/available-modules.snip @@ -1 +1,15 @@ - +``` +---------------------------- /cm/shared/modulefiles --------------------------- +abyss/2.3.1 vim/9.0.2149 +adolc/2.6.3/gcc/12.1.0 wgrib2/3.1.3/gcc/12.1.0 +afni/23.2.11 wps/intel/3.8.1 +agisoft/2.1.0 wps/intel/4.1.2 +alphafold/2(default) wrf-hydro/4.1.2 +alphafold/2.3.2 wrf/intel/3.8.1 +alphafold/3.0.0 wrf/intel/4.1.2 +alpine3d/3.2.0.c3aaad0/openmpi/4.1.3/gcc/12.1.0 zlib/intel/1.2.11 + +[removed most of the output here for clarity] + +``` +{: .output} diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/default-modules.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/default-modules.snip new file mode 100644 index 00000000..f503377f --- /dev/null +++ b/_includes/snippets_library/BSU_Borah_slurm/modules/default-modules.snip @@ -0,0 +1,5 @@ +``` +Currently Loaded Modulefiles: + 1) slurm/slurm/23.02.7 +``` +{: .output} diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/missing-python.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/missing-python.snip index d7844546..36a20095 100644 --- a/_includes/snippets_library/BSU_Borah_slurm/modules/missing-python.snip +++ b/_includes/snippets_library/BSU_Borah_slurm/modules/missing-python.snip @@ -1,4 +1,19 @@ +If the `python3` command was unavailable, we would see output like + ``` /usr/bin/which: no python3 in (/cm/shared/apps/slurm/current/sbin:/cm/shared/apps/slurm/current/bin:/cm/local/apps/gcc/9.2.0/bin:/cm/local/apps/environment-modules/4.4.0//bin:/usr/local/bin:/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/ibutils/bin:/sbin:/usr/sbin:/cm/local/apps/environment-modules/4.4.0/bin:/opt/dell/srvadmin/bin:/bsuhome/{{ site.remote.user }}/.local/bin:/bsuhome/{{ site.remote.user }}/bin) ``` {: .output} + +Note that this wall of text is really a list, with values separated +by the `:` character. + +However, in our case we do have an existing `python3` available so we see + +``` +/usr/bin/python3 +``` +{: .output} + +We need a different Python than the system provided one though, so let us load +a module to access it. 
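To see that the `PATH` "wall of text" noted above really is a list, you can split it yourself -- a throwaway example, not part of the snippet:

```
import os

# PATH is one long string; splitting on the separator reveals the list
for entry in os.environ["PATH"].split(os.pathsep):
    print(entry)
```
{: .language-python}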
diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/module-load-python.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/module-load-python.snip
index d18252dd..d9bab7b4 100644
--- a/_includes/snippets_library/BSU_Borah_slurm/modules/module-load-python.snip
+++ b/_includes/snippets_library/BSU_Borah_slurm/modules/module-load-python.snip
@@ -1,4 +1,5 @@
```
+{{ site.remote.prompt }} module load {{ site.remote.module_python3 }}
{{ site.remote.prompt }} which python3
```
{: .language-bash}
diff --git a/_includes/snippets_library/BSU_Borah_slurm/modules/software-dependencies.snip b/_includes/snippets_library/BSU_Borah_slurm/modules/software-dependencies.snip
index 68f82c30..17f171c6 100644
--- a/_includes/snippets_library/BSU_Borah_slurm/modules/software-dependencies.snip
+++ b/_includes/snippets_library/BSU_Borah_slurm/modules/software-dependencies.snip
@@ -1 +1,58 @@
-
+To demonstrate, let's use `module list`, which shows all currently loaded
+software modules.
+
+```
+{{ site.remote.prompt }} module list
+```
+{: .language-bash}
+
+```
+Currently Loaded Modulefiles:
+ 1) slurm/slurm/23.02.7
+```
+{: .output}
+
+```
+{{ site.remote.prompt }} module load gromacs/2024.2
+{{ site.remote.prompt }} module list
+```
+{: .language-bash}
+
+```
+Currently Loaded Modulefiles:
+ 1) slurm/slurm/23.02.7      4) openmpi/4.1.3/gcc/12.1.0
+ 2) borah-base/default       5) cuda_toolkit/12.3.0
+ 3) gcc/12.1.0               6) gromacs/2024.2/openmpi/4.1.3/gcc/12.1.0
+```
+{: .output}
+
+So in this case, loading the `gromacs` module (a molecular dynamics simulation
+package) also loaded several dependencies, including `openmpi/4.1.3/gcc/12.1.0`
+and `gcc/12.1.0`. Let's try unloading the `gromacs` package.
+
+```
+{{ site.remote.prompt }} module unload gromacs
+{{ site.remote.prompt }} module list
+```
+{: .language-bash}
+
+```
+Currently Loaded Modulefiles:
+ 1) slurm/slurm/23.02.7
+```
+{: .output}
+
+So using `module unload` "un-loads" a module, and depending on how a site is
+configured, it may also unload all of the dependencies. If we wanted to unload
+everything at once, we could run `module purge`.
+
+```
+{{ site.remote.prompt }} module purge
+{{ site.remote.prompt }} module list
+```
+{: .language-bash}
+
+```
+No Modulefiles Currently Loaded.
+``` +{: .output} diff --git a/_includes/snippets_library/BSU_Borah_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/BSU_Borah_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..2f643071 --- /dev/null +++ b/_includes/snippets_library/BSU_Borah_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,16 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Load the computing environment we need +# (mpi4py and numpy are in SciPy-bundle) +module load {{ site.remote.module_python3 }} +module load SciPy-bundle + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/BSU_Borah_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/BSU_Borah_slurm/parallel/four-tasks-jobscript.snip index 5eb930b4..19804d74 100644 --- a/_includes/snippets_library/BSU_Borah_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/BSU_Borah_slurm/parallel/four-tasks-jobscript.snip @@ -1,12 +1,16 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 4 -{{ site.sched.comment }} --mem=3G + +# Load the computing environment we need +# (mpi4py and numpy are in SciPy-bundle) +module load {{ site.remote.module_python3 }} +module load SciPy-bundle # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/BSU_Borah_slurm/parallel/one-task-jobscript.snip similarity index 65% rename from _includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/BSU_Borah_slurm/parallel/one-task-jobscript.snip index 5838157f..1941ef04 100644 --- a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/BSU_Borah_slurm/parallel/one-task-jobscript.snip @@ -1,15 +1,14 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 1 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need -module load python3 +module load {{ site.remote.module_python3 }} # Execute the task -python pi.py 100000000 +amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/BSU_Borah_slurm/scheduler/using-nodes-interactively.snip b/_includes/snippets_library/BSU_Borah_slurm/scheduler/using-nodes-interactively.snip new file mode 100644 index 00000000..40b11437 --- /dev/null +++ b/_includes/snippets_library/BSU_Borah_slurm/scheduler/using-nodes-interactively.snip @@ -0,0 +1,69 @@ +`{{ site.sched.interactive }}` runs a single command on the cluster and then +exits. Let's demonstrate this by running the `hostname` command with +`{{ site.sched.interactive }}`. 
(We can cancel an `{{ site.sched.interactive }}`
+job with `Ctrl-C`.)
+
+```
+{{ site.remote.prompt }} {{ site.sched.interactive }} hostname
+```
+{: .language-bash}
+
+```
+{{ site.remote.node }}
+```
+{: .output}
+
+`{{ site.sched.interactive }}` accepts all of the same options as
+`{{ site.sched.submit.name }}`. However, instead of specifying these options in
+a script, you supply them on the command line when starting the job. To submit
+a job that uses 2 CPUs, for instance, we could use the following command:
+
+```
+{{ site.remote.prompt }} {{ site.sched.interactive }} -n 2 echo "This job will use 2 CPUs."
+```
+{: .language-bash}
+
+```
+This job will use 2 CPUs.
+This job will use 2 CPUs.
+```
+{: .output}
+
+Typically, the resulting shell environment will be the same as that for
+`{{ site.sched.submit.name }}`.
+
+### Interactive jobs
+
+Sometimes, you will need a lot of resources for interactive use. Perhaps it's
+your first time running an analysis, or you are attempting to debug something
+that went wrong with a previous job. Fortunately, {{ site.sched.name }} makes
+it easy to start an interactive job with `{{ site.sched.interactive }}`:
+
+```
+{{ site.remote.prompt }} {{ site.sched.interactive }} --pty bash
+```
+{: .language-bash}
+
+You should be presented with a bash prompt. Note that the prompt will likely
+change to reflect your new location, in this case the compute node we are
+logged on to. You can also verify this with `hostname`.
+
+> ## Creating remote graphics
+>
+> To see graphical output inside your jobs, you need to use X11 forwarding. To
+> connect with this feature enabled, use the `-Y` option when you log in with
+> the `ssh` command, e.g., `ssh -Y {{ site.remote.user }}@{{ site.remote.login }}`.
+>
+> To demonstrate what happens when you create a graphics window on the remote
+> node, use the `xeyes` command. A relatively adorable pair of eyes should pop
+> up (press `Ctrl-C` to stop). If you are using a Mac, you must have installed
+> XQuartz (and restarted your computer) for this to work.
+>
+> If your cluster has the
+> [slurm-spank-x11](https://github.com/hautreux/slurm-spank-x11) plugin
+> installed, you can ensure X11 forwarding within interactive jobs by using the
+> `--x11` option for `{{ site.sched.interactive }}` with the command
+> `{{ site.sched.interactive }} --x11 --pty bash`.
+{: .callout}
+
+When you are done with the interactive job, type `exit` to quit your session.
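As a further illustration of passing `{{ site.sched.submit.name }}`-style options to `{{ site.sched.interactive }}` (a sketch only; some sites also require partition, QOS, or account flags), an interactive session with two CPU cores and a ten-minute time limit could be requested like this:

```
{{ site.remote.prompt }} {{ site.sched.interactive }} -n 2 --time=00:10:00 --pty bash
```
{: .language-bash}

When the time limit expires, the scheduler ends the session automatically, so `exit` is only needed if you finish early.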
diff --git a/_includes/snippets_library/BSU_Borah_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/BSU_Borah_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/_config_options.yml b/_includes/snippets_library/Birmingham_Baskerville_slurm/_config_options.yml index e91ba0b3..8fbeb012 100644 --- a/_includes/snippets_library/Birmingham_Baskerville_slurm/_config_options.yml +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/_config_options.yml @@ -55,14 +55,16 @@ sched: info: "sinfo" comment: "#SBATCH" hist: "sacct -u $USER" + hist_filter: "" episode_order: - 10-hpc-intro - 11-connecting - 12-cluster - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/default-modules.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. +``` +{: .output} diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/sbatch-options.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/sbatch-options.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/wrong-gcc-version.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/wrong-gcc-version.snip index 23ee5df8..41dfffce 100644 --- a/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/wrong-gcc-version.snip +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/modules/wrong-gcc-version.snip @@ -3,32 +3,29 @@ C/C++/Fortran compiler. Tons of software is dependent on the GCC version, and might not compile or run if the wrong version is loaded. In this case, there are few different versions: -`GCC/7.3.0-2.30 GCC/8.2.0-2.31.1 GCC/8.3.0 GCC/9.3.0` +`GCC/7.3.0 GCC/8.2.0 GCC/8.3.0 GCC/9.3.0` How do we load each copy and which copy is the default? -On SAGA and Fram we do not have default modules and we must use the full name -to load it. - ``` -{{ site.host_prompt }} module load gcc +{{ site.host_prompt }} module load GCC ``` {: .language-bash} ``` Lmod has detected the following error: The following module(s) are unknown: -"gcc" +"GCC" Please check the spelling or version number. Also try "module spider ..." 
It is also possible your cache file is out-of-date; it may help to try: - $ module --ignore-cache load "gcc" + $ module --ignore-cache load "GCC" ``` {: .output} To load a software module we must specify the full module name: ``` -{{ site.host_prompt }} module load GCC/8.2.0-2.31.1 +{{ site.host_prompt }} module load GCC/8.2.0 {{ site.host_prompt }} gcc --version ``` {: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/eight-tasks-jobscript.snip similarity index 72% rename from _includes/snippets_library/EPCC_Cirrus_pbs/parallel/four-tasks-jobscript.snip rename to _includes/snippets_library/Birmingham_Baskerville_slurm/parallel/eight-tasks-jobscript.snip index b1d90eb9..62569a87 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/eight-tasks-jobscript.snip @@ -2,12 +2,14 @@ {{ site.remote.bash_shebang }} {{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} -{{ site.sched.comment }} -l nodes=1:ppn=4:mem=3G +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 +{{ site.sched.comment }} --mem=3G # Load the computing environment we need module load python3 # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/four-tasks-jobscript.snip index ac8effab..83220026 100644 --- a/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/four-tasks-jobscript.snip @@ -10,6 +10,6 @@ module load python3 # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-jobscript.snip similarity index 73% rename from _includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-jobscript.snip index 5838157f..3ca27f07 100644 --- a/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/Birmingham_Baskerville_slurm/parallel/one-task-jobscript.snip @@ -1,15 +1,15 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 1 {{ site.sched.comment }} --mem=3G # Load the computing environment we need -module load python3 +module load python # Execute the task -python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Birmingham_Baskerville_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/Birmingham_Baskerville_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git 
a/_includes/snippets_library/ComputeCanada_Graham_slurm/_config_options.yml b/_includes/snippets_library/ComputeCanada_Graham_slurm/_config_options.yml index b29ad4f4..3dbf9cc9 100644 --- a/_includes/snippets_library/ComputeCanada_Graham_slurm/_config_options.yml +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/_config_options.yml @@ -8,9 +8,9 @@ # `_includes/snippets_library`. To use one, replace options # below with those in `_config_options.yml` from the # library. E.g, to customise for Cirrus at EPCC, running -# PBS, we could replace the options below with those from +# Slurm, we could replace the options below with those from # -# _includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml # # If your cluster is not represented in the library, please # copy an existing folder, rename it, and customize for your @@ -55,14 +55,16 @@ sched: info: "sinfo" comment: "#SBATCH" hist: "sacct -u yourUsername" + hist_filter: "" episode_order: - 10-hpc-intro - 11-connecting - 12-cluster - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/cluster/root-folders.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/modules/default-modules.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. 
+``` +{: .output} diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..38365d68 --- /dev/null +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,14 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Load the computing environment we need +module load python3 + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/four-tasks-jobscript.snip index ac8effab..14046d46 100644 --- a/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/four-tasks-jobscript.snip @@ -1,15 +1,14 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 4 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need module load python3 # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-jobscript.snip similarity index 70% rename from _includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-jobscript.snip index 5838157f..0e2a9947 100644 --- a/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/parallel/one-task-jobscript.snip @@ -1,15 +1,14 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 1 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need module load python3 # Execute the task -python pi.py 100000000 +amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/resources/hist-fields.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. 
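The fields listed above can be requested explicitly through the `--format` option of the job-history command (a sketch; `NodeList` is the `sacct` field behind the "Hostname" entry above, and the exact set of fields exposed can vary between sites):

```
{{ site.remote.prompt }} {{ site.sched.hist }} --format=JobID,JobName,State,Elapsed,NodeList,MaxRSS,MaxDiskRead,MaxDiskWrite
```
{: .language-bash}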
diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/scheduler/email-notifications.snip
new file mode 100644
index 00000000..e681b3c0
--- /dev/null
+++ b/_includes/snippets_library/ComputeCanada_Graham_slurm/scheduler/email-notifications.snip
@@ -0,0 +1,19 @@
+> Jobs on an HPC system might run for days or even weeks. We probably have
+> better things to do than constantly check on the status of our job with
+> `{{ site.sched.status }}`. Looking at the manual page for
+> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email
+> when it finishes?
+>
+> > ## Hint
+> >
+> > You can use the *manual pages* for {{ site.sched.name }} utilities to find
+> > more about their capabilities. On the command line, these are accessed
+> > through the `man` utility: run `man <program-name>`. You can find the
+> > same information online by searching for "man <program-name>".
+> >
+> > ```
+> > {{ site.remote.prompt }} man {{ site.sched.submit.name }}
+> > ```
+> > {: .language-bash}
+> {: .solution}
+{: .challenge}
diff --git a/_includes/snippets_library/ComputeCanada_Graham_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/ComputeCanada_Graham_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip
new file mode 100644
index 00000000..e69de29b
diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml b/_includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml
deleted file mode 100644
index 6a9cb8b5..00000000
--- a/_includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml
+++ /dev/null
@@ -1,69 +0,0 @@
-#------------------------------------------------------------
-# EPCC, The University of Edinburgh: Cirrus + PBS Pro
-#------------------------------------------------------------
-
-# Cluster host and scheduler options: the defaults come from
-# Graham at Compute Canada, running Slurm. Other options can
-# be found in the library of snippets,
-# `_includes/snippets_library`. To use one, replace options
-# below with those in `_config_options.yml` from the
-# library. E.g, to customise for Cirrus at EPCC, running
-# PBS, we could replace the options below with those from
-#
-# _includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml
-#
-# If your cluster is not represented in the library, please
-# copy an existing folder, rename it, and customize for your
-# installation. Remember to keep the leading slash on the
-# `snippets` variable below!
- -snippets: "/snippets_library/EPCC_Cirrus_pbs" - -local: - prompt: "[user@laptop ~]$" - bash_shebang: "#!/usr/bin/env bash" - -remote: - name: "Cirrus" - login: "login.cirrus.ac.uk" - host: "cirrus-login0" - node: "r1i0n32" - location: "EPCC, The University of Edinburgh" - homedir: "/lustre/home/tc001/lola" - user: "yourUsername" - prompt: "[yourUsername@cirrus-login0 ~]$" - bash_shebang: "#!/usr/bin/env bash" - -sched: - name: "PBS Pro" - submit: - name: "qsub" - options: "-A tc001 -q R387726" - iopt: "" - queue: - debug: "standard" - testing: "standard" - status: "qstat" - flag: - user: "-u yourUsername" - interactive: "-IVl select=1:ncpus=1" - name: "-N" - histdetail: "-f" - time: "-l walltime" - queue: "-q" - del: "qdel" - interactive: "qsub" - info: "pbsnodes -a" - comment: "#PBS" - hist: "tracejob" - -episode_order: - - 10-hpc-intro - - 11-connecting - - 12-cluster - - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/cluster/queue-info.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/cluster/queue-info.snip deleted file mode 100644 index 3953b85e..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/cluster/queue-info.snip +++ /dev/null @@ -1,23 +0,0 @@ -``` -{{ site.remote.node }} - Mom = {{ site.remote.node }}.ib0.icexa.epcc.ed.ac.uk - ntype = PBS - state = offline - pcpus = 72 - resources_available.arch = linux - resources_available.host = {{ site.remote.node }} - resources_available.mem = 263773892kb - resources_available.ncpus = 36 - resources_available.vnode = {{ site.remote.node }} - resources_assigned.accelerator_memory = 0kb - resources_assigned.mem = 0kb - resources_assigned.naccelerators = 0 - resources_assigned.ncpus = 0 - resources_assigned.netwins = 0 - resources_assigned.vmem = 0kb - resv_enable = True - sharing = default_shared - license = l -... -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/available-modules.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/available-modules.snip deleted file mode 100644 index 049d3076..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/available-modules.snip +++ /dev/null @@ -1,12 +0,0 @@ -``` ------------- /usr/share/Modules/modulefiles ----------- -dot module-info mpt_2.16 perfboost use.own -module-git modules null perfcatcher - ---------------- /lustre/sw/modulefiles ---------------- -abinit/8.2.3-intel17-mpt214(default) -allinea/7.0.0(default) -altair-hwsolvers/13.0.213 -... 
-``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/missing-python.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/missing-python.snip deleted file mode 100644 index 7d6ef7c4..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/missing-python.snip +++ /dev/null @@ -1,7 +0,0 @@ -``` -/usr/bin/which: no python3 in (/lustre/home/z04/aturner/miniconda2/bin: -/opt/sgi/sbin:/opt/sgi/bin:/usr/lib64/qt-3.3/bin:/opt/pbs/default/bin: -/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/c3/bin:/sbin:/bin: -/lustre/home/z04/aturner/bin) -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/module-load-python.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/module-load-python.snip deleted file mode 100644 index bc13fe96..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/module-load-python.snip +++ /dev/null @@ -1,5 +0,0 @@ -``` -{{ site.remote.prompt }} module load anaconda/python3 -{{ site.remote.prompt }} which python3 -``` -{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-executable-dir.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-executable-dir.snip deleted file mode 100644 index cefef21f..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-executable-dir.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -/lustre/sw/anaconda/anaconda3-5.1.0/bin/python3 -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-command.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-command.snip deleted file mode 100644 index e7019b19..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-command.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -{{ site.remote.prompt }} ls /lustre/sw/anaconda/anaconda3-5.1.0/bin -``` -{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-output.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-output.snip deleted file mode 100644 index 6235d6d8..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-ls-dir-output.snip +++ /dev/null @@ -1,15 +0,0 @@ -``` -[output truncated] - -2to3 Modules_module-info pip3 python3.5m -2to3-3.5 Modules_modules pip3.5 python3.5m-config -easy_install Modules_mpt_2.16 pydoc3 python3-config -easy_install-3.5 Modules_null pydoc3.5 pyvenv -idle3 Modules_perfboost python pyvenv-3.5 -idle3.5 Modules_perfcatcher python3 virtualenv -Modules_dot Modules_use.own python3.5 wheel -Modules_module-git pip python3.5-config - -[output truncated] -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-module-path.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-module-path.snip deleted file mode 100644 index 15e6235d..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/python-module-path.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -/lustre/home/z04/aturner/miniconda2/bin:/opt/sgi/sbin:/opt/sgi/bin:/usr/lib64/qt-3.3/bin:/opt/pbs/default/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/c3/bin:/sbin:/bin:/lustre/home/z04/aturner/bin -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/software-dependencies.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/modules/software-dependencies.snip deleted file mode 100644 index a6cae294..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/software-dependencies.snip +++ /dev/null @@ 
-1,53 +0,0 @@ -To demonstrate, let's load the `abinit` module and then use the `module list` -command to show which modules we currently have loaded in our environment. -([Abinit](https://www.abinit.org/) is an open source materials science -modelling software package.) - -``` -{{ site.remote.prompt }} module load abinit -{{ site.remote.prompt }} module list -``` -{: .language-bash} - -``` -Currently Loaded Modulefiles: - 1) anaconda/python3 6) intel-cmkl-17/17.0.2.174 - 2) mpt/2.16 7) gcc/6.2.0 - 3) intel-cc-17/17.0.2.174 8) fftw-3.3.5-intel-17.0.2-dxt2dzn - 4) intel-fc-17/17.0.2.174 9) netcdf/4.4.1 - 5) intel-compilers-17/17.0.2.174 10) abinit/8.2.3-intel17-mpt214 -``` -{: .output} - -So in this case, loading the `abinit` module also loaded a variety of other -modules. Let's try unloading the `abinit` package. - -``` -{{ site.remote.prompt }} module unload abinit -{{ site.remote.prompt }} module list -``` -{: .language-bash} - -``` -Currently Loaded Modulefiles: - 1) anaconda/python3 -``` -{: .output} - -So using `module unload` "un-loads" a module along with its dependencies. If we -wanted to unload everything at once, we could run `module purge` (unloads -everything). - -``` -{{ site.remote.prompt }} module load abinit -{{ site.remote.prompt }} module purge -``` -{: .language-bash} - -``` -No Modulefiles Currently Loaded. -``` -{: .output} - -Note that `module purge` has removed the `anaconda/python3` module as well as -`abinit` and its dependencies. diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/account-history.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/resources/account-history.snip deleted file mode 100644 index fffcbf19..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/account-history.snip +++ /dev/null @@ -1,12 +0,0 @@ -``` -{{ site.remote.host }}: - Req'd Req'd -Job ID Username Queue Jobname SessID NDS TSK Memory Time S -------------------- -------- -------- ---------- ------ --- --- ------ ----- - -324396.{{ site.remote.host }} user workq test1 57348 1 1 - -324397.{{ site.remote.host }} user workq test2 57456 1 1 - -324401.{{ site.remote.host }} user workq test3 58159 1 1 - -324410.{{ site.remote.host }} user workq test4 34027 1 1 - -324418.{{ site.remote.host }} user workq test5 35243 1 1 - -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/hist_fields.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/resources/hist_fields.snip deleted file mode 100644 index 54a46fef..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/hist_fields.snip +++ /dev/null @@ -1,6 +0,0 @@ -* **exec_vnode** - Where did your job run? -* **resources_used.walltime** - How long did the job take? 
-* **comment** - Any notes on success or errors in the job -* **Output_Path** - The file that stdout from the job was sent to -* **Resource_List.** - Set of resources requested by the job -* **resources_used.** - Set of resources used by the job diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-status.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-status.snip deleted file mode 100644 index 78151c7d..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-status.snip +++ /dev/null @@ -1,12 +0,0 @@ -``` -{{ site.remote.host }}: - Req'd Req'd Elap -Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time ------- -------- -------- -------------- ------ --- --- ------ ----- - ----- -387775 yourUser workq example-job.sh 50804 1 1 -- 96:00 R 00:00 -``` -{: .output} - -We can see all the details of our job, most importantly that it is in the `R` -or `RUNNING` state. Sometimes our jobs might need to wait in a queue -(`PENDING`) or have an error (`E`). diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/job-with-name-status.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/job-with-name-status.snip deleted file mode 100644 index fe467110..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/job-with-name-status.snip +++ /dev/null @@ -1,8 +0,0 @@ -``` -38778.{{ site.remote.host }} - Req'd Req'd Elap -Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time ------- -------- -------- ---------- ------ --- --- ------ ----- - ----- -38778 yourUser workq hello-worl 51536 1 1 -- 96:00 R 00:00 -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/option-flags-list.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/option-flags-list.snip deleted file mode 100644 index 93977169..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/option-flags-list.snip +++ /dev/null @@ -1,10 +0,0 @@ -* `-l select=:ncpus=` — how many nodes does your - job need and how many cores per node? Note that there are 36 cores per node - on Cirrus. - -* `-l walltime=` — How much real-world time - (walltime) will your job take to run? - -* `-l place=scatter:excl` — Reserve your nodes just for yourself. (If you - are using full nodes, you should include this as it stops other users from - interfering with the performance of your job.) 
diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-job.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-job.snip deleted file mode 100644 index d275d2e9..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-job.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -{{ site.remote.prompt }} cat example-job.sh.e387798 -``` -{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-output.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-output.snip deleted file mode 100644 index f33a7dc4..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/runtime-exceeded-output.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -=>> PBS: job killed: walltime 33 exceeded limit 30 -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-begin.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-begin.snip deleted file mode 100644 index 53666196..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-begin.snip +++ /dev/null @@ -1,10 +0,0 @@ -``` -38759.{{ site.remote.host }} - -indy2-login0: - Req'd Req'd Elap -Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time ------- -------- -------- -------------- ------ --- --- ------ ----- - ----- -38759 yourUser workq example-job.sh 32085 1 1 -- 00:10 R 00:00 -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-cancel.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-cancel.snip deleted file mode 100644 index 69753894..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-job-cancel.snip +++ /dev/null @@ -1,4 +0,0 @@ -``` -...(no output from qstat when there are no jobs to display)... -``` -{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-multiple-jobs.snip b/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-multiple-jobs.snip deleted file mode 100644 index 139597f9..00000000 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/terminate-multiple-jobs.snip +++ /dev/null @@ -1,2 +0,0 @@ - - diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml b/_includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml new file mode 100644 index 00000000..57ab94fd --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml @@ -0,0 +1,76 @@ +#------------------------------------------------------------ +# EPCC, The University of Edinburgh: Cirrus + Slurm +#------------------------------------------------------------ + +# Cluster host and scheduler options: the defaults come from +# Graham at Compute Canada, running Slurm. Other options can +# be found in the library of snippets, +# `_includes/snippets_library`. To use one, replace options +# below with those in `_config_options.yml` from the +# library. E.g, to customise for Cirrus at EPCC, running +# Slurm, we could replace the options below with those from +# +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml +# +# If your cluster is not represented in the library, please +# copy an existing folder, rename it, and customize for your +# installation. Remember to keep the leading slash on the +# `snippets` variable below! 
+ +snippets: "/snippets_library/EPCC_Cirrus_slurm" + +local: + prompt: "[auser@laptop ~]$" + bash_shebang: "#!/bin/bash" + +remote: + name: "Cirrus" + login: "login.cirrus.ac.uk" + host: "cirrus-login1" + node: "r1i0n32" + location: "EPCC, The University of Edinburgh" + homedir: "/lustre/home/tc001" + user: "auser" + group: "tc001" + prompt: "[auser@cirrus-login1 ~]$" + bash_shebang: "#!/bin/bash" + module_python3: "anaconda/python3-2021.11" + +sched: + name: "Slurm" + submit: + name: "sbatch" + options: "--partition=standard --qos=standard --time=00:02:00" + queue: + debug: "debug" + testing: "testing" + status: "squeue" + flag: + user: "-u auser" + interactive: "--time=00:20:00 --partition=standard --qos=standard --pty /usr/bin/bash --login" + histdetail: "-l -j" + name: "-J" + partition: "-p standard" + qos: "-q standard" + time: "-t" + queue: "-p" + nodes: "-N" + tasks: "-n" + del: "scancel" + interactive: "srun" + info: "sinfo" + comment: "#SBATCH" + hist: "sacct" + hist_filter: "--format=JobID,JobName,State,Elapsed,NodeList,MaxRSS,MaxDiskRead,MaxDiskWrite" + +episode_order: + - 10-hpc-intro + - 11-connecting + - 12-cluster + - 13-scheduler + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/queue-info.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/queue-info.snip new file mode 100644 index 00000000..43331c97 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/queue-info.snip @@ -0,0 +1,17 @@ +``` +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +standard up 4-00:00:00 4 resv r1i0n[0-1],r1i2n[18-19] +standard up 4-00:00:00 6 mix r1i0n[31,33],r1i3n30,r1i4n5,r1i5n... +standard up 4-00:00:00 187 alloc r1i0n[2,5,13,18-30,32,34-35],r1i1... +standard up 4-00:00:00 83 idle r1i0n[3-4,6-12,14-17],r1i3n[4,9,1... +gpu-skylake up 20:00 1 mix r2i3n0 +gpu-skylake up 20:00 1 idle r2i3n1 +gpu-cascade up 20:00 2 maint r2i7n[7-8] +gpu-cascade up 20:00 1 resv r2i5n5 +gpu-cascade up 20:00 4 mix r2i4n[3,8],r2i5n[0,4] +gpu-cascade up 20:00 10 alloc r2i4n[0,2,4,6-7],r2i5n[6-8],r2i6n... +gpu-cascade up 20:00 19 idle r2i4n[1,5],r2i5n[1-3],r2i6n[0-2,4... +tds up 4-00:00:00 4 idle r1i4n[8,17,26,35] +gpu-tds up 10:00 2 maint r2i7n[7-8] +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/root-folders.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..7324c9c5 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/root-folders.snip @@ -0,0 +1,7 @@ +``` +backports beegfs bin boot data dev etc +home lib lib64 lost+found lustre media mnt +opt proc root run sbin scratch srv +sys tmp usr var +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/specific-node-info.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/specific-node-info.snip new file mode 100644 index 00000000..6810e285 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/cluster/specific-node-info.snip @@ -0,0 +1,11 @@ +> ## Explore a Worker Node +> +> Finally, let's look at the resources available on the worker nodes where your +> jobs will actually run. 
Try running this command to see the name, CPUs and +> memory (in MB) available on the worker nodes: +> +> ``` +> {{ site.remote.prompt }} sinfo -n {{ site.remote.node }} -o "%n %c %m" +> ``` +> {: .language-bash} +{: .challenge} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/available-modules.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/available-modules.snip new file mode 100644 index 00000000..57e19238 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/available-modules.snip @@ -0,0 +1,10 @@ +``` +------------------------------------------------- /lustre/sw/modulefiles -------------------------------------------------- +altair-hwsolvers/13.0.213 flacs-cfd/21.1 intel-19.5/mpi libxkbcommon/1.0.1(default) openmpi/4.1.0-cuda-11.2 +altair-hwsolvers/14.0.210 flacs-cfd/21.2 intel-19.5/pxse matlab/R2019a perf/1.0.0 +anaconda/python2 flacs/10.9.1 intel-19.5/tbb matlab/R2019b petsc/3.13.2-intel-mpi-18 +anaconda/python3 flex/2.6.4 intel-19.5/vtune matlab/R2020b(default) petsc/3.13.2-mpt +anaconda/python3-2021.11 forge/20.0.0(default) intel-20.4/cc matlab/R2021b quantum-espresso/6.5-intel +... ... ... ... ... +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/default-modules.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/default-modules.snip new file mode 100644 index 00000000..b4237781 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/default-modules.snip @@ -0,0 +1,5 @@ +``` +Currently Loaded Modulefiles: + 1) git/2.21.0(default) 2) epcc/utils 3) /lustre/sw/modulefiles/epcc/setup-env +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-script.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/missing-python.snip similarity index 54% rename from _includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-script.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/modules/missing-python.snip index 01363e40..584eae91 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/basic-job-script.snip +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/missing-python.snip @@ -1,4 +1,4 @@ ``` -387775 +/usr/bin/python3 ``` {: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/module-load-python.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/module-load-python.snip new file mode 100644 index 00000000..d9bab7b4 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/module-load-python.snip @@ -0,0 +1,5 @@ +``` +{{ site.remote.prompt }} module load {{ site.remote.module_python3 }} +{{ site.remote.prompt }} which python3 +``` +{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-executable-dir.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-executable-dir.snip new file mode 100644 index 00000000..f04c8908 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-executable-dir.snip @@ -0,0 +1,4 @@ +``` +/lustre/sw/anaconda/anaconda3-2021.11/bin/python3 +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-command.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-command.snip new file mode 100644 index 00000000..f299be46 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-command.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} ls /lustre/sw/anaconda/anaconda3-2021.11/bin +``` +{: 
.language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-output.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-output.snip new file mode 100644 index 00000000..637ea953 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-ls-dir-output.snip @@ -0,0 +1,13 @@ +``` +2to3 derb h5fc libtool pyflakes sphinx-quickstart +2to3-3.9 designer h5format_convert libtoolize pyftmerge spyder +acountry djpeg h5import linguist pyftsubset sqlite3 +activate dltest h5jam linkicc pygmentize sqlite3_analyzer +adig dwebp h5ls list_instances pyjson5 symilar +aec dynamodb_dump h5mkgrp lrelease pylint syncqt.pl +ahost dynamodb_load h5perf_serial lsm2bin pylsp tabs +anaconda elbadmin h5redeploy lss3 pylupdate5 taskadmin +anaconda-navigator epylint h5repack lupdate pyrcc5 tclsh +... ... ... ... ... ... +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-module-path.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-module-path.snip new file mode 100644 index 00000000..60de9cfd --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/python-module-path.snip @@ -0,0 +1,4 @@ +``` +/lustre/sw/anaconda/anaconda3-2021.11/bin:/lustre/sw/spack-cirrus/opt/spack/linux-centos7-x86_64/gcc-8.2.0/git-2.21.0-rcchd4zgfdherdlklrr2y3amq7p73svi/bin:/lustre/sw/epcc-utils/bin:/opt/clmgr/sbin:/opt/clmgr/bin:/opt/sgi/sbin:/opt/sgi/bin:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/c3/bin:/sbin:/bin:/lustre/home/tc001/auser/.local/bin +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/modules/software-dependencies.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/software-dependencies.snip new file mode 100644 index 00000000..7a8e5e3b --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/software-dependencies.snip @@ -0,0 +1,35 @@ +To demonstrate, let's load the `namd` module and then use the `module list` +command to show which modules we currently have loaded in our environment. +([NAMD](https://www.ks.uiuc.edu/Research/namd/) is parallel molecular dynamics code +designed for high-performance simulation of large biomolecular systems.) + +``` +{{ site.remote.prompt }} module load namd +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +Currently Loaded Modulefiles: + 1) git/2.21.0(default) 2) epcc/utils 3) /lustre/sw/modulefiles/epcc/setup-env + 4) gcc/8.2.0 5) intel-license 6) intel-mpi-19/19.0.0.117 + 7) fftw/3.3.9-impi19-gcc8 8) namd/2.14(default) +``` +{: .output} + +So in this case, loading the `namd` module also loaded a variety of other +modules. Let's try unloading the `namd` package. + +``` +{{ site.remote.prompt }} module unload namd +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +Currently Loaded Modulefiles: + 1) git/2.21.0(default) 2) epcc/utils 3) /lustre/sw/modulefiles/epcc/setup-env +``` +{: .output} + +So using `module unload` "un-loads" a module along with its dependencies. 
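Before loading a module, it can also be useful to inspect what it would change in your environment. A minimal sketch, assuming a standard Environment Modules or Lmod installation (the output format differs between the two):

```
{{ site.remote.prompt }} module show namd
```
{: .language-bash}

The output typically lists the other modules it pulls in and the paths it prepends to variables such as `PATH` and `LD_LIBRARY_PATH`, which is where the dependency behaviour demonstrated above comes from.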
diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/wrong-gcc-version.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/wrong-gcc-version.snip
similarity index 73%
rename from _includes/snippets_library/EPCC_Cirrus_pbs/modules/wrong-gcc-version.snip
rename to _includes/snippets_library/EPCC_Cirrus_slurm/modules/wrong-gcc-version.snip
index 74506dcf..424703d5 100644
--- a/_includes/snippets_library/EPCC_Cirrus_pbs/modules/wrong-gcc-version.snip
+++ b/_includes/snippets_library/EPCC_Cirrus_slurm/modules/wrong-gcc-version.snip
@@ -1,10 +1,10 @@
Let's take a closer look at the `gcc` module. GCC is an extremely widely used
C/C++/Fortran compiler. Lots of software is dependent on the GCC version, and
might not compile or run if the wrong version is loaded. In this case, there
-are three different versions: `gcc/6.2.0`, `gcc/6.3.0` and `gcc/7.2.0`. How do
-we load each copy and which copy is the default?
+are four different versions: `gcc/6.2.0`, `gcc/6.3.0`, `gcc/8.2.0` and `gcc/10.2.0`.
+How do we load each copy and which copy is the default?

-In this case, `gcc/6.2.0` has a `(default)` next to it. This indicates that it
+In this case, `gcc/6.3.0` has a `(default)` next to it. This indicates that it
is the default - if we type `module load gcc`, this is the copy that will be
loaded.
@@ -15,7 +15,7 @@ loaded.
{: .language-bash}

```
-gcc (GCC) 6.2.0
+gcc (GCC) 6.3.0
Copyright (C) 2016 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
@@ -24,19 +24,18 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

So how do we load the non-default copy of a software package? In this case,
the only change we need to make is to be more specific about the module we are
-loading. There are three GCC modules: `gcc/6.2.0`, `gcc/6.3.0` and `gcc/7.2.0`
+loading. There are four GCC modules: `gcc/6.2.0`, `gcc/6.3.0`, `gcc/8.2.0` and `gcc/10.2.0`.

To load a non-default module, we need to add the version number after the
`/` in our `module load` command:

```
-{{ site.remote.prompt }} module load gcc/7.2.0
+{{ site.remote.prompt }} module load gcc/10.2.0
```
{: .language-bash}

```
-gcc/7.2.0(17):ERROR:150: Module 'gcc/7.2.0' conflicts with the currently loaded
-module(s) 'gcc/6.2.0'
-gcc/7.2.0(17):ERROR:102: Tcl command execution failed: conflict gcc
+WARNING: gcc/10.2.0 cannot be loaded due to a conflict.
+HINT: Might try "module unload gcc" first.
```
{: .output}

@@ -47,46 +46,45 @@ new version.

```
{{ site.remote.prompt }} module unload gcc
-{{ site.remote.prompt }} module load gcc/7.2.0
+{{ site.remote.prompt }} module load gcc/10.2.0
{{ site.remote.prompt }} gcc --version
```
{: .language-bash}

```
-gcc (GCC) 7.2.0
-Copyright (C) 2017 Free Software Foundation, Inc.
+gcc (GCC) 10.2.0
+Copyright (C) 2020 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
```
{: .output}

-We now have successfully switched from GCC 6.2.0 to GCC 7.2.0.
+We have now successfully switched from GCC 6.3.0 to GCC 10.2.0.

As switching between different versions of the same module is a common
operation, you can use `module swap` rather than unloading one version before
loading another.
The equivalent of the steps above would be: ``` -{{ site.remote.prompt }} module purge +{{ site.remote.prompt }} module unload gcc/10.2.0 {{ site.remote.prompt }} module load gcc {{ site.remote.prompt }} gcc --version -{{ site.remote.prompt }} module swap gcc gcc/7.2.0 +{{ site.remote.prompt }} module swap gcc gcc/10.2.0 {{ site.remote.prompt }} gcc --version ``` {: .language-bash} ``` -gcc (GCC) 6.2.0 +gcc (GCC) 6.3.0 Copyright (C) 2016 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -gcc (GCC) 7.2.0 -Copyright (C) 2017 Free Software Foundation, Inc. +gcc (GCC) 10.2.0 +Copyright (C) 2020 Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. ``` {: .output} This achieves the same result as unload followed by load but in a single step. - diff --git a/_includes/snippets_library/UCL_Myriad_sge/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/eight-tasks-jobscript.snip similarity index 62% rename from _includes/snippets_library/UCL_Myriad_sge/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/parallel/eight-tasks-jobscript.snip index 56aee37a..f0e8716f 100644 --- a/_includes/snippets_library/UCL_Myriad_sge/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/eight-tasks-jobscript.snip @@ -1,13 +1,13 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} -{{ site.sched.comment }} -l nodes=1:ppn=1:mem=3G +{{ site.sched.comment }} -l nodes=1:ppn=8 # Load the computing environment we need module load python3 # Execute the task -python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/four-tasks-jobscript.snip new file mode 100644 index 00000000..6b4aae76 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/four-tasks-jobscript.snip @@ -0,0 +1,18 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.partition }} +{{ site.sched.comment }} {{ site.sched.flag.qos }} +{{ site.sched.comment }} --exclusive +{{ site.sched.comment }} --time=00:20:00 +{{ site.sched.comment }} --nodes=1 +{{ site.sched.comment }} --tasks-per-node=4 +{{ site.sched.comment }} --cpus-per-task=1 + +# Load the computing environment we need +module load mpi4py + +# Execute the task +srun --cpu-bind=cores python parallel-pi.py 100000000 +``` +{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-jobscript.snip similarity index 62% rename from _includes/snippets_library/EPCC_Cirrus_pbs/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-jobscript.snip index 56aee37a..42fdd337 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/parallel/one-task-with-memory-jobscript.snip 
+++ b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-jobscript.snip @@ -1,13 +1,13 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} -{{ site.sched.comment }} -l nodes=1:ppn=1:mem=3G +{{ site.sched.comment }} -l nodes=1:ppn=1 # Load the computing environment we need module load python3 # Execute the task -python pi.py 100000000 +amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-with-memory-jobscript.snip new file mode 100644 index 00000000..baefb638 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/parallel/one-task-with-memory-jobscript.snip @@ -0,0 +1,16 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.partition }} +{{ site.sched.comment }} {{ site.sched.flag.qos }} +{{ site.sched.comment }} --exclusive +{{ site.sched.comment }} --time=00:20:00 +{{ site.sched.comment }} --ntasks=1 + +# Load the computing environment we need +module load {{ site.remote.module_python3 }} + +# Execute the task +python serial-pi.py 100000000 +``` +{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/resources/account-history.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/account-history.snip new file mode 100644 index 00000000..8791e7f6 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/account-history.snip @@ -0,0 +1,12 @@ +``` +JobID JobName Partition Account AllocCPUS State ExitCode +-------------- ----------- --------- ------- --------- --------- -------- +2168130 serial-pi standard tc001 36 COMPLETED 0:0 +2168130.batch batch tc001 36 COMPLETED 0:0 +2168130.extern extern tc001 36 COMPLETED 0:0 +2168132 parallel-pi standard tc001 36 COMPLETED 0:0 +2168132.batch batch tc001 36 COMPLETED 0:0 +2168132.extern extern tc001 36 COMPLETED 0:0 +2168132.0 python tc001 4 COMPLETED 0:0 +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/cfd_bench.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/cfd-bench.snip similarity index 84% rename from _includes/snippets_library/EPCC_Cirrus_pbs/resources/cfd_bench.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/resources/cfd-bench.snip index 1a0443de..638e6112 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/cfd_bench.snip +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/cfd-bench.snip @@ -4,14 +4,14 @@ using the following command: > > ``` -> {{ site.workshop_host.prompt }} wget {{ site.url }}{{ site.baseurl }}/files/cfd.tar.gz +> {{ site.remote.prompt }} wget {{ site.url }}{{ site.baseurl }}/files/cfd.tar.gz > ``` > {: .language-bash} > > Then unpack it using > > ``` -> {{ site.workshop_host.prompt }} tar -xvf cfd.tar.gz +> {{ site.remote.prompt }} tar -xvf cfd.tar.gz > ``` > {: .language-bash} > @@ -19,7 +19,7 @@ > `cfd.py` program. 
> > ``` -> module load anaconda/python2 +> module load {{ site.remote.module_python3 }} > python cfd.py 3 20000 > ``` > {: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/resources/hist-fields.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..c0eac7f7 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **NodeList**: The node(s) on which your job ran. +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/monitor-processes-top.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/monitor-processes-top.snip similarity index 79% rename from _includes/snippets_library/EPCC_Cirrus_pbs/resources/monitor-processes-top.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/resources/monitor-processes-top.snip index 5d4255c2..8961c52f 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/monitor-processes-top.snip +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/monitor-processes-top.snip @@ -5,14 +5,13 @@ Tasks: 1526 total, 4 running, 1495 sleeping, 8 stopped, 19 zombie KiB Mem : 26377216+total, 11843416+free, 10668532 used, 13466947+buff/cache KiB Swap: 2097148 total, 105600 free, 1991548 used. 22326803+avail Mem PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND -21917 root 20 0 374324 233452 6584 R 55.6 0.1 308:13.19 pbs_server. -30680 marius 20 0 152436 20772 5872 R 17.8 0.0 0:00.08 cc1 -27287 aturner 20 0 157312 3768 1600 R 8.9 0.0 0:00.59 top -30681 kfindlay 20 0 16744 2176 932 S 4.4 0.0 0:00.02 pbsnodes +30680 user8 20 0 152436 20772 5872 R 17.8 0.0 0:00.08 cc1 +27287 user2 20 0 157312 3768 1600 R 8.9 0.0 0:00.59 top +30681 user9 20 0 16744 2176 932 S 4.4 0.0 0:00.02 pbsnodes 2765 root 20 0 20940 32 0 S 2.2 0.0 5:59.78 aksusbd 7361 root 20 0 0 0 0 S 2.2 0.0 36:53.49 ptlrpcd_35 -26386 hallen 20 0 4321956 123520 6740 S 2.2 0.0 0:03.81 conda -30830 pcerro 20 0 117344 1656 1312 S 2.2 0.0 0:05.70 deployer_oo +26386 user3 20 0 4321956 123520 6740 S 2.2 0.0 0:03.81 conda +30830 user5 20 0 117344 1656 1312 S 2.2 0.0 0:05.70 deployer_oo 1 root 20 0 196108 3932 1644 S 0.0 0.0 82:49.29 systemd 2 root 20 0 0 0 0 S 0.0 0.0 6:14.69 kthreadd 3 root 20 0 0 0 0 S 0.0 0.0 0:06.40 ksoftirqd/0 diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/resources/system-memory-free.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/resources/system-memory-free.snip similarity index 100% rename from _includes/snippets_library/EPCC_Cirrus_pbs/resources/system-memory-free.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/resources/system-memory-free.snip diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-script.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-script.snip new file mode 100644 index 00000000..d19fd486 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-script.snip @@ -0,0 +1,4 @@ +``` +Submitted batch job 2128732 +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-status.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-status.snip new file mode 100644 index 00000000..cbc0621c --- 
/dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/basic-job-status.snip @@ -0,0 +1,9 @@ +``` + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) +2165985 standard example-job.sh auser R 0:24 1 r1i0n24 +``` +{: .output} + +We can see all the details of our job, most importantly that it is in the `R` +or `RUNNING` state. Sometimes our jobs might need to wait in a queue +(`PENDING`) or they might have failed (`F`) with some non-zero exit code. diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/email-notifications.snip new file mode 100644 index 00000000..c72732a7 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/email-notifications.snip @@ -0,0 +1,5 @@ +> Jobs on an HPC system might run for days or even weeks. It is possible to configure +> the {{ site.sched.name }} scheduler such that an email notification is sent when a +> job starts running and/or when the job terminates. Unfortunately, {{ site.sched.name }} email +> notifications are not enabled on {{ site.remote.name }}. +{: .challenge} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/job-with-name-status.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/job-with-name-status.snip new file mode 100644 index 00000000..53261c17 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/job-with-name-status.snip @@ -0,0 +1,5 @@ +``` + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) +2165985 standard new_name auser R 0:35 1 r1i0n24 +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/option-flags-list.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/option-flags-list.snip new file mode 100644 index 00000000..03ee5fe0 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/option-flags-list.snip @@ -0,0 +1,17 @@ +* `-J, --job-name=` — set a name for the job to help identify it in Slurm command output. + +* `-A, --account=` — your budget ID is usually something like tc01 or tc01-test. + +* `-p, --partition=` — the partition specifies the set of nodes you want to run on. + +* `-q, --qos=` — the Quality of Service (QoS) specifies the limits of your job (e.g., maximum number of nodes, maximum walltime). + +* `-t, --time=` — the maximum walltime for your job, e.g. for a 6.5 hour walltime, you would use `--time=06:30:00`. + +* `--exclusive` — setting this flag ensures that you have exclusive access to a compute node. + +* `-N, --nodes=` — the number of nodes to use for the job. + +* `--ntasks-per-node=` — the number of parallel processes (e.g. MPI ranks) per node. + +* `-c, --cpus-per-task=` — the number of threads per parallel process (e.g. the number of OpenMP threads per MPI task for hybrid MPI/OpenMP jobs). diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/print-sched-variables.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/print-sched-variables.snip new file mode 100644 index 00000000..bca87ac4 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/print-sched-variables.snip @@ -0,0 +1,35 @@ +> ## Job environment variables +> +> When {{ site.sched.name }} runs a job, it sets a number of environment +> variables for the job. One of these will let us check what directory our job +> script was submitted from. The `SLURM_SUBMIT_DIR` variable is set to the +> directory from which our job was submitted. 
+> +> Using the `SLURM_SUBMIT_DIR` variable, modify your job so that it prints out the +> location from which the job was submitted. +> +> > ## Solution +> > +> > ``` +> > {{ site.remote.prompt }} nano example-job.sh +> > {{ site.remote.prompt }} cat example-job.sh +> > ``` +> > {: .language-bash} +> > +> > ``` +> > {{ site.remote.bash_shebang }} +> > {{ site.sched.comment }} {{ site.sched.flag.partition }} +> > {{ site.sched.comment }} {{ site.sched.flag.qos }} +> > {{ site.sched.comment }} {{ site.sched.flag.time }} 00:01:15 +> > +> > sleep 60 # time in seconds +> > +> > echo -n "This script is running on " +> > hostname +> > +> > echo "This job was launched in the following directory:" +> > echo ${SLURM_SUBMIT_DIR} +> > ``` +> > {: .output} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-job.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-job.snip new file mode 100644 index 00000000..6bca2938 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-job.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} cat slurm-2166477.out +``` +{: .language-bash} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-output.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-output.snip new file mode 100644 index 00000000..9cdd6366 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/runtime-exceeded-output.snip @@ -0,0 +1,4 @@ +``` +slurmstepd: error: *** JOB 2166477 ON r1i0n24 CANCELLED AT 2022-02-09T14:34:34 DUE TO TIME LIMIT *** +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-begin.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-begin.snip new file mode 100644 index 00000000..4fbbc9ae --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-begin.snip @@ -0,0 +1,5 @@ +``` + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) +2166487 standard overrun auser R 0:20 1 r1i0n24 +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-cancel.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-cancel.snip new file mode 100644 index 00000000..7f3ff115 --- /dev/null +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-job-cancel.snip @@ -0,0 +1,4 @@ +``` +...(no output from squeue when there are no jobs to display)... +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-multiple-jobs.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/terminate-multiple-jobs.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/using-nodes-interactively.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/using-nodes-interactively.snip similarity index 87% rename from _includes/snippets_library/EPCC_Cirrus_pbs/scheduler/using-nodes-interactively.snip rename to _includes/snippets_library/EPCC_Cirrus_slurm/scheduler/using-nodes-interactively.snip index 3c031537..063fb1e4 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/using-nodes-interactively.snip +++ b/_includes/snippets_library/EPCC_Cirrus_slurm/scheduler/using-nodes-interactively.snip @@ -9,7 +9,7 @@ uses a single core: {: .language-bash} You should be presented with a bash prompt. 
Note that the prompt will likely -change to reflect your new location, in this case the worker node we are logged +change to reflect your new location, in this case the compute node we are logged on. You can also verify this with `hostname`. When you are done with the interactive job, type `exit` to quit your session. diff --git a/_includes/snippets_library/EPCC_Cirrus_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/EPCC_Cirrus_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/_config_options.yml b/_includes/snippets_library/HPCC_MagicCastle_slurm/_config_options.yml new file mode 100644 index 00000000..45f7a367 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/_config_options.yml @@ -0,0 +1,66 @@ +# --------------------------------------------------------------- +# HPC Carpentries in the Cloud: Slurm + Software Stack from EESSI +# --------------------------------------------------------------- +# +# The HPC Carpentry Cluster in the Cloud is provided as a public +# service by volunteers. It is provisioned with Magic Castle +# using the EESSI +# software stack. If you need an +# account, please visit . +# +# Compute responsibly. +--- + +snippets: "/snippets_library/HPCC_MagicCastle_slurm" + +local: + prompt: "[you@laptop:~]$" + bash_shebang: "#!/usr/bin/env bash" + +remote: + name: "HPC Carpentry's Cloud Cluster" + login: "cluster.hpc-carpentry.org" + portal: "https://mokey.cluster.hpc-carpentry.org" + host: "login1" + node: "smnode1" + location: "cluster.hpc-carpentry.org" + homedir: "/home" + user: "yourUsername" + module_python3: "Python" + prompt: "[yourUsername@login1 ~]$" + bash_shebang: "#!/bin/bash" + +sched: + name: "Slurm" + submit: + name: "sbatch" + options: "" + queue: + debug: "smnode" + testing: "cpubase_bycore_b1" + status: "squeue" + flag: + user: "-u yourUsername" + interactive: "" + histdetail: "-l -j" + name: "-J" + time: "-t" + queue: "-p" + del: "scancel" + interactive: "srun" + info: "sinfo" + comment: "#SBATCH" + hist: "sacct -u yourUsername" + hist_filter: "" + +episode_order: + - 10-hpc-intro + - 11-connecting + - 12-cluster + - 13-scheduler + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/cluster/queue-info.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/cluster/queue-info.snip new file mode 100644 index 00000000..decfc331 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/cluster/queue-info.snip @@ -0,0 +1,7 @@ +``` +PARTITION AVAIL TIMELIMIT NODES STATE NODELIST +cpubase_bycore_b1* up infinite 4 idle node[1-2],smnode[1-2] +node up infinite 2 idle node[1-2] +smnode up infinite 2 idle smnode[1-2] +``` +{: .output} diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/cluster/specific-node-info.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/cluster/specific-node-info.snip similarity index 82% rename from _includes/snippets_library/EPCC_Cirrus_pbs/cluster/specific-node-info.snip rename to _includes/snippets_library/HPCC_MagicCastle_slurm/cluster/specific-node-info.snip index ca334755..b70845bd 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/cluster/specific-node-info.snip +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/cluster/specific-node-info.snip @@ -5,7 +5,7 @@ > memory available on 
the worker nodes: > > ``` -> {{ site.remote.prompt }} pbsnodes {{ site.remote.node }} +> {{ site.remote.prompt }} sinfo -o "%n %c %m" | column -t > ``` > {: .language-bash} {: .challenge} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/available-modules.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/available-modules.snip new file mode 100644 index 00000000..f6f3f50b --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/available-modules.snip @@ -0,0 +1,21 @@ +``` +~~~ /cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/modules/all ~~~ + Bazel/3.6.0-GCCcore-x.y.z NSS/3.51-GCCcore-x.y.z + Bison/3.5.3-GCCcore-x.y.z Ninja/1.10.0-GCCcore-x.y.z + Boost/1.72.0-gompi-2020a OSU-Micro-Benchmarks/5.6.3-gompi-2020a + CGAL/4.14.3-gompi-2020a-Python-3.x.y OpenBLAS/0.3.9-GCC-x.y.z + CMake/3.16.4-GCCcore-x.y.z OpenFOAM/v2006-foss-2020a + +[removed most of the output here for clarity] + + Where: + L: Module is loaded + Aliases: Aliases exist: foo/1.2.3 (1.2) means that "module load foo/1.2" + will load foo/1.2.3 + D: Default Module + +Use "module spider" to find all possible modules and extensions. +Use "module keyword key1 key2 ..." to search for all possible modules matching +any of the "keys". +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/default-modules.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/missing-python.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/missing-python.snip new file mode 100644 index 00000000..89039d32 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/missing-python.snip @@ -0,0 +1,33 @@ +If the `python3` command was unavailable, we would see output like + +``` +/usr/bin/which: no python3 in (/cvmfs/pilot.eessi-hpc.org/2020.12/compat/linux/x86_64/usr/bin:/opt/software/slurm/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/puppetlabs/bin:/home/{{site.remote.user}}/.local/bin:/home/{{site.remote.user}}/bin) +``` +{: .output} + +Note that this wall of text is really a list, with values separated +by the `:` character. The output is telling us that the `which` command +searched the following directories for `python3`, without success: + +``` +/cvmfs/pilot.eessi-hpc.org/2020.12/compat/linux/x86_64/usr/bin +/opt/software/slurm/bin +/usr/local/bin +/usr/bin +/usr/local/sbin +/usr/sbin +/opt/puppetlabs/bin +/home/{{site.remote.user}}/.local/bin +/home/{{site.remote.user}}/bin +``` +{: .output} + +However, in our case we do have an existing `python3` available so we see + +``` +/cvmfs/pilot.eessi-hpc.org/2020.12/compat/linux/x86_64/usr/bin/python3 +``` +{: .output} + +We need a different Python than the system provided one though, so let us load +a module to access it. 
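To see the search-order change described above for yourself, compare `PATH` immediately before and after loading the module. A minimal sketch, assuming the `Python` module name used by this site profile; the module name, and the exact paths printed, will differ on other clusters:

```
# Compare the first few PATH entries before and after loading the Python module
echo "$PATH" | tr ':' '\n' | head -n 3    # directories searched first, before the load
module load Python                        # assumed module name; adjust for your site
echo "$PATH" | tr ':' '\n' | head -n 3    # the module's bin directory is now prepended
which python3                             # now resolves to the module-provided python3
```
{: .language-bash}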
diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/module-load-python.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/module-load-python.snip new file mode 100644 index 00000000..d9bab7b4 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/module-load-python.snip @@ -0,0 +1,5 @@ +``` +{{ site.remote.prompt }} module load {{ site.remote.module_python3 }} +{{ site.remote.prompt }} which python3 +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-executable-dir.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-executable-dir.snip new file mode 100644 index 00000000..46dec092 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-executable-dir.snip @@ -0,0 +1,4 @@ +``` +/cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/Python/3.x.y-GCCcore-x.y.z/bin/python3 +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-command.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-command.snip new file mode 100644 index 00000000..80319d0a --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-command.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} ls /cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/Python/3.x.y-GCCcore-x.y.z/bin +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-output.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-output.snip new file mode 100644 index 00000000..01d010ba --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-ls-dir-output.snip @@ -0,0 +1,16 @@ +``` +2to3 nosetests-3.8 python rst2s5.py +2to3-3.8 pasteurize python3 rst2xetex.py +chardetect pbr python3.8 rst2xml.py +cygdb pip python3.8-config rstpep2html.py +cython pip3 python3-config runxlrd.py +cythonize pip3.8 rst2html4.py sphinx-apidoc +easy_install pybabel rst2html5.py sphinx-autogen +easy_install-3.8 __pycache__ rst2html.py sphinx-build +futurize pydoc3 rst2latex.py sphinx-quickstart +idle3 pydoc3.8 rst2man.py tabulate +idle3.8 pygmentize rst2odt_prepstyles.py virtualenv +netaddr pytest rst2odt.py wheel +nosetests py.test rst2pseudoxml.py +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-module-path.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-module-path.snip new file mode 100644 index 00000000..68e97df1 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/python-module-path.snip @@ -0,0 +1,4 @@ +``` +/cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/Python/3.x.y-GCCcore-x.y.z/bin:/cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/SQLite/3.31.1-GCCcore-x.y.z/bin:/cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/Tcl/8.6.10-GCCcore-x.y.z/bin:/cvmfs/pilot.eessi-hpc.org/2020.12/software/x86_64/amd/zen2/software/GCCcore/x.y.z/bin:/cvmfs/pilot.eessi-hpc.org/2020.12/compat/linux/x86_64/usr/bin:/opt/software/slurm/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/opt/puppetlabs/bin:/home/user01/.local/bin:/home/user01/bin +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/software-dependencies.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/software-dependencies.snip new file 
mode 100644 index 00000000..fe107f2e --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/software-dependencies.snip @@ -0,0 +1,87 @@ +To demonstrate, let's use `module list`. `module list` shows all loaded +software modules. + +``` +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +Currently Loaded Modules: + 1) GCCcore/x.y.z 4) GMP/6.2.0-GCCcore-x.y.z + 2) Tcl/8.6.10-GCCcore-x.y.z 5) libffi/3.3-GCCcore-x.y.z + 3) SQLite/3.31.1-GCCcore-x.y.z 6) Python/3.x.y-GCCcore-x.y.z +``` +{: .output} + +``` +{{ site.remote.prompt }} module load GROMACS +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +Currently Loaded Modules: + 1) GCCcore/x.y.z 14) libfabric/1.11.0-GCCcore-x.y.z + 2) Tcl/8.6.10-GCCcore-x.y.z 15) PMIx/3.1.5-GCCcore-x.y.z + 3) SQLite/3.31.1-GCCcore-x.y.z 16) OpenMPI/4.0.3-GCC-x.y.z + 4) GMP/6.2.0-GCCcore-x.y.z 17) OpenBLAS/0.3.9-GCC-x.y.z + 5) libffi/3.3-GCCcore-x.y.z 18) gompi/2020a + 6) Python/3.x.y-GCCcore-x.y.z 19) FFTW/3.3.8-gompi-2020a + 7) GCC/x.y.z 20) ScaLAPACK/2.1.0-gompi-2020a + 8) numactl/2.0.13-GCCcore-x.y.z 21) foss/2020a + 9) libxml2/2.9.10-GCCcore-x.y.z 22) pybind11/2.4.3-GCCcore-x.y.z-Pytho... + 10) libpciaccess/0.16-GCCcore-x.y.z 23) SciPy-bundle/2020.03-foss-2020a-Py... + 11) hwloc/2.2.0-GCCcore-x.y.z 24) networkx/2.4-foss-2020a-Python-3.8... + 12) libevent/2.1.11-GCCcore-x.y.z 25) GROMACS/2020.1-foss-2020a-Python-3... + 13) UCX/1.8.0-GCCcore-x.y.z +``` +{: .output} + +So in this case, loading the `GROMACS` module (a molecular dynamics software +package) also loaded `GMP/6.2.0-GCCcore-x.y.z` and +`SciPy-bundle/2020.03-foss-2020a-Python-3.x.y`. Let's try unloading the +`GROMACS` package. + +``` +{{ site.remote.prompt }} module unload GROMACS +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +Currently Loaded Modules: + 1) GCCcore/x.y.z 13) UCX/1.8.0-GCCcore-x.y.z + 2) Tcl/8.6.10-GCCcore-x.y.z 14) libfabric/1.11.0-GCCcore-x.y.z + 3) SQLite/3.31.1-GCCcore-x.y.z 15) PMIx/3.1.5-GCCcore-x.y.z + 4) GMP/6.2.0-GCCcore-x.y.z 16) OpenMPI/4.0.3-GCC-x.y.z + 5) libffi/3.3-GCCcore-x.y.z 17) OpenBLAS/0.3.9-GCC-x.y.z + 6) Python/3.x.y-GCCcore-x.y.z 18) gompi/2020a + 7) GCC/x.y.z 19) FFTW/3.3.8-gompi-2020a + 8) numactl/2.0.13-GCCcore-x.y.z 20) ScaLAPACK/2.1.0-gompi-2020a + 9) libxml2/2.9.10-GCCcore-x.y.z 21) foss/2020a + 10) libpciaccess/0.16-GCCcore-x.y.z 22) pybind11/2.4.3-GCCcore-x.y.z-Pytho... + 11) hwloc/2.2.0-GCCcore-x.y.z 23) SciPy-bundle/2020.03-foss-2020a-Py... + 12) libevent/2.1.11-GCCcore-x.y.z 24) networkx/2.4-foss-2020a-Python-3.x.y +``` +{: .output} + +So using `module unload` "un-loads" a module, and depending on how a site is + configured it may also unload all of the dependencies (in our case it does + not). If we wanted to unload everything at once, we could run `module purge` + (unloads everything). + +``` +{{ site.remote.prompt }} module purge +{{ site.remote.prompt }} module list +``` +{: .language-bash} + +``` +No modules loaded +``` +{: .output} + +Note that `module purge` is informative. It will also let us know if a default +set of "sticky" packages cannot be unloaded (and how to actually unload these +if we truly so desired).
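If you find yourself rebuilding the same set of modules after every purge, named collections can shortcut the load/unload cycle. A brief sketch, assuming the cluster uses Lmod (the module tool shipped with the EESSI stack shown above) and an arbitrary collection name; some sites disable or rename this feature:

```
# Save the currently loaded modules, purge, then restore them in one step
module save my-env       # record the loaded modules as a collection named "my-env"
module purge             # unload everything (sticky modules may remain loaded)
module restore my-env    # reload the saved collection
module list              # confirm the environment is back
```
{: .language-bash}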
diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/wrong-gcc-version.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/wrong-gcc-version.snip new file mode 100644 index 00000000..8fbd2825 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/modules/wrong-gcc-version.snip @@ -0,0 +1,5 @@ + diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..2f643071 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,16 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Load the computing environment we need +# (mpi4py and numpy are in SciPy-bundle) +module load {{ site.remote.module_python3 }} +module load SciPy-bundle + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/four-tasks-jobscript.snip new file mode 100644 index 00000000..19804d74 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/four-tasks-jobscript.snip @@ -0,0 +1,16 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 4 + +# Load the computing environment we need +# (mpi4py and numpy are in SciPy-bundle) +module load {{ site.remote.module_python3 }} +module load SciPy-bundle + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/one-task-jobscript.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/one-task-jobscript.snip new file mode 100644 index 00000000..1941ef04 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/parallel/one-task-jobscript.snip @@ -0,0 +1,14 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 1 + +# Load the computing environment we need +module load {{ site.remote.module_python3 }} + +# Execute the task +amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/account-history.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/account-history.snip new file mode 100644 index 00000000..d5a87620 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/account-history.snip @@ -0,0 +1,14 @@ +``` + JobID JobName Partition Account AllocCPUS State ExitCode +------------ ---------- ---------- ---------- ---------- ---------- -------- +7 file.sh cpubase_b+ def-spons+ 1 COMPLETED 0:0 +7.batch batch def-spons+ 1 COMPLETED 0:0 +7.extern extern def-spons+ 1 COMPLETED 0:0 +8 file.sh cpubase_b+ def-spons+ 1 COMPLETED 0:0 +8.batch batch def-spons+ 1 COMPLETED 0:0 +8.extern extern def-spons+ 1 COMPLETED 0:0 +9 example-j+ cpubase_b+ def-spons+ 1 COMPLETED 0:0 +9.batch batch 
def-spons+ 1 COMPLETED 0:0 +9.extern extern def-spons+ 1 COMPLETED 0:0 +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/monitor-processes-top.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/monitor-processes-top.snip new file mode 100644 index 00000000..12685735 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/monitor-processes-top.snip @@ -0,0 +1,19 @@ +``` +top - 21:00:19 up 3:07, 1 user, load average: 1.06, 1.05, 0.96 +Tasks: 311 total, 1 running, 222 sleeping, 0 stopped, 0 zombie +%Cpu(s): 7.2 us, 3.2 sy, 0.0 ni, 89.0 id, 0.0 wa, 0.2 hi, 0.2 si, 0.0 st +KiB Mem : 16303428 total, 8454704 free, 3194668 used, 4654056 buff/cache +KiB Swap: 8220668 total, 8220668 free, 0 used. 11628168 avail Mem + + PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND + 1693 jeff 20 0 4270580 346944 171372 S 29.8 2.1 9:31.89 gnome-shell + 3140 jeff 20 0 3142044 928972 389716 S 27.5 5.7 13:30.29 Web Content + 3057 jeff 20 0 3115900 521368 231288 S 18.9 3.2 10:27.71 firefox + 6007 jeff 20 0 813992 112336 75592 S 4.3 0.7 0:28.25 tilix + 1742 jeff 20 0 975080 164508 130624 S 2.0 1.0 3:29.83 Xwayland + 1 root 20 0 230484 11924 7544 S 0.3 0.1 0:06.08 systemd + 68 root 20 0 0 0 0 I 0.3 0.0 0:01.25 kworker/4:1 + 2913 jeff 20 0 965620 47892 37432 S 0.3 0.3 0:11.76 code + 2 root 20 0 0 0 0 S 0.0 0.0 0:00.02 kthreadd +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/system-memory-free.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/system-memory-free.snip new file mode 100644 index 00000000..ec4c0d3f --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/resources/system-memory-free.snip @@ -0,0 +1,6 @@ +``` +total used free shared buff/cache available +Mem: 3.8G 1.5G 678M 327M 1.6G 1.6G +Swap: 3.9G 170M 3.7G +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-script.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-script.snip new file mode 100644 index 00000000..06b7fc91 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-script.snip @@ -0,0 +1,4 @@ +``` +Submitted batch job 7 +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-status.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-status.snip new file mode 100644 index 00000000..6bb1b93e --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/basic-job-status.snip @@ -0,0 +1,9 @@ +``` +JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 9 cpubase_b example- user01 R 0:05 1 node1 +``` +{: .output} + +We can see all the details of our job, most importantly that it is in the `R` +or `RUNNING` state. Sometimes our jobs might need to wait in a queue +(`PENDING`) or have an error (`E`). 
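Rather than re-running the status command by hand while a job sits in the queue, you can filter it to your own jobs and let it refresh automatically. A minimal sketch using standard Slurm options and the `yourUsername` placeholder from this lesson's configuration:

```
# Show only your own jobs, then keep the view updating every 15 seconds
squeue -u yourUsername                # -u restricts the output to one user's jobs
watch -n 15 squeue -u yourUsername    # press Ctrl-C to stop the live view
```
{: .language-bash}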
diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/job-with-name-status.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/job-with-name-status.snip new file mode 100644 index 00000000..838c5464 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/job-with-name-status.snip @@ -0,0 +1,5 @@ +``` +JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 10 cpubase_b hello-wo user01 R 0:02 1 node1 +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/option-flags-list.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/option-flags-list.snip new file mode 100644 index 00000000..5e80b164 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/option-flags-list.snip @@ -0,0 +1,15 @@ +* `--ntasks=<ntasks>` or `-n <ntasks>`: How many CPU cores does your job need, + in total? + +* `--time <days-hours:minutes:seconds>` or `-t <days-hours:minutes:seconds>`: + How much real-world time (walltime) will your job take to run? The `<days>` + part can be omitted. + +* `--mem=<megabytes>`: How much memory on a node does your job need in + megabytes? You can also specify gigabytes by adding a little "g" + afterwards (example: `--mem=5g`) + +* `--nodes=<nnodes>` or `-N <nnodes>`: How many separate machines does your job + need to run on? Note that if you set `ntasks` to a number greater than what + one machine can offer, {{ site.sched.name }} will set this value + automatically. diff --git a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/print-sched-variables.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/print-sched-variables.snip similarity index 65% rename from _includes/snippets_library/EPCC_Cirrus_pbs/scheduler/print-sched-variables.snip rename to _includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/print-sched-variables.snip index bcda18c6..5234a4ed 100644 --- a/_includes/snippets_library/EPCC_Cirrus_pbs/scheduler/print-sched-variables.snip +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/print-sched-variables.snip @@ -2,11 +2,10 @@ > > When {{ site.sched.name }} runs a job, it sets a number of environment > variables for the job. One of these will let us check what directory our job -> script was submitted from. The `PBS_O_WORKDIR` variable is set to the -> directory from which our job was submitted. -> -> Using the `PBS_O_WORKDIR` variable, modify your job so that it prints out the -> location from which the job was submitted. +> script was submitted from. The `SLURM_SUBMIT_DIR` variable is set to the +> directory from which our job was submitted. Using the `SLURM_SUBMIT_DIR` +> variable, modify your job so that it prints out the location from which the +> job was submitted.
> > > ## Solution > > @@ -18,13 +17,13 @@ > > > > ``` > > {{ site.remote.bash_shebang }} -> > #PBS -l 00:00:30 +> > #SBATCH -t 00:00:30 > > > > echo -n "This script is running on " > > hostname > > > > echo "This job was launched in the following directory:" -> > echo ${PBS_O_WORKDIR} +> > echo ${SLURM_SUBMIT_DIR} > > ``` > > {: .output} > {: .solution} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-job.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-job.snip new file mode 100644 index 00000000..a9eae8de --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-job.snip @@ -0,0 +1,4 @@ +``` +{{ site.remote.prompt }} cat slurm-12.out +``` +{: .language-bash} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-output.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-output.snip new file mode 100644 index 00000000..180d3ea4 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/runtime-exceeded-output.snip @@ -0,0 +1,6 @@ +``` +This script is running on ... +slurmstepd: error: *** JOB 12 ON node1 CANCELLED AT 2021-02-19T13:55:57 +DUE TO TIME LIMIT *** +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-begin.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-begin.snip new file mode 100644 index 00000000..c7af2eef --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-begin.snip @@ -0,0 +1,7 @@ +``` +Submitted batch job 13 + +JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 13 cpubase_b long_job user01 R 0:02 1 node1 +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-cancel.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-cancel.snip new file mode 100644 index 00000000..5429787d --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-job-cancel.snip @@ -0,0 +1,4 @@ +``` +JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) +``` +{: .output} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-multiple-jobs.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-multiple-jobs.snip new file mode 100644 index 00000000..3a229b55 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/terminate-multiple-jobs.snip @@ -0,0 +1,27 @@ +> ## Cancelling multiple jobs +> +> We can also cancel all of our jobs at once using the `-u` option. This will +> delete all jobs for a specific user (in this case, yourself). Note that you +> can only delete your own jobs. +> +> Try submitting multiple jobs and then cancelling them all. 
+> +> > ## Solution +> > +> > First, submit a trio of jobs: +> > +> > ``` +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > {{ site.remote.prompt }} {{ site.sched.submit.name }} {% if site.sched.submit.options != '' %}{{ site.sched.submit.options }} {% endif %}example-job.sh +> > ``` +> > {: .language-bash} +> > +> > Then, cancel them all: +> > +> > ``` +> > {{ site.remote.prompt }} {{ site.sched.del }} -u {{ site.remote.user }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/using-nodes-interactively.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/using-nodes-interactively.snip new file mode 100644 index 00000000..40b11437 --- /dev/null +++ b/_includes/snippets_library/HPCC_MagicCastle_slurm/scheduler/using-nodes-interactively.snip @@ -0,0 +1,69 @@ +`{{ site.sched.interactive }}` runs a single command on the cluster and then +exits. Let's demonstrate this by running the `hostname` command with +`{{ site.sched.interactive }}`. (We can cancel an `{{ site.sched.interactive }}` +job with `Ctrl-c`.) + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} hostname +``` +{: .language-bash} + +``` +{{ site.remote.node }} +``` +{: .output} + +`{{ site.sched.interactive }}` accepts all of the same options as +`{{ site.sched.submit.name }}`. However, instead of specifying these in a script, +these options are specified on the command-line when starting a job. To submit +a job that uses 2 CPUs for instance, we could use the following command: + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} -n 2 echo "This job will use 2 CPUs." +``` +{: .language-bash} + +``` +This job will use 2 CPUs. +This job will use 2 CPUs. +``` +{: .output} + +Typically, the resulting shell environment will be the same as that for +`{{ site.sched.submit.name }}`. + +### Interactive jobs + +Sometimes, you will need a lot of resources for interactive use. Perhaps it's +our first time running an analysis or we are attempting to debug something that +went wrong with a previous job. Fortunately, {{ site.sched.name }} makes it +easy to start an interactive job with `{{ site.sched.interactive }}`: + +``` +{{ site.remote.prompt }} {{ site.sched.interactive }} --pty bash +``` +{: .language-bash} + +You should be presented with a bash prompt. Note that the prompt will likely +change to reflect your new location, in this case the compute node we are +logged on. You can also verify this with `hostname`. + +> ## Creating remote graphics +> +> To see graphical output inside your jobs, you need to use X11 forwarding. To +> connect with this feature enabled, use the `-Y` option when you login with +> the `ssh` command, e.g., `ssh -Y {{ site.remote.user }}@{{ site.remote.login }}`. +> +> To demonstrate what happens when you create a graphics window on the remote +> node, use the `xeyes` command. A relatively adorable pair of eyes should pop +> up (press `Ctrl-C` to stop). If you are using a Mac, you must have installed +> XQuartz (and restarted your computer) for this to work. 
+> +> If your cluster has the +> [slurm-spank-x11](https://github.com/hautreux/slurm-spank-x11) plugin +> installed, you can ensure X11 forwarding within interactive jobs by using the +> `--x11` option for `{{ site.sched.interactive }}` with the command +> `{{ site.sched.interactive }} --x11 --pty bash`. +{: .callout} + +When you are done with the interactive job, type `exit` to quit your session. diff --git a/_includes/snippets_library/HPCC_MagicCastle_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/HPCC_MagicCastle_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/_config_options.yml b/_includes/snippets_library/Magic_Castle_EESSI_slurm/_config_options.yml index f5a838c8..46d805a3 100644 --- a/_includes/snippets_library/Magic_Castle_EESSI_slurm/_config_options.yml +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/_config_options.yml @@ -70,14 +70,16 @@ sched: info: "sinfo" comment: "#SBATCH" hist: "sacct -u yourUsername" + hist_filter: "" episode_order: - 10-hpc-intro - 11-connecting - 12-cluster - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/cluster/root-folders.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/modules/default-modules.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. 
+``` +{: .output} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-with-memory-jobscript.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/eight-tasks-jobscript.snip similarity index 68% rename from _includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/eight-tasks-jobscript.snip index 13418f34..7f99b7fe 100644 --- a/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/eight-tasks-jobscript.snip @@ -1,10 +1,9 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 -{{ site.sched.comment }} -n 1 -{{ site.sched.comment }} --mem=3G +{{ site.sched.comment }} -n 8 # Load the computing environment we need # (mpi4py and numpy are in SciPy-bundle) @@ -12,6 +11,6 @@ module load Python module load SciPy-bundle # Execute the task -python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/four-tasks-jobscript.snip index 1512adde..a777f674 100644 --- a/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/four-tasks-jobscript.snip @@ -1,10 +1,9 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 4 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need # (mpi4py and numpy are in SciPy-bundle) @@ -12,6 +11,6 @@ module load Python module load SciPy-bundle # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-jobscript.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-jobscript.snip new file mode 100644 index 00000000..fa665b10 --- /dev/null +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/parallel/one-task-jobscript.snip @@ -0,0 +1,14 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 1 + +# Load the computing environment we need +module load Python + +# Execute the task +amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/resources/hist-fields.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? 
+* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/email-notifications.snip new file mode 100644 index 00000000..e681b3c0 --- /dev/null +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/email-notifications.snip @@ -0,0 +1,19 @@ +> Jobs on an HPC system might run for days or even weeks. We probably have +> better things to do than constantly check on the status of our job with +> `{{ site.sched.status }}`. Looking at the manual page for +> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email +> when it finishes? +> +> > ## Hint +> > +> > You can use the *manual pages* for {{ site.sched.name }} utilities to find +> > more about their capabilities. On the command line, these are accessed +> > through the `man` utility: run `man <program-name>`. You can find the same +> > information online by searching "man <program-name>". +> > +> > ``` +> > {{ site.remote.prompt }} man {{ site.sched.submit.name }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/using-nodes-interactively.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/using-nodes-interactively.snip index a7da3415..81bf67c6 100644 --- a/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/using-nodes-interactively.snip +++ b/_includes/snippets_library/Magic_Castle_EESSI_slurm/scheduler/using-nodes-interactively.snip @@ -62,8 +62,8 @@ logged on. You can also verify this with `hostname`. > If your cluster has the > [slurm-spank-x11](https://github.com/hautreux/slurm-spank-x11) plugin > installed, you can ensure X11 forwarding within interactive jobs by using the -> `--x11` option for `{{ site.sched.interactive }}` with the command `{{ -> site.sched.interactive }} --x11 --pty bash`. +> `--x11` option for `{{ site.sched.interactive }}` with the command +> `{{ site.sched.interactive }} --x11 --pty bash`. {: .callout} When you are done with the interactive job, type `exit` to quit your session. diff --git a/_includes/snippets_library/Magic_Castle_EESSI_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/Magic_Castle_EESSI_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/_config_options.yml b/_includes/snippets_library/NIST_CTCMS_slurm/_config_options.yml index 393d5580..0533c725 100644 --- a/_includes/snippets_library/NIST_CTCMS_slurm/_config_options.yml +++ b/_includes/snippets_library/NIST_CTCMS_slurm/_config_options.yml @@ -8,9 +8,9 @@ # `_includes/snippets_library`. To use one, replace options # below with those in `_config_options.yml` from the # library.
E.g, to customise for Cirrus at EPCC, running -# PBS, we could replace the options below with those from +# Slurm, we could replace the options below with those from # -# _includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml # # If your cluster is not represented in the library, please # copy an existing folder, rename it, and customize for your @@ -21,18 +21,18 @@ snippets: "/snippets_library/NIST_CTCMS_slurm" local: prompt: "[user@laptop ~]$" - bash_shebang: "#!/usr/bin/env bash" + bash_shebang: "#!/usr/bin/bash" remote: - name: "ruth" - login: "ruth.nist.gov" - host: "ruth" + name: "mr-french" + login: "mr-french.nist.gov" + host: "mr-french" node: "r001" location: "National Institute of Standards and Technology" homedir: "/users" user: "yourUsername" - prompt: "501 ruth%" - bash_shebang: "#!/usr/bin/env bash" + prompt: "501 mr-french%" + bash_shebang: "#!/bin/bash" sched: name: "Slurm" @@ -46,22 +46,25 @@ sched: flag: user: "-u yourUsername" interactive: "" - histdetail: "--format=JobID,JobName,Submit,Start,State,ReqCPUS,Reserved,Elapsed,MaxRSS" + histdetail: "--format=JobName,Submit,Start,State,ReqCPUS,Reserved,Elapsed,MaxRSS -j" name: "-J" time: "-t" queue: "-p" + partition: "-p serial" del: "scancel" interactive: "srun" info: "sinfo" comment: "#SBATCH" hist: "sacct -u yourUsername" + hist_filter: "" episode_order: - 10-hpc-intro - 11-connecting - 12-cluster - 13-scheduler - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/cluster/root-folders.snip b/_includes/snippets_library/NIST_CTCMS_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/modules/default-modules.snip b/_includes/snippets_library/NIST_CTCMS_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. 
+``` +{: .output} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..7fa5d183 --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,11 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/four-tasks-jobscript.snip index 5eb930b4..0303186a 100644 --- a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/four-tasks-jobscript.snip @@ -1,12 +1,11 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 4 -{{ site.sched.comment }} --mem=3G # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-jobscript.snip b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-jobscript.snip new file mode 100644 index 00000000..e5fe4b59 --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-jobscript.snip @@ -0,0 +1,11 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 1 + +# Execute the task +amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/resources/account-history.snip b/_includes/snippets_library/NIST_CTCMS_slurm/resources/account-history.snip index 1fd637fa..61ac15e5 100644 --- a/_includes/snippets_library/NIST_CTCMS_slurm/resources/account-history.snip +++ b/_includes/snippets_library/NIST_CTCMS_slurm/resources/account-history.snip @@ -6,11 +6,11 @@ 212341 env {{ site.sched.queue.debug }} 2 COMPLETED 212342 mpirun {{ site.sched.queue.testing }} 2 COMPLETED 212343 mpirun {{ site.sched.queue.testing }} 2 COMPLETED -212344 cpi {{ site.sched.queue.testing }} 2 COMPLETED -212345 cpi {{ site.sched.queue.testing }} 2 COMPLETED +212344 amdahl {{ site.sched.queue.testing }} 2 COMPLETED +212345 amdahl {{ site.sched.queue.testing }} 2 COMPLETED 212346 bash {{ site.sched.queue.testing }} 2 COMPLETED 212346.0 bash 2 COMPLETED -212346.1 cpi 2 COMPLETED -212347 cpi {{ site.sched.queue.testing }} 2 FAILED +212346.1 amdahl 2 COMPLETED +212347 amdahl {{ site.sched.queue.testing }} 2 FAILED ``` {: .output} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/resources/hist-fields.snip b/_includes/snippets_library/NIST_CTCMS_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? 
+* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/email-notifications.snip new file mode 100644 index 00000000..e681b3c0 --- /dev/null +++ b/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/email-notifications.snip @@ -0,0 +1,19 @@ +> Jobs on an HPC system might run for days or even weeks. We probably have +> better things to do than constantly check on the status of our job with +> `{{ site.sched.status }}`. Looking at the manual page for +> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email +> when it finishes? +> +> > ## Hint +> > +> > You can use the *manual pages* for {{ site.sched.name }} utilities to find +> > more about their capabilities. On the command line, these are accessed +> > through the `man` utility: run `man <program-name>`. You can find the same +> > information online by searching "man <program-name>". +> > +> > ``` +> > {{ site.remote.prompt }} man {{ site.sched.submit.name }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/print-sched-variables.snip b/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/print-sched-variables.snip index 5234a4ed..90e7dbf8 100644 --- a/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/print-sched-variables.snip +++ b/_includes/snippets_library/NIST_CTCMS_slurm/scheduler/print-sched-variables.snip @@ -17,7 +17,8 @@ > > > > ``` > > {{ site.remote.bash_shebang }} -> > #SBATCH -t 00:00:30 +> > {{ site.sched.comment }} {{ site.sched.flag.partition }} +> > {{ site.sched.comment }} {{ site.sched.flag.time }} 00:00:20 > > > > echo -n "This script is running on " > > hostname diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/NIST_CTCMS_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/_config_options.yml b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/_config_options.yml index 1b68b14d..80e2be9b --- a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/_config_options.yml +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/_config_options.yml @@ -8,9 +8,9 @@ # `_includes/snippets_library`. To use one, replace options # below with those in `_config_options.yml` from the # library.
E.g, to customise for Cirrus at EPCC, running -# PBS, we could replace the options below with those from +# Slurm, we could replace the options below with those from # -# _includes/snippets_library/EPCC_Cirrus_pbs/_config_options.yml +# _includes/snippets_library/EPCC_Cirrus_slurm/_config_options.yml # # If your cluster is not represented in the library, please # copy an existing folder, rename it, and customize for your @@ -55,14 +55,16 @@ sched: info: "sinfo" comment: "#SBATCH" hist: "sacct -u $USER" + hist_filter: "" episode_order: - 10-hpc-intro - 11-connecting - 12-cluster - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/cluster/root-folders.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/modules/default-modules.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. +``` +{: .output} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..38365d68 --- /dev/null +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,14 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} +{{ site.sched.comment }} -N 1 +{{ site.sched.comment }} -n 8 + +# Load the computing environment we need +module load python3 + +# Execute the task +mpiexec amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/four-tasks-jobscript.snip index ac8effab..14046d46 100644 --- a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/four-tasks-jobscript.snip @@ -1,15 +1,14 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 4 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need module load python3 # Execute the task -mpiexec python pi.py 100000000 +mpiexec amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-with-memory-jobscript.snip 
b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-jobscript.snip similarity index 70% rename from _includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-with-memory-jobscript.snip rename to _includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-jobscript.snip index 5838157f..0e2a9947 100644 --- a/_includes/snippets_library/NIST_CTCMS_slurm/parallel/one-task-with-memory-jobscript.snip +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/parallel/one-task-jobscript.snip @@ -1,15 +1,14 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} serial-pi +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job {{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} {{ site.sched.comment }} -N 1 {{ site.sched.comment }} -n 1 -{{ site.sched.comment }} --mem=3G # Load the computing environment we need module load python3 # Execute the task -python pi.py 100000000 +amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/resources/hist-fields.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/scheduler/email-notifications.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/scheduler/email-notifications.snip new file mode 100644 index 00000000..e681b3c0 --- /dev/null +++ b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/scheduler/email-notifications.snip @@ -0,0 +1,19 @@ +> Jobs on an HPC system might run for days or even weeks. We probably have +> better things to do than constantly check on the status of our job with +> `{{ site.sched.status }}`. Looking at the manual page for +> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email +> when it finishes? +> +> > ## Hint +> > +> > You can use the *manual pages* for {{ site.sched.name }} utilities to find +> > more about their capabilities. On the command line, these are accessed +> > through the `man` utility: run `man `. You can find the same +> > information online by searching > "man ". +> > +> > ``` +> > {{ site.remote.prompt }} man {{ site.sched.submit.name }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/Norway_SIGMA2_SAGA_slurm/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..e69de29b diff --git a/_includes/snippets_library/README.md b/_includes/snippets_library/README.md index 56b65740..6be1cee2 100644 --- a/_includes/snippets_library/README.md +++ b/_includes/snippets_library/README.md @@ -28,5 +28,3 @@ If the naming seems counter-intuitive, please feel free to make changes locally, and file an issue of submit a pull request to fix it upstream. None of this is set in stone, and improvements are always welcome. 
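The `hist-fields.snip` files added in this patch list the accounting fields learners are asked to inspect (Hostname, MaxRSS, Elapsed, State, MaxDiskRead, MaxDiskWrite). As a minimal sketch of how those fields can be pulled from Slurm's `sacct` on the clusters configured here — the exact field list is an assumption on my part, `NodeList` stands in for "Hostname", and the disk counters are only populated where the site's job accounting collects I/O data:

```bash
# Sketch only: request the fields named in hist-fields.snip from sacct.
# NodeList reports where the job ran; MaxDiskRead/MaxDiskWrite may be empty
# unless the cluster's accounting plugins gather I/O statistics.
sacct -u "$USER" \
    --format=JobID,JobName,NodeList,MaxRSS,Elapsed,State,MaxDiskRead,MaxDiskWrite
```

On the SGE-based UCL Myriad configuration the history command is `jobhist` instead, and its output columns differ.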
- - diff --git a/_includes/snippets_library/UCL_Myriad_sge/_config_options.yml b/_includes/snippets_library/UCL_Myriad_sge/_config_options.yml index b23d7cfc..41b1ffe1 100644 --- a/_includes/snippets_library/UCL_Myriad_sge/_config_options.yml +++ b/_includes/snippets_library/UCL_Myriad_sge/_config_options.yml @@ -7,7 +7,8 @@ local: remote: name: "Myriad" login: "myriad.rc.ucl.ac.uk" - node: node-d00a-001 + host: "login12.myriad.ucl.ac.uk" + node: node-d00a-005 location: "University College London" homedir: "/home" user: "yourUsername" @@ -33,6 +34,7 @@ sched: info: "qhost" comment: "#$ " hist: "jobhist" + hist_filter: "" bash_shebang: "#!/bin/bash -l" episode_order: @@ -40,8 +42,9 @@ episode_order: - 11-connecting - 12-cluster - 13-scheduler - - 14-modules - - 15-transferring-files - - 16-parallel - - 17-resources - - 18-responsibility + - 14-environment-variables + - 15-modules + - 16-transferring-files + - 17-parallel + - 18-resources + - 19-responsibility diff --git a/_includes/snippets_library/UCL_Myriad_sge/cluster/root-folders.snip b/_includes/snippets_library/UCL_Myriad_sge/cluster/root-folders.snip new file mode 100644 index 00000000..715de741 --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/cluster/root-folders.snip @@ -0,0 +1,6 @@ +``` +bin etc lib64 proc sbin sys var +boot {{ site.remote.homedir | replace: "/", "" }} mnt root scratch tmp working +dev lib opt run srv usr +``` +{: .output} diff --git a/_includes/snippets_library/UCL_Myriad_sge/modules/default-modules.snip b/_includes/snippets_library/UCL_Myriad_sge/modules/default-modules.snip new file mode 100644 index 00000000..a448dd96 --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/modules/default-modules.snip @@ -0,0 +1,4 @@ +``` +No Modulefiles Currently Loaded. 
+``` +{: .output} diff --git a/_includes/snippets_library/UCL_Myriad_sge/parallel/eight-tasks-jobscript.snip b/_includes/snippets_library/UCL_Myriad_sge/parallel/eight-tasks-jobscript.snip new file mode 100644 index 00000000..5f564db8 --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/parallel/eight-tasks-jobscript.snip @@ -0,0 +1,17 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} -l mem=3G +{{ site.sched.comment }} -l h_rt=00:30:00 +{{ site.sched.comment }} -pe mpi 8 +{{ site.sched.comment }} -cwd + +# Load the computing environment we need +module load python3 +module unload compilers mpi +module load mpi4py + +# Execute the task +gerun amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/UCL_Myriad_sge/parallel/four-tasks-jobscript.snip b/_includes/snippets_library/UCL_Myriad_sge/parallel/four-tasks-jobscript.snip index b1d90eb9..ac523d57 100644 --- a/_includes/snippets_library/UCL_Myriad_sge/parallel/four-tasks-jobscript.snip +++ b/_includes/snippets_library/UCL_Myriad_sge/parallel/four-tasks-jobscript.snip @@ -1,13 +1,17 @@ ``` {{ site.remote.bash_shebang }} -{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-pi -{{ site.sched.comment }} {{ site.sched.flag.queue }} {{ site.sched.queue.testing }} -{{ site.sched.comment }} -l nodes=1:ppn=4:mem=3G +{{ site.sched.comment }} {{ site.sched.flag.name }} parallel-job +{{ site.sched.comment }} -l mem=3G +{{ site.sched.comment }} -l h_rt=00:30:00 +{{ site.sched.comment }} -pe mpi 4 +{{ site.sched.comment }} -cwd # Load the computing environment we need module load python3 +module unload compilers mpi +module load mpi4py # Execute the task -mpiexec python pi.py 100000000 +gerun amdahl ``` {: .language-bash} diff --git a/_includes/snippets_library/UCL_Myriad_sge/parallel/one-task-jobscript.snip b/_includes/snippets_library/UCL_Myriad_sge/parallel/one-task-jobscript.snip new file mode 100644 index 00000000..4836b3e3 --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/parallel/one-task-jobscript.snip @@ -0,0 +1,17 @@ +``` +{{ site.remote.bash_shebang }} +{{ site.sched.comment }} {{ site.sched.flag.name }} solo-job +{{ site.sched.comment }} -l mem=3G +{{ site.sched.comment }} -l h_rt=00:30:00 +{{ site.sched.comment }} -pe mpi 1 +{{ site.sched.comment }} -cwd + +# Load the computing environment we need +module load python3 +module unload compilers mpi +module load mpi4py + +# Execute the task +amdahl +``` +{: .language-bash} diff --git a/_includes/snippets_library/UCL_Myriad_sge/resources/hist-fields.snip b/_includes/snippets_library/UCL_Myriad_sge/resources/hist-fields.snip new file mode 100644 index 00000000..f0e215ba --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/resources/hist-fields.snip @@ -0,0 +1,6 @@ +* **Hostname**: Where did your job run? +* **MaxRSS**: What was the maximum amount of memory used? +* **Elapsed**: How long did the job take? +* **State**: What is the job currently doing/what happened to it? +* **MaxDiskRead**: Amount of data read from disk. +* **MaxDiskWrite**: Amount of data written to disk. diff --git a/_includes/snippets_library/UCL_Myriad_sge/scheduler/email-notifications.snip b/_includes/snippets_library/UCL_Myriad_sge/scheduler/email-notifications.snip new file mode 100644 index 00000000..e681b3c0 --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/scheduler/email-notifications.snip @@ -0,0 +1,19 @@ +> Jobs on an HPC system might run for days or even weeks. 
We probably have +> better things to do than constantly check on the status of our job with +> `{{ site.sched.status }}`. Looking at the manual page for +> `{{ site.sched.submit.name }}`, can you set up our test job to send you an email +> when it finishes? +> +> > ## Hint +> > +> > You can use the *manual pages* for {{ site.sched.name }} utilities to find +> > more about their capabilities. On the command line, these are accessed +> > through the `man` utility: run `man `. You can find the same +> > information online by searching > "man ". +> > +> > ``` +> > {{ site.remote.prompt }} man {{ site.sched.submit.name }} +> > ``` +> > {: .language-bash} +> {: .solution} +{: .challenge} diff --git a/_includes/snippets_library/UCL_Myriad_sge/scheduler/option-flags-list.snip b/_includes/snippets_library/UCL_Myriad_sge/scheduler/option-flags-list.snip index f6961da7..e7f9010c 100644 --- a/_includes/snippets_library/UCL_Myriad_sge/scheduler/option-flags-list.snip +++ b/_includes/snippets_library/UCL_Myriad_sge/scheduler/option-flags-list.snip @@ -4,8 +4,8 @@ * `-pe mpi ` — How many CPUs does your job need? If you only need one CPU you can leave this out. -* `mem=` — How much memory per process does your job need? Must be +* `-l mem=` — How much memory per process does your job need? Must be an integer followed by M, G, or T to specify Mega, Giga or Terabytes. -* `wd /home//Scratch/` — Set the working +* `-wd /home//Scratch/` — Set the working directory to somewhere in your scratch space. diff --git a/_includes/snippets_library/UCL_Myriad_sge/transferring-files/filezilla-ssh-tunnel-instructions.snip b/_includes/snippets_library/UCL_Myriad_sge/transferring-files/filezilla-ssh-tunnel-instructions.snip new file mode 100644 index 00000000..8f0e9e1e --- /dev/null +++ b/_includes/snippets_library/UCL_Myriad_sge/transferring-files/filezilla-ssh-tunnel-instructions.snip @@ -0,0 +1,21 @@ +> ## Connecting Remotely Via an SSH Tunnel +> +> If you are remote and not using a VPN, you will need to set up an SSH tunnel +using your terminal. +> +> ``` +> {{ site.local.prompt }} ssh -L 3333:{{ site.remote.login }}:22 {{ site.remote.user }}@ssh-gateway.ucl.ac.uk -N +> ``` +> {: .language-bash} +> +> This connects the local port 3333 to the cluster's SSH port, via the +> remote gateway. The `-N` option tells it not to execute any remote +> commands. Leave this running in the terminal and in FileZilla, set +> +> * Host: `sftp://localhost` +> * User: Your cluster username +> * Password: Your cluster password (leave blank to use your SSH keys) +> * Port: 3333 +> +> Hit "Quickconnect" to connect. +{: .callout} diff --git a/files/goostats.py b/files/goostats.py deleted file mode 100755 index 654b44c0..00000000 --- a/files/goostats.py +++ /dev/null @@ -1,136 +0,0 @@ -#!/usr/bin/env python3 - -""" -Parallel code to extract mean, min, and max of Nelle Nemo's assay results -""" - -import locale as l10n -from mpi4py import MPI -import numpy as np -import os -import sys -l10n.setlocale(l10n.LC_ALL, "") - -# Declare an MPI Communicator for the parallel processes to talk through -comm = MPI.COMM_WORLD - -# Read the number of parallel processes tied into the comm channel -cpus = comm.Get_size() - -# Find out the index ("rank") of *this* process -rank = comm.Get_rank() - - -def list_assay_files(path): - """ - Walk the specified path, using one rank *only*. - Record NENE*.txt files labeled A or B (not Z). - Return list of file paths. 
- """ - if rank != 0: - print("Rank {} tried scanning the directory.".format(rank)) - sys.exit() - - valid_names = [] - for root, dirs, files in os.walk(path): - for f in files: - if f.startswith("NENE") and f.endswith(("A.txt", "B.txt")): - fullpath = os.path.join(root, f) - valid_names.append(fullpath) - - return valid_names - - -def partition_files(list_of_files, number_of_parts): - """ - Split the provided list of files into a number of roughly-equal parts - """ - return np.array_split(list_of_files, number_of_parts) - - -def get_local_file_names(path): - if rank == 0: - # Let only one MPI process scan the directory for files. - all_files = list_assay_files(path) - partitions = partition_files(all_files, cpus) - else: - partitions = [] - - # Every rank gets its own chunk of the list of assay files. - # This function is *blocking*: no rank returns until all are able to. - return comm.scatter(partitions, root = 0) - - -def extract_concentrations(goo_file): - """ - Read file `goo_file` into NumPy array. - Return array if it contains 300 entries. - """ - concentrations = np.loadtxt(goo_file) - if len(concentrations) != 300: - return None - return concentrations - - -def get_assay_results(files): - # Every rank reads their private list of files into NumPy arrays - concentrations = [] - for f in files: - result = extract_concentrations(f) - if result is not None: - concentrations.append(result) - - print("Rank {} crunched data from {} files.".format(comm.Get_rank(), len(concentrations))) - - # Convert list of NumPy arrays into a 2-D NumPy array - return np.array(concentrations) - - -# "Main" program - -if __name__ == '__main__': - """ - This program is entered as many times as there are MPI processes. - - Each process knows its index, called 'rank', and the number of - ranks, called 'cpus', from the MPI calls at the top of the module. 
- """ - - # Guard against improper invocations of the program - - usage_string = "Usage:\n mpirun -np {} {} directory_name" - - if len(sys.argv) != 2 or sys.argv[1] == "--help": - if rank == 0: - print(usage_string.format(cpus, sys.argv[0])) - sys.exit() - - # Distribute assay files in the specified directory to the parallel ranks - path = sys.argv[1] - files = get_local_file_names(path) - - # Read local set of files into NumPy array -- ignoring partial results - concentrations = get_assay_results(files) - - # Calculate the total number of valid assay results from local numbers - valid_results = len(concentrations) # local - valid_results = comm.reduce(valid_results) # global - - # For each protein, collect the mean, min, and max values from all files - assay_avg = np.sum(concentrations, axis=0).tolist() - assay_min = np.amin(concentrations, axis=0).tolist() - assay_max = np.amax(concentrations, axis=0).tolist() - - for i in range(len(assay_avg)): - assay_avg[i] = comm.reduce(assay_avg[i], op=MPI.SUM) - assay_min[i] = comm.reduce(assay_min[i], op=MPI.MIN) - assay_max[i] = comm.reduce(assay_max[i], op=MPI.MAX) - - # Generate the global report using Rank 0, only - if rank == 0: - assay_avg = np.divide(assay_avg, valid_results) - csv_name = "{}.csv".format(path.rstrip("/")) # prevent "path/.csv", which would be a hidden file - with open(csv_name, "w") as csv: - print("mean,min,max", file=csv) - for a, n, x in zip(assay_avg, assay_min, assay_max): - print("{},{},{}".format(a, n, x), file=csv) diff --git a/files/hpc-intro-data.tar.gz b/files/hpc-intro-data.tar.gz deleted file mode 100644 index 8ccc37cf..00000000 Binary files a/files/hpc-intro-data.tar.gz and /dev/null differ diff --git a/files/hpc-intro-data.zip b/files/hpc-intro-data.zip deleted file mode 100644 index 7efd5d6d..00000000 Binary files a/files/hpc-intro-data.zip and /dev/null differ diff --git a/files/hpc-intro-pi-code.tar.gz b/files/hpc-intro-pi-code.tar.gz new file mode 100644 index 00000000..33b43cb9 Binary files /dev/null and b/files/hpc-intro-pi-code.tar.gz differ diff --git a/files/jargon.html b/files/jargon.html index 63f68bfc..66364281 100644 --- a/files/jargon.html +++ b/files/jargon.html @@ -103,7 +103,7 @@ A collection of standalone computers that are networked together. They will frequently have software installed that allow the coordinated running of other software across all of these computers. This allows these networked computers -to work together to accomplish computing tasks faster. +to work together to accomplish computing tasks faster. 
--- diff --git a/files/pi-mpi-minimal.py b/files/pi-mpi-minimal.py deleted file mode 100755 index 407d90f6..00000000 --- a/files/pi-mpi-minimal.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -import sys -from mpi4py import MPI - -def inside_circle(total_count): - x = np.random.uniform(size=total_count) - y = np.random.uniform(size=total_count) - radii = np.sqrt(x*x + y*y) - count = len(radii[np.where(radii<=1.0)]) - return count - -if __name__ == '__main__': - comm = MPI.COMM_WORLD - cpus = comm.Get_size() - rank = comm.Get_rank() - n_samples = int(sys.argv[1]) - if rank == 0: - partitions = [ int(n_samples / cpus) ] * cpus - counts = [ int(0) ] * cpus - else: - partitions = None - counts = None - partition_item = comm.scatter(partitions, root=0) - count_item = inside_circle(partition_item) - counts = comm.gather(count_item, root=0) - if rank == 0: - my_pi = 4.0 * sum(counts) / sum(partitions) - print(my_pi) diff --git a/files/pi-mpi.py b/files/pi-mpi.py deleted file mode 100755 index e4b2e53f..00000000 --- a/files/pi-mpi.py +++ /dev/null @@ -1,125 +0,0 @@ -#!/usr/bin/env python3 - -"""Parallel example code for estimating the value of π. - -We can estimate the value of π by a stochastic algorithm. Consider a -circle of radius 1, inside a square that bounds it, with vertices at -(1,1), (1,-1), (-1,-1), and (-1,1). The area of the circle is just π, -whereas the area of the square is 4. So, the fraction of the area of the -square which is covered by the circle is π/4. - -A point selected at random uniformly from the square thus has a -probability π/4 of being within the circle. - -We can estimate π by examining a large number of randomly-selected -points from the square, and seeing what fraction of them lie within the -circle. If this fraction is f, then our estimate for π is π ≈ 4f. - -Thanks to symmetry, we can compute points in one quadrant, rather -than within the entire unit square, and arrive at identical results. - -This task lends itself naturally to parallelization -- the task of -selecting a sample point and deciding whether or not it's inside the -circle is independent of all the other samples, so they can be done -simultaneously. We only need to aggregate the data at the end to compute -our fraction f and our estimate for π. -""" - -import numpy as np -import sys -import datetime -from mpi4py import MPI - - -def inside_circle(total_count): - """Single-processor task for a group of samples. - - Generates uniform random x and y arrays of size total_count, on the - interval [0,1), and returns the number of the resulting (x,y) pairs - which lie inside the unit circle. - """ - - host_name = MPI.Get_processor_name() - print("Rank {} generating {:n} samples on host {}.".format( - rank, total_count, host_name)) - x = np.float64(np.random.uniform(size=total_count)) - y = np.float64(np.random.uniform(size=total_count)) - - radii = np.sqrt(x*x + y*y) - - count = len(radii[np.where(radii<=1.0)]) - - return count - - -if __name__ == '__main__': - """Main executable. - - This function runs the 'inside_circle' function with a defined number - of samples. The results are then used to estimate π. - - An estimate of the required memory, elapsed calculation time, and - accuracy of calculating π are also computed. 
- """ - - # Declare an MPI Communicator for the parallel processes to talk through - comm = MPI.COMM_WORLD - - # Read the number of parallel processes tied into the comm channel - cpus = comm.Get_size() - - # Find out the index ("rank") of *this* process - rank = comm.Get_rank() - - if len(sys.argv) > 1: - n_samples = int(sys.argv[1]) - else: - n_samples = 8738128 # trust me, this number is not random :-) - - if rank == 0: - # Time how long it takes to estimate π. - start_time = datetime.datetime.now() - print("Generating {:n} samples.".format(n_samples)) - # Rank zero builds two arrays with one entry for each rank: - # one for the number of samples they should run, and - # one to store the count info each rank returns. - partitions = [ int(n_samples / cpus) ] * cpus - counts = [ int(0) ] * cpus - else: - partitions = None - counts = None - - # All ranks participate in the "scatter" operation, which assigns - # the local scalar values to their appropriate array components. - # partition_item is the number of samples this rank should generate, - # and count_item is the place to put the number of counts we see. - partition_item = comm.scatter(partitions, root=0) - count_item = comm.scatter(counts, root=0) - - # Each rank locally populates its count_item variable. - count_item = inside_circle(partition_item) - - # All ranks participate in the "gather" operation, which sums the - # rank's count_items into the total "counts". - counts = comm.gather(count_item, root=0) - - if rank == 0: - # Only rank zero writes the result, although it's known to all. - my_pi = 4.0 * sum(counts) / n_samples - elapsed_time = (datetime.datetime.now() - start_time).total_seconds() - - # Memory required is dominated by the size of x, y, and radii from - # inside_circle(), calculated in MiB - size_of_float = np.dtype(np.float64).itemsize - memory_required = 3 * n_samples * size_of_float / (1024**2) - - # accuracy is calculated as a percent difference from a known estimate - # of π. - pi_specific = np.pi - accuracy = 100*(1-my_pi/pi_specific) - - # Uncomment either summary format for verbose or terse output - # summary = "{:d} core(s), {:d} samples, {:f} MiB memory, {:f} seconds, {:f}% error" - summary = "{:d},{:d},{:f},{:f},{:f}" - print(summary.format(cpus, n_samples, memory_required, elapsed_time, - accuracy)) diff --git a/files/pi-serial-minimized.py b/files/pi-serial-minimized.py deleted file mode 100644 index acc99d31..00000000 --- a/files/pi-serial-minimized.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python3 -import numpy as np -import sys - -def inside_circle(total_count): - x = np.random.uniform(size=total_count) - y = np.random.uniform(size=total_count) - radii = np.sqrt(x*x + y*y) - count = len(radii[np.where(radii<=1.0)]) - return count - -if __name__ == '__main__': - n_samples = int(sys.argv[1]) - counts = inside_circle(n_samples) - my_pi = 4.0 * counts / n_samples - print(my_pi) diff --git a/files/pi-serial.py b/files/pi-serial.py deleted file mode 100755 index c5289c84..00000000 --- a/files/pi-serial.py +++ /dev/null @@ -1,80 +0,0 @@ -#!/usr/bin/env python3 - -"""Serial example code for estimating the value of π. - -We can estimate the value of π by a stochastic algorithm. Consider a -circle of radius 1, inside a square that bounds it, with vertices at -(1,1), (1,-1), (-1,-1), and (-1,1). The area of the circle is just π, -whereas the area of the square is 4. So, the fraction of the area of the -square which is covered by the circle is π/4. 
- -A point selected at random uniformly from the square thus has a -probability π/4 of being within the circle. - -We can estimate π by examining a large number of randomly-selected -points from the square, and seeing what fraction of them lie within the -circle. If this fraction is f, then our estimate for π is π ≈ 4f. - -Thanks to symmetry, we can compute points in one quadrant, rather -than within the entire unit square, and arrive at identical results. -""" - -import numpy as np -import sys -import datetime - - -def inside_circle(total_count): - """Single-processor task for a group of samples. - - Generates uniform random x and y arrays of size total_count, on the - interval [0,1), and returns the number of the resulting (x,y) pairs - which lie inside the unit circle. - """ - - x = np.float64(np.random.uniform(size=total_count)) - y = np.float64(np.random.uniform(size=total_count)) - - radii = np.sqrt(x*x + y*y) - - count = len(radii[np.where(radii<=1.0)]) - - return count - - -if __name__ == '__main__': - """Main executable. - - This function runs the 'inside_circle' function with a defined number - of samples. The results are then used to estimate π. - - An estimate of the required memory, elapsed calculation time, and - accuracy of calculating π are also computed. - """ - - if len(sys.argv) > 1: - n_samples = int(sys.argv[1]) - else: - n_samples = 8738128 # trust me, this number is not random :-) - - # Time how long it takes to estimate π. - start_time = datetime.datetime.now() - counts = inside_circle(n_samples) - my_pi = 4.0 * counts / n_samples - elapsed_time = (datetime.datetime.now() - start_time).total_seconds() - - # Memory required is dominated by the size of x, y, and radii from - # inside_circle(), calculated in MiB - size_of_float = np.dtype(np.float64).itemsize - memory_required = 3 * n_samples * size_of_float / (1024**2) - - # accuracy is calculated as a percent difference from a known estimate - # of π. - pi_specific = np.pi - accuracy = 100*(1-my_pi/pi_specific) - - # Uncomment either summary format for verbose or terse output - # summary = "{:d} core(s), {:d} samples, {:f} MiB memory, {:f} seconds, {:f}% error" - summary = "{:d},{:d},{:f},{:f},{:f}" - print(summary.format(1, n_samples, memory_required, elapsed_time, - accuracy)) diff --git a/files/pi.py b/files/pi.py deleted file mode 100755 index bac13585..00000000 --- a/files/pi.py +++ /dev/null @@ -1,127 +0,0 @@ -#!/usr/bin/env python3 - -"""Parallel example code for estimating the value of π. - -We can estimate the value of π by a stochastic algorithm. Consider a -circle of radius 1, inside a square that bounds it, with vertices at -(1,1), (1,-1), (-1,-1), and (-1,1). The area of the circle is just π, -whereas the area of the square is 4. So, the fraction of the area of the -square which is covered by the circle is π/4. - -A point selected at random uniformly from the square thus has a -probability π/4 of being within the circle. - -We can estimate π by examining a large number of randomly-selected -points from the square, and seeing what fraction of them lie within the -circle. If this fraction is f, then our estimate for π is π ≈ 4f. - -This task lends itself naturally to parallelization -- the task of -selecting a sample point and deciding whether or not it's inside the -circle is independent of all the other samples, so they can be done -simultaneously. We only need to aggregate the data at the end to compute -our fraction f and our estimate for π. 
- -Thanks to symmetry, we can compute points in one quadrant, rather -than within the entire unit square, and arrive at identical results. -""" - -import locale as l10n -from mpi4py import MPI -import numpy as np -import sys - -l10n.setlocale(l10n.LC_ALL, "") - -# Declare an MPI Communicator for the parallel processes to talk through -comm = MPI.COMM_WORLD - -# Read the number of parallel processes tied into the comm channel -cpus = comm.Get_size() - - -# Find out the index ("rank") of *this* process -rank = comm.Get_rank() - -np.random.seed(14159265 + rank) - -def inside_circle(total_count): - """Single-processor task for a group of samples. - - Generates uniform random x and y arrays of size total_count, on the - interval [0,1), and returns the number of the resulting (x,y) pairs - which lie inside the unit circle. - """ - host_name = MPI.Get_processor_name() - print("Rank {} generating {:n} samples on host {}.".format( - rank, total_count, host_name)) - - x = np.float64(np.random.uniform(size=total_count)) - y = np.float64(np.random.uniform(size=total_count)) - - radii = np.sqrt(x*x + y*y) - - count = len(radii[np.where(radii<=1.0)]) - - return count - - -if __name__ == '__main__': - """Main MPI executable. - - This conditional is entered as many times as there are MPI processes. - - Each process knows its index, called 'rank', and the number of - ranks, called 'cpus', from the MPI calls at the top of the module. - - Rank 0 divides the data arrays among the ranks (including itself), - then each rank independently runs the 'inside_circle' function with - its share of the samples. The disparate results are then aggregated - via the 'gather' operation, and then the estimate for π is - computed. - - An estimate of the required memory is also computed. - """ - - n_samples = 8738128 # trust me, this number is not random :-) - - if len(sys.argv) > 1: - n_samples = int(sys.argv[1]) - - if rank == 0: - print("Generating {:n} samples.".format(n_samples)) - # Rank zero builds two arrays with one entry for each rank: - # one for the number of samples they should run, and - # one to store the count info each rank returns. - partitions = [ int(n_samples / cpus) for item in range(cpus)] - counts = [ int(0) ] * cpus - else: - partitions = None - counts = None - - # All ranks participate in the "scatter" operation, which assigns - # the local scalar values to their appropriate array components. - # partition_item is the number of samples this rank should generate, - # and count_item is the place to put the number of counts we see. - - partition_item = comm.scatter(partitions, root=0) - count_item = comm.scatter(counts, root=0) - - # Each rank locally populates its count_item variable. - - count_item = inside_circle(partition_item) - - # All ranks participate in the "gather" operation, which creates an array - # of all the rank's count_items on rank zero. - - counts = comm.gather(count_item, root=0) - - if rank == 0: - # Only rank zero has the entire array of results, so only it can - # compute and print the final answer. 
- my_pi = 4.0 * sum(counts) / n_samples - size_of_float = np.dtype(np.float64).itemsize - run_type = "serial" if cpus == 1 else "mpi" - print("[{:>8} version] required memory {:.1f} MB".format( - run_type, 3 * n_samples * size_of_float / (1024**2))) - print("[using {:>3} cores ] π is {:n} from {:n} samples".format( - cpus, my_pi, n_samples)) diff --git a/files/simple-pi-illustration.py b/files/simple-pi-illustration.py deleted file mode 100644 index e8ba7548..00000000 --- a/files/simple-pi-illustration.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program generates a picture of the algorithm used to estimate the value -# of π by random sampling. - -import numpy as np -import matplotlib.pyplot as plt -import matplotlib.patches as pltpatches - -np.random.seed(14159625) - -n = 128 -x = np.random.uniform(size=n) -y = np.random.uniform(size=n) - -with plt.xkcd(): - - plt.figure(figsize=(5,5)) - plt.axis("equal") - plt.xlim([-0.0125, 1.0125]) - plt.ylim([-0.0125, 1.0125]) - - for d in ["left", "top", "bottom", "right"]: - plt.gca().spines[d].set_visible(False) - - plt.xlabel("x", position=(0.8, 0)) - plt.ylabel("y", rotation=0, position=(0, 0.8)) - - plt.xticks([0, 0.5, 1], ["0", "1/2", "1"]) - plt.yticks([0, 0.5, 1], ["0", "1/2", "1"]) - - plt.scatter(x, y, s=8, c=np.random.uniform(size=(n,3))) - - circ = pltpatches.Arc((0, 0), width=2, height=2, angle=0, theta1=0, theta2=90, color="black", linewidth=3) - plt.gca().add_artist(circ) - squa = plt.Rectangle((0, 0), width=1, height=1, fill=None, linewidth=3) - plt.gca().add_artist(squa) - - plt.savefig("pi.png", bbox_inches="tight", dpi=400) diff --git a/reference.md b/reference.md index 61df7f9b..09993605 100644 --- a/reference.md +++ b/reference.md @@ -9,11 +9,12 @@ title: Knowledge Base Search online for the one that fits you best, but here's some to start: * [Slurm summary](https://slurm.schedmd.com/pdfs/summary.pdf) from SchedMD -* [Torque/PBS summary]( - https://gif.biotech.iastate.edu/torque-pbs-job-management-cheat-sheet) +* [Torque/PBS + summary](https://gif.biotech.iastate.edu/torque-pbs-job-management-cheat-sheet) from Iowa State -* [Translating between Slurm and PBS]( - https://www.msi.umn.edu/slurm/pbs-conversion) from University of Minnesota +* [Translating between Slurm and + PBS](https://www.msi.umn.edu/slurm/pbs-conversion) from University of + Minnesota ### Units and Language @@ -28,10 +29,10 @@ History and common language have however mixed this notation with a different meaning. When people say "Kilobyte", they mean 1024 Bytes instead. In that spirit, a Megabyte is 1024 Kilobytes. -To address this ambiguity, the [International System of Quantities]( -https://en.wikipedia.org/wiki/International_System_of_Quantities) standardizes -the *binary* prefixes (with base of 210=1024) by the prefixes Kibi -(ki), Mebi (Mi), Gibi (Gi), etc. For more details, see +To address this ambiguity, the [International System of +Quantities](https://en.wikipedia.org/wiki/International_System_of_Quantities) +standardizes the *binary* prefixes (with base of 210=1024) by the +prefixes Kibi (ki), Mebi (Mi), Gibi (Gi), etc. For more details, see [here](https://en.wikipedia.org/wiki/Binary_prefix). ### "No such file or directory" or "symbol 0096" Errors diff --git a/setup.md b/setup.md index ec686863..633245a7 100644 --- a/setup.md +++ b/setup.md @@ -4,9 +4,9 @@ title: Setup root: . --- -Before the workshop, it is recommended to install or locate a terminal -application on with ssh. 
Though installation help will be provided at -the workshop, we recommend that these tools are installed (or at least +Before the workshop, it is recommended to install or locate a terminal +application with ssh. Though installation help will be provided at +the workshop, we recommend that these tools are installed (or at least downloaded) beforehand. > ## Bash and SSH @@ -29,7 +29,7 @@ on a server. ### Unix Shells on Windows Computers with Windows operating systems do not automatically have a Unix Shell -program installed. We recommend using [MobaXterm](https://mobaxterm.mobatek.net) +program installed. We recommend using [MobaXterm](https://mobaxterm.mobatek.net) Home Edition for this lesson as it can be used to run local shell commands and access the cluster via ssh. @@ -42,7 +42,7 @@ To open Terminal, try one or both of the following: * In Finder, select the Go menu, then select Utilities. Locate Terminal in the Utilities folder and open it. -* Use the Mac ‘Spotlight’ computer search function (using Command + +* Use the Mac ‘Spotlight’ computer search function (using Command + Space). Search for: `Terminal` and press Return. For an introduction, see [How to Use Terminal on a Mac][mac-terminal]. @@ -66,5 +66,6 @@ for: `Unix shell [your operating system]`. [ms-wsl]: https://docs.microsoft.com/en-us/windows/wsl/install-win10 [ms-shell]: https://docs.microsoft.com/en-us/powershell/scripting/learn/remoting/ssh-remoting-in-powershell-core?view=powershell-7 [mobax-gen]: https://mobaxterm.mobatek.net/documentation.html -[unix-emulator]: https://faculty.smu.edu/reynolds/unixtut/windows.html +[putty]: https://www.chiark.greenend.org.uk/~sgtatham/putty/ +[unix-emulator]: https://www.cygwin.com/ [wsl]: https://docs.microsoft.com/en-us/windows/wsl/install-win10
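The FileZilla SSH-tunnel snippet added for UCL Myriad describes forwarding local port 3333 to the cluster's SSH port through `ssh-gateway.ucl.ac.uk`. For readers who prefer to stay in the terminal, here is a hedged sketch of using that same tunnel with OpenSSH's `sftp`; the hostnames and port are taken from the snippet, and `yourUsername` is the placeholder from the site config.

```bash
# Terminal 1 — open the tunnel exactly as the snippet does; -N forwards the
# port without running a remote command. Leave this running.
ssh -L 3333:myriad.rc.ucl.ac.uk:22 yourUsername@ssh-gateway.ucl.ac.uk -N

# Terminal 2 — connect through the local end of the tunnel; -P picks the
# forwarded port. Expect a host-key prompt for "localhost" on first use.
sftp -P 3333 yourUsername@localhost
```

Inside the `sftp` session, `put` and `get` transfer files much as FileZilla's drag-and-drop would.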