|
4 | 4 | \frametitle{Profiling}
|
5 | 5 | \begin{block}{Conceptually}
|
6 | 6 | \begin{itemize}
|
7 |
| - \item take a measurement of a performance aspect of a program |
| 7 | + \item Take a measurement of a performance aspect of a program |
8 | 8 | \begin{itemize}
|
9 |
| - \item where in my code is most of the time spent? |
10 |
| - \item is my program compute or memory bound? |
11 |
| - \item does my program make good use of the cache? |
12 |
| - \item is my program using all cores most of the time? |
13 |
| - \item how often are threads blocked and why? |
14 |
| - \item which API calls are made and in which order? |
| 9 | + \item Where in my code is most of the time spent? |
| 10 | + \item Is my program compute or memory bound? |
| 11 | + \item Does my program make good use of the cache? |
| 12 | + \item Is my program using all cores most of the time? |
| 13 | + \item How often are threads blocked and why? |
| 14 | + \item Which API calls are made and in which order? |
15 | 15 | \item \dots
|
16 | 16 | \end{itemize}
|
17 |
| - \item the goal is to find performance bottlenecks |
18 |
| - \item is usually done on a compiled program, not on source code |
| 17 | + \item The goal is to find performance bottlenecks |
| 18 | + \item Usually done on a compiled program, not on source code |
19 | 19 | \end{itemize}
|
20 | 20 | \end{block}
|
21 | 21 | \end{frame}
|
22 | 22 |
|
23 | 23 | \begin{frame}[fragile]
|
24 |
| - \frametitle{perf, VTune and uProf} |
25 |
| - \begin{block}{perf} |
| 24 | + \frametitle{\mintinline{bash}{perf} -- Performance analysis tools for Linux} |
| 25 | + \setlength{\leftmargini}{0pt} |
26 | 26 | \begin{itemize}
|
27 |
| - \item perf is a powerful command line profiling tool for linux |
28 |
| - \item compile with \mintinline{bash}{-g -fno-omit-frame-pointer} |
29 |
| - \item \mintinline{bash}{perf stat -d <prg>} gathers performance statistics while running \mintinline{bash}{<prg>} |
30 |
| - \item \mintinline{bash}{perf record -g <prg>} starts profiling \mintinline{bash}{<prg>} |
31 |
| - \item \mintinline{bash}{perf report} displays a report from the last profile |
32 |
| - \item More information in \href{https://perf.wiki.kernel.org/index.php/Main_Page}{this wiki}, \href{https://www.brendangregg.com/linuxperf.html}{this website} or \href{https://indico.cern.ch/event/980497/contributions/4130271/attachments/2161581/3647235/linux-systems-performance.pdf}{this talk}. |
| 27 | + \item Powerful command line profiling tool for Linux |
| 28 | + \item Not portable: the source code is part of the Linux kernel itself |
| 29 | + \item Much lower overhead compared with \mintinline{bash}{valgrind} |
| 30 | + \item To use it, compile your code with \mintinline{bash}{-g -fno-omit-frame-pointer} |
| 31 | + \item Counting and sampling |
| 32 | + \begin{itemize} |
| 33 | + \item Counting -- count occurrences of a given event (e.g.\ cache misses) |
| 34 | + \item Time-based sampling -- sample the stack at regular time intervals |
| 35 | + \item Event-based sampling -- take samples when an event counter overflows |
| 36 | + \item Instruction-based sampling -- sample instructions and precisely count events they create |
| 37 | + \end{itemize} |
| 38 | + \item Static and dynamic tracing |
| 39 | + \begin{itemize} |
| 40 | + \item Static -- pre-defined tracepoints in software (e.g.\ scheduling events) |
| 41 | + \item Dynamic -- tracepoints created dynamically with \mintinline{bash}{perf probe} |
| 42 | + \end{itemize} |
33 | 43 | \end{itemize}
|
34 |
| - \end{block} |
35 |
| - \begin{block}{Intel VTune and AMD uProf} |
36 |
| - \begin{itemize} |
37 |
| - \item Graphical profilers from CPU vendors with rich features |
38 |
| - \item Needs vendor's CPU for full experience |
39 |
| - \item More information on \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{Intel's website} and \href{https://developer.amd.com/amd-uprof/}{AMD's website} |
40 |
| - \end{itemize} |
41 |
| - \end{block} |
| 44 | +\end{frame} |
| 45 | + |
| 46 | +\begin{frame}[fragile] |
| 47 | + \frametitle{\mintinline{bash}{perf} commands} |
| 48 | + { \scriptsize |
| 49 | + \begin{block}{} |
| 50 | + \begin{minted}{shell-session} |
| 51 | +$ perf |
| 52 | + usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS] |
| 53 | + The most commonly used perf commands are: |
| 54 | + annotate Read perf.data and display annotated code |
| 55 | + c2c Shared Data C2C/HITM Analyzer. |
| 56 | + config Get and set variables in a configuration file. |
| 57 | + diff Read perf.data and display the differential profile |
| 58 | + evlist List the event names in a perf.data file |
| 59 | + list List all symbolic event types |
| 60 | + mem Profile memory accesses |
| 61 | + record Run a command and record its profile into perf.data |
| 62 | + report Read perf.data and display the profile |
| 63 | + sched Tool to trace/measure scheduler properties (latencies) |
| 64 | + script Read perf.data and display trace output |
| 65 | + stat Run command and gather performance counter statistics |
| 66 | + top System profiling tool. |
| 67 | + version display the version of perf binary |
| 68 | + probe Define new dynamic tracepoints |
| 69 | + trace strace inspired tool |
| 70 | + See 'perf help COMMAND' for more information on a specific command. |
| 71 | + \end{minted} |
| 72 | + \end{block} |
| 73 | + } |
| 74 | +\end{frame} |
| 75 | +
|
| 76 | +\begin{frame}[fragile] |
| 77 | + \frametitle{Listing events with \mintinline{bash}{perf list}} |
| 78 | + { \scriptsize |
| 79 | + \begin{block}{} |
| 80 | + \begin{minted}{shell-session} |
| 81 | +$ # List main hardware events |
| 82 | +$ perf list hw |
| 83 | +
|
| 84 | +List of pre-defined events (to be used in -e): |
| 85 | +
|
| 86 | + branch-instructions OR branches [Hardware event] |
| 87 | + branch-misses [Hardware event] |
| 88 | + cache-misses [Hardware event] |
| 89 | + cache-references [Hardware event] |
| 90 | + cpu-cycles OR cycles [Hardware event] |
| 91 | + instructions [Hardware event] |
| 92 | +
|
| 93 | +$ # List main software/cache events |
| 94 | +$ perf list sw |
| 95 | +$ perf list cache |
| 96 | +
|
| 97 | +$ # List all pre-defined metrics |
| 98 | +$ perf list metric |
| 99 | +
|
| 100 | +$ # List all currently known events: |
| 101 | +$ perf list |
| 102 | + \end{minted} |
| 103 | + \end{block} |
| 104 | + } |
| 105 | +\end{frame} |
| 106 | +
|
| 107 | +\begin{frame}[fragile] |
| 108 | + \frametitle{Counting events with \mintinline{bash}{perf stat}} |
| 109 | + { \scriptsize |
| 110 | + \begin{block}{} |
| 111 | + \begin{minted}{shell-session} |
| 112 | +$ # Standard CPU counter statistics for the specified command: |
| 113 | +$ perf stat <command> |
| 114 | +
|
| 115 | +$ # Detailed CPU counter statistics for the specified command: |
| 116 | +$ perf stat -d <command> |
| 117 | +$ perf stat -dd <command> |
| 118 | +
|
| 119 | +$ # Top-down microarchitecture analysis for the entire system, for 10s: |
| 120 | +$ perf stat -a --topdown -- sleep 10 |
| 121 | +
|
| 122 | +$ # L1 cache hit rate reported every 1000 ms for the specified command: |
| 123 | +$ perf stat -e L1-dcache-loads,L1-dcache-load-misses -I 1000 <command> |
| 124 | +
|
| 125 | +$ # Instructions per cycle and instruction-level parallelism, for command: |
| 126 | +$ perf stat -M IPC,ILP -- <command> |
| 127 | +
|
| 128 | +$ # Measure GFLOPs system-wide, until Ctrl-C is used to stop: |
| 129 | +$ perf stat -M GFLOPs |
| 130 | +
|
| 131 | +$ # Measure cycles and instructions 10 times, report results with stddev: |
| 132 | +$ perf stat -e cycles,instructions -r 10 -- <command> |
| 133 | + \end{minted} |
| 134 | + \end{block} |
| 135 | + } |
| 136 | +\end{frame} |
| 137 | +
|
| 138 | +
|
| 139 | +\begin{frame}[fragile] |
| 140 | + \frametitle{Recording profiling information with \mintinline{bash}{perf record}} |
| 141 | + { \scriptsize |
| 142 | + \begin{block}{} |
| 143 | + \begin{minted}{shell-session} |
| 144 | +$ # Sample on-CPU functions for the specified command, at 100 Hertz: |
| 145 | +$ perf record -F 100 -- <command> |
| 146 | +
|
| 147 | +$ # Sample CPU stack traces (via frame pointers), at 100 Hertz, for 10s: |
| 148 | +$ perf record -F 100 -g -- sleep 10 |
| 149 | +
|
| 150 | +$ # Sample stack traces for PID using DWARF to unwind stacks, for 10s: |
| 151 | +$ perf record -p <PID> --call-graph=dwarf -- sleep 10 |
| 152 | +
|
| 153 | +$ # Precise on-CPU user stack traces (constant skid) using PEBS (Intel CPUs): |
| 154 | +$ perf record -g -e cycles:up -- <command> |
| 155 | +
|
| 156 | +$ # Sample CPU stack traces using Instruction-based sampling (AMD CPUs): |
| 157 | +$ # (Note that you need to use system-wide sampling for IBS on AMD CPUs) |
| 158 | +$ perf record -a -g -e cycles:pp -- <command> |
| 159 | +
|
| 160 | +$ # Sample CPU stack traces once every 10k L1 data cache misses, for 5s: |
| 161 | +$ perf record -a -g -e L1-dcache-load-misses -c 10000 -- sleep 5 |
| 162 | +
|
| 163 | +$ # Sample CPUs at 100 Hertz, and show top addresses and symbols, live: |
| 164 | +$ perf top -F 100 |
| 165 | + \end{minted} |
| 166 | + \end{block} |
| 167 | + } |
| 168 | +\end{frame} |
| 169 | +
|
| 170 | +\begin{frame}[fragile] |
| 171 | + \frametitle{Reporting and annotating source code with \mintinline{bash}{perf}} |
| 172 | + { \scriptsize |
| 173 | + \begin{block}{} |
| 174 | + \begin{minted}{shell-session} |
| 175 | +$ # Standard reporting of perf.data in the text UI: |
| 176 | +$ perf report |
| 177 | +
|
| 178 | +$ # Report by self-time (excluding time spent in callees): |
| 179 | +$ perf report --no-children |
| 180 | +
|
| 181 | +$ # Report per source line of code (needs debugging info to work): |
| 182 | +$ perf report --no-children -s srcline |
| 183 | +
|
| 184 | +$ # Single inverted (caller-based) call-graph per binary: |
| 185 | +$ perf report --inverted -s comm |
| 186 | +
|
| 187 | +$ # Text-based report per library, without call graph: |
| 188 | +$ perf report --stdio -g none -s dso |
| 189 | +
|
| 190 | +$ # Hierarchical report for functions taking at least 1% of runtime: |
| 191 | +$ perf report --stdio -g none --hierarchy --percent-limit 1 |
| 192 | +
|
| 193 | +$ # Disassemble and annotate a symbol (instructions with percentages): |
| 194 | +$ # (Needs debugging information available to show source code as well) |
| 195 | +$ perf annotate <symbol> |
| 196 | + \end{minted} |
| 197 | + \end{block} |
| 198 | + } |
| 199 | +\end{frame} |
| 200 | +
|
| 201 | +\begin{frame}[fragile] |
| 202 | + \frametitle{Further information on \mintinline{bash}{perf}} |
| 203 | + \begin{itemize} |
| 204 | + \item Official documentation in the Linux repository at |
| 205 | + \href{https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation} |
| 206 | + {linux/tools/perf/Documentation} |
| 207 | + \item Perf Wiki at \url{https://perf.wiki.kernel.org/} |
| 208 | + \item Linux \mintinline{bash}{perf} examples by Brendan Gregg |
| 209 | + \url{https://www.brendangregg.com/linuxperf.html} |
| 210 | + \item Scripts to visualize profiles as flamegraphs |
| 211 | + \url{https://github.com/brendangregg/FlameGraph} |
| 212 | + \item HSF Tools \& Packaging Working Group talk on Indico\\ |
| 213 | + \href{https://indico.cern.ch/event/974382/} |
| 214 | + {Linux Systems Performance: Tracing, Profiling \& Visualization} |
| 215 | + \end{itemize} |
| 216 | +\end{frame} |
| 217 | +
|
| 218 | +\begin{frame}[fragile] |
| 219 | + \frametitle{Intel VTune Profiler} |
| 220 | + \centering |
| 221 | + \includegraphics[width=0.75\textwidth]{tools/vtune.png} |
| 222 | + \begin{itemize} |
| 223 | + \item Very powerful GUI-based profiler for Intel CPUs and GPUs |
| 224 | + \item Now free to use with |
| 225 | + \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html}{Intel oneAPI Base Toolkit} or |
| 226 | + \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{standalone} |
| 227 | + \item See the \href{https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/} |
| 228 | + {official online documentation} for more information |
| 229 | + \end{itemize} |
42 | 230 | \end{frame}
|
0 commit comments