Skip to content

Commit cc89f13

Browse files
committed
Expand section on profilers (perf and VTune)
1 parent d054d21 commit cc89f13

File tree

2 files changed

+213
-25
lines changed

2 files changed

+213
-25
lines changed

talk/tools/profiling.tex

+213-25
Original file line numberDiff line numberDiff line change
@@ -4,39 +4,227 @@
44
\frametitle{Profiling}
55
\begin{block}{Conceptually}
66
\begin{itemize}
7-
\item take a measurement of a performance aspect of a program
7+
\item Take a measurement of a performance aspect of a program
88
\begin{itemize}
9-
\item where in my code is most of the time spent?
10-
\item is my program compute or memory bound?
11-
\item does my program make good use of the cache?
12-
\item is my program using all cores most of the time?
13-
\item how often are threads blocked and why?
14-
\item which API calls are made and in which order?
9+
\item Where in my code is most of the time spent?
10+
\item Is my program compute or memory bound?
11+
\item Does my program make good use of the cache?
12+
\item Is my program using all cores most of the time?
13+
\item How often are threads blocked and why?
14+
\item Which API calls are made and in which order?
1515
\item \dots
1616
\end{itemize}
17-
\item the goal is to find performance bottlenecks
18-
\item is usually done on a compiled program, not on source code
17+
\item The goal is to find performance bottlenecks
18+
\item Usually done on a compiled program, not on source code
1919
\end{itemize}
2020
\end{block}
2121
\end{frame}
2222

2323
\begin{frame}[fragile]
24-
\frametitle{perf, VTune and uProf}
25-
\begin{block}{perf}
24+
\frametitle{\mintinline{bash}{perf} -- Performance analysis tools for Linux}
25+
\setlength{\leftmargini}{0pt}
2626
\begin{itemize}
27-
\item perf is a powerful command line profiling tool for linux
28-
\item compile with \mintinline{bash}{-g -fno-omit-frame-pointer}
29-
\item \mintinline{bash}{perf stat -d <prg>} gathers performance statistics while running \mintinline{bash}{<prg>}
30-
\item \mintinline{bash}{perf record -g <prg>} starts profiling \mintinline{bash}{<prg>}
31-
\item \mintinline{bash}{perf report} displays a report from the last profile
32-
\item More information in \href{https://perf.wiki.kernel.org/index.php/Main_Page}{this wiki}, \href{https://www.brendangregg.com/linuxperf.html}{this website} or \href{https://indico.cern.ch/event/980497/contributions/4130271/attachments/2161581/3647235/linux-systems-performance.pdf}{this talk}.
27+
\item Powerful command line profiling tool for Linux
28+
\item Not portable -- the source code is part of the Linux kernel itself
29+
\item Much lower overhead compared with \mintinline{bash}{valgrind}
30+
\item To use it, compile your code with \mintinline{bash}{-g -fno-omit-frame-pointer}
31+
\item Counting and sampling
32+
\begin{itemize}
33+
\item Counting -- count occurrences of a given event (e.g.\ cache misses)
34+
\item Time-based sampling -- sample the stack at regular time intervals
35+
\item Event-based sampling -- take samples when event counter overflows
36+
\item Instruction-based sampling -- sample instructions and precisely count events they create
37+
\end{itemize}
38+
\item Static and dynamic tracing
39+
\begin{itemize}
40+
\item Static -- pre-defined tracepoints in software (e.g.\ scheduling events)
41+
\item Dynamic -- tracepoints created dynamically with \mintinline{bash}{perf probe}
42+
\end{itemize}
3343
\end{itemize}
34-
\end{block}
35-
\begin{block}{Intel VTune and AMD uProf}
36-
\begin{itemize}
37-
\item Graphical profilers from CPU vendors with rich features
38-
\item Needs vendor's CPU for full experience
39-
\item More information on \href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{Intel's website} and \href{https://developer.amd.com/amd-uprof/}{AMD's website}
40-
\end{itemize}
41-
\end{block}
44+
\end{frame}
45+
46+
\begin{frame}[fragile]
47+
\frametitle{\mintinline{bash}{perf} commands}
48+
{ \scriptsize
49+
\begin{block}{}
50+
\begin{minted}{shell-session}
51+
$ perf
52+
usage: perf [--version] [--help] [OPTIONS] COMMAND [ARGS]
53+
The most commonly used perf commands are:
54+
annotate Read perf.data and display annotated code
55+
c2c Shared Data C2C/HITM Analyzer.
56+
config Get and set variables in a configuration file.
57+
diff Read perf.data and display the differential profile
58+
evlist List the event names in a perf.data file
59+
list List all symbolic event types
60+
mem Profile memory accesses
61+
record Run a command and record its profile into perf.data
62+
report Read perf.data and display the profile
63+
sched Tool to trace/measure scheduler properties (latencies)
64+
script Read perf.data and display trace output
65+
stat Run command and gather performance counter statistics
66+
top System profiling tool.
67+
version display the version of perf binary
68+
probe Define new dynamic tracepoints
69+
trace strace inspired tool
70+
See 'perf help COMMAND' for more information on a specific command.
71+
\end{minted}
72+
\end{block}
73+
}
74+
\end{frame}
75+
76+
\begin{frame}[fragile]
77+
\frametitle{Listing events with \mintinline{bash}{perf list}}
78+
{ \scriptsize
79+
\begin{block}{}
80+
\begin{minted}{shell-session}
81+
$ # List main hardware events
82+
$ perf list hw
83+
84+
List of pre-defined events (to be used in -e):
85+
86+
branch-instructions OR branches [Hardware event]
87+
branch-misses [Hardware event]
88+
cache-misses [Hardware event]
89+
cache-references [Hardware event]
90+
cpu-cycles OR cycles [Hardware event]
91+
instructions [Hardware event]
92+
93+
$ # List main software/cache events
94+
$ perf list sw
95+
$ perf list cache
96+
97+
$ # List all pre-defined metrics
98+
$ perf list metric
99+
100+
$ # List all currently known events:
101+
$ perf list
102+
\end{minted}
103+
\end{block}
104+
}
105+
\end{frame}
106+
107+
\begin{frame}[fragile]
108+
\frametitle{Counting events with \mintinline{bash}{perf stat}}
109+
{ \scriptsize
110+
\begin{block}{}
111+
\begin{minted}{shell-session}
112+
$ # Standard CPU counter statistics for the specified command:
113+
$ perf stat <command>
114+
115+
$ # Detailed CPU counter statistics for the specified command:
116+
$ perf stat -d <command>
117+
$ perf stat -dd <command>
118+
119+
$ # Top-down microarchitecture analysis for the entire system, for 10s:
120+
$ perf stat -a --topdown -- sleep 10
121+
122+
$ # L1 cache hit rate reported every 1000 ms for the specified command:
123+
$ perf stat -e L1-dcache-loads,L1-dcache-load-misses -I 1000 <command>
124+
125+
$ # Instructions per cycle and instruction-level parallelism, for command:
126+
$ perf stat -M IPC,ILP -- <command>
127+
128+
$ # Measure GFLOPs system-wide, until Ctrl-C is used to stop:
129+
$ perf stat -M GFLOPs
130+
131+
$ # Measure cycles and instructions 10 times, report results with stddev:
132+
$ perf stat -e cycles,instructions -r 10 -- <command>
133+
\end{minted}
134+
\end{block}
135+
}
136+
\end{frame}
137+
138+
139+
\begin{frame}[fragile]
140+
\frametitle{Recording profiling information with \mintinline{bash}{perf record}}
141+
{ \scriptsize
142+
\begin{block}{}
143+
\begin{minted}{shell-session}
144+
$ # Sample on-CPU functions for the specified command, at 100 Hertz:
145+
$ perf record -F 100 -- <command>
146+
147+
$ # Sample CPU stack traces (via frame pointers), at 100 Hertz, for 10s:
148+
$ perf record -F 100 -a -g -- sleep 10
149+
150+
$ # Sample stack traces for PID using DWARF to unwind stacks, for 10s:
151+
$ perf record -p <PID> --call-graph=dwarf -- sleep 10
152+
153+
$ # Precise on-CPU user stack traces (no skid) using PEBS (Intel CPUs):
154+
$ perf record -g -e cycles:up -- <command>
155+
156+
$ # Sample CPU stack traces using Instruction-based sampling (AMD CPUs):
157+
$ # (Note that you need to use system-wide sampling for IBS on AMD CPUs)
158+
$ perf record -a -g -e cycles:pp -- <command>
159+
160+
$ # Sample CPU stack traces once every 10k L1 data cache misses, for 5s:
161+
$ perf record -a -g -e L1-dcache-load-misses -c 10000 -- sleep 5
162+
163+
$ # Sample CPUs at 100 Hertz, and show top addresses and symbols, live:
164+
$ perf top -F 100
165+
\end{minted}
166+
\end{block}
167+
}
168+
\end{frame}
169+
170+
\begin{frame}[fragile]
171+
\frametitle{Reporting and annotating source code with \mintinline{bash}{perf}}
172+
{ \scriptsize
173+
\begin{block}{}
174+
\begin{minted}{shell-session}
175+
$ # Standard reporting of perf.data in the text user interface (TUI):
176+
$ perf report
177+
178+
$ # Report by self-time (excluding time spent in callees):
179+
$ perf report --no-children
180+
181+
$ # Report per source line of code (needs debugging info to work):
182+
$ perf report --no-children -s srcline
183+
184+
$ # Single inverted (caller-based) call-graph per binary:
185+
$ perf report --inverted -s comm
186+
187+
$ # Text-based report per library, without call graph:
188+
$ perf report --stdio -g none -s dso
189+
190+
$ # Hierarchical report for functions taking at least 1% of runtime:
191+
$ perf report --stdio -g none --hierarchy --percent-limit 1
192+
193+
$ # Disassemble and annotate a symbol (instructions with percentages):
194+
$ # (Needs debugging information available to show source code as well)
195+
$ perf annotate <symbol>
196+
\end{minted}
197+
\end{block}
198+
}
199+
\end{frame}
200+
201+
\begin{frame}[fragile]
202+
\frametitle{Further information on \mintinline{bash}{perf}}
203+
\begin{itemize}
204+
\item Official documentation in the Linux repository at
205+
\href{https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation}
206+
{linux/tools/perf/Documentation}
207+
\item Perf Wiki at \url{https://perf.wiki.kernel.org/}
208+
\item Linux \mintinline{bash}{perf} examples by Brendan Gregg
209+
\url{https://www.brendangregg.com/linuxperf.html}
210+
\item Scripts to visualize profiles as flamegraphs
211+
\url{https://github.com/brendangregg/FlameGraph}
212+
\item HSF Tools \& Packaging Working Group talk on Indico\\
213+
\href{https://indico.cern.ch/event/974382/}
214+
{Linux Systems Performance: Tracing, Profiling \& Visualization}
215+
\end{itemize}
216+
\end{frame}
217+
218+
\begin{frame}[fragile]
219+
\frametitle{Intel VTune Profiler}
220+
\centering
221+
\includegraphics[width=0.75\textwidth]{tools/vtune.png}
222+
\begin{itemize}
223+
\item Very powerful GUI-based profiler for Intel CPUs and GPUs
224+
\item Now free to use with
225+
\href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/toolkits.html}{Intel oneAPI Base Toolkit} or
226+
\href{https://www.intel.com/content/www/us/en/developer/tools/oneapi/vtune-profiler.html}{standalone}
227+
\item See the \href{https://www.intel.com/content/www/us/en/develop/documentation/vtune-help/}
228+
{official online documentation} for more information
229+
\end{itemize}
42230
\end{frame}

talk/tools/vtune.png

165 KB
Loading

0 commit comments

Comments
 (0)