Skip to content

Commit 80e6574

Browse files
author
Paweł Szulik
authored
Add mon_group support for resctrl. (google#2793)
* Add mon_group support for resctrl. Signed-off-by: Paweł Szulik <[email protected]> * Do not try to setup the root container. Signed-off-by: Paweł Szulik <[email protected]> * Update klog version to avoid errors from golangci-lint. Signed-off-by: Paweł Szulik <[email protected]> * Update klog version in cmd to avoid errors from golangci-lint. Signed-off-by: Paweł Szulik <[email protected]> * Fix go.sum Signed-off-by: Paweł Szulik <[email protected]> * Check if container moved between control groups only if its running. Signed-off-by: Paweł Szulik <[email protected]> * Get NUMA nodes from MachineInfo.Topology. Signed-off-by: Paweł Szulik <[email protected]> * Make code thread safe again. Signed-off-by: Paweł Szulik <[email protected]> * Fix typo. Signed-off-by: Paweł Szulik <[email protected]> * Refactor resctrl collector setup. Signed-off-by: Paweł Szulik <[email protected]> * Refactor resctrl utilies. Signed-off-by: Paweł Szulik <[email protected]> * Better name vars. Signed-off-by: Paweł Szulik <[email protected]> * Add missing python3 in Dockerfile. Signed-off-by: Paweł Szulik <[email protected]> * Add missing procps in Dockerfile. Signed-off-by: Paweł Szulik <[email protected]> * Use const instead of magic value. Signed-off-by: Paweł Szulik <[email protected]> * Delete an unnecessary setting of c.running to false. Signed-off-by: Paweł Szulik <[email protected]> * Do not wrap the error from cAdvisor. Signed-off-by: Paweł Szulik <[email protected]> * Use path in error message. Signed-off-by: Paweł Szulik <[email protected]> * Avoid goroutine looping. Signed-off-by: Paweł Szulik <[email protected]> * Do not use fscommon package from runc/libcontainer. Signed-off-by: Paweł Szulik <[email protected]> * Fix const ASCII names. Signed-off-by: Paweł Szulik <[email protected]> * Use same operator in func. Signed-off-by: Paweł Szulik <[email protected]> * Introduce const variables. Signed-off-by: Paweł Szulik <[email protected]> * Introduce vendor_id in MachineInfo. 
Signed-off-by: Paweł Szulik <[email protected]> * Extend files which should be omitted when searching control group. Signed-off-by: Paweł Szulik <[email protected]> * Add info about possible bug when reading resctrl values on AMD. Signed-off-by: Paweł Szulik <[email protected]> * Use empty struct map instead of boolean. Signed-off-by: Paweł Szulik <[email protected]> * Move reading file logic. Signed-off-by: Paweł Szulik <[email protected]> * Use scanner to read tasks file. Signed-off-by: Paweł Szulik <[email protected]> * Change the way of searching for the control group. Signed-off-by: Paweł Szulik <[email protected]> * Add comments. Use const value. Signed-off-by: Paweł Szulik <[email protected]> * Comment function. Signed-off-by: Paweł Szulik <[email protected]> * Fix typo. Signed-off-by: Paweł Szulik <[email protected]> * Refactor getAllProcessThreads. Signed-off-by: Paweł Szulik <[email protected]> * Refactor GetVendorID. Signed-off-by: Paweł Szulik <[email protected]> * Rename VendorID. Signed-off-by: Paweł Szulik <[email protected]> * Resctrl collector should be aware of existing mon groups. Signed-off-by: Paweł Szulik <[email protected]> * Optimization for finding control/monitoring group. Signed-off-by: Paweł Szulik <[email protected]> * Avoid having ugly errors. Signed-off-by: Paweł Szulik <[email protected]> * Use strings.HasPrefix(). Signed-off-by: Paweł Szulik <[email protected]> * Add comments. Signed-off-by: Paweł Szulik <[email protected]> * Rename variables. Signed-off-by: Paweł Szulik <[email protected]> * Fix test. Signed-off-by: Paweł Szulik <[email protected]> * Use string map instead of int. Signed-off-by: Paweł Szulik <[email protected]> * Now there is no need to use procps in Dockerfile. Signed-off-by: Paweł Szulik <[email protected]> * Update to go 1.17. Signed-off-by: Paweł Szulik <[email protected]> * Add information about possible race condition. Signed-off-by: Paweł Szulik <[email protected]> * Add warning when docker_only is not set. 
Signed-off-by: Paweł Szulik <[email protected]> * Fix typo. Signed-off-by: Paweł Szulik <[email protected]>
1 parent 0549d48 commit 80e6574

19 files changed

+1570
-64
lines changed

cmd/cadvisor.go

+3-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ var rawCgroupPrefixWhiteList = flag.String("raw_cgroup_prefix_whitelist", "", "A
7373

7474
var perfEvents = flag.String("perf_events_config", "", "Path to a JSON file containing configuration of perf events to measure. Empty value disabled perf events measuring.")
7575

76+
var resctrlInterval = flag.Duration("resctrl_interval", 0, "Resctrl mon groups updating interval. Zero value disables updating mon groups.")
77+
7678
var (
7779
// Metrics to be ignored.
7880
// Tcp metrics are ignored by default.
@@ -131,7 +133,7 @@ func main() {
131133

132134
collectorHttpClient := createCollectorHttpClient(*collectorCert, *collectorKey)
133135

134-
resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHttpClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents)
136+
resourceManager, err := manager.New(memoryStorage, sysFs, manager.HousekeepingConfigFlags, includedMetrics, &collectorHttpClient, strings.Split(*rawCgroupPrefixWhiteList, ","), strings.Split(*envMetadataWhiteList, ","), *perfEvents, *resctrlInterval)
135137
if err != nil {
136138
klog.Fatalf("Failed to create a manager: %s", err)
137139
}

container/raw/factory.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ import (
2929
"k8s.io/klog/v2"
3030
)
3131

32-
var dockerOnly = flag.Bool("docker_only", false, "Only report docker containers in addition to root stats")
32+
var DockerOnly = flag.Bool("docker_only", false, "Only report docker containers in addition to root stats")
3333
var disableRootCgroupStats = flag.Bool("disable_root_cgroup_stats", false, "Disable collecting root Cgroup stats")
3434

3535
type rawFactory struct {
@@ -69,7 +69,7 @@ func (f *rawFactory) CanHandleAndAccept(name string) (bool, bool, error) {
6969
if name == "/" {
7070
return true, true, nil
7171
}
72-
if *dockerOnly && f.rawPrefixWhiteList[0] == "" {
72+
if *DockerOnly && f.rawPrefixWhiteList[0] == "" {
7373
return true, false, nil
7474
}
7575
for _, prefix := range f.rawPrefixWhiteList {

docs/runtime_options.md

+8
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,14 @@ should be a human readable string that will become a metric name.
422422
* `cas_count_read` will be measured as uncore non-grouped event on all Integrated Memory Controllers Performance Monitoring Units because of unset `type` field and
423423
`uncore_imc` prefix.
424424

425+
## Resctrl
426+
To gather metrics, cAdvisor creates its own monitoring groups with the `cadvisor` prefix.
427+
428+
The resctrl file system is not hierarchical like cgroups, so users should set the `--docker_only` flag to avoid race conditions and unexpected behaviours.
429+
430+
```
431+
--resctrl_interval=0: Resctrl mon groups updating interval. Zero value disables updating mon groups.
432+
```
425433

426434
## Storage driver specific instructions:
427435

info/v1/machine.go

+4
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ type MachineInfo struct {
178178
// The time of this information point.
179179
Timestamp time.Time `json:"timestamp"`
180180

181+
// Vendor id of CPU.
182+
CPUVendorID string `json:"vendor_id"`
183+
181184
// The number of cores in this machine.
182185
NumCores int `json:"num_cores"`
183186

@@ -249,6 +252,7 @@ func (m *MachineInfo) Clone() *MachineInfo {
249252
}
250253
}
251254
copy := MachineInfo{
255+
CPUVendorID: m.CPUVendorID,
252256
Timestamp: m.Timestamp,
253257
NumCores: m.NumCores,
254258
NumPhysicalCores: m.NumPhysicalCores,

machine/info.go

+1
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ func Info(sysFs sysfs.SysFs, fsInfo fs.FsInfo, inHostNamespace bool) (*info.Mach
121121

122122
machineInfo := &info.MachineInfo{
123123
Timestamp: time.Now(),
124+
CPUVendorID: GetCPUVendorID(cpuinfo),
124125
NumCores: numCores,
125126
NumPhysicalCores: GetPhysicalCores(cpuinfo),
126127
NumSockets: GetSockets(cpuinfo),

machine/machine.go

+16
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ var (
4343
cpuClockSpeedMHz = regexp.MustCompile(`(?:cpu MHz|CPU MHz|clock)\s*:\s*([0-9]+\.[0-9]+)(?:MHz)?`)
4444
memoryCapacityRegexp = regexp.MustCompile(`MemTotal:\s*([0-9]+) kB`)
4545
swapCapacityRegexp = regexp.MustCompile(`SwapTotal:\s*([0-9]+) kB`)
46+
vendorIDRegexp = regexp.MustCompile(`vendor_id\s*:\s*(\w+)`)
4647

4748
cpuBusPath = "/sys/bus/cpu/devices/"
4849
isMemoryController = regexp.MustCompile("mc[0-9]+")
@@ -54,6 +55,21 @@ var (
5455
const memTypeFileName = "dimm_mem_type"
5556
const sizeFileName = "size"
5657

58+
// GetCPUVendorID returns the "vendor_id" value parsed from the given /proc/cpuinfo contents, or an empty string if it cannot be found.
59+
func GetCPUVendorID(procInfo []byte) string {
60+
vendorID := ""
61+
62+
matches := vendorIDRegexp.FindSubmatch(procInfo)
63+
if len(matches) != 2 {
64+
klog.Warning("Cannot read vendor id correctly, set empty.")
65+
return vendorID
66+
}
67+
68+
vendorID = string(matches[1])
69+
70+
return vendorID
71+
}
72+
5773
// GetPhysicalCores returns number of CPU cores reading /proc/cpuinfo file or if needed information from sysfs cpu path
5874
func GetPhysicalCores(procInfo []byte) int {
5975
numCores := getUniqueMatchesCount(string(procInfo), coreRegExp)

machine/topology_test.go

+24
Original file line numberDiff line numberDiff line change
@@ -462,3 +462,27 @@ func TestClockSpeedOnCpuLowerCase(t *testing.T) {
462462
assert.NotNil(t, clockSpeed)
463463
assert.Equal(t, uint64(1450*1000), clockSpeed)
464464
}
465+
466+
func TestGetCPUVendorID(t *testing.T) {
467+
var testCases = []struct {
468+
file string
469+
expected string
470+
}{
471+
{
472+
"./testdata/cpuinfo_onesocket_many_NUMAs",
473+
"GenuineIntel",
474+
},
475+
{
476+
"./testdata/cpuinfo_arm",
477+
"",
478+
},
479+
}
480+
481+
for _, test := range testCases {
482+
testcpuinfo, err := ioutil.ReadFile(test.file)
483+
assert.Nil(t, err)
484+
assert.NotNil(t, testcpuinfo)
485+
cpuVendorID := GetCPUVendorID(testcpuinfo)
486+
assert.Equal(t, test.expected, cpuVendorID)
487+
}
488+
}

manager/container.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ func (cd *containerData) Stop() error {
130130
}
131131
close(cd.stop)
132132
cd.perfCollector.Destroy()
133+
cd.resctrlCollector.Destroy()
133134
return nil
134135
}
135136

@@ -727,7 +728,7 @@ func (cd *containerData) updateStats() error {
727728
return perfStatsErr
728729
}
729730
if resctrlStatsErr != nil {
730-
klog.Errorf("error occurred while collecting resctrl stats for container %s: %s", cInfo.Name, err)
731+
klog.Errorf("error occurred while collecting resctrl stats for container %s: %s", cInfo.Name, resctrlStatsErr)
731732
return resctrlStatsErr
732733
}
733734
return customStatsErr

manager/manager.go

+10-14
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ import (
4949

5050
"github.com/opencontainers/runc/libcontainer/cgroups"
5151
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
52-
"github.com/opencontainers/runc/libcontainer/intelrdt"
5352

5453
"k8s.io/klog/v2"
5554
"k8s.io/utils/clock"
@@ -152,7 +151,7 @@ type HouskeepingConfig = struct {
152151
}
153152

154153
// New takes a memory storage and returns a new manager.
155-
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string) (Manager, error) {
154+
func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig HouskeepingConfig, includedMetricsSet container.MetricSet, collectorHTTPClient *http.Client, rawContainerCgroupPathPrefixWhiteList, containerEnvMetadataWhiteList []string, perfEventsFile string, resctrlInterval time.Duration) (Manager, error) {
156155
if memoryCache == nil {
157156
return nil, fmt.Errorf("manager requires memory storage")
158157
}
@@ -224,7 +223,7 @@ func New(memoryCache *memory.InMemoryCache, sysfs sysfs.SysFs, houskeepingConfig
224223
return nil, err
225224
}
226225

227-
newManager.resctrlManager, err = resctrl.NewManager(selfContainer)
226+
newManager.resctrlManager, err = resctrl.NewManager(resctrlInterval, resctrl.Setup, machineInfo.CPUVendorID, inHostNamespace)
228227
if err != nil {
229228
klog.V(4).Infof("Cannot gather resctrl metrics: %v", err)
230229
}
@@ -269,7 +268,7 @@ type manager struct {
269268
collectorHTTPClient *http.Client
270269
nvidiaManager stats.Manager
271270
perfManager stats.Manager
272-
resctrlManager stats.Manager
271+
resctrlManager resctrl.Manager
273272
// List of raw container cgroup path prefix whitelist.
274273
rawContainerCgroupPathPrefixWhiteList []string
275274
// List of container env prefix whitelist, the matched container envs would be collected into metrics as extra labels.
@@ -336,7 +335,7 @@ func (m *manager) Start() error {
336335

337336
func (m *manager) Stop() error {
338337
defer m.nvidiaManager.Destroy()
339-
defer m.destroyPerfCollectors()
338+
defer m.destroyCollectors()
340339
// Stop and wait on all quit channels.
341340
for i, c := range m.quitChannels {
342341
// Send the exit signal and wait on the thread to exit (by closing the channel).
@@ -354,9 +353,10 @@ func (m *manager) Stop() error {
354353
return nil
355354
}
356355

357-
func (m *manager) destroyPerfCollectors() {
356+
func (m *manager) destroyCollectors() {
358357
for _, container := range m.containers {
359358
container.perfCollector.Destroy()
359+
container.resctrlCollector.Destroy()
360360
}
361361
}
362362

@@ -973,14 +973,11 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
973973
}
974974

975975
if m.includedMetrics.Has(container.ResctrlMetrics) {
976-
resctrlPath, err := intelrdt.GetIntelRdtPath(containerName)
976+
cont.resctrlCollector, err = m.resctrlManager.GetCollector(containerName, func() ([]string, error) {
977+
return cont.getContainerPids(m.inHostNamespace)
978+
}, len(m.machineInfo.Topology))
977979
if err != nil {
978-
klog.V(4).Infof("Error getting resctrl path: %q", err)
979-
} else {
980-
cont.resctrlCollector, err = m.resctrlManager.GetCollector(resctrlPath)
981-
if err != nil {
982-
klog.V(4).Infof("resctrl metrics will not be available for container %s: %s", cont.info.Name, err)
983-
}
980+
klog.V(4).Infof("resctrl metrics will not be available for container %s: %s", cont.info.Name, err)
984981
}
985982
}
986983

@@ -1022,7 +1019,6 @@ func (m *manager) createContainerLocked(containerName string, watchSource watche
10221019
if err != nil {
10231020
return err
10241021
}
1025-
10261022
// Start the container's housekeeping.
10271023
return cont.Start()
10281024
}

0 commit comments

Comments
 (0)