Skip to content

Commit

Permalink
Merge pull request #19 from Akrog/sos-report
Browse files Browse the repository at this point in the history
Gather SOS reports
  • Loading branch information
fmount authored Nov 22, 2023
2 parents 0399fba + b9a08b3 commit d86ed1f
Show file tree
Hide file tree
Showing 6 changed files with 181 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ FROM quay.io/openshift/origin-must-gather:4.13.0 as builder

FROM quay.io/centos/centos:stream9

RUN dnf update -y && dnf install rsync python3-pyyaml -y && dnf clean all
RUN dnf update -y && dnf install xz rsync python3-pyyaml -y && dnf clean all

COPY --from=builder /usr/bin/oc /usr/bin/oc

Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,30 @@ In particular the `openstack-must-gather` will get a dump of:
- `CSVs`, `pkgmanifests`, `subscriptions`, `installplans`, `operatorgroup`
- `Pods`, `Deployments`, `Statefulsets`, `ReplicaSets`, `Service`, `Routes`, `ConfigMaps`, (part of / relevant) `Secrets`
- Network related info (`Metallb` info, `IPAddressPool`, `L2Advertisements`, `NetConfig`, `IPSet`)
- SOS reports for OpenShift nodes that are running OpenStack service pods.

### Customize gathered data

Some openstack-must-gather collectors can be configured via environment
variables to behave differently. For example, SOS gathering can be disabled
by passing an empty `SOS_SERVICES` environment variable.

To provide environment variables we need to invoke the gathering command
manually, like this:

```sh
oc adm must-gather --image=quay.io/openstack-k8s-operators/openstack-must-gather -- SOS_SERVICES= gather
```

This is the list of available environment variables:

- `SOS_SERVICES`: Comma separated list of services to gather SOS reports from.
Empty string skips sos report gathering. Eg: `cinder,glance`. Defaults to all
of them.
- `SOS_ONLY_PLUGINS`: List of SOS report plugins to use. Empty string to run
them all. Defaults to: `block,cifs,crio,devicemapper,devices,iscsi,lvm2,
memory,multipath,nfs,nis,nvme,podman,process,processor,selinux,scsi,udev`.


## Development

Expand Down
1 change: 1 addition & 0 deletions collection-scripts/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ declare -a OSP_SERVICES=(
"designate"
"barbican"
"dataplane"
"ceilometer"
)
export OSP_SERVICES

Expand Down
2 changes: 1 addition & 1 deletion collection-scripts/gather_debug
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
FILE=/tmp/rm-to-finish-gathering
MSG=""\
"Must gather entering debug mode, will sleep until file $FILE is deleted\n"\
"You can go into the container now and execute /usr/bin/gather/n"
"You can go into the container now and execute /usr/bin/gather\n"

echo -e "$MSG"

Expand Down
3 changes: 3 additions & 0 deletions collection-scripts/gather_run
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,6 @@ done

# get SVC status (inspect ctlplane)
/usr/bin/gather_services_status

# get SOS Reports
/usr/bin/gather_sos
151 changes: 151 additions & 0 deletions collection-scripts/gather_sos
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!/bin/bash
# Gather SOS reports from the OpenShift nodes that are running OpenStack pods.
# Reports are stored uncompressed inside the must-gather output so there is no
# nested compression of sos reports within the must-gather archive.
#
# Tunables (via environment):
#   SOS_SERVICES:     comma separated list of services to gather SOS reports
#                     from; an empty string skips sos report gathering.
#                     Eg: cinder,glance. Defaults to all of them.
#   SOS_ONLY_PLUGINS: list of sos report plugins to use. Empty string to run
#                     them all. Defaults to: block,cifs,crio,devicemapper,
#                     devices,iscsi,lvm2,memory,multipath,nfs,nis,nvme,podman,
#                     process,processor,selinux,scsi,udev
#
# TODO: Confirm with GSS the default for SOS_ONLY_PLUGINS
# TODO: When we deploy in a non dedicated cluster where we can have to gather
#       from more than 3 nodes we may want to limit the concurrency.

# Resolve the directory this script lives in so common.sh can be sourced
# regardless of the caller's working directory.
script_dir=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# shellcheck disable=SC1091
source "${script_dir}/common.sh"

###############################################################################
# VARIABLE INITIALIZATION

# If SOS_SERVICES is unset (as opposed to set-but-empty), default to the full
# list of known OpenStack services provided by common.sh. The ${var-default}
# (no colon) expansion is what distinguishes unset from empty.
if [ "${SOS_SERVICES-unset}" = "unset" ]; then
    SOS_SERVICES=( "${OSP_SERVICES[@]}" )

# If list of services is set and empty, nothing to do
elif [[ -z "${SOS_SERVICES[*]}" ]]; then
    echo "Skipping SOS gathering for controller nodes"
    exit 0

# If set and non-empty, convert the comma separated string into an array
else
    IFS=',' read -r -a SOS_SERVICES <<< "$SOS_SERVICES"
fi

# Default to some plugins if SOS_ONLY_PLUGINS is not set. Again the colon-less
# expansion matters: an explicitly empty value means "run all plugins".
SOS_ONLY_PLUGINS="${SOS_ONLY_PLUGINS-block,cifs,crio,devicemapper,devices,iscsi,lvm2,memory,multipath,nfs,nis,nvme,podman,process,processor,selinux,scsi,udev}"
if [[ -n "$SOS_ONLY_PLUGINS" ]]; then
    # Kept as a plain string (not an array) because it is later interpolated
    # into a remote `bash -c` command line in gather_node_sos.
    SOS_LIMIT="--only-plugins $SOS_ONLY_PLUGINS"
fi

# Destination directories inside the must-gather output: per-node reports go
# under _all_nodes; per-service directories hold symlinks into it.
SOS_PATH="${BASE_COLLECTION_PATH}/sos-reports"
SOS_PATH_NODES="${BASE_COLLECTION_PATH}/sos-reports/_all_nodes"

# Scratch directory on the remote node where sos writes its tarball.
TMPDIR=/var/tmp/sos-osp

###############################################################################
# SOS GATHERING

#######################################
# Generate a sos report on one node (via oc debug + toolbox) and copy it,
# decompressed, into the must-gather output.
# Globals:   TMPDIR (remote scratch dir), SOS_LIMIT, SOS_PATH_NODES (read)
# Arguments: $1 - OpenShift node name
# Outputs:   progress and error messages to stdout
# Returns:   exits 1 on failure; safe because the caller runs this function in
#            a backgrounded subshell, so exit only ends this gathering job.
#######################################
gather_node_sos () {
    node="$1"
    echo "Generating SOS Report for ${node}"
    # Can only run 1 must-gather at a time on each host. We remove any existing
    # toolbox container running from previous time with `podman rm`
    #
    # Current toolbox can ask via stdin if we want to update [1] so we just
    # update the container beforehand to prevent it from ever asking. In the
    # next toolbox [2] that may no longer be the case.
    # [1]: https://github.com/coreos/toolbox/blob/9a7c840fb4881f406287bf29e5f35b6625c7b358/rhcos-toolbox#L37
    # [2]: https://github.com/coreos/toolbox/issues/60
    # NOTE: TMPDIR and SOS_LIMIT are expanded locally, before the command line
    # is shipped to the node.
    oc debug "node/$node" -- chroot /host bash \
        -c "echo 'TOOLBOX_NAME=toolbox-osp' > /root/.toolboxrc ; \
            rm -rf \"${TMPDIR}\" && \
            mkdir -p \"${TMPDIR}\" && \
            sudo podman rm --force toolbox-osp; \
            sudo --preserve-env podman pull --authfile /var/lib/kubelet/config.json registry.redhat.io/rhel9/support-tools && \
            toolbox sos report --batch $SOS_LIMIT --tmp-dir=\"${TMPDIR}\""

    # shellcheck disable=SC2181
    if [ $? -ne 0 ]; then
        echo "Failed to run sos report on node ${node}, won't retrieve data"
        exit 1
    fi

    # Download and decompress the tar.xz file from the remote node into the
    # must-gather directory.
    # If we don't want to decompress at this stage we would need to modify the
    # yank program so it does the nested decompression automatically.
    echo "Retrieving SOS Report for ${node}"
    mkdir "${SOS_PATH_NODES}/sosreport-$node"
    oc debug "node/$node" -- chroot /host bash \
        -c "cat ${TMPDIR}/*.tar.xz" | tar --one-top-level="${SOS_PATH_NODES}/sosreport-$node" --strip-components=1 -Jxf -

    # NOTE(review): without pipefail, $? below reflects tar's status, not the
    # oc debug stage; a failed remote cat presumably makes tar fail on the
    # truncated stream, but confirm this is the intended failure detection.
    # shellcheck disable=SC2181
    if [ $? -ne 0 ]; then
        echo "Failed to download and decompress sosreport-$node.tar.xz not deleting file"
        exit 1
    fi

    # Ensure write access to the sos reports directories so must-gather rsync doesn't fail
    chmod +w -R "${SOS_PATH_NODES}/sosreport-$node/"

    # Delete the tar.xz file from the remote node
    oc debug "node/$node" -- chroot /host bash -c "rm -rf $TMPDIR"
}


###############################################################################
# MAIN
#
#######################################
# Map a pod's `service` label to the destination directory for its sos report
# symlink, or print nothing if the service is not being gathered.
# Globals:   SOS_SERVICES (read), SOS_PATH (read)
# Arguments: $1 - value of the pod's `service` label
# Outputs:   destination path on stdout (no trailing newline), empty if none
#######################################
dest_svc_path () {
    local svc=$1
    local os_svc

    # Some services have the project name in the service label, eg: cinder.
    # Compare element by element: a substring match against the joined array
    # would falsely accept labels like "cin" (substring of "cinder").
    for os_svc in "${SOS_SERVICES[@]}"; do
        if [[ "$svc" == "$os_svc" ]]; then
            echo -n "${SOS_PATH}/${svc}"
            return
        fi
    done

    # Other services have the component in the service label, eg: nova-api
    for os_svc in "${SOS_SERVICES[@]}"; do
        if [[ "$svc" == "$os_svc"* ]]; then
            echo -n "${SOS_PATH}/${os_svc}"
            return
        fi
    done
}

mkdir -p "${SOS_PATH_NODES}"

# Get list of nodes and service label for each of the OpenStack service pods
# Not using -o jsonpath='{.spec.nodeName}' because it uses space separator
svc_nodes=$(/usr/bin/oc -n openstack get pod -l service --no-headers -o=custom-columns=NODE:.spec.nodeName,SVC:.metadata.labels.service,NAME:.metadata.name)

nodes=''
while read -r node svc name; do
    svc_path=$(dest_svc_path "$svc")
    if [[ -n "$svc_path" ]]; then
        nodes="${nodes}${node}"$'\n'
        mkdir -p "$svc_path"
        # Per-pod symlink into the per-node report gathered below. Check -L as
        # well as -e: the link target does not exist yet at this point, so -e
        # alone follows the dangling link and always reports false, making the
        # guard ineffective (the ln failure was only hidden by 2>/dev/null).
        sos_dir="${svc_path}/sos-report-${name}"
        [[ ! -L "$sos_dir" && ! -e "$sos_dir" ]] && ln -s "../_all_nodes/sosreport-$node" "$sos_dir" 2>/dev/null
    fi
done <<< "$svc_nodes"

# Remove duplicated nodes because they are running multiple services
nodes=$(echo "$nodes" | sort -u)
echo "Will retrieve SOS reports from nodes ${nodes//$'\n'/ }"

# Fan out: gather each node's report concurrently in a backgrounded subshell
# (the subshell lets gather_node_sos use `exit` without ending this script).
for node in $nodes; do
    [[ -z "$node" ]] && continue
    (gather_node_sos "$node")&
done

# Wait for all processes to complete and check their exit status one by one;
# report overall failure if any background gather exited non-zero.
FAILED=0
for node in $nodes; do
    wait -n || FAILED=1
done
exit $FAILED

0 comments on commit d86ed1f

Please sign in to comment.