Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prometheus Integration #98

Merged
merged 17 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,540 changes: 889 additions & 651 deletions package-lock.json

Large diffs are not rendered by default.

15 changes: 10 additions & 5 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,12 @@
"@ng-bootstrap/ng-bootstrap": "^16.0.0",
"@popperjs/core": "2.11.8",
"@stomp/rx-stomp": "2.0.0",
"@swimlane/ngx-charts": "^20.5.0",
"@types/d3-scale": "^4.0.8",
"@types/d3-selection": "^3.0.10",
"@types/d3-shape": "^3.1.6",
"bootstrap": "5.3.2",
"d3": "^7.8.5",
"dayjs": "1.11.10",
"ngx-infinite-scroll": "17.0.0",
"rxjs": "7.8.1",
Expand All @@ -95,17 +100,17 @@
},
"devDependencies": {
"@angular-builders/jest": "17.0.0",
"@angular-devkit/build-angular": "17.0.7",
"@angular-devkit/build-angular": "17.0.9",
"@angular-eslint/eslint-plugin": "17.1.1",
"@angular/cli": "17.0.8",
"@angular/cli": "17.0.9",
"@angular/compiler-cli": "17.0.4",
"@angular/service-worker": "17.0.4",
"@types/jest": "29.5.10",
"@types/node": "20.10.6",
"@types/sockjs-client": "1.5.4",
"@typescript-eslint/eslint-plugin": "6.16.0",
"@typescript-eslint/eslint-plugin": "6.18.0",
"@typescript-eslint/parser": "6.12.0",
"browser-sync": "2.29.3",
"browser-sync": "3.0.2",
"buffer": "6.0.3",
"concurrently": "8.2.2",
"eslint": "8.56.0",
Expand All @@ -120,7 +125,7 @@
"lint-staged": "15.2.0",
"prettier": "3.1.1",
"prettier-plugin-java": "2.5.0",
"prettier-plugin-packagejson": "2.4.7",
"prettier-plugin-packagejson": "2.4.9",
"rimraf": "5.0.5",
"swagger-ui-dist": "5.9.1",
"ts-jest": "29.1.1",
Expand Down
15 changes: 13 additions & 2 deletions src/main/java/de/tum/cit/ase/domain/SimulationRun.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,22 @@ public class SimulationRun {
@Column(name = "start_date_time", nullable = false)
private ZonedDateTime startDateTime;

@Column(name = "end_date_time")
private ZonedDateTime endDateTime;

@Enumerated(EnumType.STRING)
@Column(nullable = false)
private Status status;

@OneToMany(cascade = CascadeType.REMOVE, mappedBy = "simulationRun", fetch = FetchType.EAGER)
@OneToMany(cascade = CascadeType.REMOVE, mappedBy = "simulationRun")
private Set<SimulationStats> stats;

@ManyToOne
@JoinColumn(name = "simulation_id", nullable = false)
@JsonIgnore
private Simulation simulation;

@OneToMany(cascade = CascadeType.REMOVE, mappedBy = "simulationRun", fetch = FetchType.EAGER)
@OneToMany(cascade = CascadeType.REMOVE, mappedBy = "simulationRun")
private Set<LogMessage> logMessages;

@Transient
Expand Down Expand Up @@ -102,6 +105,14 @@ public void setSchedule(SimulationSchedule schedule) {
this.schedule = schedule;
}

public ZonedDateTime getEndDateTime() {
return endDateTime;
}

public void setEndDateTime(ZonedDateTime endDateTime) {
this.endDateTime = endDateTime;
}

public enum Status {
QUEUED,
RUNNING,
Expand Down
38 changes: 38 additions & 0 deletions src/main/java/de/tum/cit/ase/prometheus/MetricValue.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package de.tum.cit.ase.prometheus;

import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;

public class MetricValue {

private ZonedDateTime dateTime;
private double value;

public MetricValue(double timestamp, double value) {
this.value = value;
long millis = (long) (timestamp * 1000);
Instant instant = Instant.ofEpochMilli(millis);
this.dateTime = ZonedDateTime.ofInstant(instant, ZoneId.of("UTC"));
}

public ZonedDateTime getDateTime() {
return dateTime;
}

public void setDateTime(ZonedDateTime dateTime) {
this.dateTime = dateTime;
}

public double getValue() {
return value;
}

public void setValue(double value) {
this.value = value;
}

public String toString() {
return "Timestamp: " + dateTime + ", Value: " + value;
}
}
55 changes: 55 additions & 0 deletions src/main/java/de/tum/cit/ase/prometheus/QueryResponse.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package de.tum.cit.ase.prometheus;

public class QueryResponse {

private String status;
private Data data;

public QueryResponse() {}

public String getStatus() {
return status;
}

public Data getData() {
return data;
}

public void setStatus(String status) {
this.status = status;
}

public void setData(Data data) {
this.data = data;
}

public static class Data {

private Result[] result;

public Data() {}

public Result[] getResult() {
return result;
}

public void setResult(Result[] result) {
this.result = result;
}
}

public static class Result {

private Object[][] values;

public Result() {}

public Object[][] getValues() {
return values;
}

public void setValues(Object[][] values) {
this.values = values;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ public interface SimulationRunRepository extends JpaRepository<SimulationRun, Lo

@Query(value = "select run from SimulationRun run where run.simulation.id = :#{#simulationId}")
List<SimulationRun> findAllBySimulationId(@Param("simulationId") long simulationId);

@Query(value = "select run from SimulationRun run left join fetch run.stats s left join fetch run.logMessages l where run.id = :#{#id}")
SimulationRun findByIdWithStatsAndLogMessages(long id);
}
182 changes: 182 additions & 0 deletions src/main/java/de/tum/cit/ase/service/PrometheusService.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
package de.tum.cit.ase.service;

import de.tum.cit.ase.domain.SimulationRun;
import de.tum.cit.ase.prometheus.MetricValue;
import de.tum.cit.ase.prometheus.QueryResponse;
import de.tum.cit.ase.service.artemis.ArtemisConfiguration;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.http.client.reactive.ReactorClientHttpConnector;
import org.springframework.stereotype.Service;
import org.springframework.web.reactive.function.client.WebClient;

@Service
public class PrometheusService {

private final Logger log = LoggerFactory.getLogger(PrometheusService.class);

@Value("${prometheus.api-url}")
private String baseUrl;

@Value("${prometheus.auth-token}")
private String authToken;

@Value("${prometheus.resolution}")
private int resolution;

private WebClient webClient;
private final ArtemisConfiguration artemisConfiguration;

public PrometheusService(ArtemisConfiguration artemisConfiguration) {
this.artemisConfiguration = artemisConfiguration;
}

/**
* Get the CPU usage of the Artemis instance for the given run.
* @param run The run to get the CPU usage for.
* @return A list of CPU usage values. An empty list if no Prometheus instance is configured for Artemis.
*/
public List<MetricValue> getCpuUsageArtemis(SimulationRun run) {
log.info("Getting Artemis CPU usage for {}", run);
var instance = artemisConfiguration.getPrometheusInstanceArtemis(run.getSimulation().getServer());
if (instance == null || instance.isBlank()) {
log.warn("No Prometheus instance configured for Artemis on {}", run.getSimulation().getServer());
return List.of();
}
return getCpuUsage(run, instance);
}

/**
* Get the CPU usage of the VCS for the given run.
* @param run The run to get the CPU usage for.
* @return A list of CPU usage values. An empty list if no Prometheus instance is configured for the VCS.
*/
public List<MetricValue> getCpuUsageVcs(SimulationRun run) {
log.info("Getting VCS CPU usage for {}", run);
var instance = artemisConfiguration.getPrometheusInstanceVcs(run.getSimulation().getServer());
if (instance == null || instance.isBlank()) {
log.warn("No Prometheus instance configured for VCS on {}", run.getSimulation().getServer());
return List.of();
}
return getCpuUsage(run, instance);
}

/**
* Get the CPU usage of the CI system for the given run.
* @param run The run to get the CPU usage for.
* @return A list of CPU usage values. An empty list if no Prometheus instance is configured for the CI system.
*/
public List<MetricValue> getCpuUsageCi(SimulationRun run) {
log.info("Getting CI CPU usage for {}", run);
var instance = artemisConfiguration.getPrometheusInstanceCi(run.getSimulation().getServer());
if (instance == null || instance.isBlank()) {
log.warn("No Prometheus instance configured for CI on {}", run.getSimulation().getServer());
return List.of();
}
return getCpuUsage(run, instance);
}

/**
* Execute a Prometheus query. The query is executed for the given time range.
* @param query The query to execute.
* @param start The start of the time range.
* @param end The end of the time range.
* @return The response from Prometheus.
*/
public QueryResponse executeQuery(String query, ZonedDateTime start, ZonedDateTime end) {
log.info("Querying Prometheus: {}", query);
if (webClient == null) {
setupWebclient();
}
return webClient
.get()
.uri(uriBuilder ->
uriBuilder
.path("/api/v1/query_range")
.query("query={query}&start={start}&end={end}&step={resolution}")
.build(query, asPrometheusTimestamp(start), asPrometheusTimestamp(end), resolution)
)
.retrieve()
.bodyToMono(QueryResponse.class)
.block();
}

private List<MetricValue> getCpuUsage(SimulationRun run, String instance) {
// Prometheus query to get the idle-percentage of the given instance`s CPUs.
// We use the idle-percentage because it is the inverse of the CPU usage.
// We take the average across all CPUs.
// The percentage is calculated over the last minute.
var query = "avg(rate(node_cpu_seconds_total{instance=\"" + instance + "\", mode=\"idle\"}[1m]))";

// If the run is still running, we want to get the CPU usage until now.
// If the run is finished, we want to get the CPU usage until the end of the run plus 30 minutes.
ZonedDateTime end = nowUTC();
if (
run.getStatus() == SimulationRun.Status.FINISHED ||
run.getStatus() == SimulationRun.Status.FAILED ||
run.getStatus() == SimulationRun.Status.CANCELLED
) {
if (run.getEndDateTime() != null) {
end = run.getEndDateTime().withZoneSameInstant(ZoneId.of("UTC")).plusMinutes(30);
} else {
// Edge case where the run is over but the end date is not set.
end = run.getStartDateTime().withZoneSameInstant(ZoneId.of("UTC")).plusMinutes(30);
}
// We don't want to get CPU usage in the future, that results in weird graphs.
if (end.isAfter(nowUTC())) {
end = nowUTC();
}
}
var res = executeQuery(query, run.getStartDateTime().withZoneSameInstant(ZoneId.of("UTC")).minusMinutes(30), end);
List<MetricValue> values = new LinkedList<>();
Arrays
.stream(res.getData().getResult())
.forEach(r ->
Arrays
.stream(r.getValues())
.forEach(v -> {
// Prometheus returns a list of lists, where the first value is the timestamp and the second value is the cpu percentage in idle mode.
// We invert the value to get the cpu usage.
// The timestamp can be either a double or an int, depending on the given time range.
try {
values.add(new MetricValue((double) v[0], 1.0 - Double.parseDouble((String) v[1])));
} catch (ClassCastException e) {
values.add(new MetricValue((int) v[0], 1.0 - Double.parseDouble((String) v[1])));
}
})
);
return values;
}

private String asPrometheusTimestamp(ZonedDateTime dateTime) {
Instant instant = dateTime.toInstant();
double timestamp = instant.getEpochSecond() + instant.getNano() / 1_000_000_000.0;
return String.format(Locale.US, "%.3f", timestamp);
}

private ZonedDateTime nowUTC() {
return ZonedDateTime.now(ZoneId.of("UTC"));
}

private void setupWebclient() {
this.webClient =
WebClient
.builder()
.clientConnector(new ReactorClientHttpConnector())
.baseUrl(baseUrl)
.defaultHeader(HttpHeaders.AUTHORIZATION, "Basic " + authToken)
.defaultHeader(HttpHeaders.ACCEPT, MediaType.APPLICATION_JSON_VALUE)
.defaultHeader(HttpHeaders.CONTENT_TYPE, MediaType.APPLICATION_JSON_VALUE)
.build();
}
}
Loading