reduce log level in metrics logger not to trash the log #708

Merged · 3 commits · Apr 23, 2025
cmd/epp/main.go: 9 changes (4 additions, 5 deletions)
```diff
@@ -142,17 +142,15 @@ func run() error {
 	}
 
 	poolNamespacedName := types.NamespacedName{
-		Namespace: *poolNamespace,
 		Name:      *poolName,
+		Namespace: *poolNamespace,
 	}
 	mgr, err := runserver.NewDefaultManager(poolNamespacedName, cfg)
 	if err != nil {
 		setupLog.Error(err, "Failed to create controller manager")
 		return err
 	}
 
-	ctx := ctrl.SetupSignalHandler()
-
 	// Set up mapper for metric scraping.
 	mapping, err := backendmetrics.NewMetricMapping(
 		*totalQueuedRequestsMetric,
```
```diff
@@ -167,14 +165,15 @@ func run() error {
 
 	pmf := backendmetrics.NewPodMetricsFactory(&backendmetrics.PodMetricsClientImpl{MetricMapping: mapping}, *refreshMetricsInterval)
 	// Setup runner.
+	ctx := ctrl.SetupSignalHandler()
+
 	datastore := datastore.NewDatastore(ctx, pmf)
 
 	serverRunner := &runserver.ExtProcServerRunner{
 		GrpcPort:                                 *grpcPort,
 		DestinationEndpointHintMetadataNamespace: *destinationEndpointHintMetadataNamespace,
 		DestinationEndpointHintKey:               *destinationEndpointHintKey,
-		PoolName:                                 *poolName,
-		PoolNamespace:                            *poolNamespace,
+		PoolNamespacedName:                       poolNamespacedName,
 		Datastore:                                datastore,
 		SecureServing:                            *secureServing,
 		CertPath:                                 *certPath,
```
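The main.go refactor above collapses the separate pool name and namespace strings into a single `types.NamespacedName` that is handed to both the manager and the server runner. A minimal standalone sketch of how that type behaves (the flag names mirror the ones in main.go, but this program is illustrative only):

```go
package main

import (
	"flag"
	"fmt"

	"k8s.io/apimachinery/pkg/types"
)

func main() {
	// Illustrative flags mirroring main.go's *poolName / *poolNamespace.
	poolName := flag.String("poolName", "my-pool", "name of the InferencePool")
	poolNamespace := flag.String("poolNamespace", "default", "namespace of the InferencePool")
	flag.Parse()

	// One value now carries both coordinates instead of two loose strings,
	// so call sites can no longer pass a name without its namespace.
	pool := types.NamespacedName{
		Name:      *poolName,
		Namespace: *poolNamespace,
	}

	// String() renders the conventional "namespace/name" form,
	// convenient for log lines and map keys.
	fmt.Println(pool.String()) // default/my-pool
}
```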
pkg/epp/backend/metrics/logger.go: 10 changes (5 additions, 5 deletions)
```diff
@@ -55,8 +55,8 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 		case <-ctx.Done():
 			logger.V(logutil.DEFAULT).Info("Shutting down prometheus metrics thread")
 			return
-		case <-ticker.C: // Periodically flush prometheus metrics for inference pool
-			flushPrometheusMetricsOnce(logger, datastore)
+		case <-ticker.C: // Periodically refresh prometheus metrics for inference pool
+			refreshPrometheusMetrics(logger, datastore)
 		}
 	}
 }()
```
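For context, the loop edited in this hunk is the standard Go shape for a periodic task tied to a context's lifetime; a self-contained sketch (names here are illustrative, not the EPP's actual types):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// startRefresher runs refresh on every tick until ctx is cancelled.
func startRefresher(ctx context.Context, interval time.Duration, refresh func()) {
	go func() {
		ticker := time.NewTicker(interval)
		defer ticker.Stop() // release ticker resources when the goroutine exits

		for {
			select {
			case <-ctx.Done(): // shutdown signalled by the caller
				fmt.Println("shutting down refresher")
				return
			case <-ticker.C: // fires every interval, whether or not anything changed
				refresh()
			}
		}
	}()
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 550*time.Millisecond)
	defer cancel()

	startRefresher(ctx, 100*time.Millisecond, func() { fmt.Println("refresh") })

	<-ctx.Done()
	time.Sleep(50 * time.Millisecond) // give the goroutine a moment to log shutdown
}
```

Because the tick fires unconditionally, anything logged inside it at a visible level lands in the log every interval, which is exactly the spam this PR addresses.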
```diff
@@ -86,19 +86,19 @@ func StartMetricsLogger(ctx context.Context, datastore Datastore, refreshPrometh
 	}
 }
 
-func flushPrometheusMetricsOnce(logger logr.Logger, datastore Datastore) {
+func refreshPrometheusMetrics(logger logr.Logger, datastore Datastore) {
 	pool, err := datastore.PoolGet()
 	if err != nil {
 		// No inference pool or not initialize.
-		logger.V(logutil.DEFAULT).Info("pool is not initialized, skipping flushing metrics")
+		logger.V(logutil.DEFAULT).Info("Pool is not initialized, skipping refreshing metrics")
 		return
 	}
 
 	var kvCacheTotal float64
 	var queueTotal int
 
 	podMetrics := datastore.PodGetAll()
-	logger.V(logutil.VERBOSE).Info("Flushing Prometheus Metrics", "ReadyPods", len(podMetrics))
+	logger.V(logutil.TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", len(podMetrics))
```
Contributor:
Is there an alternative log in verbose or default level that logs the number of ready pods when they change?

@nirrozenbaum (Author) commented on Apr 19, 2025:
I haven't seen any other place that logs the ready pods (it is reported as a metric, though).
I completely agree this is information we would like to know.

Having said that, I don't think it belongs here.
The refreshMetrics function runs every few seconds to update metrics, which makes perfect sense, since we want metrics updated very frequently.
BUT, printing the number of ready pods to the EPP log every 5 seconds spams the log and makes it very hard to debug (see the example above).

IMO the number of pods in the pool shouldn't be printed as part of the metrics recalculation function; it should be part of the InferencePool status (similar to the ready-pods count in a Deployment's status).
We could easily add this field to the InferencePool status, plus additional printer columns to show it when running kubectl get InferencePool. That is straightforward to implement in the controller's reconcile loop, which also updates only when something changes (as opposed to this log line, which prints every 5 seconds even if nothing has changed).
If this sounds reasonable, I'm happy to implement it this week.

@ahg-g cc @kfswain
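For illustration, a rough sketch of what the proposed status field and printer column could look like (hypothetical names and shapes only; the concrete proposal went to #714, and this is not the project's actual API):

```go
// Package v1alpha1 is a hypothetical sketch, not the real InferencePool API.
package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// A printer column would surface the count on `kubectl get inferencepool`
// instead of in a fixed-interval log line.
// +kubebuilder:printcolumn:name="Ready Pods",type=integer,JSONPath=`.status.readyPods`
type InferencePool struct {
	metav1.TypeMeta   `json:",inline"`
	metav1.ObjectMeta `json:"metadata,omitempty"`

	Status InferencePoolStatus `json:"status,omitempty"`
}

type InferencePoolStatus struct {
	// ReadyPods would be written from the controller's reconcile loop,
	// which runs only when something changes, unlike the metrics ticker.
	ReadyPods int32 `json:"readyPods,omitempty"`
}
```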

@nirrozenbaum (Author): Proposed this change in #714.

```diff
 	if len(podMetrics) == 0 {
 		return
 	}
```
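The heart of the fix is the V-level change: in logr, a higher V means lower priority, and a message is emitted only when its level is at or below the configured verbosity. A standalone sketch using `github.com/go-logr/stdr` (the DEFAULT/VERBOSE/TRACE values below are illustrative stand-ins for the project's `logutil` constants, not their actual definitions):

```go
package main

import (
	"log"
	"os"

	"github.com/go-logr/stdr"
)

// Illustrative V-levels standing in for logutil's constants.
const (
	DEFAULT = 1 // important, low-volume messages
	VERBOSE = 2 // chatty operational detail
	TRACE   = 4 // per-tick noise, visible only on explicit request
)

func main() {
	logger := stdr.New(log.New(os.Stderr, "", log.LstdFlags))

	// Only messages with V <= the configured verbosity are emitted.
	stdr.SetVerbosity(VERBOSE)

	logger.V(DEFAULT).Info("Pool is not initialized, skipping refreshing metrics") // printed
	logger.V(VERBOSE).Info("visible at the ready-pods line's old level")           // printed
	logger.V(TRACE).Info("Refreshing Prometheus Metrics", "ReadyPods", 3)          // suppressed
}
```

Moving the per-tick line from VERBOSE to TRACE keeps it available for deep debugging while keeping it out of everyday logs.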
pkg/epp/server/runserver.go: 17 changes (6 additions, 11 deletions)
```diff
@@ -43,8 +43,7 @@ type ExtProcServerRunner struct {
 	GrpcPort                                 int
 	DestinationEndpointHintMetadataNamespace string
 	DestinationEndpointHintKey               string
-	PoolName                                 string
-	PoolNamespace                            string
+	PoolNamespacedName                       types.NamespacedName
 	Datastore                                datastore.Datastore
 	SecureServing                            bool
 	CertPath                                 string
```
```diff
@@ -73,8 +72,7 @@ func NewDefaultExtProcServerRunner() *ExtProcServerRunner {
 		GrpcPort:                                 DefaultGrpcPort,
 		DestinationEndpointHintKey:               DefaultDestinationEndpointHintKey,
 		DestinationEndpointHintMetadataNamespace: DefaultDestinationEndpointHintMetadataNamespace,
-		PoolName:                                 DefaultPoolName,
-		PoolNamespace:                            DefaultPoolNamespace,
+		PoolNamespacedName:                       types.NamespacedName{Name: DefaultPoolName, Namespace: DefaultPoolNamespace},
 		SecureServing:                            DefaultSecureServing,
 		RefreshPrometheusMetricsInterval:         DefaultRefreshPrometheusMetricsInterval,
 		// Datastore can be assigned later.
```
```diff
@@ -93,13 +91,10 @@ func (r *ExtProcServerRunner) SetupWithManager(ctx context.Context, mgr ctrl.Man
 	}
 
 	if err := (&controller.InferenceModelReconciler{
-		Datastore: r.Datastore,
-		Client:    mgr.GetClient(),
-		PoolNamespacedName: types.NamespacedName{
-			Name:      r.PoolName,
-			Namespace: r.PoolNamespace,
-		},
-		Record: mgr.GetEventRecorderFor("InferenceModel"),
+		Datastore:          r.Datastore,
+		Client:             mgr.GetClient(),
+		PoolNamespacedName: r.PoolNamespacedName,
+		Record:             mgr.GetEventRecorderFor("InferenceModel"),
 	}).SetupWithManager(ctx, mgr); err != nil {
 		return fmt.Errorf("failed setting up InferenceModelReconciler: %w", err)
 	}
```
test/integration/epp/hermetic_test.go: 2 changes (1 addition, 1 deletion)
```diff
@@ -1348,7 +1348,7 @@ func BeforeSuite() func() {
 	serverRunner.TestPodMetricsClient = &backendmetrics.FakePodMetricsClient{}
 	pmf := backendmetrics.NewPodMetricsFactory(serverRunner.TestPodMetricsClient, 10*time.Millisecond)
 	// Adjust from defaults
-	serverRunner.PoolName = "vllm-llama3-8b-instruct-pool"
+	serverRunner.PoolNamespacedName = types.NamespacedName{Name: "vllm-llama3-8b-instruct-pool", Namespace: "default"}
 	serverRunner.Datastore = datastore.NewDatastore(context.Background(), pmf)
 	serverRunner.SecureServing = false
```