Skip to content

Commit 0db7452

Browse files
committed
debug
1 parent 8236770 commit 0db7452

File tree

2 files changed

+141
-6
lines changed

2 files changed

+141
-6
lines changed

tests/debugger/test_debugger_inproduct_enablement.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from utils import features, scenarios, context, logger, bug, missing_feature
77
import json
88
import time
9+
import time as time_module
910

1011
TIMEOUT = 5
1112

@@ -173,9 +174,32 @@ class Test_Debugger_InProduct_Enablement_Code_Origin(debugger.BaseDebuggerTest):
173174
########### code origin ############
174175
def _check_code_origin(self):
175176
"""Send a request and check if code origin spans are present."""
177+
# Calculate threshold right before sending request to minimize race condition window.
178+
# This ensures we only check traces generated after this point, avoiding issues where
179+
# trace files might be ingested between threshold calculation and wait_for setup.
180+
# If the threshold is calculated too early, a trace file might be written and ingested
181+
# before wait_for is called. If that file doesn't have code origin (due to timing),
182+
# it will be in the existing data when wait_for checks, will be skipped, and then
183+
# wait_for will wait for new data that never comes (since the file is already ingested).
184+
threshold_start = time_module.time()
176185
threshold = self._get_max_trace_file_number()
186+
threshold_elapsed = time_module.time() - threshold_start
187+
logger.debug(
188+
f"[CODE_ORIGIN_DEBUG] _check_code_origin: calculated threshold={threshold} in {threshold_elapsed:.3f}s"
189+
)
190+
191+
request_start = time_module.time()
177192
self.send_weblog_request("/")
178-
return self.wait_for_code_origin_span(TIMEOUT, threshold=threshold)
193+
request_elapsed = time_module.time() - request_start
194+
logger.debug(f"[CODE_ORIGIN_DEBUG] _check_code_origin: sent request in {request_elapsed:.3f}s")
195+
196+
wait_start = time_module.time()
197+
result = self.wait_for_code_origin_span(TIMEOUT, threshold=threshold)
198+
wait_elapsed = time_module.time() - wait_start
199+
logger.debug(
200+
f"[CODE_ORIGIN_DEBUG] _check_code_origin: wait_for_code_origin_span returned {result} after {wait_elapsed:.3f}s"
201+
)
202+
return result
179203

180204
def _set_code_origin_and_check(self, *, enabled: bool | None):
181205
"""Set code origin via remote config and check if spans are present."""
@@ -195,12 +219,34 @@ def _set_code_origin_and_check(self, *, enabled: bool | None):
195219
# ensure we only look for traces generated after the config is applied. By inlining
196220
# the threshold calculation here (instead of in _check_code_origin), we ensure it happens
197221
# after the remote config is sent and right before the request, minimizing the race window.
222+
rc_start = time_module.time()
198223
self.send_rc_apm_tracing(code_origin_enabled=enabled)
224+
rc_elapsed = time_module.time() - rc_start
225+
logger.debug(
226+
f"[CODE_ORIGIN_DEBUG] _set_code_origin_and_check: send_rc_apm_tracing(enabled={enabled}) completed in {rc_elapsed:.3f}s"
227+
)
228+
199229
# Calculate threshold right before sending request to minimize race condition window.
200230
# This ensures we only check traces generated after the config change.
231+
threshold_start = time_module.time()
201232
threshold = self._get_max_trace_file_number()
233+
threshold_elapsed = time_module.time() - threshold_start
234+
logger.debug(
235+
f"[CODE_ORIGIN_DEBUG] _set_code_origin_and_check: calculated threshold={threshold} in {threshold_elapsed:.3f}s"
236+
)
237+
238+
request_start = time_module.time()
202239
self.send_weblog_request("/")
203-
return self.wait_for_code_origin_span(TIMEOUT, threshold=threshold)
240+
request_elapsed = time_module.time() - request_start
241+
logger.debug(f"[CODE_ORIGIN_DEBUG] _set_code_origin_and_check: sent request in {request_elapsed:.3f}s")
242+
243+
wait_start = time_module.time()
244+
result = self.wait_for_code_origin_span(TIMEOUT, threshold=threshold)
245+
wait_elapsed = time_module.time() - wait_start
246+
logger.debug(
247+
f"[CODE_ORIGIN_DEBUG] _set_code_origin_and_check: wait_for_code_origin_span returned {result} after {wait_elapsed:.3f}s"
248+
)
249+
return result
204250

205251
def setup_inproduct_enablement_code_origin(self):
206252
self.initialize_weblog_remote_config()

tests/debugger/utils.py

Lines changed: 93 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import os
88
import os.path
99
from pathlib import Path
10+
import time
1011
import uuid
1112
from urllib.parse import parse_qs
1213
from typing import TypedDict, Literal, Any
@@ -473,47 +474,135 @@ def _wait_for_no_capture_reason_span(self, data: dict):
473474

474475
def wait_for_code_origin_span(self, timeout: int = 5, threshold: int | None = None) -> bool:
475476
self._span_found = False
477+
start_time = time.time()
478+
479+
# Root cause of flakiness: There's a race condition where a trace file might be written
480+
# and ingested between threshold calculation and wait_for setup. When wait_for checks
481+
# existing data first, if that file doesn't have code origin yet (due to tracer timing),
482+
# it gets skipped. Then wait_for waits for new data, but since the file is already
483+
# ingested, no new data event fires, causing a timeout.
484+
#
485+
# Solution: Preserve the original threshold (max file number BEFORE the request) to ensure
486+
# we only check files written after the request. However, we also need to account for the
487+
# fact that files might be written and ingested very quickly. The key insight is that we
488+
# should use the original threshold as-is - it represents the boundary between "before
489+
# request" and "after request". Any file with number > threshold was written after the
490+
# request, regardless of when it was ingested.
491+
#
492+
# The race condition is mitigated by calculating threshold right before sending the request
493+
# (already done in calling code), minimizing the window where files can be written.
476494
if threshold is None:
477495
threshold = self._get_max_trace_file_number()
496+
logger.debug(
497+
f"[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: threshold was None, calculated new threshold={threshold}"
498+
)
499+
else:
500+
logger.debug(f"[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: using provided threshold={threshold}")
478501

479-
interfaces.agent.wait_for(
480-
lambda data: self._wait_for_code_origin_span(data, threshold=threshold),
481-
timeout=timeout,
502+
# Check current max file number right before wait_for to see if any files were written
503+
# between threshold calculation and wait_for call
504+
current_max_before_wait = self._get_max_trace_file_number()
505+
logger.debug(
506+
f"[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: threshold={threshold}, current_max_before_wait={current_max_before_wait}, diff={current_max_before_wait - threshold}"
507+
)
508+
509+
# Check what files exist in existing data that match our criteria
510+
existing_files_above_threshold = []
511+
for data in interfaces.agent.get_data(_TRACES_PATH):
512+
log_filename_found = re.search(r"/(\d+)__", data["log_filename"])
513+
if log_filename_found:
514+
file_number = int(log_filename_found.group(1))
515+
if file_number > threshold:
516+
existing_files_above_threshold.append((file_number, data["log_filename"]))
517+
518+
if existing_files_above_threshold:
519+
logger.debug(
520+
f"[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: Found {len(existing_files_above_threshold)} existing files above threshold: {existing_files_above_threshold}"
521+
)
522+
else:
523+
logger.debug("[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: No existing files above threshold found")
524+
525+
# Create a closure to capture start_time
526+
def wait_function(data: dict) -> bool:
527+
return self._wait_for_code_origin_span(data, threshold=threshold, start_time=start_time)
528+
529+
interfaces.agent.wait_for(wait_function, timeout=timeout)
530+
531+
elapsed = time.time() - start_time
532+
logger.debug(
533+
f"[CODE_ORIGIN_DEBUG] wait_for_code_origin_span: finished after {elapsed:.3f}s, span_found={self._span_found}"
482534
)
483535
return self._span_found
484536

485537
def _get_max_trace_file_number(self) -> int:
486538
"""Get the maximum trace file number currently in the agent interface."""
487539
max_number = 0
540+
file_numbers = []
488541
for data in interfaces.agent.get_data(_TRACES_PATH):
489542
log_filename_found = re.search(r"/(\d+)__", data["log_filename"])
490543
if log_filename_found:
491544
file_number = int(log_filename_found.group(1))
545+
file_numbers.append(file_number)
492546
max_number = max(max_number, file_number)
547+
logger.debug(
548+
f"[CODE_ORIGIN_DEBUG] _get_max_trace_file_number: max={max_number}, all_files={sorted(file_numbers)[-10:] if len(file_numbers) > 10 else sorted(file_numbers)}"
549+
)
493550
return max_number
494551

495-
def _wait_for_code_origin_span(self, data: dict, *, threshold: int) -> bool:
552+
def _wait_for_code_origin_span(self, data: dict, *, threshold: int, start_time: float | None = None) -> bool:
496553
if data["path"] == _TRACES_PATH:
497554
log_filename_found = re.search(r"/(\d+)__", data["log_filename"])
498555
if not log_filename_found:
499556
return False
500557

501558
log_number = int(log_filename_found.group(1))
559+
elapsed = (time.time() - start_time) if start_time else None
560+
logger.debug(
561+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: checking file {log_number}, threshold={threshold}, elapsed={elapsed:.3f}s"
562+
if elapsed
563+
else f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: checking file {log_number}, threshold={threshold}"
564+
)
565+
502566
if log_number > threshold:
567+
logger.debug(
568+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number} > threshold {threshold}, checking content"
569+
)
503570
content = data["request"]["content"]
504571
if content:
572+
web_spans_found = 0
573+
code_origin_found_count = 0
505574
for payload in content["tracerPayloads"]:
506575
for chunk in payload["chunks"]:
507576
for span in chunk["spans"]:
508577
resource, resource_type = span.get("resource", ""), span.get("type")
509578

510579
if resource.startswith("GET") and resource_type == "web":
580+
web_spans_found += 1
511581
code_origin_type = span["meta"].get("_dd.code_origin.type", "")
582+
logger.debug(
583+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number}, span resource={resource}, code_origin_type={code_origin_type}"
584+
)
512585

513586
if code_origin_type == "entry":
587+
code_origin_found_count += 1
588+
logger.debug(
589+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number}, FOUND code origin 'entry' in span {resource}"
590+
)
514591
self._span_found = True
515592
return True
516593

594+
logger.debug(
595+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number}, checked {web_spans_found} web spans, found {code_origin_found_count} with code origin"
596+
)
597+
else:
598+
logger.debug(
599+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number}, no content in request"
600+
)
601+
else:
602+
logger.debug(
603+
f"[CODE_ORIGIN_DEBUG] _wait_for_code_origin_span: file {log_number} <= threshold {threshold}, skipping"
604+
)
605+
517606
return False
518607

519608
def wait_for_telemetry(self, telemetry_type: str, timeout: int = 5) -> dict | None:

0 commit comments

Comments
 (0)