Skip to content

Commit a50083f

Browse files
authored
Add Azure Monitor telemetry to backend (#483)
1 parent 3c59974 commit a50083f

File tree

9 files changed

+677
-18
lines changed

9 files changed

+677
-18
lines changed

backend/poetry.lock

Lines changed: 505 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

backend/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ psutil = "^5.9.5"
2525
vtk = "^9.2.6"
2626
fmu-sumo = "1.0.3"
2727
sumo-wrapper-python = "1.0.6"
28+
azure-monitor-opentelemetry = "^1.1.0"
2829

2930

3031
[tool.poetry.group.dev.dependencies]

backend/src/backend/primary/main.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
import datetime
22
import logging
3+
import os
34

45
from fastapi import FastAPI
56
from fastapi.routing import APIRoute
67
from fastapi.responses import ORJSONResponse
78
from uvicorn.middleware.proxy_headers import ProxyHeadersMiddleware
89

910
from src.backend.utils.add_process_time_to_server_timing_middleware import AddProcessTimeToServerTimingMiddleware
11+
from src.backend.utils.azure_monitor_setup import setup_azure_monitor_telemetry
12+
from src.backend.utils.logging_setup import ensure_console_log_handler_is_configured, setup_normal_log_levels
1013
from src.backend.shared_middleware import add_shared_middlewares
1114
from src.backend.auth.auth_helper import AuthHelper
1215
from .routers.explore import router as explore_router
@@ -26,13 +29,14 @@
2629
from .routers.observations.router import router as observations_router
2730
from .routers.rft.router import router as rft_router
2831

29-
logging.basicConfig(
30-
level=logging.WARNING,
31-
format="%(asctime)s %(levelname)-3s [%(name)s]: %(message)s",
32-
datefmt="%H:%M:%S",
33-
)
34-
logging.getLogger("src.services.sumo_access").setLevel(level=logging.DEBUG)
35-
logging.getLogger("src.backend.primary.routers.surface").setLevel(level=logging.DEBUG)
32+
33+
ensure_console_log_handler_is_configured()
34+
setup_normal_log_levels()
35+
36+
# temporarily set some loggers to DEBUG
37+
logging.getLogger("src.services.sumo_access").setLevel(logging.DEBUG)
38+
39+
LOGGER = logging.getLogger(__name__)
3640

3741

3842
def custom_generate_unique_id(route: APIRoute) -> str:
@@ -45,6 +49,13 @@ def custom_generate_unique_id(route: APIRoute) -> str:
4549
default_response_class=ORJSONResponse,
4650
)
4751

52+
if os.environ.get("APPLICATIONINSIGHTS_CONNECTION_STRING"):
53+
LOGGER.info("Configuring Azure Monitor telemetry for primary backend")
54+
setup_azure_monitor_telemetry(app)
55+
else:
56+
LOGGER.warning("Skipping telemetry configuration, APPLICATIONINSIGHTS_CONNECTION_STRING env variable not set.")
57+
58+
4859
# The tags we add here will determine the name of the frontend api service for our endpoints as well as
4960
# providing some grouping when viewing the openapi documentation.
5061
app.include_router(explore_router, tags=["explore"])

backend/src/backend/primary/routers/surface/router.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ async def get_realization_surface_data(
7575
surf_data_response = converters.to_api_surface_data(xtgeo_surf)
7676
perf_metrics.record_lap("convert")
7777

78-
LOGGER.debug(f"Loaded realization surface in: {perf_metrics.to_string()}")
78+
LOGGER.info(f"Loaded realization surface in: {perf_metrics.to_string()}")
7979

8080
return surf_data_response
8181

@@ -113,7 +113,7 @@ async def get_statistical_surface_data(
113113
surf_data_response: schemas.SurfaceData = converters.to_api_surface_data(xtgeo_surf)
114114
perf_metrics.record_lap("convert")
115115

116-
LOGGER.debug(f"Calculated statistical surface in: {perf_metrics.to_string()}")
116+
LOGGER.info(f"Calculated statistical surface in: {perf_metrics.to_string()}")
117117

118118
return surf_data_response
119119

@@ -158,7 +158,7 @@ async def get_property_surface_resampled_to_static_surface(
158158
surf_data_response: schemas.SurfaceData = converters.to_api_surface_data(resampled_surface)
159159
perf_metrics.record_lap("convert")
160160

161-
LOGGER.debug(f"Loaded property surface in: {perf_metrics.to_string()}")
161+
LOGGER.info(f"Loaded property surface in: {perf_metrics.to_string()}")
162162

163163
return surf_data_response
164164

@@ -200,6 +200,6 @@ async def get_property_surface_resampled_to_statistical_static_surface(
200200

201201
surf_data_response = converters.to_api_surface_data(resampled_surface)
202202

203-
LOGGER.debug(f"Loaded property surface and created image, total time: {timer.elapsed_ms()}ms")
203+
LOGGER.info(f"Loaded property surface and created image, total time: {timer.elapsed_ms()}ms")
204204

205205
return surf_data_response
Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,33 @@
1+
import logging
2+
import os
3+
14
from fastapi import FastAPI
25
from fastapi.responses import ORJSONResponse
36

7+
from src.backend.utils.azure_monitor_setup import setup_azure_monitor_telemetry
8+
from src.backend.utils.logging_setup import ensure_console_log_handler_is_configured, setup_normal_log_levels
49
from src.backend.shared_middleware import add_shared_middlewares
510
from .inactivity_shutdown import InactivityShutdown
611
from .routers.general import router as general_router
712

813
# mypy: disable-error-code="attr-defined"
914
from .routers.grid.router import router as grid_router
1015

16+
17+
ensure_console_log_handler_is_configured()
18+
setup_normal_log_levels()
19+
20+
LOGGER = logging.getLogger(__name__)
21+
22+
1123
app = FastAPI(default_response_class=ORJSONResponse)
1224

25+
if os.environ.get("APPLICATIONINSIGHTS_CONNECTION_STRING"):
26+
LOGGER.info("Configuring Azure Monitor telemetry for user session server")
27+
setup_azure_monitor_telemetry(app)
28+
else:
29+
LOGGER.warning("Skipping telemetry configuration, APPLICATIONINSIGHTS_CONNECTION_STRING env variable not set.")
30+
1331
app.include_router(general_router)
1432
app.include_router(grid_router, prefix="/grid")
1533
add_shared_middlewares(app)
@@ -18,4 +36,4 @@
1836
# minutes without receiving any new requests:
1937
InactivityShutdown(app, inactivity_limit_minutes=30)
2038

21-
print("Succesfully completed user session server initialization.")
39+
LOGGER.info("Successfully completed user session server initialization.")
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
import logging
2+
from azure.monitor.opentelemetry import configure_azure_monitor
3+
from fastapi import FastAPI
4+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
5+
from opentelemetry.sdk._logs import LoggingHandler
6+
7+
8+
# This is a custom logging handler that does formatting of log messages before passing them on to open telemetry.
9+
# Note that the class we're inheriting from here *is* an OpenTelemetry derived Python logger.
10+
class LoggingHandlerWithFormatting(LoggingHandler):
11+
def emit(self, record: logging.LogRecord) -> None:
12+
# Do a bit of a hack here to format the message before passing it onwards.
13+
# At the same time make sure we restore record.msg in case there are other handlers in the chain.
14+
original_msg = record.msg
15+
original_args = record.args
16+
17+
formatted_msg = self.format(record)
18+
record.msg = formatted_msg
19+
record.args = None
20+
21+
# Note that the logger that we're calling emit on here is an Open Telemetry Logger, not a Python logger.
22+
self._logger.emit(self._translate(record))
23+
24+
# For inspecting and debugging the actual telemetry payload, uncomment the following lines.
25+
# log_record_as_json = self._translate(record).to_json()
26+
# print(f"---- start payload ----\n{log_record_as_json}\n---- end payload -----", flush=True)
27+
28+
record.msg = original_msg
29+
record.args = original_args
30+
31+
32+
# A lot of the configuration will be read from environment variables during the execution of this function.
33+
# Notable environment variables that are consumed are:
34+
# - APPLICATIONINSIGHTS_CONNECTION_STRING
35+
# - OTEL_SERVICE_NAME
36+
# - OTEL_RESOURCE_ATTRIBUTES
37+
def setup_azure_monitor_telemetry(fastapi_app: FastAPI) -> None:
38+
# Under ideal circumstances, the below call to configure_azure_monitor() should be the only call needed
39+
# to configure the entire telemetry stack. However, it seems that there are multiple glitches.
40+
# - Supposedly, FastAPI instrumentation should be added automatically, but this only seems to work
41+
# if the call to configure_azure_monitor() happens very early in the module loading process, and
42+
# specifically it seems that it has to happen before any import of FastAPIP
43+
# - The default log handler that is added does not obey the specified log message format string,
44+
# even if it is set using OTEL_PYTHON_LOG_FORMAT. It justs logs the raw message string.
45+
#
46+
# Note that this call will throw an exception if the APPLICATIONINSIGHTS_CONNECTION_STRING
47+
# environment variable is not set or if it is invalid
48+
configure_azure_monitor()
49+
50+
# The log handler that is added by configure_azure_monitor() does not obey the specified log message
51+
# format string. We therefore replace it with a custom handler that does formatting and set a format string that ensures
52+
# that the logger name is included in the log message.
53+
handler_with_formatting = LoggingHandlerWithFormatting()
54+
handler_with_formatting.setFormatter(logging.Formatter("[%(name)s] %(message)s"))
55+
56+
root_logger = logging.getLogger()
57+
58+
handler_installed_by_config = root_logger.handlers[-1]
59+
root_logger.removeHandler(handler_installed_by_config)
60+
root_logger.addHandler(handler_with_formatting)
61+
62+
FastAPIInstrumentor.instrument_app(fastapi_app)
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import logging
2+
3+
4+
def ensure_console_log_handler_is_configured() -> None:
5+
root_logger = logging.getLogger()
6+
7+
# Look for the default console handler that is added by the Python logging module
8+
console_handler: logging.StreamHandler | None = None
9+
for handler in root_logger.handlers:
10+
if isinstance(handler, logging.StreamHandler):
11+
console_handler = handler
12+
break
13+
14+
if not console_handler:
15+
console_handler = logging.StreamHandler()
16+
root_logger.addHandler(console_handler)
17+
18+
# Set our default format string
19+
formatter = logging.Formatter("%(asctime)s %(levelname)-3s [%(name)s]: %(message)s", datefmt="%H:%M:%S")
20+
console_handler.setFormatter(formatter)
21+
22+
23+
def setup_normal_log_levels() -> None:
24+
# Set baseline logging level to INFO
25+
logging.getLogger().setLevel(logging.INFO)
26+
27+
# Make sure some of the more noisy loggers are limited to WARNING
28+
logging.getLogger("xtgeo").setLevel(logging.WARNING)
29+
logging.getLogger("httpx").setLevel(logging.WARNING)
30+
logging.getLogger("azure").setLevel(logging.WARNING)

docker-compose.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@ services:
3232
- WEBVIZ_SMDA_RESOURCE_SCOPE
3333
- WEBVIZ_SMDA_SUBSCRIPTION_KEY
3434
- WEBVIZ_SUMO_ENV
35+
- APPLICATIONINSIGHTS_CONNECTION_STRING
36+
- OTEL_SERVICE_NAME=primary-backend
37+
- OTEL_RESOURCE_ATTRIBUTES=service.namespace=local, service.version=dummy0.1.2
3538
- CODESPACE_NAME # Automatically set env. variable by GitHub codespace
3639
volumes:
3740
- ./backend/src:/home/appuser/backend/src
@@ -56,6 +59,9 @@ services:
5659
- WEBVIZ_SMDA_RESOURCE_SCOPE
5760
- WEBVIZ_SMDA_SUBSCRIPTION_KEY
5861
- WEBVIZ_SUMO_ENV
62+
- APPLICATIONINSIGHTS_CONNECTION_STRING
63+
- OTEL_SERVICE_NAME=user-session
64+
- OTEL_RESOURCE_ATTRIBUTES=service.namespace=local, service.version=dummy0.1.2
5965
volumes:
6066
- ./backend/src:/home/appuser/backend/src
6167

radixconfig.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,14 @@ spec:
3939
UVICORN_ENTRYPOINT: src.backend.primary.main:app
4040
environmentConfig:
4141
- environment: prod
42+
secretRefs:
43+
azureKeyVaults:
44+
- name: webviz
45+
items:
46+
- name: WEBVIZ-INSIGHTS-CONNECTIONSTRING-PROD
47+
envVar: APPLICATIONINSIGHTS_CONNECTION_STRING
48+
variables:
49+
OTEL_RESOURCE_ATTRIBUTES: service.name=primary-backend, service.namespace=prod, service.version=tbd0.1.2
4250
resources:
4351
requests:
4452
memory: 8Gi
@@ -55,6 +63,14 @@ spec:
5563
cpu:
5664
averageUtilization: 80
5765
- environment: review
66+
secretRefs:
67+
azureKeyVaults:
68+
- name: webviz
69+
items:
70+
- name: WEBVIZ-INSIGHTS-CONNECTIONSTRING-REVIEW
71+
envVar: APPLICATIONINSIGHTS_CONNECTION_STRING
72+
variables:
73+
OTEL_RESOURCE_ATTRIBUTES: service.name=primary-backend, service.namespace=review, service.version=tbd0.1.2
5874
resources:
5975
requests:
6076
memory: 4Gi
@@ -118,13 +134,29 @@ spec:
118134
environmentConfig:
119135
# As of October 2023 in radix, these settings need to be duplicated for both environments.
120136
- environment: prod
137+
secretRefs:
138+
azureKeyVaults:
139+
- name: webviz
140+
items:
141+
- name: WEBVIZ-INSIGHTS-CONNECTIONSTRING-PROD
142+
envVar: APPLICATIONINSIGHTS_CONNECTION_STRING
143+
variables:
144+
OTEL_RESOURCE_ATTRIBUTES: service.name=user-session, service.namespace=prod, service.version=tbd0.1.2
121145
volumeMounts:
122146
- name: webvizcache
123147
path: /tmp/webvizcache/
124148
blobfuse2:
125149
container: cache
126150
uid: 1234
127151
- environment: review
152+
secretRefs:
153+
azureKeyVaults:
154+
- name: webviz
155+
items:
156+
- name: WEBVIZ-INSIGHTS-CONNECTIONSTRING-REVIEW
157+
envVar: APPLICATIONINSIGHTS_CONNECTION_STRING
158+
variables:
159+
OTEL_RESOURCE_ATTRIBUTES: service.name=user-session, service.namespace=review, service.version=tbd0.1.2
128160
volumeMounts:
129161
- name: webvizcache
130162
path: /tmp/webvizcache/

0 commit comments

Comments
 (0)