diff --git a/docker-dev/README.md b/docker-dev/README.md index 4627eba6..0b6f02b0 100644 --- a/docker-dev/README.md +++ b/docker-dev/README.md @@ -104,6 +104,15 @@ The `-v` drops the SQL volume; remove it to keep ConfigDb state across restarts. 2. `docker compose -f docker-dev/docker-compose.yml stop central-1` — `central-2` should pick up the admin role-leader within ~15 s (Akka split-brain stable-after). Traefik will route traffic to `central-2` once its `/health/active` returns 200. 3. `docker compose -f docker-dev/docker-compose.yml start central-1` — `central-1` rejoins as a follower; `central-2` keeps the leader role until something disturbs it. +## Resource limits & dev logging + +The full single-mesh stack (`central-1`/`central-2` + the four site nodes) can get `central-1` OOM-killed on a loaded host. Two settings in the compose file guard against that: + +- **EF Core + ASP.NET Core logs are pinned to `Warning`** on every host node (`Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore` / `…Microsoft.AspNetCore` = `Warning`). The host logs via Serilog (`AddZbSerilog` → `ReadFrom.Configuration`), and in `Development` the default level is `Debug` — without these overrides every Deployment-poll emits an `Executed DbCommand` / `SELECT … FROM [Deployment]` line, flooding the Serilog pipeline and starving the Akka cluster heartbeat thread. Application + Akka log levels are left untouched, so this only silences the per-poll SQL chatter. To temporarily restore the SQL log flood for debugging, drop those two env vars (or set them to `Information`) on the node you're inspecting. +- **Each host node has `mem_limit: 1g`** (`mem_reservation: 512m`). A quiet solo `central-1` measures ~357 MiB; the limit leaves headroom for the deploy/UI load and per-cluster driver subscriptions that push a fully-loaded node higher. 
The limit/reservation live on the `&otopcua-host` anchor, so all six host services inherit them; `sql`, `traefik`, and the one-shot `migrator`/`cluster-seed` are left unbounded. + +The full six-node host stack therefore needs roughly **6 GiB** (6 × the 1 GiB limit) of Docker Desktop VM memory just for the host nodes (plus SQL Server's own footprint on top). On a constrained host, either raise the Docker Desktop VM memory or run fewer host services (e.g. just `central-1` + `central-2`, or a single central node) rather than the full mesh. + ## Notes - This compose is for the **local Mac/Linux developer rig**. The team's CI + soak runs go to the remote docker host at `10.100.0.35` (see `docs/v2/dev-environment.md`); the file there mirrors this one with adjusted port bindings. diff --git a/docker-dev/docker-compose.yml b/docker-dev/docker-compose.yml index 8493d326..1172940a 100644 --- a/docker-dev/docker-compose.yml +++ b/docker-dev/docker-compose.yml @@ -121,6 +121,16 @@ services: dockerfile: docker-dev/Dockerfile target: runtime image: otopcua-host:dev + # Per-node memory bounds. The full single-mesh stack (6 host nodes) OOM-killed + # central-1 on a loaded host. Each host node measured ~357 MiB idle-solo and + # climbs under the full mesh + deploy/UI load, so cap at 1g (≈peak + headroom) + # with a 512m reservation. These service-level keys are inherited by every service + # that uses `<<: *otopcua-host` (YAML merge keeps the anchor's scalar keys; only + # the `environment` block is re-declared per service). Compose v2 honors + # `mem_limit`/`mem_reservation`. The full mesh needs ~6g of Docker Desktop VM + # memory — on a constrained host raise the VM memory or run fewer host services. 
+ mem_limit: 1g + mem_reservation: 512m depends_on: sql: { condition: service_healthy } migrator: { condition: service_completed_successfully } @@ -147,6 +157,13 @@ services: Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local" Security__Ldap__ServiceAccountPassword: "serviceaccount123" Security__DeployApiKey: "docker-dev-deploy-key" + # Pin EF Core + ASP.NET Core to Warning so the per-poll Deployment SELECT / + # "Executed DbCommand" Information|Debug lines stop flooding the Serilog + # pipeline and starving the Akka cluster heartbeat thread. The host logs via + # Serilog (AddZbSerilog → ReadFrom.Configuration); these env vars override + # Serilog:MinimumLevel:Override:* (app/Akka levels are left untouched). + Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" ports: - "4840:4840" @@ -180,6 +197,10 @@ services: Security__Ldap__ServiceAccountDn: "cn=serviceaccount,dc=zb,dc=local" Security__Ldap__ServiceAccountPassword: "serviceaccount123" Security__DeployApiKey: "docker-dev-deploy-key" + # Quiet EF/AspNetCore SQL flood — see central-1 (Serilog override). mem_limit/ + # mem_reservation are inherited from the *otopcua-host anchor. + Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" ports: - "4841:4840" @@ -203,6 +224,10 @@ services: Cluster__PublicHostname: "site-a-1" Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053" Cluster__Roles__0: "driver" + # Quiet EF/AspNetCore SQL flood — see central-1 (Serilog override). mem_limit/ + # mem_reservation are inherited from the *otopcua-host anchor. 
+ Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" # Resolved at runtime by GalaxyDriver.ResolveApiKey when a DriverInstance's # Gateway.ApiKeySecretRef = "env:GALAXY_MXGW_API_KEY". GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" @@ -223,6 +248,8 @@ services: Cluster__PublicHostname: "site-a-2" Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053" Cluster__Roles__0: "driver" + Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" ports: - "4843:4840" @@ -243,6 +270,8 @@ services: Cluster__PublicHostname: "site-b-1" Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053" Cluster__Roles__0: "driver" + Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" ports: - "4844:4840" @@ -261,6 +290,8 @@ services: Cluster__PublicHostname: "site-b-2" Cluster__SeedNodes__0: "akka.tcp://otopcua@central-1:4053" Cluster__Roles__0: "driver" + Serilog__MinimumLevel__Override__Microsoft.EntityFrameworkCore: "Warning" + Serilog__MinimumLevel__Override__Microsoft.AspNetCore: "Warning" GALAXY_MXGW_API_KEY: "${GALAXY_MXGW_API_KEY:-mxgw_otopcua2_GI7-tNozYE6cXGUSgEzL3AHDV7bYcYIHdMwKYgyHdX4}" ports: - "4845:4840"