forked from baron/baron-sso
Compare commits
17 Commits
feature/af
...
feature/st
| Author | SHA1 | Date | |
|---|---|---|---|
| 67af52d8e2 | |||
| 4eb4c5af34 | |||
| f61c56cfde | |||
| 2671ebda27 | |||
| 2405961375 | |||
| ae97950108 | |||
| f726463a6c | |||
| badcabb644 | |||
| aa2848c3b6 | |||
| 9be833d2e0 | |||
| 4e81e214a3 | |||
| 561659f333 | |||
| fe176c6912 | |||
| 5670288616 | |||
| 3ab9d28c9d | |||
| 2dedeb66b6 | |||
| 1f47abb860 |
@@ -180,3 +180,7 @@ DEVFRONT_URL=http://localhost:5174
|
||||
DEVFRONT_CALLBACK_URLS=http://localhost:5174/auth/callback,https://sso.hmac.kr/devfront/auth/callback
|
||||
ORGFRONT_CALLBACK_URLS=http://localhost:5175/auth/callback,https://sso.hmac.kr/orgfront/auth/callback
|
||||
VITE_ORGCHART_URL=
|
||||
|
||||
# promtail에서 로그를 전송받을 Loki 서버 엔드포인트 URL
|
||||
LOKI_URL=http://loki:3100/loki/api/v1/push
|
||||
|
||||
|
||||
83
.gitea/workflows/staging_build_check.yml
Normal file
83
.gitea/workflows/staging_build_check.yml
Normal file
@@ -0,0 +1,83 @@
|
||||
# Staging Build Check
#
# Builds each service image with the staging compose template on pull
# requests, one matrix job per service. The job only runs
# `docker compose build`; it does not start or push anything.
name: Staging Build Check

on:
  pull_request:
    paths:
      - ".gitea/workflows/staging_build_check.yml"
      - "docker/staging_pull_compose.template.yaml"
      - "adminfront/**"
      - "devfront/**"
      # NOTE(review): directory name assumed by analogy with the other
      # frontends - confirm it matches the orgfront build context.
      - "orgfront/**"
      - "userfront/**"
      - "backend/**"
      - "common/**"
      - "scripts/**"
      - "locales/**"
      - "package.json"
      - "pnpm-lock.yaml"
      - "pnpm-workspace.yaml"
  workflow_dispatch:

jobs:
  build-check:
    runs-on: ubuntu-latest
    strategy:
      # Let every service report its own result instead of cancelling the
      # remaining builds on the first failure.
      fail-fast: false
      matrix:
        include:
          - service: adminfront
          - service: devfront
          # orgfront is part of the same staging compose file, so it is
          # build-checked together with the other frontends.
          - service: orgfront
          - service: userfront
          - service: backend

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Prepare staging build inputs
        run: |
          set -euo pipefail

          # Placeholder values only (dummy keys, example.com hosts): the file
          # exists so that compose can resolve its variables during the build.
          cat <<'EOF' > .env
          APP_ENV=stage
          TZ=Asia/Seoul
          IDP_PROVIDER=ory
          ADMINFRONT_URL=https://adminfront.staging.example.com
          DEVFRONT_URL=https://devfront.staging.example.com
          USERFRONT_URL=https://userfront.staging.example.com
          ORGFRONT_URL=https://orgfront.staging.example.com
          BACKEND_URL=https://backend.staging.example.com
          BACKEND_PUBLIC_URL=https://backend.staging.example.com
          VITE_OIDC_AUTHORITY=https://sso.staging.example.com/oidc
          WORKS_ADMIN_API_BASE_URL=https://works-admin.staging.example.com/api
          WORKS_ADMIN_OAUTH_TOKEN_URL=https://works-admin.staging.example.com/oauth/token
          ORY_POSTGRES_USER=ory
          ORY_POSTGRES_PASSWORD=ory-password
          COOKIE_SECRET=staging-build-cookie-secret
          JWT_SECRET=staging-build-jwt-secret
          NAVER_CLOUD_ACCESS_KEY=dummy
          NAVER_CLOUD_SECRET_KEY=dummy
          NAVER_CLOUD_SERVICE_ID=dummy
          NAVER_SENDER_PHONE_NUMBER=00000000000
          AWS_REGION=ap-northeast-2
          AWS_ACCESS_KEY_ID=dummy
          AWS_SECRET_ACCESS_KEY=dummy
          AWS_SES_SENDER=dummy@example.com
          REDIS_ADDR=redis:6389
          CLICKHOUSE_PORT_NATIVE=9000
          CLICKHOUSE_USER=baron
          CLICKHOUSE_PASSWORD=password
          HYDRA_PUBLIC_URL=https://hydra.staging.example.com
          KRATOS_BROWSER_URL=https://sso.staging.example.com
          KRATOS_ADMIN_URL=http://kratos:4434
          KRATOS_UI_URL=https://sso.staging.example.com
          EOF

          # Use the template under the file name the build step expects.
          cp docker/staging_pull_compose.template.yaml staging_pull_compose.yaml

      - name: Build ${{ matrix.service }} with staging compose
        env:
          DOCKER_BUILDKIT: "1"
          COMPOSE_DOCKER_CLI_BUILD: "1"
        run: |
          set -euo pipefail
          docker compose -f staging_pull_compose.yaml build --pull --progress=plain "${{ matrix.service }}"
|
||||
@@ -133,8 +133,15 @@ jobs:
|
||||
ORGFRONT_CALLBACK_URLS=${{ vars.ORGFRONT_CALLBACK_URLS }}
|
||||
KRATOS_ALLOWED_RETURN_URLS_JSON=${{ vars.KRATOS_ALLOWED_RETURN_URLS_JSON }}
|
||||
KRATOS_ALLOWED_RETURN_URLS_EXTRA=${{ vars.KRATOS_ALLOWED_RETURN_URLS_EXTRA }}
|
||||
STAGING_PUBLIC_HEALTH_URL=${{ vars.STAGING_PUBLIC_HEALTH_URL }}
|
||||
STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS=${{ vars.STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS }}
|
||||
# OATHKEEPER_INTROSPECT_CLIENT_ID=${{ vars.OATHKEEPER_INTROSPECT_CLIENT_ID }}
|
||||
# OATHKEEPER_INTROSPECT_CLIENT_SECRET=${{ secrets.STG_OATHKEEPER_INTROSPECT_CLIENT_SECRET }}
|
||||
|
||||
# Monitoring & Alerts
|
||||
SMS_WEBHOOK_PORT=${{ vars.SMS_WEBHOOK_PORT || '8080' }}
|
||||
# No fallback recipients: alert SMS must only go to numbers configured in vars.
MONITOR_RECIPIENT_PHONES=${{ vars.MONITOR_RECIPIENT_PHONES }}
|
||||
LOKI_URL=${{ vars.LOKI_URL || 'http://loki:3100/loki/api/v1/push' }}
|
||||
EOF
|
||||
|
||||
# 코드 업데이트 (Git)
|
||||
@@ -190,7 +197,7 @@ jobs:
|
||||
max="${FRONTEND_HEALTH_MAX_ATTEMPTS:-60}"
|
||||
i=1
|
||||
while [ "${i}" -le "${max}" ]; do
|
||||
if docker exec "${name}" node -e "fetch('http://127.0.0.1:${port}/').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))" >/dev/null 2>&1; then
|
||||
if docker exec "${name}" sh -c "if command -v wget >/dev/null 2>&1; then wget -qO- 'http://127.0.0.1:${port}/' >/dev/null; elif command -v node >/dev/null 2>&1; then node -e \"fetch('http://127.0.0.1:${port}/').then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))\"; else exit 127; fi" >/dev/null 2>&1; then
|
||||
echo "Frontend ready: ${name}:${port}"
|
||||
return 0
|
||||
fi
|
||||
@@ -203,9 +210,55 @@ jobs:
|
||||
return 1
|
||||
}
|
||||
|
||||
# check_container_url NAME URL
#
# Polls URL from inside container NAME through `docker exec`, using wget when
# the container has it and node's fetch() otherwise.
#
# Arguments:
#   $1 - container name
#   $2 - URL to request from inside the container
# Environment:
#   FRONTEND_HEALTH_MAX_ATTEMPTS - number of attempts (default 60); attempts
#                                  are 2 seconds apart
# Outputs:
#   progress lines on stdout; on failure an error line plus the last 200
#   container log lines on stderr
# Returns:
#   0 as soon as one request succeeds, 1 otherwise
check_container_url() {
  name="$1"
  url="$2"
  max="${FRONTEND_HEALTH_MAX_ATTEMPTS:-60}"
  i=1
  while [ "${i}" -le "${max}" ]; do
    probe_rc=0
    # The URL is handed over as a positional argument ($1 / process.argv[1])
    # instead of being spliced into the script text, so quotes or other
    # special characters in it cannot change the command that runs.
    docker exec "${name}" sh -c '
      if command -v wget >/dev/null 2>&1; then
        wget -qO- "$1" >/dev/null
      elif command -v node >/dev/null 2>&1; then
        node -e "fetch(process.argv[1]).then(r=>process.exit(r.ok?0:1)).catch(()=>process.exit(1))" "$1"
      else
        exit 127
      fi
    ' sh "${url}" >/dev/null 2>&1 || probe_rc=$?

    if [ "${probe_rc}" -eq 0 ]; then
      echo "Container URL ready: ${name} ${url}"
      return 0
    fi

    # 127 means nothing usable could be executed for the probe; waiting will
    # not change that, so stop instead of burning the whole retry budget.
    if [ "${probe_rc}" -eq 127 ]; then
      echo "ERROR: no probe tool (wget or node) available in container: ${name}" >&2
      return 1
    fi

    echo "Waiting for container URL: ${name} ${url} (${i}/${max})"
    i=$((i + 1))
    # Only pause when another attempt follows.
    if [ "${i}" -le "${max}" ]; then
      sleep 2
    fi
  done
  echo "ERROR: container URL not ready: ${name} ${url}" >&2
  # Best effort: the log dump must not mask the failure status below.
  docker logs "${name}" --tail 200 >&2 || true
  return 1
}
|
||||
|
||||
# check_public_http URL
#
# Polls the public staging URL with curl from the machine running this script.
#
# Arguments:
#   $1 - URL to request; an empty or missing value is rejected
# Environment:
#   STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS - number of attempts (default 30);
#                                        attempts are 2 seconds apart
# Outputs:
#   progress lines on stdout; on failure an error line, `docker compose ps`
#   and the last 200 baron_gateway log lines on stderr
# Returns:
#   0 as soon as one request succeeds, 1 otherwise
check_public_http() {
  # ${1:-} keeps `set -u` from aborting before the message below is printed.
  url="${1:-}"
  if [ -z "${url}" ]; then
    echo "ERROR: STAGING_PUBLIC_HEALTH_URL is required." >&2
    return 1
  fi

  max="${STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS:-30}"
  # Reject non-numeric values here; otherwise `[ -le ]` fails with a cryptic
  # "integer expression expected" and no request is ever made.
  case "${max}" in
    '' | *[!0-9]*)
      echo "ERROR: STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS must be a non-negative integer: ${max}" >&2
      return 1
      ;;
  esac

  i=1
  while [ "${i}" -le "${max}" ]; do
    # -f: HTTP errors (>= 400) count as failure; --max-time bounds one attempt.
    if curl -fsS --max-time 10 "${url}" >/dev/null; then
      echo "Public staging URL ready: ${url}"
      return 0
    fi
    echo "Waiting for public staging URL: ${url} (${i}/${max})"
    i=$((i + 1))
    # Only pause when another attempt follows.
    if [ "${i}" -le "${max}" ]; then
      sleep 2
    fi
  done

  echo "ERROR: public staging URL not ready: ${url}" >&2
  # Best-effort diagnostics; they must not mask the failure status below.
  docker compose -f staging_pull_compose.yaml ps >&2 || true
  docker logs baron_gateway --tail 200 >&2 || true
  return 1
}
|
||||
|
||||
check_container_url baron_backend http://127.0.0.1:3000/health
|
||||
check_container_http baron_userfront 5000
|
||||
check_container_http baron_gateway 5000
|
||||
check_container_http baron_adminfront 5173
|
||||
check_container_http baron_devfront 5173
|
||||
check_container_http baron_orgfront 5175
|
||||
check_public_http "${STAGING_PUBLIC_HEALTH_URL}"
|
||||
|
||||
echo "===== INIT-RP LOGS ====="
|
||||
docker compose -f staging_pull_compose.yaml logs init-rp || true
|
||||
|
||||
@@ -180,6 +180,33 @@ services:
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: baron_promtail
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- ./docker/promtail-config.template.yaml:/etc/promtail/promtail-config.yaml:ro
|
||||
command: -config.file=/etc/promtail/promtail-config.yaml -config.expand-env=true
|
||||
environment:
|
||||
- LOKI_URL=${LOKI_URL:-http://loki:3100/loki/api/v1/push}
|
||||
- APP_ENV=${APP_ENV:-development}
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
blackbox-exporter:
|
||||
image: prom/blackbox-exporter:v0.25.0
|
||||
container_name: baron_blackbox_exporter
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9115:9115"
|
||||
volumes:
|
||||
- ./docker/monitor/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
|
||||
networks:
|
||||
- baron_net
|
||||
- ory-net
|
||||
|
||||
networks:
|
||||
baron_net:
|
||||
external: true
|
||||
|
||||
@@ -108,6 +108,32 @@ services:
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: baron_promtail
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- ./docker/promtail-config.template.yaml:/etc/promtail/promtail-config.yaml:ro
|
||||
command: -config.file=/etc/promtail/promtail-config.yaml -config.expand-env=true
|
||||
environment:
|
||||
- LOKI_URL=${LOKI_URL:-http://loki:3100/loki/api/v1/push}
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
blackbox-exporter:
|
||||
image: prom/blackbox-exporter:v0.25.0
|
||||
container_name: baron_blackbox_exporter
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9115:9115"
|
||||
volumes:
|
||||
- ./docker/monitor/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
|
||||
networks:
|
||||
- baron_net
|
||||
- ory-net
|
||||
|
||||
networks:
|
||||
baron_net:
|
||||
external: true
|
||||
|
||||
10
docker/monitor/blackbox.yml
Normal file
10
docker/monitor/blackbox.yml
Normal file
@@ -0,0 +1,10 @@
|
||||
# Blackbox exporter probe modules.
modules:
  # Plain HTTP GET probe with a 5 second timeout; redirects are followed and
  # neither the presence nor the absence of TLS fails the probe.
  http_2xx:
    prober: http
    timeout: 5s
    http:
      method: GET
      follow_redirects: true
      valid_status_codes: []  # Defaults to 2xx
      fail_if_ssl: false
      fail_if_not_ssl: false
|
||||
161
docker/monitor/grafana/dashboards/baron_sso_dashboard.json
Normal file
161
docker/monitor/grafana/dashboards/baron_sso_dashboard.json
Normal file
@@ -0,0 +1,161 @@
|
||||
{
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "datasource",
|
||||
"uid": "grafana"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"collapsed": false,
|
||||
"gridPos": {
|
||||
"h": 3,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"title": "Baron SSO Service Overview",
|
||||
"type": "row"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
"mode": "thresholds"
|
||||
},
|
||||
"mappings": [
|
||||
{
|
||||
"options": {
|
||||
"0": {
|
||||
"color": "red",
|
||||
"index": 1,
|
||||
"text": "OFFLINE"
|
||||
},
|
||||
"1": {
|
||||
"color": "green",
|
||||
"index": 0,
|
||||
"text": "ONLINE"
|
||||
}
|
||||
},
|
||||
"type": "value"
|
||||
}
|
||||
],
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "red",
|
||||
"value": null
|
||||
},
|
||||
{
|
||||
"color": "green",
|
||||
"value": 1
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 8,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 3
|
||||
},
|
||||
"id": 2,
|
||||
"options": {
|
||||
"alignValue": "center",
|
||||
"legend": {
|
||||
"displayMode": "list",
|
||||
"placement": "bottom",
|
||||
"showLegend": true
|
||||
},
|
||||
"mergeValues": true,
|
||||
"rowHeight": 0.8,
|
||||
"showValue": "always",
|
||||
"tooltip": {
|
||||
"mode": "single",
|
||||
"sort": "none"
|
||||
}
|
||||
},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "prometheus",
|
||||
"uid": "Prometheus"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"expr": "probe_success{job=\"baron-services-http-probe\"}",
|
||||
"legendFormat": "{{instance}}",
|
||||
"range": true,
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Services Health Timeline (HTTP Probe)",
|
||||
"type": "state-timeline"
|
||||
},
|
||||
{
  "datasource": {
    "type": "loki",
    "uid": "Loki"
  },
  "gridPos": {
    "h": 12,
    "w": 24,
    "x": 0,
    "y": 11
  },
  "id": 3,
  "options": {
    "enableLogDetails": true,
    "prettifyLogMessage": false,
    "showCommonLabels": false,
    "showLabels": true,
    "showTime": true,
    "sortOrder": "Descending",
    "wrapLogMessage": true
  },
  "targets": [
    {
      "datasource": {
        "type": "loki",
        "uid": "Loki"
      },
      "expr": "{container_name=~\"baron_.+|ory_.+|oathkeeper|kratos|hydra|keto\"}",
      "refId": "A"
    }
  ],
  "title": "Live Container Logs (Loki)",
  "type": "logs"
}
|
||||
],
|
||||
"refresh": "5s",
|
||||
"schemaVersion": 39,
|
||||
"tags": ["baron-sso", "observability"],
|
||||
"style": "dark",
|
||||
"timezone": "browser",
|
||||
"title": "Baron SSO Observability Dashboard",
|
||||
"uid": "baron_sso_observability",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
41
docker/promtail-config.template.yaml
Normal file
41
docker/promtail-config.template.yaml
Normal file
@@ -0,0 +1,41 @@
|
||||
# Promtail configuration: discovers containers through the Docker socket and
# pushes the logs of the Baron / Ory containers to Loki.
# ${...} placeholders are filled from the environment (promtail is started
# with -config.expand-env=true).
server:
  grpc_listen_port: 0
  http_listen_port: 9080

positions:
  filename: /tmp/positions.yaml

clients:
  - url: ${LOKI_URL:-http://loki:3100/loki/api/v1/push}

scrape_configs:
  - job_name: baron-sso-container-logs
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 10s
    relabel_configs:
      # 1. Keep only Baron and Ory related containers, matched against the
      #    raw Docker metadata (the name carries a leading "/").
      - action: keep
        source_labels:
          - __meta_docker_container_name
        regex: '/(baron_.*|oathkeeper|kratos|hydra|keto|ory_.*)'

      # 2. Assign the mandatory labels up front. __address__ is used as the
      #    source so that the rule applies to every target.
      - target_label: job
        source_labels:
          - __address__
        replacement: baron-sso-logs
      - target_label: app_env
        source_labels:
          - __address__
        replacement: '${APP_ENV:-development}'

      # 3. Extract the container name (drops the leading "/").
      - target_label: container_name
        source_labels:
          - __meta_docker_container_name
        regex: '/(.*)'

      # 4. Derive the service labels by stripping the baron_ prefix. For
      #    baron_<x> containers both service and job become <x>; all other
      #    containers keep the job assigned in step 2.
      - target_label: service
        source_labels:
          - container_name
        regex: 'baron_(.*)'
      - target_label: job
        source_labels:
          - container_name
        regex: 'baron_(.*)'
|
||||
@@ -79,6 +79,7 @@ services:
|
||||
postgres_ory:
|
||||
image: postgres:${ORY_POSTGRES_TAG:-17-alpine}
|
||||
container_name: ory_postgres
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- POSTGRES_USER=${ORY_POSTGRES_USER:-ory}
|
||||
- POSTGRES_PASSWORD=${ORY_POSTGRES_PASSWORD:-secret}
|
||||
@@ -125,6 +126,7 @@ services:
|
||||
kratos:
|
||||
image: oryd/kratos:${KRATOS_VERSION:-v26.2.0}
|
||||
container_name: ory_kratos
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- DSN=postgres://${ORY_POSTGRES_USER}:${ORY_POSTGRES_PASSWORD}@postgres_ory:5432/${KRATOS_DB:-ory_kratos}?sslmode=disable&max_conns=20
|
||||
- COOKIE_SECRET=${COOKIE_SECRET:-localcookie123}
|
||||
@@ -163,6 +165,7 @@ services:
|
||||
hydra:
|
||||
image: oryd/hydra:${HYDRA_VERSION:-v26.2.0}
|
||||
container_name: ory_hydra
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- DSN=postgres://${ORY_POSTGRES_USER}:${ORY_POSTGRES_PASSWORD}@postgres_ory:5432/${HYDRA_DB:-ory_hydra}?sslmode=disable&max_conns=20
|
||||
- URLS_SELF_ISSUER=${HYDRA_PUBLIC_URL}
|
||||
@@ -196,6 +199,7 @@ services:
|
||||
keto:
|
||||
image: oryd/keto:${KETO_VERSION:-v26.2.0}
|
||||
container_name: ory_keto
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- DSN=postgres://${ORY_POSTGRES_USER}:${ORY_POSTGRES_PASSWORD}@postgres_ory:5432/${KETO_DB:-ory_keto}?sslmode=disable&max_conns=20
|
||||
volumes:
|
||||
@@ -255,6 +259,7 @@ services:
|
||||
ory_clickhouse:
|
||||
image: clickhouse/clickhouse-server:latest
|
||||
container_name: ory_clickhouse
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- CLICKHOUSE_USER=${ORY_CLICKHOUSE_USER:-ory}
|
||||
- CLICKHOUSE_PASSWORD=${ORY_CLICKHOUSE_PASSWORD:-orypass}
|
||||
@@ -360,6 +365,7 @@ services:
|
||||
context: ./backend
|
||||
dockerfile: Dockerfile
|
||||
container_name: baron_backend
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@@ -424,6 +430,7 @@ services:
|
||||
VITE_OIDC_CLIENT_ID: adminfront
|
||||
ORGFRONT_URL: ${ORGFRONT_URL:-}
|
||||
container_name: baron_adminfront
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@@ -449,6 +456,7 @@ services:
|
||||
VITE_OIDC_AUTHORITY: ${VITE_OIDC_AUTHORITY:-}
|
||||
VITE_OIDC_CLIENT_ID: devfront
|
||||
container_name: baron_devfront
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@@ -474,6 +482,7 @@ services:
|
||||
VITE_OIDC_AUTHORITY: ${VITE_OIDC_AUTHORITY:-}
|
||||
VITE_OIDC_CLIENT_ID: orgfront
|
||||
container_name: baron_orgfront
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@@ -496,6 +505,7 @@ services:
|
||||
context: .
|
||||
dockerfile: userfront/Dockerfile
|
||||
container_name: baron_userfront
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
@@ -526,6 +536,32 @@ services:
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: baron_promtail
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- ./docker/promtail-config.template.yaml:/etc/promtail/promtail-config.yaml:ro
|
||||
command: -config.file=/etc/promtail/promtail-config.yaml -config.expand-env=true
|
||||
environment:
|
||||
- LOKI_URL=${LOKI_URL:-http://loki:3100/loki/api/v1/push}
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
blackbox-exporter:
|
||||
image: prom/blackbox-exporter:v0.25.0
|
||||
container_name: baron_blackbox_exporter
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "9115:9115"
|
||||
volumes:
|
||||
- ./docker/monitor/blackbox.yml:/etc/blackbox_exporter/config.yml:ro
|
||||
networks:
|
||||
- baron_net
|
||||
- ory-net
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
clickhouse_data:
|
||||
|
||||
213
docs/external_healthcheck_monitoring_design.md
Normal file
213
docs/external_healthcheck_monitoring_design.md
Normal file
@@ -0,0 +1,213 @@
|
||||
# 외부 통합 모니터링 및 로그 수집 시스템 설계서 (Prometheus + Promtail + Loki)
|
||||
|
||||
## 1. 개요 (Overview)
|
||||
본 문서는 Baron SSO 서비스가 배포될 **스테이징 서버**의 기존 도커(Docker) 기반 모니터링 및 로깅 인프라를 활용하여, **가용성 헬스체크(메트릭 수집)**와 **컨테이너 실시간 로그 통합 수집(로그 분석)**을 동시에 달성하고, 장애 상황 발생 시 담당자에게 즉시 SMS를 전송하는 엔드투엔드(End-to-End) 연동 설계를 정의합니다.
|
||||
|
||||
- **메트릭(상태) 모니터링**: Prometheus + Grafana를 활용하여 `/health` 및 프론트엔드 포트 가용성 수집
|
||||
- **로그(텍스트) 모니터링**: Promtail + Loki를 활용하여 컨테이너 실시간 로그 수집 및 에러/패닉 로그 실시간 알림
|
||||
- **장애 알림 전파**: 기존 사내 SMS 게이트웨이 서비스인 [grafana-sms-webhook](https://gitea.hmac.kr/ai-team/grafana-sms-webhook)를 연동하여 실시간 알림 수신
|
||||
|
||||
---
|
||||
|
||||
## 2. 네트워크 및 데이터 수집 아키텍처 (Architecture)
|
||||
|
||||
```
|
||||
[ Staging Host Docker Environment ]
|
||||
|
||||
+-------------------------------------------------------------+
|
||||
| baron_net (External Docker Network) |
|
||||
| |
|
||||
| +--------------------+ +--------------------+ |
|
||||
| | baron_backend | | baron_adminfront | ... |
|
||||
| | (Port 3000) | | (Port 5173) | |
|
||||
| +----+---------+-----+ +----+---------+-----+ |
|
||||
| | | | | |
|
||||
| | | | | |
|
||||
| | | (Docker Log | | |
|
||||
| | | Stream) | | |
|
||||
| | +-------+ | | |
|
||||
| | v | v |
|
||||
| | +----+-----+---------+-----+ |
|
||||
| | | baron_promtail | (신규 수집기) |
|
||||
| | | (Docker Socket 마운트) | |
|
||||
| | +----------+---------------+ |
|
||||
| | | (Push Logs) |
|
||||
| | v |
|
||||
| | +----------+---------------+ |
|
||||
| | | Loki Container | (기존 분석기) |
|
||||
| | +----------+---------------+ |
|
||||
| | ^ |
|
||||
| | (Scrape HTTP) | (Query Logs) |
|
||||
| +----+-----------------------+-----------------------+ |
|
||||
| | Prometheus / Grafana Container | |
|
||||
| | (baron_net 네트워크 참여 / 수집 및 얼럿 룰 감시) | |
|
||||
| +----------------------------+-----------------------+ |
|
||||
| | (Alert Webhook) |
|
||||
| v |
|
||||
| +----------------------------+-----------------------+ |
|
||||
| | grafana-sms-webhook | |
|
||||
| | (사내 SMS API 게이트웨이 연동) | |
|
||||
| +----------------------------+-----------------------+ |
|
||||
+-------------------------------|-----------------------------+
|
||||
|
|
||||
| (NCP SENS Call)
|
||||
v
|
||||
[ Naver Cloud NCP ]
|
||||
|
|
||||
| (SMS/LMS)
|
||||
v
|
||||
[ Infra Administrator ]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 3. 스테이징 배포 파일 반영 사양 (Staging Deployment Changes)
|
||||
|
||||
### 3.1 `docker-compose.staging.template.yaml` 변경 사항
|
||||
`grafana-sms-webhook`과 로그 수집기인 `promtail` 컨테이너를 함께 기동하도록 추가합니다.
|
||||
|
||||
```yaml
|
||||
# docker/docker-compose.staging.template.yaml 하단에 추가
|
||||
|
||||
services:
|
||||
# ... 기존 backend, adminfront, userfront 등 서비스 정의 ...
|
||||
|
||||
grafana-sms-webhook:
|
||||
# 저장소 주소: https://gitea.hmac.kr/ai-team/grafana-sms-webhook
|
||||
image: ${SMS_WEBHOOK_IMAGE_NAME:-gitea.hmac.kr/ai-team/grafana-sms-webhook}:${IMAGE_TAG:-latest}
|
||||
container_name: grafana_sms_webhook
|
||||
restart: unless-stopped
|
||||
env_file:
|
||||
- .env
|
||||
environment:
|
||||
- NAVER_CLOUD_ACCESS_KEY=${NAVER_CLOUD_ACCESS_KEY}
|
||||
- NAVER_CLOUD_SECRET_KEY=${NAVER_CLOUD_SECRET_KEY}
|
||||
- NAVER_CLOUD_SERVICE_ID=${NAVER_CLOUD_SERVICE_ID}
|
||||
- NAVER_SENDER_PHONE_NUMBER=${NAVER_SENDER_PHONE_NUMBER}
|
||||
- MONITOR_RECIPIENT_PHONES=${MONITOR_RECIPIENT_PHONES} # 콤마(,) 구분 수신처 번호
|
||||
ports:
|
||||
- "${SMS_WEBHOOK_PORT:-8080}:8080"
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
promtail:
|
||||
image: grafana/promtail:2.9.0
|
||||
container_name: baron_promtail
|
||||
restart: unless-stopped
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
- /var/lib/docker/containers:/var/lib/docker/containers:ro
|
||||
- ./docker/promtail-config.template.yaml:/etc/promtail/promtail-config.yaml:ro
|
||||
command: -config.file=/etc/promtail/promtail-config.yaml -config.expand-env=true
|
||||
environment:
|
||||
- LOKI_URL=${LOKI_URL:-http://loki:3100/loki/api/v1/push}
|
||||
- APP_ENV=${APP_ENV:-development}
|
||||
networks:
|
||||
- baron_net
|
||||
|
||||
networks:
|
||||
baron_net:
|
||||
external: true
|
||||
name: baron_net
|
||||
```
|
||||
|
||||
### 3.2 `promtail-config.template.yaml` 설정 사양
|
||||
수집기가 도커 소켓을 읽어 컨테이너명을 자동으로 식별하고, Baron SSO 관련 로그만 선별하여 라벨을 붙인 후 Loki로 전송합니다.
|
||||
|
||||
```yaml
|
||||
# docker/promtail-config.template.yaml
|
||||
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: ${LOKI_URL:-http://loki:3100/loki/api/v1/push}
|
||||
|
||||
scrape_configs:
|
||||
- job_name: baron-sso-container-logs
|
||||
docker_sd_configs:
|
||||
- host: unix:///var/run/docker.sock
|
||||
refresh_interval: 10s
|
||||
relabel_configs:
|
||||
- source_labels: ['__meta_docker_container_name']
|
||||
regex: '/(.*)'
|
||||
target_label: 'container_name'
|
||||
# Baron SSO 및 핵심 Ory Stack 컨테이너만 필터링하여 로그 수집
|
||||
- source_labels: ['container_name']
|
||||
regex: '(baron_.*|oathkeeper|kratos|hydra|keto|ory_.*)'
|
||||
action: keep
|
||||
# 컨테이너 명에서 앞의 접두사를 떼어 서비스 및 잡 라벨 부여 (예: baron_backend -> backend)
|
||||
- source_labels: ['container_name']
|
||||
regex: 'baron_(.*)'
|
||||
target_label: 'service'
|
||||
- source_labels: ['container_name']
|
||||
regex: 'baron_(.*)'
|
||||
target_label: 'job'
|
||||
# 동적 라벨 추가
|
||||
- target_label: 'app_env'
|
||||
replacement: '${APP_ENV:-development}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 4. 기존 Prometheus & Loki 연동 가이드
|
||||
|
||||
### 4.1 1단계: 기존 컨테이너를 `baron_net`에 합류
|
||||
기존에 동작 중인 Prometheus, Loki, Grafana 컨테이너가 `baron_net` 내부 도커 DNS를 인식할 수 있도록 연결합니다.
|
||||
```bash
|
||||
docker network connect baron_net prometheus
|
||||
docker network connect baron_net loki
|
||||
docker network connect baron_net grafana
|
||||
```
|
||||
|
||||
### 4.2 2단계: Prometheus 수집 설정 (`prometheus.yml`)
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'baron-sso-backend-staging'
|
||||
metrics_path: '/health'
|
||||
scrape_interval: 15s
|
||||
static_configs:
|
||||
- targets: ['baron_backend:3000']
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 5. Grafana 이중 알림 설정 (메트릭 알림 + 로그 알림)
|
||||
|
||||
기존 Grafana에서 다음 두 종류의 알림 규칙을 지정하고 수신처로 `grafana_sms_webhook`을 연동합니다.
|
||||
|
||||
### 5.1 메트릭 기반 가용성 얼럿 (Prometheus 데이터 소스)
|
||||
* **목적**: 백엔드가 완전히 다운되거나 `/health` 가 503 에러를 리턴할 때 문자 발송
|
||||
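* **쿼리 예시**: `up{job="baron-sso-backend-staging"} == 0` (단, `/health` 응답이 Prometheus 노출 형식이 아니면 스크레이프 파싱 실패로 `up`이 항상 0이 되므로, 그 경우에는 blackbox-exporter 기반의 `probe_success{job="baron-services-http-probe"} == 0`을 사용합니다.)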
|
||||
* **지속 기간(For)**: `3m`
|
||||
* **장애 문자 템플릿**:
|
||||
```text
|
||||
[Baron SSO 서버 다운 얼럿]
|
||||
대상: baron_backend
|
||||
상태: DOWN (접속 불가)
|
||||
내용: 백엔드 컨테이너가 정상적으로 동작하지 않거나 웹 서버가 중단되었습니다. 즉시 서버 상태를 점검해 주십시오.
|
||||
```
|
||||
|
||||
### 5.2 로그 기반 실시간 에러/패닉 얼럿 (Loki 데이터 소스)
|
||||
* **목적**: 서버는 돌고 있으나 내부 로직 상 치명적인 예외(Panic, Error)가 대량 발생하여 실사용자가 오작동을 겪을 때 문자 전송
|
||||
* **쿼리 예시 (LogQL)**: `sum(count_over_time({app_env="stage", service="backend"} |= "panic" [5m])) > 0` 또는 `|= "ERROR"`
|
||||
* **지속 기간(For)**: `0m` (발생 즉시 신속 문자 발송)
|
||||
* **장애 문자 템플릿**:
|
||||
```text
|
||||
[Baron SSO 로그 에러 경보]
|
||||
대상: baron_backend (Loki 수집 로그)
|
||||
상태: 치명적인 에러/패닉 실시간 감지
|
||||
내용: 백엔드 서비스 콘솔 로그에서 panic 또는 ERROR 키워드가 실시간으로 감지되었습니다. 로그 모니터링 대시보드를 확인하십시오.
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. 기대 효과 및 결론
|
||||
|
||||
1. **완벽한 가시성(Full Observability)**: 단순 서버 기동 여부 검사를 넘어, 서버 내부에서 도는 세부 에러나 버그 로그(Panic)까지 완전하게 모니터링 체계에 포착합니다.
|
||||
2. **이중 알림으로 탐지 공백 최소화**: 네트워크 장비 고장에 의한 접속 실패는 **메트릭 얼럿**으로 잡고, 내부 로직 결함에 의한 기능 오작동은 **로그 얼럿**으로 이중 감시하여 장애 탐지 누락을 줄이고 복구 시간을 단축합니다.
|
||||
3. **효율적인 인프라 일원화**: 동일 그라파나 대시보드 내에서 메트릭 시각화와 로그 검색을 동시 처리하며, `grafana-sms-webhook` 통합 채널 하나만으로 모든 장애 문자를 송출합니다.
|
||||
@@ -62,12 +62,17 @@ for workflow in "$staging_pull"; do
|
||||
assert_contains "$workflow" 'ORGFRONT_URL=${{ vars.ORGFRONT_URL }}'
|
||||
assert_contains "$workflow" 'KRATOS_ALLOWED_RETURN_URLS_JSON=${{ vars.KRATOS_ALLOWED_RETURN_URLS_JSON }}'
|
||||
assert_contains "$workflow" 'KRATOS_ALLOWED_RETURN_URLS_EXTRA=${{ vars.KRATOS_ALLOWED_RETURN_URLS_EXTRA }}'
|
||||
assert_contains "$workflow" 'STAGING_PUBLIC_HEALTH_URL=${{ vars.STAGING_PUBLIC_HEALTH_URL }}'
|
||||
assert_contains "$workflow" 'STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS=${{ vars.STAGING_PUBLIC_HEALTH_MAX_ATTEMPTS }}'
|
||||
done
|
||||
|
||||
assert_contains "$staging_pull" 'bash scripts/render_ory_config.sh'
|
||||
assert_contains "$staging_pull" 'chmod -R 777 config/.generated/ory'
|
||||
assert_contains "$staging_pull" 'docker compose -f staging_pull_compose.yaml build --pull'
|
||||
assert_contains "$staging_pull" 'docker compose -f staging_pull_compose.yaml up -d --remove-orphans --renew-anon-volumes'
|
||||
assert_contains "$staging_pull" 'check_container_http baron_gateway 5000'
|
||||
assert_contains "$staging_pull" 'check_public_http "${STAGING_PUBLIC_HEALTH_URL}"'
|
||||
assert_contains "$staging_pull" 'curl -fsS --max-time 10 "${url}"'
|
||||
|
||||
assert_contains "$userfront_dockerfile" "FROM ghcr.io/cirruslabs/flutter:3.38.0 AS build"
|
||||
assert_contains "$userfront_dockerfile" "RUN flutter build web --release --wasm"
|
||||
|
||||
56
test/staging_pull_restart_policy_test.sh
Normal file
56
test/staging_pull_restart_policy_test.sh
Normal file
@@ -0,0 +1,56 @@
|
||||
#!/usr/bin/env sh
|
||||
set -eu
|
||||
|
||||
# Compose template whose services are inspected by the checks below.
compose_file="docker/staging_pull_compose.template.yaml"

# Abort early when the template is not where this test expects it.
[ -f "$compose_file" ] || {
  echo "missing expected file: $compose_file" >&2
  exit 1
}
|
||||
|
||||
# assert_service_has_restart_policy SERVICE
#
# Verifies that SERVICE is defined in $compose_file and that its definition
# contains `restart: always` or `restart: unless-stopped`.
#
# Globals:
#   compose_file (read) - path of the compose file to inspect
# Arguments:
#   $1 - service name as written in the compose file
# Outputs:
#   an error line on stderr when the check fails
# Returns:
#   0 when the policy is present; otherwise exits the script with status 1
#
# NOTE(review): service keys are matched at an indent of exactly two spaces -
# confirm this matches the indentation used by the compose template.
assert_service_has_restart_policy() {
  service="$1"
  policy_rc=0
  awk -v service="$service" '
    # Header line of the requested service.
    $0 ~ "^  " service ":" {
      seen = 1
      in_service = 1
      found = 0
      next
    }
    # The next service header, or any top-level key, ends the definition.
    in_service && (/^  [A-Za-z0-9_-]+:/ || /^[^[:space:]#]/) {
      exit
    }
    in_service && /^[[:space:]]+restart:[[:space:]]+(always|unless-stopped)[[:space:]]*$/ {
      found = 1
    }
    END {
      # A service that never appeared must not pass the check silently.
      if (!seen) {
        exit 2
      }
      exit found ? 0 : 1
    }
  ' "$compose_file" || policy_rc=$?

  case "$policy_rc" in
    0)
      return 0
      ;;
    2)
      echo "ERROR: staging service not found in $compose_file: $service" >&2
      ;;
    *)
      echo "ERROR: long-running staging service must define restart: always or restart: unless-stopped: $service" >&2
      ;;
  esac
  exit 1
}
|
||||
|
||||
# Long-running staging services; each one must declare a restart policy.
required_services="postgres clickhouse redis gateway postgres_ory kratos hydra keto oathkeeper ory_clickhouse backend adminfront devfront orgfront userfront"

# The list is split on whitespace on purpose: every word is one service name.
for svc in $required_services; do
  assert_service_has_restart_policy "$svc"
done

echo "staging pull restart policy checks passed"
|
||||
Reference in New Issue
Block a user