forked from baron/baron-sso
백업/복구로직 변경, 깜빡임 버그 해결
This commit is contained in:
460
scripts/backup/restore.sh
Executable file
460
scripts/backup/restore.sh
Executable file
@@ -0,0 +1,460 @@
#!/usr/bin/env bash
#
# Restore a backup (directory or archive), verify row counts on the restored
# targets and write a JSON restore report.
#
# Usage: restore.sh [--dry-run]
#
# Environment read by this script:
#   BACKUP                   backup directory (mutually exclusive with DUMP_FILE)
#   DUMP_FILE                .tar.zst / .tar.gz / .tgz archive of a backup
#   CONFIRM_RESTORE          must equal "baron-sso"
#   RESTORE_SERVICES         service filter, defaults to "all"
#   ALLOW_NON_EMPTY_RESTORE  "true" disables the non-empty target guard
#   RESTORE_REPORT           optional explicit path of the JSON report
set -Eeuo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$script_dir/lib/common.sh"
source "$script_dir/lib/postgres.sh"
source "$script_dir/lib/clickhouse.sh"
source "$script_dir/lib/config.sh"
source "$script_dir/lib/report.sh"

# Only the first positional argument is inspected.
dry_run=false
if [[ "${1:-}" == "--dry-run" ]]; then
  dry_run=true
fi

repo_root="$(backup_repo_root)"
backup_input="${BACKUP:-}"
dump_file="${DUMP_FILE:-}"
backup_source="directory"
temp_extract_dir=""

# State shared with write_restore_report / on_restore_error.
report_path=""
report_started_at="$(backup_utc_now)"
report_status="started"
report_message=""
dump_checksum_status="not_run"
target_verification_status="not_run"
target_verification_reports="[]"
#######################################
# Convert a whitespace-separated word list into a compact JSON array of strings.
# Arguments: $1 - words separated by spaces, tabs or newlines (may be empty)
# Outputs:   one line of JSON on stdout, e.g. ["postgres","config"];
#            "[]" when there are no words
#######################################
json_array_from_words() {
  local words="$1"
  local -a items=()

  # read -a splits on IFS without pathname expansion, so a word such as "*" is
  # kept literally instead of being replaced by file names. -d '' makes read
  # consume newlines too; it then returns non-zero at end of input, hence the
  # "|| true".
  IFS=$' \t\n' read -r -d '' -a items <<<"$words" || true

  if (( ${#items[@]} == 0 )); then
    printf '[]\n'
    return
  fi

  printf '%s\n' "${items[@]}" | jq -R . | jq -sc .
}
#######################################
# Write the JSON restore report to $report_path and render its markdown
# companion via write_restore_markdown_report. Does nothing when no report
# path has been chosen yet.
# Globals (read): report_path, report_started_at, backup_source, backup_dir,
#                 dump_file, services, allow_non_empty, dry_run,
#                 dump_checksum_status, target_verification_status,
#                 target_verification_reports
# Arguments: $1 - status string stored in the report
#            $2 - optional free-text message
# Returns:   non-zero when the report cannot be produced
#######################################
write_restore_report() {
  local status="$1"
  local message="${2:-}"
  local finished_at
  local services_json
  local restore_policy_json="{}"
  local manifest_path="${backup_dir:-}/manifest.json"
  local pending_report

  [[ -n "$report_path" ]] || return 0

  finished_at="$(backup_utc_now)"
  services_json="$(json_array_from_words "${services:-}")"

  if [[ -n "${backup_dir:-}" && -f "$manifest_path" ]]; then
    # The policy is informational only. This function also runs from the error
    # handler, so an unreadable manifest must not prevent the report itself.
    restore_policy_json="$(jq -c '.restore_policy // {}' "$manifest_path" 2>/dev/null)" \
      || restore_policy_json="{}"
    [[ -n "$restore_policy_json" ]] || restore_policy_json="{}"
  fi

  mkdir -p "$(dirname "$report_path")"

  # Build the report next to its destination and move it into place so a jq
  # failure never leaves a truncated report behind.
  pending_report="${report_path}.tmp.$$"
  if ! jq -n \
    --arg format_version "1" \
    --arg started_at "$report_started_at" \
    --arg finished_at "$finished_at" \
    --arg status "$status" \
    --arg message "$message" \
    --arg backup_source "$backup_source" \
    --arg backup_dir "${backup_dir:-}" \
    --arg dump_file "$dump_file" \
    --argjson services "$services_json" \
    --arg allow_non_empty_restore "${allow_non_empty:-false}" \
    --arg dry_run "$dry_run" \
    --arg dump_checksum "$dump_checksum_status" \
    --arg target_row_counts "$target_verification_status" \
    --argjson target_reports "$target_verification_reports" \
    --argjson restore_policy "$restore_policy_json" \
    '{
      format_version: $format_version,
      started_at: $started_at,
      finished_at: $finished_at,
      status: $status,
      message: $message,
      backup_source: $backup_source,
      backup_dir: $backup_dir,
      dump_file: (if $dump_file == "" then null else $dump_file end),
      services: $services,
      allow_non_empty_restore: ($allow_non_empty_restore == "true"),
      dry_run: ($dry_run == "true"),
      restore_policy: $restore_policy,
      verification: {
        dump_checksum: $dump_checksum,
        target_row_counts: $target_row_counts,
        target_reports: $target_reports
      }
    }' >"$pending_report"; then
    rm -f -- "$pending_report"
    return 1
  fi
  mv -f -- "$pending_report" "$report_path"

  write_restore_markdown_report "$report_path"
}
#######################################
# Remove the temporary directory an archive was extracted into, if any.
# Safe to call repeatedly (it is used by both the ERR handler and the EXIT
# trap).
# Globals: temp_extract_dir (read, cleared after removal)
#######################################
cleanup_restore_input() {
  if [[ -n "${temp_extract_dir:-}" ]]; then
    rm -rf -- "$temp_extract_dir"
    temp_extract_dir=""
  fi
}
on_restore_error() {
|
||||
local exit_code=$?
|
||||
write_restore_report "failed" "${report_message:-restore failed}"
|
||||
cleanup_restore_input
|
||||
exit "$exit_code"
|
||||
}
|
||||
|
||||
trap on_restore_error ERR
|
||||
trap cleanup_restore_input EXIT
#######################################
# Decide where the backup to restore lives and set backup_dir accordingly.
# With BACKUP the directory is used as is. With DUMP_FILE the archive is
# extracted into a fresh temporary directory, which must then contain exactly
# one manifest.json; its directory becomes backup_dir.
# Globals: backup_input, dump_file (read)
#          backup_dir, backup_source, temp_extract_dir (written)
# NOTE(review): backup_die / backup_require_path / backup_require_command come
# from the sourced library and presumably abort the script - confirm there.
#######################################
resolve_backup_input() {
  local -a manifest_files=()
  local tmp_parent="${TMPDIR:-/tmp}"

  if [[ -n "$backup_input" && -n "$dump_file" ]]; then
    backup_die "set only one of BACKUP or DUMP_FILE for restore."
  fi

  if [[ -n "$backup_input" ]]; then
    backup_dir="$backup_input"
    backup_require_path "$backup_dir"
    return
  fi

  if [[ -z "$dump_file" ]]; then
    backup_die "BACKUP or DUMP_FILE is required. Example: make restore BACKUP=backups/baron-sso-backup-YYYYMMDD-HHMMSSZ CONFIRM_RESTORE=baron-sso"
  fi

  backup_require_path "$dump_file"
  backup_require_command tar
  temp_extract_dir="$(mktemp -d "${tmp_parent%/}/baron-sso-restore.XXXXXX")"
  backup_source="dump_file"

  case "$dump_file" in
    *.tar.zst)
      backup_require_command zstd
      tar --zstd --no-same-owner -xf "$dump_file" -C "$temp_extract_dir"
      ;;
    *.tar.gz | *.tgz)
      # Same ownership handling as the zstd branch: never restore the owners
      # recorded in the archive, even when running as root.
      tar --no-same-owner -xzf "$dump_file" -C "$temp_extract_dir"
      ;;
    *)
      backup_die "unsupported DUMP_FILE archive format: $dump_file"
      ;;
  esac

  mapfile -t manifest_files < <(find "$temp_extract_dir" -type f -name manifest.json | sort)
  if [[ "${#manifest_files[@]}" -ne 1 ]]; then
    backup_die "DUMP_FILE must contain exactly one backup directory with manifest.json."
  fi

  backup_dir="$(dirname "${manifest_files[0]}")"
}
#######################################
# Quote a string as a PostgreSQL delimited identifier: wrap it in double
# quotes and double every embedded double quote.
# Arguments: $1 - raw identifier
# Outputs:   quoted identifier on stdout (no trailing newline)
#######################################
quote_pg_ident() {
  local raw="$1"
  # Keeping the quote character in a variable avoids backslash-escaped quotes
  # inside a quoted ${var//pat/rep}, which is easy to misread and mis-edit.
  local dq='"'

  printf '"%s"' "${raw//$dq/$dq$dq}"
}
#######################################
# Write "schema.table:count" lines (sorted) for every table listed in
# pg_tables outside pg_catalog and information_schema, using count(*).
# Arguments: $1 - container name passed to docker exec
#            $2 - database user
#            $3 - database password (passed as PGPASSWORD)
#            $4 - database name
#            $5 - output file (truncated first)
# Returns:   non-zero as soon as listing tables or counting one table fails
#######################################
collect_postgres_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local output_file="$5"
  local table_rows
  local schema
  local table
  local count
  local lines=""

  : >"$output_file"

  table_rows="$(docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d "$database" -At -F $'\t' \
    -c "select schemaname, tablename from pg_tables where schemaname not in ('pg_catalog','information_schema') order by 1,2" \
    </dev/null)" || return

  # The loop runs in the current shell (no pipeline subshell) and every query
  # is checked explicitly, so a failed count is reported even when the caller
  # invokes this function from an "if" / "||" context where errexit is off.
  while IFS=$'\t' read -r schema table; do
    [[ -n "$schema" && -n "$table" ]] || continue
    count="$(docker exec -e "PGPASSWORD=$password" "$container" \
      psql -U "$user" -d "$database" -At \
      -c "select count(*) from $(quote_pg_ident "$schema").$(quote_pg_ident "$table")" \
      </dev/null)" || return
    lines+="${schema}.${table}:${count}"$'\n'
  done <<<"$table_rows"

  printf '%s' "$lines" | sort >"$output_file"
}
#######################################
# Compute the row counts a dump file contains by restoring it into a
# throw-away database inside the container and counting every table there.
# Arguments: $1 - container name passed to docker exec
#            $2 - database user
#            $3 - database password (passed as PGPASSWORD)
#            $4 - name of the real database (only used to name the scratch db)
#            $5 - dump file fed to pg_restore on stdin
#            $6 - output file for "schema.table:count" lines
# Returns:   non-zero when restoring, counting or dropping the scratch
#            database fails; the scratch database is dropped in every case
#######################################
collect_postgres_dump_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local dump_path="$5"
  local output_file="$6"
  local scratch_db
  local scratch_ident
  local status=0
  local drop_status=0

  backup_require_path "$dump_path"
  # Timestamp plus PID keeps concurrent or repeated runs from colliding.
  scratch_db="${database}_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  scratch_ident="$(quote_pg_ident "$scratch_db")"

  docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)" \
    -c "create database ${scratch_ident}"

  # From here on failures are collected instead of aborting (the script's ERR
  # trap exits immediately), so the scratch database is always dropped below.
  docker exec -i -e "PGPASSWORD=$password" "$container" \
    pg_restore -U "$user" -d "$scratch_db" --clean --if-exists <"$dump_path" || status=$?

  if (( status == 0 )); then
    collect_postgres_exact_row_counts "$container" "$user" "$password" "$scratch_db" "$output_file" \
      || status=$?
  fi

  # errexit is suppressed inside the call above, so double-check that every
  # line really carries a numeric count.
  if (( status == 0 )) && grep -Evq ':[0-9]+$' "$output_file"; then
    status=1
  fi

  docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)" || drop_status=$?

  if (( status == 0 )); then
    status=$drop_status
  fi
  return "$status"
}
#######################################
# Write "database.table:count" lines (sorted) for every table named in a
# tab-separated table list, using count() through clickhouse-client.
# Arguments: $1 - container name passed to docker exec
#            $2 - ClickHouse user
#            $3 - ClickHouse password
#            $4 - table list file: database<TAB>table<TAB>engine per line
#            $5 - output file (truncated first)
# Returns:   non-zero when the list is unreadable or any count query fails
#######################################
collect_clickhouse_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local table_list="$4"
  local output_file="$5"
  local database
  local table
  local _engine # read only so the third column does not end up in $table
  local count
  local lines=""

  : >"$output_file"
  if [[ ! -r "$table_list" ]]; then
    printf 'cannot read table list: %s\n' "$table_list" >&2
    return 1
  fi

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS=$'\t' read -r database table _engine || [[ -n "$database" ]]; do
    [[ -n "$database" && -n "$table" ]] || continue
    count="$(docker exec "$container" clickhouse-client --user "$user" --password "$password" \
      --query "select count() from \`${database}\`.\`${table}\`" </dev/null)" || return
    lines+="${database}.${table}:${count}"$'\n'
  done <"$table_list"

  printf '%s' "$lines" | sort >"$output_file"
}
#######################################
# Compute the row counts contained in a backup's Native data files by loading
# each file into a table of a throw-away database and counting it there.
# Tables whose engine name contains "View" or "AggregatingMergeTree" are
# skipped.
# Arguments: $1 - container name passed to docker exec
#            $2 - ClickHouse user
#            $3 - ClickHouse password
#            $4 - backup directory holding tables.tsv and data/<db>__<table>.native
#            $5 - output file for "database.table:count" lines (sorted)
# Returns:   non-zero when any step fails; the scratch database is dropped in
#            every case once it has been created
#######################################
collect_clickhouse_native_stable_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local input_dir="$4"
  local output_file="$5"
  local scratch_db
  local database
  local table
  local engine
  local safe_name
  local native_file
  local status=0
  local drop_status=0

  scratch_db="$(basename "$input_dir")_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  : >"$output_file"

  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "drop database if exists \`${scratch_db}\`"
  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "create database \`${scratch_db}\`"

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS=$'\t' read -r database table engine || [[ -n "$database" ]]; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi

    safe_name="${database}__${table}"
    native_file="$input_dir/data/${safe_name}.native"
    backup_require_path "$native_file"

    # The scratch table is named after database AND table so that two source
    # databases with a same-named table do not collide. Failures are collected
    # rather than aborting so the scratch database is dropped below.
    docker exec "$container" clickhouse-client --user "$user" --password "$password" \
      --query "create table \`${scratch_db}\`.\`${safe_name}\` as \`${database}\`.\`${table}\`" </dev/null \
      && docker exec -i "$container" clickhouse-client --user "$user" --password "$password" \
        --query "insert into \`${scratch_db}\`.\`${safe_name}\` format Native" <"$native_file" \
      && docker exec "$container" clickhouse-client --user "$user" --password "$password" \
        --query "select '${database}.${table}:' || toString(count()) from \`${scratch_db}\`.\`${safe_name}\`" \
        </dev/null >>"$output_file" \
      || status=$?
    (( status == 0 )) || break
  done <"$input_dir/tables.tsv"

  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "drop database if exists \`${scratch_db}\`" || drop_status=$?

  if (( status == 0 )); then
    status=$drop_status
  fi
  (( status == 0 )) || return "$status"

  sort -o "$output_file" "$output_file"
}
#######################################
# Copy from a "database.table:count" file only the lines that belong to
# tables whose engine name contains neither "View" nor
# "AggregatingMergeTree", and sort the result.
# Arguments: $1 - table list file: database<TAB>table<TAB>engine per line
#            $2 - counts file to filter
#            $3 - output file (truncated first)
# Returns:   non-zero when an input file cannot be read
#######################################
filter_clickhouse_stable_row_counts() {
  local table_list="$1"
  local counts_file="$2"
  local output_file="$3"
  local database
  local table
  local engine

  : >"$output_file"

  # "|| [[ -n ... ]]" keeps a final line that lacks a trailing newline.
  while IFS=$'\t' read -r database table engine || [[ -n "$database" ]]; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    # Match the key as a line PREFIX: a plain substring search for "db.t:"
    # would also pick up "otherdb.t:...". The key travels through the
    # environment so awk does not interpret backslashes in it. awk exits 0
    # when nothing matches, so no "|| true" is needed and real read errors
    # are no longer hidden.
    row_prefix="${database}.${table}:" \
      awk 'index($0, ENVIRON["row_prefix"]) == 1' "$counts_file" >>"$output_file"
  done <"$table_list"

  sort -o "$output_file" "$output_file"
}
compare_row_count_report() {
|
||||
local label="$1"
|
||||
local expected_file="$2"
|
||||
local actual_file="$3"
|
||||
local diff_file="$4"
|
||||
|
||||
backup_require_path "$expected_file"
|
||||
if diff -u <(sort "$expected_file") <(sort "$actual_file") >"$diff_file"; then
|
||||
jq -n \
|
||||
--arg label "$label" \
|
||||
--arg expected "$expected_file" \
|
||||
--arg actual "$actual_file" \
|
||||
--arg status "passed" \
|
||||
'{label:$label, expected:$expected, actual:$actual, status:$status}'
|
||||
else
|
||||
jq -n \
|
||||
--arg label "$label" \
|
||||
--arg expected "$expected_file" \
|
||||
--arg actual "$actual_file" \
|
||||
--arg diff "$diff_file" \
|
||||
--arg status "failed" \
|
||||
'{label:$label, expected:$expected, actual:$actual, diff:$diff, status:$status}'
|
||||
return 1
|
||||
fi
|
||||
}
# Run compare_row_count_report but always succeed: a mismatch is carried by the
# "status" field of the printed item. A non-zero exit would fire the ERR trap
# inside the command substitution that captures the item and abort the run
# before the item could be recorded.
_restore_compare_item() {
  compare_row_count_report "$@" || true
}

#######################################
# Compare every restored target with the backup it was restored from and
# record one JSON item per target.
# Globals (read):    report_path, services, backup_dir, repo_root and the
#                    DB_*, ORY_*, CLICKHOUSE_*, KRATOS_DB/HYDRA_DB/KETO_DB
#                    environment overrides
# Globals (written): target_verification_reports, target_verification_status,
#                    report_message (only on failure)
# Returns:   0 when every item has status "passed", 1 otherwise
# NOTE(review): the fallback credentials below are the defaults used in the
# original script; real deployments are expected to override them.
#######################################
verify_restored_targets() {
  local report_dir
  local report_items=()
  local item
  local expected
  local actual
  local diff_file
  local db_name
  local ch_service
  local ch_container
  local ch_user
  local ch_password
  local ch_input
  local all_counts

  report_dir="$(dirname "$report_path")/restore-targets-$(date -u '+%Y%m%d-%H%M%SZ')"
  mkdir -p "$report_dir"

  if service_enabled postgres "$services"; then
    expected="$report_dir/baron-postgres-expected-row-counts.txt"
    actual="$report_dir/baron-postgres-row-counts.txt"
    diff_file="$report_dir/baron-postgres-row-counts.diff"
    collect_postgres_dump_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$backup_dir/postgres/baron.dump" "$expected"
    collect_postgres_exact_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$actual"
    item="$(_restore_compare_item "postgres" "$expected" "$actual" "$diff_file")"
    report_items+=("$item")
  fi

  if service_enabled ory-postgres "$services"; then
    for db_name in "${KRATOS_DB:-ory_kratos}" "${HYDRA_DB:-ory_hydra}" "${KETO_DB:-ory_keto}"; do
      expected="$report_dir/${db_name}-expected-row-counts.txt"
      actual="$report_dir/${db_name}-row-counts.txt"
      diff_file="$report_dir/${db_name}-row-counts.diff"
      collect_postgres_dump_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$backup_dir/postgres/${db_name}.dump" "$expected"
      collect_postgres_exact_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$actual"
      item="$(_restore_compare_item "ory-postgres/$db_name" "$expected" "$actual" "$diff_file")"
      report_items+=("$item")
    done
  fi

  # Both ClickHouse targets follow the same procedure; only the container and
  # its credentials differ.
  for ch_service in clickhouse ory-clickhouse; do
    service_enabled "$ch_service" "$services" || continue
    if [[ "$ch_service" == "clickhouse" ]]; then
      ch_container="baron_clickhouse"
      ch_user="${CLICKHOUSE_USER:-baron}"
      ch_password="${CLICKHOUSE_PASSWORD:-password}"
    else
      ch_container="ory_clickhouse"
      ch_user="${ORY_CLICKHOUSE_USER:-ory}"
      ch_password="${ORY_CLICKHOUSE_PASSWORD:-orypass}"
    fi

    ch_input="$backup_dir/clickhouse/$ch_container"
    all_counts="$report_dir/${ch_container}-row-counts.txt"
    expected="$report_dir/${ch_container}-stable-expected-row-counts.txt"
    actual="$report_dir/${ch_container}-stable-row-counts.txt"
    diff_file="$report_dir/${ch_container}-row-counts.diff"

    # Counts of every table are kept for reference; only the "stable" subset
    # selected by filter_clickhouse_stable_row_counts is compared.
    collect_clickhouse_exact_row_counts "$ch_container" "$ch_user" "$ch_password" "$ch_input/tables.tsv" "$all_counts"
    collect_clickhouse_native_stable_row_counts "$ch_container" "$ch_user" "$ch_password" "$ch_input" "$expected"
    filter_clickhouse_stable_row_counts "$ch_input/tables.tsv" "$all_counts" "$actual"
    item="$(_restore_compare_item "$ch_service" "$expected" "$actual" "$diff_file")"
    report_items+=("$item")
  done

  if service_enabled config "$services"; then
    backup_require_path "$repo_root/config-restored"
    item="$(jq -n \
      --arg label "config" \
      --arg actual "$repo_root/config-restored" \
      --arg status "passed" \
      '{label:$label, actual:$actual, status:$status}')"
    report_items+=("$item")
  fi

  # The ${arr[@]+...} form keeps an empty array from tripping "set -u" on
  # older bash versions.
  target_verification_reports="$(printf '%s\n' ${report_items[@]+"${report_items[@]}"} | jq -s '.')"

  # Every recorded item must be present in the JSON (an empty item would be
  # silently dropped by jq) and must have passed.
  if ! jq -e --argjson expected_items "${#report_items[@]}" \
    'length == $expected_items and (map(select(.status != "passed")) | length == 0)' \
    <<<"$target_verification_reports" >/dev/null; then
    target_verification_status="failed"
    report_message="restore completed but target row-count verification failed"
    return 1
  fi

  target_verification_status="passed"
}
# ---------------------------------------------------------------------------
# Main flow: resolve input -> choose report path -> safety checks ->
# checksum verification -> per-service restore -> target verification.
# ---------------------------------------------------------------------------
resolve_backup_input

# Report location: explicit override, else next to the repo for archives
# (their extraction directory is temporary), else inside the backup directory.
if [[ -n "${RESTORE_REPORT:-}" ]]; then
  report_path="$RESTORE_REPORT"
elif [[ "$backup_source" == "dump_file" ]]; then
  archive_name="$(basename "$dump_file")"
  archive_name="${archive_name%.tar.zst}"
  archive_name="${archive_name%.tar.gz}"
  archive_name="${archive_name%.tgz}"
  report_path="$repo_root/reports/restore/${archive_name}-restore-report.json"
else
  report_path="$backup_dir/reports/restore-report.json"
fi

if [[ "${CONFIRM_RESTORE:-}" != "baron-sso" ]]; then
  backup_die "CONFIRM_RESTORE=baron-sso is required for restore."
fi

services="$(normalize_service_filter "${RESTORE_SERVICES:-all}")"
allow_non_empty="${ALLOW_NON_EMPTY_RESTORE:-false}"

# RESTORE_TEST_NON_EMPTY=1 simulates a non-empty target without touching any
# container.
if [[ "${RESTORE_TEST_NON_EMPTY:-}" == "1" && "$allow_non_empty" != "true" ]]; then
  backup_die "non-empty restore target is not allowed by default. Set ALLOW_NON_EMPTY_RESTORE=true only for an approved restore rehearsal."
fi

if [[ "$dry_run" == "true" ]]; then
  backup_log "Restore plan for $backup_dir"
  backup_log "Services: $services"
  backup_log "ALLOW_NON_EMPTY_RESTORE=$allow_non_empty"
  backup_log "RESTORE_REPORT=$report_path"
  write_restore_report "planned" "restore dry-run completed"
  exit 0
fi

if [[ "$allow_non_empty" != "true" ]]; then
  if service_enabled postgres "$services" && postgres_target_has_data baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}"; then
    backup_die "non-empty restore target is not allowed by default: baron_postgres/${DB_NAME:-baron_sso}"
  fi

  # Guard every Ory database that is verified after the restore, not just
  # Kratos, so existing Hydra/Keto data is protected as well.
  # NOTE(review): assumes postgres_target_has_data returns non-zero (rather
  # than aborting) for an empty or missing database - confirm in lib/postgres.sh.
  if service_enabled ory-postgres "$services"; then
    for ory_db in "${KRATOS_DB:-ory_kratos}" "${HYDRA_DB:-ory_hydra}" "${KETO_DB:-ory_keto}"; do
      if postgres_target_has_data ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$ory_db"; then
        backup_die "non-empty restore target is not allowed by default: ory_postgres/$ory_db"
      fi
    done
  fi
fi

BACKUP="$backup_dir" "$script_dir/verify-dump.sh"
dump_checksum_status="passed"

if service_enabled postgres "$services"; then
  restore_baron_postgres "$backup_dir"
fi

if service_enabled ory-postgres "$services"; then
  restore_ory_postgres "$backup_dir"
fi

if service_enabled clickhouse "$services"; then
  restore_baron_clickhouse "$backup_dir"
fi

if service_enabled ory-clickhouse "$services"; then
  restore_ory_clickhouse "$backup_dir"
fi

if service_enabled config "$services"; then
  restore_config_snapshot "$backup_dir"
fi

verify_restored_targets
write_restore_report "succeeded" "restore completed and target row-count verification passed"

backup_log "Restore complete. Keep WORKS relay disabled until comparison dry-run passes."
backup_log "Restore report: $report_path"
Reference in New Issue
Block a user