1
0
forked from baron/baron-sso
Files
baron-sso/scripts/backup/restore.sh
2026-06-12 18:36:18 +09:00

494 lines
17 KiB
Bash
Executable File

#!/usr/bin/env bash
# Restore a baron-sso backup into the target services and write a restore report.
#
# Usage: restore.sh [--dry-run]
# Inputs are taken from the environment (see the assignments below):
#   RESTORE_INPUT / FILE_PATH  backup directory or archive
#   BACKUP                     backup directory
#   DUMP_FILE                  backup archive
#
# Strict mode: -E lets functions inherit the ERR trap, -e aborts on unhandled
# failures, -u rejects unset variables, pipefail surfaces failures inside pipes.
set -Eeuo pipefail
# Directory containing this script; the helper libraries live in lib/ next to it.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# Project helper libraries. Functions prefixed backup_* and the restore_*,
# service_enabled, and write_restore_markdown_report helpers used below are not
# defined in this file — presumably they come from these libraries.
source "$script_dir/lib/common.sh"
source "$script_dir/lib/postgres.sh"
source "$script_dir/lib/clickhouse.sh"
source "$script_dir/lib/config.sh"
source "$script_dir/lib/report.sh"
# Only the first positional argument is inspected for --dry-run.
dry_run=false
if [[ "${1:-}" == "--dry-run" ]]; then
dry_run=true
fi
repo_root="$(backup_repo_root)"
# Raw restore inputs; resolve_backup_input enforces that at most one is set.
restore_input="${RESTORE_INPUT:-${FILE_PATH:-}}"
backup_input="${BACKUP:-}"
dump_file="${DUMP_FILE:-}"
# "directory" or "dump_file"; updated by resolve_backup_input.
backup_source="directory"
# Set when an archive is extracted; removed by cleanup_restore_input.
temp_extract_dir=""
# Report state shared with write_restore_report. An empty report_path makes
# write_restore_report a no-op.
report_path=""
report_started_at="$(backup_utc_now)"
report_status="started"
report_message=""
dump_checksum_status="not_run"
target_verification_status="not_run"
# JSON array (as text) of per-target verification results.
target_verification_reports="[]"
#######################################
# Convert a whitespace-separated word list into a compact JSON array of strings.
# Arguments: $1 - words separated by spaces, tabs, or newlines (may be empty)
# Outputs:   one line of JSON on stdout, e.g. ["postgres","config"]; "[]" when
#            the input is empty or contains only whitespace
# Returns:   jq's exit status when there is at least one word, otherwise 0
#######################################
json_array_from_words() {
  local words="$1"
  local IFS=$' \t\n'
  local -a items=()

  # Split with read instead of an unquoted expansion so that words such as "*"
  # are never glob-expanded against the current directory. read hits EOF
  # without finding the NUL delimiter and therefore returns non-zero; the
  # array is still populated, so that status is deliberately ignored.
  read -r -d '' -a items <<<"$words" || true

  if [[ "${#items[@]}" -eq 0 ]]; then
    printf '[]\n'
    return
  fi

  printf '%s\n' "${items[@]}" | jq -Rnc '[inputs]'
}
#######################################
# Write the JSON restore report and hand it to the markdown renderer.
# Globals (read): report_path, report_started_at, backup_source, backup_dir,
#                 dump_file, services, allow_non_empty, dry_run,
#                 dump_checksum_status, target_verification_status,
#                 target_verification_reports
# Arguments: $1 - report status string
#            $2 - optional human-readable message
# Outputs:   writes the JSON document to $report_path
# Returns:   0 immediately when report_path is empty
#######################################
write_restore_report() {
  local status="$1"
  local message="${2:-}"
  local finished_at
  local services_json
  local manifest_path
  local restore_policy_json="{}"
  local -a report_args

  # No report destination has been chosen yet: nothing to write.
  if [[ -z "$report_path" ]]; then
    return 0
  fi

  finished_at="$(backup_utc_now)"
  services_json="$(json_array_from_words "${services:-}")"

  # Copy the restore policy from the backup manifest when one is available.
  manifest_path="${backup_dir:-}/manifest.json"
  if [[ -n "${backup_dir:-}" && -f "$manifest_path" ]]; then
    restore_policy_json="$(jq -c '.restore_policy // {}' "$manifest_path")"
  fi

  report_args=(
    --arg format_version "1"
    --arg started_at "$report_started_at"
    --arg finished_at "$finished_at"
    --arg status "$status"
    --arg message "$message"
    --arg backup_source "$backup_source"
    --arg backup_dir "${backup_dir:-}"
    --arg dump_file "$dump_file"
    --argjson services "$services_json"
    --arg allow_non_empty_restore "${allow_non_empty:-false}"
    --arg dry_run "$dry_run"
    --arg dump_checksum "$dump_checksum_status"
    --arg target_row_counts "$target_verification_status"
    --argjson target_reports "$target_verification_reports"
    --argjson restore_policy "$restore_policy_json"
  )

  mkdir -p "$(dirname "$report_path")"
  # Booleans arrive as strings and are converted inside the jq program; an
  # empty dump_file becomes JSON null.
  jq -n "${report_args[@]}" '{
format_version: $format_version,
started_at: $started_at,
finished_at: $finished_at,
status: $status,
message: $message,
backup_source: $backup_source,
backup_dir: $backup_dir,
dump_file: (if $dump_file == "" then null else $dump_file end),
services: $services,
allow_non_empty_restore: ($allow_non_empty_restore == "true"),
dry_run: ($dry_run == "true"),
restore_policy: $restore_policy,
verification: {
dump_checksum: $dump_checksum,
target_row_counts: $target_row_counts,
target_reports: $target_reports
}
}' >"$report_path"
  write_restore_markdown_report "$report_path"
}
#######################################
# Remove the temporary archive extraction directory, if one was created.
# Globals (read): temp_extract_dir
# Returns: 0 when there is nothing to remove, otherwise rm's exit status
#######################################
cleanup_restore_input() {
  # Nothing was extracted, so there is nothing to clean up.
  if [[ -z "$temp_extract_dir" ]]; then
    return 0
  fi
  rm -rf "$temp_extract_dir"
}
#######################################
# ERR trap handler: write a "failed" report, clean up, and exit with the
# status of the command that triggered the trap.
# Globals (read): report_message (falls back to "restore failed" when empty)
#######################################
on_restore_error() {
  # Must be the very first statement: any other command would overwrite $?.
  local exit_code=$?
  local failure_message="${report_message:-restore failed}"

  write_restore_report "failed" "$failure_message"
  cleanup_restore_input
  exit "$exit_code"
}
# On any unhandled command failure, write a failed report and exit with the
# failing status (set -E at the top makes functions inherit this trap).
trap on_restore_error ERR
# On every exit path, remove the temporary extraction directory if one exists.
trap cleanup_restore_input EXIT
#######################################
# Resolve the restore input into a backup directory.
#
# Exactly one of restore_input, backup_input, or dump_file may be set. A
# directory is used as-is; an archive (.tar.zst, .tar.gz, .tgz, .zip) is
# extracted into a fresh temporary directory that must contain exactly one
# manifest.json.
#
# Globals (read):    restore_input, backup_input, TMPDIR
# Globals (written): backup_dir, backup_source, dump_file, temp_extract_dir
# Returns: 0 on success; failures are reported through backup_die and the
#          backup_require_* helpers.
#######################################
resolve_backup_input() {
  local input_count=0
  local candidate
  # Kept local so the manifest list does not leak into the global scope.
  local -a manifest_files=()

  for candidate in "$restore_input" "$backup_input" "$dump_file"; do
    if [[ -n "$candidate" ]]; then
      input_count=$((input_count + 1))
    fi
  done
  if [[ "$input_count" -gt 1 ]]; then
    backup_die "set only one restore input: RESTORE_INPUT, BACKUP, or DUMP_FILE."
  fi

  if [[ -n "$restore_input" ]]; then
    backup_require_path "$restore_input"
    if [[ -d "$restore_input" ]]; then
      backup_dir="$restore_input"
      backup_source="directory"
      return
    fi
    if [[ ! -f "$restore_input" ]]; then
      backup_die "restore input must be a backup directory or supported archive: $restore_input"
    fi
    # A regular file is handled like DUMP_FILE from here on.
    case "$restore_input" in
      *.tar.zst | *.tar.gz | *.tgz | *.zip)
        dump_file="$restore_input"
        ;;
      *)
        backup_die "unsupported restore input file extension: $restore_input"
        ;;
    esac
  fi

  if [[ -n "$backup_input" ]]; then
    backup_dir="$backup_input"
    backup_require_path "$backup_dir"
    return
  fi

  if [[ -z "$dump_file" ]]; then
    backup_die "BACKUP or DUMP_FILE is required. Example: make restore BACKUP=backups/baron-sso-backup-YYYYMMDD-HHMMSSZ CONFIRM_RESTORE=baron-sso"
  fi

  backup_require_path "$dump_file"
  backup_require_command tar
  # Honour TMPDIR so operators can point large extractions away from /tmp.
  temp_extract_dir="$(mktemp -d "${TMPDIR:-/tmp}/baron-sso-restore.XXXXXX")"
  backup_source="dump_file"

  case "$dump_file" in
    *.tar.zst)
      backup_require_command zstd
      tar --zstd --no-same-owner -xf "$dump_file" -C "$temp_extract_dir"
      ;;
    *.tar.gz | *.tgz)
      # Same ownership handling as the zstd branch: never apply the owners
      # recorded in the archive, even when running as root.
      tar --no-same-owner -xzf "$dump_file" -C "$temp_extract_dir"
      ;;
    *.zip)
      backup_require_command unzip
      unzip -q "$dump_file" -d "$temp_extract_dir"
      ;;
    *)
      backup_die "unsupported DUMP_FILE archive format: $dump_file"
      ;;
  esac

  mapfile -t manifest_files < <(find "$temp_extract_dir" -type f -name manifest.json | sort)
  if [[ "${#manifest_files[@]}" -ne 1 ]]; then
    backup_die "DUMP_FILE must contain exactly one backup directory with manifest.json."
  fi
  backup_dir="$(dirname "${manifest_files[0]}")"
}
#######################################
# Quote a string as a PostgreSQL identifier.
# Arguments: $1 - raw identifier
# Outputs:   the identifier wrapped in double quotes, with every embedded
#            double quote doubled; no trailing newline
#######################################
quote_pg_ident() {
  local raw="$1"
  local dq='"'
  local escaped="${raw//$dq/$dq$dq}"

  printf '"%s"' "$escaped"
}
#######################################
# Write exact per-table row counts of a PostgreSQL database to a file.
#
# Lists every table outside pg_catalog and information_schema, runs
# count(*) on each one, and writes sorted "schema.table:count" lines.
#
# Arguments: $1 - container name passed to docker exec
#            $2 - database user
#            $3 - database password
#            $4 - database name
#            $5 - output file (truncated first)
# Returns:   non-zero when listing, counting, or sorting fails
#            (the script runs with pipefail)
#######################################
collect_postgres_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local output_file="$5"
  local schema
  local table
  local quoted_schema
  local quoted_table
  local count

  : >"$output_file"
  # The password is handed to docker through the client environment
  # ("-e PGPASSWORD" without a value forwards the caller's variable) so it
  # never appears in the docker command line.
  PGPASSWORD="$password" docker exec -e PGPASSWORD "$container" \
    psql -U "$user" -d "$database" -v ON_ERROR_STOP=1 -At -F $'\t' \
    -c "select schemaname, tablename from pg_tables where schemaname not in ('pg_catalog','information_schema') order by 1,2" \
    | while IFS=$'\t' read -r schema table; do
      [[ -n "$schema" && -n "$table" ]] || continue
      quoted_schema="$(quote_pg_ident "$schema")"
      quoted_table="$(quote_pg_ident "$table")"
      count="$(PGPASSWORD="$password" docker exec -e PGPASSWORD "$container" \
        psql -U "$user" -d "$database" -v ON_ERROR_STOP=1 -At \
        -c "select count(*) from ${quoted_schema}.${quoted_table}")"
      printf '%s.%s:%s\n' "$schema" "$table" "$count"
    done | sort >"$output_file"
}
#######################################
# Compute the row counts a PostgreSQL dump is expected to produce.
#
# Restores the dump into a throw-away scratch database inside the container,
# collects exact row counts from it, and drops the scratch database again.
# The scratch database is dropped even when the restore or the counting
# fails, so a failed verification does not leave stray databases behind.
#
# Arguments: $1 - container name passed to docker exec
#            $2 - database user
#            $3 - database password
#            $4 - source database name (used as the scratch name prefix)
#            $5 - dump file readable by pg_restore
#            $6 - output file for "schema.table:count" lines
# Returns:   0 on success; otherwise the status of the first failing step
#######################################
collect_postgres_dump_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local dump_path="$5"
  local output_file="$6"
  local scratch_db
  local scratch_ident
  local status=0

  backup_require_path "$dump_path"
  # Timestamp plus PID keeps concurrent or repeated runs from colliding.
  scratch_db="${database}_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  scratch_ident="$(quote_pg_ident "$scratch_db")"

  # The password travels through the docker client environment
  # ("-e PGPASSWORD" without a value), not through the command line.
  PGPASSWORD="$password" docker exec -e PGPASSWORD "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)" \
    -c "create database ${scratch_ident}"

  # From here on the scratch database exists: record failures instead of
  # aborting so the drop below always runs.
  PGPASSWORD="$password" docker exec -i -e PGPASSWORD "$container" \
    pg_restore -U "$user" -d "$scratch_db" --clean --if-exists <"$dump_path" \
    || status=$?
  if [[ "$status" -eq 0 ]]; then
    collect_postgres_exact_row_counts "$container" "$user" "$password" "$scratch_db" "$output_file" \
      || status=$?
  fi

  if ! PGPASSWORD="$password" docker exec -e PGPASSWORD "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)"; then
    # Keep the earlier failure status when there is one.
    [[ "$status" -ne 0 ]] || status=1
  fi

  return "$status"
}
#######################################
# Write exact row counts for every table named in a ClickHouse table list.
#
# Arguments: $1 - container name passed to docker exec
#            $2 - ClickHouse user
#            $3 - ClickHouse password
#            $4 - TSV file with database, table, engine columns
#            $5 - output file for sorted "database.table:count" lines
#######################################
collect_clickhouse_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local table_list="$4"
  local output_file="$5"
  local db_name
  local table_name
  local table_engine
  local row_total
  # Shared client invocation; only the query differs per table.
  local -a client=(docker exec "$container" clickhouse-client --user "$user" --password "$password")

  : >"$output_file"
  # The engine column is read only to keep the table name free of trailing
  # fields; it is not used for counting.
  while IFS=$'\t' read -r db_name table_name table_engine; do
    if [[ -z "$db_name" || -z "$table_name" ]]; then
      continue
    fi
    row_total="$("${client[@]}" \
      --query "select count() from \`${db_name}\`.\`${table_name}\`")"
    printf '%s.%s:%s\n' "$db_name" "$table_name" "$row_total"
  done <"$table_list" | sort >"$output_file"
}
#######################################
# Compute the row counts the ClickHouse native dump files are expected to
# produce for "stable" tables (engines that are neither views nor
# AggregatingMergeTree).
#
# Each dump file is loaded into a throw-away scratch database and counted.
# Scratch tables are named "<database>__<table>" so that same-named tables
# from different source databases cannot collide. All dump files are checked
# before the scratch database is created, and the scratch database is dropped
# even when loading fails.
#
# Arguments: $1 - container name passed to docker exec
#            $2 - ClickHouse user
#            $3 - ClickHouse password
#            $4 - input directory containing tables.tsv and data/*.native
#            $5 - output file for sorted "database.table:count" lines
# Returns:   0 on success; otherwise the status of the first failing step
#######################################
collect_clickhouse_native_stable_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local input_dir="$4"
  local output_file="$5"
  local scratch_db
  local scratch_table
  local database
  local table
  local engine
  local safe_name
  local status=0
  local -a client=(docker exec "$container" clickhouse-client --user "$user" --password "$password")
  local -a client_stdin=(docker exec -i "$container" clickhouse-client --user "$user" --password "$password")

  # Validate every required dump file before touching the server.
  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    backup_require_path "$input_dir/data/${database}__${table}.native"
  done <"$input_dir/tables.tsv"

  scratch_db="$(basename "$input_dir")_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  : >"$output_file"
  "${client[@]}" --query "drop database if exists \`${scratch_db}\`"
  "${client[@]}" --query "create database \`${scratch_db}\`"

  # From here on the scratch database exists: record the first failure and
  # stop loading, but always fall through to the drop below.
  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    safe_name="${database}__${table}"
    scratch_table="\`${scratch_db}\`.\`${safe_name}\`"
    # Clone the structure of the live table, then load the dumped rows.
    "${client[@]}" \
      --query "create table ${scratch_table} as \`${database}\`.\`${table}\`" \
      || { status=$?; break; }
    "${client_stdin[@]}" \
      --query "insert into ${scratch_table} format Native" <"$input_dir/data/${safe_name}.native" \
      || { status=$?; break; }
    "${client[@]}" \
      --query "select '${database}.${table}:' || toString(count()) from ${scratch_table}" \
      >>"$output_file" \
      || { status=$?; break; }
  done <"$input_dir/tables.tsv"

  if ! "${client[@]}" --query "drop database if exists \`${scratch_db}\`"; then
    # Keep the earlier failure status when there is one.
    [[ "$status" -ne 0 ]] || status=1
  fi
  if [[ "$status" -ne 0 ]]; then
    return "$status"
  fi

  sort -o "$output_file" "$output_file"
}
#######################################
# Keep only the row-count lines that belong to "stable" ClickHouse tables
# (engines that are neither views nor AggregatingMergeTree).
#
# Arguments: $1 - TSV file with database, table, engine columns
#            $2 - file with "database.table:count" lines
#            $3 - output file; receives the sorted matching lines
#######################################
filter_clickhouse_stable_row_counts() {
  local table_list="$1"
  local counts_file="$2"
  local output_file="$3"
  local database
  local table
  local engine

  : >"$output_file"
  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    # Match the key only at the start of the line: a plain substring search
    # for "ory.t:" would also select "history.t:...". The key is passed via
    # the environment so awk does not interpret backslashes in it.
    ROW_COUNT_KEY="${database}.${table}:" \
      awk 'index($0, ENVIRON["ROW_COUNT_KEY"]) == 1' "$counts_file" >>"$output_file"
  done <"$table_list"
  sort -o "$output_file" "$output_file"
}
compare_row_count_report() {
local label="$1"
local expected_file="$2"
local actual_file="$3"
local diff_file="$4"
backup_require_path "$expected_file"
if diff -u <(sort "$expected_file") <(sort "$actual_file") >"$diff_file"; then
jq -n \
--arg label "$label" \
--arg expected "$expected_file" \
--arg actual "$actual_file" \
--arg status "passed" \
'{label:$label, expected:$expected, actual:$actual, status:$status}'
else
jq -n \
--arg label "$label" \
--arg expected "$expected_file" \
--arg actual "$actual_file" \
--arg diff "$diff_file" \
--arg status "failed" \
'{label:$label, expected:$expected, actual:$actual, diff:$diff, status:$status}'
return 1
fi
}
#######################################
# Helper for verify_restored_targets: run one row-count comparison and record
# its JSON result.
#
# Relies on bash dynamic scoping: appends to the caller's report_items array
# and sets the caller's verification_failed flag on mismatch.
# Arguments: passed through to compare_row_count_report
# Returns:   always 0, so a mismatch does not abort the run before the result
#            has been recorded
#######################################
record_row_count_comparison() {
  local item

  if ! item="$(compare_row_count_report "$@")"; then
    verification_failed=true
  fi
  if [[ -n "$item" ]]; then
    report_items+=("$item")
  fi
}

#######################################
# Verify every restored target against the backup it was restored from.
#
# For each enabled service the expected row counts (derived from the dump)
# are compared with the counts found in the restored target. All enabled
# targets are checked even after a mismatch, and every result — passed or
# failed — is stored so the restore report shows what went wrong.
#
# Globals (read):    report_path, services, backup_dir, repo_root and the
#                    credential environment variables with their defaults
# Globals (written): target_verification_reports (JSON array text),
#                    target_verification_status ("passed" or "failed"),
#                    report_message (only on failure)
# Returns: 0 when all comparisons pass, 1 when at least one fails
#######################################
verify_restored_targets() {
  local report_dir
  local -a report_items=()
  local verification_failed=false
  local item
  local expected
  local actual
  local full_counts
  local diff_file
  local db_name

  report_dir="$(dirname "$report_path")/restore-targets-$(date -u '+%Y%m%d-%H%M%SZ')"
  mkdir -p "$report_dir"

  if service_enabled postgres "$services"; then
    expected="$report_dir/baron-postgres-expected-row-counts.txt"
    actual="$report_dir/baron-postgres-row-counts.txt"
    diff_file="$report_dir/baron-postgres-row-counts.diff"
    collect_postgres_dump_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$backup_dir/postgres/baron.dump" "$expected"
    collect_postgres_exact_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$actual"
    record_row_count_comparison "postgres" "$expected" "$actual" "$diff_file"
  fi

  if service_enabled ory-postgres "$services"; then
    for db_name in "${KRATOS_DB:-ory_kratos}" "${HYDRA_DB:-ory_hydra}" "${KETO_DB:-ory_keto}"; do
      expected="$report_dir/${db_name}-expected-row-counts.txt"
      actual="$report_dir/${db_name}-row-counts.txt"
      diff_file="$report_dir/${db_name}-row-counts.diff"
      collect_postgres_dump_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$backup_dir/postgres/${db_name}.dump" "$expected"
      collect_postgres_exact_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$actual"
      record_row_count_comparison "ory-postgres/$db_name" "$expected" "$actual" "$diff_file"
    done
  fi

  if service_enabled clickhouse "$services"; then
    expected="$report_dir/baron_clickhouse-stable-expected-row-counts.txt"
    actual="$report_dir/baron_clickhouse-stable-row-counts.txt"
    full_counts="$report_dir/baron_clickhouse-row-counts.txt"
    diff_file="$report_dir/baron_clickhouse-row-counts.diff"
    # Counts for every table are kept for reference; only the stable subset
    # is compared against the dump.
    collect_clickhouse_exact_row_counts baron_clickhouse "${CLICKHOUSE_USER:-baron}" "${CLICKHOUSE_PASSWORD:-password}" "$backup_dir/clickhouse/baron_clickhouse/tables.tsv" "$full_counts"
    collect_clickhouse_native_stable_row_counts baron_clickhouse "${CLICKHOUSE_USER:-baron}" "${CLICKHOUSE_PASSWORD:-password}" "$backup_dir/clickhouse/baron_clickhouse" "$expected"
    filter_clickhouse_stable_row_counts "$backup_dir/clickhouse/baron_clickhouse/tables.tsv" "$full_counts" "$actual"
    record_row_count_comparison "clickhouse" "$expected" "$actual" "$diff_file"
  fi

  if service_enabled ory-clickhouse "$services"; then
    expected="$report_dir/ory_clickhouse-stable-expected-row-counts.txt"
    actual="$report_dir/ory_clickhouse-stable-row-counts.txt"
    full_counts="$report_dir/ory_clickhouse-row-counts.txt"
    diff_file="$report_dir/ory_clickhouse-row-counts.diff"
    collect_clickhouse_exact_row_counts ory_clickhouse "${ORY_CLICKHOUSE_USER:-ory}" "${ORY_CLICKHOUSE_PASSWORD:-orypass}" "$backup_dir/clickhouse/ory_clickhouse/tables.tsv" "$full_counts"
    collect_clickhouse_native_stable_row_counts ory_clickhouse "${ORY_CLICKHOUSE_USER:-ory}" "${ORY_CLICKHOUSE_PASSWORD:-orypass}" "$backup_dir/clickhouse/ory_clickhouse" "$expected"
    filter_clickhouse_stable_row_counts "$backup_dir/clickhouse/ory_clickhouse/tables.tsv" "$full_counts" "$actual"
    record_row_count_comparison "ory-clickhouse" "$expected" "$actual" "$diff_file"
  fi

  if service_enabled config "$services"; then
    # Config is verified by presence only; there are no row counts to compare.
    backup_require_path "$repo_root/config-restored"
    item="$(jq -n \
      --arg label "config" \
      --arg actual "$repo_root/config-restored" \
      --arg status "passed" \
      '{label:$label, actual:$actual, status:$status}')"
    report_items+=("$item")
  fi

  # Expanding an empty array is an error under "set -u" on older bash, so the
  # empty case is handled explicitly.
  if [[ "${#report_items[@]}" -gt 0 ]]; then
    target_verification_reports="$(printf '%s\n' "${report_items[@]}" | jq -sc '.')"
  else
    target_verification_reports="[]"
  fi

  if [[ "$verification_failed" == "true" ]]; then
    target_verification_status="failed"
    report_message="target row-count verification failed; see diff files in $report_dir"
    return 1
  fi
  target_verification_status="passed"
}
#######################################
# Derive a report file stem from an archive path: drop the directory and any
# supported archive extension (.tar.zst, .tar.gz, .tgz, .zip).
# Arguments: $1 - archive path
# Outputs:   the stem on stdout
#######################################
restore_archive_stem() {
  local stem

  stem="$(basename "$1")"
  stem="${stem%.tar.zst}"
  stem="${stem%.tar.gz}"
  stem="${stem%.tgz}"
  # .zip archives are accepted as restore input, so strip that suffix as well.
  stem="${stem%.zip}"
  printf '%s\n' "$stem"
}

resolve_backup_input

# Report location: an explicit RESTORE_REPORT wins; archive restores report
# under the repository (the extracted backup directory is temporary);
# directory restores report inside the backup directory itself.
if [[ -n "${RESTORE_REPORT:-}" ]]; then
  report_path="$RESTORE_REPORT"
elif [[ "$backup_source" == "dump_file" ]]; then
  archive_name="$(restore_archive_stem "$dump_file")"
  report_path="$repo_root/reports/restore/${archive_name}-restore-report.json"
else
  report_path="$backup_dir/reports/restore-report.json"
fi
# A restore is destructive, so an explicit confirmation token is mandatory.
[[ "${CONFIRM_RESTORE:-}" == "baron-sso" ]] \
  || backup_die "CONFIRM_RESTORE=baron-sso is required for restore."

services="$(normalize_service_filter "${RESTORE_SERVICES:-all}")"
allow_non_empty="${ALLOW_NON_EMPTY_RESTORE:-false}"

# RESTORE_TEST_NON_EMPTY=1 forces the non-empty-target rejection unless the
# override is set.
if [[ "$allow_non_empty" != "true" && "${RESTORE_TEST_NON_EMPTY:-}" == "1" ]]; then
  backup_die "non-empty restore target is not allowed by default. Set ALLOW_NON_EMPTY_RESTORE=true only for an approved restore rehearsal."
fi

# Dry run: log the plan, write a "planned" report, and stop before touching
# any target.
if [[ "$dry_run" == "true" ]]; then
  for plan_line in \
    "Restore plan for $backup_dir" \
    "Services: $services" \
    "ALLOW_NON_EMPTY_RESTORE=$allow_non_empty" \
    "RESTORE_REPORT=$report_path"; do
    backup_log "$plan_line"
  done
  write_restore_report "planned" "restore dry-run completed"
  exit 0
fi
# Refuse to restore over existing data unless explicitly allowed. Every
# PostgreSQL database that the restore verifies afterwards is checked,
# including all three Ory databases rather than only the Kratos one.
if [[ "$allow_non_empty" != "true" ]]; then
  if service_enabled postgres "$services" \
    && postgres_target_has_data baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}"; then
    backup_die "non-empty restore target is not allowed by default: baron_postgres/${DB_NAME:-baron_sso}"
  fi
  if service_enabled ory-postgres "$services"; then
    for ory_target_db in "${KRATOS_DB:-ory_kratos}" "${HYDRA_DB:-ory_hydra}" "${KETO_DB:-ory_keto}"; do
      if postgres_target_has_data ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$ory_target_db"; then
        backup_die "non-empty restore target is not allowed by default: ory_postgres/$ory_target_db"
      fi
    done
  fi
fi
# Check the backup with verify-dump.sh before any target is modified.
BACKUP="$backup_dir" "$script_dir/verify-dump.sh"
dump_checksum_status="passed"

# Restore steps as "service:function" pairs, run in this fixed order for
# every service selected by the filter.
restore_steps=(
  "postgres:restore_baron_postgres"
  "ory-postgres:restore_ory_postgres"
  "clickhouse:restore_baron_clickhouse"
  "ory-clickhouse:restore_ory_clickhouse"
  "config:restore_config_snapshot"
)
for restore_step in "${restore_steps[@]}"; do
  if service_enabled "${restore_step%%:*}" "$services"; then
    "${restore_step#*:}" "$backup_dir"
  fi
done

# Compare the restored targets with the backup, then write the final report.
verify_restored_targets
write_restore_report "succeeded" "restore completed and target row-count verification passed"
backup_log "Restore complete. Keep WORKS relay disabled until comparison dry-run passes."
backup_log "Restore report: $report_path"