#!/usr/bin/env bash
#
# Restore a backup (directory or archive) into the target containers and
# verify the restored targets by comparing row counts against the dump.
#
# Usage (example taken from the script's own error message):
#   make restore BACKUP=backups/baron-sso-backup-YYYYMMDD-HHMMSSZ CONFIRM_RESTORE=baron-sso
#   <script> --dry-run     # only print the plan and write a "planned" report
#
# Environment read by this script:
#   RESTORE_INPUT / FILE_PATH  backup directory or archive (.tar.zst, .tar.gz, .tgz, .zip)
#   BACKUP                     backup directory
#   DUMP_FILE                  backup archive
#   RESTORE_REPORT             explicit path of the JSON restore report
#   CONFIRM_RESTORE            must be "baron-sso"
#   RESTORE_SERVICES           service filter (default "all")
#   RESTORE_DATASET            dataset profile; defaults to the manifest's .dataset or "full"
#   ALLOW_NON_EMPTY_RESTORE    "true" to skip the non-empty target guard
#   RESTORE_TEST_NON_EMPTY     "1" simulates a non-empty target (guard test hook)
#   DB_*, ORY_POSTGRES_*, KRATOS_DB, HYDRA_DB, KETO_DB, CLICKHOUSE_*, ORY_CLICKHOUSE_*
#                              connection settings, each with an in-script default
#
# Helpers such as backup_die, backup_log, backup_require_path, service_enabled
# and the restore_* functions come from the sourced lib/*.sh files.

set -Eeuo pipefail

script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$script_dir/lib/common.sh"
source "$script_dir/lib/postgres.sh"
source "$script_dir/lib/clickhouse.sh"
source "$script_dir/lib/config.sh"
source "$script_dir/lib/report.sh"
source "$script_dir/lib/personnel_dataset.sh"

dry_run=false
if [[ "${1:-}" == "--dry-run" ]]; then
  dry_run=true
fi

# Global state shared between the functions below and the report writer.
repo_root="$(backup_repo_root)"
restore_input="${RESTORE_INPUT:-${FILE_PATH:-}}"
backup_input="${BACKUP:-}"
dump_file="${DUMP_FILE:-}"
backup_source="directory"
temp_extract_dir=""
report_path=""
report_started_at="$(backup_utc_now)"
report_status="started"
report_message=""
dump_checksum_status="not_run"
target_verification_status="not_run"
target_verification_reports="[]"
dataset="full"

#######################################
# Convert a whitespace-separated word list into a compact JSON array of strings.
# Arguments: $1 - word list (may be empty)
# Outputs:   JSON array on stdout ("[]" when there are no words)
#######################################
json_array_from_words() {
  local words="$1"
  local -a items=()

  # Split on IFS with `read -a` instead of an unquoted expansion so that words
  # containing glob characters (e.g. "*") are never expanded to file names.
  # `read -d ''` returns non-zero at end of input even though it filled the array.
  read -r -d '' -a items <<<"$words" || true

  if [[ "${#items[@]}" -eq 0 ]]; then
    printf '[]\n'
    return
  fi
  printf '%s\n' "${items[@]}" | jq -R . | jq -sc .
}

#######################################
# Write the JSON restore report and then render its markdown companion via
# write_restore_markdown_report. Does nothing while report_path is still empty.
# Globals:   report_path, report_started_at, services, backup_dir, dataset,
#            backup_source, dump_file, allow_non_empty, dry_run,
#            dump_checksum_status, target_verification_status,
#            target_verification_reports (all read)
# Arguments: $1 - status string stored in the report
#            $2 - optional human-readable message
#######################################
write_restore_report() {
  local status="$1"
  local message="${2:-}"
  local finished_at
  local services_json
  local restore_policy_json="{}"
  local personnel_policy_json="{}"

  [[ -n "$report_path" ]] || return 0

  finished_at="$(backup_utc_now)"
  services_json="$(json_array_from_words "${services:-}")"

  if [[ -n "${backup_dir:-}" && -f "$backup_dir/manifest.json" ]]; then
    restore_policy_json="$(jq -c '.restore_policy // {}' "$backup_dir/manifest.json")"
  fi
  if [[ "${dataset:-full}" == "personnel" ]]; then
    personnel_policy_json="$(restore_personnel_plan_policy_json "$backup_dir")"
  fi

  mkdir -p "$(dirname "$report_path")"
  # Booleans travel as strings through --arg and are converted inside the filter.
  jq -n \
    --arg format_version "1" \
    --arg started_at "$report_started_at" \
    --arg finished_at "$finished_at" \
    --arg status "$status" \
    --arg message "$message" \
    --arg backup_source "$backup_source" \
    --arg backup_dir "${backup_dir:-}" \
    --arg dump_file "$dump_file" \
    --argjson services "$services_json" \
    --arg allow_non_empty_restore "${allow_non_empty:-false}" \
    --arg dry_run "$dry_run" \
    --arg dump_checksum "$dump_checksum_status" \
    --arg target_row_counts "$target_verification_status" \
    --argjson target_reports "$target_verification_reports" \
    --argjson restore_policy "$restore_policy_json" \
    --arg dataset "${dataset:-full}" \
    --argjson personnel_policy "$personnel_policy_json" \
    '{
      format_version: $format_version,
      started_at: $started_at,
      finished_at: $finished_at,
      status: $status,
      message: $message,
      backup_source: $backup_source,
      backup_dir: $backup_dir,
      dump_file: (if $dump_file == "" then null else $dump_file end),
      dataset: $dataset,
      services: $services,
      allow_non_empty_restore: ($allow_non_empty_restore == "true"),
      dry_run: ($dry_run == "true"),
      restore_policy: (if $dataset == "personnel" then $personnel_policy else $restore_policy end),
      verification: {
        dump_checksum: $dump_checksum,
        target_row_counts: $target_row_counts,
        target_reports: $target_reports
      }
    }' >"$report_path"

  write_restore_markdown_report "$report_path"
}

# Remove the temporary extraction directory, if one was created.
cleanup_restore_input() {
  if [[ -n "$temp_extract_dir" ]]; then
    rm -rf -- "$temp_extract_dir"
  fi
}

# ERR trap handler: record a "failed" report, clean up, and exit with the
# status of the command that failed.
on_restore_error() {
  local exit_code=$?
  write_restore_report "failed" "${report_message:-restore failed}"
  cleanup_restore_input
  exit "$exit_code"
}

trap on_restore_error ERR
trap cleanup_restore_input EXIT

#######################################
# Resolve exactly one of RESTORE_INPUT / BACKUP / DUMP_FILE into backup_dir.
# Archives are extracted into a fresh temporary directory and must contain
# exactly one manifest.json; its parent directory becomes backup_dir.
# Globals:   restore_input, backup_input (read); dump_file (read/written);
#            backup_dir, backup_source, temp_extract_dir (written)
#######################################
resolve_backup_input() {
  local input_count=0
  local -a manifest_files=()

  [[ -n "$restore_input" ]] && input_count=$((input_count + 1))
  [[ -n "$backup_input" ]] && input_count=$((input_count + 1))
  [[ -n "$dump_file" ]] && input_count=$((input_count + 1))
  if [[ "$input_count" -gt 1 ]]; then
    backup_die "set only one restore input: RESTORE_INPUT, BACKUP, or DUMP_FILE."
  fi

  if [[ -n "$restore_input" ]]; then
    backup_require_path "$restore_input"
    if [[ -d "$restore_input" ]]; then
      backup_dir="$restore_input"
      backup_source="directory"
      return
    fi
    if [[ ! -f "$restore_input" ]]; then
      backup_die "restore input must be a backup directory or supported archive: $restore_input"
    fi
    # A regular file is treated as an archive and handled by the DUMP_FILE path below.
    case "$restore_input" in
      *.tar.zst | *.tar.gz | *.tgz | *.zip)
        dump_file="$restore_input"
        ;;
      *)
        backup_die "unsupported restore input file extension: $restore_input"
        ;;
    esac
  fi

  if [[ -n "$backup_input" ]]; then
    backup_dir="$backup_input"
    backup_require_path "$backup_dir"
    return
  fi

  if [[ -z "$dump_file" ]]; then
    backup_die "BACKUP or DUMP_FILE is required. Example: make restore BACKUP=backups/baron-sso-backup-YYYYMMDD-HHMMSSZ CONFIRM_RESTORE=baron-sso"
  fi

  backup_require_path "$dump_file"
  backup_require_command tar
  temp_extract_dir="$(mktemp -d /tmp/baron-sso-restore.XXXXXX)"
  backup_source="dump_file"

  case "$dump_file" in
    *.tar.zst)
      backup_require_command zstd
      tar --zstd --no-same-owner -xf "$dump_file" -C "$temp_extract_dir"
      ;;
    *.tar.gz | *.tgz)
      # --no-same-owner keeps this branch consistent with the .tar.zst branch.
      tar --no-same-owner -xzf "$dump_file" -C "$temp_extract_dir"
      ;;
    *.zip)
      backup_require_command unzip
      unzip -q "$dump_file" -d "$temp_extract_dir"
      ;;
    *)
      backup_die "unsupported DUMP_FILE archive format: $dump_file"
      ;;
  esac

  mapfile -t manifest_files < <(find "$temp_extract_dir" -type f -name manifest.json | sort)
  if [[ "${#manifest_files[@]}" -ne 1 ]]; then
    backup_die "DUMP_FILE must contain exactly one backup directory with manifest.json."
  fi
  backup_dir="$(dirname "${manifest_files[0]}")"
}

# Print $1 as a double-quoted PostgreSQL identifier, doubling embedded quotes.
quote_pg_ident() {
  local raw="$1"
  printf '"%s"' "${raw//\"/\"\"}"
}

#######################################
# Write sorted "schema.table:count" lines for every table listed in pg_tables
# outside pg_catalog and information_schema, using an exact count(*) per table.
# Arguments: $1 container, $2 user, $3 password, $4 database, $5 output file
#######################################
collect_postgres_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local output_file="$5"
  local schema
  local table
  local quoted_schema
  local quoted_table
  local count

  : >"$output_file"
  docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d "$database" -At -F $'\t' \
    -c "select schemaname, tablename from pg_tables where schemaname not in ('pg_catalog','information_schema') order by 1,2" \
    | while IFS=$'\t' read -r schema table; do
      [[ -n "$schema" && -n "$table" ]] || continue
      quoted_schema="$(quote_pg_ident "$schema")"
      quoted_table="$(quote_pg_ident "$table")"
      count="$(docker exec -e "PGPASSWORD=$password" "$container" \
        psql -U "$user" -d "$database" -At \
        -c "select count(*) from ${quoted_schema}.${quoted_table}")"
      printf '%s.%s:%s\n' "$schema" "$table" "$count"
    done | sort >"$output_file"
}

#######################################
# Restore a dump into a throw-away scratch database, collect its exact row
# counts into the output file, and drop the scratch database again.
# Arguments: $1 container, $2 user, $3 password, $4 database (used as the
#            scratch name prefix), $5 dump path, $6 output file
#######################################
collect_postgres_dump_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local database="$4"
  local dump_path="$5"
  local output_file="$6"
  local scratch_db
  local scratch_ident

  backup_require_path "$dump_path"
  # Timestamp plus PID keeps the scratch name unique per run.
  scratch_db="${database}_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  scratch_ident="$(quote_pg_ident "$scratch_db")"

  docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)" \
    -c "create database ${scratch_ident}"
  # NOTE(review): if pg_restore or the count collection fails, errexit appears
  # to abort before the final drop, leaving the scratch database behind.
  docker exec -i -e "PGPASSWORD=$password" "$container" \
    pg_restore -U "$user" -d "$scratch_db" --clean --if-exists <"$dump_path"
  collect_postgres_exact_row_counts "$container" "$user" "$password" "$scratch_db" "$output_file"
  docker exec -e "PGPASSWORD=$password" "$container" \
    psql -U "$user" -d postgres -v ON_ERROR_STOP=1 \
    -c "drop database if exists ${scratch_ident} with (force)"
}

#######################################
# Write sorted "database.table:count" lines for every table in a
# tab-separated table list (columns: database, table, engine).
# Arguments: $1 container, $2 user, $3 password, $4 table list, $5 output file
#######################################
collect_clickhouse_exact_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local table_list="$4"
  local output_file="$5"
  local database
  local table
  local engine
  local count

  : >"$output_file"
  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    count="$(docker exec "$container" clickhouse-client --user "$user" --password "$password" \
      --query "select count() from \`${database}\`.\`${table}\`")"
    printf '%s.%s:%s\n' "$database" "$table" "$count"
  done <"$table_list" | sort >"$output_file"
}

#######################################
# Load each Native-format table file of a ClickHouse backup into a scratch
# database and record "database.table:count" lines for it. Tables whose engine
# matches *View* or *AggregatingMergeTree* are skipped.
# Arguments: $1 container, $2 user, $3 password,
#            $4 backup input dir (contains tables.tsv and data/*.native),
#            $5 output file
#######################################
collect_clickhouse_native_stable_row_counts() {
  local container="$1"
  local user="$2"
  local password="$3"
  local input_dir="$4"
  local output_file="$5"
  local scratch_db
  local database
  local table
  local engine
  local safe_name

  scratch_db="$(basename "$input_dir")_restore_verify_$(date -u '+%Y%m%d%H%M%S')_$$"
  : >"$output_file"

  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "drop database if exists \`${scratch_db}\`"
  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "create database \`${scratch_db}\`"

  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    safe_name="${database}__${table}"
    backup_require_path "$input_dir/data/${safe_name}.native"
    # The scratch table copies its structure from the live table of the same name.
    docker exec "$container" clickhouse-client --user "$user" --password "$password" \
      --query "create table \`${scratch_db}\`.\`${table}\` as \`${database}\`.\`${table}\`"
    docker exec -i "$container" clickhouse-client --user "$user" --password "$password" \
      --query "insert into \`${scratch_db}\`.\`${table}\` format Native" <"$input_dir/data/${safe_name}.native"
    docker exec "$container" clickhouse-client --user "$user" --password "$password" \
      --query "select '${database}.${table}:' || toString(count()) from \`${scratch_db}\`.\`${table}\`" \
      >>"$output_file"
  done <"$input_dir/tables.tsv"

  docker exec "$container" clickhouse-client --user "$user" --password "$password" \
    --query "drop database if exists \`${scratch_db}\`"
  sort -o "$output_file" "$output_file"
}

#######################################
# Copy from a counts file only the "database.table:count" lines that belong to
# tables in the table list whose engine is neither *View* nor
# *AggregatingMergeTree*, then sort the result in place.
# Arguments: $1 table list (TSV: database, table, engine),
#            $2 counts file, $3 output file
#######################################
filter_clickhouse_stable_row_counts() {
  local table_list="$1"
  local counts_file="$2"
  local output_file="$3"
  local database
  local table
  local engine

  : >"$output_file"
  while IFS=$'\t' read -r database table engine; do
    [[ -n "$database" && -n "$table" ]] || continue
    if [[ "$engine" == *View* || "$engine" == *AggregatingMergeTree* ]]; then
      continue
    fi
    # Match the label as a line prefix rather than a substring, so "a.b:" does
    # not also select "ba.b:..." lines. The prefix is passed through the
    # environment so awk treats it as literal text. A missing or unreadable
    # counts file stays best-effort, as before.
    ROW_PREFIX="${database}.${table}:" \
      awk 'index($0, ENVIRON["ROW_PREFIX"]) == 1' "$counts_file" >>"$output_file" || true
  done <"$table_list"
  sort -o "$output_file" "$output_file"
}

#######################################
# Diff two row-count files (both sorted first) and describe the outcome.
# Arguments: $1 label, $2 expected file, $3 actual file, $4 diff output file
# Outputs:   JSON object on stdout with status "passed" or "failed"
# Returns:   0 when the files match, 1 otherwise
#######################################
compare_row_count_report() {
  local label="$1"
  local expected_file="$2"
  local actual_file="$3"
  local diff_file="$4"

  backup_require_path "$expected_file"
  if diff -u <(sort "$expected_file") <(sort "$actual_file") >"$diff_file"; then
    jq -n \
      --arg label "$label" \
      --arg expected "$expected_file" \
      --arg actual "$actual_file" \
      --arg status "passed" \
      '{label:$label, expected:$expected, actual:$actual, status:$status}'
  else
    jq -n \
      --arg label "$label" \
      --arg expected "$expected_file" \
      --arg actual "$actual_file" \
      --arg diff "$diff_file" \
      --arg status "failed" \
      '{label:$label, expected:$expected, actual:$actual, diff:$diff, status:$status}'
    return 1
  fi
}

#######################################
# Compare row counts of every restored service against counts derived from the
# backup, recording one report item per target.
# All enabled targets are checked even after a mismatch, and the collected
# items are stored before failing, so the "failed" restore report written by
# the ERR trap still lists which comparison failed and where its diff is.
# Globals:   report_path, services, backup_dir, repo_root (read);
#            target_verification_reports, target_verification_status,
#            report_message (written)
# Returns:   0 when every comparison passed, 1 otherwise
#######################################
verify_restored_targets() {
  local report_dir
  local report_items=()
  local item
  local expected
  local actual
  local diff_file
  local db_name
  local verification_failed=false

  report_dir="$(dirname "$report_path")/restore-targets-$(date -u '+%Y%m%d-%H%M%SZ')"
  mkdir -p "$report_dir"

  if service_enabled postgres "$services"; then
    expected="$report_dir/baron-postgres-expected-row-counts.txt"
    actual="$report_dir/baron-postgres-row-counts.txt"
    diff_file="$report_dir/baron-postgres-row-counts.diff"
    collect_postgres_dump_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$backup_dir/postgres/baron.dump" "$expected"
    collect_postgres_exact_row_counts baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}" "$actual"
    item="$(compare_row_count_report "postgres" "$expected" "$actual" "$diff_file")" || verification_failed=true
    [[ -z "$item" ]] || report_items+=("$item")
  fi

  if service_enabled ory-postgres "$services"; then
    for db_name in "${KRATOS_DB:-ory_kratos}" "${HYDRA_DB:-ory_hydra}" "${KETO_DB:-ory_keto}"; do
      expected="$report_dir/${db_name}-expected-row-counts.txt"
      actual="$report_dir/${db_name}-row-counts.txt"
      diff_file="$report_dir/${db_name}-row-counts.diff"
      collect_postgres_dump_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$backup_dir/postgres/${db_name}.dump" "$expected"
      collect_postgres_exact_row_counts ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "$db_name" "$actual"
      item="$(compare_row_count_report "ory-postgres/$db_name" "$expected" "$actual" "$diff_file")" || verification_failed=true
      [[ -z "$item" ]] || report_items+=("$item")
    done
  fi

  if service_enabled clickhouse "$services"; then
    expected="$report_dir/baron_clickhouse-stable-expected-row-counts.txt"
    actual="$report_dir/baron_clickhouse-stable-row-counts.txt"
    diff_file="$report_dir/baron_clickhouse-row-counts.diff"
    # Full counts are kept under the unfiltered name; the "stable" file is the
    # filtered subset that is actually compared.
    collect_clickhouse_exact_row_counts baron_clickhouse "${CLICKHOUSE_USER:-baron}" "${CLICKHOUSE_PASSWORD:-password}" "$backup_dir/clickhouse/baron_clickhouse/tables.tsv" "$actual"
    mv "$actual" "$report_dir/baron_clickhouse-row-counts.txt"
    collect_clickhouse_native_stable_row_counts baron_clickhouse "${CLICKHOUSE_USER:-baron}" "${CLICKHOUSE_PASSWORD:-password}" "$backup_dir/clickhouse/baron_clickhouse" "$expected"
    filter_clickhouse_stable_row_counts "$backup_dir/clickhouse/baron_clickhouse/tables.tsv" "$report_dir/baron_clickhouse-row-counts.txt" "$actual"
    item="$(compare_row_count_report "clickhouse" "$expected" "$actual" "$diff_file")" || verification_failed=true
    [[ -z "$item" ]] || report_items+=("$item")
  fi

  if service_enabled ory-clickhouse "$services"; then
    expected="$report_dir/ory_clickhouse-stable-expected-row-counts.txt"
    actual="$report_dir/ory_clickhouse-stable-row-counts.txt"
    diff_file="$report_dir/ory_clickhouse-row-counts.diff"
    collect_clickhouse_exact_row_counts ory_clickhouse "${ORY_CLICKHOUSE_USER:-ory}" "${ORY_CLICKHOUSE_PASSWORD:-orypass}" "$backup_dir/clickhouse/ory_clickhouse/tables.tsv" "$actual"
    mv "$actual" "$report_dir/ory_clickhouse-row-counts.txt"
    collect_clickhouse_native_stable_row_counts ory_clickhouse "${ORY_CLICKHOUSE_USER:-ory}" "${ORY_CLICKHOUSE_PASSWORD:-orypass}" "$backup_dir/clickhouse/ory_clickhouse" "$expected"
    filter_clickhouse_stable_row_counts "$backup_dir/clickhouse/ory_clickhouse/tables.tsv" "$report_dir/ory_clickhouse-row-counts.txt" "$actual"
    item="$(compare_row_count_report "ory-clickhouse" "$expected" "$actual" "$diff_file")" || verification_failed=true
    [[ -z "$item" ]] || report_items+=("$item")
  fi

  if service_enabled config "$services"; then
    backup_require_path "$repo_root/config-restored"
    item="$(jq -n \
      --arg label "config" \
      --arg actual "$repo_root/config-restored" \
      --arg status "passed" \
      '{label:$label, actual:$actual, status:$status}')"
    report_items+=("$item")
  fi

  # Expanding an empty array trips `set -u` on older Bash releases, so handle
  # the no-items case explicitly.
  if [[ "${#report_items[@]}" -eq 0 ]]; then
    target_verification_reports="[]"
  else
    target_verification_reports="$(printf '%s\n' "${report_items[@]}" | jq -s '.')"
  fi

  if [[ "$verification_failed" == "true" ]]; then
    target_verification_status="failed"
    report_message="target row-count verification failed; see diff files under $report_dir"
    return 1
  fi
  target_verification_status="passed"
}

# ---------------------------------------------------------------------------
# Main flow
# ---------------------------------------------------------------------------

resolve_backup_input

# Pick the report location: explicit override, next to the repo for archives,
# or inside the backup directory itself.
if [[ -n "${RESTORE_REPORT:-}" ]]; then
  report_path="$RESTORE_REPORT"
elif [[ "$backup_source" == "dump_file" ]]; then
  archive_name="$(basename "$dump_file")"
  archive_name="${archive_name%.tar.zst}"
  archive_name="${archive_name%.tar.gz}"
  archive_name="${archive_name%.tgz}"
  # .zip is an accepted archive format too, so strip it like the others.
  archive_name="${archive_name%.zip}"
  report_path="$repo_root/reports/restore/${archive_name}-restore-report.json"
else
  report_path="$backup_dir/reports/restore-report.json"
fi

if [[ "${CONFIRM_RESTORE:-}" != "baron-sso" ]]; then
  backup_die "CONFIRM_RESTORE=baron-sso is required for restore."
fi

services="$(normalize_service_filter "${RESTORE_SERVICES:-all}")"
if [[ -n "${RESTORE_DATASET:-}" ]]; then
  dataset="$(normalize_dataset_profile "$RESTORE_DATASET")"
elif [[ -f "$backup_dir/manifest.json" ]]; then
  dataset="$(normalize_dataset_profile "$(jq -r '.dataset // "full"' "$backup_dir/manifest.json")")"
else
  dataset="full"
fi
allow_non_empty="${ALLOW_NON_EMPTY_RESTORE:-false}"

if [[ "${RESTORE_TEST_NON_EMPTY:-}" == "1" && "$allow_non_empty" != "true" ]]; then
  backup_die "non-empty restore target is not allowed by default. Set ALLOW_NON_EMPTY_RESTORE=true only for an approved restore rehearsal."
fi

if [[ "$dry_run" == "true" ]]; then
  backup_log "Restore plan for $backup_dir"
  backup_log "Dataset: $dataset"
  backup_log "Services: $services"
  backup_log "ALLOW_NON_EMPTY_RESTORE=$allow_non_empty"
  backup_log "RESTORE_REPORT=$report_path"
  write_restore_report "planned" "restore dry-run completed"
  exit 0
fi

# Refuse to restore over existing data unless explicitly allowed.
if [[ "$allow_non_empty" != "true" ]]; then
  if service_enabled postgres "$services" && postgres_target_has_data baron_postgres "${DB_USER:-baron}" "${DB_PASSWORD:-password}" "${DB_NAME:-baron_sso}"; then
    backup_die "non-empty restore target is not allowed by default: baron_postgres/${DB_NAME:-baron_sso}"
  fi
  if service_enabled ory-postgres "$services" && postgres_target_has_data ory_postgres "${ORY_POSTGRES_USER:-ory}" "${ORY_POSTGRES_PASSWORD:-secret}" "${KRATOS_DB:-ory_kratos}"; then
    backup_die "non-empty restore target is not allowed by default: ory_postgres/${KRATOS_DB:-ory_kratos}"
  fi
fi

# verify-dump.sh runs before anything is restored; the checksum status is
# marked passed only once it has succeeded.
BACKUP="$backup_dir" "$script_dir/verify-dump.sh"
dump_checksum_status="passed"

if [[ "$dataset" == "personnel" ]]; then
  restore_personnel_dataset "$backup_dir" "$services"
else
  if service_enabled postgres "$services"; then
    restore_baron_postgres "$backup_dir"
  fi
  if service_enabled ory-postgres "$services"; then
    restore_ory_postgres "$backup_dir"
  fi
  if service_enabled clickhouse "$services"; then
    restore_baron_clickhouse "$backup_dir"
  fi
  if service_enabled ory-clickhouse "$services"; then
    restore_ory_clickhouse "$backup_dir"
  fi
  if service_enabled config "$services"; then
    restore_config_snapshot "$backup_dir"
  fi
  verify_restored_targets
fi

write_restore_report "succeeded" "restore completed and target row-count verification passed"
backup_log "Restore complete. Keep WORKS relay disabled until comparison dry-run passes."
backup_log "Restore report: $report_path"