From d0f19e855f3d1c971e51b238490b8bd3958267fa Mon Sep 17 00:00:00 2001
From: Jacob Cody Wimer
Date: Tue, 30 Sep 2025 16:31:43 -0400
Subject: [PATCH] Added a mariadb replica watcher to fix replication issues

---
 app/models/wrestler.rb                        |   6 +-
 .../manifests/mariadb-replica-watcher.yaml    | 184 ++++++++++++++++++
 .../manifests/mariadb-standalone.yaml         |  37 +++-
 deploy/kubernetes/secrets/secrets.yaml        |   5 +
 4 files changed, 230 insertions(+), 2 deletions(-)
 create mode 100644 deploy/kubernetes/manifests/mariadb-replica-watcher.yaml

diff --git a/app/models/wrestler.rb b/app/models/wrestler.rb
index 03b7213..81bf819 100644
--- a/app/models/wrestler.rb
+++ b/app/models/wrestler.rb
@@ -2,9 +2,13 @@ class Wrestler < ApplicationRecord
   belongs_to :school, touch: true
   belongs_to :weight, touch: true
   has_one :tournament, through: :weight
+  has_many :deductedPoints, class_name: "Teampointadjust", dependent: :destroy
+  ## Matches association
+  # Rails associations expect only a single column so we cannot do a w1 OR w2
+  # So we have to create two associations and combine them with the all_matches method
   has_many :matches_as_w1, ->(wrestler){ where(weight_id: wrestler.weight_id) }, class_name: 'Match', foreign_key: 'w1'
   has_many :matches_as_w2, ->(wrestler){ where(weight_id: wrestler.weight_id) }, class_name: 'Match', foreign_key: 'w2'
-  has_many :deductedPoints, class_name: "Teampointadjust", dependent: :destroy
+  ##
   attr_accessor :poolAdvancePoints, :originalId, :swapId
 
   validates :name, :weight_id, :school_id, presence: true
diff --git a/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml b/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml
new file mode 100644
index 0000000..cee954b
--- /dev/null
+++ b/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml
@@ -0,0 +1,184 @@
+# Watches the MariaDB replica and repairs broken replication.
+# After an initial 120s wait, the container checks SHOW SLAVE STATUS on the
+# replica every 120s. If the status is empty, a slave thread is not running, or
+# an error is reported, it reloads DB_NAME on the replica from a fresh dump of
+# the master and re-points the replica at the master.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: mariadb-replica-watcher
+  labels:
+    app: wrestlingdev
+    component: mariadb-watcher
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: wrestlingdev
+      component: mariadb-watcher
+  template:
+    metadata:
+      labels:
+        app: wrestlingdev
+        component: mariadb-watcher
+    spec:
+      containers:
+      - name: replica-watcher
+        image: mariadb:10.3
+        env:
+        - name: MARIADB_ROOT_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: dbpassword
+        - name: MYSQL_REPLICATION_USER
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_user
+        - name: MYSQL_REPLICATION_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_password
+        - name: MASTER_SERVICE_HOST
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_host
+        - name: REPLICA_SERVICE_HOST
+          value: "wrestlingdev-mariadb"
+        - name: DB_NAME
+          value: "wrestlingdev"
+        command:
+        - bash
+        - -c
+        - |
+          set -euo pipefail
+          LOG=/var/log/replica-watcher.log
+          echo "replica-watcher starting: $(date -u)" >>"$LOG"
+
+          # Strip leading and trailing whitespace from stdin.
+          trim() { sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'; }
+          # Print the value of field $1 from "SHOW SLAVE STATUS\G" output on stdin.
+          # A missing field prints nothing and still succeeds, so it cannot abort
+          # the watcher under `set -e -o pipefail`. Whitespace is trimmed with sed
+          # instead of xargs because xargs fails on an unmatched quote, such as
+          # the apostrophe in the server message "Can't find record in ...".
+          get_val() {
+            { grep -m1 -E "^[[:space:]]*$1[[:space:]]*:" || true; } \
+              | sed -E "s/^[[:space:]]*$1[[:space:]]*:[[:space:]]*(.*)$/\1/" \
+              | tr -d '\r' \
+              | trim
+          }
+
+          # initial wait
+          sleep 120
+          while true; do
+            echo "$(date -u) Checking SHOW SLAVE STATUS" | tee -a "$LOG"
+            SLAVE_RAW=$(mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SHOW SLAVE STATUS\\G" 2>>"$LOG" || true)
+
+            NEED=0
+            if [ -z "$SLAVE_RAW" ]; then
+              echo "SHOW SLAVE STATUS is empty (replication not configured / not running) -> will rebootstrap" | tee -a "$LOG"
+              NEED=1
+            else
+              SLAVE_IO=$(echo "$SLAVE_RAW" | get_val "Slave_IO_Running")
+              SLAVE_SQL=$(echo "$SLAVE_RAW" | get_val "Slave_SQL_Running")
+              LAST_IO_ERRNO=$(echo "$SLAVE_RAW" | get_val "Last_IO_Errno")
+              LAST_SQL_ERRNO=$(echo "$SLAVE_RAW" | get_val "Last_SQL_Errno")
+              LAST_IO_ERR=$(echo "$SLAVE_RAW" | get_val "Last_IO_Error")
+              LAST_SQL_ERR=$(echo "$SLAVE_RAW" | get_val "Last_SQL_Error")
+
+              echo "Slave IO='${SLAVE_IO:-}' Slave SQL='${SLAVE_SQL:-}'" | tee -a "$LOG"
+              echo "Last_IO_Errno='${LAST_IO_ERRNO:-}' Last_SQL_Errno='${LAST_SQL_ERRNO:-}'" | tee -a "$LOG"
+              echo "Last_IO_Error='${LAST_IO_ERR:-}' Last_SQL_Error='${LAST_SQL_ERR:-}'" | tee -a "$LOG"
+
+              if [ "${SLAVE_IO:-}" = "Yes" ] && [ "${SLAVE_SQL:-}" = "Yes" ] \
+                && { [ -z "${LAST_IO_ERR:-}" ] || [ "${LAST_IO_ERR,,}" = "no error" ]; } \
+                && { [ -z "${LAST_SQL_ERR:-}" ] || [ "${LAST_SQL_ERR,,}" = "no error" ]; } \
+                && { [ -z "${LAST_IO_ERRNO:-}" ] || [ "${LAST_IO_ERRNO:-0}" = "0" ]; } \
+                && { [ -z "${LAST_SQL_ERRNO:-}" ] || [ "${LAST_SQL_ERRNO:-0}" = "0" ]; }; then
+                echo "Both slave threads running and no replication errors -> no action" | tee -a "$LOG"
+              else
+                NOT_RUNNING=0
+                [ "${SLAVE_IO:-No}" != "Yes" ] && NOT_RUNNING=1
+                [ "${SLAVE_SQL:-No}" != "Yes" ] && NOT_RUNNING=1
+                HAS_ERROR=0
+                [ -n "${LAST_IO_ERRNO:-}" ] && [ "${LAST_IO_ERRNO:-0}" != "0" ] && HAS_ERROR=1
+                [ -n "${LAST_SQL_ERRNO:-}" ] && [ "${LAST_SQL_ERRNO:-0}" != "0" ] && HAS_ERROR=1
+                ERR_TEXT="$(printf '%s %s' "${LAST_IO_ERR:-}" "${LAST_SQL_ERR:-}" | tr '[:upper:]' '[:lower:]' | trim)"
+                [ -n "$ERR_TEXT" ] && [ "$ERR_TEXT" != "no error" ] && HAS_ERROR=1
+
+                echo "Decision: NOT_RUNNING=$NOT_RUNNING HAS_ERROR=$HAS_ERROR" | tee -a "$LOG"
+                [ $NOT_RUNNING -eq 1 ] || [ $HAS_ERROR -eq 1 ] && NEED=1 || echo "Threads healthy -> no action" | tee -a "$LOG"
+              fi
+            fi
+
+            if [ $NEED -eq 1 ]; then
+              echo "$(date -u) Starting rebootstrap flow" | tee -a "$LOG"
+
+              # Pre-flight check that the master answers SHOW MASTER STATUS; the
+              # coordinates read here are only a fallback (see after the dump).
+              MASTER_STATUS=$(mysql --protocol=TCP -h "$MASTER_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -sse "SHOW MASTER STATUS;" 2>>"$LOG" || true)
+              MASTER_LOG_FILE=$(echo "$MASTER_STATUS" | awk '{print $1}' | trim || true)
+              MASTER_LOG_POS=$(echo "$MASTER_STATUS" | awk '{print $2}' | trim || true)
+              if [ -z "$MASTER_LOG_FILE" ] || [ -z "$MASTER_LOG_POS" ]; then
+                echo "Failed to get master position from $MASTER_SERVICE_HOST" | tee -a "$LOG"
+                sleep 120; continue
+              fi
+              echo "Master position: ${MASTER_LOG_FILE}:${MASTER_LOG_POS}" | tee -a "$LOG"
+
+              echo "Stopping slave on replica host" | tee -a "$LOG"
+              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "STOP SLAVE;" >>"$LOG" 2>&1 || true
+
+              DUMP_FILE="/tmp/${DB_NAME}_backup.sql"
+              echo "Dumping ${DB_NAME} from master ${MASTER_SERVICE_HOST}" | tee -a "$LOG"
+              # --single-transaction with --master-data=2 dumps one consistent
+              # snapshot and writes that snapshot's binlog coordinates into the
+              # dump as a comment. mysqldump's stderr goes to the log.
+              DUMP_CMD=(mysqldump --protocol=TCP -h "$MASTER_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" --single-transaction --master-data=2 "$DB_NAME")
+              if command -v timeout >/dev/null 2>&1; then
+                DUMP_CMD=(timeout 300 "${DUMP_CMD[@]}")
+              fi
+              if ! "${DUMP_CMD[@]}" >"$DUMP_FILE" 2>>"$LOG"; then
+                echo "Dump FAILED; aborting this cycle" | tee -a "$LOG"; sleep 120; continue
+              fi
+
+              ls -lh "$DUMP_FILE" | tee -a "$LOG"
+
+              # Prefer the coordinates recorded inside the dump: the position read
+              # before the dump may be older than the snapshot, and replicating
+              # from it would re-apply events the dump already contains.
+              COORD_LINE=$(grep -a -m1 -E '^-- CHANGE MASTER TO ' "$DUMP_FILE" || true)
+              DUMP_LOG_FILE=$(printf '%s\n' "$COORD_LINE" | sed -n -E "s/.*MASTER_LOG_FILE='([^']+)'.*/\1/p")
+              DUMP_LOG_POS=$(printf '%s\n' "$COORD_LINE" | sed -n -E "s/.*MASTER_LOG_POS=([0-9]+).*/\1/p")
+              if [ -n "$DUMP_LOG_FILE" ] && [ -n "$DUMP_LOG_POS" ]; then
+                MASTER_LOG_FILE=$DUMP_LOG_FILE
+                MASTER_LOG_POS=$DUMP_LOG_POS
+                echo "Using dump snapshot position: ${MASTER_LOG_FILE}:${MASTER_LOG_POS}" | tee -a "$LOG"
+              fi
+
+              echo "Ensuring database '$DB_NAME' exists on replica" | tee -a "$LOG"
+              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" \
+                -e "CREATE DATABASE IF NOT EXISTS \`$DB_NAME\`;" >>"$LOG" 2>&1
+
+              echo "Importing dump into replica host" | tee -a "$LOG"
+              if ! mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" "$DB_NAME" <"$DUMP_FILE" >>"$LOG" 2>&1; then
+                echo "Import FAILED; aborting this cycle (replication will not be reconfigured)" | tee -a "$LOG"
+                sleep 120; continue
+              fi
+              echo "Import completed successfully" | tee -a "$LOG"
+
+              echo "Reconfiguring replication to ${MASTER_SERVICE_HOST}:${MASTER_LOG_FILE}:${MASTER_LOG_POS}" | tee -a "$LOG"
+              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "RESET SLAVE ALL;" >>"$LOG" 2>&1 || true
+              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "CHANGE MASTER TO MASTER_HOST='${MASTER_SERVICE_HOST}', MASTER_USER='${MYSQL_REPLICATION_USER}', MASTER_PASSWORD='${MYSQL_REPLICATION_PASSWORD}', MASTER_LOG_FILE='${MASTER_LOG_FILE}', MASTER_LOG_POS=${MASTER_LOG_POS}; START SLAVE;" >>"$LOG" 2>&1 || true
+
+              echo "SHOW SLAVE STATUS after rebootstrap:" | tee -a "$LOG"
+              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SHOW SLAVE STATUS\\G" >>"$LOG" 2>&1 || true
+            fi
+
+            echo "Sleeping 120s before next check" | tee -a "$LOG"
+            sleep 120
+          done
+      restartPolicy: Always
diff --git a/deploy/kubernetes/manifests/mariadb-standalone.yaml b/deploy/kubernetes/manifests/mariadb-standalone.yaml
index 6592da4..afce404 100644
--- a/deploy/kubernetes/manifests/mariadb-standalone.yaml
+++ b/deploy/kubernetes/manifests/mariadb-standalone.yaml
@@ -56,6 +56,16 @@ spec:
             secretKeyRef:
               name: wrestlingdev-secrets
               key: dbpassword
+        - name: REPLICATION_USER
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_user
+        - name: REPLICATION_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_password
         ports:
         - containerPort: 3306
           name: mariadb
@@ -64,12 +74,36 @@ spec:
           mountPath: /var/lib/mysql
         - name: mysettings-config-volume
           mountPath: /etc/mysql/mariadb.conf.d
+        # lifecycle: create replication user with proper privileges if it doesn't exist
+        lifecycle:
+          postStart:
+            exec:
+              command:
+              - sh
+              - -c
+              - |
+                # Wait up to 60s for mysqld to be available
+                for i in $(seq 1 60); do
+                  if mysqladmin ping -uroot -p"$MARIADB_ROOT_PASSWORD" --silent; then
+                    echo "mysqld is up"
+                    break
+                  fi
+                  sleep 1
+                done
+
+                echo "Ensuring replication user ${REPLICATION_USER} exists and has REPLICATION SLAVE privileges"
+
+                # Create the replication user if it doesn't exist and grant replication privileges.
+                # Use CREATE USER IF NOT EXISTS so the command is idempotent.
+                mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "CREATE USER IF NOT EXISTS '${REPLICATION_USER}'@'%' IDENTIFIED BY '${REPLICATION_PASSWORD}';" 2>/dev/null || true
+                mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "GRANT REPLICATION SLAVE ON *.* TO '${REPLICATION_USER}'@'%';" 2>/dev/null || true
+                mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "FLUSH PRIVILEGES;" 2>/dev/null || true
+                echo "Replication user ready (errors ignored to avoid blocking startup)"
         # resources:
         #   limits:
         #     memory: "512Mi"
         #   requests:
         #     memory: "256Mi"
-        #     cpu: "0.2"
       - image: jcwimer/mariadb-rclone-backup-docker:10.3
         name: mariadb-backup
         env:
@@ -204,5 +238,6 @@ data:
     performance_schema=ON
     innodb_log_file_size=32M
     table_open_cache=4000
+    expire_logs_days=7
 
     # /etc/mysql/mariadb.conf.d/70-mysettings.cnf
diff --git a/deploy/kubernetes/secrets/secrets.yaml b/deploy/kubernetes/secrets/secrets.yaml
index 37c3b31..08ab9d6 100644
--- a/deploy/kubernetes/secrets/secrets.yaml
+++ b/deploy/kubernetes/secrets/secrets.yaml
@@ -14,6 +14,11 @@ stringData:
   gmailpassword: PUT_EMAIL_PASSWORD_HERE # gmail password
   gmailemail: PUT EMAIL ADDRESS HERE
   passenger_pool_size: "2"
+  # Replication credentials (create a dedicated user for replication)
+  replication_user: replica_user_here
+  replication_password: PUT_REPLICATION_PASSWORD_HERE
+  # Replication host used by the replica to connect to the master
+  replication_host: wrestlingdev-mariadb
   # OPTIONAL
   # DELETE THESE LINES IF YOU'RE NOT USING THEM
   influxdb_database: PUT INFLUXDB DATABASE NAME HERE