From c1b01f0daca7866c15b9c0f98f9610e3edc8037b Mon Sep 17 00:00:00 2001
From: Jacob Cody Wimer
Date: Mon, 27 Apr 2026 18:54:46 -0400
Subject: [PATCH] Made mariadb's statefulsets, simplified the replica logic by
 using GTID.

---
 .../manifests/mariadb-replica-watcher.yaml    | 160 ------------------
 .../kubernetes/manifests/mariadb-replica.yaml | 144 ++++++++++++++--
 .../manifests/mariadb-standalone.yaml         |  40 +++--
 3 files changed, 155 insertions(+), 189 deletions(-)
 delete mode 100644 deploy/kubernetes/manifests/mariadb-replica-watcher.yaml

diff --git a/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml b/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml
deleted file mode 100644
index 5b96d01..0000000
--- a/deploy/kubernetes/manifests/mariadb-replica-watcher.yaml
+++ /dev/null
@@ -1,160 +0,0 @@
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: mariadb-replica-watcher
-  labels:
-    app: wrestlingdev
-    component: mariadb-watcher
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: wrestlingdev
-      component: mariadb-watcher
-  template:
-    metadata:
-      labels:
-        app: wrestlingdev
-        component: mariadb-watcher
-    spec:
-      containers:
-      - name: replica-watcher
-        image: mariadb:10.3
-        env:
-        - name: MARIADB_ROOT_PASSWORD
-          valueFrom:
-            secretKeyRef:
-              name: wrestlingdev-secrets
-              key: dbpassword
-        - name: MYSQL_REPLICATION_USER
-          valueFrom:
-            secretKeyRef:
-              name: wrestlingdev-secrets
-              key: replication_user
-        - name: MYSQL_REPLICATION_PASSWORD
-          valueFrom:
-            secretKeyRef:
-              name: wrestlingdev-secrets
-              key: replication_password
-        - name: MASTER_SERVICE_HOST
-          valueFrom:
-            secretKeyRef:
-              name: wrestlingdev-secrets
-              key: replication_host
-        - name: REPLICA_SERVICE_HOST
-          value: "wrestlingdev-mariadb"
-        - name: DB_NAME
-          value: "wrestlingdev"
-        command:
-        - bash
-        - -c
-        - |
-          set -euo pipefail
-          LOG=/var/log/replica-watcher.log
-          echo "replica-watcher starting: $(date -u)" >>"$LOG"
-
-          trim() { sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'; }
-          get_val() {
-            grep -m1 -E "^[[:space:]]*$1[[:space:]]*:" \
-              | sed -E "s/^[[:space:]]*$1[[:space:]]*:[[:space:]]*(.*)$/\1/" \
-              | tr -d '\r' \
-              | xargs
-          }
-
-          # initial wait
-          sleep 120
-          while true; do
-            echo "$(date -u) Checking SHOW SLAVE STATUS" | tee -a "$LOG"
-            SLAVE_RAW=$(mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SHOW SLAVE STATUS\\G" 2>>"$LOG" || true)
-
-            NEED=0
-            if [ -z "$SLAVE_RAW" ]; then
-              echo "SHOW SLAVE STATUS is empty (replication not configured / not running) -> will rebootstrap" | tee -a "$LOG"
-              NEED=1
-            else
-              SLAVE_IO=$(echo "$SLAVE_RAW" | get_val "Slave_IO_Running")
-              SLAVE_SQL=$(echo "$SLAVE_RAW" | get_val "Slave_SQL_Running")
-              LAST_IO_ERRNO=$(echo "$SLAVE_RAW" | get_val "Last_IO_Errno")
-              LAST_SQL_ERRNO=$(echo "$SLAVE_RAW" | get_val "Last_SQL_Errno")
-              LAST_IO_ERR=$(echo "$SLAVE_RAW" | get_val "Last_IO_Error")
-              LAST_SQL_ERR=$(echo "$SLAVE_RAW" | get_val "Last_SQL_Error")
-
-              echo "Slave IO='${SLAVE_IO:-}' Slave SQL='${SLAVE_SQL:-}'" | tee -a "$LOG"
-              echo "Last_IO_Errno='${LAST_IO_ERRNO:-}' Last_SQL_Errno='${LAST_SQL_ERRNO:-}'" | tee -a "$LOG"
-              echo "Last_IO_Error='${LAST_IO_ERR:-}' Last_SQL_Error='${LAST_SQL_ERR:-}'" | tee -a "$LOG"
-
-              if [ "${SLAVE_IO:-}" = "Yes" ] && [ "${SLAVE_SQL:-}" = "Yes" ] \
-                && { [ -z "${LAST_IO_ERR:-}" ] || [ "${LAST_IO_ERR,,}" = "no error" ]; } \
-                && { [ -z "${LAST_SQL_ERR:-}" ] || [ "${LAST_SQL_ERR,,}" = "no error" ]; } \
-                && { [ -z "${LAST_IO_ERRNO:-}" ] || [ "${LAST_IO_ERRNO:-0}" = "0" ]; } \
-                && { [ -z "${LAST_SQL_ERRNO:-}" ] || [ "${LAST_SQL_ERRNO:-0}" = "0" ]; }; then
-                echo "Both slave threads running and no replication errors -> no action" | tee -a "$LOG"
-              else
-                NOT_RUNNING=0
-                [ "${SLAVE_IO:-No}" != "Yes" ] && NOT_RUNNING=1
-                [ "${SLAVE_SQL:-No}" != "Yes" ] && NOT_RUNNING=1
-                HAS_ERROR=0
-                [ -n "${LAST_IO_ERRNO:-}" ] && [ "${LAST_IO_ERRNO:-0}" != "0" ] && HAS_ERROR=1
-                [ -n "${LAST_SQL_ERRNO:-}" ] && [ "${LAST_SQL_ERRNO:-0}" != "0" ] && HAS_ERROR=1
-                ERR_TEXT="$(printf '%s %s' "${LAST_IO_ERR:-}" "${LAST_SQL_ERR:-}" | tr '[:upper:]' '[:lower:]' | trim)"
-                [ -n "$ERR_TEXT" ] && [ "$ERR_TEXT" != "no error" ] && HAS_ERROR=1
-
-                echo "Decision: NOT_RUNNING=$NOT_RUNNING HAS_ERROR=$HAS_ERROR" | tee -a "$LOG"
-                [ $NOT_RUNNING -eq 1 ] || [ $HAS_ERROR -eq 1 ] && NEED=1 || echo "Threads healthy -> no action" | tee -a "$LOG"
-              fi
-            fi
-
-            if [ $NEED -eq 1 ]; then
-              echo "$(date -u) Starting rebootstrap flow" | tee -a "$LOG"
-
-              MASTER_STATUS=$(mysql --protocol=TCP -h "$MASTER_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -sse "SHOW MASTER STATUS;" 2>>"$LOG" || true)
-              MASTER_LOG_FILE=$(echo "$MASTER_STATUS" | awk '{print $1}' | trim || true)
-              MASTER_LOG_POS=$(echo "$MASTER_STATUS" | awk '{print $2}' | trim || true)
-              if [ -z "$MASTER_LOG_FILE" ] || [ -z "$MASTER_LOG_POS" ]; then
-                echo "Failed to get master position from $MASTER_SERVICE_HOST" | tee -a "$LOG"
-                sleep 120; continue
-              fi
-              echo "Master position: ${MASTER_LOG_FILE}:${MASTER_LOG_POS}" | tee -a "$LOG"
-
-              echo "Stopping slave on replica host" | tee -a "$LOG"
-              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "STOP SLAVE;" >>"$LOG" 2>&1 || true
-
-              DUMP_FILE="/tmp/${DB_NAME}_backup.sql"
-              echo "Dumping ${DB_NAME} from master ${MASTER_SERVICE_HOST}" | tee -a "$LOG"
-              if command -v timeout >/dev/null 2>&1; then
-                if ! timeout 300 mysqldump --protocol=TCP -h "$MASTER_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" --single-transaction "$DB_NAME" \
-                  | tee "$DUMP_FILE" >/dev/null 2>>"$LOG"; then
-                  echo "Dump FAILED; aborting this cycle" | tee -a "$LOG"; sleep 120; continue
-                fi
-              else
-                if ! mysqldump --protocol=TCP -h "$MASTER_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" --single-transaction "$DB_NAME" \
-                  | tee "$DUMP_FILE" >/dev/null 2>>"$LOG"; then
-                  echo "Dump FAILED; aborting this cycle" | tee -a "$LOG"; sleep 120; continue
-                fi
-              fi
-
-              ls -lh $DUMP_FILE
-
-              echo "Ensuring database '$DB_NAME' exists on replica" | tee -a "$LOG"
-              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" \
-                -e "CREATE DATABASE IF NOT EXISTS \`$DB_NAME\`;" >>"$LOG" 2>&1
-
-              echo "Importing dump into replica host" | tee -a "$LOG"
-              if ! cat "$DUMP_FILE" | mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" "$DB_NAME" >>"$LOG" 2>&1; then
-                echo "Import FAILED; aborting this cycle (replication will not be reconfigured)" | tee -a "$LOG"
-                sleep 120; continue
-              fi
-              echo "Import completed successfully" | tee -a "$LOG"
-
-              echo "Reconfiguring replication to ${MASTER_SERVICE_HOST}:${MASTER_LOG_FILE}:${MASTER_LOG_POS}" | tee -a "$LOG"
-              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "RESET SLAVE ALL;" >>"$LOG" 2>&1 || true
-              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "CHANGE MASTER TO MASTER_HOST='${MASTER_SERVICE_HOST}', MASTER_USER='${MYSQL_REPLICATION_USER}', MASTER_PASSWORD='${MYSQL_REPLICATION_PASSWORD}', MASTER_LOG_FILE='${MASTER_LOG_FILE}', MASTER_LOG_POS=${MASTER_LOG_POS}; START SLAVE;" >>"$LOG" 2>&1 || true
-
-              echo "SHOW SLAVE STATUS after rebootstrap:" | tee -a "$LOG"
-              mysql --protocol=TCP -h "$REPLICA_SERVICE_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SHOW SLAVE STATUS\\G" >>"$LOG" 2>&1 || true
-            fi
-
-            echo "Sleeping 120s before next check" | tee -a "$LOG"
-            sleep 120
-          done
-      restartPolicy: Always
diff --git a/deploy/kubernetes/manifests/mariadb-replica.yaml b/deploy/kubernetes/manifests/mariadb-replica.yaml
index 44c009e..79f31d4 100644
--- a/deploy/kubernetes/manifests/mariadb-replica.yaml
+++ b/deploy/kubernetes/manifests/mariadb-replica.yaml
@@ -27,17 +27,19 @@ spec:
       storage: 20Gi
 ---
 apiVersion: apps/v1
-kind: Deployment
+kind: StatefulSet
 metadata:
   name: wrestlingdev-mariadb
   labels:
     app: wrestlingdev
 spec:
+  replicas: 1
+  serviceName: wrestlingdev-mariadb
   selector:
     matchLabels:
       app: wrestlingdev
-  strategy:
-    type: Recreate
+  updateStrategy:
+    type: RollingUpdate
   template:
     metadata:
       labels:
@@ -47,6 +49,50 @@ spec:
         prometheus.io/port: "9125"
         prometheus.io/scrape: "true"
     spec:
+      initContainers:
+      - name: bootstrap
+        image: mariadb:10.3
+        env:
+        - name: MARIADB_ROOT_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: dbpassword
+        - name: MASTER_HOST
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_host
+        command:
+        - bash
+        - -c
+        - |
+          # Abort on any failed command or pipeline stage so the init container fails instead of starting an empty replica
+          set -euo pipefail
+          if [ -d /var/lib/mysql/mysql ]; then
+            echo "Data directory already initialized, skipping bootstrap"
+            exit 0
+          fi
+          echo "Fresh data directory — bootstrapping replica from ${MASTER_HOST}"
+          # '|| true' keeps an empty grep result from tripping pipefail; emptiness is checked explicitly below
+          DBS=$(mysql --protocol=TCP -h "$MASTER_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" \
+            -e "SHOW DATABASES;" --skip-column-names \
+            | { grep -Ev '^(information_schema|performance_schema|mysql|sys)$' || true; } \
+            | tr '\n' ' ')
+          if [ -z "${DBS// /}" ]; then
+            echo "No user databases found on ${MASTER_HOST}; refusing to bootstrap" >&2
+            exit 1
+          fi
+          echo "Dumping databases: ${DBS}"
+          mysqldump --protocol=TCP -h "$MASTER_HOST" -uroot -p"$MARIADB_ROOT_PASSWORD" \
+            --single-transaction --master-data=2 --gtid --databases $DBS \
+            > /docker-entrypoint-initdb.d/dump.sql
+          echo "Bootstrap dump complete"
+        volumeMounts:
+        - name: wrestlingdev-mariadb-persistent-storage
+          mountPath: /var/lib/mysql
+        - name: init-scripts
+          mountPath: /docker-entrypoint-initdb.d
       containers:
       - image: mariadb:10.3
         name: mariadb
@@ -56,6 +102,53 @@ spec:
             secretKeyRef:
               name: wrestlingdev-secrets
               key: dbpassword
+        - name: MASTER_HOST
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_host
+        - name: MYSQL_REPLICATION_USER
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_user
+        - name: MYSQL_REPLICATION_PASSWORD
+          valueFrom:
+            secretKeyRef:
+              name: wrestlingdev-secrets
+              key: replication_password
+        lifecycle:
+          postStart:
+            exec:
+              command:
+              - bash
+              - -c
+              - |
+                for i in $(seq 1 60); do
+                  mysqladmin ping -uroot -p"$MARIADB_ROOT_PASSWORD" --protocol=TCP -h 127.0.0.1 --silent && break
+                  sleep 2
+                done
+                SLAVE_STATUS=$(mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SHOW SLAVE STATUS\G" 2>/dev/null)
+                SLAVE_IO=$(echo "$SLAVE_STATUS" | grep -m1 "Slave_IO_Running" | awk '{print $2}')
+                SLAVE_SQL=$(echo "$SLAVE_STATUS" | grep -m1 "Slave_SQL_Running" | awk '{print $2}')
+                if [ "${SLAVE_IO}" = "Yes" ] && [ "${SLAVE_SQL}" = "Yes" ]; then
+                  echo "Replication is already running"
+                  exit 0
+                fi
+                mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "STOP SLAVE; RESET SLAVE ALL;"
+                if [ -f /docker-entrypoint-initdb.d/dump.sql ]; then
+                  # Seed the position only when none is recorded yet: the emptyDir dump outlives container
+                  # restarts, and re-applying its stale GTID would replay events that were already applied
+                  CURRENT_POS=$(mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -N -s -e "SELECT @@GLOBAL.gtid_slave_pos;")
+                  GTID_POS=$(grep -m1 "SET GLOBAL gtid_slave_pos" /docker-entrypoint-initdb.d/dump.sql | sed -n "s/.*gtid_slave_pos='\([^']*\)'.*/\1/p")
+                  if [ -z "${CURRENT_POS}" ] && [ -n "${GTID_POS}" ]; then
+                    echo "Setting gtid_slave_pos from dump: '${GTID_POS}'"
+                    mysql -uroot -p"$MARIADB_ROOT_PASSWORD" -e "SET GLOBAL gtid_slave_pos='${GTID_POS}';"
+                  fi
+                fi
+                mysql -uroot -p"$MARIADB_ROOT_PASSWORD" \
+                  -e "CHANGE MASTER TO MASTER_HOST='${MASTER_HOST}', MASTER_USER='${MYSQL_REPLICATION_USER}', MASTER_PASSWORD='${MYSQL_REPLICATION_PASSWORD}', MASTER_USE_GTID=slave_pos;" \
+                  -e "START SLAVE;"
         ports:
         - containerPort: 3306
           name: mariadb
@@ -64,6 +157,8 @@ spec:
           mountPath: /var/lib/mysql
         - name: mysettings-config-volume
           mountPath: /etc/mysql/mariadb.conf.d
+        - name: init-scripts
+          mountPath: /docker-entrypoint-initdb.d
         # resources:
         #   limits:
         #     memory: "512Mi"
@@ -180,6 +275,8 @@ spec:
       - name: mysettings-config-volume
         configMap:
           name: mariadb-mysettings
+      - name: init-scripts
+        emptyDir: {}
 ---
 apiVersion: v1
 kind: ConfigMap
@@ -191,29 +288,44 @@ metadata:
 data:
   70-mysettings.cnf: |
     [mariadb]
-    # Slow log
+    # Slow query log — records queries taking longer than long_query_time seconds
     slow_query_log=1
     #slow_query_log_file=/var/log/mariadb/slow.log
     slow_query_log_file=/var/lib/mysql/slow.log
     long_query_time=0.2
+
     # mysqltunner recommendations
+    # Max size for in-memory temp tables before spilling to disk
     tmp_table_size=32M
     max_heap_table_size=32M
+    # Collect detailed query/table statistics (required by some monitoring tools)
     performance_schema=ON
+    # Size of each InnoDB redo log file; increase for write-heavy workloads
     innodb_log_file_size=32M
+    # Number of open table handles to cache; reduces overhead of reopening tables
     table_open_cache=4000
 
-    # replica settings
-    server_id=2 # Default server_id, can be overridden for master/slave
-    log_bin=mysql-bin # Enable binary logging
-    binlog_format=ROW # Recommended for replication
-    log_slave_updates=ON # Ensure slaves log updates (useful for multi-source replication)
-    sync_binlog=1 # Flush binary logs after each transaction for safety
-    read_only=0 # Default, will be managed by the init script
-    expire_logs_days=7 # Retain binary logs for 7 days
-    # if you want to ignore dbs to replicate
-    # replicate-ignore-db=wrestlingtourney-queue
-    # if you only want to replicate certain dbs
+    # Replication (replica)
+    # Must be unique and different from the master's server_id
+    server_id=2
+    # Enable binary logging on the replica (required for log_slave_updates)
+    log_bin=mysql-bin
+    # ROW format is safest: records exact row changes rather than SQL statements
+    binlog_format=ROW
+    # Write replicated events into this replica's own binlog (needed for chained replicas)
+    log_slave_updates=ON
+    # Enforce GTID consistency — rejects transactions that would break GTID sequences
+    gtid_strict_mode=ON
+    # Flush binlog to disk on every commit; prevents binlog loss on crash
+    sync_binlog=1
+    # Prevent accidental writes directly to the replica
+    read_only=1
+    # How many days to retain binary logs before automatic purge
+    expire_logs_days=7
+
+    # Only replicate the application database — rails-specific: excludes the solid_queue DB so
+    # background job workers can run independently on the replica cluster
     replicate-do-db=wrestlingdev
-    
+    # replicate-ignore-db=wrestlingtourney-queue
+
     # /etc/mysql/mariadb.conf.d/70-mysettings.cnf
diff --git a/deploy/kubernetes/manifests/mariadb-standalone.yaml b/deploy/kubernetes/manifests/mariadb-standalone.yaml
index 342b898..d520e31 100644
--- a/deploy/kubernetes/manifests/mariadb-standalone.yaml
+++ b/deploy/kubernetes/manifests/mariadb-standalone.yaml
@@ -27,17 +27,19 @@ spec:
       storage: 20Gi
 ---
 apiVersion: apps/v1
-kind: Deployment
+kind: StatefulSet
 metadata:
   name: wrestlingdev-mariadb
   labels:
     app: wrestlingdev
 spec:
+  replicas: 1
+  serviceName: wrestlingdev-mariadb
   selector:
     matchLabels:
       app: wrestlingdev
-  strategy:
-    type: Recreate
+  updateStrategy:
+    type: RollingUpdate
   template:
     metadata:
       labels:
@@ -227,25 +229,37 @@ metadata:
 data:
   70-mysettings.cnf: |
     [mariadb]
-    # Slow log
+    # Slow query log — records queries taking longer than long_query_time seconds
     slow_query_log=1
     #slow_query_log_file=/var/log/mariadb/slow.log
     slow_query_log_file=/var/lib/mysql/slow.log
     long_query_time=0.2
+
     # mysqltunner recommendations
+    # Max size for in-memory temp tables before spilling to disk
     tmp_table_size=32M
     max_heap_table_size=32M
+    # Collect detailed query/table statistics (required by some monitoring tools)
     performance_schema=ON
+    # Size of each InnoDB redo log file; increase for write-heavy workloads
     innodb_log_file_size=32M
+    # Number of open table handles to cache; reduces overhead of reopening tables
     table_open_cache=4000
+    # How many days to retain binary logs before automatic purge
     expire_logs_days=7
 
-    # master slave
-    server_id=1 # Unique server ID for the master
-    log_bin=mysql-bin # Enable binary logging
-    binlog_format=ROW # Recommended format for replication (ROW, STATEMENT, or MIXED)
-    log_slave_updates=ON # Ensure any changes replicated to the master are also logged to the binary log (useful for multi-source replication)
-    sync_binlog=1 # Ensures binary logs are synchronized with disk after each transaction for data safety
-    expire_logs_days=7 # Optional: Number of days to retain binary logs (helps with cleanup)
-    
-    # /etc/mysql/mariadb.conf.d/70-mysettings.cnf
+    # Replication (master)
+    # Unique ID for this server across the whole replication topology
+    server_id=1
+    # Enable binary logging — required for replication
+    log_bin=mysql-bin
+    # ROW format is safest: records exact row changes rather than SQL statements
+    binlog_format=ROW
+    # Include replicated events in this server's own binlog (needed for chained replicas)
+    log_slave_updates=ON
+    # Enforce GTID consistency — rejects transactions that would break GTID sequences
+    gtid_strict_mode=ON
+    # Flush binlog to disk on every commit; prevents binlog loss on crash
+    sync_binlog=1
+
+    # /etc/mysql/mariadb.conf.d/70-mysettings.cnf is included by the main config and will override any conflicting settings in the default config files. This allows us to customize settings without modifying the base image.