Feature Request Restart Stack instead of just container #128

smarthome-enthusiast · 2024-03-01T07:32:54Z

It would be really great if we could restart the entire stack instead of just the containers. Perhaps we could include a label on the containers, like "autoheal.restart.stack=true", so that when a container is detected as unhealthy, we can stop and restart the entire stack.

Jorman · 2024-05-16T13:05:19Z

Hi, sorry for the intrusion. I think this can be linked to #49. I also have this problem: until not long ago the solution from that issue worked, but since I reinstalled everything something is not working, and during startup it goes into an endless loop. I think the proposed entrypoint is "old".
I tried modifying the old entrypoint to adapt it to the new one, but to no avail.
@willfarrell
Do you think it is a good idea to have master/slave functionality? Would you be able to integrate such functionality?

J

Jorman · 2024-05-16T15:37:19Z

OK, as I said, I tried to modify the entrypoint with the changes from #49, but it is not working. I don't know how to debug it, so I'm leaving it here in the hope that someone can fix it.

#!/usr/bin/env sh

# Stop at the first failing command, and let a pipeline fail when any of its
# stages fails.
set -e
# shellcheck disable=2039
set -o pipefail

# Connection and notification settings. A value supplied through the
# environment is kept; an unset or empty one falls back to the default.
: "${DOCKER_SOCK:=/var/run/docker.sock}"
: "${CURL_TIMEOUT:=30}"
: "${WEBHOOK_URL:=}"
: "${WEBHOOK_JSON_KEY:=text}"
: "${APPRISE_URL:=}"

# Always starts out empty, whatever the environment holds.
UNIX_SOCK=""

# only use unix domain socket if no TCP endpoint is defined
#
# Derive the curl settings from DOCKER_SOCK:
#   tcp://host:port   -> HTTP_ENDPOINT=http://host:port
#   tcps://host:port  -> HTTP_ENDPOINT=https://host:port, plus the TLS flags
#                        CA, CLIENT_KEY and CLIENT_CERT (files under /certs)
#   anything else     -> taken as a unix socket path: HTTP_ENDPOINT is
#                        http://localhost and UNIX_SOCK carries --unix-socket
# The scheme is swapped with parameter expansion rather than `echo | sed`, so
# the value is never word-split, globbed or run through echo.
configure_docker_endpoint() {
  case "${DOCKER_SOCK}" in
    "tcp://"*)
      HTTP_ENDPOINT="http://${DOCKER_SOCK#tcp://}"
      ;;
    "tcps://"*)
      HTTP_ENDPOINT="https://${DOCKER_SOCK#tcps://}"
      CA="--cacert /certs/ca.pem"
      CLIENT_KEY="--key /certs/client-key.pem"
      CLIENT_CERT="--cert /certs/client-cert.pem"
      ;;
    *)
      HTTP_ENDPOINT="http://localhost"
      UNIX_SOCK="--unix-socket ${DOCKER_SOCK}"
      ;;
  esac
}

configure_docker_endpoint

# Monitoring settings: values from the environment win, otherwise these
# defaults apply.
: "${AUTOHEAL_CONTAINER_LABEL:=autoheal}"
: "${AUTOHEAL_START_PERIOD:=0}"
: "${AUTOHEAL_INTERVAL:=5}"
: "${AUTOHEAL_DEFAULT_STOP_TIMEOUT:=10}"
: "${AUTOHEAL_ONLY_MONITOR_RUNNING:=false}"

# Print the effective settings once at startup.
echo AUTOHEAL_CONTAINER_LABEL=$AUTOHEAL_CONTAINER_LABEL
echo AUTOHEAL_START_PERIOD=$AUTOHEAL_START_PERIOD
echo AUTOHEAL_INTERVAL=$AUTOHEAL_INTERVAL
echo AUTOHEAL_DEFAULT_STOP_TIMEOUT=$AUTOHEAL_DEFAULT_STOP_TIMEOUT
echo AUTOHEAL_ONLY_MONITOR_RUNNING=$AUTOHEAL_ONLY_MONITOR_RUNNING

# Call curl against the Docker API with the shared connection options.
# Arguments: passed through to curl after the common flags.
# Globals:   CURL_TIMEOUT, CA, CLIENT_KEY, CLIENT_CERT, UNIX_SOCK (read)
docker_curl() {
  # Each of these holds "--flag value" (or nothing), so they are left
  # unquoted on purpose to split into separate curl arguments.
  # shellcheck disable=2086
  set -- ${CA} ${CLIENT_KEY} ${CLIENT_CERT} ${UNIX_SOCK} "$@"
  curl --max-time "${CURL_TIMEOUT}" --no-buffer -s "$@"
}

# shellcheck disable=2039
# Query the Docker container list and print the raw JSON response(s).
# Arguments: $1 - "all" to list unhealthy containers whose label value is
#                 master, slave or true (three requests, in that order);
#                 any other value lists every container whose label equals
#                 that value, with no health filter.
# Globals:   HTTP_ENDPOINT, AUTOHEAL_CONTAINER_LABEL,
#            AUTOHEAL_ONLY_MONITOR_RUNNING (read)
# When AUTOHEAL_CONTAINER_LABEL is "all", $1 is ignored and every unhealthy
# container is listed regardless of labels.
get_container_info() {
  local status_part=""
  local unhealthy_base
  local wanted

  # Restrict to running containers unless the option is exactly "false"
  if [ "$AUTOHEAL_ONLY_MONITOR_RUNNING" != false ]; then
    status_part=",\"status\":\[\"running\"\]"
  fi

  unhealthy_base="${HTTP_ENDPOINT}/containers/json?filters=\{\"health\":\[\"unhealthy\"\]"

  if [ "$AUTOHEAL_CONTAINER_LABEL" = "all" ]; then
    docker_curl "${unhealthy_base}${status_part}\}"
    return
  fi

  if [ "$1" != "all" ]; then
    docker_curl "${HTTP_ENDPOINT}/containers/json?filters=\{\"label\":\[\"${AUTOHEAL_CONTAINER_LABEL}=${1}\"\]${status_part}\}"
    return
  fi

  # One request per recognised label value; the responses are concatenated
  for wanted in master slave true; do
    docker_curl "${unhealthy_base},\"label\":\[\"${AUTOHEAL_CONTAINER_LABEL}=${wanted}\"\]${status_part}\}"
  done
}

# shellcheck disable=2039
# Ask the Docker API to restart one container.
# Arguments: $1 - container id
#            $2 - value sent as the "t" query parameter of the restart call
# Returns:   the exit status of docker_curl, which is run with -f
restart_container() {
  local restart_url

  restart_url="${HTTP_ENDPOINT}/containers/${1}/restart?t=${2}"
  docker_curl -f -X POST "$restart_url"
}

# Send a notification text to the configured webhook and/or Apprise endpoint.
# Arguments: the message; several arguments are joined with single spaces.
# Globals:   WEBHOOK_URL, APPRISE_URL (read); an empty URL disables that target.
# The curl calls run in the foreground here. Callers that must not block
# invoke this function with a trailing "&".
notify_webhook() {
  local text="$*"

  if [ -n "$WEBHOOK_URL" ]
  then
    # The message and URL are quoted so whitespace is preserved and glob
    # characters in the text are not expanded to file names.
    curl -s -X POST -H "Content-type: application/json" -d "$(generate_webhook_payload "$text")" "$WEBHOOK_URL"
  fi

  if [ -n "$APPRISE_URL" ]
  then
    curl -s -X POST -H "Content-type: application/json" -d "$(generate_apprise_payload "$text")" "$APPRISE_URL"
  fi
}

# Launch the optional post-restart hook without waiting for it.
# Arguments: forwarded unchanged to the hook command.
# Globals:   POST_RESTART_SCRIPT (read); nothing happens when it is empty.
notify_post_restart_script() {
  if [ -z "$POST_RESTART_SCRIPT" ]; then
    return 0
  fi

  # Run in the background so the healer is not blocked. The variable is left
  # unquoted, so it is word-split into a command plus any leading arguments.
  $POST_RESTART_SCRIPT "$@" &
}

# https://towardsdatascience.com/proper-ways-to-pass-environment-variables-in-json-for-curl-post-f797d2698bf3
# Print the JSON body for the webhook: a single-key object whose key is
# WEBHOOK_JSON_KEY and whose value is the message (arguments joined with
# spaces). The text is inserted verbatim, with no JSON escaping.
generate_webhook_payload() {
  printf '{\n  "%s":"%s"\n}\n' "$WEBHOOK_JSON_KEY" "$*"
}

# Print the JSON body for Apprise: a fixed "Autoheal" title plus the message
# (arguments joined with spaces) as the body. The text is inserted verbatim,
# with no JSON escaping.
generate_apprise_payload() {
  printf '{\n  "title":"Autoheal",\n  "body":"%s"\n}\n' "$*"
}

# SIGTERM-handler
# Ends the script with the exit status used for termination by SIGTERM.
term_handler() {
  # 128 + 15 (signal number of SIGTERM) = 143
  exit $((128 + 15))
}

# Restart every container returned by get_container_info for one label value.
# Arguments: $1 - selector handed to get_container_info (e.g. master, slave)
# Globals:   AUTOHEAL_DEFAULT_STOP_TIMEOUT (read), STOP_TIMEOUT (written)
# For each container jq prints four lines (id, first name, state, stop
# timeout), which the loop reads back four at a time.
loop_containers() {
  STOP_TIMEOUT=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT"

  get_container_info "$1" \
    | jq -r ".[] | select(.Labels[\"autoheal\"] != \"False\") | foreach . as \$CONTAINER([];[]; \$CONTAINER | .Id, .Names[0], .State, ${STOP_TIMEOUT})" \
    | while read -r id && read -r name && read -r state && read -r stop_timeout; do
      # shellcheck disable=2039
      short_id=${id:0:12}
      now=$(date +%d-%m-%Y" "%H:%M:%S)

      if [ "$name" = "null" ]; then
        echo "$now Container name of (${short_id}) is null, which implies container does not exist - don't restart" >&2
        continue
      fi

      if [ "$state" = "restarting" ]; then
        echo "$now Container $name (${short_id}) found to be restarting - don't restart"
        continue
      fi

      echo "$now Container $name (${short_id}) found to be unhealthy - Restarting container now with ${stop_timeout}s timeout"
      # ${name:1} drops the first character of the name in notifications
      if restart_container "$id" "$stop_timeout"; then
        notify_webhook "Container ${name:1} (${short_id}) found to be unhealthy. Successfully restarted the container!" &
      else
        echo "$now Restarting container $short_id failed" >&2
        notify_webhook "Container ${name:1} (${short_id}) found to be unhealthy. Failed to restart the container!" &
      fi
      notify_post_restart_script "$name" "$short_id" "$state" "$stop_timeout" &
    done
}

# shellcheck disable=2039
# Restart the whole stack: master container(s) first, then the slaves once
# the master reports healthy.
# Globals: AUTOHEAL_INTERVAL (read) - pause between health polls
# The health is taken from the text between the first pair of parentheses in
# the master's Status string. If no master container is returned at all, the
# wait is skipped; otherwise the empty status would never equal "healthy"
# and this function would poll forever.
restart_all_containers() {
  local status
  local health

  # Restart master
  loop_containers "master"

  # Wait for healthy master container
  while true; do
    status=$(get_container_info "master" | jq '.[].Status')
    if [ -z "$status" ]; then
      echo "no master container found - not waiting for a healthy master" >&2
      break
    fi

    # Only the first status line is inspected
    health=$(printf '%s\n' "$status" | awk -F'[()]' 'NR == 1 {print $2}')
    # Unquoted on purpose: joins several status lines into one log line
    # shellcheck disable=2086
    echo "master container in status: "$status

    if [ "$health" = "healthy" ]; then
      break
    fi
    sleep "$AUTOHEAL_INTERVAL"
  done

  # Restart others
  loop_containers "slave"
}

# shellcheck disable=2039
trap 'kill $$; term_handler' SIGTERM

# shellcheck disable=2039
# Run one monitoring pass over the containers reported as unhealthy.
# Globals: AUTOHEAL_CONTAINER_LABEL, AUTOHEAL_DEFAULT_STOP_TIMEOUT (read),
#          STOP_TIMEOUT (written)
# jq prints five lines per container: id, first name, state, stop timeout and
# the value of the AUTOHEAL_CONTAINER_LABEL label. The label must be part of
# the jq output: the loop reads five lines per container, so without it the
# label variable would receive the next container's id, or the read would
# fail and the container would be skipped.
heal_unhealthy_containers() {
  local label_field

  STOP_TIMEOUT=".Labels[\"autoheal.stop.timeout\"] // $AUTOHEAL_DEFAULT_STOP_TIMEOUT"
  label_field=".Labels[\"${AUTOHEAL_CONTAINER_LABEL}\"]"
  get_container_info "all" | \
    jq -r ".[] | select(.Labels[\"autoheal\"] != \"False\") | foreach . as \$CONTAINER([];[]; \$CONTAINER | .Id, .Names[0], .State, ${STOP_TIMEOUT}, ${label_field})" | \
    while read -r CONTAINER_ID && read -r CONTAINER_NAME && read -r CONTAINER_STATE && read -r TIMEOUT && read -r CONTAINER_LABEL
  do
    # shellcheck disable=2039
    CONTAINER_SHORT_ID=${CONTAINER_ID:0:12}
    DATE=$(date +%d-%m-%Y" "%H:%M:%S)

    if [ "$CONTAINER_NAME" = "null" ]
    then
      echo "$DATE Container name of (${CONTAINER_SHORT_ID}) is null, which implies container does not exist - don't restart" >&2
    elif [ "$CONTAINER_STATE" = "restarting" ]
    then
      echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be restarting - don't restart"
    else
      echo "$DATE Container $CONTAINER_NAME (${CONTAINER_SHORT_ID}) found to be unhealthy - Restarting container now with ${TIMEOUT}s timeout"
      case "$CONTAINER_LABEL" in
        master|slave)
          # Member of a stack: restart master and slaves together, then stop
          # this pass because the remaining entries are now stale.
          restart_all_containers
          break
          ;;
        *)
          # Label "true", or no such label (AUTOHEAL_CONTAINER_LABEL=all):
          # restart only this container.
          if ! restart_container "$CONTAINER_ID" "$TIMEOUT"
          then
            echo "$DATE Restarting container $CONTAINER_SHORT_ID failed" >&2
            notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Failed to restart the container!" &
          else
            notify_webhook "Container ${CONTAINER_NAME:1} (${CONTAINER_SHORT_ID}) found to be unhealthy. Successfully restarted the container!" &
          fi
          notify_post_restart_script "$CONTAINER_NAME" "$CONTAINER_SHORT_ID" "$CONTAINER_STATE" "$TIMEOUT" &
          ;;
      esac
    fi
  done
}

# Entry point: "autoheal" as first argument starts the monitoring loop; any
# other command line is exec'd as given.
if [ "$1" = "autoheal" ]
then
  if [ -n "$UNIX_SOCK" ] && ! [ -S "$DOCKER_SOCK" ]
  then
    echo "unix socket is currently not available" >&2
    exit 1
  fi
  # Delayed startup
  if [ "$AUTOHEAL_START_PERIOD" -gt 0 ]
  then
    echo "Monitoring containers for unhealthy status in $AUTOHEAL_START_PERIOD second(s)"
    # sleep runs in the background and is waited on so the SIGTERM trap can
    # interrupt the wait
    sleep "$AUTOHEAL_START_PERIOD" &
    wait $!
  fi

  while true
  do
    heal_unhealthy_containers
    sleep "$AUTOHEAL_INTERVAL" &
    wait $!
  done

else
  exec "$@"
fi

smarthome-enthusiast · 2024-05-17T15:18:29Z

@Jorman Have a look at #49. Seems like he has posted a working version a few hours ago.

Jorman · 2024-05-17T20:56:17Z

Thank you @smarthome-enthusiast, I'll take a look. I also have some problems with compose files that contain more than one service.

smarthome-enthusiast · 2024-05-19T10:24:26Z

@Jorman managed to get the new script working fine with the latest version.

For those looking for a solution, thank @baroka in #49 (comment) for his awesome entrypoint script.

Steps:

download script and save it somewhere accessible by docker

Change the owner to the docker group (`chown`) and add execute permission to the script (`chmod`). I don't know whether this is strictly necessary, but I got a permission error without it:

    chown -R $USER:docker  /path/to/script/folder
    chmod 754 /path/to/script

Mount the file as a VOLUME to /entry.sh

     volumes:
        - /path/to/script:/entry.sh

Add the following to Docker compose file under the autoheal service:

     entrypoint: /entry.sh # Adds feature: restart all containers (master first) on unhealthy one (master or slave)
     command: "autoheal"

edit Docker compose file of monitored services with:

     labels:
        autoheal: master/slave

@willfarrell Firstly, thanks for the fantastic docker. Any chance of integrating this?

Jorman · 2024-05-19T12:51:17Z

Hi @smarthome-enthusiast yes seems to work but with some small adjustments to do.
Anyway, I found that with a recent docker-compose (mine is 2.27) it is possible to restart the whole stack, or at least it seems so. I ran some tests for the situations in which my containers used to get stuck (when docker-ce was updated, when an image inside the stack was updated, or when I restarted the service), and it is working. My setup for the stack is:

# Example stack from the thread: one "master" service and two dependent
# services that share the master's network namespace.
services:
    # Master: its healthcheck pings www.google.com once per run.
    my-master-service:
        image: ...
        container_name: my_1st_container_name
        ...
        healthcheck:
            test: "ping -c 1 www.google.com || exit 1"
            interval: 60s
            timeout: 5s
            retries: 3
        restart: unless-stopped

    # First dependent service: uses the master's network and checks its own
    # port over localhost.
    my-1st-slave-service:
        image: ...
        container_name: my_2nd_container_name
        ...
        network_mode: "service:my-master-service"
        depends_on:
          my-master-service:
            condition: service_started
            # NOTE(review): per the thread, this is what makes Compose restart
            # this service together with the master - confirm against the
            # Compose specification for your Compose version.
            restart: true
        healthcheck:
            test: "curl --fail http://localhost:my_2nd_container_service_port || exit 1"
            interval: 30s
            timeout: 10s
            retries: 5
        restart: unless-stopped

    # Second dependent service: same pattern as the first one.
    my-2nd-slave-service:
        image: ...
        container_name: my_3rd_container_name
        ...
        network_mode: "service:my-master-service"
        depends_on:
          my-master-service:
            condition: service_started
            # NOTE(review): see the note on the first dependent service's
            # depends_on entry - confirm for your Compose version.
            restart: true
        healthcheck:
            test: "curl --fail http://localhost:my_3rd_container_service_port || exit 1"
            interval: 30s
            timeout: 10s
            retries: 5
        restart: unless-stopped

In my case this works without specifying labels for master/slave, but I'm still testing it. It seems this functionality has been available since Compose 2.20, more or less. Still, for me the best outcome would be to have this method integrated into docker-autoheal.

smarthome-enthusiast closed this as completed May 19, 2024

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Feature Request Restart Stack instead of just container #128

Feature Request Restart Stack instead of just container #128

smarthome-enthusiast commented Mar 1, 2024

Jorman commented May 16, 2024 •

edited

Jorman commented May 16, 2024

smarthome-enthusiast commented May 17, 2024

Jorman commented May 17, 2024

smarthome-enthusiast commented May 19, 2024

Jorman commented May 19, 2024

**Feature Request** Restart Stack instead of just container #128

**Feature Request** Restart Stack instead of just container #128

Comments

smarthome-enthusiast commented Mar 1, 2024

Jorman commented May 16, 2024 • edited

Jorman commented May 16, 2024

smarthome-enthusiast commented May 17, 2024

Jorman commented May 17, 2024

smarthome-enthusiast commented May 19, 2024

Jorman commented May 19, 2024

Feature Request Restart Stack instead of just container #128

Feature Request Restart Stack instead of just container #128

Jorman commented May 16, 2024 •

edited