ansible docker-stack.yml

Docker Stack Deployment

Deploy and configure Docker with common containers across multiple hosts

yaml
---
# Ansible Playbook: Deploy Docker Stack
# Usage: ansible-playbook -i inventory docker-stack.yml

- name: Deploy Docker and common containers
  hosts: docker_hosts
  become: yes
  vars:
    docker_compose_version: "2.24.0"
    containers:
      - name: portainer
        image: portainer/portainer-ce:latest
        ports: ["9443:9443"]
        volumes: ["/var/run/docker.sock:/var/run/docker.sock", "portainer_data:/data"]
      - name: watchtower
        image: containrrr/watchtower:latest
        volumes: ["/var/run/docker.sock:/var/run/docker.sock"]
        environment:
          WATCHTOWER_CLEANUP: "true"
          WATCHTOWER_SCHEDULE: "0 0 4 * * *"

  tasks:
    - name: Install Docker dependencies
      apt:
        name:
          - apt-transport-https
          - ca-certificates
          - curl
          - gnupg
          - lsb-release
          - python3-docker  # Docker SDK for Python, required by community.docker modules
        state: present
        update_cache: yes

    - name: Add Docker GPG key
      apt_key:
        url: https://download.docker.com/linux/ubuntu/gpg
        state: present

    - name: Add Docker repository
      apt_repository:
        repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        state: present

    - name: Install Docker
      apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-buildx-plugin
          - docker-compose-plugin
        state: present

    - name: Start and enable Docker
      systemd:
        name: docker
        state: started
        enabled: yes

    - name: Deploy containers
      community.docker.docker_container:
        name: "{{ item.name }}"
        image: "{{ item.image }}"
        ports: "{{ item.ports | default(omit) }}"
        volumes: "{{ item.volumes | default(omit) }}"
        env: "{{ item.environment | default(omit) }}"
        restart_policy: unless-stopped
      loop: "{{ containers }}"
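
Quick usage from the control node, assuming the inventory defines the docker_hosts group used above (inventory path and verification commands are illustrative):

bash
# Run the playbook, then spot-check the containers on every host
ansible-playbook -i inventory docker-stack.yml
ansible docker_hosts -i inventory -b -m command -a "docker ps"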
ansible tailscale-setup.yml

Tailscale Mesh VPN Setup

Install and configure Tailscale across all nodes with subnet routing

yaml
---
# Ansible Playbook: Tailscale Mesh VPN
# Usage: ansible-playbook -i inventory tailscale-setup.yml -e "tailscale_authkey=tskey-xxx"

- name: Install and configure Tailscale
  hosts: all
  become: yes
  vars:
    tailscale_authkey: "{{ lookup('env', 'TAILSCALE_AUTHKEY') }}"
    subnet_routers:
      - host: alpha-centauri
        advertise_routes: "10.42.0.0/24"
      - host: titawin-host
        advertise_routes: "192.168.20.0/24"

  tasks:
    - name: Add Tailscale repository key
      apt_key:
        url: "https://pkgs.tailscale.com/stable/ubuntu/{{ ansible_distribution_release }}.noarmor.gpg"
        state: present

    - name: Add Tailscale repository
      apt_repository:
        repo: "deb https://pkgs.tailscale.com/stable/ubuntu {{ ansible_distribution_release }} main"
        state: present

    - name: Install Tailscale
      apt:
        name: tailscale
        state: present
        update_cache: yes

    - name: Enable IP forwarding (for subnet routers)
      sysctl:
        name: "{{ item }}"
        value: "1"
        sysctl_set: yes
        reload: yes
      loop:
        - net.ipv4.ip_forward
        - net.ipv6.conf.all.forwarding
      when: inventory_hostname in (subnet_routers | map(attribute='host'))

    - name: Start Tailscale service
      systemd:
        name: tailscaled
        state: started
        enabled: yes

    - name: Authenticate with Tailscale
      command: >
        tailscale up
        --authkey={{ tailscale_authkey }}
        --ssh
        --accept-routes
        {% if inventory_hostname in (subnet_routers | map(attribute='host')) %}
        --advertise-routes={{ (subnet_routers | selectattr('host', 'eq', inventory_hostname) | first).advertise_routes }}
        {% endif %}
      register: tailscale_up
      changed_when: "'Success' in tailscale_up.stdout"

    - name: Get Tailscale IP
      command: tailscale ip -4
      register: ts_ip
      changed_when: false

    - name: Display Tailscale IP
      debug:
        msg: "{{ inventory_hostname }} Tailscale IP: {{ ts_ip.stdout_lines[0] }}"
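
A minimal run, assuming the auth key is supplied via the environment as in the vars above. Note that advertised subnet routes typically still have to be approved in the Tailscale admin console (unless auto-approvers are configured) before other nodes can use them:

bash
# Pass the pre-auth key through the environment (or -e tailscale_authkey=tskey-xxx)
export TAILSCALE_AUTHKEY="tskey-xxx"
ansible-playbook -i inventory tailscale-setup.yml

# Confirm every node joined the tailnet
ansible all -i inventory -b -m command -a "tailscale status"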
ansible monitoring-stack.yml

Monitoring Stack Deployment

Deploy Glances, Prometheus node exporter, and configure metrics collection

yaml
---
# Ansible Playbook: Monitoring Stack
# Deploys Glances and Prometheus exporters to all hosts

- name: Deploy monitoring agents
  hosts: all
  become: yes
  vars:
    glances_port: 61208
    node_exporter_port: 9100
    glances_version: "4.0.0"

  tasks:
    - name: Install Python dependencies
      apt:
        name:
          - python3-pip
          - python3-docker
        state: present

    - name: Install Glances via pip
      pip:
        name: "glances[all]=={{ glances_version }}"
        state: present

    - name: Create Glances systemd service
      copy:
        dest: /etc/systemd/system/glances.service
        content: |
          [Unit]
          Description=Glances System Monitor
          After=network.target

          [Service]
          ExecStart=/usr/local/bin/glances -w -p {{ glances_port }}
          Restart=on-failure
          RestartSec=10

          [Install]
          WantedBy=multi-user.target
      notify: Reload systemd

    - name: Download Prometheus Node Exporter
      get_url:
        url: "https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz"
        dest: /tmp/node_exporter.tar.gz

    - name: Extract Node Exporter
      unarchive:
        src: /tmp/node_exporter.tar.gz
        dest: /usr/local/bin
        remote_src: yes
        extra_opts: [--strip-components=1]
        creates: /usr/local/bin/node_exporter

    - name: Create Node Exporter systemd service
      copy:
        dest: /etc/systemd/system/node_exporter.service
        content: |
          [Unit]
          Description=Prometheus Node Exporter
          After=network.target

          [Service]
          ExecStart=/usr/local/bin/node_exporter --web.listen-address=:{{ node_exporter_port }}
          Restart=on-failure

          [Install]
          WantedBy=multi-user.target
      notify: Reload systemd

    - name: Start monitoring services
      systemd:
        name: "{{ item }}"
        state: started
        enabled: yes
      loop:
        - glances
        - node_exporter

  handlers:
    - name: Reload systemd
      systemd:
        daemon_reload: yes
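
A quick smoke test once the play finishes; replace <host> with any inventory host. Both ports come from the playbook vars, and the node_exporter endpoints can then be added to an existing Prometheus scrape config:

bash
# Node Exporter metrics endpoint
curl -s http://<host>:9100/metrics | head -n 5

# Glances web UI
curl -sI http://<host>:61208/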
terraform proxmox-vm/main.tf

Proxmox VM Provisioning

Terraform module to create VMs on Proxmox with cloud-init

hcl
# Terraform: Proxmox VM Module
# Creates VMs with cloud-init configuration

terraform {
  required_providers {
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
  }
}

variable "proxmox_host" {
  description = "Proxmox host IP"
  default     = "10.42.0.201"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
}

variable "target_node" {
  description = "Proxmox node to deploy on"
  default     = "icarus"
}

variable "cores" {
  description = "Number of CPU cores"
  default     = 4
}

variable "memory" {
  description = "RAM in MB"
  default     = 4096
}

variable "disk_size" {
  description = "Boot disk size"
  default     = "32G"
}

variable "ip_address" {
  description = "Static IP address"
  type        = string
}

variable "gateway" {
  description = "Network gateway"
  default     = "10.42.0.1"
}

variable "ssh_keys" {
  description = "SSH public keys for cloud-init"
  type        = string
}

resource "proxmox_vm_qemu" "vm" {
  name        = var.vm_name
  target_node = var.target_node
  clone       = "ubuntu-cloud-template"

  cores   = var.cores
  sockets = 1
  memory  = var.memory

  agent = 1  # Enable QEMU guest agent

  disk {
    storage = "local-zfs"
    size    = var.disk_size
    type    = "scsi"
  }

  network {
    model  = "virtio"
    bridge = "vmbr0"
  }

  # Cloud-init configuration
  os_type    = "cloud-init"
  ipconfig0  = "ip=${var.ip_address}/24,gw=${var.gateway}"
  ciuser     = "commander"
  sshkeys    = var.ssh_keys

  lifecycle {
    ignore_changes = [
      network,
    ]
  }

  tags = "terraform,${var.vm_name}"
}

output "vm_ip" {
  value = var.ip_address
}

output "vm_id" {
  value = proxmox_vm_qemu.vm.vmid
}
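
A sketch of applying this module standalone; the values are examples, and the Proxmox provider credentials are assumed to be configured in the root module or via environment variables:

bash
terraform init
terraform apply \
  -var 'vm_name=test-vm' \
  -var 'ip_address=10.42.0.50' \
  -var "ssh_keys=$(cat ~/.ssh/id_ed25519.pub)"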
terraform cloudflare-tunnel/main.tf

Cloudflare Tunnel Configuration

Terraform module for Cloudflare Tunnel and DNS records

hcl
# Terraform: Cloudflare Tunnel
# Manages tunnel configuration and DNS records

terraform {
  required_providers {
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4.0"
    }
  }
}

variable "cloudflare_account_id" {
  description = "Cloudflare account ID"
  type        = string
  sensitive   = true
}

variable "cloudflare_zone_id" {
  description = "Cloudflare zone ID for your domain"
  type        = string
}

variable "domain" {
  description = "Base domain name"
  default     = "argobox.com"
}

variable "tunnel_secret" {
  description = "Tunnel secret (base64)"
  type        = string
  sensitive   = true
}

variable "services" {
  description = "Services to expose through tunnel"
  type = list(object({
    subdomain = string
    service   = string
    port      = number
  }))
  default = [
    { subdomain = "git", service = "localhost", port = 3000 },
    { subdomain = "ai", service = "localhost", port = 30000 },
    { subdomain = "vault", service = "localhost", port = 31745 },
  ]
}

# Create the tunnel
resource "cloudflare_tunnel" "homelab" {
  account_id = var.cloudflare_account_id
  name       = "homelab-tunnel"
  secret     = var.tunnel_secret
}

# Configure tunnel routes
resource "cloudflare_tunnel_config" "homelab" {
  account_id = var.cloudflare_account_id
  tunnel_id  = cloudflare_tunnel.homelab.id

  config {
    dynamic "ingress_rule" {
      for_each = var.services
      content {
        hostname = "${ingress_rule.value.subdomain}.${var.domain}"
        service  = "http://${ingress_rule.value.service}:${ingress_rule.value.port}"
      }
    }

    # Catch-all rule (required)
    ingress_rule {
      service = "http_status:404"
    }
  }
}

# Create DNS records pointing to tunnel
resource "cloudflare_record" "tunnel_dns" {
  for_each = { for s in var.services : s.subdomain => s }

  zone_id = var.cloudflare_zone_id
  name    = each.value.subdomain
  value   = "${cloudflare_tunnel.homelab.id}.cfargotunnel.com"
  type    = "CNAME"
  proxied = true
}

output "tunnel_id" {
  value = cloudflare_tunnel.homelab.id
}

output "tunnel_token" {
  value     = cloudflare_tunnel.homelab.tunnel_token
  sensitive = true
}
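
Rough workflow for applying this and starting the connector; it assumes the Cloudflare provider is authenticated via CLOUDFLARE_API_TOKEN, the remaining variables are supplied however you prefer, and cloudflared is installed on the host that will terminate the tunnel:

bash
export CLOUDFLARE_API_TOKEN="..."
export TF_VAR_tunnel_secret="$(openssl rand -base64 32)"
terraform apply

# Run the connector with the token generated by Terraform
cloudflared tunnel run --token "$(terraform output -raw tunnel_token)"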
shell backup.sh

Automated Backup Script

Rsync-based backup script with rotation and remote sync

bash
#!/bin/bash
# Automated Backup Script with Rotation
# Supports local and remote (rsync over SSH) destinations

set -euo pipefail

# Configuration
BACKUP_NAME="homelab-backup"
SOURCE_DIRS=(
    "/etc"
    "/home/commander"
    "/opt/docker"
    "/var/lib/docker/volumes"
)
LOCAL_DEST="/mnt/backups/${BACKUP_NAME}"
REMOTE_HOST="spica-silo"  # Synology NAS
REMOTE_DEST="/volume1/backups/${HOSTNAME}"
RETENTION_DAYS=30
LOG_FILE="/var/log/backup.log"

# Excludes
EXCLUDES=(
    "*.tmp"
    "*.cache"
    "node_modules"
    ".git"
    "__pycache__"
    "*.log"
)

# Build exclude arguments as an array (safe quoting, no eval needed)
EXCLUDE_ARGS=()
for pattern in "${EXCLUDES[@]}"; do
    EXCLUDE_ARGS+=(--exclude="${pattern}")
done

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Create timestamped backup directory
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="${LOCAL_DEST}/${TIMESTAMP}"

log "Starting backup to ${BACKUP_DIR}"

# Create backup directory
mkdir -p "${BACKUP_DIR}"

# Backup each source directory
for src in "${SOURCE_DIRS[@]}"; do
    if [[ -d "$src" ]]; then
        dest_name=$(echo "$src" | tr '/' '_' | sed 's/^_//')
        log "Backing up $src -> ${BACKUP_DIR}/${dest_name}"

        rsync -avz --delete "${EXCLUDE_ARGS[@]}" \
            "$src/" "${BACKUP_DIR}/${dest_name}/" 2>&1 | tee -a "$LOG_FILE"
    else
        log "WARNING: Source $src does not exist, skipping"
    fi
done

# Create latest symlink
ln -sfn "${BACKUP_DIR}" "${LOCAL_DEST}/latest"

# Sync to remote if available
if ping -c 1 "${REMOTE_HOST}" &> /dev/null; then
    log "Syncing to remote: ${REMOTE_HOST}:${REMOTE_DEST}"
    rsync -avz --delete "${BACKUP_DIR}/" "${REMOTE_HOST}:${REMOTE_DEST}/${TIMESTAMP}/" 2>&1 | tee -a "$LOG_FILE"
    ssh "${REMOTE_HOST}" "ln -sfn ${REMOTE_DEST}/${TIMESTAMP} ${REMOTE_DEST}/latest"
else
    log "WARNING: Remote host ${REMOTE_HOST} unreachable, skipping remote sync"
fi

# Cleanup old backups
log "Cleaning up backups older than ${RETENTION_DAYS} days"
find "${LOCAL_DEST}" -maxdepth 1 -type d -mtime +${RETENTION_DAYS} -exec rm -rf {} \;

if ping -c 1 "${REMOTE_HOST}" &> /dev/null; then
    ssh "${REMOTE_HOST}" "find ${REMOTE_DEST} -maxdepth 1 -type d -mtime +${RETENTION_DAYS} -exec rm -rf {} \;"
fi

log "Backup completed successfully"

# Summary
BACKUP_SIZE=$(du -sh "${BACKUP_DIR}" | cut -f1)
log "Backup size: ${BACKUP_SIZE}"
shell docker-cleanup.sh

Docker Cleanup Script

Safely clean up Docker resources with size reporting

bash
#!/bin/bash
# Docker Cleanup Script
# Removes unused images, containers, volumes, and networks

set -euo pipefail

echo "=== Docker Cleanup Script ==="
echo "Started at: $(date)"
echo

# Show current disk usage
echo "Current Docker disk usage:"
docker system df
echo

# Stop and remove exited containers
EXITED=$(docker ps -aq -f status=exited)
if [[ -n "$EXITED" ]]; then
    echo "Removing exited containers..."
    docker rm $EXITED
else
    echo "No exited containers to remove"
fi

# Remove dangling images
DANGLING=$(docker images -q -f dangling=true)
if [[ -n "$DANGLING" ]]; then
    echo "Removing dangling images..."
    docker rmi $DANGLING
else
    echo "No dangling images to remove"
fi

# Remove unused volumes
echo "Removing unused volumes..."
docker volume prune -f

# Remove unused networks
echo "Removing unused networks..."
docker network prune -f

# Optional: Remove all unused images (uncomment if needed)
# echo "Removing all unused images..."
# docker image prune -a -f

# Final cleanup with system prune
echo "Running system prune..."
docker system prune -f

echo
echo "Final Docker disk usage:"
docker system df

echo
echo "Cleanup completed at: $(date)"
openrc openrc-service-template

OpenRC Service Template

Template for creating OpenRC init scripts (Gentoo/Alpine)

bash
#!/sbin/openrc-run
# OpenRC Service Template
# Place in /etc/init.d/ and chmod +x
# Enable with: rc-update add servicename default

name="myservice"
description="My Custom Service"

# Service configuration
command="/usr/local/bin/myservice"
command_args="--config /etc/myservice/config.yaml"
command_user="commander"
command_group="commander"
command_background=true

# PID file location
pidfile="/run/${RC_SVCNAME}.pid"

# Log configuration
output_log="/var/log/${RC_SVCNAME}.log"
error_log="/var/log/${RC_SVCNAME}.err"

# Dependencies
depend() {
    need net
    after firewall
    use dns logger
}

# Pre-start checks
start_pre() {
    checkpath --directory --owner ${command_user}:${command_group} --mode 0755 /var/lib/myservice
    checkpath --file --owner ${command_user}:${command_group} --mode 0640 /etc/myservice/config.yaml
}

# Custom start function (optional)
start() {
    ebegin "Starting ${name}"
    start-stop-daemon --start \
        --exec ${command} \
        --user ${command_user} \
        --group ${command_group} \
        --background \
        --make-pidfile \
        --pidfile ${pidfile} \
        --stdout ${output_log} \
        --stderr ${error_log} \
        -- ${command_args}
    eend $?
}

# Custom stop function (optional)
stop() {
    ebegin "Stopping ${name}"
    start-stop-daemon --stop \
        --exec ${command} \
        --pidfile ${pidfile}
    eend $?
}

# Status check
status() {
    if [ -f "${pidfile}" ]; then
        if kill -0 $(cat ${pidfile}) 2>/dev/null; then
            einfo "${name} is running (PID: $(cat ${pidfile}))"
            return 0
        fi
    fi
    einfo "${name} is not running"
    return 3
}
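
Typical lifecycle on an Alpine or Gentoo host, assuming the script was saved as /etc/init.d/myservice (the filename becomes RC_SVCNAME):

bash
chmod +x /etc/init.d/myservice
rc-update add myservice default
rc-service myservice start
rc-service myservice status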
systemd backup.timer

Systemd Timer Template

Systemd service and timer for scheduled tasks

ini
# Systemd Timer: backup.timer
# Place in /etc/systemd/system/
# Enable with: systemctl enable --now backup.timer

# === backup.service ===
# [Unit]
# Description=Automated Backup Service
# After=network-online.target
# Wants=network-online.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/backup.sh
# User=root
# StandardOutput=journal
# StandardError=journal
#
# [Install]
# WantedBy=multi-user.target

# === backup.timer ===
[Unit]
Description=Run backup daily at 3 AM

[Timer]
# Run at 3:00 AM every day
OnCalendar=*-*-* 03:00:00

# Add randomized delay up to 15 minutes
RandomizedDelaySec=900

# Run immediately if we missed the last scheduled time
Persistent=true

# Also trigger once, 5 minutes after boot
OnBootSec=5min

[Install]
WantedBy=timers.target

# === Useful timer expressions ===
# OnCalendar=hourly           # Every hour
# OnCalendar=daily            # Every day at midnight
# OnCalendar=weekly           # Every Monday at midnight
# OnCalendar=*-*-* 04:00:00   # Every day at 4 AM
# OnCalendar=Mon *-*-* 02:00  # Every Monday at 2 AM
# OnCalendar=*-*-01 00:00:00  # First of every month

# === Commands ===
# systemctl list-timers              # List all timers
# systemctl status backup.timer      # Check timer status
# systemctl start backup.service     # Run manually
# journalctl -u backup.service       # View logs
shell zfs-snapshot.sh

ZFS Snapshot Management

Automated ZFS snapshots with retention policy

bash
#!/bin/bash
# ZFS Snapshot Management Script
# Creates snapshots with automatic rotation

set -euo pipefail

# Configuration
POOL="tank-storage"
SNAPSHOT_PREFIX="auto"
HOURLY_KEEP=24
DAILY_KEEP=30
WEEKLY_KEEP=12

log() {
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Create snapshot
create_snapshot() {
    local snap_type="$1"
    local timestamp=$(date +%Y%m%d_%H%M%S)
    local snap_name="${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_${timestamp}"

    log "Creating snapshot: ${snap_name}"
    zfs snapshot -r "${snap_name}"
}

# List snapshots by type
list_snapshots() {
    local snap_type="$1"
    zfs list -t snapshot -o name -s creation | grep "${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_" || true
}

# Delete old snapshots
cleanup_snapshots() {
    local snap_type="$1"
    local keep="$2"

    local snapshots=($(list_snapshots "${snap_type}"))
    local count=${#snapshots[@]}

    if (( count > keep )); then
        local to_delete=$((count - keep))
        log "Cleaning up ${to_delete} old ${snap_type} snapshots"

        for ((i=0; i<to_delete; i++)); do
            log "Deleting: ${snapshots[i]}"
            zfs destroy -r "${snapshots[i]}"
        done
    else
        log "No ${snap_type} snapshots to clean (have ${count}, keep ${keep})"
    fi
}

# Main logic based on argument
case "${1:-hourly}" in
    hourly)
        create_snapshot "hourly"
        cleanup_snapshots "hourly" $HOURLY_KEEP
        ;;
    daily)
        create_snapshot "daily"
        cleanup_snapshots "daily" $DAILY_KEEP
        ;;
    weekly)
        create_snapshot "weekly"
        cleanup_snapshots "weekly" $WEEKLY_KEEP
        ;;
    list)
        echo "=== Hourly Snapshots ==="
        list_snapshots "hourly"
        echo
        echo "=== Daily Snapshots ==="
        list_snapshots "daily"
        echo
        echo "=== Weekly Snapshots ==="
        list_snapshots "weekly"
        ;;
    status)
        echo "=== ZFS Pool Status ==="
        zpool status $POOL
        echo
        echo "=== Snapshot Counts ==="
        echo "Hourly: $(list_snapshots hourly | wc -l) / $HOURLY_KEEP"
        echo "Daily:  $(list_snapshots daily | wc -l) / $DAILY_KEEP"
        echo "Weekly: $(list_snapshots weekly | wc -l) / $WEEKLY_KEEP"
        ;;
    *)
        echo "Usage: $0 {hourly|daily|weekly|list|status}"
        exit 1
        ;;
esac

log "Done"