Infrastructure as Code
Production-ready automation templates for homelab infrastructure. Ansible playbooks, Terraform modules, and shell scripts used to manage the ArgoBox environment.
Docker Stack Deployment
Deploy and configure Docker with common containers across multiple hosts
---
# Ansible Playbook: Deploy Docker Stack
# Usage: ansible-playbook -i inventory docker-stack.yml
- name: Deploy Docker and common containers
  hosts: docker_hosts
  become: yes
  vars:
    docker_compose_version: "2.24.0"
    containers:
      - name: portainer
        image: portainer/portainer-ce:latest
        ports: ["9443:9443"]
        volumes: ["/var/run/docker.sock:/var/run/docker.sock", "portainer_data:/data"]
      - name: watchtower
        image: containrrr/watchtower:latest
        volumes: ["/var/run/docker.sock:/var/run/docker.sock"]
        environment:
          WATCHTOWER_CLEANUP: "true"
          WATCHTOWER_SCHEDULE: "0 0 4 * * *"
  tasks:
    - name: Install Docker dependencies
      apt:
        name:
          - apt-transport-https
          - ca-certificates
          - curl
          - gnupg
          - lsb-release
        state: present
        update_cache: yes

    - name: Add Docker GPG key
      apt_key:
        url: https://download.docker.com/linux/ubuntu/gpg
        state: present

    - name: Add Docker repository
      apt_repository:
        repo: "deb https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        state: present

    - name: Install Docker
      apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-buildx-plugin
          - docker-compose-plugin
        state: present

    - name: Start and enable Docker
      systemd:
        name: docker
        state: started
        enabled: yes

    - name: Deploy containers
      community.docker.docker_container:
        name: "{{ item.name }}"
        image: "{{ item.image }}"
        ports: "{{ item.ports | default(omit) }}"
        volumes: "{{ item.volumes | default(omit) }}"
        env: "{{ item.environment | default(omit) }}"
        restart_policy: unless-stopped
      loop: "{{ containers }}"
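A minimal run sketch, assuming an inventory that defines the docker_hosts group (the hostnames and user below are placeholders, not part of the playbook):

# Example inventory + run (placeholder hostnames)
cat > inventory <<'EOF'
[docker_hosts]
dock-host-01 ansible_user=commander
dock-host-02 ansible_user=commander
EOF

# The container deploy task needs the community.docker collection
ansible-galaxy collection install community.docker
ansible-playbook -i inventory docker-stack.yml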
Tailscale Mesh VPN Setup
Install and configure Tailscale across all nodes with subnet routing
---
# Ansible Playbook: Tailscale Mesh VPN
# Usage: ansible-playbook -i inventory tailscale-setup.yml -e "tailscale_authkey=tskey-xxx"
- name: Install and configure Tailscale
  hosts: all
  become: yes
  vars:
    tailscale_authkey: "{{ lookup('env', 'TAILSCALE_AUTHKEY') }}"
    subnet_routers:
      - host: alpha-centauri
        advertise_routes: "10.42.0.0/24"
      - host: titawin-host
        advertise_routes: "192.168.20.0/24"
  tasks:
    - name: Add Tailscale repository key
      apt_key:
        url: https://pkgs.tailscale.com/stable/ubuntu/jammy.noarmor.gpg
        state: present

    - name: Add Tailscale repository
      apt_repository:
        repo: "deb https://pkgs.tailscale.com/stable/ubuntu jammy main"
        state: present

    - name: Install Tailscale
      apt:
        name: tailscale
        state: present
        update_cache: yes

    - name: Enable IP forwarding (for subnet routers)
      sysctl:
        name: "{{ item }}"
        value: "1"
        sysctl_set: yes
        reload: yes
      loop:
        - net.ipv4.ip_forward
        - net.ipv6.conf.all.forwarding
      when: inventory_hostname in (subnet_routers | map(attribute='host'))

    - name: Start Tailscale service
      systemd:
        name: tailscaled
        state: started
        enabled: yes

    - name: Authenticate with Tailscale
      command: >
        tailscale up
        --authkey={{ tailscale_authkey }}
        --ssh
        --accept-routes
        {% if inventory_hostname in (subnet_routers | map(attribute='host')) %}
        --advertise-routes={{ (subnet_routers | selectattr('host', 'eq', inventory_hostname) | first).advertise_routes }}
        {% endif %}
      register: tailscale_up
      changed_when: "'Success' in tailscale_up.stdout"

    - name: Get Tailscale status
      command: tailscale status
      register: ts_status
      changed_when: false

    - name: Display Tailscale IP
      debug:
        msg: "{{ inventory_hostname }} Tailscale IP: {{ ts_status.stdout_lines[0] }}"
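A hedged invocation and post-run check; the auth key is a placeholder, and the ping target assumes the alpha-centauri route from the vars has been approved in the Tailscale admin console:

# Pass the auth key via the environment (placeholder value)
export TAILSCALE_AUTHKEY="tskey-auth-xxxxxxxxxxxx"
ansible-playbook -i inventory tailscale-setup.yml

# Spot-check from any node
tailscale status
tailscale ip -4
ping -c 1 10.42.0.1   # through the subnet router, once the advertised route is approved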
Monitoring Stack Deployment
Deploy Glances, Prometheus node exporter, and configure metrics collection
---
# Ansible Playbook: Monitoring Stack
# Deploys Glances and Prometheus exporters to all hosts
- name: Deploy monitoring agents
  hosts: all
  become: yes
  vars:
    glances_port: 61208
    node_exporter_port: 9100
    glances_version: "4.0.0"
  tasks:
    - name: Install Python dependencies
      apt:
        name:
          - python3-pip
          - python3-docker
        state: present

    - name: Install Glances via pip
      pip:
        name: glances[all]
        state: present

    - name: Create Glances systemd service
      copy:
        dest: /etc/systemd/system/glances.service
        content: |
          [Unit]
          Description=Glances System Monitor
          After=network.target

          [Service]
          ExecStart=/usr/local/bin/glances -w -p {{ glances_port }}
          Restart=on-failure
          RestartSec=10

          [Install]
          WantedBy=multi-user.target
      notify: Reload systemd

    - name: Download Prometheus Node Exporter
      get_url:
        url: "https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz"
        dest: /tmp/node_exporter.tar.gz

    - name: Extract Node Exporter
      unarchive:
        src: /tmp/node_exporter.tar.gz
        dest: /usr/local/bin
        remote_src: yes
        extra_opts: [--strip-components=1]
        creates: /usr/local/bin/node_exporter

    - name: Create Node Exporter systemd service
      copy:
        dest: /etc/systemd/system/node_exporter.service
        content: |
          [Unit]
          Description=Prometheus Node Exporter
          After=network.target

          [Service]
          ExecStart=/usr/local/bin/node_exporter --web.listen-address=:{{ node_exporter_port }}
          Restart=on-failure

          [Install]
          WantedBy=multi-user.target
      notify: Reload systemd

    - name: Start monitoring services
      systemd:
        name: "{{ item }}"
        state: started
        enabled: yes
        daemon_reload: yes  # handlers flush at end of play, so reload here before starting new units
      loop:
        - glances
        - node_exporter

  handlers:
    - name: Reload systemd
      systemd:
        daemon_reload: yes
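Quick verification once the play finishes; the hostname is a placeholder and the ports come from the playbook vars:

# Glances web UI should answer on 61208, node_exporter serves metrics on 9100
curl -sI http://dock-host-01:61208/ | head -n 1
curl -s http://dock-host-01:9100/metrics | grep '^node_load1'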
Proxmox VM Provisioning
Terraform module to create VMs on Proxmox with cloud-init
# Terraform: Proxmox VM Module
# Creates VMs with cloud-init configuration
terraform {
  required_providers {
    proxmox = {
      source  = "Telmate/proxmox"
      version = "~> 2.9"
    }
  }
}

variable "proxmox_host" {
  description = "Proxmox host IP"
  default     = "10.42.0.201"
}

variable "vm_name" {
  description = "Name of the VM"
  type        = string
}

variable "target_node" {
  description = "Proxmox node to deploy on"
  default     = "icarus"
}

variable "cores" {
  description = "Number of CPU cores"
  default     = 4
}

variable "memory" {
  description = "RAM in MB"
  default     = 4096
}

variable "disk_size" {
  description = "Boot disk size"
  default     = "32G"
}

variable "ip_address" {
  description = "Static IP address"
  type        = string
}

variable "gateway" {
  description = "Network gateway"
  default     = "10.42.0.1"
}

variable "ssh_keys" {
  description = "SSH public keys for cloud-init"
  type        = string
}

resource "proxmox_vm_qemu" "vm" {
  name        = var.vm_name
  target_node = var.target_node
  clone       = "ubuntu-cloud-template"
  cores       = var.cores
  sockets     = 1
  memory      = var.memory
  agent       = 1 # Enable QEMU guest agent

  disk {
    storage = "local-zfs"
    size    = var.disk_size
    type    = "scsi"
  }

  network {
    model  = "virtio"
    bridge = "vmbr0"
  }

  # Cloud-init configuration
  os_type   = "cloud-init"
  ipconfig0 = "ip=${var.ip_address}/24,gw=${var.gateway}"
  ciuser    = "commander"
  sshkeys   = var.ssh_keys

  lifecycle {
    ignore_changes = [
      network,
    ]
  }

  tags = "terraform,${var.vm_name}"
}

output "vm_ip" {
  value = var.ip_address
}

output "vm_id" {
  value = proxmox_vm_qemu.vm.vmid
}
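The module assumes provider credentials are supplied separately; a usage sketch under that assumption, with placeholder API token and VM values (the Telmate provider can read PM_API_URL and API token details from the environment):

# Placeholder credentials and values
export PM_API_URL="https://10.42.0.201:8006/api2/json"
export PM_API_TOKEN_ID="terraform@pve!provisioner"
export PM_API_TOKEN_SECRET="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"

terraform init
terraform apply \
  -var 'vm_name=k3s-worker-04' \
  -var 'ip_address=10.42.0.154' \
  -var "ssh_keys=$(cat ~/.ssh/id_ed25519.pub)"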
Cloudflare Tunnel Configuration
Terraform module for Cloudflare Tunnel and DNS records
# Terraform: Cloudflare Tunnel
# Manages tunnel configuration and DNS records
terraform {
  required_providers {
    cloudflare = {
      source  = "cloudflare/cloudflare"
      version = "~> 4.0"
    }
  }
}

variable "cloudflare_account_id" {
  description = "Cloudflare account ID"
  type        = string
  sensitive   = true
}

variable "cloudflare_zone_id" {
  description = "Cloudflare zone ID for your domain"
  type        = string
}

variable "domain" {
  description = "Base domain name"
  default     = "argobox.com"
}

variable "tunnel_secret" {
  description = "Tunnel secret (base64)"
  type        = string
  sensitive   = true
}

variable "services" {
  description = "Services to expose through tunnel"
  type = list(object({
    subdomain = string
    service   = string
    port      = number
  }))
  default = [
    { subdomain = "git", service = "localhost", port = 3000 },
    { subdomain = "ai", service = "localhost", port = 30000 },
    { subdomain = "vault", service = "localhost", port = 31745 },
  ]
}

# Create the tunnel
resource "cloudflare_tunnel" "homelab" {
  account_id = var.cloudflare_account_id
  name       = "homelab-tunnel"
  secret     = var.tunnel_secret
}

# Configure tunnel routes
resource "cloudflare_tunnel_config" "homelab" {
  account_id = var.cloudflare_account_id
  tunnel_id  = cloudflare_tunnel.homelab.id

  config {
    dynamic "ingress_rule" {
      for_each = var.services
      content {
        hostname = "${ingress_rule.value.subdomain}.${var.domain}"
        service  = "http://${ingress_rule.value.service}:${ingress_rule.value.port}"
      }
    }
    # Catch-all rule (required)
    ingress_rule {
      service = "http_status:404"
    }
  }
}

# Create DNS records pointing to tunnel
resource "cloudflare_record" "tunnel_dns" {
  for_each = { for s in var.services : s.subdomain => s }

  zone_id = var.cloudflare_zone_id
  name    = each.value.subdomain
  value   = "${cloudflare_tunnel.homelab.id}.cfargotunnel.com"
  type    = "CNAME"
  proxied = true
}

output "tunnel_id" {
  value = cloudflare_tunnel.homelab.id
}

output "tunnel_token" {
  value     = cloudflare_tunnel.homelab.tunnel_token
  sensitive = true
}
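After applying, a connector still has to run inside the LAN. A sketch under the assumption that the provider authenticates via CLOUDFLARE_API_TOKEN and that cloudflared is installed on the connector host:

# Provider auth + a generated base64 tunnel secret (placeholder values)
export CLOUDFLARE_API_TOKEN="xxxxxxxxxxxxxxxx"
export TF_VAR_tunnel_secret=$(openssl rand -base64 32)
terraform init && terraform apply

# Run the connector on a host that can reach the internal services
terraform output -raw tunnel_token > /tmp/tunnel_token
cloudflared tunnel run --token "$(cat /tmp/tunnel_token)"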
Automated Backup Script
Rsync-based backup script with rotation and remote sync
#!/bin/bash
# Automated Backup Script with Rotation
# Supports local and remote (rsync over SSH) destinations
set -euo pipefail

# Configuration
BACKUP_NAME="homelab-backup"
SOURCE_DIRS=(
  "/etc"
  "/home/commander"
  "/opt/docker"
  "/var/lib/docker/volumes"
)
LOCAL_DEST="/mnt/backups/${BACKUP_NAME}"
REMOTE_HOST="spica-silo"  # Synology NAS
REMOTE_DEST="/volume1/backups/${HOSTNAME}"
RETENTION_DAYS=30
LOG_FILE="/var/log/backup.log"

# Excludes
EXCLUDES=(
  "*.tmp"
  "*.cache"
  "node_modules"
  ".git"
  "__pycache__"
  "*.log"
)

# Build exclude arguments as an array (no eval, no quoting surprises)
EXCLUDE_ARGS=()
for pattern in "${EXCLUDES[@]}"; do
  EXCLUDE_ARGS+=(--exclude="${pattern}")
done

log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" | tee -a "$LOG_FILE"
}

# Create timestamped backup directory
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
BACKUP_DIR="${LOCAL_DEST}/${TIMESTAMP}"
log "Starting backup to ${BACKUP_DIR}"
mkdir -p "${BACKUP_DIR}"

# Backup each source directory
for src in "${SOURCE_DIRS[@]}"; do
  if [[ -d "$src" ]]; then
    dest_name=$(echo "$src" | tr '/' '_' | sed 's/^_//')
    log "Backing up $src -> ${BACKUP_DIR}/${dest_name}"
    rsync -avz --delete "${EXCLUDE_ARGS[@]}" \
      "$src/" "${BACKUP_DIR}/${dest_name}/" 2>&1 | tee -a "$LOG_FILE"
  else
    log "WARNING: Source $src does not exist, skipping"
  fi
done

# Create latest symlink
ln -sfn "${BACKUP_DIR}" "${LOCAL_DEST}/latest"

# Sync to remote if available
if ping -c 1 "${REMOTE_HOST}" &> /dev/null; then
  log "Syncing to remote: ${REMOTE_HOST}:${REMOTE_DEST}"
  rsync -avz --delete "${BACKUP_DIR}/" "${REMOTE_HOST}:${REMOTE_DEST}/${TIMESTAMP}/" 2>&1 | tee -a "$LOG_FILE"
  ssh "${REMOTE_HOST}" "ln -sfn ${REMOTE_DEST}/${TIMESTAMP} ${REMOTE_DEST}/latest"
else
  log "WARNING: Remote host ${REMOTE_HOST} unreachable, skipping remote sync"
fi

# Cleanup old backups (-mindepth 1 so the destination root itself is never matched)
log "Cleaning up backups older than ${RETENTION_DAYS} days"
find "${LOCAL_DEST}" -mindepth 1 -maxdepth 1 -type d -mtime +${RETENTION_DAYS} -exec rm -rf {} \;
if ping -c 1 "${REMOTE_HOST}" &> /dev/null; then
  ssh "${REMOTE_HOST}" "find ${REMOTE_DEST} -mindepth 1 -maxdepth 1 -type d -mtime +${RETENTION_DAYS} -exec rm -rf {} \;"
fi

log "Backup completed successfully"

# Summary
BACKUP_SIZE=$(du -sh "${BACKUP_DIR}" | cut -f1)
log "Backup size: ${BACKUP_SIZE}"
Docker Cleanup Script
Safely clean up Docker resources with size reporting
#!/bin/bash
# Docker Cleanup Script
# Removes unused images, containers, volumes, and networks
set -euo pipefail

echo "=== Docker Cleanup Script ==="
echo "Started at: $(date)"
echo

# Show current disk usage
echo "Current Docker disk usage:"
docker system df
echo

# Remove exited containers
EXITED=$(docker ps -aq -f status=exited)
if [[ -n "$EXITED" ]]; then
  echo "Removing exited containers..."
  docker rm $EXITED
else
  echo "No exited containers to remove"
fi

# Remove dangling images
DANGLING=$(docker images -q -f dangling=true)
if [[ -n "$DANGLING" ]]; then
  echo "Removing dangling images..."
  docker rmi $DANGLING
else
  echo "No dangling images to remove"
fi

# Remove unused volumes
echo "Removing unused volumes..."
docker volume prune -f

# Remove unused networks
echo "Removing unused networks..."
docker network prune -f

# Optional: Remove all unused images (uncomment if needed)
# echo "Removing all unused images..."
# docker image prune -a -f

# Final cleanup with system prune
echo "Running system prune..."
docker system prune -f
echo

echo "Final Docker disk usage:"
docker system df
echo
echo "Cleanup completed at: $(date)"
OpenRC Service Template
Template for creating OpenRC init scripts (Gentoo/Alpine)
#!/sbin/openrc-run
# OpenRC Service Template
# Place in /etc/init.d/ and chmod +x
# Enable with: rc-update add servicename default

name="myservice"
description="My Custom Service"

# Service configuration
command="/usr/local/bin/myservice"
command_args="--config /etc/myservice/config.yaml"
command_user="commander"
command_group="commander"
command_background=true

# PID file location
pidfile="/run/${RC_SVCNAME}.pid"

# Log configuration
output_log="/var/log/${RC_SVCNAME}.log"
error_log="/var/log/${RC_SVCNAME}.err"

# Dependencies
depend() {
  need net
  after firewall
  use dns logger
}

# Pre-start checks
start_pre() {
  checkpath --directory --owner ${command_user}:${command_group} --mode 0755 /var/lib/myservice
  checkpath --file --owner ${command_user}:${command_group} --mode 0640 /etc/myservice/config.yaml
}

# Custom start function (optional)
start() {
  ebegin "Starting ${name}"
  start-stop-daemon --start \
    --exec ${command} \
    --user ${command_user} \
    --group ${command_group} \
    --background \
    --make-pidfile \
    --pidfile ${pidfile} \
    --stdout ${output_log} \
    --stderr ${error_log} \
    -- ${command_args}
  eend $?
}

# Custom stop function (optional)
stop() {
  ebegin "Stopping ${name}"
  start-stop-daemon --stop \
    --exec ${command} \
    --pidfile ${pidfile}
  eend $?
}

# Status check
status() {
  if [ -f "${pidfile}" ]; then
    if kill -0 $(cat ${pidfile}) 2>/dev/null; then
      einfo "${name} is running (PID: $(cat ${pidfile}))"
      return 0
    fi
  fi
  einfo "${name} is not running"
  return 3
}
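Installing and managing the filled-in template on Gentoo or Alpine (service name matches the placeholder used above):

# Install, enable at boot, and manage the service
install -m 0755 myservice /etc/init.d/myservice
rc-update add myservice default
rc-service myservice start
rc-service myservice status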
Systemd Timer Template
Systemd service and timer for scheduled tasks
# Systemd Timer: backup.timer
# Place in /etc/systemd/system/
# Enable with: systemctl enable --now backup.timer
# === backup.service ===
# [Unit]
# Description=Automated Backup Service
# After=network-online.target
# Wants=network-online.target
#
# [Service]
# Type=oneshot
# ExecStart=/usr/local/bin/backup.sh
# User=root
# StandardOutput=journal
# StandardError=journal
#
# [Install]
# WantedBy=multi-user.target
# === backup.timer ===
[Unit]
Description=Run backup daily at 3 AM
[Timer]
# Run at 3:00 AM every day
OnCalendar=*-*-* 03:00:00
# Add randomized delay up to 15 minutes
RandomizedDelaySec=900
# Run immediately if we missed the last scheduled time
Persistent=true
# Also trigger 5 minutes after boot (in addition to the daily schedule)
OnBootSec=5min
[Install]
WantedBy=timers.target
# === Useful timer expressions ===
# OnCalendar=hourly # Every hour
# OnCalendar=daily # Every day at midnight
# OnCalendar=weekly # Every Monday at midnight
# OnCalendar=*-*-* 04:00:00 # Every day at 4 AM
# OnCalendar=Mon *-*-* 02:00 # Every Monday at 2 AM
# OnCalendar=*-*-01 00:00:00 # First of every month
# === Commands ===
# systemctl list-timers # List all timers
# systemctl status backup.timer # Check timer status
# systemctl start backup.service # Run manually
# journalctl -u backup.service # View logs
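Install steps, assuming both units are saved as backup.service and backup.timer before enabling:

sudo cp backup.service backup.timer /etc/systemd/system/
sudo systemctl daemon-reload
sudo systemctl enable --now backup.timer
systemctl list-timers backup.timer   # confirm the next scheduled run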
ZFS Snapshot Management
Automated ZFS snapshots with retention policy
#!/bin/bash
# ZFS Snapshot Management Script
# Creates snapshots with automatic rotation
set -euo pipefail

# Configuration
POOL="tank-storage"
SNAPSHOT_PREFIX="auto"
HOURLY_KEEP=24
DAILY_KEEP=30
WEEKLY_KEEP=12

log() {
  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
}

# Create snapshot
create_snapshot() {
  local snap_type="$1"
  local timestamp
  timestamp=$(date +%Y%m%d_%H%M%S)
  local snap_name="${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_${timestamp}"
  log "Creating snapshot: ${snap_name}"
  zfs snapshot -r "${snap_name}"
}

# List snapshots by type (oldest first)
list_snapshots() {
  local snap_type="$1"
  zfs list -t snapshot -o name -s creation | grep "${POOL}@${SNAPSHOT_PREFIX}_${snap_type}_" || true
}

# Delete old snapshots
cleanup_snapshots() {
  local snap_type="$1"
  local keep="$2"
  local snapshots=($(list_snapshots "${snap_type}"))
  local count=${#snapshots[@]}
  if (( count > keep )); then
    local to_delete=$((count - keep))
    log "Cleaning up ${to_delete} old ${snap_type} snapshots"
    for ((i=0; i<to_delete; i++)); do
      log "Deleting: ${snapshots[i]}"
      zfs destroy -r "${snapshots[i]}"
    done
  else
    log "No ${snap_type} snapshots to clean (have ${count}, keep ${keep})"
  fi
}

# Main logic based on argument
case "${1:-hourly}" in
  hourly)
    create_snapshot "hourly"
    cleanup_snapshots "hourly" $HOURLY_KEEP
    ;;
  daily)
    create_snapshot "daily"
    cleanup_snapshots "daily" $DAILY_KEEP
    ;;
  weekly)
    create_snapshot "weekly"
    cleanup_snapshots "weekly" $WEEKLY_KEEP
    ;;
  list)
    echo "=== Hourly Snapshots ==="
    list_snapshots "hourly"
    echo
    echo "=== Daily Snapshots ==="
    list_snapshots "daily"
    echo
    echo "=== Weekly Snapshots ==="
    list_snapshots "weekly"
    ;;
  status)
    echo "=== ZFS Pool Status ==="
    zpool status $POOL
    echo
    echo "=== Snapshot Counts ==="
    echo "Hourly: $(list_snapshots hourly | wc -l) / $HOURLY_KEEP"
    echo "Daily:  $(list_snapshots daily | wc -l) / $DAILY_KEEP"
    echo "Weekly: $(list_snapshots weekly | wc -l) / $WEEKLY_KEEP"
    ;;
  *)
    echo "Usage: $0 {hourly|daily|weekly|list|status}"
    exit 1
    ;;
esac

log "Done"