Fargate Spot lets you run containers at up to a 70% discount compared to on-demand pricing. Unlike EC2 Spot, you don’t manage instances — AWS handles the infrastructure, and your job is to handle interruptions gracefully. This guide covers real-world patterns for running Fargate Spot in production.

How Fargate Spot Works

Regular Fargate: $0.04048/vCPU/hour + $0.004445/GB/hour
Fargate Spot:    $0.01214/vCPU/hour + $0.001334/GB/hour (~70% discount; Spot prices vary over time)

Interruption: 2-minute warning (task state change event + SIGTERM to the container) → container is force-killed once its stopTimeout expires (max 120s on Fargate)

ECS with Fargate Spot

# Cluster with capacity providers
# Declares the ECS cluster named "production"; the capacity providers are
# attached separately by the aws_ecs_cluster_capacity_providers resource.
resource "aws_ecs_cluster" "main" {
  name = "production"
}

# Attaches both Fargate capacity providers to the cluster and sets the default
# strategy used by services that do not declare their own
# capacity_provider_strategy.
resource "aws_ecs_cluster_capacity_providers" "main" {
  cluster_name = aws_ecs_cluster.main.name

  capacity_providers = ["FARGATE", "FARGATE_SPOT"]

  # Default: the first task (base) always runs on-demand; tasks beyond the
  # base are split 1:4 between on-demand and Spot (about 80% Spot).
  default_capacity_provider_strategy {
    base              = 1          # At least 1 on-demand task
    weight            = 1          # 1 share of the tasks beyond the base
    capacity_provider = "FARGATE"
  }

  default_capacity_provider_strategy {
    weight            = 4          # 4 shares = ~80% of tasks beyond the base on Spot
    capacity_provider = "FARGATE_SPOT"
  }
}

Service Configuration

# Load-balanced service that mixes on-demand and Spot capacity.
resource "aws_ecs_service" "app" {
  name            = "app"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.app.arn
  desired_count   = 5

  # Rolling-deployment bounds. These are top-level arguments of
  # aws_ecs_service (there is no deployment_configuration block for them).
  # 100% minimum healthy + 200% maximum means new tasks are started before
  # old ones are stopped, so capacity never drops below desired_count.
  deployment_maximum_percent         = 200
  deployment_minimum_healthy_percent = 100

  # Override cluster default for this service
  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 2        # 2 on-demand for baseline
    weight            = 1
  }

  capacity_provider_strategy {
    capacity_provider = "FARGATE_SPOT"
    weight            = 3        # 3 Spot for every 1 on-demand beyond the base
  }

  network_configuration {
    subnets         = var.private_subnet_ids
    security_groups = [aws_security_group.app.id]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.app.arn
    container_name   = "app"
    container_port   = 8080
  }

  # Roll back failed deployments
  deployment_circuit_breaker {
    enable   = true
    rollback = true
  }

  # The service's DesiredCount is also managed by Application Auto Scaling
  # (see the aws_appautoscaling_target for this service). Without this,
  # every `terraform apply` would reset the count back to 5.
  lifecycle {
    ignore_changes = [desired_count]
  }
}

Handling Interruptions

Graceful Shutdown in Your App

# app.py - Flask example with graceful shutdown
import signal
import sys
from flask import Flask
import threading
import time

app = Flask(__name__)

# Set once a shutdown signal has been received; read by the health check.
shutdown_event = threading.Event()

# Number of requests currently being processed, guarded by request_lock.
active_requests = 0
request_lock = threading.Lock()


@app.before_request
def track_request_start():
    """Count a request as in flight before its view function runs."""
    global active_requests
    with request_lock:
        active_requests += 1


@app.teardown_request
def track_request_end(exc=None):
    """Mark a request as finished when its request context is torn down.

    Registered with ``teardown_request`` rather than ``after_request``:
    ``after_request`` callbacks may be skipped when a view raises an
    unhandled exception, which would leave the counter permanently too high
    and make every shutdown wait for the full drain timeout.

    Args:
        exc: The unhandled exception that ended the request, or ``None``.
            Unused; the counter is decremented either way.
    """
    global active_requests
    with request_lock:
        active_requests -= 1

# Maximum time to wait for in-flight requests. Must stay below the
# container's stopTimeout: once that expires ECS sends SIGKILL, so a drain
# window equal to stopTimeout would race the forced kill.
DRAIN_TIMEOUT_SECONDS = 25


def graceful_shutdown(signum, frame):
    """Drain in-flight requests and exit when SIGTERM or SIGINT arrives.

    Sets ``shutdown_event`` so the health check starts returning 503, waits
    up to ``DRAIN_TIMEOUT_SECONDS`` for ``active_requests`` to reach zero,
    then exits the process with status 0.

    Args:
        signum: Number of the signal that triggered the handler.
        frame: Stack frame that was interrupted (unused).
    """
    # A repeated signal must not start a second, nested drain loop.
    if shutdown_event.is_set():
        return

    # flush=True so the messages reach the container log even if the
    # process is killed before stdout's buffer is flushed.
    print("Received shutdown signal, draining...", flush=True)
    shutdown_event.set()

    # The counter is read without request_lock on purpose: this handler runs
    # on the main thread, and taking a non-reentrant lock that the
    # interrupted code might already hold could deadlock.
    # Monotonic clock: the deadline is unaffected by wall-clock adjustments.
    deadline = time.monotonic() + DRAIN_TIMEOUT_SECONDS
    while active_requests > 0 and time.monotonic() < deadline:
        print(f"Waiting for {active_requests} requests to complete...", flush=True)
        time.sleep(1)

    print("Shutdown complete", flush=True)
    sys.exit(0)


signal.signal(signal.SIGTERM, graceful_shutdown)
signal.signal(signal.SIGINT, graceful_shutdown)

@app.route('/health')
def health():
    """Return 503 once shutdown has started, 200 otherwise."""
    draining = shutdown_event.is_set()
    return ('shutting down', 503) if draining else ('ok', 200)


@app.route('/process')
def process():
    """Stand-in endpoint that simulates two seconds of work."""
    # Your business logic
    work_seconds = 2
    time.sleep(work_seconds)
    return 'done'


if __name__ == '__main__':
    # Bind to all interfaces so the container port mapping can reach the app.
    app.run(port=8080, host='0.0.0.0')

Node.js Graceful Shutdown

// server.js
const express = require('express');
const app = express();

// Shutdown state shared by the health check and the signal handlers.
let isShuttingDown = false;
let activeConnections = new Set();

function onListening() {
  console.log('Server running on port 8080');
}

const server = app.listen(8080, onListening);

// Track connections
// Every open socket is remembered so it can be destroyed if draining times out.
function trackConnection(socket) {
  activeConnections.add(socket);
  socket.on('close', () => {
    activeConnections.delete(socket);
  });
}

server.on('connection', trackConnection);

// Health check returns 503 during shutdown
app.get('/health', (req, res) => {
  if (isShuttingDown) {
    return res.status(503).send('shutting down');
  }
  res.send('ok');
});

// doWork() is a placeholder for the application's own async business logic.
app.get('/process', async (req, res, next) => {
  try {
    await doWork();
    res.send('done');
  } catch (err) {
    // Express 4 does not catch rejections from async handlers: without this
    // the request would hang and the rejection would go unhandled. Forward
    // the error to Express's error-handling middleware instead.
    next(err);
  }
});

// Maximum time to wait for in-flight requests before forcing an exit.
// Must stay below the container's stopTimeout: once that expires ECS sends
// SIGKILL, so a window equal to stopTimeout would race the forced kill.
const DRAIN_TIMEOUT_MS = 25000;

// Graceful shutdown handler
// Stops accepting connections, lets in-flight requests finish, and exits 0.
// If draining takes longer than DRAIN_TIMEOUT_MS, remaining sockets are
// destroyed and the process exits 1.
function shutdown(signal) {
  // A second signal (e.g. SIGINT after SIGTERM) must not restart the drain.
  if (isShuttingDown) {
    return;
  }

  console.log(`Received ${signal}, starting graceful shutdown`);
  isShuttingDown = true;

  // Stop accepting new connections
  server.close(() => {
    console.log('Server closed');
    process.exit(0);
  });

  // server.close() only completes once every connection has ended. On
  // Node < 19 idle keep-alive sockets (e.g. held open by a load balancer)
  // are not closed automatically, so close them explicitly where the API
  // exists (Node >= 18.2).
  if (typeof server.closeIdleConnections === 'function') {
    server.closeIdleConnections();
  }

  // Force close if draining does not finish in time
  setTimeout(() => {
    console.error('Forcing shutdown');
    activeConnections.forEach((conn) => conn.destroy());
    process.exit(1);
  }, DRAIN_TIMEOUT_MS);
}

process.on('SIGTERM', () => shutdown('SIGTERM'));
process.on('SIGINT', () => shutdown('SIGINT'));

ECS Task Definition for Spot

# Fargate task definition for the "app" container (0.5 vCPU / 1 GB).
resource "aws_ecs_task_definition" "app" {
  family                   = "app"
  network_mode             = "awsvpc"
  requires_compatibilities = ["FARGATE"]
  cpu                      = 512
  memory                   = 1024
  execution_role_arn       = aws_iam_role.execution.arn
  task_role_arn            = aws_iam_role.task.arn

  container_definitions = jsonencode([
    {
      name  = "app"
      image = "${var.ecr_repo}:${var.image_tag}"

      portMappings = [
        {
          containerPort = 8080
          protocol      = "tcp"
        }
      ]

      # Graceful shutdown config
      # Seconds between SIGTERM and SIGKILL. 120 is the Fargate maximum and
      # matches the 2-minute Spot interruption warning, so the application's
      # own drain timeout always finishes before the forced kill.
      stopTimeout = 120

      # NOTE: this check requires curl to be present in the container image.
      healthCheck = {
        command     = ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]
        interval    = 10
        timeout     = 5
        retries     = 3
        startPeriod = 30
      }

      logConfiguration = {
        logDriver = "awslogs"
        options = {
          awslogs-group         = aws_cloudwatch_log_group.app.name
          awslogs-region        = var.region
          awslogs-stream-prefix = "app"
        }
      }
    }
  ])
}

Capacity Provider Strategies

Strategy 1: Cost Optimized (Batch Jobs)

# 100% Spot for non-critical batch processing
# FARGATE_SPOT is the only provider listed, so every task is placed on Spot.
capacity_provider_strategy {
  capacity_provider = "FARGATE_SPOT"
  weight            = 1
  base              = 0  # No guaranteed minimum number of tasks
}

Strategy 2: Balanced (Web APIs)

# 70% Spot with on-demand baseline
# The first 2 tasks (base) always run on-demand; tasks beyond the base are
# split 3:7 between on-demand and Spot.
capacity_provider_strategy {
  capacity_provider = "FARGATE"
  base              = 2
  weight            = 3
}

capacity_provider_strategy {
  capacity_provider = "FARGATE_SPOT"
  weight            = 7
}

Strategy 3: High Availability (Critical Services)

# Majority on-demand with some Spot savings
# The first 3 tasks (base) always run on-demand; tasks beyond the base are
# split 3:1 between on-demand and Spot (about 25% Spot).
capacity_provider_strategy {
  capacity_provider = "FARGATE"
  base              = 3
  weight            = 3
}

capacity_provider_strategy {
  capacity_provider = "FARGATE_SPOT"
  weight            = 1
}

Monitoring Spot Interruptions

# CloudWatch alarm for Spot interruptions
# ECS and Container Insights do not publish a Spot interruption metric, so
# this alarm watches the custom metric emitted by the tracking Lambda
# (namespace "Custom/ECS", metric "SpotInterruptionsPerHour", dimension
# "Cluster"). Assumes that Lambda runs on a schedule at least once per hour.
resource "aws_cloudwatch_metric_alarm" "spot_interruptions" {
  alarm_name          = "fargate-spot-interruptions"
  comparison_operator = "GreaterThanThreshold"
  evaluation_periods  = 1
  metric_name         = "SpotInterruptionsPerHour"
  namespace           = "Custom/ECS"
  period              = 3600

  # Each datapoint is already an hourly count, so take the maximum rather
  # than summing overlapping datapoints.
  statistic           = "Maximum"
  threshold           = 5
  alarm_description   = "High Spot interruption rate"

  # A period without datapoints means the Lambda did not report, not that
  # interruptions occurred.
  treat_missing_data  = "notBreaching"

  dimensions = {
    Cluster = aws_ecs_cluster.main.name
  }

  alarm_actions = [aws_sns_topic.alerts.arn]
}

# Lambda to track interruption patterns
import boto3
from datetime import datetime, timedelta

cloudwatch = boto3.client('cloudwatch')
ecs = boto3.client('ecs')

# Cluster whose stopped tasks are inspected and reported on.
CLUSTER_NAME = 'production'

# describe_tasks accepts at most 100 task ARNs per call.
DESCRIBE_BATCH_SIZE = 100


def handler(event, context):
    """Publish the number of Spot-interrupted tasks from the last hour.

    Lists every stopped task in ``CLUSTER_NAME``, counts those whose
    ``stopCode`` is ``SpotInterruption`` and that stopped within the last
    hour, and writes the count to CloudWatch as
    ``Custom/ECS`` / ``SpotInterruptionsPerHour``.

    Args:
        event: Lambda invocation event (unused).
        context: Lambda context object (unused).
    """
    # Paginate: a single list_tasks call returns at most 100 ARNs.
    task_arns = []
    paginator = ecs.get_paginator('list_tasks')
    for page in paginator.paginate(cluster=CLUSTER_NAME, desiredStatus='STOPPED'):
        task_arns.extend(page['taskArns'])

    spot_interruptions = 0
    for start in range(0, len(task_arns), DESCRIBE_BATCH_SIZE):
        batch = task_arns[start:start + DESCRIBE_BATCH_SIZE]
        described = ecs.describe_tasks(cluster=CLUSTER_NAME, tasks=batch)

        # Iterate over what was actually returned: a task can disappear
        # between list_tasks and describe_tasks, in which case it is absent
        # from 'tasks' instead of raising.
        for task in described['tasks']:
            if task.get('stopCode') != 'SpotInterruption':
                continue

            # Enforce the one-hour window explicitly instead of relying on
            # how long ECS keeps stopped tasks visible.
            stopped_at = task.get('stoppedAt')
            if stopped_at is not None:
                cutoff = datetime.now(stopped_at.tzinfo) - timedelta(hours=1)
                if stopped_at < cutoff:
                    continue

            spot_interruptions += 1

    cloudwatch.put_metric_data(
        Namespace='Custom/ECS',
        MetricData=[{
            'MetricName': 'SpotInterruptionsPerHour',
            'Value': spot_interruptions,
            'Unit': 'Count',
            'Dimensions': [
                {'Name': 'Cluster', 'Value': CLUSTER_NAME}
            ]
        }]
    )

Best Practices

1. Over-provision During Interruptions

# Registers the service's desired task count as a scalable target, bounded
# to 5-20 tasks. This only declares the bounds; a scaling policy (not shown
# here) is what actually changes the count.
resource "aws_appautoscaling_target" "ecs" {
  max_capacity       = 20  # Higher max to handle Spot volatility
  min_capacity       = 5
  resource_id        = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.app.name}"
  scalable_dimension = "ecs:service:DesiredCount"
  service_namespace  = "ecs"
}

2. Use Multiple Availability Zones

# Excerpt of the service definition showing only the subnet selection.
resource "aws_ecs_service" "app" {
  # ... other config ...

  network_configuration {
    # One private subnet per Availability Zone so tasks can be placed in
    # any of them.
    subnets = [
      aws_subnet.private_a.id,
      aws_subnet.private_b.id,
      aws_subnet.private_c.id,  # All 3 AZs for Spot diversity
    ]
    security_groups = [aws_security_group.app.id]
  }
}

3. Architect for Interruption

# Store state externally, not in-memory
# Use ElastiCache or DynamoDB for session state
# Use SQS for work queues with visibility timeout

# docker-compose.yml for local dev
# Runs the app next to a local Redis so state lives outside the app process.
services:
  app:
    build: .
    environment:
      # Points at the "redis" service defined below.
      - REDIS_URL=redis://redis:6379
      # Substituted by Compose from the shell environment or a .env file.
      - SQS_QUEUE_URL=${SQS_QUEUE_URL}
    depends_on:
      - redis
  
  redis:
    image: redis:alpine

Cost Calculator

# Calculate Spot savings
# 1 vCPU, 2GB memory, running 24/7

# On-demand Fargate:
# vCPU: 0.04048 * 24 * 30 = $29.15/month
# Memory: 0.004445 * 2 * 24 * 30 = $6.40/month
# Total: $35.55/task/month

# Fargate Spot (70% discount):
# vCPU: 0.01214 * 24 * 30 = $8.74/month
# Memory: 0.001334 * 2 * 24 * 30 = $1.92/month
# Total: $10.66/task/month

# Savings: $24.89/task/month (70%)

Key Takeaways

  1. Fargate Spot saves up to 70%, but you must handle 2-minute interruption warnings
  2. Use capacity provider strategies to mix Spot and on-demand based on criticality
  3. Always have an on-demand baseline for production services (base parameter)
  4. Implement graceful shutdown — SIGTERM handler that drains connections
  5. Health checks should fail fast during shutdown to stop new traffic
  6. Spread across AZs for Spot capacity diversity
  7. Externalize state — Spot tasks can die anytime, don’t rely on local storage

“Fargate Spot is free money for stateless workloads. If your app can handle a restart, there’s no reason not to use it.”