Fargate Spot: Cut Costs Without Managing Servers
Save up to 70% on container workloads with Fargate Spot. Learn capacity strategies, interruption handling, and best practices for production use.
Fargate Spot lets you run containers at up to 70% discount compared to on-demand pricing. Unlike EC2 Spot, you don’t manage instances — AWS handles the infrastructure while you handle interruptions gracefully. This guide covers real-world patterns for production Fargate Spot.
How Fargate Spot Works
Regular Fargate: $0.04048/vCPU/hour + $0.004445/GB/hour
Fargate Spot: $0.01214/vCPU/hour + $0.001334/GB/hour (70% discount)
Interruption: 2-minute warning — ECS sends SIGTERM, then SIGKILL after the task's stopTimeout elapses
ECS with Fargate Spot
# Cluster with capacity providers
resource "aws_ecs_cluster" "main" {
name = "production"
}
resource "aws_ecs_cluster_capacity_providers" "main" {
cluster_name = aws_ecs_cluster.main.name
capacity_providers = ["FARGATE", "FARGATE_SPOT"]
# Default: 80% Spot, 20% on-demand baseline
default_capacity_provider_strategy {
base = 1 # At least 1 on-demand task
weight = 1
capacity_provider = "FARGATE"
}
default_capacity_provider_strategy {
weight = 4 # 4x weight = 80% Spot
capacity_provider = "FARGATE_SPOT"
}
}
Service Configuration
resource "aws_ecs_service" "app" {
  name            = "app"
  cluster         = aws_ecs_cluster.main.id
  task_definition = aws_ecs_task_definition.app.arn
  desired_count   = 5

  # Override the cluster default for this service: keep 2 on-demand tasks as
  # a guaranteed baseline, then place 3 Spot tasks for every 1 on-demand task
  # above the baseline.
  capacity_provider_strategy {
    capacity_provider = "FARGATE"
    base              = 2 # 2 on-demand for baseline
    weight            = 1
  }
  capacity_provider_strategy {
    capacity_provider = "FARGATE_SPOT"
    weight            = 3 # 3 Spot for every 1 on-demand
  }

  network_configuration {
    subnets         = var.private_subnet_ids
    security_groups = [aws_security_group.app.id]
  }

  load_balancer {
    target_group_arn = aws_lb_target_group.app.arn
    container_name   = "app"
    container_port   = 8080
  }

  # Rolling-deployment limits, which also give headroom for replacing
  # interrupted Spot tasks. NOTE: in the Terraform AWS provider these are
  # top-level arguments on aws_ecs_service — a nested
  # `deployment_configuration { maximum_percent … }` block is the
  # CloudFormation/ECS API shape and fails to plan.
  deployment_maximum_percent         = 200 # allow up to 2x desired_count during replacement
  deployment_minimum_healthy_percent = 100 # never drop below desired_count

  # Roll back failed deployments automatically.
  deployment_circuit_breaker {
    enable   = true
    rollback = true
  }
}
Handling Interruptions
Graceful Shutdown in Your App
# app.py - Flask example with graceful shutdown
import signal
import sys
from flask import Flask
import threading
import time

app = Flask(__name__)

# Set when SIGTERM/SIGINT arrives; /health starts returning 503 so the load
# balancer stops routing new traffic to this task.
shutdown_event = threading.Event()

# Count of in-flight requests, guarded by request_lock.
active_requests = 0
request_lock = threading.Lock()


@app.before_request
def track_request_start():
    """Increment the in-flight request counter before each request."""
    global active_requests
    with request_lock:
        active_requests += 1


@app.after_request
def track_request_end(response):
    """Decrement the in-flight request counter and pass the response through."""
    global active_requests
    with request_lock:
        active_requests -= 1
    return response


def _active_count():
    """Read the in-flight counter under its lock (avoids an unsynchronized read)."""
    with request_lock:
        return active_requests


def graceful_shutdown(signum, frame):
    """SIGTERM/SIGINT handler: fail health checks, drain in-flight work, exit.

    Fargate Spot delivers SIGTERM with a 2-minute warning; we drain for at
    most 30s (matching the task's stopTimeout) before exiting.
    """
    # flush=True so the message is not lost in a block-buffered stdout
    # when the process exits.
    print("Received shutdown signal, draining...", flush=True)
    shutdown_event.set()
    # Wait for active requests to complete (max 30s)
    deadline = time.time() + 30
    while _active_count() > 0 and time.time() < deadline:
        print(f"Waiting for {_active_count()} requests to complete...", flush=True)
        time.sleep(1)
    print("Shutdown complete", flush=True)
    sys.exit(0)


signal.signal(signal.SIGTERM, graceful_shutdown)
signal.signal(signal.SIGINT, graceful_shutdown)


@app.route('/health')
def health():
    """Health probe: 503 while draining so the LB stops sending traffic."""
    if shutdown_event.is_set():
        return 'shutting down', 503
    return 'ok', 200


@app.route('/process')
def process():
    # Your business logic
    time.sleep(2)
    return 'done'


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
Node.js Graceful Shutdown
// server.js
const express = require('express');
const app = express();

// Flipped when a shutdown signal arrives; health checks fail from then on.
let isShuttingDown = false;
// Every open socket, so stragglers (e.g. keep-alive) can be force-closed.
let activeConnections = new Set();

const server = app.listen(8080, () => {
  console.log('Server running on port 8080');
});

// Remember each socket until it closes on its own.
server.on('connection', (socket) => {
  activeConnections.add(socket);
  socket.on('close', () => {
    activeConnections.delete(socket);
  });
});

// Load balancer probe: 503 while draining, 200 otherwise.
app.get('/health', (req, res) => {
  if (!isShuttingDown) {
    return res.send('ok');
  }
  res.status(503).send('shutting down');
});

app.get('/process', async (req, res) => {
  await doWork();
  res.send('done');
});

// On SIGTERM/SIGINT: stop accepting new connections, let in-flight work
// finish, and force-exit after 30s if sockets are still open.
function shutdown(signal) {
  console.log(`Received ${signal}, starting graceful shutdown`);
  isShuttingDown = true;

  // Exits cleanly once all tracked connections have closed.
  server.close(() => {
    console.log('Server closed');
    process.exit(0);
  });

  // Hard deadline: destroy whatever is still open and exit non-zero.
  setTimeout(() => {
    console.error('Forcing shutdown');
    for (const socket of activeConnections) {
      socket.destroy();
    }
    process.exit(1);
  }, 30000);
}

['SIGTERM', 'SIGINT'].forEach((sig) => process.on(sig, () => shutdown(sig)));
ECS Task Definition for Spot
resource "aws_ecs_task_definition" "app" {
family = "app"
network_mode = "awsvpc"
requires_compatibilities = ["FARGATE"]
cpu = 512
memory = 1024
execution_role_arn = aws_iam_role.execution.arn
task_role_arn = aws_iam_role.task.arn
container_definitions = jsonencode([
{
name = "app"
image = "${var.ecr_repo}:${var.image_tag}"
portMappings = [
{
containerPort = 8080
protocol = "tcp"
}
]
# Graceful shutdown config
# stopTimeout: seconds between SIGTERM and SIGKILL when the task stops —
# keep it at least as long as the app's drain window.
# NOTE(review): Fargate caps stopTimeout (120s per AWS docs) — confirm.
stopTimeout = 30 # Give container 30s to drain
# Container-level health check; 3 consecutive failures mark the
# container unhealthy.
# NOTE(review): this shells out to curl — confirm the image bundles curl.
healthCheck = {
command = ["CMD-SHELL", "curl -f http://localhost:8080/health || exit 1"]
interval = 10
timeout = 5
retries = 3
startPeriod = 30
}
logConfiguration = {
logDriver = "awslogs"
options = {
awslogs-group = aws_cloudwatch_log_group.app.name
awslogs-region = var.region
awslogs-stream-prefix = "app"
}
}
}
])
}
Capacity Provider Strategies
Strategy 1: Cost Optimized (Batch Jobs)
# 100% Spot for non-critical batch processing
capacity_provider_strategy {
capacity_provider = "FARGATE_SPOT"
weight = 1
base = 0
}
Strategy 2: Balanced (Web APIs)
# 70% Spot with on-demand baseline
capacity_provider_strategy {
capacity_provider = "FARGATE"
base = 2
weight = 3
}
capacity_provider_strategy {
capacity_provider = "FARGATE_SPOT"
weight = 7
}
Strategy 3: High Availability (Critical Services)
# Majority on-demand with some Spot savings
capacity_provider_strategy {
capacity_provider = "FARGATE"
base = 3
weight = 3
}
capacity_provider_strategy {
capacity_provider = "FARGATE_SPOT"
weight = 1
}
Monitoring Spot Interruptions
# CloudWatch alarm for Spot interruptions
resource "aws_cloudwatch_metric_alarm" "spot_interruptions" {
alarm_name = "fargate-spot-interruptions"
comparison_operator = "GreaterThanThreshold"
evaluation_periods = 1
metric_name = "SpotInterruptionCount"
namespace = "ECS/ContainerInsights"
period = 300
statistic = "Sum"
threshold = 5
alarm_description = "High Spot interruption rate"
dimensions = {
ClusterName = aws_ecs_cluster.main.name
ServiceName = aws_ecs_service.app.name
}
alarm_actions = [aws_sns_topic.alerts.arn]
}
# Lambda to track interruption patterns
import boto3
from datetime import datetime, timedelta
cloudwatch = boto3.client('cloudwatch')
ecs = boto3.client('ecs')
def handler(event, context):
# Get stopped tasks in last hour
response = ecs.list_tasks(
cluster='production',
desiredStatus='STOPPED',
maxResults=100
)
spot_interruptions = 0
for task_arn in response['taskArns']:
task = ecs.describe_tasks(cluster='production', tasks=[task_arn])
if task['tasks'][0].get('stopCode') == 'SpotInterruption':
spot_interruptions += 1
cloudwatch.put_metric_data(
Namespace='Custom/ECS',
MetricData=[{
'MetricName': 'SpotInterruptionsPerHour',
'Value': spot_interruptions,
'Unit': 'Count',
'Dimensions': [
{'Name': 'Cluster', 'Value': 'production'}
]
}]
)
Best Practices
1. Over-provision During Interruptions
# Service auto scaling target. A generous max_capacity leaves headroom for
# the scheduler to replace Spot tasks quickly after an interruption wave.
resource "aws_appautoscaling_target" "ecs" {
max_capacity = 20 # Higher max to handle Spot volatility
min_capacity = 5
resource_id = "service/${aws_ecs_cluster.main.name}/${aws_ecs_service.app.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"
}
2. Use Multiple Availability Zones
# Spread tasks across all private subnets/AZs — Spot capacity is pooled per
# AZ, so more AZs shrinks the blast radius when one pool runs dry.
resource "aws_ecs_service" "app" {
# ... other config ...
network_configuration {
subnets = [
aws_subnet.private_a.id,
aws_subnet.private_b.id,
aws_subnet.private_c.id, # All 3 AZs for Spot diversity
]
security_groups = [aws_security_group.app.id]
}
}
3. Architect for Interruption
# Store state externally, not in-memory
# Use ElastiCache or DynamoDB for session state
# Use SQS for work queues with visibility timeout
# docker-compose.yml for local dev
# State lives in external services (Redis locally, ElastiCache/SQS in AWS)
# so an interrupted Spot task loses nothing held in memory.
# Note: the flattened, unindented form in the original is not valid YAML;
# compose files require this nesting.
services:
  app:
    build: .
    environment:
      - REDIS_URL=redis://redis:6379
      - SQS_QUEUE_URL=${SQS_QUEUE_URL}
    depends_on:
      - redis
  redis:
    image: redis:alpine
Cost Calculator
# Calculate Spot savings
# 1 vCPU, 2GB memory, running 24/7
# On-demand Fargate:
# vCPU: 0.04048 * 24 * 30 = $29.15/month
# Memory: 0.004445 * 2 * 24 * 30 = $6.40/month
# Total: $35.55/task/month
# Fargate Spot (70% discount):
# vCPU: 0.01214 * 24 * 30 = $8.74/month
# Memory: 0.001334 * 2 * 24 * 30 = $1.92/month
# Total: $10.66/task/month
# Savings: $24.89/task/month (70%)
Key Takeaways
- Fargate Spot saves up to 70% but you must handle 2-minute interruption warnings
- Use capacity provider strategies to mix Spot and on-demand based on criticality
- Always have an on-demand baseline for production services (base parameter)
- Implement graceful shutdown — SIGTERM handler that drains connections
- Health checks should fail fast during shutdown to stop new traffic
- Spread across AZs for Spot capacity diversity
- Externalize state — Spot tasks can die anytime, don’t rely on local storage
“Fargate Spot is free money for stateless workloads. If your app can handle a restart, there’s no reason not to use it.”