Checkpoint Workflow Builder
Design and implement fault-tolerant workflows that can resume from any point of failure.
Overview
Complex workflows often fail mid-execution due to:
- Network timeouts
- System crashes
- Resource exhaustion
- External service failures
- User interruptions
This skill teaches you to build workflows that:
- Save progress at key checkpoints
- Resume from last successful state
- Handle partial failures gracefully
- Provide clear progress visibility
- Enable manual intervention points
When to Use
Use this skill when:
- Building multi-phase data pipelines
- Implementing long-running migration scripts
- Creating deployment workflows
- Processing large batches of items
- Orchestrating multi-system operations
- Building ETL (Extract, Transform, Load) workflows
- Implementing saga patterns for distributed systems
- Creating user-facing wizards with save/resume
Core Concepts
State Machine Pattern
 ┌─────────┐
 │  INIT   │
 └────┬────┘
      │
      ▼
┌────────────┐
│  DOWNLOAD  │
└─────┬──────┘
      │
      ▼
┌────────────┐
│  PROCESS   │
└─────┬──────┘
      │
      ▼
┌────────────┐
│  VALIDATE  │
└─────┬──────┘
      │
      ▼
┌────────────┐
│  FINALIZE  │
└─────┬──────┘
      │
      ▼
┌────────────┐
│  COMPLETE  │
└────────────┘
Each state:
- Has clear entry conditions
- Performs specific operations
- Saves a checkpoint after its work succeeds, before transitioning to the next state
- Can be resumed independently
Basic Implementation
Simple State Machine
#!/bin/bash
# state-machine.sh - Basic resumable workflow
#
# Runs exactly one phase per invocation. The phase to run is read from
# $STATE_FILE (INIT when the file does not exist); after a phase succeeds the
# name of the next phase is written back, so running the script again resumes
# where the previous run stopped.
#
# Phase order:
#   INIT -> DOWNLOAD -> EXTRACT -> PROCESS -> VALIDATE -> FINALIZE -> COMPLETE
#
# Exit status: 1 when a phase fails, the state cannot be saved, or the state
# file holds an unknown state; 0 otherwise.
STATE_FILE=".workflow_state"

# Persist the state the next invocation should start from.
# The value is written to a sibling temp file and renamed over $STATE_FILE so
# an interrupted run never leaves a truncated checkpoint behind.
# Arguments: $1 - name of the next state
save_state() {
  if ! printf '%s\n' "$1" > "${STATE_FILE}.tmp" ||
    ! mv -- "${STATE_FILE}.tmp" "$STATE_FILE"; then
    echo "✗ Could not save state '$1' to $STATE_FILE" >&2
    exit 1
  fi
}

# Read current state (default: INIT)
CURRENT_STATE=$(cat "$STATE_FILE" 2>/dev/null || echo "INIT")
echo "Current state: $CURRENT_STATE"

case "$CURRENT_STATE" in
  INIT)
    echo "=== Phase 1: Initialization ==="
    # Initialize workspace; do not advance the state if this fails.
    if ! mkdir -p workspace results; then
      echo "✗ Could not create workspace directories" >&2
      exit 1
    fi
    # Set up environment / dependencies
    echo "Setting up environment..."
    # Save next state
    save_state "DOWNLOAD"
    echo "✓ Initialization complete"
    echo "Run again to continue"
    ;;
  DOWNLOAD)
    echo "=== Phase 2: Download Data ==="
    # Download data files
    echo "Downloading data..."
    # curl -o workspace/data.zip https://example.com/data.zip
    # Verify download: only advance once the archive is actually present.
    if [ -f "workspace/data.zip" ]; then
      save_state "EXTRACT"
      echo "✓ Download complete"
      echo "Run again to continue"
    else
      echo "✗ Download failed - fix and run again" >&2
      exit 1
    fi
    ;;
  EXTRACT)
    echo "=== Phase 3: Extract Data ==="
    # Extract files
    echo "Extracting data..."
    # unzip workspace/data.zip -d workspace/
    save_state "PROCESS"
    echo "✓ Extraction complete"
    echo "Run again to continue"
    ;;
  PROCESS)
    echo "=== Phase 4: Process Data ==="
    # Process data
    echo "Processing data..."
    # ./process_data.sh workspace/ results/
    save_state "VALIDATE"
    echo "✓ Processing complete"
    echo "Run again to continue"
    ;;
  VALIDATE)
    echo "=== Phase 5: Validate Results ==="
    # Validate results
    echo "Validating results..."
    # Capture the validator's status on the line that runs it. Testing $?
    # afterwards would report the status of whichever command ran last
    # (e.g. the echo above), not of the validator.
    validation_status=0
    # ./validate.sh results/ || validation_status=$?
    if [ "$validation_status" -eq 0 ]; then
      save_state "FINALIZE"
      echo "✓ Validation passed"
      echo "Run again to finalize"
    else
      echo "✗ Validation failed" >&2
      echo "Fix issues and change state to PROCESS to reprocess" >&2
      exit 1
    fi
    ;;
  FINALIZE)
    echo "=== Phase 6: Finalize ==="
    # Cleanup and finalize
    echo "Finalizing workflow..."
    # mv results/ final/
    # rm -rf workspace/
    save_state "COMPLETE"
    echo "✓ Workflow complete!"
    ;;
  COMPLETE)
    echo "=== Workflow Already Complete ==="
    echo "Results available in: final/"
    ;;
  *)
    # Anything else (including an empty state file) needs manual attention.
    echo "✗ Unknown state: $CURRENT_STATE" >&2
    echo "Reset with: echo 'INIT' > $STATE_FILE" >&2
    exit 1
    ;;
esac
Enhanced with Progress Tracking
#!/bin/bash
# enhanced-state-machine.sh - With detailed progress
#
# File locations used by the functions below:
#   STATE_FILE    - plain-text file naming the phase to run next
#   PROGRESS_FILE - JSON document with per-phase status and timestamps
STATE_FILE='.workflow_state'
PROGRESS_FILE='.workflow_progress.json'
# Initialize progress tracking
#
# Creates (or overwrites) $PROGRESS_FILE with a fresh JSON document: the
# current state is INIT, both top-level timestamps are "now", and each of the
# five phases is marked pending with null start/completion times.
# Globals: PROGRESS_FILE (read; the file it names is written)
init_progress() {
  local began_at touched_at
  # One date call per field, taken in the order the fields appear in the file.
  began_at="$(date -Iseconds)"
  touched_at="$(date -Iseconds)"

  printf '%s\n' \
    '{' \
    '"current_state": "INIT",' \
    "\"started_at\": \"${began_at}\"," \
    "\"updated_at\": \"${touched_at}\"," \
    '"phases": {' \
    '"INIT": {"status": "pending", "started": null, "completed": null},' \
    '"DOWNLOAD": {"status": "pending", "started": null, "completed": null},' \
    '"PROCESS": {"status": "pending", "started": null, "completed": null},' \
    '"VALIDATE": {"status": "pending", "started": null, "completed": null},' \
    '"FINALIZE": {"status": "pending", "started": null, "completed": null}' \
    '}' \
    '}' > "$PROGRESS_FILE"
}
# Update progress
#
# Records a status change for one phase in the JSON progress file, creating
# the file first when it does not exist yet.
# Globals:   PROGRESS_FILE (read; the file it names is rewritten)
# Arguments: $1 - phase name, used as the key under .phases
#            $2 - new status: "running", "completed", or "failed"
# Returns:   0 on success; non-zero when the timestamp, the initial file, or
#            the jq rewrite fails. On a jq failure the existing progress file
#            is left untouched.
update_progress() {
  local state="$1"
  local status="$2" # "running", "completed", "failed"
  local tmp_file="${PROGRESS_FILE}.tmp"
  local timestamp

  # Assigned separately from `local` so a failing date is not masked.
  timestamp="$(date -Iseconds)" || return 1

  if [ ! -f "$PROGRESS_FILE" ]; then
    init_progress || return 1
  fi

  # `//=` only fills in .started when it is still null, so the first recorded
  # start time survives later updates; .completed is set only on "completed".
  if ! jq --arg state "$state" \
    --arg status "$status" \
    --arg timestamp "$timestamp" \
    '.current_state = $state |
     .updated_at = $timestamp |
     .phases[$state].status = $status |
     (.phases[$state].started //= $timestamp) |
     (if $status == "completed" then .phases[$state].completed = $timestamp else . end)' \
    "$PROGRESS_FILE" > "$tmp_file"; then
    # Never move a failed (empty or partial) result over the real file.
    rm -f -- "$tmp_file"
    echo "✗ Failed to update $PROGRESS_FILE" >&2
    return 1
  fi

  mv -- "$tmp_file" "$PROGRESS_FILE"
}
# Show progress
#
# Prints a summary of the progress file to stdout: one line per phase with
# its upper-cased status (plus the start time once one is recorded), followed
# by the current state and the last update time.
# Globals: PROGRESS_FILE (read)
# Outputs: the summary, or "No progress file found" when the file is missing
# Returns: 0 when the file is missing; otherwise the status of the last echo
show_progress() {
  if [ ! -f "$PROGRESS_FILE" ]; then
    echo "No progress file found"
    return 0
  fi

  echo "=== Workflow Progress ==="
  echo ""
  jq -r '.phases | to_entries | .[] |
    "[\(.value.status | ascii_upcase)] \(.key)" +
    (if .value.started then " (started: " + .value.started + ")" else "" end)' \
    "$PROGRESS_FILE"
  echo ""
  # The path is quoted so a location containing spaces or glob characters is
  # passed to jq as a single argument.
  echo "Current: $(jq -r '.current_state' "$PROGRESS_FILE")"
  echo "Updated: $(jq -r '.updated_at' "$PROGRESS_FILE")"
}
# Workflow implementation
#
# Runs the single phase named in $STATE_FILE (INIT when the file cannot be
# read), recording it as running and then completed, and finally writes the
# name of the next phase back to $STATE_FILE.
# Globals: STATE_FILE (read; the file it names is written)
#          CURRENT_STATE (written)
# Returns: 1 when the recorded state has no handler; otherwise the status of
#          the final state-file write.
run_workflow() {
  CURRENT_STATE=$(cat "$STATE_FILE" 2>/dev/null || echo "INIT")

  # Show progress before executing
  show_progress

  case "$CURRENT_STATE" in
    INIT)
      update_progress "INIT" "running"
      echo "Initializing..."
      # ... initialization logic ...
      update_progress "INIT" "completed"
      echo "DOWNLOAD" > "$STATE_FILE"
      ;;
    DOWNLOAD)
      update_progress "DOWNLOAD" "running"
      echo "Downloading..."
      # ... download logic ...
      update_progress "DOWNLOAD" "completed"
      echo "PROCESS" > "$STATE_FILE"
      ;;
    # ... other states ...
    *)
      # Without this arm a state that has no handler would do nothing and
      # still report success, so a stalled workflow would go unnoticed.
      echo "✗ No handler for state: $CURRENT_STATE" >&2
      echo "Reset with: echo 'INIT' > $STATE_FILE" >&2
      return 1
      ;;
  esac
}
# Entry point: execute the phase currently recorded in the state file.
run_workflow
Advanced Patterns
Pattern 1: Batched Checkpoint
#!/bin/bash
# batched-checkpoint.sh - Process items with batch checkpoints
ITEMS_FILE="items.txt"
CHECKPOINT_FILE=".batch_checkpoint"
BATCH_SIZE=10
# Load checkpoint
if [ -f "$CHECKPOINT_FILE" ]; then
LAST_COMPLETED=$(cat "$CHECKPOINT_FILE")
echo "Resuming from item $LAST_COMPLETED"
else
LAST_COMPLETED=0
fi
TOTAL_ITEMS=$(wc -l < "$ITEMS_FILE")
ITEMS_PROCESSED=0
# Process in batches
tail -n +$((LAST_COMPLETED + 1)) "$ITEMS_FILE" | while read -r item; do
# Process item
echo "Processing: $item"
process_item "$item"
ITEMS_PROCESSED=$((ITEMS_PROCESSED + 1))
# Checkpoint every batch
if [ $((ITEMS_PROCESSED % BATCH_SIZE)) -eq 0 ]; then
CURRENT_POSITION=$((LAST_COMPLETED + ITEMS_PROCESSED))
echo "$CURRENT_POSITION" > "$CHECKPOINT_FILE"
echo "Checkpoint: $CURRENT_POSITION/$TOTAL_ITEMS"
# Optional: Break for timeout management
if [ $((ITEMS_PROCESSED)) -ge $((BATCH_SIZE * 5)) ]; then
echo "Processed 5 bat