Kubernetes 漸進式發布
DevOps · 9 個節點 · 10 條連接 · devops
視覺化
ex-k8s-rollout.osop.yaml
# OSOP workflow: progressive (canary) rollout of a Kubernetes application.
# Manifests are validated and applied, then traffic is shifted in stages,
# with a Prometheus health check gating each stage.
osop_version: '2.0'
id: k8s-rollout
name: 'Kubernetes 漸進式發布'
# Workflow steps; the `edges` section wires them together.
nodes:
# Step 1: check the manifests under k8s/ with kubeconform before anything
# is applied to the cluster.
- id: validate_manifests
  type: cli
  purpose: Validate Kubernetes manifests against cluster API schemas using kubeconform
  runtime:
    command: 'kubeconform -strict -summary -output json k8s/'
  outputs:
    - validation_result
  # Step timeout, in seconds.
  timeout_sec: 30
# Step 2: apply everything under k8s/ to the target namespace.
- id: apply_manifests
  type: cli
  purpose: Apply Kubernetes manifests with server-side apply for conflict resolution
  runtime:
    # --force-conflicts makes server-side apply push its changes through
    # even when another field manager owns the conflicting fields.
    command: 'kubectl apply --server-side --force-conflicts -f k8s/ -n ${NAMESPACE}'
  inputs:
    - validation_result
  outputs:
    - applied_resources
  security:
    # Credential this step declares it needs.
    credentials:
      - KUBECONFIG
  timeout_sec: 120
# Step 3: shift 10% of traffic to the new version by patching the canary
# steps of the `app` Rollout.
- id: canary_10
  type: cli
  purpose: Route 10% of traffic to new version and monitor error rates
  runtime:
    # The steps list must end with an indefinite pause. With only
    # `setWeight: 10` there is nothing to wait on, so the rollout would run
    # past its last step instead of holding at 10% for health_check_10.
    # A JSON merge patch replaces the whole `steps` list.
    command: |
      kubectl patch rollout app -n ${NAMESPACE} \
        --type merge -p '{"spec":{"strategy":{"canary":{"steps":[{"setWeight":10},{"pause":{}}]}}}}'
  inputs:
    - applied_resources
  outputs:
    - canary_10_status
  timeout_sec: 300
  # Describes what the workflow edges do on a failed check (route to
  # `rollback`) rather than an automatic pause that nothing here implements.
  explain: |
    Initial canary phase. The rollout holds at 10% while the health check runs.
    If the error rate reaches the 1% threshold, the workflow aborts and rolls
    back the rollout.
# Step 4: read canary metrics from Prometheus while 10% of traffic is on
# the new version. The outputs feed the conditional edges that choose
# between canary_50 and rollback.
- id: health_check_10
  type: api
  purpose: Query Prometheus for error rate and latency metrics at 10% canary weight
  runtime:
    # NOTE(review): no PromQL query or time range is given here; presumably
    # supplied by the runner or elsewhere - confirm.
    url: 'https://prometheus.internal'
    endpoint: '/api/v1/query_range'
    method: POST
  inputs:
    - canary_10_status
  outputs:
    - error_rate_10
    - p99_latency_10
  retry_policy:
    max_retries: 3
    backoff_sec: 60
  timeout_sec: 600
  # NOTE(review): nothing visible in this node configures the 5-minute wait
  # mentioned below - confirm where it comes from.
  explain: 'Waits 5 minutes, then queries error rate and p99 latency.'
# Step 5: raise the canary to 50% of traffic once the 10% health check
# has passed.
- id: canary_50
  type: cli
  purpose: Promote canary to 50% traffic weight after passing health checks
  runtime:
    # `kubectl argo rollouts set` only provides `set image`; there is no
    # `set weight` subcommand. The weight is therefore changed by patching
    # the canary steps, the same mechanism canary_10 uses. The trailing
    # pause holds the rollout at 50% until full_promote runs
    # `promote --full`.
    command: |
      kubectl patch rollout app -n ${NAMESPACE} \
        --type merge -p '{"spec":{"strategy":{"canary":{"steps":[{"setWeight":50},{"pause":{}}]}}}}'
  inputs:
    - error_rate_10
  outputs:
    - canary_50_status
  timeout_sec: 300
# Step 6: read canary metrics from Prometheus while 50% of traffic is on
# the new version. The outputs feed the conditional edges that choose
# between full_promote and rollback.
- id: health_check_50
  type: api
  purpose: Query Prometheus for error rate and latency metrics at 50% canary weight
  runtime:
    # NOTE(review): no PromQL query or time range is given here; presumably
    # supplied by the runner or elsewhere - confirm.
    url: 'https://prometheus.internal'
    endpoint: '/api/v1/query_range'
    method: POST
  inputs:
    - canary_50_status
  outputs:
    - error_rate_50
    - p99_latency_50
  retry_policy:
    max_retries: 3
    backoff_sec: 60
  timeout_sec: 600
# Step 7: finish the rollout by promoting the new version to all traffic.
- id: full_promote
  type: cli
  purpose: Promote canary to 100% and complete the rollout
  runtime:
    command: 'kubectl argo rollouts promote --full app -n ${NAMESPACE}'
  inputs:
    - error_rate_50
  outputs:
    - rollout_complete
  timeout_sec: 300
# Failure path: reached from either health check when its thresholds are
# not met.
- id: rollback
  type: cli
  purpose: Abort the rollout and restore previous stable version
  runtime:
    # Folded scalar: the two lines are joined with a single space, giving
    # one `abort && undo` command line.
    command: >-
      kubectl argo rollouts abort app -n ${NAMESPACE} &&
      kubectl argo rollouts undo app -n ${NAMESPACE}
  outputs:
    - rollback_status
  timeout_sec: 120
# Final step: post the outcome of the rollout to Slack.
# NOTE(review): the purpose also mentions PagerDuty, but only a Slack
# endpoint is configured here - confirm whether a PagerDuty call is missing.
- id: notify_status
  type: api
  purpose: Send rollout result to Slack and PagerDuty
  runtime:
    endpoint: /api/chat.postMessage
    method: POST
    url: https://slack.com
  # Both full_promote and rollback have an edge into this node, so the
  # outcome of either path is declared as an input. Previously only
  # rollout_complete was listed, leaving the rollback result unavailable.
  inputs:
    - rollout_complete
    - rollback_status
  security:
    auth: bearer_token
    secret_ref: SLACK_BOT_TOKEN
# Control flow between the nodes. Each health check has two conditional
# edges with complementary conditions: continue when both metrics are under
# their thresholds, otherwise roll back.
edges:
# Validate, apply, then start the 10% canary.
- from: validate_manifests
  to: apply_manifests
  mode: sequential
- from: apply_manifests
  to: canary_10
  mode: sequential
- from: canary_10
  to: health_check_10
  mode: sequential

# Gate after the 10% canary.
- from: health_check_10
  to: canary_50
  mode: conditional
  condition: 'error_rate_10 < 0.01 && p99_latency_10 < 500'
- from: health_check_10
  to: rollback
  mode: conditional
  condition: 'error_rate_10 >= 0.01 || p99_latency_10 >= 500'
  label: 'Health check failed at 10%'

# Gate after the 50% canary.
- from: canary_50
  to: health_check_50
  mode: sequential
- from: health_check_50
  to: full_promote
  mode: conditional
  condition: 'error_rate_50 < 0.01 && p99_latency_50 < 500'
- from: health_check_50
  to: rollback
  mode: conditional
  condition: 'error_rate_50 >= 0.01 || p99_latency_50 >= 500'
  label: 'Health check failed at 50%'

# Both the success path and the failure path end with a notification.
- from: full_promote
  to: notify_status
  mode: sequential
- from: rollback
  to: notify_status
  mode: sequential