# Kubernetes 漸進式發布 (Kubernetes Progressive Rollout)
#
# Category: DevOps
# 9 nodes · 10 edges · tag: devops
# Source file: ex-k8s-rollout.osop.yaml
# Kubernetes Progressive Rollout
#
# Validates and applies the manifests, then shifts traffic to the new version
# in stages (10% -> 50% -> 100%). Each intermediate stage is gated by a
# Prometheus health check; a failed check routes to rollback, and both
# outcomes end in a notification.
osop_version: '2.0'
id: 'k8s-rollout'
name: 'Kubernetes 漸進式發布'

nodes:
  # Entry point of the workflow: checks the manifests under k8s/ with
  # kubeconform and publishes the result as `validation_result`.
  - id: validate_manifests
    type: cli
    purpose: "Validate Kubernetes manifests against cluster API schemas using kubeconform"
    runtime:
      # Results are requested as JSON (-output json) with a summary.
      command: kubeconform -strict -summary -output json k8s/
    outputs:
      - validation_result
    timeout_sec: 30

  # Applies the validated manifests from k8s/ to the target namespace.
  - id: apply_manifests
    type: cli
    purpose: "Apply Kubernetes manifests with server-side apply for conflict resolution"
    runtime:
      # --force-conflicts makes the server-side apply win field-ownership
      # conflicts instead of failing on them.
      # NOTE(review): ${NAMESPACE} is not defined in this file — presumably
      # supplied by the runner's environment; confirm.
      command: >-
        kubectl apply --server-side --force-conflicts
        -f k8s/ -n ${NAMESPACE}
    inputs:
      - validation_result
    outputs:
      - applied_resources
    security:
      credentials:
        - KUBECONFIG
    timeout_sec: 120

  # Stage 1: shift 10% of traffic to the new version and hold there.
  - id: canary_10
    type: cli
    purpose: Route 10% of traffic to new version and monitor error rates
    runtime:
      # The merge patch replaces the Rollout's canary step list. The trailing
      # indefinite pause ({"pause":{}}) keeps the rollout at 10% until a later
      # node acts on it; with `setWeight: 10` as the only step, the step list
      # finishes immediately and Argo Rollouts goes on to promote the new
      # version fully before health_check_10 has evaluated anything.
      command: |
        kubectl patch rollout app -n ${NAMESPACE} \
          --type merge -p '{"spec":{"strategy":{"canary":{"steps":[{"setWeight":10},{"pause":{}}]}}}}'
    inputs: [applied_resources]
    outputs: [canary_10_status]
    security:
      # Same credential apply_manifests declares for its kubectl call.
      credentials: [KUBECONFIG]
    timeout_sec: 300
    explain: |
      Initial canary phase. The rollout holds at 10% while health_check_10
      evaluates it; if the error rate reaches the 1% threshold or p99 latency
      reaches its limit, the workflow routes to rollback.

  # Health gate for the 10% stage: its two outputs are what the conditional
  # edges leaving this node compare against their thresholds.
  - id: health_check_10
    type: api
    purpose: "Query Prometheus for error rate and latency metrics at 10% canary weight"
    runtime:
      url: "https://prometheus.internal"
      endpoint: "/api/v1/query_range"
      method: POST
    inputs:
      - canary_10_status
    outputs:
      - error_rate_10
      - p99_latency_10
    retry_policy:
      backoff_sec: 60
      max_retries: 3
    timeout_sec: 600
    # NOTE(review): no wait and no query expression are configured in this
    # node; confirm where the 5-minute delay described below comes from.
    explain: 'Waits 5 minutes, then queries error rate and p99 latency.'

  # Stage 2: raise the new version to 50% of traffic and hold there.
  - id: canary_50
    type: cli
    purpose: Promote canary to 50% traffic weight after passing health checks
    runtime:
      # The kubectl argo rollouts plugin has no `set weight` subcommand (`set`
      # only offers `image`), so the weight is changed the same way canary_10
      # changes it: a merge patch of the Rollout's canary step list. The
      # trailing indefinite pause holds the rollout at 50% until full_promote
      # runs `promote --full`.
      # No separate `promote` is issued here, because promoting past the last
      # step would complete the rollout instead of stopping at 50%.
      # NOTE(review): assumes the controller restarts step execution from the
      # first step when the step list changes — confirm against the Argo
      # Rollouts version in use.
      command: |
        kubectl patch rollout app -n ${NAMESPACE} \
          --type merge -p '{"spec":{"strategy":{"canary":{"steps":[{"setWeight":50},{"pause":{}}]}}}}'
    inputs: [error_rate_10]
    outputs: [canary_50_status]
    security:
      # Same credential apply_manifests declares for its kubectl call.
      credentials: [KUBECONFIG]
    timeout_sec: 300

  # Health gate for the 50% stage: its two outputs are what the conditional
  # edges leaving this node compare against their thresholds.
  - id: health_check_50
    type: api
    purpose: "Query Prometheus for error rate and latency metrics at 50% canary weight"
    runtime:
      url: "https://prometheus.internal"
      endpoint: "/api/v1/query_range"
      method: POST
    inputs:
      - canary_50_status
    outputs:
      - error_rate_50
      - p99_latency_50
    retry_policy:
      backoff_sec: 60
      max_retries: 3
    timeout_sec: 600

  # Final stage: reached only when the 50% health gate passes.
  - id: full_promote
    type: cli
    purpose: Promote canary to 100% and complete the rollout
    runtime:
      # --full performs a full promotion, skipping any remaining steps and
      # pauses.
      command: "kubectl argo rollouts promote --full app -n ${NAMESPACE}"
    inputs: [error_rate_50]
    outputs: [rollout_complete]
    security:
      # Declared for consistency with apply_manifests, the other node that
      # runs kubectl against the cluster.
      credentials: [KUBECONFIG]
    timeout_sec: 300

  # Failure path: reached from either health check when its thresholds are
  # not met.
  - id: rollback
    type: cli
    purpose: Abort the rollout and restore previous stable version
    runtime:
      # `undo` only runs if `abort` succeeds (&&). The folded scalar yields
      # the same single-line command string as before.
      command: >-
        kubectl argo rollouts abort app -n ${NAMESPACE} &&
        kubectl argo rollouts undo app -n ${NAMESPACE}
    outputs: [rollback_status]
    security:
      # Declared for consistency with apply_manifests, the other node that
      # runs kubectl against the cluster.
      credentials: [KUBECONFIG]
    timeout_sec: 120

  # Terminal node: reached from both full_promote and rollback.
  - id: notify_status
    type: api
    purpose: Send rollout result to Slack and PagerDuty
    runtime:
      # NOTE(review): only the Slack chat.postMessage call is configured
      # here; no PagerDuty endpoint appears in this node — confirm whether
      # one is missing.
      endpoint: /api/chat.postMessage
      method: POST
      url: https://slack.com
    # Either outcome can arrive here, so both result variables are listed;
    # previously the rollback path delivered nothing this node declared.
    # NOTE(review): assumes the engine tolerates the input that the untaken
    # branch never produced — confirm.
    inputs: [rollout_complete, rollback_status]
    security:
      auth: bearer_token
      secret_ref: SLACK_BOT_TOKEN
    # Bounded like every other node so a hung HTTP call cannot stall the
    # workflow.
    timeout_sec: 30

edges:
  # --- Preparation: validate, apply, then start the canary at 10% ---
  - from: validate_manifests
    to: apply_manifests
    mode: sequential

  - from: apply_manifests
    to: canary_10
    mode: sequential

  # --- 10% stage: check health, then either advance or roll back ---
  - from: canary_10
    to: health_check_10
    mode: sequential

  # Both metrics must be under their thresholds to advance.
  # NOTE(review): 0.01 matches the 1% error-rate threshold mentioned in
  # canary_10; the unit of the 500 latency limit is not stated — presumably
  # milliseconds; confirm.
  - from: health_check_10
    to: canary_50
    mode: conditional
    condition: 'error_rate_10 < 0.01 && p99_latency_10 < 500'

  # Exact complement of the condition above.
  - from: health_check_10
    to: rollback
    mode: conditional
    label: 'Health check failed at 10%'
    condition: 'error_rate_10 >= 0.01 || p99_latency_10 >= 500'

  # --- 50% stage: same gate, one level up ---
  - from: canary_50
    to: health_check_50
    mode: sequential

  - from: health_check_50
    to: full_promote
    mode: conditional
    condition: 'error_rate_50 < 0.01 && p99_latency_50 < 500'

  - from: health_check_50
    to: rollback
    mode: conditional
    label: 'Health check failed at 50%'
    condition: 'error_rate_50 >= 0.01 || p99_latency_50 >= 500'

  # --- Both outcomes converge on the notification node ---
  - from: full_promote
    to: notify_status
    mode: sequential

  - from: rollback
    to: notify_status
    mode: sequential