A/B Testing Pipeline
AI/ML · 7 nodes · 8 edges · tags: ai, ml
ex-ab-testing.osop.yaml
# A/B Testing Pipeline
# Create experiment, split traffic, collect metrics, statistical analysis, decision
osop_version: "2.0"
id: ab-testing
name: A/B Testing Pipeline
nodes:
  - id: define_experiment
    type: human
    purpose: Product manager defines the hypothesis, variants, success metrics, and sample size
    role: product_manager
    outputs: [experiment_id, hypothesis, variants, primary_metric, sample_size, duration_days]
    explain: |
      Requires a clear hypothesis, at least two variants (control + treatment),
      a primary metric with a minimum detectable effect, and a power analysis.
  - id: create_experiment
    type: api
    purpose: Register the experiment in the feature-flagging platform and configure targeting
    runtime:
      endpoint: /api/v2/experiments
      method: POST
      url: https://flagsmith.internal
    inputs: [experiment_id, variants, sample_size]
    outputs: [flag_key, experiment_url]
    security:
      auth: bearer_token
      secret_ref: FLAGSMITH_API_KEY
    timeout_sec: 30
  - id: split_traffic
    type: api
    purpose: Activate traffic allocation and begin routing users to experiment variants
    runtime:
      endpoint: /api/v2/experiments/${experiment_id}/activate
      method: POST
      url: https://flagsmith.internal
    inputs: [flag_key]
    outputs: [activation_timestamp, allocation_config]
    security:
      auth: bearer_token
      secret_ref: FLAGSMITH_API_KEY
  - id: collect_metrics
    type: cli
    purpose: Run the daily metric-collection job aggregating events from the analytics pipeline
    runtime:
      command: |
        python collect_experiment_metrics.py \
          --experiment-id ${experiment_id} \
          --metric ${primary_metric} \
          --source bigquery \
          --output metrics/${experiment_id}/
    inputs: [experiment_id, primary_metric, activation_timestamp]
    outputs: [daily_metrics, sample_count_control, sample_count_treatment]
    timeout_sec: 600
    explain: |
      Runs daily until the sample size is reached or the experiment duration expires.
      Checks for sample ratio mismatch (SRM) on each collection run.
  - id: statistical_analysis
    type: cli
    purpose: Run frequentist and Bayesian analyses on the collected experiment data
    runtime:
      command: |
        python analyze_experiment.py \
          --experiment-id ${experiment_id} \
          --method bayesian,frequentist \
          --confidence 0.95 \
          --output analysis/${experiment_id}/report.json
    inputs: [daily_metrics]
    outputs: [p_value, confidence_interval, probability_of_improvement, lift_estimate, srm_check]
    timeout_sec: 300
  - id: make_decision
    type: human
    purpose: Product and data teams review the results and decide to ship, iterate, or kill
    role: product_manager
    inputs: [p_value, confidence_interval, probability_of_improvement, lift_estimate, srm_check]
    outputs: [decision]
    approval_gate:
      required_approvers: 2
      timeout_min: 4320
    explain: |
      Decision options: SHIP (roll out the winning variant), ITERATE (modify and re-test),
      or KILL (revert to control). SRM warnings require investigation before a decision.
  - id: apply_decision
    type: api
    purpose: Update the feature flag to reflect the experiment decision
    runtime:
      endpoint: /api/v2/experiments/${experiment_id}/resolve
      method: POST
      url: https://flagsmith.internal
    inputs: [decision, flag_key]
    outputs: [resolution_status]
    security:
      auth: bearer_token
      secret_ref: FLAGSMITH_API_KEY
edges:
  - from: define_experiment
    to: create_experiment
    mode: sequential
  - from: create_experiment
    to: split_traffic
    mode: sequential
  - from: split_traffic
    to: collect_metrics
    mode: sequential
  - from: collect_metrics
    to: statistical_analysis
    mode: conditional
    condition: "sample_count_control >= sample_size / 2 && sample_count_treatment >= sample_size / 2"
  - from: collect_metrics
    to: collect_metrics
    mode: loop
    when: "sample_count_control < sample_size / 2 || sample_count_treatment < sample_size / 2"
    label: "Continue collecting until sample size reached"
  - from: statistical_analysis
    to: make_decision
    mode: sequential
  - from: make_decision
    to: apply_decision
    mode: sequential
  - from: make_decision
    to: define_experiment
    mode: conditional
    condition: "decision == 'ITERATE'"
    label: "Iterate with modified hypothesis"
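The `define_experiment` step calls for a power analysis to choose `sample_size`. A minimal sketch of that calculation for a conversion-rate metric, using the standard two-proportion z-test approximation (the function name and defaults are illustrative, not part of the spec):

```python
import math
from statistics import NormalDist

def sample_size_per_variant(baseline_rate: float, mde: float,
                            alpha: float = 0.05, power: float = 0.80) -> int:
    """Per-variant sample size to detect an absolute lift `mde` over
    `baseline_rate` with a two-sided two-proportion z-test."""
    p1 = baseline_rate
    p2 = baseline_rate + mde
    z_alpha = NormalDist().inv_cdf(1 - alpha / 2)  # significance threshold
    z_beta = NormalDist().inv_cdf(power)           # power threshold
    variance = p1 * (1 - p1) + p2 * (1 - p2)
    return math.ceil((z_alpha + z_beta) ** 2 * variance / mde ** 2)

# Detecting a +1 pp lift over a 10% baseline takes roughly 15k users per variant:
# sample_size_per_variant(0.10, 0.01)
```

The pipeline's `sample_size / 2` conditions suggest `sample_size` is the total across both variants, so the value fed into `define_experiment` would be twice this per-variant number.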
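`collect_metrics` checks for sample ratio mismatch on each run. The collection script itself is not shown; one common way to implement the check, assumed here, is a chi-square goodness-of-fit test of the observed split against the configured allocation:

```python
import math

def srm_check(n_control: int, n_treatment: int,
              expected_ratio: float = 0.5, alpha: float = 0.001) -> bool:
    """Flag sample ratio mismatch (SRM): chi-square goodness-of-fit test of
    the observed split against the expected allocation. True means SRM."""
    total = n_control + n_treatment
    expected_c = total * expected_ratio
    expected_t = total * (1 - expected_ratio)
    chi2 = ((n_control - expected_c) ** 2 / expected_c
            + (n_treatment - expected_t) ** 2 / expected_t)
    # Chi-square with 1 degree of freedom: CDF(x) = erf(sqrt(x / 2))
    p_value = 1 - math.erf(math.sqrt(chi2 / 2))
    return p_value < alpha
```

A deliberately strict alpha (here 0.001) is typical for SRM alerts, so the check fires on real allocation bugs rather than routine noise.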
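`analyze_experiment.py` runs both a frequentist and a Bayesian analysis. A stdlib-only sketch of the two headline outputs of the `statistical_analysis` node, `p_value` and `probability_of_improvement` (helper names and priors are hypothetical; the real script is not shown):

```python
import math
import random
from statistics import NormalDist

def two_proportion_p_value(conv_c: int, n_c: int, conv_t: int, n_t: int) -> float:
    """Two-sided p-value for a pooled two-proportion z-test (frequentist path)."""
    p_pool = (conv_c + conv_t) / (n_c + n_t)
    se = math.sqrt(p_pool * (1 - p_pool) * (1 / n_c + 1 / n_t))
    z = (conv_t / n_t - conv_c / n_c) / se
    return 2 * (1 - NormalDist().cdf(abs(z)))

def probability_of_improvement(conv_c: int, n_c: int, conv_t: int, n_t: int,
                               draws: int = 20_000) -> float:
    """Monte Carlo estimate of P(treatment > control) under Beta(1, 1)
    priors on each variant's conversion rate (Bayesian path)."""
    wins = 0
    for _ in range(draws):
        p_control = random.betavariate(1 + conv_c, 1 + n_c - conv_c)
        p_treatment = random.betavariate(1 + conv_t, 1 + n_t - conv_t)
        wins += p_treatment > p_control
    return wins / draws
```

Reporting both quantities side by side, as the node's `--method bayesian,frequentist` flag implies, lets the `make_decision` reviewers cross-check the two framings before choosing SHIP, ITERATE, or KILL.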