{
  "schema_version": "1.0",
  "name": "cohesive-llm-benchmark",
  "description": "End-to-end Nextflow-aware benchmark for natural-language to bioinformatics pipeline generators. Validates that each generated .nf parses (DSL2) and schedules the expected number of step processes under nextflow -stub-run.",
  "generated_at": "2026-05-26T19:03:42Z",
  "homepage": "https://genpat-it.github.io/cohesive-llm-benchmark",
  "repository": "https://github.com/genpat-it/cohesive-llm-benchmark",
  "framework": "https://github.com/genpat-it/cohesive-ngsmanager",
  "llm_under_test": "https://github.com/mgradyn/izs-llm",
  "docs": {
    "methodology": "https://genpat-it.github.io/cohesive-llm-benchmark/../blob/main/METHODOLOGY.md",
    "error_taxonomy": "https://genpat-it.github.io/cohesive-llm-benchmark/../blob/main/docs/error_taxonomy.md",
    "dataset_schema": "https://genpat-it.github.io/cohesive-llm-benchmark/../blob/main/docs/dataset_schema.md",
    "explorer": "https://genpat-it.github.io/cohesive-llm-benchmark/explorer.html"
  },
  "datasets": {
    "single_turn_curated": {
      "path": "dataset/dataset_50.jsonl",
      "url": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/dataset/dataset_50.jsonl",
      "size": 50,
      "description": "Curated single-turn subset (stable historical baseline)."
    },
    "single_turn_full": {
      "path": "dataset/dataset_200.jsonl",
      "url": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/dataset/dataset_200.jsonl",
      "size": 200,
      "description": "200 single-turn prompts (curated 50 + 150 combinatorial extension)."
    },
    "single_turn_full_with_multi": {
      "path": "dataset/dataset_205.jsonl",
      "url": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/dataset/dataset_205.jsonl",
      "size": 205,
      "description": "200 + 5 multi-sample workflow blueprints (category X*: panaroo, vcf2mst, grapetree, reportree-alleles, reportree-vcf)."
    },
    "multi_turn_curated": {
      "path": "dataset/dataset_modifications.jsonl",
      "url": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/dataset/dataset_modifications.jsonl",
      "size": 17,
      "description": "17 curated multi-turn conversations (34 turns)."
    },
    "multi_turn_full": {
      "path": "dataset/dataset_modifications_full.jsonl",
      "url": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/dataset/dataset_modifications_full.jsonl",
      "size": 159,
      "description": "159 multi-turn conversations (330 turns)."
    }
  },
  "verdict_tags": {
    "literal-match": "LLM steps match the ground truth exactly.",
    "extras-best-practice": "LLM added upstream best-practice steps (trimming, species-id, host-depletion, ...) -- biologically sound but beyond the literal prompt.",
    "extras-irrelevant": "LLM added steps that are not a common best-practice add-on.",
    "missing-steps": "LLM left out required ground-truth steps.",
    "hallucinated": "LLM used step/include names that don't exist in the framework."
  },
  "runs": [
    {
      "run_id": "example_run_mistral",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-19T16:32:00Z",
        "bench": {
          "commit": "6736ab62c9813d6b764529e6fa858ce52d722fb8",
          "short_commit": "6736ab6",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": false
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": false
        },
        "llm": {
          "name": "izs-llm",
          "model": "labs-devstral-small-2512",
          "api_url": "http://127.0.0.1:8765",
          "commit": "73ace31b427cad01a61edf1acdc3331943df015b",
          "short_commit": "73ace31",
          "branch": "feat/validation-and-anti-hallucination",
          "remote": "https://github.com/mgradyn/izs-llm",
          "dirty": false,
          "model_source": "corrected post-hoc; izs-llm calls labs-devstral-small-2512 which resolves to public devstral-2512 (see https://api.mistral.ai/v1/models)",
          "api_alias": "labs-devstral-small-2512",
          "resolved_model": "devstral-2512",
          "model_family": "Devstral (Mistral, code-tuned, medium-class ~25-30B params)"
        },
        "dataset": "dataset_50.jsonl",
        "notes": "reference run captured during initial dataset publication"
      },
      "summary": {
        "total": 50,
        "passed": 43,
        "passed_pct": 86.0,
        "syntax_ok": 50,
        "no_code_returned": 0,
        "by_category": {
          "?": 50
        },
        "by_error_category": {
          "missing_param": 5,
          "silent_no_op": 2
        },
        "by_tag": {
          "literal-match": 37,
          "extras-irrelevant": 8,
          "missing-steps": 7,
          "extras-best-practice": 3
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/example_run_mistral/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/example_run_mistral/metadata.json"
      }
    },
    {
      "run_id": "example_run_mistral_multi_turn",
      "kind": "multi-turn",
      "metadata": {
        "run_started_at": "2026-05-21T07:12:25Z",
        "bench": {
          "commit": "d590ecf1025a61ab0a51dd52ed474bbe4ed5c184",
          "short_commit": "d590ecf",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "?",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_modifications.jsonl"
      },
      "summary": {
        "total": 34,
        "passed": 29,
        "passed_pct": 85.3,
        "syntax_ok": 34,
        "no_code_returned": 0,
        "by_category": {
          "modification": 34
        },
        "by_error_category": {
          "missing_param": 3,
          "silent_no_op": 1,
          "file_not_found": 1
        },
        "by_tag": {
          "literal-match": 30,
          "extras-best-practice": 2,
          "harness-param-gap": 2,
          "missing-steps": 2,
          "extras-irrelevant": 1
        },
        "conversations_total": 17,
        "conversations_fully_passed": 14,
        "conversations_fully_passed_pct": 82.4
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/example_run_mistral_multi_turn/verdicts_modifications_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/example_run_mistral_multi_turn/metadata.json"
      }
    },
    {
      "run_id": "llm_full_200",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-21T10:39:53Z",
        "bench": {
          "commit": "1dd5441f348dbf0d304ef059a512409bfec880b1",
          "short_commit": "1dd5441",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "codestral-latest",
          "api_url": "http://127.0.0.1:8765",
          "api_alias": "codestral-latest",
          "resolved_model": "codestral-2508 (or whatever -latest pins to today; mid-2025)",
          "model_family": "Codestral (Mistral production, code-tuned, ~22B params)",
          "model_source": "set by hand in izs-llm/app/core/config.py during 2026-05-21 re-run with chat logs"
        },
        "dataset": "dataset_200.jsonl"
      },
      "summary": {
        "total": 200,
        "passed": 163,
        "passed_pct": 81.5,
        "syntax_ok": 191,
        "no_code_returned": 4,
        "by_category": {
          "?": 200
        },
        "by_error_category": {
          "file_not_found": 13,
          "missing_param": 13,
          "no_code": 4,
          "arity_error": 3,
          "none": 2,
          "silent_no_op": 1,
          "partial_dag": 1
        },
        "by_tag": {
          "literal-match": 159,
          "extras-best-practice": 27,
          "extras-irrelevant": 16,
          "missing-steps": 11
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200/metadata.json"
      }
    },
    {
      "run_id": "llm_full_200_devstral_2507",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-22T10:10:35Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "devstral-small-2507",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_200.jsonl"
      },
      "summary": {
        "total": 200,
        "passed": 116,
        "passed_pct": 58.0,
        "syntax_ok": 182,
        "no_code_returned": 15,
        "by_category": {
          "?": 200
        },
        "by_error_category": {
          "file_not_found": 44,
          "no_code": 15,
          "silent_no_op": 12,
          "none": 4,
          "partial_dag": 4,
          "missing_param": 3,
          "arity_error": 2
        },
        "by_tag": {
          "literal-match": 134,
          "missing-steps": 49,
          "extras-irrelevant": 29,
          "extras-best-practice": 16
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_devstral_2507/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_devstral_2507/metadata.json"
      }
    },
    {
      "run_id": "llm_full_200_devstral_labs",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-22T16:59:07Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "labs-devstral-small-2512",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_200.jsonl"
      },
      "summary": {
        "total": 200,
        "passed": 182,
        "passed_pct": 91.0,
        "syntax_ok": 197,
        "no_code_returned": 2,
        "by_category": {
          "?": 200
        },
        "by_error_category": {
          "none": 8,
          "missing_param": 6,
          "no_code": 2,
          "file_not_found": 1,
          "arity_error": 1
        },
        "by_tag": {
          "literal-match": 171,
          "extras-best-practice": 16,
          "extras-irrelevant": 11,
          "missing-steps": 10
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_devstral_labs/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_devstral_labs/metadata.json"
      }
    },
    {
      "run_id": "llm_full_200_mistral_small",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-22T06:52:42Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "mistral-small-latest",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_200.jsonl"
      },
      "summary": {
        "total": 200,
        "passed": 139,
        "passed_pct": 69.5,
        "syntax_ok": 192,
        "no_code_returned": 0,
        "by_category": {
          "?": 200
        },
        "by_error_category": {
          "file_not_found": 26,
          "missing_param": 19,
          "arity_error": 7,
          "none": 4,
          "silent_no_op": 4,
          "partial_dag": 1
        },
        "by_tag": {
          "literal-match": 130,
          "extras-irrelevant": 47,
          "missing-steps": 33,
          "extras-best-practice": 27
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_mistral_small/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_200_mistral_small/metadata.json"
      }
    },
    {
      "run_id": "llm_full_multi_turn",
      "kind": "multi-turn",
      "metadata": {
        "run_started_at": "2026-05-21T12:21:40Z",
        "bench": {
          "commit": "1dd5441f348dbf0d304ef059a512409bfec880b1",
          "short_commit": "1dd5441",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "codestral-latest",
          "api_url": "http://127.0.0.1:8765",
          "api_alias": "codestral-latest",
          "resolved_model": "codestral-2508 (or whatever -latest pins to today; mid-2025)",
          "model_family": "Codestral (Mistral production, code-tuned, ~22B params)",
          "model_source": "set by hand in izs-llm/app/core/config.py during 2026-05-21 re-run with chat logs"
        },
        "dataset": "dataset_modifications_full.jsonl"
      },
      "summary": {
        "total": 330,
        "passed": 217,
        "passed_pct": 65.8,
        "syntax_ok": 321,
        "no_code_returned": 3,
        "by_category": {
          "modification": 330
        },
        "by_error_category": {
          "missing_param": 40,
          "silent_no_op": 26,
          "partial_dag": 25,
          "file_not_found": 15,
          "arity_error": 4,
          "no_code": 3
        },
        "by_tag": {
          "literal-match": 228,
          "extras-irrelevant": 69,
          "missing-steps": 64,
          "extras-best-practice": 24
        },
        "conversations_total": 159,
        "conversations_fully_passed": 91,
        "conversations_fully_passed_pct": 57.2
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn/verdicts_modifications_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn/metadata.json"
      }
    },
    {
      "run_id": "llm_full_multi_turn_devstral_2507",
      "kind": "multi-turn",
      "metadata": {
        "run_started_at": "2026-05-22T11:42:17Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "devstral-small-2507",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_modifications_full.jsonl"
      },
      "summary": {
        "total": 319,
        "passed": 123,
        "passed_pct": 38.6,
        "syntax_ok": 290,
        "no_code_returned": 19,
        "by_category": {
          "modification": 319
        },
        "by_error_category": {
          "file_not_found": 122,
          "missing_param": 22,
          "no_code": 19,
          "silent_no_op": 15,
          "partial_dag": 10,
          "arity_error": 8
        },
        "by_tag": {
          "literal-match": 233,
          "missing-steps": 66,
          "extras-irrelevant": 40,
          "extras-best-practice": 10
        },
        "conversations_total": 159,
        "conversations_fully_passed": 33,
        "conversations_fully_passed_pct": 20.8
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_devstral_2507/verdicts_modifications_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_devstral_2507/metadata.json"
      }
    },
    {
      "run_id": "llm_full_multi_turn_devstral_labs",
      "kind": "multi-turn",
      "metadata": {
        "run_started_at": "2026-05-25T08:32:22Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "labs-devstral-small-2512",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_modifications_full.jsonl"
      },
      "summary": {
        "total": 328,
        "passed": 281,
        "passed_pct": 85.7,
        "syntax_ok": 324,
        "no_code_returned": 3,
        "by_category": {
          "modification": 328
        },
        "by_error_category": {
          "missing_param": 23,
          "partial_dag": 10,
          "silent_no_op": 6,
          "file_not_found": 3,
          "no_code": 3,
          "ngsmanager_naming": 2
        },
        "by_tag": {
          "literal-match": 283,
          "missing-steps": 38,
          "extras-irrelevant": 16,
          "extras-best-practice": 3
        },
        "conversations_total": 159,
        "conversations_fully_passed": 134,
        "conversations_fully_passed_pct": 84.3
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_devstral_labs/verdicts_modifications_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_devstral_labs/metadata.json"
      }
    },
    {
      "run_id": "llm_full_multi_turn_mistral_small",
      "kind": "multi-turn",
      "metadata": {
        "run_started_at": "2026-05-22T08:00:31Z",
        "bench": {
          "commit": "b8948afd3e444cc6e30c1aa3bb1e58bcdbf4537a",
          "short_commit": "b8948af",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": true
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "mistral-small-latest",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_modifications_full.jsonl"
      },
      "summary": {
        "total": 329,
        "passed": 222,
        "passed_pct": 67.5,
        "syntax_ok": 305,
        "no_code_returned": 13,
        "by_category": {
          "modification": 329
        },
        "by_error_category": {
          "missing_param": 55,
          "file_not_found": 17,
          "no_code": 13,
          "arity_error": 11,
          "silent_no_op": 10,
          "partial_dag": 1
        },
        "by_tag": {
          "literal-match": 226,
          "extras-irrelevant": 85,
          "missing-steps": 75,
          "extras-best-practice": 10
        },
        "conversations_total": 159,
        "conversations_fully_passed": 88,
        "conversations_fully_passed_pct": 55.3
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_mistral_small/verdicts_modifications_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_full_multi_turn_mistral_small/metadata.json"
      }
    },
    {
      "run_id": "llm_multi_workflows",
      "kind": "single-turn",
      "metadata": {
        "run_started_at": "2026-05-21T07:02:52Z",
        "bench": {
          "commit": "5fd5070a3be8508d39137824def80ac49c26dff6",
          "short_commit": "5fd5070",
          "branch": "main",
          "remote": "https://github.com/genpat-it/cohesive-llm-benchmark.git",
          "dirty": false
        },
        "framework": {
          "commit": "5c2d0192e116f50d7905f289b81d61fd9d61d2cf",
          "short_commit": "5c2d019",
          "branch": "HEAD",
          "remote": "https://github.com/genpat-it/cohesive-ngsmanager.git",
          "dirty": true
        },
        "llm": {
          "name": "?",
          "model": "?",
          "api_url": "http://127.0.0.1:8765"
        },
        "dataset": "dataset_50.jsonl"
      },
      "summary": {
        "total": 5,
        "passed": 2,
        "passed_pct": 40.0,
        "syntax_ok": 5,
        "no_code_returned": 0,
        "by_category": {
          "?": 5
        },
        "by_error_category": {
          "silent_no_op": 2,
          "file_not_found": 1
        },
        "by_tag": {
          "literal-match": 3,
          "extras-irrelevant": 2
        }
      },
      "files": {
        "verdicts_jsonl": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_multi_workflows/verdicts_augmented.jsonl",
        "metadata_json": "https://raw.githubusercontent.com/genpat-it/cohesive-llm-benchmark/main/results/llm_multi_workflows/metadata.json"
      }
    }
  ]
}
