{
- "headers": [
- "Model",
- "Provider",
- "Total score (%)",
- "execution (%)",
- "search (%)",
- "ambiguity (%)",
- "adaptability (%)",
- "time (%)",
- "noise (%)",
- "A2A (%)",
- "Number of runs",
- "Submitter",
- "Submission date"
- "data": [
- [
- "GPT-5 (high)",
- "OpenAI",
- 42.1,
- "69.2 ± 2.1",
- "79.6 ± 1.8",
- "51.9 ± 2.3",
- "40.4 ± 2.2",
- " 0.0 ± 0.0",
- "35.4 ± 2.2",
- "17.9 ± 1.8",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Claude-4-Sonnet Thinking",
- "Anthropic",
- 37.8,
- "62.1 ± 2.2",
- "60.6 ± 2.2",
- "27.3 ± 2.0",
- "42.1 ± 2.3",
- " 8.5 ± 1.3",
- "31.2 ± 2.1",
- "32.5 ± 2.1",
- 3,
- "Meta",
- "2025-09-29"
- [
- "Claude-4-Sonnet",
- "Anthropic",
- 34.8,
- "57.9 ± 2.3",
- "59.8 ± 2.2",
- "24.2 ± 2.0",
- "38.1 ± 2.2",
- " 8.1 ± 1.2",
- "27.7 ± 2.0",
- "27.9 ± 2.0",
- 3,
- "Meta",
- "2025-09-09"
- [
- "GPT-5 (low)",
- "OpenAI",
- 34.6,
- "52.7 ± 2.3",
- "64.2 ± 2.2",
- "39.6 ± 2.2",
- "30.2 ± 2.1",
- " 2.3 ± 0.7",
- "28.3 ± 2.1",
- "24.6 ± 2.0",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Gemini-2.5-Pro",
- "Google",
- 25.8,
- "39.2 ± 2.2",
- "57.7 ± 2.3",
- "18.1 ± 1.8",
- "17.5 ± 1.7",
- " 7.3 ± 1.2",
- "20.4 ± 1.8",
- "20.4 ± 1.8",
- 3,
- "Meta",
- "2025-09-09"
- [
- "DeepSeek-v3.1 Terminus",
- "DeepSeek",
- 23.1,
- "43.1 ± 3.9",
- "34.4 ± 3.8",
- "13.1 ± 2.7",
- "32.5 ± 3.7",
- " 1.9 ± 1.1",
- "17.5 ± 3.0",
- "19.4 ± 3.1",
- 3,
- "Meta",
- "2025-09-30"
- [
- "DeepSeek-v3.1",
- "DeepSeek",
- 21.9,
- "39.8 ± 2.2",
- "36.2 ± 2.2",
- "11.2 ± 1.4",
- "31.2 ± 2.1",
- " 1.7 ± 0.6",
- "17.3 ± 1.7",
- "16.0 ± 1.7",
- 3,
- "Meta",
- "2025-09-29"
- [
- "Kimi-K2",
- "Moonshot",
- 20.1,
- "34.2 ± 2.2",
- "36.0 ± 2.2",
- " 8.3 ± 1.3",
- "24.0 ± 1.9",
- " 0.8 ± 0.4",
- "18.8 ± 1.8",
- "18.3 ± 1.8",
- 3,
- "Meta",
- "2025-09-09"
- [
- "GPT-5 (minimal)",
- "OpenAI",
- 18.2,
- "31.9 ± 2.1",
- "26.2 ± 2.0",
- "20.6 ± 1.8",
- "19.2 ± 1.8",
- " 5.2 ± 1.0",
- "13.1 ± 1.5",
- "11.5 ± 1.5",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Grok-4",
- "xAI",
- 15.7,
- " 8.8 ± 2.2",
- "57.5 ± 3.9",
- " 9.4 ± 2.3",
- " 4.4 ± 1.6",
- " 0.0 ± 0.0",
- "15.6 ± 2.9",
- "14.4 ± 2.8",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Qwen3-235B-thinking",
- "Alibaba",
- 15.7,
- "28.1 ± 2.1",
- "36.2 ± 3.8",
- "10.0 ± 2.4",
- "16.2 ± 2.9",
- " 0.0 ± 0.0",
- " 6.9 ± 2.0",
- "12.5 ± 2.6",
- 3,
- "Meta",
- "2025-09-29"
- [
- "GPT-OSS 120B (high)",
- "OpenAI",
- 13.7,
- "17.9 ± 0.8",
- "33.1 ± 2.3",
- " 8.3 ± 0.2",
- "10.6 ± 1.3",
- " 0.6 ± 0.4",
- "14.6 ± 0.2",
- "10.6 ± 0.4",
- 3,
- "Meta",
- "2025-09-29"
- [
- "Qwen3-235B",
- "Alibaba",
- 11.6,
- "22.7 ± 1.9",
- "22.3 ± 1.9",
- " 6.5 ± 1.1",
- " 8.1 ± 1.2",
- " 1.2 ± 0.5",
- "10.8 ± 1.4",
- " 9.4 ± 1.3",
- 3,
- "Meta",
- "2025-09-09"
- [
- "GPT-4o",
- "OpenAI",
- 7.4,
- " 8.3 ± 1.3",
- "17.5 ± 1.7",
- " 4.4 ± 0.9",
- " 6.2 ± 1.1",
- " 5.8 ± 1.1",
- " 4.6 ± 1.0",
- " 5.2 ± 1.0",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Llama 4 Maverick",
- "Meta",
- 7.4,
- "13.8 ± 1.6",
- "14.4 ± 1.6",
- " 2.1 ± 0.7",
- " 5.0 ± 1.0",
- " 1.2 ± 0.5",
- " 6.2 ± 1.1",
- " 9.2 ± 1.3",
- 3,
- "Meta",
- "2025-09-09"
- [
- "Llama 3.3 70B Instruct",
- "Meta",
- 4.4,
- " 7.1 ± 1.2",
- "11.5 ± 1.5",
- " 1.7 ± 0.6",
- " 1.9 ± 0.6",
- " 0.4 ± 0.3",
- " 3.8 ± 0.9",
- " 4.6 ± 1.0",
- 3,
- "Meta",
- "2025-09-09"
- [
- "metadata": null