File size: 6,584 Bytes
6b02fc9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<svg xmlns="http://www.w3.org/2000/svg" width="1100" height="650" viewBox="0 0 1100 650" role="img" aria-labelledby="title desc">
  <!-- Accessible name and description for the chart; the ids "title" and "desc" are the
       targets of aria-labelledby on the root svg element, so they must stay in sync with it. -->
  <title id="title">Held-out Acta agentic benchmark improvements</title>
  <desc id="desc">Grouped horizontal bar chart comparing base Qwen3.5-9B and Qwen3.5-9B NSC-ACE on held-out Acta structural agentic metrics.</desc>
  <!-- Shared paint servers and effects, referenced by id (baseGrad, aceGrad, shadow). -->
  <defs>
    <!-- Left-to-right slate gradient; used by the swatch captioned "Qwen3.5-9B base". -->
    <linearGradient id="baseGrad" gradientUnits="objectBoundingBox" x1="0%" y1="0%" x2="100%" y2="0%">
      <stop stop-color="#94a3b8" offset="0"/>
      <stop stop-color="#64748b" offset="1"/>
    </linearGradient>
    <!-- Left-to-right green-to-teal gradient; used by the swatch captioned "NSC-ACE adapter". -->
    <linearGradient id="aceGrad" gradientUnits="objectBoundingBox" x1="0%" y1="0%" x2="100%" y2="0%">
      <stop stop-color="#22c55e" offset="0"/>
      <stop stop-color="#0f766e" offset="1"/>
    </linearGradient>
    <!-- Soft drop shadow; the filter region is padded beyond the element's bounding box
         (10% horizontally, 30% vertically) to leave room for the blurred offset shadow. -->
    <filter id="shadow" filterUnits="objectBoundingBox" x="-10%" y="-30%" width="120%" height="160%">
      <feDropShadow flood-color="#0f172a" flood-opacity="0.18" stdDeviation="2" dx="0" dy="2"/>
    </filter>
  </defs>
  <!-- Full-canvas backdrop, then the white chart card with a hairline border on top of it. -->
  <rect x="0" y="0" width="1100" height="650" rx="18" fill="#f8fafc"/>
  <rect x="28" y="28" width="1044" height="594" rx="16" stroke="#e2e8f0" fill="#ffffff"/>
  <!-- Heading and sub-heading; the font stack is declared once on the wrapping group. -->
  <g font-family="Inter, Segoe UI, Arial, sans-serif">
    <text x="56" y="76" fill="#0f172a" font-size="30" font-weight="800">Held-out Acta Agentic Structural Eval</text>
    <text x="56" y="108" fill="#475569" font-size="16">80 held-out prompts. Greedy decoding. Higher is better.</text>
  </g>
  <!-- Legend: one gradient swatch plus caption per series. The group is moved to the
       legend's top-left corner so the children use small local coordinates. -->
  <g transform="translate(765,64)" font-family="Inter, Segoe UI, Arial, sans-serif" font-size="14" fill="#475569">
    <rect width="20" height="12" rx="3" fill="url(#baseGrad)"/>
    <text x="29" y="11">Qwen3.5-9B base</text>
    <rect y="26" width="20" height="12" rx="3" fill="url(#aceGrad)"/>
    <text x="29" y="37">NSC-ACE adapter</text>
  </g>

  <!-- Vertical gridlines at x = 300, 445, 590, 735 and 880 (145 px apart), each spanning
       y = 145 to 570, drawn as five sub-paths of a single stroked path. -->
  <path fill="none" stroke="#e2e8f0" stroke-width="1"
        d="M300 145V570 M445 145V570 M590 145V570 M735 145V570 M880 145V570"/>
  <!-- Axis tick labels. Each label is anchored at the x of its gridline (300, 445, 590,
       735, 880) and centred with text-anchor="middle", so the alignment no longer depends
       on hand-tuned offsets or on the glyph widths of whichever fallback font is used. -->
  <g font-family="Inter, Segoe UI, Arial, sans-serif" font-size="12" fill="#64748b" text-anchor="middle">
    <text x="300" y="590">0</text>
    <text x="445" y="590">25%</text>
    <text x="590" y="590">50%</text>
    <text x="735" y="590">75%</text>
    <text x="880" y="590">100%</text>
  </g>

  <!-- Metric rows. Every row is translated to the bar origin (x = 300) and to its own
       vertical offset, so inside a row: bars start at local x = 0, the metric name sits at
       x = -244, each value label sits 13 px past the end of its bar, and the delta label
       sits at x = 630. Bar widths correspond to 5.8 px per percentage point
       (for example 95.0% is drawn 551 px wide). -->
  <g font-family="Inter, Segoe UI, Arial, sans-serif">
    <!-- Row 1 -->
    <g transform="translate(300,150)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Composite structural score</text>
      <rect y="5" width="466" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="549" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="479" y="18" fill="#334155" font-size="13">80.4%</text>
      <text x="562" y="40" fill="#065f46" font-size="13" font-weight="700">94.7%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+14.3 pts</text>
    </g>
    <!-- Row 2 -->
    <g transform="translate(300,210)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Format reward</text>
      <rect y="5" width="409" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="548" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="422" y="18" fill="#334155" font-size="13">70.6%</text>
      <text x="561" y="40" fill="#065f46" font-size="13" font-weight="700">94.4%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+23.8 pts</text>
    </g>
    <!-- Row 3 -->
    <g transform="translate(300,270)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Tool-call rate</text>
      <rect y="5" width="479" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="566" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="492" y="18" fill="#334155" font-size="13">82.5%</text>
      <text x="579" y="40" fill="#065f46" font-size="13" font-weight="700">97.5%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+15.0 pts</text>
    </g>
    <!-- Row 4 -->
    <g transform="translate(300,330)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">1-2 call sweet spot</text>
      <rect y="5" width="457" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="551" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="470" y="18" fill="#334155" font-size="13">78.8%</text>
      <text x="564" y="40" fill="#065f46" font-size="13" font-weight="700">95.0%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+16.2 pts</text>
    </g>
    <!-- Row 5 -->
    <g transform="translate(300,390)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Reasoning tag rate</text>
      <rect y="5" width="276" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="566" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="289" y="18" fill="#334155" font-size="13">47.5%</text>
      <text x="579" y="40" fill="#065f46" font-size="13" font-weight="700">97.5%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+50.0 pts</text>
    </g>
    <!-- Row 6 -->
    <g transform="translate(300,450)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Tool-use reward</text>
      <rect y="5" width="479" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="560" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="492" y="18" fill="#334155" font-size="13">82.6%</text>
      <text x="573" y="40" fill="#065f46" font-size="13" font-weight="700">96.5%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+13.9 pts</text>
    </g>
    <!-- Row 7 -->
    <!-- NOTE(review): the displayed values give 96.5 - 84.5 = 12.0, but the delta label
         reads +12.1; presumably it was computed from unrounded scores. Confirm against
         the source data before changing either number. -->
    <g transform="translate(300,510)">
      <text x="-244" y="23" fill="#0f172a" font-size="15" font-weight="700">Reasoning-depth reward</text>
      <rect y="5" width="490" height="16" rx="8" filter="url(#shadow)" fill="url(#baseGrad)"/>
      <rect y="27" width="560" height="16" rx="8" filter="url(#shadow)" fill="url(#aceGrad)"/>
      <text x="503" y="18" fill="#334155" font-size="13">84.5%</text>
      <text x="573" y="40" fill="#065f46" font-size="13" font-weight="700">96.5%</text>
      <text x="630" y="31" fill="#0f766e" font-size="13" font-weight="700">+12.1 pts</text>
    </g>
  </g>
</svg>