@@ -1,6 +1,5 @@
 import argparse
 import os
-import subprocess
 import statistics
 import tqdm
 from turnkeyml.state import State
@@ -137,91 +136,51 @@ def run(
         for iteration in tqdm.tqdm(
             range(iterations), desc="iterations", disable=iterations < 2
         ):
-            cmd = [
-                state.model.executable,
-                "-m",
-                state.model.model,
-                "--ctx-size",
-                str(context_size),
-                "-n",
-                str(output_tokens),
-                "-t",
-                str(state.model.threads),
-                "-p",
-                prompt,
-                "-e",
-            ]
-
-            cmd = [str(m) for m in cmd]
-
             try:
-                process = subprocess.Popen(
-                    cmd,
-                    stdout=subprocess.PIPE,
-                    stderr=subprocess.PIPE,
-                    universal_newlines=True,
-                    encoding="utf-8",
-                    errors="replace",
-                )
-
-                raw_output, stderr = process.communicate()
-                if process.returncode != 0:
+                # Use the adapter's generate method which already has the timeout and error handling
+                raw_output, stderr = state.model.generate(prompt, return_raw=True)
+
+                # Parse the timing information from the output
+                ms_per_token = None
+                time_to_first_token_ms = None
+
+                # Look for timing in both stdout and stderr
+                for output in [raw_output, stderr]:
+                    for line in output.splitlines():
+                        if "llama_perf_context_print: eval time =" in line:
+                            parts = line.split("(")[1].strip()
+                            parts = parts.split(",")
+                            ms_per_token = float(
+                                parts[0].split("ms per token")[0].strip()
+                            )
+                        if "llama_perf_context_print: prompt eval time =" in line:
+                            parts = line.split("=")[1].split("/")[0]
+                            time_to_first_token_ms = float(parts.split("ms")[0].strip())
+
+                if ms_per_token is None or time_to_first_token_ms is None:
                     error_msg = (
-                        f"llama.cpp failed with return code {process.returncode}.\n"
+                        "Could not find timing information in llama.cpp output.\n"
                     )
-                    error_msg += f"Command: {' '.join(cmd)}\n"
-                    error_msg += f"Error output:\n{stderr}\n"
-                    error_msg += f"Standard output:\n{raw_output}"
+                    error_msg += "Raw output:\n" + raw_output + "\n"
+                    error_msg += "Stderr:\n" + stderr
                     raise Exception(error_msg)

-                if raw_output is None:
-                    raise Exception("No output received from llama.cpp process")
+                # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
+                # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
+                # as performance data for generating a few tokens is not relevant.
+                tokens_per_second = 0
+                if output_tokens > 5 and ms_per_token > 0:
+                    tokens_per_second = 1000 / ms_per_token
+                time_to_first_token = time_to_first_token_ms / 1000

-            except Exception as e:
-                error_msg = f"Failed to run llama.cpp command: {str(e)}\n"
-                error_msg += f"Command: {' '.join(cmd)}"
-                raise Exception(error_msg)
+                if iteration > warmup_iterations - 1:
+                    iteration_tokens_per_second.append(tokens_per_second)
+                    iteration_time_to_first_token.append(time_to_first_token)

-            ms_per_token = None
-            time_to_first_token_ms = None
-            for line in raw_output.splitlines():
-                if "llama_perf_context_print: eval time =" in line:
-                    parts = line.split("(")[1].strip()
-                    parts = parts.split(",")
-                    ms_per_token = float(parts[0].split("ms per token")[0].strip())
-                if "llama_perf_context_print: prompt eval time =" in line:
-                    parts = line.split("=")[1].split("/")[0]
-                    time_to_first_token_ms = float(parts.split("ms")[0].strip())
-
-            if ms_per_token is None or time_to_first_token_ms is None:
-                # Look in stderr as well since some versions of llama.cpp output timing there
-                for line in stderr.splitlines():
-                    if "llama_perf_context_print: eval time =" in line:
-                        parts = line.split("(")[1].strip()
-                        parts = parts.split(",")
-                        ms_per_token = float(parts[0].split("ms per token")[0].strip())
-                    if "llama_perf_context_print: prompt eval time =" in line:
-                        parts = line.split("=")[1].split("/")[0]
-                        time_to_first_token_ms = float(parts.split("ms")[0].strip())
-
-            if ms_per_token is None or time_to_first_token_ms is None:
-                error_msg = "Could not find timing information in llama.cpp output.\n"
-                error_msg += "Raw output:\n" + raw_output + "\n"
-                error_msg += "Error output:\n" + stderr
+            except Exception as e:
+                error_msg = f"Failed to run benchmark: {str(e)}"
                 raise Exception(error_msg)

-            # When output_tokens is set to 1 for accuracy tests, ms_per_token tends to 0
-            # and causes a divide-by-zero error. Set tokens_per_second to 0 in such cases
-            # as performance data for generating a few tokens is not relevant.
-            tokens_per_second = 0
-            if output_tokens > 5 and ms_per_token > 0:
-                tokens_per_second = 1000 / ms_per_token
-            time_to_first_token = time_to_first_token_ms / 1000
-
-            if iteration > warmup_iterations - 1:
-                iteration_tokens_per_second.append(tokens_per_second)
-                iteration_time_to_first_token.append(time_to_first_token)
-
         token_generation_tokens_per_second = statistics.mean(
             iteration_tokens_per_second
         )
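For reference, here is a standalone sketch of what the timing parse in the new code does, run against hypothetical `llama_perf_context_print` lines. The sample numbers and the single-space label spacing are made up so that they match the substring checks above; a real llama.cpp build may pad these labels differently.

```python
# Minimal, self-contained sketch of the timing parse above.
# The sample lines are illustrative, not captured from a real run.
sample_output = (
    "llama_perf_context_print: prompt eval time = 94.50 ms / 12 tokens "
    "( 7.88 ms per token, 126.98 tokens per second)\n"
    "llama_perf_context_print: eval time = 1325.40 ms / 127 runs "
    "( 10.44 ms per token, 95.81 tokens per second)\n"
)

ms_per_token = None
time_to_first_token_ms = None

for line in sample_output.splitlines():
    if "llama_perf_context_print: eval time =" in line:
        # Take the text inside the parentheses, then the part before the comma:
        # " 10.44 ms per token" -> 10.44
        inside_parens = line.split("(")[1]
        ms_per_token = float(inside_parens.split(",")[0].split("ms per token")[0].strip())
    if "llama_perf_context_print: prompt eval time =" in line:
        # Take the text between "=" and "/": " 94.50 ms " -> 94.50
        before_slash = line.split("=")[1].split("/")[0]
        time_to_first_token_ms = float(before_slash.split("ms")[0].strip())

print(ms_per_token)            # 10.44 -> tokens/s would be 1000 / 10.44
print(time_to_first_token_ms)  # 94.50 -> time_to_first_token would be 0.0945 s
```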
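Similarly, a hedged sketch of the aggregation step: warmup iterations are discarded, very short generations are guarded against a divide-by-zero, and the remaining per-iteration values are averaged with `statistics.mean`. The helper name and sample numbers below are hypothetical, not part of the change itself.

```python
import statistics

def mean_tokens_per_second(ms_per_token_per_iteration, output_tokens, warmup_iterations):
    """Hypothetical helper mirroring the loop above: convert ms/token to
    tokens/s, drop warmup iterations, and average what remains."""
    kept = []
    for iteration, ms_per_token in enumerate(ms_per_token_per_iteration):
        # Very short generations report ~0 ms per token, so avoid dividing by zero
        tokens_per_second = 0
        if output_tokens > 5 and ms_per_token > 0:
            tokens_per_second = 1000 / ms_per_token
        # iteration > warmup_iterations - 1 is the same as iteration >= warmup_iterations
        if iteration > warmup_iterations - 1:
            kept.append(tokens_per_second)
    return statistics.mean(kept)

# Two warmup runs are discarded; the mean covers the last three iterations.
print(mean_tokens_per_second([12.5, 11.0, 10.4, 10.5, 10.3],
                             output_tokens=64, warmup_iterations=2))  # ~96.2
```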