refactor: update invoke method to return ScoutState and include addit… #13

Workflow file for this run

	# Display name of this workflow.
	name: CI
Check failure on line 1 in .github/workflows/ci.yml View workflow run for this annotation GitHub Actions / .github/workflows/ci.yml Invalid workflow file `(Line: 35, Col: 9): Unrecognized named-value: 'secrets'. Located at position 1 within expression: secrets.API_KEY != ''`

	# Triggers: pushes to main, any pull request, and manual dispatch.
	on:
	  push:
	    branches:
	      - main
	  pull_request:
	  workflow_dispatch:

	jobs:
	  # Static analysis: runs the repository's `make lint` target (Ruff + MyPy).
	  lint:
	    name: Lint and type-check
	    runs-on: ubuntu-latest
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.13"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v3

	      - name: Sync dependencies
	        run: uv sync

	      - name: Ruff + MyPy
	        run: make lint

	  # Gate job. The `secrets` context cannot be used in `jobs.<job_id>.if`
	  # (it fails workflow validation with "Unrecognized named-value:
	  # 'secrets'"), so the secret is mapped into a step-level env var here and
	  # only a non-sensitive true/false flag is exported as a job output.
	  check-secrets:
	    name: Check API key availability
	    runs-on: ubuntu-latest
	    outputs:
	      has_api_key: ${{ steps.check.outputs.has_api_key }}
	    steps:
	      - name: Detect API key
	        id: check
	        shell: bash
	        env:
	          API_KEY: ${{ secrets.API_KEY }}
	        run: |
	          if [ -n "${API_KEY:-}" ]; then
	            echo "has_api_key=true" >> "$GITHUB_OUTPUT"
	          else
	            echo "has_api_key=false" >> "$GITHUB_OUTPUT"
	          fi

	  # Builds each selected benchmark with Docker Compose, runs main.py against
	  # it, and fails unless the injected flag shows up in main.py's output.
	  validation-benchmark:
	    name: Validation Benchmark
	    runs-on: ubuntu-latest
	    needs: check-secrets
	    # Only run when the API key secret is available.
	    if: needs.check-secrets.outputs.has_api_key == 'true'
	    env:
	      # Passed through from repository secrets.
	      # NOTE(review): an earlier comment here mentioned preferring SCOUT_ and
	      # falling back to OPENROUTER_ variables; no such fallback is configured
	      # in this file - confirm whether it is still expected.
	      API_KEY: ${{ secrets.API_KEY }}
	      API_BASE: ${{ secrets.API_BASE }}
	      MODEL: ${{ secrets.MODEL }}
	      # Allow overriding the command from repo variables if desired
	      VALIDATION_COMMAND: ${{ vars.VALIDATION_COMMAND }}
	      # Optional: space-separated list of benchmark dirs to run
	      # (relative to repo root)
	      BENCHMARK_PATHS: ${{ vars.BENCHMARK_PATHS }}
	      # Optional: limit auto-discovery count if BENCHMARK_PATHS is empty
	      MAX_BENCHMARKS: ${{ vars.MAX_BENCHMARKS }}
	      # Optional: injected flag value for builds
	      FLAG: ${{ vars.FLAG }}
	    steps:
	      - name: Checkout
	        uses: actions/checkout@v4

	      - name: Checkout validation-benchmarks
	        uses: actions/checkout@v4
	        with:
	          repository: xbow-engineering/validation-benchmarks
	          path: validation-benchmarks

	      - name: Set up Python
	        uses: actions/setup-python@v5
	        with:
	          python-version: "3.13"

	      - name: Install uv
	        uses: astral-sh/setup-uv@v3

	      - name: Sync dependencies
	        run: uv sync

	      - name: Environment info
	        run: |
	          python --version
	          uv --version
	          docker version
	          docker compose version

	      # Writes one benchmark directory per line to bench_list.txt and
	      # exposes the number of lines as the `count` step output.
	      - name: Discover benchmarks (from BENCHMARK_PATHS or auto)
	        id: discover
	        shell: bash
	        run: |
	          set -euo pipefail
	          paths="${BENCHMARK_PATHS:-}"
	          if [ -n "$paths" ]; then
	            # Word splitting is intentional: the list is space-separated.
	            # shellcheck disable=SC2086
	            printf '%s\n' $paths > bench_list.txt
	          else
	            if [ -d validation-benchmarks/benchmarks ]; then
	              mapfile -t found < <(
	                find validation-benchmarks/benchmarks \
	                  -mindepth 1 -maxdepth 1 -type d \
	                  | sort | head -n "${MAX_BENCHMARKS:-1}"
	              )
	              if [ "${#found[@]}" -gt 0 ]; then
	                printf '%s\n' "${found[@]}" > bench_list.txt
	              fi
	            fi
	          fi
	          count="$( [ -f bench_list.txt ] && wc -l < bench_list.txt || echo 0 )"
	          echo "count=$count" >> "$GITHUB_OUTPUT"
	          echo "Discovered $count benchmark(s)."

	      - name: Run validation-benchmarks (Docker Compose)
	        if: steps.discover.outputs.count != '0'
	        shell: bash
	        run: |
	          set -euo pipefail
	          # Absolute path: logs are also written from inside benchmark
	          # directories, whose depth below the workspace root varies.
	          log_dir="$PWD/bench-logs"
	          mkdir -p "$log_dir"
	          # Keep the default in its own variable. Written inline as
	          # ${FLAG:-FLAG{ci-validation}} the expansion ends at the first
	          # '}', so a configured FLAG would gain a stray trailing '}'.
	          default_flag='FLAG{ci-validation}'
	          flag_val="${FLAG:-$default_flag}"
	          # Read the list on fd 3 so commands inside the loop cannot
	          # consume the remaining entries from stdin.
	          while IFS= read -r bench <&3; do
	            echo "::group::Building $bench"
	            pushd "$bench" >/dev/null
	            (make build FLAG="$flag_val") \
	              || docker compose build --build-arg flag="$flag_val"
	            docker compose up -d --wait
	            popd >/dev/null
	            echo "::endgroup::"
	            export XBOW_INITIAL_CONTEXT="CHALLENGE: $(basename "$bench"). The target is in localhost."
	            safe_name="$(echo "$bench" | sed 's#[/ ]#_#g')"
	            # Capture output and validate discovered flag matches injected
	            # flag. Errors are tolerated here so teardown always runs.
	            set +e
	            uv run python main.py 2>&1 | tee "$log_dir/${safe_name}-xbow.log"
	            # Exit status of main.py itself, not of tee.
	            status="${PIPESTATUS[0]}"
	            set -e
	            pushd "$bench" >/dev/null
	            docker compose ps || true
	            docker compose logs --no-color > "$log_dir/${safe_name}.log" || true
	            docker compose down -v || true
	            popd >/dev/null
	            # -F: match the flag literally, not as a regular expression.
	            if ! grep -qF -- "$flag_val" "$log_dir/${safe_name}-xbow.log"; then
	              echo "Expected flag not found in output for $bench"
	              echo "Expected: $flag_val"
	              exit 1
	            fi
	            if [ "$status" -ne 0 ]; then
	              echo "Run against $bench exited with status $status"
	              exit "$status"
	            fi
	          done 3< bench_list.txt

	      - name: Fallback simple validation (no benchmarks discovered)
	        if: steps.discover.outputs.count == '0'
	        shell: bash
	        run: |
	          set -euo pipefail
	          cmd="${VALIDATION_COMMAND:-uv run python main.py}"
	          echo "Running fallback: $cmd"
	          # NOTE(review): eval executes the VALIDATION_COMMAND repository
	          # variable verbatim; anyone who can edit repo variables controls
	          # what runs here with the secrets above in the environment.
	          eval "$cmd"

	      # always(): keep the logs when the run step fails. The non-empty
	      # check skips the upload when discovery itself never produced a count.
	      - name: Upload benchmark logs
	        if: >-
	          always()
	          && steps.discover.outputs.count != ''
	          && steps.discover.outputs.count != '0'
	        uses: actions/upload-artifact@v4
	        with:
	          name: validation-benchmarks-logs
	          path: bench-logs/**

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

refactor: update invoke method to return ScoutState and include addit… #13

Workflow file

refactor: update invoke method to return ScoutState and include addit… #13

Uh oh!

Workflow file for this run

GitHub Actions / .github/workflows/ci.yml