114 changes: 114 additions & 0 deletions .github/scripts/ai/agent_chroma_release.py
@@ -0,0 +1,114 @@
from __future__ import annotations
import argparse, json, os, sys
from typing import Any, Dict, List
import chromadb

def load_manifest(db_dir: str) -> Dict[str, Any]:
    path = os.path.join(db_dir, "MANIFEST.json")
    if os.path.isfile(path):
        try:
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            print(f"::warning :: Failed to read MANIFEST.json: {e}", file=sys.stderr)
    return {}

def list_collection_names(client: chromadb.Client) -> List[str]:
    try:
        cols = client.list_collections()
        # Chroma's return type varies by version: objects with .name,
        # dicts with a "name" key, or (in newer releases) plain strings.
        names = []
        for c in cols:
            if isinstance(c, str):
                name = c
            elif isinstance(c, dict):
                name = c.get("name")
            else:
                name = getattr(c, "name", None)
            if name:
                names.append(name)
        return names
    except Exception as e:
        print(f"::warning :: Unable to list collections: {e}", file=sys.stderr)
        return []

def resolve_collection_name(client: chromadb.Client, requested: str | None, manifest: Dict[str, Any]) -> str:
    names = list_collection_names(client)
    man_name = (manifest.get("collection") if isinstance(manifest, dict) else None) or None

    # 1) Prefer the manifest value if it exists in the store
    if man_name and man_name in names:
        return man_name

    # 2) If a requested name is provided and exists, use it
    if requested and requested in names:
        return requested

    # 3) If there is exactly one collection, use it (common case)
    if len(names) == 1:
        return names[0]

    # 4) No obvious match -> raise with guidance
    msg = ["No matching Chroma collection found in the persisted store."]
    msg.append(f"- Requested: {requested!r}")
    msg.append(f"- Manifest: {man_name!r}")
    msg.append(f"- Available: {names if names else '[] (none)'}")
    raise RuntimeError("\n".join(msg))

def fmt_row(i, doc, meta, dist=None):
    src = meta.get("source_path") or meta.get("source") or meta.get("path") or "unknown"
    # Guard against empty documents: splitlines() on "" returns [],
    # so indexing [0] directly would raise IndexError.
    doc_lines = (doc or "").strip().splitlines()
    head = doc_lines[0][:120] if doc_lines else ""
    d = f" (distance: {dist:.4f})" if isinstance(dist, (int, float)) else ""
    return f"{i+1}. {src}{d}\n   {head}"

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="Path to Chroma persist directory (e.g., .kb_index)")
    ap.add_argument("--collection", default=None, help="Optional explicit collection name override")
    ap.add_argument("--question", required=True)
    ap.add_argument("--top-k", type=int, default=5)
    args = ap.parse_args()

    if not os.path.isdir(args.db):
        print(f"::error :: Chroma persist dir not found: {args.db}", file=sys.stderr)
        sys.exit(1)

    client = chromadb.PersistentClient(path=args.db)
    manifest = load_manifest(args.db)

    try:
        col_name = resolve_collection_name(client, args.collection, manifest)
    except Exception as e:
        print(f"::error :: {e}", file=sys.stderr)
        sys.exit(1)

    try:
        col = client.get_collection(col_name)
    except Exception as e:
        print(f"::error :: Failed to open collection '{col_name}': {e}", file=sys.stderr)
        sys.exit(1)

    # query_texts relies on the collection's configured/default embedding
    # function; the MANIFEST embed_model is surfaced below for sanity-checking.
    # Note that "ids" must not be passed in `include` -- Chroma always returns
    # ids and rejects it as an include value.
    res = col.query(
        query_texts=[args.question],
        n_results=args.top_k,
        include=["distances", "documents", "metadatas"],
    )

    docs = res.get("documents", [[]])[0]
    metas = res.get("metadatas", [[]])[0]
    dists = res.get("distances", [[]])[0]

    # Diagnostics
    print(f"[agent] Using collection: {col_name}")
    if manifest:
        print(f"[agent] MANIFEST: model={manifest.get('embed_model')} collection={manifest.get('collection')} count={manifest.get('count')}")

    # Build a concise grounded answer (no LLM)
    lines = ["### Question", args.question, "", f"### Collection: {col_name}", "", "### Top Matches"]
    for i, (doc, meta, dist) in enumerate(zip(docs, metas, dists)):
        lines.append(fmt_row(i, doc, meta, dist))
    lines += ["", "### Grounded Answer (heuristic)",
              "Below are the most relevant excerpts; see sources above."]

    os.makedirs("dist", exist_ok=True)
    with open("dist/answer.md", "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print("\n".join(lines))

if __name__ == "__main__":
    main()
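
For reference, load_manifest() and the diagnostics line above assume a MANIFEST.json carrying embed_model, collection, and count keys. A minimal sketch of how the index-build step (not part of this diff) might emit that file; the function name and example call are illustrative assumptions:

# Hypothetical sketch: writing the MANIFEST.json this script consumes.
# The build step itself is not in this diff; the keys mirror what
# load_manifest() and the [agent] diagnostics line read.
import json, os

def write_manifest(db_dir: str, collection: str, embed_model: str, count: int) -> None:
    manifest = {
        "collection": collection,    # read by resolve_collection_name()
        "embed_model": embed_model,  # surfaced in the diagnostics line
        "count": count,              # number of embedded chunks
    }
    with open(os.path.join(db_dir, "MANIFEST.json"), "w", encoding="utf-8") as f:
        json.dump(manifest, f, indent=2)

# e.g. write_manifest(".kb_index", "hpcckb",
#                     "sentence-transformers/all-MiniLM-L6-v2", 1234)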
95 changes: 95 additions & 0 deletions .github/scripts/ai/agent_from_index.py
@@ -0,0 +1,95 @@
from __future__ import annotations
import argparse, json, os, sys
from typing import Any, Dict, List
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

def read_jsonl(path: str):
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                yield json.loads(line)
            except Exception:
                continue

def extract_text(obj: Dict[str, Any]) -> str:
    for k in ("text", "content", "page_content", "chunk", "body"):
        v = obj.get(k)
        if isinstance(v, str) and v.strip():
            return v
    return ""

def extract_id(obj: Dict[str, Any], i: int) -> str:
    return str(obj.get("id") or f"auto-{i}")

def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--index", required=True, help="Path to index.jsonl")
    ap.add_argument("--persist", default=".kb_index_ephemeral", help="Where to build the temporary Chroma store")
    ap.add_argument("--collection", default="hpcckb")
    ap.add_argument("--model", default="sentence-transformers/all-MiniLM-L6-v2")
    ap.add_argument("--batch-size", type=int, default=256)
    ap.add_argument("--question", required=True)
    ap.add_argument("--top-k", type=int, default=5)
    args = ap.parse_args()

    os.makedirs(args.persist, exist_ok=True)
    client = chromadb.PersistentClient(path=args.persist, settings=Settings(anonymized_telemetry=False))
    try:
        client.delete_collection(args.collection)
    except Exception:
        pass
    col = client.create_collection(args.collection, metadata={"hnsw:space": "cosine"})

    # Load docs
    ids, docs, metas = [], [], []
    for i, obj in enumerate(read_jsonl(args.index)):
        txt = extract_text(obj)
        if not txt.strip():
            continue
        ids.append(extract_id(obj, i))
        docs.append(txt)
        # Chroma rejects empty metadata dicts, so fall back to a stub entry.
        metas.append(obj.get("metadata") or {"source": "index.jsonl"})

    if not docs:
        print(f"::error :: No documents found in {args.index}", file=sys.stderr)
        sys.exit(1)

    model = SentenceTransformer(args.model)
    B = args.batch_size
    vecs: List[List[float]] = []
    for s in tqdm(range(0, len(docs), B), desc="Embedding"):
        batch = docs[s:s + B]
        emb = model.encode(batch, normalize_embeddings=True, show_progress_bar=False)
        vecs.extend(emb.tolist())

    col.add(ids=ids, documents=docs, metadatas=metas, embeddings=vecs)

    # Embed the question with the same model the documents were embedded with.
    # Passing query_texts instead would fall back to Chroma's default embedder,
    # which is not guaranteed to match --model. "ids" is omitted from `include`
    # because Chroma always returns ids and rejects it as an include value.
    q_vec = model.encode([args.question], normalize_embeddings=True).tolist()
    res = col.query(query_embeddings=q_vec, n_results=args.top_k,
                    include=["distances", "documents", "metadatas"])
    docs_r = res.get("documents", [[]])[0]
    metas_r = res.get("metadatas", [[]])[0]
    dists_r = res.get("distances", [[]])[0]

    def fmt_row(i, doc, meta, dist):
        src = meta.get("source_path") or meta.get("source") or meta.get("path") or "unknown"
        # Guard against empty documents before indexing the first line.
        doc_lines = doc.strip().splitlines()
        head = doc_lines[0][:120] if doc_lines else ""
        return f"{i+1}. {src} (distance: {dist:.4f})\n   {head}"

    lines = ["### Question", args.question, "", "### Top Matches"]
    for i, (d, m, dist) in enumerate(zip(docs_r, metas_r, dists_r)):
        lines.append(fmt_row(i, d, m, dist))
    lines += ["", "### Grounded Answer (heuristic)",
              "Below are the most relevant excerpts; see sources above."]

    os.makedirs("dist", exist_ok=True)
    with open("dist/answer.md", "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    print("\n".join(lines))

if __name__ == "__main__":
    main()
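
For context, read_jsonl() and extract_text() above accept chunk text under any of the keys text, content, page_content, chunk, or body, with optional id and metadata fields. A minimal sketch of emitting compatible index.jsonl lines; the sample ids, text, and source paths are illustrative assumptions:

# Hypothetical sketch: producing index.jsonl lines that agent_from_index.py
# can ingest. "id" and "metadata" are optional; missing ids get "auto-<i>".
import json

chunks = [
    {"id": "issues-00100-0001",
     "text": "How to configure DFS read retries ...",
     "metadata": {"source_path": "hpcc4j/kb/hpcc4j/issues/00100/retry.md"}},
]
with open("index.jsonl", "w", encoding="utf-8") as f:
    for chunk in chunks:
        f.write(json.dumps(chunk) + "\n")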
37 changes: 37 additions & 0 deletions .github/scripts/kb/chunking-policy.yml
@@ -0,0 +1,37 @@
# .github/scripts/kb/chunking-policy.yml

defaults:
  max_chars: 2400
  overlap: 300
  heading_aware: true
  preserve_code_fences: true

rules:
  # Issues are organized into buckets (00100, 00200, ...) under issues/
  - path_glob: "hpcc4j/kb/hpcc4j/issues/**/*.md"
    max_chars: 2600
    overlap: 280
    respect_front_matter: true

  # Wiki content (submodule)
  - path_glob: "hpcc4j/kb/hpcc4j/wiki/**/*.md"
    max_chars: 2000
    overlap: 300
    heading_aware: true

  # Project documentation
  - path_glob: "hpcc4j/kb/hpcc4j/project-docs/**/*.md"
    max_chars: 2400
    overlap: 300
    heading_aware: true

  # Spark HPCC connector docs
  - path_glob: "hpcc4j/kb/hpcc4j/spark-hpcc-connector/**/*.md"
    max_chars: 2400
    overlap: 320
    preserve_code_fences: true

  # Catch-all to prevent empty indices if a specific rule misses
  - path_glob: "hpcc4j/kb/hpcc4j/**/*.md"
    max_chars: 2400
    overlap: 300
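
The chunker that consumes this policy is not part of this diff, so here is a minimal sketch of how the rules might be resolved for one file, assuming first-match-wins ordering over rules with defaults as the fallback; note that fnmatch only approximates ** glob semantics (it requires at least one subdirectory between ** and the filename):

# Hypothetical sketch: resolving chunking-policy.yml for a single file.
from fnmatch import fnmatch
import yaml  # pip install pyyaml

def resolve_rule(policy: dict, rel_path: str) -> dict:
    # Start from defaults, then overlay the first matching rule's settings.
    merged = dict(policy.get("defaults", {}))
    for rule in policy.get("rules", []):
        if fnmatch(rel_path, rule["path_glob"]):
            merged.update({k: v for k, v in rule.items() if k != "path_glob"})
            break  # first match wins; the final **/*.md entry is the catch-all
    return merged

with open(".github/scripts/kb/chunking-policy.yml", encoding="utf-8") as f:
    policy = yaml.safe_load(f)
print(resolve_rule(policy, "hpcc4j/kb/hpcc4j/issues/00100/retry.md"))
# -> {'max_chars': 2600, 'overlap': 280, 'heading_aware': True,
#     'preserve_code_fences': True, 'respect_front_matter': True}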