Disable strictness for export of llama (#168)

rsuderman · web-flow · commit a038133058a3 · 2024-09-05T21:06:34.000-07:00
Strict export validates correctness, but doing so results in loading the
tensors into memory. Strictness is now disabled by default, which helps
with export speed; pass `--strict` to re-enable it.
diff --git a/sharktank/sharktank/examples/export_paged_llm_v1.py b/sharktank/sharktank/examples/export_paged_llm_v1.py
@@ -45,6 +45,11 @@ def main():
         help="Include verbose logging",
         action="store_true",
     )
+    parser.add_argument(
+        "--strict",
+        help="Enables strictness during export",
+        action="store_true",
+    )
 
     args = cli.parse(parser)
     dataset = cli.get_input_dataset(args)
@@ -117,6 +122,7 @@ def generate_batch_prefill(bs: int):
             name=f"prefill_bs{bs}",
             args=(tokens, seq_lens, seq_block_ids, cache_state),
             dynamic_shapes=dynamic_shapes,
+            strict=args.strict,
         )
         def _(model, tokens, seq_lens, seq_block_ids, cache_state):
             sl = tokens.shape[1]
@@ -174,6 +180,7 @@ def generate_batch_decode(bs: int):
                 cache_state,
             ),
             dynamic_shapes=dynamic_shapes,
+            strict=args.strict,
         )
         def _(
             model,