Alias gguf tensors instead of copy (#167)

rsuderman · web-flow · commit e051c37ee913 · 2024-09-05T12:47:19.000-07:00
Using `torch.as_tensor`, we can alias the tensor rather than copy it during
gguf file loading. This avoids duplicating the entire tensor contents
when tracing torch programs, which substantially decreases memory usage
on large models.

E.g., for LLaMa 70b, the memory allocated for tensors decreased from
60+ GB to 2 GB.
diff --git a/sharktank/sharktank/types/gguf_interop/base.py b/sharktank/sharktank/types/gguf_interop/base.py
@@ -80,9 +80,10 @@ def _externalize_tensor(
     # Important: The annotation tag must be set on the actual leaf tensor
     # which is stored in the root theta. This means that any shaping or
     # data type massaging has to happen *before* annotating.
-    data_tensor = torch.tensor(data)
     if logical_shape is not None:
-        data_tensor = data_tensor.reshape(logical_shape)
+        data_tensor = torch.as_tensor(data.reshape(logical_shape))
+    else:
+        data_tensor = torch.as_tensor(data)
     ExternalTensorTrait(external_name=name, external_scope="").set(data_tensor)
     return data_tensor