Skip to content

Commit a3e825f

Browse files
authored
Merge pull request #138 from malfet/malfet/add-support-for-tinystories
By just defining configs and using model weights stored in the checkpoint. Should enable TinyStories LLMs posted in https://huggingface.co/karpathy/tinyllamas Test Plan: `python generate.py --checkpoint_path checkpoints/stories15M/stories15M.pt --prompt "Once upon a time"`
2 parents f479b07 + 11ce176 commit a3e825f

File tree

2 files changed

+4
-0
lines changed

2 files changed

+4
-0
lines changed

generate.py

+2
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ def _load_model(checkpoint_path, device, precision, use_tp):
235235
model = simple_quantizer.convert_for_runtime(use_cuda)
236236

237237
checkpoint = torch.load(str(checkpoint_path), mmap=True, weights_only=True)
238+
if "model" in checkpoint and "stories" in str(checkpoint_path):
239+
checkpoint = checkpoint["model"]
238240
model.load_state_dict(checkpoint, assign=True)
239241

240242
if use_tp:

model.py

+2
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ def from_name(cls, name: str):
6363
"34B": dict(n_layer=48, n_head=64, dim=8192, vocab_size=32000, n_local_heads=8, intermediate_size=22016, rope_base=1000000), # CodeLlama-34B-Python-hf
6464
"70B": dict(n_layer=80, n_head=64, dim=8192, n_local_heads=8, intermediate_size=28672),
6565
"Mistral-7B": dict(n_layer=32, n_head=32, n_local_heads=8, dim=4096, intermediate_size=14336, vocab_size=32000),
66+
"stories15M": dict(n_layer=6, n_head=6, dim=288),
67+
"stories110M": dict(n_layer=12, n_head=12, dim=768),
6668
}
6769

6870
class KVCache(nn.Module):

0 commit comments

Comments
 (0)