Commit 8369908

Merge branch 'master' of github.com:ggerganov/llama.cpp into grammar-example

* 'master' of github.com:ggerganov/llama.cpp:
  kv cache slot search improvements (ggml-org#3493)
  prompts : fix editorconfig checks after ggml-org#3416
  parallel : add option to load external prompt file (ggml-org#3416)
  server : reuse llama_sample_token common util (ggml-org#3494)
  llama : correct hparams comparison (ggml-org#3446)
  ci : fix xcodebuild destinations (ggml-org#3491)
  convert : update Falcon script for new HF config (ggml-org#3448)
  build : use std::make_tuple() for compatibility with older GCC versions (ggml-org#3488)
  common : process escape sequences in reverse prompts (ggml-org#3461)
  CLBlast: Fix handling of on-device tensor data
  server : fix incorrect num_tokens_predicted (ggml-org#3480)
  swift : disable ACCELERATE_NEW_LAPACK (ggml-org#3481)
  ci : add swift build via xcodebuild (ggml-org#3482)

2 parents: 94daebe + 9ca79d5

File tree: 12 files changed, +415 -216 lines

.github/workflows/build.yml (+25 -2)

@@ -10,10 +10,10 @@ on:
   push:
     branches:
       - master
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift']

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -253,6 +253,29 @@ jobs:
           -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
           cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)

+  macOS-latest-swift:
+    runs-on: macos-latest
+
+    strategy:
+      matrix:
+        destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS']
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        continue-on-error: true
+        run: |
+          brew update
+
+      - name: xcodebuild for swift package
+        id: xcodebuild
+        run: |
+          xcodebuild -scheme llama -destination "${{ matrix.destination }}"
+
   windows-latest-cmake:
     runs-on: windows-latest

Package.swift (+6 -3)

@@ -44,9 +44,12 @@ let package = Package(
         cSettings: [
             .unsafeFlags(["-Wno-shorten-64-to-32"]),
             .define("GGML_USE_K_QUANTS"),
-            .define("GGML_USE_ACCELERATE"),
-            .define("ACCELERATE_NEW_LAPACK"),
-            .define("ACCELERATE_LAPACK_ILP64")
+            .define("GGML_USE_ACCELERATE")
+            // NOTE: NEW_LAPACK will required iOS version 16.4+
+            // We should consider add this in the future when we drop support for iOS 14
+            // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
+            // .define("ACCELERATE_NEW_LAPACK"),
+            // .define("ACCELERATE_LAPACK_ILP64")
         ] + additionalSettings,
         linkerSettings: [
             .linkedFramework("Accelerate")

common/common.cpp (+12 -6)

@@ -167,6 +167,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
+            // store the external file name in params
+            params.prompt_file = argv[i];
             std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
             if (params.prompt.back() == '\n') {
                 params.prompt.pop_back();
@@ -361,7 +363,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({argv[i], 1.0f});
+            params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
             params.use_mmap = false;
         } else if (arg == "--lora-scaled") {
             if (++i >= argc) {
@@ -373,7 +375,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            params.lora_adapter.push_back({lora_adapter, std::stof(argv[i])});
+            params.lora_adapter.push_back(std::make_tuple(lora_adapter, std::stof(argv[i])));
             params.use_mmap = false;
         } else if (arg == "--lora-base") {
             if (++i >= argc) {
@@ -616,6 +618,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
         process_escapes(params.prompt);
         process_escapes(params.input_prefix);
         process_escapes(params.input_suffix);
+        for (auto & antiprompt : params.antiprompt) {
+            process_escapes(antiprompt);
+        }
     }

     return true;
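Note: process_escapes() in common.cpp rewrites literal backslash sequences typed on the command line (for example the two characters "\" and "n") into their control-character equivalents; this hunk simply extends that treatment to each reverse prompt. A rough Python analogue of that kind of rewrite, for illustration only (the real C++ helper handles more escape sequences):

def process_escapes_sketch(text: str) -> str:
    # Map the common literal escapes to their control characters.
    escapes = {"n": "\n", "t": "\t", "\\": "\\", '"': '"', "'": "'"}
    out = []
    i = 0
    while i < len(text):
        if text[i] == "\\" and i + 1 < len(text) and text[i + 1] in escapes:
            out.append(escapes[text[i + 1]])
            i += 2
        else:
            out.append(text[i])
            i += 1
    return "".join(out)

print(process_escapes_sketch(r"User:\nAssistant:"))  # the reverse prompt now contains a real newline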
@@ -1020,10 +1025,11 @@ llama_token llama_sample_token(
             id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu);
         } else {
             // Temperature sampling
-            llama_sample_top_k      (ctx, &cur_p, top_k, 1);
-            llama_sample_tail_free  (ctx, &cur_p, tfs_z, 1);
-            llama_sample_typical    (ctx, &cur_p, typical_p, 1);
-            llama_sample_top_p      (ctx, &cur_p, top_p, 1);
+            size_t min_keep = std::max(1, params.n_probs);
+            llama_sample_top_k      (ctx, &cur_p, top_k, min_keep);
+            llama_sample_tail_free  (ctx, &cur_p, tfs_z, min_keep);
+            llama_sample_typical    (ctx, &cur_p, typical_p, min_keep);
+            llama_sample_top_p      (ctx, &cur_p, top_p, min_keep);
             llama_sample_temp(ctx, &cur_p, temp);

             {
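The last argument to these samplers is min_keep, the minimum number of candidates each filter must leave in the list. Tying it to params.n_probs keeps enough candidates alive to report the requested number of per-token probabilities instead of hard-coding 1. A toy illustration of that contract (plain Python, not the llama.cpp API):

def apply_filter(candidates, cutoff, min_keep):
    # A filter may shrink the candidate list, but never below min_keep entries.
    keep = max(cutoff, min_keep, 1)
    return candidates[:keep]

candidates = ["tok_a", "tok_b", "tok_c", "tok_d", "tok_e"]
# The cutoff alone would keep a single candidate here, but 3 probabilities were requested:
print(apply_filter(candidates, cutoff=1, min_keep=3))  # ['tok_a', 'tok_b', 'tok_c']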

common/common.h (+1)

@@ -79,6 +79,7 @@ struct gpt_params {
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
+    std::string prompt_file = ""; // store the external prompt file name
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
     std::string input_prefix = ""; // string to prefix user inputs with
     std::string input_suffix = ""; // string to suffix user inputs with

convert-falcon-hf-to-gguf.py (+79 -64)

@@ -4,6 +4,7 @@
 from __future__ import annotations

 import argparse
+import contextlib
 import json
 import os
 import struct
@@ -20,10 +21,10 @@
 import gguf


-def count_model_parts(dir_model: Path) -> int:
+def count_model_parts(dir_model: Path, prefix: str) -> int:
     num_parts = 0
     for filename in os.listdir(dir_model):
-        if filename.startswith("pytorch_model-"):
+        if filename.startswith(prefix):
             num_parts += 1

     if num_parts > 0:
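With the new prefix argument, the same helper counts either safetensors or PyTorch shards. A small usage sketch (the directory name and shard layout are hypothetical, following the usual Hugging Face naming):

import os
from pathlib import Path

def count_model_parts(dir_model: Path, prefix: str) -> int:
    # Condensed version of the helper above: count shard files by name prefix.
    return sum(1 for filename in os.listdir(dir_model) if filename.startswith(prefix))

dir_model = Path("falcon-7b")                                   # hypothetical local checkpoint
num_parts = count_model_parts(dir_model, "model-00")            # model-00001-of-00002.safetensors, ...
if not num_parts:
    num_parts = count_model_parts(dir_model, "pytorch_model-")  # pytorch_model-00001-of-00002.bin, ...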
@@ -77,30 +78,36 @@ def parse_args() -> argparse.Namespace:
 with open(dir_model / "config.json", "r", encoding="utf-8") as f:
     hparams = json.load(f)

-if hparams["architectures"][0] != "RWForCausalLM":
+if hparams["architectures"][0] != "FalconForCausalLM":
     print("Model architecture not supported: " + hparams["architectures"][0])

     sys.exit(1)

 # get number of model parts
-num_parts = count_model_parts(dir_model)
+num_parts = count_model_parts(dir_model, "model-00")
+if num_parts:
+    is_safetensors = True
+    from safetensors import safe_open
+else:
+    is_safetensors = False
+    num_parts = count_model_parts(dir_model, "pytorch_model-")

 ARCH=gguf.MODEL_ARCH.FALCON
 gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])

 print("gguf: get model metadata")

-block_count = hparams["n_layer"]
+block_count = hparams["num_hidden_layers"]

 gguf_writer.add_name("Falcon")
 gguf_writer.add_context_length(2048) # not in config.json
 gguf_writer.add_tensor_data_layout("jploski") # qkv tensor transform
 gguf_writer.add_embedding_length(hparams["hidden_size"])
 gguf_writer.add_feed_forward_length(4 * hparams["hidden_size"])
 gguf_writer.add_block_count(block_count)
-gguf_writer.add_head_count(hparams["n_head"])
-if "n_head_kv" in hparams:
-    gguf_writer.add_head_count_kv(hparams["n_head_kv"])
+gguf_writer.add_head_count(hparams["num_attention_heads"])
+if "num_kv_heads" in hparams:
+    gguf_writer.add_head_count_kv(hparams["num_kv_heads"])
 else:
     gguf_writer.add_head_count_kv(1)
 gguf_writer.add_layer_norm_eps(hparams["layer_norm_epsilon"])
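This branch imports safe_open only when safetensors shards are present; the rewritten loading loop further down then uses contextlib.nullcontext so both formats go through a single with-block. A condensed sketch of that pattern (the shard path is hypothetical; requires torch and safetensors):

import contextlib

import torch
from safetensors import safe_open

def open_model_part(path, is_safetensors: bool):
    # safe_open() is already a context manager; nullcontext() wraps the plain
    # dict returned by torch.load() so both can be entered the same way.
    if is_safetensors:
        return safe_open(path, framework="pt", device="cpu")
    return contextlib.nullcontext(torch.load(path, map_location="cpu"))

with open_model_part("model-00001-of-00002.safetensors", is_safetensors=True) as part:
    for name in part.keys():
        data = part.get_tensor(name)  # a torch checkpoint would use part[name] instead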
@@ -146,8 +153,8 @@ def parse_args() -> argparse.Namespace:
 tensor_map = gguf.get_tensor_name_map(ARCH,block_count)

 # params for qkv transform
-n_head = hparams["n_head"]
-n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
+n_head = hparams["num_attention_heads"]
+n_head_kv = hparams["num_kv_heads"] if "num_kv_heads" in hparams else 1

 head_dim = hparams["hidden_size"] // n_head

@@ -156,6 +163,10 @@ def parse_args() -> argparse.Namespace:

 if num_parts == 0:
     part_names = iter(("pytorch_model.bin",))
+elif is_safetensors:
+    part_names = (
+        f"model-{n:05}-of-{num_parts:05}.safetensors" for n in range(1, num_parts + 1)
+    )
 else:
     part_names = (
         f"pytorch_model-{n:05}-of-{num_parts:05}.bin" for n in range(1, num_parts + 1)
@@ -165,60 +176,64 @@ def parse_args() -> argparse.Namespace:
     if args.vocab_only:
         break
     print("gguf: loading model part '" + part_name + "'")
-    model_part = torch.load(dir_model / part_name, map_location="cpu")
-
-    for name in model_part.keys():
-        data = model_part[name]
-
-        old_dtype = data.dtype
-
-        # convert any unsupported data types to float32
-        if data.dtype != torch.float16 and data.dtype != torch.float32:
-            data = data.to(torch.float32)
-
-        # QKV tensor transform
-        # The original query_key_value tensor contains n_head_kv "kv groups",
-        # each consisting of n_head/n_head_kv query weights followed by one key
-        # and one value weight (shared by all query heads in the kv group).
-        # This layout makes it a big pain to work with in GGML.
-        # So we rearrange them here,, so that we have n_head query weights
-        # followed by n_head_kv key weights followed by n_head_kv value weights,
-        # in contiguous fashion.
-        # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
-
-        if "query_key_value" in name:
-            qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
-            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
-            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
-            data = torch.cat((q,k,v)).reshape_as(data)
-
-        data = data.squeeze().numpy()
-
-        # map tensor names
-        new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
-        if new_name is None:
-            print("Can not map tensor '" + name + "'")
-            sys.exit()
-
-        n_dims = len(data.shape)
-        data_dtype = data.dtype
-
-        # if f32 desired, convert any float16 to float32
-        if ftype == 0 and data_dtype == np.float16:
-            data = data.astype(np.float32)
-
-        # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
-        if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
-            data = data.astype(np.float32)
-
-        # if f16 desired, convert any float32 2-dim weight tensors to float16
-        if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
-            data = data.astype(np.float16)
-
-        print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
-
-        gguf_writer.add_tensor(new_name, data)
+    if is_safetensors:
+        ctx = safe_open(dir_model / part_name, framework="pt", device="cpu")
+    else:
+        ctx = contextlib.nullcontext(torch.load(dir_model / part_name, map_location="cpu"))
+
+    with ctx as model_part:
+        for name in model_part.keys():
+            data = model_part.get_tensor(name) if is_safetensors else model_part[name]
+
+            old_dtype = data.dtype
+
+            # convert any unsupported data types to float32
+            if data.dtype != torch.float16 and data.dtype != torch.float32:
+                data = data.to(torch.float32)
+
+            # QKV tensor transform
+            # The original query_key_value tensor contains n_head_kv "kv groups",
+            # each consisting of n_head/n_head_kv query weights followed by one key
+            # and one value weight (shared by all query heads in the kv group).
+            # This layout makes it a big pain to work with in GGML.
+            # So we rearrange them here,, so that we have n_head query weights
+            # followed by n_head_kv key weights followed by n_head_kv value weights,
+            # in contiguous fashion.
+            # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py
+
+            if "query_key_value" in name:
+                qkv = data.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
+                q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
+                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
+                data = torch.cat((q,k,v)).reshape_as(data)
+
+            data = data.squeeze().numpy()
+
+            # map tensor names
+            new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
+            if new_name is None:
+                print("Can not map tensor '" + name + "'")
+                sys.exit()
+
+            n_dims = len(data.shape)
+            data_dtype = data.dtype
+
+            # if f32 desired, convert any float16 to float32
+            if ftype == 0 and data_dtype == np.float16:
+                data = data.astype(np.float32)
+
+            # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
+            if ftype == 1 and data_dtype == np.float16 and n_dims == 1:
+                data = data.astype(np.float32)
+
+            # if f16 desired, convert any float32 2-dim weight tensors to float16
+            if ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
+                data = data.astype(np.float16)
+
+            print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
+
+            gguf_writer.add_tensor(new_name, data)


 print("gguf: write header")

examples/jeopardy/README.md (+1 -1)

@@ -2,7 +2,7 @@

 This is pretty much just a straight port of aigoopy/llm-jeopardy/ with an added graph viewer.

-The jeopardy test can be used to compare the fact knowledge of different models and compare them to eachother. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.
+The jeopardy test can be used to compare the fact knowledge of different models and compare them to each other. This is in contrast to some other tests, which test logical deduction, creativity, writing skills, etc.


 Step 1: Open jeopardy.sh and modify the following:
