
Commit d2cfa9b

token tests

1 parent 4c83b8a

6 files changed, +352 -30 lines

fix_and_test_tokenizer.py

+138
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# fix_and_test_tokenizer.py

from transformers import GPT2Tokenizer
import torch

def load_tokenizer(tokenizer_dir):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)
    # Ensure there is a padding token
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        tokenizer.save_pretrained(tokenizer_dir)
    print("Tokenizer loaded successfully.")
    return tokenizer

def identify_and_fix_problematic_tokens(tokenizer, prompt, tokenizer_dir):
    inputs = tokenizer(prompt, return_tensors=None, padding=True, truncation=True, max_length=512)
    print(f"Tokenized inputs without return_tensors: {inputs}")

    prompt_words = prompt.split()
    problematic_tokens = []
    for i, token_id in enumerate(inputs['input_ids']):
        if token_id is None and i < len(prompt_words):
            problematic_tokens.append(prompt_words[i])
            print(f"Problematic token at position {i}: '{prompt_words[i]}'")

    if problematic_tokens:
        added_tokens_count = tokenizer.add_tokens(problematic_tokens)
        tokenizer.save_pretrained(tokenizer_dir)
        print(f"Added {added_tokens_count} tokens to the vocabulary.")
        return True
    return False

def test_tokenizer(tokenizer, prompt):
    try:
        # Manually ensure that None values are replaced
        inputs = tokenizer(prompt, return_tensors=None, padding=True, truncation=True)
        inputs['input_ids'] = [id if id is not None else tokenizer.pad_token_id for id in inputs['input_ids']]

        # Convert list to tensor manually to ensure correct formatting
        input_ids_tensor = torch.tensor([inputs['input_ids']], dtype=torch.long)

        print(f"Tokenized inputs manually converted to tensor: {input_ids_tensor}")

        decoded_text = tokenizer.decode(input_ids_tensor[0], skip_special_tokens=True)
        print(f"Decoded text: {decoded_text}")
    except Exception as e:
        print(f"Error in tokenizer test: {e}")

if __name__ == "__main__":
    tokenizer_dir = "./converted_model"
    prompt = "Hei, miten voit?"

    tokenizer = load_tokenizer(tokenizer_dir)

    if identify_and_fix_problematic_tokens(tokenizer, prompt, tokenizer_dir):
        tokenizer = load_tokenizer(tokenizer_dir)  # Reload tokenizer after updates
    test_tokenizer(tokenizer, prompt)


# == (old method) ==
# #!/usr/bin/env python3
# # fix_and_test_tokenizer.py

# from transformers import GPT2Tokenizer
# import json

# def load_tokenizer(tokenizer_dir):
#     tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)

#     # Add a padding token if it doesn't exist
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})

#     print("Tokenizer loaded successfully.")
#     return tokenizer

# def identify_problematic_tokens(tokenizer, prompt):
#     inputs = tokenizer(prompt, padding=True, truncation=True, max_length=512, return_tensors=None)
#     print(f"Tokenized inputs without return_tensors: {inputs}")

#     problematic_tokens = []
#     prompt_tokens = prompt.split()
#     for i, token_id in enumerate(inputs['input_ids']):
#         if token_id is None:
#             token_position = min(i, len(prompt_tokens) - 1)
#             problematic_token = prompt_tokens[token_position]
#             problematic_tokens.append(problematic_token)
#             print(f"Problematic token at position {i}: '{problematic_token}'")

#     return problematic_tokens

# def add_missing_tokens(vocab_path, tokens):
#     try:
#         with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
#             vocab = json.load(vocab_file)

#         current_index = max(vocab.values()) + 1

#         for token in tokens:
#             if token not in vocab:
#                 vocab[token] = current_index
#                 print(f"Adding token '{token}' with index {current_index}")
#                 current_index += 1

#         with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
#             json.dump(vocab, vocab_file, ensure_ascii=False, indent=2)

#         print(f"Added {len(tokens)} tokens to the vocabulary.")
#     except Exception as e:
#         print(f"Error adding tokens to vocabulary: {e}")

# def test_tokenizer(tokenizer, prompt):
#     try:
#         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
#         print(f"Tokenized inputs: {inputs}")

#         decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
#         print(f"Decoded text: {decoded_text}")

#     except Exception as e:
#         print(f"Error in tokenizer test: {e}")

# if __name__ == "__main__":
#     tokenizer_dir = "./converted_model"  # Path to your tokenizer files directory
#     vocab_path = "./converted_model/vocab.json"
#     prompt = "Hei, miten voit?"

#     tokenizer = load_tokenizer(tokenizer_dir)
#     problematic_tokens = identify_problematic_tokens(tokenizer, prompt)

#     if problematic_tokens:
#         add_missing_tokens(vocab_path, problematic_tokens)
#         tokenizer = load_tokenizer(tokenizer_dir)  # Reload tokenizer after updating vocab
#         test_tokenizer(tokenizer, prompt)
#     else:
#         print("No problematic tokens found.")
#         test_tokenizer(tokenizer, prompt)
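
Note on the script above: if tokenizer.add_tokens ends up adding anything, the model checkpoint in the same directory still has its original embedding size, so the new IDs would be out of range at generation time. A minimal follow-up sketch, assuming the model in ./converted_model is a standard GPT2LMHeadModel and that the token shown is only a placeholder:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./converted_model")
model = GPT2LMHeadModel.from_pretrained("./converted_model")

# Hypothetical extra token; anything added by fix_and_test_tokenizer.py counts the same way.
added = tokenizer.add_tokens(["esimerkkisana"])
if added > 0:
    # Grow the embedding matrix so the new token IDs are valid inputs and outputs.
    model.resize_token_embeddings(len(tokenizer))
    model.save_pretrained("./converted_model")
    tokenizer.save_pretrained("./converted_model")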

fix_encoding_and_sync_files.py

+71 -16
@@ -1,44 +1,99 @@
 # fix_encoding_and_sync_files.py

 import json
+import os
 import sys

-def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path):
-    # Correct encoding issue by reading the file correctly
+def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path, hparams_path):
+    # Read and process the BPE file
     with open(bpe_path, 'r', encoding='utf-8') as bpe_file:
         bpe_lines = bpe_file.readlines()

     # Remove BOM if present
     if bpe_lines[0].startswith('\ufeff'):
         bpe_lines[0] = bpe_lines[0][1:]
+
+    # Create merges.txt from corrected BPE file
+    with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
+        merges_file.write("#version: 0.2\n")
+        for line in bpe_lines:
+            if not line.startswith("#"):
+                merges_file.write(line)

-    # Create vocab.json from encoder.json
+    # Read the encoder.json file
     with open(encoder_path, 'r', encoding='utf-8') as encoder_file:
         encoder = json.load(encoder_file)

+    # Check the number of entries in the encoder
+    if len(encoder) != 50257:
+        print(f"Warning: Encoder size is {len(encoder)}. Expected 50257.")
+
+    # Create vocab.json from encoder.json
     with open(vocab_json_path, 'w', encoding='utf-8') as vocab_file:
         json.dump(encoder, vocab_file, ensure_ascii=False, indent=2)

-    # Create merges.txt from corrected bpe file
-    with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
-        merges_file.write("#version: 0.2\n")
-        for line in bpe_lines:
-            if not line.startswith("#"):
-                merges_file.write(line)
-
     print(f"Created {vocab_json_path} and {merges_txt_path} with corrected encoding.")

 def main():
-    if len(sys.argv) != 3:
-        print("Usage: python fix_encoding_and_sync_files.py <bpe_path> <encoder_path>")
+    if len(sys.argv) != 2:
+        print("Usage: python fix_encoding_and_sync_files.py <source_directory>")
         sys.exit(1)
+
+    source_directory = sys.argv[1]
+    bpe_path = os.path.join(source_directory, 'vocab.bpe')
+    encoder_path = os.path.join(source_directory, 'encoder.json')
+    hparams_path = os.path.join(source_directory, 'hparams.json')

-    bpe_path = sys.argv[1]
-    encoder_path = sys.argv[2]
+    # Destination paths
     vocab_json_path = "./converted_model/vocab.json"
     merges_txt_path = "./converted_model/merges.txt"

-    fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path)
+    fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path, hparams_path)

 if __name__ == "__main__":
     main()
+
+
+
+# import json
+# import sys
+
+# def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path):
+#     # Correct encoding issue by reading the file correctly
+#     with open(bpe_path, 'r', encoding='utf-8') as bpe_file:
+#         bpe_lines = bpe_file.readlines()
+
+#     # Remove BOM if present
+#     if bpe_lines[0].startswith('\ufeff'):
+#         bpe_lines[0] = bpe_lines[0][1:]
+
+#     # Create vocab.json from encoder.json
+#     with open(encoder_path, 'r', encoding='utf-8') as encoder_file:
+#         encoder = json.load(encoder_file)
+
+#     with open(vocab_json_path, 'w', encoding='utf-8') as vocab_file:
+#         json.dump(encoder, vocab_file, ensure_ascii=False, indent=2)
+
+#     # Create merges.txt from corrected bpe file
+#     with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
+#         merges_file.write("#version: 0.2\n")
+#         for line in bpe_lines:
+#             if not line.startswith("#"):
+#                 merges_file.write(line)
+
+#     print(f"Created {vocab_json_path} and {merges_txt_path} with corrected encoding.")
+
+# def main():
+#     if len(sys.argv) != 3:
+#         print("Usage: python fix_encoding_and_sync_files.py <bpe_path> <encoder_path>")
+#         sys.exit(1)
+
+#     bpe_path = sys.argv[1]
+#     encoder_path = sys.argv[2]
+#     vocab_json_path = "./converted_model/vocab.json"
+#     merges_txt_path = "./converted_model/merges.txt"
+
+#     fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path)
+
+# if __name__ == "__main__":
+#     main()
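
A quick way to check the files this script writes is to construct a tokenizer straight from them. A minimal sketch, assuming the script has already produced ./converted_model/vocab.json and ./converted_model/merges.txt:

from transformers import GPT2Tokenizer

# Load the freshly written vocab/merges pair directly instead of a saved tokenizer directory.
tok = GPT2Tokenizer(vocab_file="./converted_model/vocab.json",
                    merges_file="./converted_model/merges.txt")
print(tok.tokenize("Hei, miten voit?"))
print(tok("Hei, miten voit?")["input_ids"])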

merge_vocabularies.py

+66
@@ -0,0 +1,66 @@
# merge vocabularies
# (this util merges two different vocabulary files from encoder.json)
import json
import requests
import os
import argparse

def download_file(url, dest_path):
    response = requests.get(url)
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(response.content)

def merge_encoders(original_encoder_path, new_encoder_url, output_encoder_path):
    with open(original_encoder_path, 'r', encoding='utf-8') as original_file:
        original_encoder = json.load(original_file)

    new_encoder_path = 'new_encoder.json'
    download_file(new_encoder_url, new_encoder_path)

    with open(new_encoder_path, 'r', encoding='utf-8') as new_file:
        new_encoder = json.load(new_file)

    # Merge the two encoders
    merged_encoder = {**new_encoder, **original_encoder}

    with open(output_encoder_path, 'w', encoding='utf-8') as output_file:
        json.dump(merged_encoder, output_file, ensure_ascii=False, indent=2)

    os.remove(new_encoder_path)

def merge_bpe_files(original_bpe_path, new_bpe_url, output_bpe_path):
    with open(original_bpe_path, 'r', encoding='utf-8') as original_file:
        original_bpe = original_file.readlines()

    new_bpe_path = 'new_bpe.txt'
    download_file(new_bpe_url, new_bpe_path)

    with open(new_bpe_path, 'r', encoding='utf-8') as new_file:
        new_bpe = new_file.readlines()

    # Merge the two BPE lists, removing duplicates
    merged_bpe = list(dict.fromkeys(original_bpe + new_bpe))

    with open(output_bpe_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(merged_bpe)

    os.remove(new_bpe_path)

def main():
    parser = argparse.ArgumentParser(description="Merge vocabulary files.")
    parser.add_argument("original_encoder_path", type=str, help="Path to the original encoder.json")
    parser.add_argument("original_bpe_path", type=str, help="Path to the original vocab.bpe")
    parser.add_argument("output_encoder_path", type=str, help="Path to save the merged encoder.json")
    parser.add_argument("output_bpe_path", type=str, help="Path to save the merged vocab.bpe")

    args = parser.parse_args()

    new_encoder_url = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/vocab.json'
    new_bpe_url = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/merges.txt'

    merge_encoders(args.original_encoder_path, new_encoder_url, args.output_encoder_path)
    merge_bpe_files(args.original_bpe_path, new_bpe_url, args.output_bpe_path)

if __name__ == "__main__":
    main()
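
The dict merge above keeps each token's index from whichever encoder defined it, so two different tokens can end up sharing an ID. A small sanity check along those lines, assuming the merged encoder was written to ./merged_encoder.json (the path is illustrative):

import json
from collections import Counter

with open("./merged_encoder.json", encoding="utf-8") as f:
    merged = json.load(f)

# Count how many tokens map to each index; anything above 1 is a collision.
id_counts = Counter(merged.values())
collisions = [idx for idx, n in id_counts.items() if n > 1]
print(f"{len(merged)} tokens, {len(collisions)} duplicated indices")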

test_pytorch_model.py

+17 -9
@@ -7,8 +7,10 @@
 # (Ghostcode via ChaosWhisperer)


-import torch
+#!/usr/bin/env python3
+
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import torch

 def load_model(model_dir):
     try:

@@ -44,16 +46,22 @@ def generate_text(model, tokenizer, prompt, max_length=50):
             print("Error: Model generation resulted in None or empty outputs.")
             return None

-        # Decode the generated output
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Decode the generated output and handle NoneType tokens
+        generated_text = []
+        for token_id in outputs[0]:
+            token_id = token_id.item()
+            try:
+                token_str = tokenizer.decode([token_id], skip_special_tokens=True)
+            except Exception as e:
+                print(f"Error decoding token ID {token_id}: {e}")
+                token_str = None
+            generated_text.append(token_str)
+
+        # Filter out NoneType tokens and join the text
+        generated_text = [token for token in generated_text if token is not None]
+        generated_text = ''.join(generated_text)
         print(f"Decoded text: {generated_text}")

-        # Print each token's ID and corresponding token
-        for i, token in enumerate(outputs[0]):
-            token_id = token.item()
-            token_str = tokenizer.decode([token_id])
-            print(f"Token {i}: {token_id} - {token_str}")
-
         return generated_text
     except Exception as e:
         print(f"Error generating text: {e}")
