
Commit d2cfa9b

token tests

1 parent 4c83b8a

6 files changed, +352 -30 lines

fix_and_test_tokenizer.py

+138
@@ -0,0 +1,138 @@
#!/usr/bin/env python3
# fix_and_test_tokenizer.py

from transformers import GPT2Tokenizer
import torch

def load_tokenizer(tokenizer_dir):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)
    # Ensure there is a padding token
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        tokenizer.save_pretrained(tokenizer_dir)
    print("Tokenizer loaded successfully.")
    return tokenizer

def identify_and_fix_problematic_tokens(tokenizer, prompt, tokenizer_dir):
    inputs = tokenizer(prompt, return_tensors=None, padding=True, truncation=True, max_length=512)
    print(f"Tokenized inputs without return_tensors: {inputs}")

    prompt_words = prompt.split()
    problematic_tokens = []
    for i, token_id in enumerate(inputs['input_ids']):
        if token_id is None and i < len(prompt_words):
            problematic_tokens.append(prompt_words[i])
            print(f"Problematic token at position {i}: '{prompt_words[i]}'")

    if problematic_tokens:
        added_tokens_count = tokenizer.add_tokens(problematic_tokens)
        tokenizer.save_pretrained(tokenizer_dir)
        print(f"Added {added_tokens_count} tokens to the vocabulary.")
        return True
    return False

def test_tokenizer(tokenizer, prompt):
    try:
        # Manually ensure that None values are replaced
        inputs = tokenizer(prompt, return_tensors=None, padding=True, truncation=True)
        inputs['input_ids'] = [id if id is not None else tokenizer.pad_token_id for id in inputs['input_ids']]

        # Convert list to tensor manually to ensure correct formatting
        input_ids_tensor = torch.tensor([inputs['input_ids']], dtype=torch.long)

        print(f"Tokenized inputs manually converted to tensor: {input_ids_tensor}")

        decoded_text = tokenizer.decode(input_ids_tensor[0], skip_special_tokens=True)
        print(f"Decoded text: {decoded_text}")
    except Exception as e:
        print(f"Error in tokenizer test: {e}")

if __name__ == "__main__":
    tokenizer_dir = "./converted_model"
    prompt = "Hei, miten voit?"

    tokenizer = load_tokenizer(tokenizer_dir)

    if identify_and_fix_problematic_tokens(tokenizer, prompt, tokenizer_dir):
        tokenizer = load_tokenizer(tokenizer_dir)  # Reload tokenizer after updates
    test_tokenizer(tokenizer, prompt)


# == (old method) ==
# #!/usr/bin/env python3
# # fix_and_test_tokenizer.py

# from transformers import GPT2Tokenizer
# import json

# def load_tokenizer(tokenizer_dir):
#     tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)

#     # Add a padding token if it doesn't exist
#     if tokenizer.pad_token is None:
#         tokenizer.add_special_tokens({'pad_token': '[PAD]'})

#     print("Tokenizer loaded successfully.")
#     return tokenizer

# def identify_problematic_tokens(tokenizer, prompt):
#     inputs = tokenizer(prompt, padding=True, truncation=True, max_length=512, return_tensors=None)
#     print(f"Tokenized inputs without return_tensors: {inputs}")

#     problematic_tokens = []
#     prompt_tokens = prompt.split()
#     for i, token_id in enumerate(inputs['input_ids']):
#         if token_id is None:
#             token_position = min(i, len(prompt_tokens) - 1)
#             problematic_token = prompt_tokens[token_position]
#             problematic_tokens.append(problematic_token)
#             print(f"Problematic token at position {i}: '{problematic_token}'")

#     return problematic_tokens

# def add_missing_tokens(vocab_path, tokens):
#     try:
#         with open(vocab_path, 'r', encoding='utf-8') as vocab_file:
#             vocab = json.load(vocab_file)

#         current_index = max(vocab.values()) + 1

#         for token in tokens:
#             if token not in vocab:
#                 vocab[token] = current_index
#                 print(f"Adding token '{token}' with index {current_index}")
#                 current_index += 1

#         with open(vocab_path, 'w', encoding='utf-8') as vocab_file:
#             json.dump(vocab, vocab_file, ensure_ascii=False, indent=2)

#         print(f"Added {len(tokens)} tokens to the vocabulary.")
#     except Exception as e:
#         print(f"Error adding tokens to vocabulary: {e}")

# def test_tokenizer(tokenizer, prompt):
#     try:
#         inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
#         print(f"Tokenized inputs: {inputs}")

#         decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
#         print(f"Decoded text: {decoded_text}")

#     except Exception as e:
#         print(f"Error in tokenizer test: {e}")

# if __name__ == "__main__":
#     tokenizer_dir = "./converted_model"  # Path to your tokenizer files directory
#     vocab_path = "./converted_model/vocab.json"
#     prompt = "Hei, miten voit?"

#     tokenizer = load_tokenizer(tokenizer_dir)
#     problematic_tokens = identify_problematic_tokens(tokenizer, prompt)

#     if problematic_tokens:
#         add_missing_tokens(vocab_path, problematic_tokens)
#         tokenizer = load_tokenizer(tokenizer_dir)  # Reload tokenizer after updating vocab
#         test_tokenizer(tokenizer, prompt)
#     else:
#         print("No problematic tokens found.")
#         test_tokenizer(tokenizer, prompt)
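
Note on the script above: if tokenizer.add_tokens ends up adding anything, the model checkpoint in the same directory still has its original embedding size, so the new IDs would be out of range at generation time. A minimal follow-up sketch, assuming the model in ./converted_model is a standard GPT2LMHeadModel and that the token shown is only a placeholder:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("./converted_model")
model = GPT2LMHeadModel.from_pretrained("./converted_model")

# Hypothetical extra token; anything added by fix_and_test_tokenizer.py counts the same way.
added = tokenizer.add_tokens(["esimerkkisana"])
if added > 0:
    # Grow the embedding matrix so the new token IDs are valid inputs and outputs.
    model.resize_token_embeddings(len(tokenizer))
    model.save_pretrained("./converted_model")
    tokenizer.save_pretrained("./converted_model")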

fix_encoding_and_sync_files.py

+71 -16
@@ -1,44 +1,99 @@
 # fix_encoding_and_sync_files.py

 import json
+import os
 import sys

-def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path):
-    # Correct encoding issue by reading the file correctly
+def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path, hparams_path):
+    # Read and process the BPE file
     with open(bpe_path, 'r', encoding='utf-8') as bpe_file:
         bpe_lines = bpe_file.readlines()

     # Remove BOM if present
     if bpe_lines[0].startswith('\ufeff'):
         bpe_lines[0] = bpe_lines[0][1:]
+
+    # Create merges.txt from corrected BPE file
+    with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
+        merges_file.write("#version: 0.2\n")
+        for line in bpe_lines:
+            if not line.startswith("#"):
+                merges_file.write(line)

-    # Create vocab.json from encoder.json
+    # Read the encoder.json file
     with open(encoder_path, 'r', encoding='utf-8') as encoder_file:
         encoder = json.load(encoder_file)

+    # Check the number of entries in the encoder
+    if len(encoder) != 50257:
+        print(f"Warning: Encoder size is {len(encoder)}. Expected 50257.")
+
+    # Create vocab.json from encoder.json
     with open(vocab_json_path, 'w', encoding='utf-8') as vocab_file:
         json.dump(encoder, vocab_file, ensure_ascii=False, indent=2)

-    # Create merges.txt from corrected bpe file
-    with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
-        merges_file.write("#version: 0.2\n")
-        for line in bpe_lines:
-            if not line.startswith("#"):
-                merges_file.write(line)
-
     print(f"Created {vocab_json_path} and {merges_txt_path} with corrected encoding.")

 def main():
-    if len(sys.argv) != 3:
-        print("Usage: python fix_encoding_and_sync_files.py <bpe_path> <encoder_path>")
+    if len(sys.argv) != 2:
+        print("Usage: python fix_encoding_and_sync_files.py <source_directory>")
         sys.exit(1)
+
+    source_directory = sys.argv[1]
+    bpe_path = os.path.join(source_directory, 'vocab.bpe')
+    encoder_path = os.path.join(source_directory, 'encoder.json')
+    hparams_path = os.path.join(source_directory, 'hparams.json')

-    bpe_path = sys.argv[1]
-    encoder_path = sys.argv[2]
+    # Destination paths
     vocab_json_path = "./converted_model/vocab.json"
     merges_txt_path = "./converted_model/merges.txt"

-    fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path)
+    fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path, hparams_path)

 if __name__ == "__main__":
     main()
+
+
+
+# import json
+# import sys
+
+# def fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path):
+#     # Correct encoding issue by reading the file correctly
+#     with open(bpe_path, 'r', encoding='utf-8') as bpe_file:
+#         bpe_lines = bpe_file.readlines()
+
+#     # Remove BOM if present
+#     if bpe_lines[0].startswith('\ufeff'):
+#         bpe_lines[0] = bpe_lines[0][1:]
+
+#     # Create vocab.json from encoder.json
+#     with open(encoder_path, 'r', encoding='utf-8') as encoder_file:
+#         encoder = json.load(encoder_file)
+
+#     with open(vocab_json_path, 'w', encoding='utf-8') as vocab_file:
+#         json.dump(encoder, vocab_file, ensure_ascii=False, indent=2)
+
+#     # Create merges.txt from corrected bpe file
+#     with open(merges_txt_path, 'w', encoding='utf-8') as merges_file:
+#         merges_file.write("#version: 0.2\n")
+#         for line in bpe_lines:
+#             if not line.startswith("#"):
+#                 merges_file.write(line)
+
+#     print(f"Created {vocab_json_path} and {merges_txt_path} with corrected encoding.")
+
+# def main():
+#     if len(sys.argv) != 3:
+#         print("Usage: python fix_encoding_and_sync_files.py <bpe_path> <encoder_path>")
+#         sys.exit(1)
+
+#     bpe_path = sys.argv[1]
+#     encoder_path = sys.argv[2]
+#     vocab_json_path = "./converted_model/vocab.json"
+#     merges_txt_path = "./converted_model/merges.txt"
+
+#     fix_encoding_and_create_files(bpe_path, encoder_path, vocab_json_path, merges_txt_path)
+
+# if __name__ == "__main__":
+#     main()
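
A quick way to check the files this script writes is to construct a tokenizer straight from them. A minimal sketch, assuming the script has already produced ./converted_model/vocab.json and ./converted_model/merges.txt:

from transformers import GPT2Tokenizer

# Load the freshly written vocab/merges pair directly instead of a saved tokenizer directory.
tok = GPT2Tokenizer(vocab_file="./converted_model/vocab.json",
                    merges_file="./converted_model/merges.txt")
print(tok.tokenize("Hei, miten voit?"))
print(tok("Hei, miten voit?")["input_ids"])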

merge_vocabularies.py

+66
@@ -0,0 +1,66 @@
# merge vocabularies
# (this util merges two different vocabulary files from encoder.json)
import json
import requests
import os
import argparse

def download_file(url, dest_path):
    response = requests.get(url)
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(response.content)

def merge_encoders(original_encoder_path, new_encoder_url, output_encoder_path):
    with open(original_encoder_path, 'r', encoding='utf-8') as original_file:
        original_encoder = json.load(original_file)

    new_encoder_path = 'new_encoder.json'
    download_file(new_encoder_url, new_encoder_path)

    with open(new_encoder_path, 'r', encoding='utf-8') as new_file:
        new_encoder = json.load(new_file)

    # Merge the two encoders
    merged_encoder = {**new_encoder, **original_encoder}

    with open(output_encoder_path, 'w', encoding='utf-8') as output_file:
        json.dump(merged_encoder, output_file, ensure_ascii=False, indent=2)

    os.remove(new_encoder_path)

def merge_bpe_files(original_bpe_path, new_bpe_url, output_bpe_path):
    with open(original_bpe_path, 'r', encoding='utf-8') as original_file:
        original_bpe = original_file.readlines()

    new_bpe_path = 'new_bpe.txt'
    download_file(new_bpe_url, new_bpe_path)

    with open(new_bpe_path, 'r', encoding='utf-8') as new_file:
        new_bpe = new_file.readlines()

    # Merge the two BPE lists, removing duplicates
    merged_bpe = list(dict.fromkeys(original_bpe + new_bpe))

    with open(output_bpe_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(merged_bpe)

    os.remove(new_bpe_path)

def main():
    parser = argparse.ArgumentParser(description="Merge vocabulary files.")
    parser.add_argument("original_encoder_path", type=str, help="Path to the original encoder.json")
    parser.add_argument("original_bpe_path", type=str, help="Path to the original vocab.bpe")
    parser.add_argument("output_encoder_path", type=str, help="Path to save the merged encoder.json")
    parser.add_argument("output_bpe_path", type=str, help="Path to save the merged vocab.bpe")

    args = parser.parse_args()

    new_encoder_url = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/vocab.json'
    new_bpe_url = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/merges.txt'

    merge_encoders(args.original_encoder_path, new_encoder_url, args.output_encoder_path)
    merge_bpe_files(args.original_bpe_path, new_bpe_url, args.output_bpe_path)

if __name__ == "__main__":
    main()
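
The dict merge above keeps each token's index from whichever encoder defined it, so two different tokens can end up sharing an ID. A small sanity check along those lines, assuming the merged encoder was written to ./merged_encoder.json (the path is illustrative):

import json
from collections import Counter

with open("./merged_encoder.json", encoding="utf-8") as f:
    merged = json.load(f)

# Count how many tokens map to each index; anything above 1 is a collision.
id_counts = Counter(merged.values())
collisions = [idx for idx, n in id_counts.items() if n > 1]
print(f"{len(merged)} tokens, {len(collisions)} duplicated indices")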

test_pytorch_model.py

+17 -9
@@ -7,8 +7,10 @@
 # (Ghostcode via ChaosWhisperer)


-import torch
+#!/usr/bin/env python3
+
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
+import torch

 def load_model(model_dir):
     try:

@@ -44,16 +46,22 @@ def generate_text(model, tokenizer, prompt, max_length=50):
             print("Error: Model generation resulted in None or empty outputs.")
             return None

-        # Decode the generated output
-        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+        # Decode the generated output and handle NoneType tokens
+        generated_text = []
+        for token_id in outputs[0]:
+            token_id = token_id.item()
+            try:
+                token_str = tokenizer.decode([token_id], skip_special_tokens=True)
+            except Exception as e:
+                print(f"Error decoding token ID {token_id}: {e}")
+                token_str = None
+            generated_text.append(token_str)
+
+        # Filter out NoneType tokens and join the text
+        generated_text = [token for token in generated_text if token is not None]
+        generated_text = ''.join(generated_text)
         print(f"Decoded text: {generated_text}")

-        # Print each token's ID and corresponding token
-        for i, token in enumerate(outputs[0]):
-            token_id = token.item()
-            token_str = tokenizer.decode([token_id])
-            print(f"Token {i}: {token_id} - {token_str}")
-
         return generated_text
     except Exception as e:
         print(f"Error generating text: {e}")
