-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimplified_tokenizer_test.py
123 lines (89 loc) · 4.1 KB
/
simplified_tokenizer_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/env python3
# simplified_tokenizer_test.py
from transformers import GPT2Tokenizer
def test_tokenizer(tokenizer_dir):
    """Smoke-test a GPT-2 tokenizer: load it, encode a prompt, decode it back.

    Parameters
    ----------
    tokenizer_dir : str
        Directory containing the tokenizer files (vocab.json, merges.txt, ...).

    Returns
    -------
    str
        The round-tripped (decoded) prompt text, so callers can verify the
        tokenizer programmatically instead of reading stdout.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_dir)

    # GPT-2 ships without a pad token; tokenizing with padding=True would
    # fail without one, so register a [PAD] special token if it is missing.
    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Tokenizer loaded successfully.")

    prompt = "Hei, miten voit?"
    print(f"Prompt: {prompt}")

    # Tokenize the prompt with padding and truncation; "pt" returns
    # PyTorch tensors.
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    print(f"Tokenized inputs: {inputs}")

    # Round-trip check: decode the token ids back to text, dropping any
    # special tokens (e.g. the [PAD] added above).
    decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
    print(f"Decoded text: {decoded_text}")
    return decoded_text
if __name__ == "__main__":
    import sys

    # Tokenizer directory: optional first CLI argument; defaults to the
    # original hard-coded path so existing invocations keep working.
    tokenizer_dir = sys.argv[1] if len(sys.argv) > 1 else "./converted_model"
    test_tokenizer(tokenizer_dir)
# from transformers import AutoTokenizer
# def test_tokenizer(tokenizer_dir):
# try:
# # Use AutoTokenizer to load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
# print("Tokenizer loaded successfully.")
# prompt = "Hei, miten voit?"
# print(f"Prompt: {prompt}")
# # Tokenize the prompt with padding and truncation
# inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
# print(f"Tokenized inputs: {inputs}")
# # Decode the tokenized inputs
# decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
# print(f"Decoded text: {decoded_text}")
# except Exception as e:
# print(f"Error in tokenizer test: {e}")
# if __name__ == "__main__":
# tokenizer_dir = "./converted_model" # Path to your tokenizer files directory
# test_tokenizer(tokenizer_dir)
# from transformers import AutoTokenizer
# def test_tokenizer(tokenizer_dir):
# try:
# # Use AutoTokenizer to load the tokenizer
# tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
# print("Tokenizer loaded successfully.")
# prompt = "Hei, miten voit?"
# print(f"Prompt: {prompt}")
# # Tokenize the prompt
# inputs = tokenizer(prompt, return_tensors="pt")
# print(f"Tokenized inputs: {inputs}")
# # Decode the tokenized inputs
# decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
# print(f"Decoded text: {decoded_text}")
# except Exception as e:
# print(f"Error in tokenizer test: {e}")
# if __name__ == "__main__":
# tokenizer_dir = "./converted_model" # Path to your tokenizer files directory
# test_tokenizer(tokenizer_dir)
# # (older version below: used GPT2Tokenizer directly, before AutoTokenizer)
# from transformers import GPT2Tokenizer
# def test_tokenizer(tokenizer_dir):
# try:
# # Manually load tokenizer configuration
# tokenizer_config = {
# "model_max_length": 1024,
# "padding_side": "right",
# "special_tokens_map_file": None,
# "tokenizer_class": "GPT2Tokenizer",
# "use_fast": False
# }
# tokenizer = GPT2Tokenizer(
# vocab_file=f"{tokenizer_dir}/vocab.json",
# merges_file=f"{tokenizer_dir}/merges.txt",
# tokenizer_config=tokenizer_config
# )
# print("Tokenizer loaded successfully.")
# prompt = "Hei, miten voit?"
# print(f"Prompt: {prompt}")
# # Tokenize the prompt
# inputs = tokenizer(prompt, return_tensors="pt")
# print(f"Tokenized inputs: {inputs}")
# # Decode the tokenized inputs
# decoded_text = tokenizer.decode(inputs['input_ids'][0], skip_special_tokens=True)
# print(f"Decoded text: {decoded_text}")
# except Exception as e:
# print(f"Error in tokenizer test: {e}")
# if __name__ == "__main__":
# tokenizer_dir = "./converted_model" # Path to your tokenizer files directory
# test_tokenizer(tokenizer_dir)