-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge_vocabularies.py
66 lines (48 loc) · 2.49 KB
/
merge_vocabularies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
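# Merge vocabularies
# (This utility merges a local vocabulary -- an encoder.json token map and a
# vocab.bpe merge list -- with a second vocabulary downloaded from a URL, and
# writes the merged encoder and merge list to the given output paths.)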
import argparse
import json
import os
import tempfile

import requests
def download_file(url, dest_path, timeout=30):
    """Download ``url`` and write the response body to ``dest_path``.

    Args:
        url: URL to fetch with an HTTP GET request.
        dest_path: Path of the file to create (or overwrite) with the
            response body, written in binary mode.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection and the whole script hangs.
    response = requests.get(url, timeout=timeout)
    # Check the status before opening the destination so a failed download
    # never creates or truncates the target file.
    response.raise_for_status()
    with open(dest_path, 'wb') as f:
        f.write(response.content)
def merge_encoders(original_encoder_path, new_encoder_url, output_encoder_path):
    """Merge a local token-to-id vocabulary with one downloaded from a URL.

    Every token of the original vocabulary keeps its id. Tokens that exist
    only in the downloaded vocabulary are appended with fresh ids that start
    right after the largest original id, in the order of their ids in the
    downloaded vocabulary. (Keeping the downloaded ids unchanged would let
    them coincide with ids already taken by original tokens, so two different
    tokens could share one id.)

    Assumes both JSON files are flat objects mapping token strings to
    integer ids -- TODO confirm for vocabularies from other sources.

    Args:
        original_encoder_path: Path to the original encoder JSON file.
        new_encoder_url: URL of the vocabulary JSON file to merge in.
        output_encoder_path: Path where the merged JSON is written.
    """
    with open(original_encoder_path, 'r', encoding='utf-8') as original_file:
        original_encoder = json.load(original_file)

    # Download into a private temporary directory: this cannot overwrite a
    # user's file in the working directory, and the download is removed even
    # when fetching or parsing raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        new_encoder_path = os.path.join(tmp_dir, 'new_encoder.json')
        download_file(new_encoder_url, new_encoder_path)
        with open(new_encoder_path, 'r', encoding='utf-8') as new_file:
            new_encoder = json.load(new_file)

    # Merge the two encoders; original entries always win.
    merged_encoder = dict(original_encoder)
    # default=-1 makes an empty original vocabulary start numbering at 0.
    next_id = max(original_encoder.values(), default=-1) + 1
    for token, _ in sorted(new_encoder.items(), key=lambda item: item[1]):
        if token not in merged_encoder:
            merged_encoder[token] = next_id
            next_id += 1

    with open(output_encoder_path, 'w', encoding='utf-8') as output_file:
        json.dump(merged_encoder, output_file, ensure_ascii=False, indent=2)
def merge_bpe_files(original_bpe_path, new_bpe_url, output_bpe_path):
    """Merge a local BPE merge list with one downloaded from a URL.

    The output contains the original lines first, followed by the downloaded
    lines that are not already present, one entry per line. Duplicates and
    empty lines are dropped; the first occurrence of each line is kept.

    Args:
        original_bpe_path: Path to the original merge list (text file).
        new_bpe_url: URL of the merge list to merge in.
        output_bpe_path: Path where the merged list is written.
    """
    # Strip line endings while reading. Comparing raw ``readlines`` output
    # treats "a b" (last line, no trailing newline) and "a b\n" as different
    # entries, and writing them back glues an unterminated last line to the
    # line that follows it.
    # Only "\n" is stripped (not ``str.splitlines``) so that unusual Unicode
    # characters inside tokens are never treated as line breaks.
    with open(original_bpe_path, 'r', encoding='utf-8') as original_file:
        original_bpe = [line.rstrip('\n') for line in original_file]

    # Download into a private temporary directory: this cannot overwrite a
    # user's file in the working directory, and the download is removed even
    # when fetching or reading raises.
    with tempfile.TemporaryDirectory() as tmp_dir:
        new_bpe_path = os.path.join(tmp_dir, 'new_bpe.txt')
        download_file(new_bpe_url, new_bpe_path)
        with open(new_bpe_path, 'r', encoding='utf-8') as new_file:
            new_bpe = [line.rstrip('\n') for line in new_file]

    # A "#version" header at the top of the downloaded file would otherwise
    # land in the middle of the merged list, where it is not a valid merge
    # entry. It is kept only when there is no original content to follow.
    if original_bpe and new_bpe and new_bpe[0].startswith('#version'):
        new_bpe = new_bpe[1:]

    # Merge the two BPE lists, removing duplicates (dict keys keep the
    # order of first occurrence).
    merged_bpe = list(
        dict.fromkeys(line for line in original_bpe + new_bpe if line)
    )

    with open(output_bpe_path, 'w', encoding='utf-8') as output_file:
        output_file.writelines(line + '\n' for line in merged_bpe)
# Defaults used when no URL is given on the command line.
_DEFAULT_NEW_ENCODER_URL = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/vocab.json'
_DEFAULT_NEW_BPE_URL = 'https://huggingface.co/Finnish-NLP/gpt2-finnish/raw/main/merges.txt'


def main():
    """Parse command-line arguments and merge both vocabulary files.

    Positional arguments give the original encoder and BPE files and the two
    output paths. The vocabulary to merge in is downloaded from
    ``--new-encoder-url`` and ``--new-bpe-url``; both are optional and
    default to the URLs defined above.
    """
    parser = argparse.ArgumentParser(description="Merge vocabulary files.")
    parser.add_argument("original_encoder_path", type=str, help="Path to the original encoder.json")
    parser.add_argument("original_bpe_path", type=str, help="Path to the original vocab.bpe")
    parser.add_argument("output_encoder_path", type=str, help="Path to save the merged encoder.json")
    parser.add_argument("output_bpe_path", type=str, help="Path to save the merged vocab.bpe")
    parser.add_argument(
        "--new-encoder-url",
        type=str,
        default=_DEFAULT_NEW_ENCODER_URL,
        help="URL of the vocabulary JSON to merge in (default: %(default)s)",
    )
    parser.add_argument(
        "--new-bpe-url",
        type=str,
        default=_DEFAULT_NEW_BPE_URL,
        help="URL of the BPE merge list to merge in (default: %(default)s)",
    )
    args = parser.parse_args()

    merge_encoders(args.original_encoder_path, args.new_encoder_url, args.output_encoder_path)
    merge_bpe_files(args.original_bpe_path, args.new_bpe_url, args.output_bpe_path)


# Run the merge only when executed as a script, not when imported.
if __name__ == "__main__":
    main()