"""
FineWeb-Edu dataset (for srs pretraining)
https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
Downloads and tokenizes the data and saves data shards to disk.
Run simply as:
$ python fineweb.py
Will save shards to the local directory "edu_fineweb10B".
"""
import os
import multiprocessing as mp

import numpy as np
import tiktoken
import h5py
from datasets import load_dataset  # pip install datasets
from tqdm import tqdm  # pip install tqdm

def tokenize(doc):
    '''Tokenizes a single document and returns a numpy array of uint16 tokens.'''
    enc = tiktoken.get_encoding("gpt2")  # init the tokenizer (tiktoken caches the encoding, so this is cheap per call)
    eot = enc._special_tokens['<|endoftext|>']  # end-of-text token
    tokens = [eot]  # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc["text"]))
    tokens_np = np.array(tokens)
    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
    return tokens_np.astype(np.uint16)
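
# An illustrative sanity check (not part of the pipeline; the ids below assume the GPT-2 vocab):
#   tokenize({"text": "hello world"})
#   -> array([50256, 31373, 995], dtype=uint16)  # <|endoftext|>, "hello", " world"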

def download_fineweb(hdf5_path='edu_fineweb10B.hdf5', remote_name='sample-10BT', shard_size: int = 1024):
    '''
    Downloads the FineWeb-Edu dataset and stores it locally in an HDF5 file, split into shards.
    args:
        hdf5_path (str): path to the local HDF5 file where the data is saved
        remote_name (str): name of the dataset subset to download. Check the options at https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu
        shard_size (int): HDF5 chunk size in tokens; this controls the on-disk chunking, not the size of the shards written below (which is write_n_tokens)
    '''
# download the dataset
fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")
    h5f = h5py.File(hdf5_path, 'w')
    write_n_tokens = int(1e8)  # 100M tokens per shard
    # chunked HDF5 datasets are flushed to disk incrementally, so memory use stays
    # bounded even as the train dataset grows (the chunk cache can be tuned if needed)
    dset_train = h5f.create_dataset('edu_fineweb_train', shape=(write_n_tokens,), maxshape=(None,), dtype=np.uint16, chunks=(shard_size,))
    dset_val = h5f.create_dataset('edu_fineweb_val', shape=(write_n_tokens,), dtype=np.uint16, chunks=(shard_size,))
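    # resulting file layout: 'edu_fineweb_val' holds the first full shard of write_n_tokens
    # tokens; 'edu_fineweb_train' holds every later shard and grows in write_n_tokens steps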
    # tokenize all documents and write output shards, each of write_n_tokens tokens (the last shard holds the remainder)
    nprocs = max(1, os.cpu_count() - 2)  # leave a couple of cores free
with mp.Pool(nprocs) as pool:
progress_bar = tqdm(total=len(fw), unit="samples", desc='Downloading FineWeb', dynamic_ncols=True)
token_count = 0
# preallocate buffer to hold current shard
all_tokens_np = np.empty((write_n_tokens,), dtype=np.uint16)
save_iter, fw_iter = 0, 0
for tokens in pool.imap(tokenize, fw, chunksize=32):
fw_iter += 1
if token_count + len(tokens) < write_n_tokens: # if still space in buffer
# simply append tokens to current shard
all_tokens_np[token_count:token_count+len(tokens)] = tokens
token_count += len(tokens)
            else:
                # fill the buffer with whatever fits; the leftover tokens go into the next shard
                remainder = write_n_tokens - token_count
                all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
                if save_iter == 0:  # the first full shard becomes the val set
                    dset_val[:] = all_tokens_np
                elif save_iter == 1:  # first train shard: dset_train is preallocated, so no resize needed
                    dset_train[write_n_tokens*(save_iter-1):] = all_tokens_np
                else:  # later train shards: grow the dataset by one shard, then write into the new tail
                    dset_train.resize(dset_train.shape[0]+write_n_tokens, axis=0)
                    dset_train[write_n_tokens*(save_iter-1):] = all_tokens_np
                save_iter += 1
                # populate the next shard with the leftovers of the current doc
                all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
                token_count = len(tokens)-remainder
                progress_bar.update(fw_iter)  # update the bar in bursts, once per flushed shard
                fw_iter = 0
        # write any remaining tokens as the last, partial train shard
        if token_count != 0:
            offset = write_n_tokens * max(save_iter - 1, 0)  # start of the unwritten tail of dset_train
            dset_train.resize(offset + token_count, axis=0)  # also trims the preallocation if no full train shard was written
            dset_train[offset:] = all_tokens_np[:token_count]
            progress_bar.update(fw_iter)  # count the documents processed since the last shard flush
progress_bar.close()
h5f.close()
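
# A minimal sketch of how the tokens written above could be read back (a convenience
# helper, not required by the pipeline); the dataset names match those created by
# download_fineweb.
def load_split(hdf5_path='edu_fineweb10B.hdf5', split='train'):
    '''Loads an entire split ('train' or 'val') from the HDF5 file into memory.'''
    with h5py.File(hdf5_path, 'r') as h5f:
        return np.array(h5f[f'edu_fineweb_{split}'])
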
if __name__ == '__main__':
download_fineweb()