Skip to content

Commit 7765b37

Browse files
authored
Fixed papers100M, partition bugs (#159)
1 parent 3454393 commit 7765b37

4 files changed

Lines changed: 40 additions & 35 deletions

File tree

benchmarks/ogbn-papers100M/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# OGBn-Papers100M
22

3-
This dataset is large, so we need to preprocess the dataset. We assume that you have downloaded the raw dataset from [OGB](https://snap.stanford.edu/ogb/data/nodeproppred/) and decompress it to `\data\papers` (also the files in `split/time/`). Then `preprocess.py` can help you transform the data into the appropriate format.
3+
This dataset is large, so it must be preprocessed before use. We assume that you have downloaded the raw dataset from [OGB](https://snap.stanford.edu/ogb/data/nodeproppred/) and decompressed it to `/data` (along with the files in `split/time/`). Then `preprocess.py` can help you transform the data into the appropriate format.
44

55
Also, Quiver uses a large amount of shared memory to hold the dataset. If your program is killed silently or hits a bus error, make sure your physical memory can hold the dataset. You should also make sure your shared memory limit is set properly; we recommend that it be greater than 128G:
66

benchmarks/ogbn-papers100M/dist_sampling_ogb_paper100M_quiver.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def run(rank, world_size, quiver_sampler, quiver_feature, y, train_idx,
180180

181181

182182
if __name__ == '__main__':
183-
root = "/data/papers/"
183+
root = "/data"
184184
world_size = torch.cuda.device_count()
185185
dataset = Paper100MDataset(root, 0.15 * min(world_size, 4))
186186

benchmarks/ogbn-papers100M/preprocess.py

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,19 @@
1414
import quiver
1515
from quiver.partition import partition_without_replication, select_nodes
1616

17-
# data_root = "/data/papers/ogbn_papers100M/raw/"
18-
# label = np.load(osp.join(data_root, "node-label.npz"))
19-
# data = np.load(osp.join(data_root, "data.npz"))
20-
# path = Path('/data/papers/ogbn_papers100M/feat')
21-
# path.mkdir(parents=True)
22-
# path = Path('/data/papers/ogbn_papers100M/csr')
23-
# path.mkdir(parents=True)
24-
# path = Path('/data/papers/ogbn_papers100M/label')
25-
# path.mkdir(parents=True)
26-
# path = Path('/data/papers/ogbn_papers100M/index')
27-
# path.mkdir(parents=True)
17+
root = '/data'
18+
19+
data_root = f"{root}/ogbn_papers100M/raw/"
20+
label = np.load(osp.join(data_root, "node-label.npz"))
21+
data = np.load(osp.join(data_root, "data.npz"))
22+
path = Path(f'{root}/ogbn_papers100M/feat')
23+
path.mkdir(parents=True)
24+
path = Path(f'{root}/ogbn_papers100M/csr')
25+
path.mkdir(parents=True)
26+
path = Path(f'{root}/ogbn_papers100M/label')
27+
path.mkdir(parents=True)
28+
path = Path(f'{root}/ogbn_papers100M/index')
29+
path.mkdir(parents=True)
2830

2931
SCALE = 1
3032
GPU_CACHE_GB = 4
@@ -58,8 +60,8 @@ def process_topo():
5860

5961
print("LOG>>> Begin Save")
6062

61-
torch.save(indptr, "/data/papers/ogbn_papers100M/csr/indptr.pt")
62-
torch.save(indices, "/data/papers/ogbn_papers100M/csr/indices.pt")
63+
torch.save(indptr, f"{root}/ogbn_papers100M/csr/indptr.pt")
64+
torch.save(indices, f"{root}/ogbn_papers100M/csr/indices.pt")
6365

6466
csr_mat = get_csr_from_coo(edge_index, True)
6567
indptr_reverse = csr_mat.indptr
@@ -68,9 +70,9 @@ def process_topo():
6870
indices_reverse = torch.from_numpy(indices_reverse).type(torch.long)
6971

7072
torch.save(indptr_reverse,
71-
"/data/papers/ogbn_papers100M/csr/indptr_reverse.pt")
73+
f"{root}/ogbn_papers100M/csr/indptr_reverse.pt")
7274
torch.save(indices_reverse,
73-
"/data/papers/ogbn_papers100M/csr/indices_reverse.pt")
75+
f"{root}/ogbn_papers100M/csr/indices_reverse.pt")
7476

7577

7678
def process_feature():
@@ -80,41 +82,40 @@ def process_feature():
8082
nid_feat = data["node_feat"]
8183
tensor = torch.from_numpy(nid_feat).type(torch.float)
8284
print("LOG>>> Begin Process")
83-
torch.save(tensor, "/data/papers/ogbn_papers100M/feat/feature.pt")
85+
torch.save(tensor, f"{root}/ogbn_papers100M/feat/feature.pt")
8486

8587

8688
def process_label():
8789
print("LOG>>> Load Finished")
8890
node_label = label["node_label"]
8991
tensor = torch.from_numpy(node_label).type(torch.long)
90-
torch.save(tensor, "/data/papers/ogbn_papers100M/label/label.pt")
92+
torch.save(tensor, f"{root}/ogbn_papers100M/label/label.pt")
9193

9294

9395
def sort_feature():
9496
NUM_ELEMENT = 111059956
95-
indptr = torch.load("/data/papers/ogbn_papers100M/csr/indptr_reverse.pt")
96-
feature = torch.load("/data/papers/ogbn_papers100M/feat/feature.pt")
97+
indptr = torch.load(f"{root}/ogbn_papers100M/csr/indptr_reverse.pt")
98+
feature = torch.load(f"{root}/ogbn_papers100M/feat/feature.pt")
9799
prev = torch.LongTensor(indptr[:-1])
98100
sub = torch.LongTensor(indptr[1:])
99101
deg = sub - prev
100102
sorted_deg, prev_order = torch.sort(deg, descending=True)
101103
total_num = NUM_ELEMENT
102104
total_range = torch.arange(total_num, dtype=torch.long)
103105
feature = feature[prev_order]
104-
torch.save(feature, "/data/papers/ogbn_papers100M/feat/sort_feature.pt")
105-
torch.save(prev_order, "/data/papers/ogbn_papers100M/feat/prev_order.pt")
106+
torch.save(feature, f"{root}/ogbn_papers100M/feat/sort_feature.pt")
107+
torch.save(prev_order, f"{root}/ogbn_papers100M/feat/prev_order.pt")
106108

107109

108110
def process_index():
109-
data = genfromtxt('/data/papers/ogbn_papers100M/split/time/train.csv',
111+
data = genfromtxt(f"{root}/ogbn_papers100M/split/time/train.csv",
110112
delimiter='\n')
111-
data = data.astype(np.long)
113+
data = data.astype(np.int_)
112114
data = torch.from_numpy(data)
113-
torch.save(data, "/data/papers/ogbn_papers100M/index/train_idx.pt")
115+
torch.save(data, f"{root}/ogbn_papers100M/index/train_idx.pt")
114116

115117

116118
def preprocess(host, host_size, p2p_group, p2p_size):
117-
root = '/data/papers'
118119
data_dir = osp.join(root, 'ogbn_papers100M')
119120
indptr_root = osp.join(data_dir, 'csr', 'indptr.pt')
120121
indices_root = osp.join(data_dir, 'csr', 'indices.pt')
@@ -132,6 +133,9 @@ def preprocess(host, host_size, p2p_group, p2p_size):
132133
end = min(idx_len, beg + (idx_len // global_gpus))
133134
train_idxs.append(train_idx[beg:end])
134135
beg = end
136+
137+
path = Path(f'{root}/ogbn_papers100M/{host_size}h')
138+
path.mkdir(parents=True)
135139

136140
csr_topo = quiver.CSRTopo(indptr=indptr, indices=indices)
137141
quiver_sampler = quiver.pyg.GraphSageSampler(csr_topo, [25, 10],
@@ -164,7 +168,7 @@ def preprocess(host, host_size, p2p_group, p2p_size):
164168
print(f'prob {t1 - t0}')
165169
for h in range(host_size):
166170
global2host[res[h]] = h
167-
torch.save(global2host.cpu(), f'/data/papers/{host_size}h/global2host.pt')
171+
torch.save(global2host.cpu(), f"{root}/ogbn_papers100M/{host_size}h/global2host.pt")
168172
t2 = time.time()
169173
print(f'g2h {t2 - t1}')
170174

@@ -183,7 +187,7 @@ def preprocess(host, host_size, p2p_group, p2p_size):
183187
nz.size(0), cpu_size + gpu_size * p2p_size) - choice.size(0)
184188
replicate = local_order[:local_replicate_size]
185189
torch.save(replicate.cpu(),
186-
f'/data/papers/{host_size}h/replicate{host}.pt')
190+
f'{root}/ogbn_papers100M/{host_size}h/replicate{host}.pt')
187191
t3 = time.time()
188192
print(f'replicate {t3 - t2}')
189193
local_all = torch.cat([choice, replicate])
@@ -199,15 +203,15 @@ def preprocess(host, host_size, p2p_group, p2p_size):
199203
local_gpu_ids = [local_all[r] for r in local_res]
200204
local_orders = torch.cat((local_gpu_orders, local_cpu_order))
201205
torch.save(local_orders.cpu(),
202-
f'/data/papers/{host_size}h/local_order{host}.pt')
206+
f'{root}/ogbn_papers100M/{host_size}h/local_order{host}.pt')
203207
t4 = time.time()
204208
print(f'order {t4 - t3}')
205209

206210

207-
# process_topo()
208-
# process_feature()
209-
# process_label()
210-
# sort_feature()
211-
# process_index()
211+
process_topo()
212+
process_feature()
213+
process_label()
214+
sort_feature()
215+
process_index()
212216

213217
preprocess(0, 3, 1, 2)

srcs/python/quiver/partition.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
]
1111

1212
QUIVER_MAGIC_NUMBER = 256
13+
CHUNK_NUM = 32
1314

1415

1516
def partition_without_replication(device, probs, ids):

0 commit comments

Comments
 (0)