1414import quiver
1515from quiver .partition import partition_without_replication , select_nodes
1616
root = '/data'

# Raw OGB papers100M dump: node labels and the packed node features / edges.
data_root = f"{root}/ogbn_papers100M/raw/"
label = np.load(osp.join(data_root, "node-label.npz"))
data = np.load(osp.join(data_root, "data.npz"))

# Create every output directory up front. exist_ok=True makes the script
# safe to re-run: a bare mkdir(parents=True) raises FileExistsError the
# second time around.
for _sub in ('feat', 'csr', 'label', 'index'):
    Path(f'{root}/ogbn_papers100M/{_sub}').mkdir(parents=True, exist_ok=True)

SCALE = 1
GPU_CACHE_GB = 4
@@ -58,8 +60,8 @@ def process_topo():
5860
5961 print ("LOG>>> Begin Save" )
6062
61- torch .save (indptr , "/data/papers /ogbn_papers100M/csr/indptr.pt" )
62- torch .save (indices , "/data/papers /ogbn_papers100M/csr/indices.pt" )
63+ torch .save (indptr , f" { root } /ogbn_papers100M/csr/indptr.pt" )
64+ torch .save (indices , f" { root } /ogbn_papers100M/csr/indices.pt" )
6365
6466 csr_mat = get_csr_from_coo (edge_index , True )
6567 indptr_reverse = csr_mat .indptr
@@ -68,9 +70,9 @@ def process_topo():
6870 indices_reverse = torch .from_numpy (indices_reverse ).type (torch .long )
6971
7072 torch .save (indptr_reverse ,
71- "/data/papers /ogbn_papers100M/csr/indptr_reverse.pt" )
73+ f" { root } /ogbn_papers100M/csr/indptr_reverse.pt" )
7274 torch .save (indices_reverse ,
73- "/data/papers /ogbn_papers100M/csr/indices_reverse.pt" )
75+ f" { root } /ogbn_papers100M/csr/indices_reverse.pt" )
7476
7577
7678def process_feature ():
@@ -80,41 +82,40 @@ def process_feature():
8082 nid_feat = data ["node_feat" ]
8183 tensor = torch .from_numpy (nid_feat ).type (torch .float )
8284 print ("LOG>>> Begin Process" )
83- torch .save (tensor , "/data/papers /ogbn_papers100M/feat/feature.pt" )
85+ torch .save (tensor , f" { root } /ogbn_papers100M/feat/feature.pt" )
8486
8587
def process_label():
    """Convert the raw node-label array to a long tensor and save it to disk."""
    print("LOG>>> Load Finished")
    label_tensor = torch.from_numpy(label["node_label"]).type(torch.long)
    torch.save(label_tensor, f"{root}/ogbn_papers100M/label/label.pt")
9193
9294
def sort_feature():
    """Reorder the node-feature matrix by descending degree and save it.

    Loads the reverse-CSR indptr (written by process_topo) and the raw
    feature tensor, computes each node's degree from consecutive indptr
    entries, and permutes the features so the highest-degree nodes come
    first. Writes:
      - feat/sort_feature.pt : the degree-sorted feature matrix
      - feat/prev_order.pt   : prev_order[i] = original index of the node
                               with the i-th largest degree
    """
    indptr = torch.load(f"{root}/ogbn_papers100M/csr/indptr_reverse.pt")
    feature = torch.load(f"{root}/ogbn_papers100M/feat/feature.pt")
    # Degree of node i is indptr[i + 1] - indptr[i].
    deg = torch.LongTensor(indptr[1:]) - torch.LongTensor(indptr[:-1])
    _, prev_order = torch.sort(deg, descending=True)
    feature = feature[prev_order]
    torch.save(feature, f"{root}/ogbn_papers100M/feat/sort_feature.pt")
    torch.save(prev_order, f"{root}/ogbn_papers100M/feat/prev_order.pt")
106108
107109
def process_index():
    """Read the training-split node ids from CSV and save them as a long tensor."""
    # One id per line; renamed from `data` so the module-level global of the
    # same name is not shadowed.
    train_idx = genfromtxt(f"{root}/ogbn_papers100M/split/time/train.csv",
                           delimiter='\n')
    train_idx = torch.from_numpy(train_idx.astype(np.int_))
    torch.save(train_idx, f"{root}/ogbn_papers100M/index/train_idx.pt")
114116
115117
116118def preprocess (host , host_size , p2p_group , p2p_size ):
117- root = '/data/papers'
118119 data_dir = osp .join (root , 'ogbn_papers100M' )
119120 indptr_root = osp .join (data_dir , 'csr' , 'indptr.pt' )
120121 indices_root = osp .join (data_dir , 'csr' , 'indices.pt' )
@@ -132,6 +133,9 @@ def preprocess(host, host_size, p2p_group, p2p_size):
132133 end = min (idx_len , beg + (idx_len // global_gpus ))
133134 train_idxs .append (train_idx [beg :end ])
134135 beg = end
136+
137+ path = Path (f'{ root } /ogbn_papers100M/{ host_size } h' )
138+ path .mkdir (parents = True )
135139
136140 csr_topo = quiver .CSRTopo (indptr = indptr , indices = indices )
137141 quiver_sampler = quiver .pyg .GraphSageSampler (csr_topo , [25 , 10 ],
@@ -164,7 +168,7 @@ def preprocess(host, host_size, p2p_group, p2p_size):
164168 print (f'prob { t1 - t0 } ' )
165169 for h in range (host_size ):
166170 global2host [res [h ]] = h
167- torch .save (global2host .cpu (), f'/data/papers/ { host_size } h/global2host.pt' )
171+ torch .save (global2host .cpu (), f" { root } /ogbn_papers100M/ { host_size } h/global2host.pt" )
168172 t2 = time .time ()
169173 print (f'g2h { t2 - t1 } ' )
170174
@@ -183,7 +187,7 @@ def preprocess(host, host_size, p2p_group, p2p_size):
183187 nz .size (0 ), cpu_size + gpu_size * p2p_size ) - choice .size (0 )
184188 replicate = local_order [:local_replicate_size ]
185189 torch .save (replicate .cpu (),
186- f'/data/papers /{ host_size } h/replicate{ host } .pt' )
190+ f'{ root } /ogbn_papers100M /{ host_size } h/replicate{ host } .pt' )
187191 t3 = time .time ()
188192 print (f'replicate { t3 - t2 } ' )
189193 local_all = torch .cat ([choice , replicate ])
@@ -199,15 +203,15 @@ def preprocess(host, host_size, p2p_group, p2p_size):
199203 local_gpu_ids = [local_all [r ] for r in local_res ]
200204 local_orders = torch .cat ((local_gpu_orders , local_cpu_order ))
201205 torch .save (local_orders .cpu (),
202- f'/data/papers /{ host_size } h/local_order{ host } .pt' )
206+ f'{ root } /ogbn_papers100M /{ host_size } h/local_order{ host } .pt' )
203207 t4 = time .time ()
204208 print (f'order { t4 - t3 } ' )
205209
206210
# Entry-point guard: importing this module must not kick off hours of
# dataset preprocessing; the pipeline only runs when executed as a script.
if __name__ == "__main__":
    process_topo()
    process_feature()
    process_label()
    sort_feature()
    process_index()

    preprocess(0, 3, 1, 2)