fix metapath2vec on custom datasets (dmlc#1499)

BarclayII · web-flow · commit de34e15abbc0 · 2020-05-08T12:51:10.000+08:00
diff --git a/examples/pytorch/metapath2vec/README.md b/examples/pytorch/metapath2vec/README.md
@@ -10,14 +10,26 @@ Dependencies
 
 How to run the code
 -----
-Run with the following procedures:
-
-1, Run sampler.py on your graph dataset. Note that: the input text file should be list of mappings so you probably need to preprocess your graph dataset. Files with sample format are available in "net_dbis" file. Of course you could also use your own metapath sampler implementation.
-
-2, Run the following command:
-```bash
-python metapath2vec.py --download "where/you/want/to/download" --output_file "your_output_file_path"
-```
+Run with either of the following procedures:
+
+* Running with default AMiner dataset:
+  1. Directly run the following command:
+
+     ```bash
+     python metapath2vec.py --aminer --path "where/you/want/to/download" --output_file "your_model_output_path"
+     ```
+* Running with another AMiner-like dataset
+  1. Prepare the data in the same format as the ones of AMiner and DBIS in Section B of [Author's code repo](https://ericdongyx.github.io/metapath2vec/m2v.html).
+  2. Run `sampler.py` on your graph dataset with, for instance,
+
+     ```bash
+     python sampler.py net_dbis
+     ```
+  3. Run the following command:
+
+     ```bash
+     python metapath2vec.py --path net_dbis/output_path.txt --output_file "your_model_output_path"
+     ```
 
 Tips: Change num_workers based on your GPU instances; Running 3 or 4 epochs is actually enough. 
 
diff --git a/examples/pytorch/metapath2vec/download.py b/examples/pytorch/metapath2vec/download.py
@@ -44,3 +44,11 @@ def _download_and_extract(self, path, filename):
         with zipfile.ZipFile(fn) as zf:
             zf.extractall(path)
         print('Unzip finished.')
+
+
+class CustomDataset(object):
+    """
+    Custom dataset generated by sampler.py (e.g. NetDBIS)
+    """
+    def __init__(self, path):
+        self.fn = path
diff --git a/examples/pytorch/metapath2vec/metapath2vec.py b/examples/pytorch/metapath2vec/metapath2vec.py
@@ -7,11 +7,16 @@
 
 from reading_data import DataReader, Metapath2vecDataset
 from model import SkipGramModel
+from download import AminerDataset, CustomDataset
 
 
 class Metapath2VecTrainer:
     def __init__(self, args):
-        self.data = DataReader(args.download, args.min_count, args.care_type)
+        if args.aminer:
+            dataset = AminerDataset(args.path)
+        else:
+            dataset = CustomDataset(args.path)
+        self.data = DataReader(dataset, args.min_count, args.care_type)
         dataset = Metapath2vecDataset(self.data, args.window_size)
         self.dataloader = DataLoader(dataset, batch_size=args.batch_size,
                                      shuffle=True, num_workers=args.num_workers, collate_fn=dataset.collate)
@@ -60,7 +65,8 @@ def train(self):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Metapath2vec")
     #parser.add_argument('--input_file', type=str, help="input_file")
-    parser.add_argument('--download', type=str, help="download_path")
+    parser.add_argument('--aminer', action='store_true', help='Use AMiner dataset')
+    parser.add_argument('--path', type=str, help="input_path")
     parser.add_argument('--output_file', type=str, help='output_file')
     parser.add_argument('--dim', default=128, type=int, help="embedding dimensions")
     parser.add_argument('--window_size', default=7, type=int, help="context window size")
diff --git a/examples/pytorch/metapath2vec/reading_data.py b/examples/pytorch/metapath2vec/reading_data.py
@@ -7,7 +7,7 @@
 class DataReader:
     NEGATIVE_TABLE_SIZE = 1e8
 
-    def __init__(self, download, min_count, care_type):
+    def __init__(self, dataset, min_count, care_type):
 
         self.negatives = []
         self.discards = []
@@ -18,9 +18,7 @@ def __init__(self, download, min_count, care_type):
         self.sentences_count = 0
         self.token_count = 0
         self.word_frequency = dict()
-        self.download = download
-        FB = AminerDataset(self.download)
-        self.inputFileName = FB.fn
+        self.inputFileName = dataset.fn
         self.read_words(min_count)
         self.initTableNegatives()
         self.initTableDiscards()