Merge pull request #160 from tigergraph/GML-1812-specialized_testing

RobRossmiller-TG · web-flow · commit ca797fc6c0d7 · 2024-07-16T11:35:58.000-04:00
Gml 1812 specialized testing
diff --git a/tests/data/baseline/ml/fastRP.json.gz b/tests/data/baseline/ml/fastRP.json.gz
diff --git a/tests/run.sh b/tests/run.sh
@@ -1,5 +1,4 @@
 clear
-python3 test/create_baseline.py &&
-  python3 test/setup.py &&
-  pytest test/test_centrality.py::TestCentrality
-# pytest
+python3 test/setup.py &&
+  python3 test/baseline/create_baselines.py &&
+  pytest test/test_centrality.py test/test_ml.py
diff --git a/tests/test/baseline/__init__.py b/tests/test/baseline/__init__.py
diff --git a/tests/test/baseline/algos/__init__.py b/tests/test/baseline/algos/__init__.py
@@ -0,0 +1,2 @@
+from .degree_cent import *
+from .fastrp import fastrp_wrapper as fastrp
diff --git a/tests/test/baseline/algos/degree_cent.py b/tests/test/baseline/algos/degree_cent.py
@@ -0,0 +1,46 @@
+from collections import Counter
+
+import networkx as nx
+
+
+def run_degree_baseline_complete(g: nx.Graph, _):
+    """Compute scaled degree scores, discounting one from every node's degree.
+
+    Every degree reported by ``g.degree()`` is reduced by one and multiplied
+    by ``1 / (len(g) - 1)``.  The second positional argument is ignored.
+
+    Returns:
+        A one-element list of the form
+        ``[{"top_scores": [{"Vertex_ID": node, "score": value}, ...]}]``.
+    """
+    factor = 1.0 / (len(g) - 1.0)
+
+    # Subtract 1 because networkx double counts the self-edge in the degree.
+    top_scores = [
+        {"Vertex_ID": node, "score": (degree - 1) * factor}
+        for node, degree in g.degree()
+    ]
+    return [{"top_scores": top_scores}]
+
+
+def run_degree_baseline(g: nx.Graph, metric):
+    """Apply ``metric`` to ``g`` and wrap its per-vertex scores.
+
+    Args:
+        g: graph handed unchanged to ``metric``.
+        metric: callable returning a mapping of vertex id to score.
+
+    Returns:
+        A one-element list of the form
+        ``[{"top_scores": [{"Vertex_ID": vertex, "score": value}, ...]}]``,
+        in the iteration order of the mapping returned by ``metric``.
+    """
+    scores = metric(g)
+    top_scores = [
+        {"Vertex_ID": vertex, "score": value} for vertex, value in scores.items()
+    ]
+    return [{"top_scores": top_scores}]
+
+
+def weighted_deg_cent(
+    g: nx.Graph,
+    dir: str = "",
+):
+    """Accumulate the ``"weight"`` attribute of every edge per vertex.
+
+    Args:
+        g: graph whose edges all carry a ``"weight"`` entry in their data.
+        dir: ``"in"`` credits only the second endpoint of each edge, ``"out"``
+            only the first endpoint; any other value credits both endpoints.
+
+    Returns:
+        A ``Counter`` mapping vertex to its summed edge weight.
+    """
+    totals = Counter()
+    for edge in g.edges:
+        src, dst = edge[0], edge[1]
+        weight = g.get_edge_data(src, dst)["weight"]
+        if dir == "in":
+            totals[dst] += weight
+        elif dir == "out":
+            totals[src] += weight
+        else:
+            totals[src] += weight
+            totals[dst] += weight
+    return totals
diff --git a/tests/test/baseline/algos/fastrp.py b/tests/test/baseline/algos/fastrp.py
@@ -0,0 +1,107 @@
+# source: https://github.com/GTmac/FastRP/blob/master/fastrp.py
+
+import numpy as np
+from scipy.sparse import csc_matrix, csr_matrix, spdiags
+from sklearn import random_projection
+from sklearn.preprocessing import normalize, scale
+
+
+# projection method: choose from Gaussian and Sparse
+# input matrix: choose from adjacency and transition matrix
+# alpha adjusts the weighting of nodes according to their degree
+def fastrp_projection(
+    A, q=3, dim=128, projection_method="gaussian", input_matrix="adj", alpha=None
+):
+    """Randomly project the input matrix and its successive powers.
+
+    Args:
+        A: sparse adjacency matrix (assumed square; its row sums are used as
+            node degrees).
+        q: highest power of the input matrix to embed.
+        dim: number of components of the random projection.
+        projection_method: ``"gaussian"`` or ``"sparse"``.
+        input_matrix: ``"adj"`` uses ``A`` as is; ``"trans"`` scales each row
+            of ``A`` by the reciprocal of its row sum.
+        alpha: when not ``None``, the fitted projection components are
+            right-multiplied by ``diag(row_sum ** alpha)``.
+
+    Returns:
+        ``[U_1, ..., U_q]`` where ``U_1`` is the projection of the input matrix
+        ``M`` and ``U_k = M @ U_(k-1)``.  The list has one entry when
+        ``q <= 1``.
+    """
+    assert input_matrix == "adj" or input_matrix == "trans"
+    assert projection_method == "gaussian" or projection_method == "sparse"
+
+    # N is needed by the alpha re-weighting for both input matrix choices, so
+    # it must be defined before branching on input_matrix.
+    N = A.shape[0]
+    if input_matrix == "adj":
+        M = A
+    else:
+        normalizer = spdiags(np.squeeze(1.0 / csc_matrix.sum(A, axis=1)), 0, N, N)
+        M = normalizer @ A
+    # Gaussian projection matrix
+    if projection_method == "gaussian":
+        transformer = random_projection.GaussianRandomProjection(
+            n_components=dim, random_state=42
+        )
+    # Sparse projection matrix
+    else:
+        transformer = random_projection.SparseRandomProjection(
+            n_components=dim, random_state=42
+        )
+    # fit() returns the transformer itself, so editing Y.components_ changes
+    # what transformer.transform() applies below.
+    Y = transformer.fit(M)
+    # Random projection for A
+    if alpha is not None:
+        Y.components_ = Y.components_ @ spdiags(
+            np.squeeze(np.power(csc_matrix.sum(A, axis=1), alpha)), 0, N, N
+        )
+    cur_U = transformer.transform(M)
+    U_list = [cur_U]
+
+    for _ in range(2, q + 1):
+        cur_U = M @ cur_U
+        U_list.append(cur_U)
+    return U_list
+
+
+# When weights is None, the embeddings from the different powers of A are
+# concatenated instead of being linearly combined.
+def fastrp_merge(U_list, weights, normalization=False):
+    """Merge the per-power embeddings into a single embedding matrix.
+
+    Args:
+        U_list: embeddings, one per power; densified when the first entry is
+            exactly a ``csc_matrix``.
+        weights: one coefficient per entry of ``U_list``, or ``None`` to
+            concatenate the entries column-wise.
+        normalization: when true, each entry is L2-normalised row-wise first.
+
+    Returns:
+        The column-wise concatenation when ``weights`` is ``None``; otherwise
+        the weighted sum passed through ``scale``.
+    """
+    if type(U_list[0]) == csc_matrix:
+        mats = [mat.todense() for mat in U_list]
+    else:
+        mats = U_list
+
+    if normalization:
+        mats = [normalize(mat, norm="l2", axis=1) for mat in mats]
+
+    if weights is None:
+        return np.concatenate(mats, axis=1)
+
+    combined = np.zeros_like(mats[0])
+    for mat, weight in zip(mats, weights):
+        combined += mat * weight
+
+    if type(combined) == csr_matrix:
+        return scale(combined.toarray())
+    return scale(combined)
+
+
+# A is always the adjacency matrix; whether the adjacency or the transition
+# matrix is projected is selected through conf["input_matrix"].
+def fastrp_wrapper(A, conf):
+    """Run the projection and merge steps with the settings found in ``conf``.
+
+    ``conf`` must provide the keys ``"weights"``, ``"dim"``,
+    ``"projection_method"``, ``"input_matrix"``, ``"alpha"`` and
+    ``"normalization"``.  One projection is requested per weight.
+
+    Returns:
+        The merged embedding produced by ``fastrp_merge``.
+    """
+    projections = fastrp_projection(
+        A,
+        q=len(conf["weights"]),
+        dim=conf["dim"],
+        projection_method=conf["projection_method"],
+        input_matrix=conf["input_matrix"],
+        alpha=conf["alpha"],
+    )
+    return fastrp_merge(projections, conf["weights"], conf["normalization"])
+
+
+def get_emb_filename(prefix, conf):
+    """Build the ``.mat`` file name that encodes an embedding configuration.
+
+    Args:
+        prefix: leading part of the file name.
+        conf: mapping with the keys ``"dim"``, ``"projection_method"``,
+            ``"input_matrix"``, ``"normalization"`` and ``"weights"`` (a
+            sequence or ``None``); ``"alpha"`` and ``"C"`` are optional.
+
+    Returns:
+        ``<prefix>-dim=...,projection_method=...,input_matrix=...,
+        normalization=...,weights=...,alpha=...,C=....mat`` (a single string
+        without line breaks).  A missing ``"alpha"`` is rendered as an empty
+        string and a missing ``"C"`` as ``1.0``.
+    """
+    weights = conf["weights"]
+    weights_str = ",".join(map(str, weights)) if weights is not None else "None"
+    alpha_str = str(conf["alpha"]) if "alpha" in conf else ""
+    # Presence is tested on "C" itself, so a conf that sets alpha but not C
+    # falls back to the default instead of raising KeyError.
+    c_str = str(conf["C"]) if "C" in conf else "1.0"
+    return (
+        f"{prefix}-dim={conf['dim']}"
+        f",projection_method={conf['projection_method']}"
+        f",input_matrix={conf['input_matrix']}"
+        f",normalization={conf['normalization']}"
+        f",weights={weights_str}"
+        f",alpha={alpha_str}"
+        f",C={c_str}.mat"
+    )
diff --git a/tests/test/baseline/create_baselines.py b/tests/test/baseline/create_baselines.py
@@ -0,0 +1,6 @@
+import degree_cent_baseline
+import fast_rp_baseline
+
+# Script entry point: generate the degree-centrality baselines first, then the
+# FastRP baseline, by calling each baseline module's run().
+if __name__ == "__main__":
+    degree_cent_baseline.run()
+    fast_rp_baseline.run()
diff --git a/tests/test/baseline/degree_cent_baseline.py b/tests/test/baseline/degree_cent_baseline.py
@@ -1,59 +1,16 @@
 import csv
 import json
-from collections import Counter
 from functools import partial
 
 import networkx as nx
 import numpy as np
+from algos import run_degree_baseline, run_degree_baseline_complete, weighted_deg_cent
 from tqdm import tqdm
 
 data_path_root = "data/"
 baseline_path_root = f"{data_path_root}/baseline/"
 
 
-def weighted_deg_cent(
-    g: nx.Graph,
-    dir: str = "",
-):
-    res = Counter()
-    for e in g.edges:
-        a = g.get_edge_data(e[0], e[1])["weight"]
-        match dir:
-            case "in":
-                res[e[1]] += a
-            case "out":
-                res[e[0]] += a
-            case _:
-                res[e[0]] += a
-                res[e[1]] += a
-    return res
-
-
-def run_degree_baseline_complete(g: nx.Graph, _):
-    s = 1.0 / (len(g) - 1.0)
-
-    # d-1 because nx will double count the self-edge
-    res = {n: (d - 1) * s for n, d in g.degree()}
-
-    out = []
-    for k, v in res.items():
-        out.append({"Vertex_ID": k, "score": v})
-
-    out = [{"top_scores": out}]
-    return out
-
-
-def run_degree_baseline(g: nx.Graph, metric):
-    res = metric(g)
-
-    out = []
-    for k, v in res.items():
-        out.append({"Vertex_ID": k, "score": v})
-
-    out = [{"top_scores": out}]
-    return out
-
-
 def create_graph(edges, weights=False, directed=False):
     if directed:
         g = nx.DiGraph()
@@ -90,7 +47,7 @@ def create_degree_baseline(paths):
             json.dump(res, f)  # , indent=2)
 
 
-if __name__ == "__main__":
+def run():
     # (data, output_path, fun, metric)
     paths = [
         # unweighted
diff --git a/tests/test/baseline/fast_rp_baseline.py b/tests/test/baseline/fast_rp_baseline.py
@@ -0,0 +1,44 @@
+import gzip
+import json
+
+import networkx as nx
+import numpy as np
+import pandas as pd
+from algos import fastrp
+from dotenv import load_dotenv
+from pyTigerGraph.datasets import Datasets
+
+load_dotenv()
+data_path_root = "data"
+baseline_path_root = f"{data_path_root}/baseline"
+
+
+def run(ds_name="Cora"):
+    """Create the FastRP baseline embeddings for a dataset and store them gzipped.
+
+    The edge list is read from ``<tmp_dir>/<ds_name>/edges.csv`` of the
+    ``Datasets`` object, turned into an undirected graph, embedded with
+    ``fastrp`` and written as JSON (node id -> vector) to
+    ``<baseline_path_root>/ml/fastRP.json.gz``.
+
+    Args:
+        ds_name: name passed to ``Datasets`` and used as the edge file folder.
+    """
+    out_path = f"{baseline_path_root}/ml/fastRP.json.gz"
+
+    ds = Datasets(ds_name)
+    edge_df = pd.read_csv(ds.tmp_dir + f"/{ds_name}/edges.csv", header=None)
+    edge_df.columns = ["src", "tgt"]
+
+    graph = nx.Graph()
+    graph.add_edges_from(edge_df.to_numpy())
+    # Sorted ids fix the row order of the adjacency matrix and of the output.
+    node_ids = sorted(graph.nodes)
+    adjacency = nx.adjacency_matrix(graph, nodelist=node_ids)
+
+    # "projection_method" may alternatively be set to "sparse".
+    conf = {
+        "weights": [1, 2, 4],
+        "dim": 8,
+        "projection_method": "gaussian",
+        "input_matrix": "trans",
+        "alpha": -0.628,
+        "normalization": False,
+    }
+    vecs = fastrp(adjacency, conf)
+
+    assert len(vecs) == len(node_ids)
+
+    payload = {str(node): list(vec) for node, vec in zip(node_ids, vecs)}
+    with gzip.open(out_path, "wb") as f:
+        f.write(json.dumps(payload).encode())
+
+    # Read the file back so that an unreadable archive fails here.
+    with gzip.open(out_path, "rb") as f:
+        json.load(f)
diff --git a/tests/test/setup.py b/tests/test/setup.py
@@ -1,17 +1,31 @@
 import json
 import os
 import re
+import time
 
 import pyTigerGraph as tg
-import util
 from dotenv import load_dotenv
 from pyTigerGraph.datasets import Datasets
-from tqdm import tqdm
+from tqdm import tqdm, trange
+
+import util
 
 load_dotenv()
 graph_name = "graph_algorithms_testing"
 pattern = re.compile(r'"name":\s*"tg_.*"')
 
+
+def add_reverse_edge(ds: Datasets):
+    """Rewrite a dataset's schema file so the ``Cite`` edge gets a reverse edge.
+
+    The file ``<ds.tmp_dir>/<ds.name>/create_schema.gsql`` is read, the
+    ``Cite`` edge declaration is replaced by the same declaration with
+    ``WITH REVERSE_EDGE="reverse_Cite"``, and the result is written back.
+    A schema without that exact declaration is written back unchanged.
+
+    Args:
+        ds: dataset whose ``tmp_dir`` and ``name`` locate the schema file.
+    """
+    # Use the parameter rather than a module-level variable so the function
+    # also works when it is called from outside the script's main block.
+    schema_path = f"{ds.tmp_dir}/{ds.name}/create_schema.gsql"
+    with open(schema_path) as f:
+        schema: str = f.read()
+    schema = schema.replace(
+        "ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool);",
+        'ADD DIRECTED EDGE Cite (from Paper, to Paper, time Int, is_train Bool, is_val Bool) WITH REVERSE_EDGE="reverse_Cite";',
+    )
+    with open(schema_path, "w") as f:
+        f.write(schema)
+
+
 if __name__ == "__main__":
     host_name = os.getenv("HOST_NAME")
     user_name = os.getenv("USER_NAME")
@@ -28,9 +42,14 @@
     if res["error"]:
         exit(1)
     # load the data
+    dataset = Datasets("Cora")
+    add_reverse_edge(dataset)
+    conn.ingestDataset(dataset, getToken=True)
+
     dataset = Datasets("graph_algorithms_testing")
     conn.ingestDataset(dataset, getToken=True)
 
+    conn.graphname = graph_name
     # install the queries
     feat = conn.gds.featurizer()
     installed_queries = util.get_installed_queries(conn)
@@ -43,3 +62,5 @@
             print(q)
             feat.installAlgorithm(q)
 
+    for _ in trange(30, desc="Sleeping while data loads"):
+        time.sleep(1)
diff --git a/tests/test/test_ml.py b/tests/test/test_ml.py
diff --git a/tests/test/util.py b/tests/test/util.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .degree_cent import *`
	`2`	`+from .fastrp import fastrp_wrapper as fastrp`