Skip to content
This repository was archived by the owner on Apr 30, 2020. It is now read-only.

Commit 02317bf

Browse files
author
Dandelion Mané
authored
Initial code for importing cred graphs (#10)
This commit contains inaugural infra code, which imports a SourceCred graph and adds type annotations. The code itself is quite simple, with a bit of logic for pulling out the most specific declared node/edge type and adding it as a property in the networkx graph. The logic is lightly tested using the sample graphs. We verify that each graph loads, and for the sourcecred/sourcecred graph, we verify that we have reasonable counts by node and edge type. Test plan: Run the included unit tests via `python infra/import_graph_test.py`
1 parent 83c3b2b commit 02317bf

File tree

3 files changed

+264
-0
lines changed

3 files changed

+264
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]

infra/import_graph.py

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
from __future__ import absolute_import
2+
from __future__ import division
3+
from __future__ import print_function
4+
5+
import networkx as nx
6+
import collections
7+
8+
# Pairs an address prefix (a sequence of address parts) with the type name
# reported for any address that starts with that prefix.
AddressType = collections.namedtuple("AddressType", ["prefix", "type"])
9+
10+
11+
def _type_prefix_match(address_types, address):
12+
"""For a given address, find the type matching the address.
13+
14+
Takes an object containing an array of {prefix, type} pairs, and
15+
an address. Returns the first type whose corresponding prefix
16+
was a prefix of the given address.
17+
"""
18+
for address_type in address_types:
19+
prefix = address_type.prefix
20+
if address[: len(prefix)] == prefix:
21+
return address_type.type
22+
raise ValueError("No matching prefix for {}".format(address))
23+
24+
25+
def node_type(address):
    """Classify a SourceCred node address, e.g. as "github/issue".

    The address is compared against the node prefixes declared below for
    the sourcecred/git and sourcecred/github plugins; the type of the first
    prefix that the address starts with is returned. The declared prefixes
    do not overlap, so at most one entry can match.

    Raises ValueError (from _type_prefix_match) for an address that starts
    with none of the declared prefixes.

    The SourceCred type system is still pretty ad-hoc
    (see: https://github.com/sourcecred/sourcecred/issues/710), so this
    table is likely to change in the future.
    """
    github = ["sourcecred", "github"]
    git = ["sourcecred", "git"]
    known_node_types = [
        AddressType(github + ["REPO"], "github/repo"),
        AddressType(github + ["USERLIKE", "USER"], "github/user"),
        AddressType(github + ["USERLIKE", "BOT"], "github/bot"),
        AddressType(github + ["PULL"], "github/pull"),
        AddressType(github + ["ISSUE"], "github/issue"),
        AddressType(github + ["REVIEW"], "github/review"),
        AddressType(github + ["COMMENT"], "github/comment"),
        AddressType(git + ["COMMIT"], "git/commit"),
    ]
    return _type_prefix_match(known_node_types, address)
52+
53+
54+
def edge_type(address):
    """Classify a SourceCred edge address, e.g. as "github/authors".

    The address is compared against the edge prefixes declared below for
    the sourcecred/git and sourcecred/github plugins; the type of the first
    prefix that the given edge address starts with is returned.

    Raises ValueError (from _type_prefix_match) for an address that starts
    with none of the declared prefixes.

    The SourceCred type system is still pretty ad-hoc
    (see: https://github.com/sourcecred/sourcecred/issues/710), so this
    table is likely to change in the future.
    """
    github = ["sourcecred", "github"]
    git = ["sourcecred", "git"]
    known_edge_types = [
        AddressType(github + ["HAS_PARENT"], "github/hasParent"),
        AddressType(github + ["REFERENCES"], "github/references"),
        AddressType(github + ["MENTIONS_AUTHOR"], "github/mentionsAuthor"),
        AddressType(github + ["AUTHORS"], "github/authors"),
        # NOTE(review): the next four entries mirror node type names;
        # confirm these prefixes really occur on edges.
        AddressType(github + ["PULL"], "github/pull"),
        AddressType(github + ["ISSUE"], "github/issue"),
        AddressType(github + ["REVIEW"], "github/review"),
        AddressType(github + ["COMMENT"], "github/comment"),
        AddressType(github + ["MERGED_AS"], "github/mergedAs"),
        AddressType(github + ["REACTS", "HOORAY"], "github/reactsHooray"),
        AddressType(github + ["REACTS", "THUMBS_UP"], "github/reactsThumbsUp"),
        AddressType(github + ["REACTS", "HEART"], "github/reactsHeart"),
        AddressType(github + ["REACTS", "ROCKET"], "github/reactsRocket"),
        AddressType(git + ["HAS_PARENT"], "git/hasParent"),
    ]
    return _type_prefix_match(known_edge_types, address)
104+
105+
106+
def json_to_graph(json):
    """Build a networkx MultiDiGraph from a deserialized SourceCred graph.

    `json` is the already-parsed graph: a two-element sequence of a compat
    header and the graph data. The header must declare type
    "sourcecred/graph" and version "0.4.0". The data holds "nodes" (a list
    of node addresses) and "edges" (dicts with "srcIndex", "dstIndex" and
    "address").

    Each node is keyed by its position in the "nodes" list. Every node and
    edge carries two attributes: "address" (the address as a tuple) and
    "type" (the identifier from node_type / edge_type).
    """
    compat, data = json
    # NOTE: these checks are plain asserts, so they vanish under `python -O`.
    assert compat["type"] == "sourcecred/graph", compat
    assert compat["version"] == "0.4.0", compat

    def typed_attrs(address, classify):
        # Attributes shared by nodes and edges; only the classifier differs.
        return {"address": tuple(address), "type": classify(address)}

    node_addresses = data["nodes"]
    edge_records = data["edges"]

    graph = nx.MultiDiGraph()
    for index, node_address in enumerate(node_addresses):
        graph.add_node(index, **typed_attrs(node_address, node_type))
    for record in edge_records:
        graph.add_edge(
            record["srcIndex"],
            record["dstIndex"],
            **typed_attrs(record["address"], edge_type)
        )
    return graph

infra/import_graph_test.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
import unittest
2+
import json
3+
import os
4+
from collections import Counter
5+
6+
import import_graph
7+
8+
9+
class TestTypeMatching(unittest.TestCase):
    """Spot checks for the node and edge address classifiers.

    The cases are intentionally not exhaustive: a type missing from the
    tables shows up as a ValueError in the whole-graph import tests.
    """

    def test_node_type_matching(self):
        cases = [
            (
                ["sourcecred", "github", "ISSUE", "sourcecred", "sourcecred", "34"],
                "github/issue",
            ),
            (
                ["sourcecred", "github", "REPO", "sourcecred", "sourcecred"],
                "github/repo",
            ),
        ]
        for address, expected in cases:
            self.assertEqual(import_graph.node_type(address), expected)
        # An address outside every declared prefix must be rejected.
        with self.assertRaises(ValueError):
            import_graph.node_type(["non", "existent", "node"])

    def test_edge_type_matching(self):
        has_parent_address = ["sourcecred", "github", "HAS_PARENT", "6"] + [
            "sourcecred",
            "github",
            "ISSUE",
            "sourcecred",
            "pm",
            "1",
        ]
        authors_address = (
            ["sourcecred", "github", "AUTHORS", "5"]
            + ["sourcecred", "github", "USERLIKE", "USER", "BrianLitwin", "4"]
            + [
                "sourcecred",
                "git",
                "COMMIT",
                "0cae9fa77c1d7d8b8fe3fe2d316a6782757862e4",
            ]
        )
        cases = [
            (has_parent_address, "github/hasParent"),
            (authors_address, "github/authors"),
        ]
        for address, expected in cases:
            self.assertEqual(import_graph.edge_type(address), expected)
        # An address outside every declared prefix must be rejected.
        with self.assertRaises(ValueError):
            import_graph.edge_type(["non", "existent", "edge"])
63+
64+
65+
def sample_graphs_directory():
    """Return the path of the "sample-graphs" directory beside this file's directory.

    The result is the absolute directory of this file joined with ".." and
    "sample-graphs"; the ".." component is left in place, not normalized.
    """
    here = os.path.dirname(__file__)
    return os.path.join(os.path.abspath(here), os.pardir, "sample-graphs")
68+
69+
70+
class TestImportGraph(unittest.TestCase):
    """Whole-graph import checks against the JSON files in sample-graphs."""

    def test_for_sourcecred_sourcecred(self):
        path = os.path.join(sample_graphs_directory(), "sourcecred_sourcecred.json")
        with open(path, "r") as f:
            graph = import_graph.json_to_graph(json.load(f))

        # Sanity checks use lower bounds rather than exact counts: the sample
        # graphs are expected to be re-generated regularly, and under normal
        # circumstances these counts only grow.
        node_count = Counter(attrs["type"] for _, attrs in graph.nodes(data=True))
        self.assertEqual(node_count["github/repo"], 1)
        node_floors = [
            ("github/user", 10),
            ("github/issue", 100),
            ("github/pull", 800),
            ("github/review", 200),
            ("github/comment", 500),
            ("git/commit", 800),
        ]
        for type_name, floor in node_floors:
            self.assertGreater(node_count[type_name], floor)

        # Same reasoning for the edge type counts.
        edge_count = Counter(attrs["type"] for _, _, attrs in graph.edges(data=True))
        edge_floors = [
            ("github/authors", 1000),
            ("github/references", 100),
            ("github/hasParent", 1000),
            ("github/mergedAs", 800),
            ("github/reactsThumbsUp", 20),
            ("github/reactsHooray", 5),
            ("github/reactsHeart", 10),
            ("github/reactsRocket", 1),
            ("git/hasParent", 800),
        ]
        for type_name, floor in edge_floors:
            self.assertGreater(edge_count[type_name], floor)

    def test_all_graphs_load(self):
        directory = sample_graphs_directory()
        graph_files = []
        for name in os.listdir(directory):
            if name.endswith(".json"):
                graph_files.append(os.path.join(directory, name))
        self.assertGreater(len(graph_files), 3)

        for graph_file in graph_files:
            if graph_file.endswith("sourcecred_sourcecred.json"):
                # This one is loaded in a separate test case.
                continue
            with open(graph_file, "r") as f:
                data = json.load(f)
            # Just verify it doesn't throw an error.
            import_graph.json_to_graph(data)
128+
129+
130+
if __name__ == "__main__":
    # Allows running the suite directly: python infra/import_graph_test.py
    unittest.main()

0 commit comments

Comments
 (0)