Skip to content

Commit 36ce285

Browse files
make graph more meaningful (#1977)
* Make the graph more meaningful: when a search is launched with multiple usernames, an additional site node is created for each site where both usernames are found. Advantages: - better recognition that the users have a connection with each other - better detection of false positives when launching a search with two fake usernames (a shared site node = a definite false positive) * Fix graph linking in report.py
1 parent c2e3e96 commit 36ce285

File tree

1 file changed

+85
-74
lines changed

1 file changed

+85
-74
lines changed

maigret/report.py

+85-74
Original file line numberDiff line numberDiff line change
@@ -98,118 +98,129 @@ class MaigretGraph:
9898
def __init__(self, graph):
9999
self.G = graph
100100

101-
def add_node(self, key, value):
101+
def add_node(self, key, value, color=None):
102102
node_name = f'{key}: {value}'
103103

104-
params = self.other_params
104+
params = dict(self.other_params)
105105
if key in SUPPORTED_IDS:
106-
params = self.username_params
106+
params = dict(self.username_params)
107107
elif value.startswith('http'):
108-
params = self.site_params
109-
110-
self.G.add_node(node_name, title=node_name, **params)
111-
112-
if value != value.lower():
113-
normalized_node_name = self.add_node(key, value.lower())
114-
self.link(node_name, normalized_node_name)
108+
params = dict(self.site_params)
109+
110+
params['title'] = node_name
111+
if color:
112+
params['color'] = color
115113

114+
self.G.add_node(node_name, **params)
116115
return node_name
117116

118117
def link(self, node1_name, node2_name):
119118
self.G.add_edge(node1_name, node2_name, weight=2)
120119

121120

122121
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
123-
# moved here to speed up the launch of Maigret
124122
import networkx as nx
125123

126124
G = nx.Graph()
127125
graph = MaigretGraph(G)
128126

129-
for username, id_type, results in username_results:
130-
username_node_name = graph.add_node(id_type, username)
127+
base_site_nodes = {}
128+
site_account_nodes = {}
129+
processed_values = {} # Track processed values to avoid duplicates
131130

132-
for website_name in results:
133-
dictionary = results[website_name]
134-
# TODO: fix no site data issue
135-
if not dictionary:
136-
continue
131+
for username, id_type, results in username_results:
132+
# Add username node, using normalized version directly if different
133+
norm_username = username.lower()
134+
username_node_name = graph.add_node(id_type, norm_username)
137135

138-
if dictionary.get("is_similar"):
136+
for website_name, dictionary in results.items():
137+
if not dictionary or dictionary.get("is_similar"):
139138
continue
140139

141140
status = dictionary.get("status")
142-
if not status: # FIXME: currently in case of timeout
141+
if not status or status.status != MaigretCheckStatus.CLAIMED:
143142
continue
144143

145-
if dictionary["status"].status != MaigretCheckStatus.CLAIMED:
146-
continue
144+
# base site node
145+
site_base_url = website_name
146+
if site_base_url not in base_site_nodes:
147+
base_site_nodes[site_base_url] = graph.add_node('site', site_base_url, color='#28a745') # Green color
147148

148-
site_fallback_name = dictionary.get(
149-
'url_user', f'{website_name}: {username.lower()}'
150-
)
151-
# site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
152-
site_node_name = graph.add_node('site', site_fallback_name)
153-
graph.link(username_node_name, site_node_name)
149+
site_base_node_name = base_site_nodes[site_base_url]
150+
151+
# account node
152+
account_url = dictionary.get('url_user', f'{site_base_url}/{norm_username}')
153+
account_node_id = f"{site_base_url}: {account_url}"
154+
if account_node_id not in site_account_nodes:
155+
site_account_nodes[account_node_id] = graph.add_node('account', account_url)
156+
157+
account_node_name = site_account_nodes[account_node_id]
158+
159+
# link username → account → site
160+
graph.link(username_node_name, account_node_name)
161+
graph.link(account_node_name, site_base_node_name)
154162

155163
def process_ids(parent_node, ids):
156164
for k, v in ids.items():
157-
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
158-
continue
159-
if k in 'image':
165+
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at') or k in 'image':
160166
continue
161167

162-
v_data = v
163-
if v.startswith('['):
164-
try:
165-
v_data = ast.literal_eval(v)
166-
except Exception as e:
167-
logging.error(e)
168-
169-
# value is a list
170-
if isinstance(v_data, list):
171-
list_node_name = graph.add_node(k, site_fallback_name)
172-
for vv in v_data:
173-
data_node_name = graph.add_node(vv, site_fallback_name)
174-
graph.link(list_node_name, data_node_name)
175-
176-
add_ids = {
177-
a: b for b, a in db.extract_ids_from_url(vv).items()
178-
}
179-
if add_ids:
180-
process_ids(data_node_name, add_ids)
168+
# Normalize value if string
169+
norm_v = v.lower() if isinstance(v, str) else v
170+
value_key = f"{k}:{norm_v}"
171+
172+
if value_key in processed_values:
173+
ids_data_name = processed_values[value_key]
181174
else:
182-
# value is just a string
183-
# ids_data_name = f'{k}: {v}'
184-
# if ids_data_name == parent_node:
185-
# continue
186-
187-
ids_data_name = graph.add_node(k, v)
188-
# G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
189-
graph.link(parent_node, ids_data_name)
190-
191-
# check for username
192-
if 'username' in k or k in SUPPORTED_IDS:
193-
new_username_node_name = graph.add_node('username', v)
194-
graph.link(ids_data_name, new_username_node_name)
175+
v_data = v
176+
if isinstance(v, str) and v.startswith('['):
177+
try:
178+
v_data = ast.literal_eval(v)
179+
except Exception as e:
180+
logging.error(e)
181+
continue
182+
183+
if isinstance(v_data, list):
184+
list_node_name = graph.add_node(k, site_base_url)
185+
processed_values[value_key] = list_node_name
186+
for vv in v_data:
187+
data_node_name = graph.add_node(vv, site_base_url)
188+
graph.link(list_node_name, data_node_name)
189+
190+
add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
191+
if add_ids:
192+
process_ids(data_node_name, add_ids)
193+
ids_data_name = list_node_name
194+
else:
195+
ids_data_name = graph.add_node(k, norm_v)
196+
processed_values[value_key] = ids_data_name
197+
198+
if 'username' in k or k in SUPPORTED_IDS:
199+
new_username_key = f"username:{norm_v}"
200+
if new_username_key not in processed_values:
201+
new_username_node_name = graph.add_node('username', norm_v)
202+
processed_values[new_username_key] = new_username_node_name
203+
graph.link(ids_data_name, new_username_node_name)
204+
205+
add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
206+
if add_ids:
207+
process_ids(ids_data_name, add_ids)
195208

196-
add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
197-
if add_ids:
198-
process_ids(ids_data_name, add_ids)
209+
graph.link(parent_node, ids_data_name)
199210

200211
if status.ids_data:
201-
process_ids(site_node_name, status.ids_data)
212+
process_ids(account_node_name, status.ids_data)
202213

203-
nodes_to_remove = []
204-
for node in G.nodes:
205-
if len(str(node)) > 100:
206-
nodes_to_remove.append(node)
214+
# Remove overly long nodes
215+
nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
216+
G.remove_nodes_from(nodes_to_remove)
207217

208-
[G.remove_node(node) for node in nodes_to_remove]
218+
# Remove site nodes with only one connection
219+
single_degree_sites = [n for n, deg in G.degree() if n.startswith("site:") and deg <= 1]
220+
G.remove_nodes_from(single_degree_sites)
209221

210-
# moved here to speed up the launch of Maigret
222+
# Generate interactive visualization
211223
from pyvis.network import Network
212-
213224
nt = Network(notebook=True, height="750px", width="100%")
214225
nt.from_nx(G)
215226
nt.show(filename)

0 commit comments

Comments
 (0)