@@ -26,7 +26,7 @@ def get_stmts_from_source(source_protein, target_proteins=None):
26
26
27
27
Parameters
28
28
----------
29
- target_protein : string
29
+ source_protein : string
30
30
The protein of interest in relation to the protein list the user enters
31
31
32
32
protein_list: list
@@ -49,12 +49,27 @@ def get_stmts_from_source(source_protein, target_proteins=None):
49
49
# TODO: get the same values from this result as what you got from the old
50
50
# query
51
51
52
- # cypher to get dataframe with all proteins that have INDRA relationship with target protein
53
- # query = f"""MATCH p=(n:BioEntity)-[r:indra_rel]->(m:BioEntity)
54
- # WHERE n.name = '{source_protein}'
55
- # RETURN m.name, r.stmt_json, m.type, m.id, r.stmt_type"""
52
+ #cypher to get dataframe with all proteins that have INDRA relationship with target protein
53
+ #query = f"""MATCH p=(n:BioEntity)-[r:indra_rel]->(m:BioEntity) WHERE n.name = '{source_protein}' RETURN m.name, r.stmt_json, m.type, m.id, r.stmt_type"""
56
54
#res = client.query_tx(query)
57
-
55
+
56
+
57
+ jsons = []
58
+ types = []
59
+ ids = []
60
+ stmt_types = []
61
+ for i in range (len (res )):
62
+ target_name = res [i ].data
63
+ jsons .append (res [i ].data ["stmt_json" ])
64
+ types .append (res [i ].target_ns )
65
+ ids .append (res [i ].target_id )
66
+ stmt_types .append (res [i ].data ["stmt_type" ])
67
+ protein_dict = {"stmt_json" : jsons , "target_type" : types , "target_id" :ids , "stmt_type" : stmt_types }
68
+ stmts_by_protein_df = pd .DataFrame (protein_dict )
69
+
70
+ print (stmts_by_protein_df )
71
+ print (res [0 ].__dict__ )
72
+
58
73
stmts_by_protein_df = pd .DataFrame (res , columns = ["name" , "stmt_json" , "type" , "id" , "indra_type" ])
59
74
if target_proteins :
60
75
# TODO: since the target proteins are now HGNC ids, you need to change this filter
@@ -66,6 +81,7 @@ def get_stmts_from_source(source_protein, target_proteins=None):
66
81
return stmts_by_protein_df , stmts_by_protein_filtered_df
67
82
68
83
84
+
69
85
def graph_barchart (filtered_df ):
70
86
"""Visualize frequency of interaction types among proteins that have direct
71
87
INDRA relationship to target
@@ -119,7 +135,7 @@ def download_indra_htmls(filtered_df):
119
135
ha .save_model ('%s_statements.html' % (name + str (index )))
120
136
121
137
122
- def get_gene_id (protein_name ):
138
+ def get_gene_id (source_protein ):
123
139
"""Return HGNC id for protein of interest
124
140
125
141
Parameters
@@ -133,15 +149,15 @@ def get_gene_id(protein_name):
133
149
The HGNC id for the protein of interest
134
150
135
151
"""
136
- hgnc_id = hgnc_client .get_hgnc_id (protein_name )
137
- if not hgnc_id :
138
- hgnc_id = hgnc_client .get_current_hgnc_id (protein_name )
139
- if not hgnc_id :
140
- print ("%s is not a valid gene name" % protein_name )
152
+ source_hgnc_id = hgnc_client .get_hgnc_id (source_protein )
153
+ if not source_hgnc_id :
154
+ source_hgnc_id = hgnc_client .get_current_hgnc_id (source_protein )
155
+ if not source_hgnc_id :
156
+ print ("%s is not a valid gene name" % source_protein )
141
157
return None
142
- return hgnc_id
158
+ return source_hgnc_id
143
159
144
- def get_gene_ids (protein_list ):
160
+ def get_gene_ids (target_proteins ):
145
161
"""Return HGNC ids for all proteins in the list
146
162
147
163
Parameters
@@ -152,15 +168,15 @@ def get_gene_ids(protein_list):
152
168
Returns
153
169
-------
154
170
"""
155
- hgnc_ids = []
156
- for protein in protein_list :
171
+ target_hgnc_ids = []
172
+ for protein in target_proteins :
157
173
hgnc_id = get_gene_id (protein )
158
174
if hgnc_id :
159
- hgnc_ids .append (hgnc_id )
160
- return hgnc_ids
175
+ target_hgnc_ids .append (hgnc_id )
176
+ return target_hgnc_ids
161
177
162
178
163
- def shared_pathway (id_df , target_id , target_protein ):
179
+ def shared_pathway (id_df , target_id , source_protein ):
164
180
"""Find shared pathways between list of genes and target protein
165
181
166
182
Parameters
@@ -169,7 +185,7 @@ def shared_pathway(id_df, target_id, target_protein):
169
185
Contains HGNC ids for protein_list
170
186
target_id: string
171
187
The target protein's HGNC id
172
- target_protein : string
188
+ source_protein : string
173
189
The protein of interest in relation to the protein list
174
190
175
191
Returns
@@ -183,21 +199,21 @@ def shared_pathway(id_df, target_id, target_protein):
183
199
gene_id = ids [5 :]
184
200
result = get_shared_pathways_for_genes ((("HGNC" , gene_id ),("HGNC" , target_id )))
185
201
if not result :
186
- print ("\n There are no shared pathways for" , names , "and" , target_protein )
202
+ print ("\n There are no shared pathways for" , names , "and" , source_protein )
187
203
else :
188
- print ("\n Here are the shared pathways for" , names , "and" , target_protein )
204
+ print ("\n Here are the shared pathways for" , names , "and" , source_protein )
189
205
print (result )
190
206
191
207
192
- def child_of_target (id_df , target_id , target_protein ):
208
+ def child_of_target (id_df , target_id , source_protein ):
193
209
""" Determine if any gene in gene list isa/partof the target protein
194
210
Parameters
195
211
----------
196
212
id_df : dataframe
197
213
Contains HGNC ids for protein_list
198
214
target_id : string
199
215
The target protein's HGNC id
200
- target_protein : string
216
+ source_protein : string
201
217
The protein of interest in relation to the protein list the user enters
202
218
203
219
Returns
@@ -214,10 +230,10 @@ def child_of_target(id_df, target_id, target_protein):
214
230
result = isa_or_partof (("HGNC" , id ),("HGNC" , target_id ))
215
231
216
232
if result == True :
217
- print ("\n " , names , "and" , target_protein , "are a part of the same family" )
233
+ print ("\n " , names , "and" , source_protein , "are a part of the same family" )
218
234
print (result )
219
235
else :
220
- print ("\n " ,names , "and" , target_protein , "are not a part of the same family" )
236
+ print ("\n " ,names , "and" , source_protein , "are not a part of the same family" )
221
237
222
238
223
239
def get_go_terms_for_target (target_id ):
@@ -400,10 +416,10 @@ def graph_boxplots(shared_complexes_df,shared_entities):
400
416
401
417
def run_analysis (source_hgnc_id , target_hgnc_ids ):
402
418
# to get dataframe with proteins that target has INDRA rel with, filtered by user's gene list
403
- filtered_df , protein_df = get_stmts_from_source (source_hgnc_id , target_hgnc_ids )
419
+ stmts_by_protein_df , stmts_by_protein_filtered_df = get_stmts_from_source (source_hgnc_id , target_hgnc_ids )
404
420
print ("\n This is a dataframe of protiens that have INDRA relationships with " ,
405
- target_protein , " that have been filtered for the protein list" )
406
- print (filtered_df )
421
+ source_hgnc_id , " that have been filtered for the protein list" )
422
+ print (stmts_by_protein_filtered_df )
407
423
408
424
# visualize frequency of interaction types among proteins that have direct
409
425
# INDRA relationship to target
@@ -413,27 +429,27 @@ def run_analysis(source_hgnc_id, target_hgnc_ids):
413
429
download_indra_htmls (filtered_df )
414
430
415
431
# to get gene ids for user's gene list and target protein
416
- id_df , target_id = get_gene_ids (protein_list , target_protein )
432
+ id_df , target_id = get_gene_ids (protein_list , source_protein )
417
433
418
434
# to find shared pathways between user's gene list and target protein
419
- shared_pathway (id_df , target_id , target_protein )
435
+ shared_pathway (id_df , target_id , source_protein )
420
436
421
437
# which proteins of interest are part of the same protein family complex
422
438
# as the target
423
- child_of_target (id_df , target_id , target_protein )
439
+ child_of_target (id_df , target_id , source_protein )
424
440
425
441
# to get go term ids for target gene
426
442
target_go , go_nodes = get_go_terms_for_target (target_id )
427
443
428
444
# finds shared upstream bioentities between the user's gene list and target protein
429
445
shared_proteins , shared_entities = shared_bioentities (protein_df )
430
446
print ("These are the shared upstream bioentities between the gene list and" ,
431
- target_protein )
447
+ source_protein )
432
448
print (shared_entities )
433
449
434
450
# finds shared bioentities between the user's gene list and target protein using GO terms
435
451
shared_complexes_df = finding_protein_complexes (target_go )
436
- print ("These are shared complexes between the gene list and" , target_protein )
452
+ print ("These are shared complexes between the gene list and" , source_protein )
437
453
print (shared_complexes_df )
438
454
439
455
# gets a list of reactome and wikipathways for shared genes
@@ -449,17 +465,19 @@ def main():
449
465
'AMOT' , 'PLA2G4A' , 'RCN2' , 'TTC9' , 'FABP4' , 'GPCPD1' , 'VSNL1' ,
450
466
'CRYBB1' , 'PDZD8' , 'FNDC3A' ]
451
467
452
- target_hgnc_ids = get_gene_ids (target_protein_names )
453
468
454
469
# the protein of interest in relation to the protein list the user enters
455
470
source_protein_name = "CTNNB1"
456
471
457
472
source_hgnc_id = get_gene_id (source_protein_name )
458
-
473
+ target_hgnc_ids = get_gene_ids (target_protein_names )
474
+
475
+ print (source_hgnc_id ,target_hgnc_ids )
459
476
if not source_hgnc_id or not target_hgnc_ids :
460
477
print ("Cannot perform analysis due to invalid gene names" )
461
478
return
462
-
479
+
480
+
463
481
run_analysis (source_hgnc_id , target_hgnc_ids )
464
482
465
483
0 commit comments