Skip to content

Commit d04d94b

Browse files
committed
This is the updated changes
1 parent 00dd2e6 commit d04d94b

File tree

1 file changed

+55
-37
lines changed

1 file changed

+55
-37
lines changed

src/indra_cogex/analysis/protein_analysis.py

+55-37
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def get_stmts_from_source(source_protein, target_proteins=None):
2626
2727
Parameters
2828
----------
29-
target_protein: string
29+
source_protein: string
3030
The protein of interest in relation to the protein list the user enters
3131
3232
protein_list: list
@@ -49,12 +49,27 @@ def get_stmts_from_source(source_protein, target_proteins=None):
4949
# TODO: get the same values from this result as what you got from the old
5050
# query
5151

52-
# cypher to get dataframe with all proteins that have INDRA relationship with target protein
53-
# query = f"""MATCH p=(n:BioEntity)-[r:indra_rel]->(m:BioEntity)
54-
# WHERE n.name = '{source_protein}'
55-
# RETURN m.name, r.stmt_json, m.type, m.id, r.stmt_type"""
52+
#cypher to get dataframe with all proteins that have INDRA relationship with target protein
53+
#query = f"""MATCH p=(n:BioEntity)-[r:indra_rel]->(m:BioEntity) WHERE n.name = '{source_protein}' RETURN m.name, r.stmt_json, m.type, m.id, r.stmt_type"""
5654
#res = client.query_tx(query)
57-
55+
56+
57+
jsons = []
58+
types = []
59+
ids = []
60+
stmt_types = []
61+
for i in range(len(res)):
62+
target_name = res[i].data
63+
jsons.append(res[i].data["stmt_json"])
64+
types.append(res[i].target_ns)
65+
ids.append(res[i].target_id)
66+
stmt_types.append(res[i].data["stmt_type"])
67+
protein_dict = {"stmt_json": jsons, "target_type": types, "target_id":ids, "stmt_type": stmt_types}
68+
stmts_by_protein_df = pd.DataFrame(protein_dict)
69+
70+
print(stmts_by_protein_df)
71+
print(res[0].__dict__)
72+
5873
stmts_by_protein_df = pd.DataFrame(res, columns=["name", "stmt_json", "type", "id", "indra_type"])
5974
if target_proteins:
6075
# TODO: since the target proteins are now HGNC ids, you need to change this filter
@@ -66,6 +81,7 @@ def get_stmts_from_source(source_protein, target_proteins=None):
6681
return stmts_by_protein_df, stmts_by_protein_filtered_df
6782

6883

84+
6985
def graph_barchart(filtered_df):
7086
"""Visualize frequency of interaction types among proteins that have direct
7187
INDRA relationship to target
@@ -119,7 +135,7 @@ def download_indra_htmls(filtered_df):
119135
ha.save_model('%s_statements.html' % (name+str(index)))
120136

121137

122-
def get_gene_id(protein_name):
138+
def get_gene_id(source_protein):
123139
"""Return HGNC id for protein of interest
124140
125141
Parameters
@@ -133,15 +149,15 @@ def get_gene_id(protein_name):
133149
The HGNC id for the protein of interest
134150
135151
"""
136-
hgnc_id = hgnc_client.get_hgnc_id(protein_name)
137-
if not hgnc_id:
138-
hgnc_id = hgnc_client.get_current_hgnc_id(protein_name)
139-
if not hgnc_id:
140-
print("%s is not a valid gene name" % protein_name)
152+
source_hgnc_id = hgnc_client.get_hgnc_id(source_protein)
153+
if not source_hgnc_id:
154+
source_hgnc_id = hgnc_client.get_current_hgnc_id(source_protein)
155+
if not source_hgnc_id:
156+
print("%s is not a valid gene name" % source_protein)
141157
return None
142-
return hgnc_id
158+
return source_hgnc_id
143159

144-
def get_gene_ids(protein_list):
160+
def get_gene_ids(target_proteins):
145161
"""Return HGNC ids for all proteins in the list
146162
147163
Parameters
@@ -152,15 +168,15 @@ def get_gene_ids(protein_list):
152168
Returns
153169
-------
154170
"""
155-
hgnc_ids = []
156-
for protein in protein_list:
171+
target_hgnc_ids = []
172+
for protein in target_proteins:
157173
hgnc_id = get_gene_id(protein)
158174
if hgnc_id:
159-
hgnc_ids.append(hgnc_id)
160-
return hgnc_ids
175+
target_hgnc_ids.append(hgnc_id)
176+
return target_hgnc_ids
161177

162178

163-
def shared_pathway(id_df, target_id, target_protein):
179+
def shared_pathway(id_df, target_id, source_protein):
164180
"""Find shared pathways between list of genes and target protein
165181
166182
Parameters
@@ -169,7 +185,7 @@ def shared_pathway(id_df, target_id, target_protein):
169185
Contains HGNC ids for protein_list protein list
170186
target_id: string
171187
The target proteins HGNC id
172-
target_protein: string
188+
source_protein: string
173189
The protein of interest in relation to the protein list
174190
175191
Returns
@@ -183,21 +199,21 @@ def shared_pathway(id_df, target_id, target_protein):
183199
gene_id = ids[5:]
184200
result = get_shared_pathways_for_genes((("HGNC", gene_id),("HGNC", target_id)))
185201
if not result:
186-
print("\nThere are no shared pathways for", names, "and", target_protein)
202+
print("\nThere are no shared pathways for", names, "and", source_protein)
187203
else:
188-
print("\nHere are the shared pathways for", names, "and", target_protein)
204+
print("\nHere are the shared pathways for", names, "and", source_protein)
189205
print(result)
190206

191207

192-
def child_of_target(id_df, target_id, target_protein):
208+
def child_of_target(id_df, target_id, source_protein):
193209
""" Determine if any gene in gene list isa/partof the target protein
194210
Parameters
195211
----------
196212
id_df : dataframe
197213
Contains HGNC ids for protein_list
198214
target_id : string
199215
The target proteins HGNC id
200-
target_protein : string
216+
source_protein : string
201217
The protein of interest in relation to the protein list the user enters
202218
203219
Returns
@@ -214,10 +230,10 @@ def child_of_target(id_df, target_id, target_protein):
214230
result = isa_or_partof(("HGNC", id),("HGNC", target_id))
215231

216232
if result == True:
217-
print("\n", names, "and", target_protein, "are a part of the same family")
233+
print("\n", names, "and", source_protein, "are a part of the same family")
218234
print(result)
219235
else:
220-
print("\n",names, "and", target_protein, "are not a part of the same family")
236+
print("\n",names, "and", source_protein, "are not a part of the same family")
221237

222238

223239
def get_go_terms_for_target(target_id):
@@ -400,10 +416,10 @@ def graph_boxplots(shared_complexes_df,shared_entities):
400416

401417
def run_analysis(source_hgnc_id, target_hgnc_ids):
402418
# to get dataframe with proteins that target has INDRA rel with, filtered by the user's gene list
403-
filtered_df, protein_df = get_stmts_from_source(source_hgnc_id, target_hgnc_ids)
419+
stmts_by_protein_df, stmts_by_protein_filtered_df = get_stmts_from_source(source_hgnc_id, target_hgnc_ids)
404420
print("\nThis is a dataframe of protiens that have INDRA relationships with ",
405-
target_protein, " that have been filtered for the protein list")
406-
print(filtered_df)
421+
source_hgnc_id, " that have been filtered for the protein list")
422+
print(stmts_by_protein_filtered_df)
407423

408424
# visualize frequency of interaction types among proteins that have direct
409425
# INDRA relationship to target
@@ -413,27 +429,27 @@ def run_analysis(source_hgnc_id, target_hgnc_ids):
413429
download_indra_htmls(filtered_df)
414430

415431
# to get gene ids for users gene list and target protein
416-
id_df, target_id = get_gene_ids(protein_list, target_protein)
432+
id_df, target_id = get_gene_ids(protein_list, source_protein)
417433

418434
# to find shared pathways between users gene list and target protein
419-
shared_pathway(id_df, target_id, target_protein)
435+
shared_pathway(id_df, target_id, source_protein)
420436

421437
# which proteins of interest are part of the same protein family complex
422438
# as the target
423-
child_of_target(id_df, target_id, target_protein)
439+
child_of_target(id_df, target_id, source_protein)
424440

425441
# to get go term ids for target gene
426442
target_go, go_nodes = get_go_terms_for_target(target_id)
427443

428444
# finds shared upstream bioentities between the users gene list and target protein
429445
shared_proteins, shared_entities = shared_bioentities(protein_df)
430446
print("These are the shared upstream bioentities between the gene list and",
431-
target_protein)
447+
source_protein)
432448
print(shared_entities)
433449

434450
# finds shared bioentities between users gene list and target protein using GO terms
435451
shared_complexes_df = finding_protein_complexes(target_go)
436-
print("These are shared complexes between the gene list and", target_protein)
452+
print("These are shared complexes between the gene list and", source_protein)
437453
print(shared_complexes_df)
438454

439455
# gets a list of reactome and wikipathways for shared genes
@@ -449,17 +465,19 @@ def main():
449465
'AMOT', 'PLA2G4A', 'RCN2', 'TTC9', 'FABP4', 'GPCPD1', 'VSNL1',
450466
'CRYBB1', 'PDZD8', 'FNDC3A']
451467

452-
target_hgnc_ids = get_gene_ids(target_protein_names)
453468

454469
# the protein of interest in relation to the protein list the user enters
455470
source_protein_name = "CTNNB1"
456471

457472
source_hgnc_id = get_gene_id(source_protein_name)
458-
473+
target_hgnc_ids = get_gene_ids(target_protein_names)
474+
475+
print(source_hgnc_id,target_hgnc_ids)
459476
if not source_hgnc_id or not target_hgnc_ids:
460477
print("Cannot perform analysis due to invalid gene names")
461478
return
462-
479+
480+
463481
run_analysis(source_hgnc_id, target_hgnc_ids)
464482

465483

0 commit comments

Comments
 (0)