Skip to content

Commit 8a48bca

Browse files
add scripts
1 parent e2cf95a commit 8a48bca

20 files changed

+2377
-0
lines changed

2D_scatterplots.py

+157
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[1]:
5+
6+
7+
# I'm gonna make a 2D scatterplot with density contour here.
8+
import sys
9+
import seaborn as sb
10+
import numpy as np
11+
import pandas as pd
12+
import matplotlib
13+
import matplotlib.colors as mcolors
14+
import matplotlib.pyplot as plt
15+
import matplotlib.font_manager as font_manager
16+
pd.options.mode.chained_assignment = None # stop annoying warning when clipping columns. default='warn'
17+
18+
19+
# In[2]:
20+
21+
22+
#userInputFile = '/Users/jra/Desktop/CTCF_tmp/loops/CTCF_D4_pad1000_FDR0.05_quickAssoc_no11.csv'
23+
#userInputFile = '/Users/jra/Desktop/CTCF_tmp/loops/H3K27ac/ESC_vs_EpiLC_FDR0.05_quickAssoc_no11.csv'
24+
userInputFile = '/Users/jra/Library/CloudStorage/Dropbox/3dname/28_feb_2023/CnR_D0-4_WT-TKO_peaks_RPKM_DNAme_filtered_sort_limma_thresholded.txt'
25+
inputFile = pd.read_csv(userInputFile, sep='\t')
26+
print(inputFile.columns.tolist())
27+
28+
29+
# In[3]:
30+
31+
32+
#x1 = 'CTCF_D4_DnmtWT_rep1.filt.intra'
33+
#x2 = 'CTCF_D4_DnmtWT_rep2.filt.intra'
34+
#y1 = 'CTCF_D4_DnmtTKO_rep1.filt.intra'
35+
#y2 = 'CTCF_D4_DnmtTKO_rep2.filt.intra'
36+
x1 = 'D0_DnmtWT_rep1.filt.intra'
37+
x2 = 'D0_DnmtWT_rep2.filt.intra'
38+
y1 = 'D4_DnmtWT_rep1.filt.intra'
39+
y2 = 'D4_DnmtWT_rep2.filt.intra'
40+
#x1 = 'CTCF_D0_DnmtWT_rep1.filt.intra'
41+
#x2 = 'CTCF_D0_DnmtWT_rep2.filt.intra'
42+
#y1 = 'CTCF_D4_DnmtWT_rep1.filt.intra'
43+
#y2 = 'CTCF_D4_DnmtWT_rep2.filt.intra'
44+
45+
x = 'E14_D4_WT_CTCF_CUTnRUN_AMS042021_rep1-2.bam_RPKM'
46+
y = 'E14_D4_TKO_CTCF_CUTnRUN_AMS042021_rep1-2.bam_RPKM'
47+
#x = 'E14_D0_WT_CTCF_CUTnRUN_AMS042021_rep1-2.bam_RPKM'
48+
#y = 'E14_D0_TKO_CTCF_CUTnRUN_AMS042021_rep1-2.bam_RPKM'
49+
50+
logFC = 'logFC'
51+
#FDR = 'FDR' # HiChIP
52+
FDR = 'adj.P.Val' # CnR
53+
# region = 'region'
54+
55+
#df = inputFile[[x1,x2,y1,y2,logFC,FDR,region]] # HiChIP
56+
df = inputFile[[x,y,logFC,FDR]] # CnR
57+
df
58+
59+
60+
# In[4]:
61+
62+
63+
# sum the replicates for HiChIP data
64+
# df['x'] = df[x1] + df[x2]
65+
# df['y'] = df[y1] + df[y2]
66+
67+
# reverse for non-diffloops comparisons
68+
#df[logFC] = -df[logFC] # diffloops flipped on me
69+
df[logFC] = df[logFC] # diffloops flipped on me
70+
71+
# set the filtering thresholds
72+
FDR_cutoff = 0.05
73+
logFC_cutoff = 1
74+
75+
# set the filtering conditions
76+
conditions = [
77+
(df[FDR] < FDR_cutoff) & (df[logFC] > logFC_cutoff),
78+
(df[FDR] < FDR_cutoff) & (df[logFC] < -logFC_cutoff)
79+
]
80+
81+
# name the conditions
82+
names = ['UP', 'DN']
83+
#colours = {'NN':'0.333, 0.333, 0.333, 0.1', 'UP':'0.333, 0.333, 0.333, 1', 'DN':'0.878, 0.267, 0.278, 1'}
84+
# add a new column with the names that correspond to the conditions met
85+
df['thresh'] = np.select(conditions, names, default='NN')
86+
87+
88+
colors_dict = {'UP':(0.878, 0.267, 0.278, 1.000),
89+
'DN':(0.333, 0.333, 0.333, 0.050),
90+
'NN':(0.333, 0.333, 0.333, 0.050),}
91+
#colors_dict = {'UP':(0.004, 0.627, 0.451, 1.000),
92+
# 'DN':(0.62, 0.208, 0.62, 1.000),
93+
# 'NN':(0.333, 0.333, 0.333, 0.050),}
94+
95+
96+
97+
colors = [mcolors.to_rgba(colors_dict[c]) for c in df['thresh']]
98+
# count the number of instances each condition is met
99+
counts = df['thresh'].value_counts()
100+
print(counts)
101+
102+
103+
# In[7]:
104+
105+
106+
# matplotlib.rcParams['svg.fonttype'] = 'none' # saves fonts as a text object instead of a vector path
107+
small_size = 12
108+
medium_size = 16
109+
plt.rcParams['font.size'] = medium_size
110+
plt.rcParams["figure.figsize"] = (6,6)
111+
font_path = '/opt/X11/share/system_fonts/HelveticaNeue.ttc'
112+
font_name = 'Helvetica Neue'
113+
prop = font_manager.FontProperties(fname=font_path)
114+
115+
# fig = df.plot(kind="scatter", x = 'x', y = 'y', c = colors, s=12) # HiChIP
116+
fig = df.plot(kind="scatter", x = x, y = y, c = colors, s=12, marker='o', edgecolors='none')
117+
118+
fig.set_xscale('log')
119+
fig.set_yscale('log')
120+
121+
# this is really stupid, there must be a better way
122+
for idx, count in enumerate(counts):
123+
fig.text(1.1, 0.95 - idx*0.1, f"{counts.index[idx]}: {count}", ha='left', va='top', transform=fig.transAxes)
124+
125+
fig.set_xlabel("ESC contacts", fontname=font_name, fontproperties=prop)
126+
fig.set_ylabel("EpiLC contacts", fontname=font_name, fontproperties=prop)
127+
fig.set_title("HiChIP loops", fontname=font_name, fontproperties=prop)
128+
129+
for tick in fig.get_xticklabels():
130+
tick.set_fontname(font_name)
131+
tick.set_fontsize(small_size)
132+
for tick in fig.get_yticklabels():
133+
tick.set_fontname(font_name)
134+
tick.set_fontsize(small_size)
135+
136+
fig.spines['bottom'].set_linewidth(0.5)
137+
fig.spines['left'].set_linewidth(0.5)
138+
fig.spines['top'].set_linewidth(0)
139+
fig.spines['right'].set_linewidth(0)
140+
fig.tick_params(width=0.5)
141+
142+
143+
# In[8]:
144+
145+
146+
datapoint_count = len(df)
147+
#outFigure = "%s_2D_scatter_%s_datapoints.svg" % (userInputFile, datapoint_count)
148+
#fig.figure.savefig(outFigure)
149+
outFigure = "%s_2D_scatter_%s_datapoints.png" % (userInputFile, datapoint_count)
150+
fig.figure.savefig(outFigure, dpi=1200, bbox_inches='tight', transparent=True)
151+
152+
153+
# In[ ]:
154+
155+
156+
157+

CUTnRUN_PCA.py

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# In[101]:
5+
6+
7+
import sys
8+
import numpy as np
9+
import pandas as pd
10+
import seaborn as sb
11+
import matplotlib
12+
import matplotlib.pyplot as plt
13+
pd.options.mode.chained_assignment = None # stop annoying warning when clipping columns. default='warn'
14+
from combat.pycombat import pycombat
15+
16+
17+
# In[97]:
18+
19+
20+
# prepare data
21+
# the indexes correspond to the gene names
22+
# the column names correspond to the sample names
23+
24+
# CUT n RUN
25+
input_df = pd.read_csv("/Users/jra/Desktop/CTCF_tmp/data_filtered_sort_limma.txt",
26+
sep='\t', index_col=0)
27+
28+
29+
# HiChIP 1D
30+
#input_df = pd.read_csv("/Users/jra/Dropbox/3dname/28_feb_2023/E14_D0-4_WT-TKO_CTCF_HiChIP_rep1-2_R1-2_trimV1_mm10_macs2_broad_peaks_DNAme_filtered_1CpG_data_named.txt",
31+
# sep='\t', index_col=0)
32+
33+
34+
35+
36+
37+
for column_name in input_df.columns:
38+
print(column_name)
39+
print(input_df.shape)
40+
#print(df_expression.columns)
41+
#print(df_expression.head(2))
42+
43+
44+
# In[98]:
45+
46+
47+
# CUT n RUN
48+
df = input_df[["E14_D0_WT_CTCF_CUTnRUN_AMS042021_rep1_crop36_trimV5_mm10.bam_RPKM",
49+
"E14_D0_WT_CTCF_CUTnRUN_AMS042021_rep2_crop36_trimV5_mm10.bam_RPKM",
50+
"E14_D0_TKO_CTCF_CUTnRUN_AMS042021_rep1_crop36_trimV5_mm10.bam_RPKM",
51+
"E14_D0_TKO_CTCF_CUTnRUN_AMS042021_rep2_crop36_trimV5_mm10.bam_RPKM",
52+
"E14_D4_WT_CTCF_CUTnRUN_AMS042021_rep1_crop36_trimV5_mm10.bam_RPKM",
53+
"E14_D4_WT_CTCF_CUTnRUN_AMS042021_rep2_crop36_trimV5_mm10.bam_RPKM",
54+
"E14_D4_TKO_CTCF_CUTnRUN_AMS042021_rep1_crop36_trimV5_mm10.bam_RPKM",
55+
"E14_D4_TKO_CTCF_CUTnRUN_AMS042021_rep2_crop36_trimV5_mm10.bam_RPKM"]]
56+
57+
58+
59+
# HiChIP 1D
60+
#df = input_df[["E14_D4_WT_CTCF_HiChIP_deep_shallow_AMS112022_rep1_R1-2_trimV1_mm10.bam_RPKM",
61+
# "E14_D0_WT_CTCF_HiChIP_deep_shallow_AMS112022_rep1_R1-2_trimV1_mm10.bam_RPKM",
62+
# "E14_D4_TKO_CTCF_HiChIP_deep_shallow_AMS112022_rep1_R1-2_trimV1_mm10.bam_RPKM",
63+
# "E14_D0_TKO_CTCF_HiChIP_deep_shallow_AMS112022_rep1_R1-2_trimV1_mm10.bam_RPKM",
64+
# "E14_D4_TKO_CTCF_HiChIP_AMS012023_rep2_R1-2_trimV1_mm10.bam_RPKM",
65+
# "E14_D0_WT_CTCF_HiChIP_AMS012023_rep2_R1-2_trimV1_mm10.bam_RPKM",
66+
# "E14_D4_WT_CTCF_HiChIP_AMS012023_rep2_R1-2_trimV1_mm10.bam_RPKM",
67+
# "E14_D0_TKO_CTCF_HiChIP_AMS012023_rep2_R1-2_trimV1_mm10.bam_RPKM"]]
68+
69+
df
70+
#df_expression
71+
# SOMETHING IS WRONG WITH THE TABLE LETS FIND OUT WHAT
72+
# df_expression.isnull().values.any()
73+
# df_expression.isnull().sum()
74+
# null_data = df_expression[df_expression.isnull().any(axis=1)]
75+
# null_data
76+
# should return 0 rows if the table is nice
77+
78+
df = df.loc[(df!=0).any(1)] # drop rows that have all 0 values
79+
print("Dropping rows with only 0 values")
80+
print(df.shape)
81+
df
82+
83+
84+
# In[99]:
85+
86+
87+
matplotlib.rcParams['svg.fonttype'] = 'none' # saves fonts as a text object instead of a vector path
88+
plt.rcParams['font.size'] = 8
89+
plt.rcParams["figure.figsize"] = (20,4)
90+
91+
# plot the distribution of log2(RPKM+1) gene expression values (uncorrected for batch effects)
92+
boxPlot = plt.boxplot(np.log2(df.clip(upper=None, lower=0)+1), \
93+
labels=df.columns, \
94+
patch_artist=True, notch=True, \
95+
medianprops={'color':'Black'}, \
96+
flierprops={'markersize':2, 'marker':'o', 'markerfacecolor':'white', 'markeredgecolor':'black', 'markeredgewidth':0.5})
97+
plt.xticks(rotation=90)
98+
plt.ylim(-2.1, None)
99+
plt.show() # showing the figure "moves" the figure to Shell, leaving nothing behind. Show after saving file.
100+
101+
102+
# In[100]:
103+
104+
105+
from sklearn.decomposition import PCA
106+
107+
samples = df.transpose()
108+
#samples = df
109+
pca = PCA(2)
110+
projected = pca.fit_transform(samples)
111+
print(df.shape)
112+
print(samples.shape)
113+
print(projected.shape)
114+
115+
pca1variation = (pca.explained_variance_ratio_ * 100)[0].round(2)
116+
pca2variation = (pca.explained_variance_ratio_ * 100)[1].round(2)
117+
118+
# use the explained variance to size the output graph
119+
plt.rcParams["figure.figsize"] = ((pca1variation/4).astype(int), (pca2variation/4).astype(int))
120+
plt.scatter(projected[:, 0], projected[:, 1],
121+
edgecolor='none', alpha=1, s=200)
122+
123+
x_title = 'PC1: ' + pca1variation.astype(str) + '% of variation explained'
124+
y_title = 'PC2: ' + pca2variation.astype(str) + '% of variation explained'
125+
plt.xlabel(x_title)
126+
plt.ylabel(y_title)
127+
plt.title('made by Julien using python in a jupyter notebook')
128+
129+
for i, label in enumerate(df.columns):
130+
plt.annotate(label, (projected[:,0][i], projected[:,1][i]))
131+
outFigure = "/Users/jra/Desktop/PCA_plot_PCscaled.svg"
132+
plt.savefig(outFigure)
133+
plt.close()
134+
135+
plt.rcParams["figure.figsize"] = (4,4)
136+
plt.scatter(projected[:, 0], projected[:, 1],
137+
edgecolor='none', alpha=1, s=200)
138+
139+
x_title = 'PC1: ' + pca1variation.astype(str) + '% of variation explained'
140+
y_title = 'PC2: ' + pca2variation.astype(str) + '% of variation explained'
141+
plt.xlabel(x_title)
142+
plt.ylabel(y_title)
143+
plt.title('made by Julien using python in a jupyter notebook')
144+
145+
for i, label in enumerate(df.columns):
146+
plt.annotate(label, (projected[:,0][i], projected[:,1][i]))
147+
outFigure = "/Users/jra/Desktop/PCA_plot.svg"
148+
plt.savefig(outFigure)
149+
#plt.close()
150+

0 commit comments

Comments
 (0)