# yaseminsucu_project1.py

# -*- coding: utf-8 -*-
"""yaseminsucu.Project1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1m1_QdI9OwV5_RgiEhQ7PD1doq2cmrngi
"""

# Mount Google Drive into the Colab runtime at '/content/drive'.
# The data directory used further down lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os

from os import listdir

# Directory (on the mounted Drive) that holds the data files. The value ends
# with '/' so a bare file name can be appended to it directly.
path = '/content/drive/MyDrive/Colab Notebooks/CSC219/PROJECT 1 /datafiles/'

# Worked example with the CCR5 file: show the full path and confirm it is a str.
filename = ''.join((path, 'CCR5.fasta'))
print(filename, type(filename), sep='\n')

# Collect the names of the data files in the 'datafiles' directory.
# Only regular files are kept: Colab/Jupyter can create sub-directories such as
# '.ipynb_checkpoints' inside this folder, and opening one later on raises
# IsADirectoryError. The order returned by listdir is preserved.
data_files = [
    name for name in listdir(path)
    if os.path.isfile(os.path.join(path, name))
]

print(data_files)


# Turn every bare file name into a full path by prefixing the data directory.
# The growing list is echoed after each step so progress is visible.
data_files_filename = []

for file in data_files:
  filename = f'{path}{file}'
  data_files_filename += [filename]
  print('filename is:', filename)
  print(data_files_filename)

# Organism names: each data file name without its extension,
# e.g. 'cheetah' instead of 'cheetah.fasta'. os.path.splitext is used instead
# of slicing off a fixed six characters so any extension length is handled.
organisms = []

for filename in data_files:
  organism = os.path.splitext(filename)[0]
  organisms.append(organism)
  print(organism)

print(organisms)

# Read every data file and keep all of its lines except the first one
# (presumably the FASTA header line -- confirm against the data files).
# dnaResult ends up as a list of lists: one list of raw lines per file.
dnaResult = []
for file in data_files_filename:
    # 'with' closes the file handle even if reading fails.
    with open(file) as f:
        DNA = f.readlines()[1:]
    dnaResult.append(DNA)
print(dnaResult)

# dnaResult is a list of lists; each inner list holds the raw lines of one
# file, every line still ending in a line break. Strip the line breaks and
# join the lines so that each file becomes one continuous sequence string.
finalList = []

for dnaList in dnaResult:
  # Build a fresh list for every file. Reusing one list across files would
  # make each joined string also contain the sequences of all earlier files.
  # rstrip only removes line-break characters, so a final line without a
  # trailing '\n' keeps its last base.
  tempList = [itemString.rstrip('\r\n') for itemString in dnaList]
  tempString = ''.join(tempList)
  finalList.append(tempString)

print(finalList) #print the whole list
# Show the first two sequences; slicing avoids an IndexError when fewer than
# two files were read.
for sequence in finalList[:2]:
  print(sequence)

# Length of every joined sequence, in the same order as finalList.
total_nucleotide_read = [len(sequence) for sequence in finalList]

# Echo each length on its own line, then the complete list.
for length in total_nucleotide_read:
  print(length)

print(total_nucleotide_read)

# Map the full path of each data file to its joined sequence string.
# genesequence is a shallow copy, so finalList itself is left untouched.
genesequence = list(finalList)

genedict = dict(zip(data_files_filename, genesequence))

print(genedict)

def _count_base(sequences, base):
  """Count one base in every sequence.

  sequences maps a key (here: a data file path) to a sequence string.
  Returns a dict with the same keys whose values are tuples
  (number of occurrences of base, total sequence length).
  The comparison is case-sensitive.
  """
  return {key: (sequence.count(base), len(sequence))
          for key, sequence in sequences.items()}


# (count, total length) per file for each of the four bases.
resultDict_G = _count_base(genedict, 'G')
print(resultDict_G)

resultDict_C = _count_base(genedict, 'C')
print(resultDict_C)

resultDict_A = _count_base(genedict, 'A')
print(resultDict_A)

resultDict_T = _count_base(genedict, 'T')
print(resultDict_T)

import matplotlib.pyplot as plt
import numpy as np

def _plot_base_counts(result_dict, base, y_max):
  """Scatter-plot the count of one base against total sequence length.

  result_dict maps a data file path to a (base count, sequence length) tuple;
  one point is drawn per entry. base is the letter used in the title and the
  y-axis label, y_max is the upper limit of the y axis.

  Each point is labelled with its own file name (without directory and
  extension), so the legend always matches the plotted data regardless of
  how many files there are or the order in which they were listed.
  """
  for file_path, (base_count, total_length) in result_dict.items():
    label = os.path.splitext(os.path.basename(file_path))[0]
    plt.scatter(total_length, base_count, label=label)

  plt.title("Whole Nucleotide Sequence Over " + base + " count")
  plt.xlabel('Total DNA lenght')
  plt.ylabel(base + ' count')
  plt.ylim(0, y_max)
  plt.legend(bbox_to_anchor =(1.05, 1.03))

  plt.show()


# One figure per base: total sequence length (x) against base count (y).
_plot_base_counts(resultDict_G, 'G', 8000)
_plot_base_counts(resultDict_C, 'C', 7000)
_plot_base_counts(resultDict_A, 'A', 8000)
_plot_base_counts(resultDict_T, 'T', 8000)

#Creating a random sequence with the same ACTG content as well as matching total sequence length
#first I want to make a new list to gather the results in for each ACTG count over a total length
#then I can make a random sequence with the exact same amount


import random

# Pair every organism name with the total length of its sequence.
# Both input lists are echoed first so the pairing can be checked by eye.
print(organisms)

print(total_nucleotide_read)

organisms_total_count = {name: total for name, total in zip(organisms, total_nucleotide_read)}
print(organisms_total_count)

# GC content of every sequence: the fraction of its characters that are
# 'G' or 'C' (single bases, not 'CG' pairs). Keys are the data file paths.
resultDict_CG_content = {}

for k, v in genedict.items():
  gc_count = v.count('G') + v.count('C')
  # An empty sequence has no bases; report 0.0 instead of dividing by zero.
  resultDict_CG_content[k] = gc_count / len(v) if v else 0.0


print(resultDict_CG_content)


import random #making a randomized sequence

# Build one shuffled copy of every sequence. random.sample with k equal to the
# sequence length returns all of its characters in random order, so each
# shuffled sequence keeps the length and base composition of its original.
# The list is built exactly once and is defined even when finalList is empty.
result_random = ["".join(random.sample(items, k=len(items))) for items in finalList]

print("orig_list", finalList)
print("randomized_list", result_random)


# Key the original and the shuffled sequences by organism name.
sequence_dictionary = {name: seq for name, seq in zip(organisms, finalList)}
random_sequence_dictionary = {name: seq for name, seq in zip(organisms, result_random)}

def calCpGsite(sequence_dictionary, size=1000):
  """Count 'CG' dinucleotides per fixed-size window of every sequence.

  Args:
    sequence_dictionary: dict mapping a name to a sequence string,
      e.g. {'hiv2': 'ATCC', 'cheetah': 'GCCCTTC'}.
    size: window length in characters (default 1000). The last window of a
      sequence may be shorter.

  Returns:
    dict mapping each name to a list with one count per window, in order.
    A 'CG' is attributed to the window containing its 'C', so a pair that
    straddles a window boundary is counted once, in the earlier window.
    An empty sequence yields an empty list.

  Raises:
    ValueError: if size is smaller than 1.
  """
  if size < 1:
    raise ValueError("size must be a positive integer")

  resDict = {}

  for name, sequence in sequence_dictionary.items():
    seq_len = len(sequence)
    window_counts = []
    for start in range(0, seq_len, size):
      end = min(start + size, seq_len)  # the final window may be shorter
      # Include one extra character so a 'CG' whose 'C' is the last character
      # of the window is still seen. 'CG' cannot overlap itself, so str.count
      # returns exactly the number of start positions inside the window.
      window_counts.append(sequence[start:end + 1].count('CG'))
    resDict[name] = window_counts

  return resDict

# CpG counts per window for the original sequences: print them one organism
# at a time, then the whole dictionary.
cgCount_1 = calCpGsite(sequence_dictionary)
for organism, window_counts in cgCount_1.items():
  print("keys result: ", organism)
  print("values: ", window_counts)
print(cgCount_1)

def calCpGsite(random_sequence_dictionary, size=1000):
  """Count 'CG' dinucleotides per fixed-size window of every sequence.

  This definition replaces the earlier function of the same name; it is used
  here for the randomized sequences but works on any name -> sequence dict.

  Args:
    random_sequence_dictionary: dict mapping a name to a sequence string.
    size: window length in characters (default 1000). The last window of a
      sequence may be shorter.

  Returns:
    dict mapping each name to a list with one count per window, in order.
    A 'CG' is attributed to the window containing its 'C', so a pair that
    straddles a window boundary is counted once, in the earlier window.
    An empty sequence yields an empty list.

  Raises:
    ValueError: if size is smaller than 1.
  """
  if size < 1:
    raise ValueError("size must be a positive integer")

  resDict_1 = {}

  for name, sequence in random_sequence_dictionary.items():
    # One count per window; the slice carries one extra character so a 'CG'
    # starting on the last position of a window is not missed. 'CG' cannot
    # overlap itself, so str.count equals the number of start positions.
    resDict_1[name] = [
        sequence[start:min(start + size, len(sequence)) + 1].count('CG')
        for start in range(0, len(sequence), size)
    ]

  return resDict_1

# CpG counts per window for the shuffled sequences: print them one organism
# at a time, then the whole dictionary.
cgCount_2 = calCpGsite(random_sequence_dictionary)
for organism, window_counts in cgCount_2.items():
  print("keys result: ", organism)
  print("values: ", window_counts)

# Full shuffled result, followed by original and shuffled results back to back.
for counts in (cgCount_2, cgCount_1, cgCount_2):
  print(counts)

import pandas as pd
import seaborn as sns
import numpy as np

# Index 0 holds the counts for the original sequences, index 1 the counts for
# the shuffled ones; that index becomes the 'Sequence' column below.
list_of_dicts_cgCount = [cgCount_1, cgCount_2]

# One wide frame per sequence set: a column per organism, a row per window.
# Organisms with fewer windows are padded with NaN by the column-wise concat.
frames = []
for i, data in enumerate(list_of_dicts_cgCount):
  per_organism = pd.concat([pd.DataFrame(v, columns=[k]) for k, v in data.items()], axis=1)
  frames.append(per_organism.assign(Sequence=i))

# Stack both sets and expose the window index as a regular column.
df = (pd.concat(frames, ignore_index=False)
      .reset_index()
      .rename({'index': '1000bp'}, axis=1))

# Convert the 0-based window index into the window's end position (1000, 2000, ...)
df['1000bp'] = df['1000bp'].add(1).mul(1000)

# seaborn works with DataFrames in a long form, so melt
df = df.melt(id_vars=['Sequence', '1000bp'], var_name='Organism', value_name='Repeats')

# Bare expression: displays the frame in a notebook cell, no effect in a script.
df

g = sns.relplot(data=df, x='1000bp', y='Repeats', hue='Sequence', col='Organism')

g = sns.catplot(data=df, kind='bar', x='1000bp', y='Repeats', hue='Sequence', col='Organism')

# sns.violinplot is an axes-level function and has no 'col' parameter;
# a violin plot faceted by organism has to go through catplot(kind='violin').
g = sns.catplot(data=df, kind='violin', x='1000bp', y='Repeats', hue='Sequence', col='Organism')