forked from Gian77/Bash-For-Bioinformatics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbash_commands.sh
160 lines (114 loc) · 5.45 KB
/
bash_commands.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
##################################################
## ##
## Useful bash scripts for bioinformatics ##
## ##
## Gian MN Benucci ##
## gian[dot]benucci[at]gmail[dot]com ##
## ##
## East Lansing, Michigan ##
## Settember 24th, 2016 ##
## ##
##################################################
## BASIC COMMANDS
#*******************************************#
# count lines (emulates "wc -l")
sed -n '$='
# count sequencs in fasta file
grep -c "^>" sample1.fasta
# count sequences in a fastq file
grep -c "^@" example.fastq
cat example.fastq | echo $(("wc -l"/4))
Number of reads in multiple fastq.gz files
zcat *.fastq | echo $((`wc -l`/4))
## MORE ADVANCED COMMANDS
#*******************************************#
## Convert a .fastq file to .fasta (NOTE: this assumes that each FASTQ entry spans only four lines as is customary)
sed -n '1~4s/^@/>/p;2~4p' imputfilename.fastq > outputfilename.fasta
## extract sequences that are longer from 21 to 25 bp from a dataset
cat your.fastq | paste - - - - | awk 'length($2) >= 21 && length($2) <= 25' | sed 's/\t/\n/g' > filtered.fastq
awk 'BEGIN {OFS = "\n"} {header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= 21 && length(seq) <= 25) {print header, seq, qheader, qseq}}' < your.fastq > filtered.fastq
## get the first 400 lines of a file
sed -n -e "1,400p" ITS_R1_otus_numbered.fasta > otus_NCBI.fasta
## Count the total number of bases in a FASTA file
grep -v ">" imputfile.fasta | wc | awk '{print $3 - $1}'
## Count sequences length inside a .fastq file
cat imputfile.fastq | awk '{if(NR%4==2) print length($1)}' > seqsfastq_length.txt
cat imputfile.fasta | awk '{if(NR%4==0) print length($1)}' > seqsfasta_length.txt
cat imputfile.fasta | awk '{if(NR%%4==2) print length(\$1)}' | sort -c
## Count the total number of sequences present in each different read files
for i in $(ls *.fasta); do grep -c "^>" $i; done
for i in `ls *.fastq.gz`; do echo $(zcat ${i} | wc -l)/4|bc; done
## give the mean of the lenght of the seqs contained in a .fasta file
awk '{/>/&&++a||b+=length()}END{print b/a}' seqs.fna
## Convert .fasta and .qual to .fastq
for i in $(ls *.fasta);
do convert_fastaqual_fastq.py -f R -q seqs.qual -o fastq_files/
done
## Creating a md5 report for a series of file
md5sum *.fastq > md5.txt
## analyse content of entire folders
find -s somedir -type f -exec md5sum {} \; | md5sum
tar -cf - somedir | md5sum
## Get a histogram of sequence lengths from FASTA/Q files
cat imputfile.fastq | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c
cat imputfile.fasta | awk '{if(NR%4==0) print length($1)}' | sort -n | uniq -c
## Do arithmetic operations on the bash command line
echo $((1 + 1))
echo $((1 - 1))
echo $((1 * 1))
echo $((1 / 1))
echo $(((1+3) / (1+1)))
echo "scale=1; 1/2" | bc ## this is for floating numbers
## Replace spaces in file names with underscore
rename ' ' '_' *
## Rename sequence header using awk
awk '/^>/{print ">whateveryouwant" ++i; next}{print}' < file.fasta
## add something to end of all header lines
sed 's/>.*/>whateveryouwant/' file.fasta > outfile.fasta
find SRA_R2_ITS/ -name '*.fastq' | wc -l
## add something after > of all header lines
sed "s/>/>whateveryouwant/" input.fasta > output.fasta
sed 's/^>/>SampleA/g' INFILE.fasta>OUTFILE.fasta
## clean up a fasta file so only first column of the header is outputted
awk '{print $1}' inputfile.fasta > outputfile.fasta
### check identical sequences with bash script and collapse them
grep -v '>\|^$' inputfile.fasta | sort -u > Sequence
while read SEQUENCE; do
grep -w -m 1 -B1 $SEQUENCE inputfile.fasta >> outputfile.fasta
done < Sequence
## To remove duplicated sequences
sed -e '/^>/s/$/@/' -e 's/^>/#/' file.fasta | tr -d '\n' | tr "#" "\n" | tr "@" "\t" | sort -u -t $'\t' -f -k 2,2 | sed -e 's/^/>/' -e 's/\t/\n/'
## To extract ids from a sequence file
grep -o -E "^>\w+" file.fasta | tr -d ">"
find SRA_R2_ITS/ -name '*.fastq' | wc -l
## Linearize your sequences i.e. remove the sequence wrapping
sed -e 's/\(^>.*$\)/#\1#/' file.fasta | tr -d "\r" | tr -d "\n" | sed -e 's/$/#/' | tr "#" "\n" | sed -e '/^$/d'
## convert tab-delimited to csv
tr "\\t" "," < otu_table.txt > otu_table.csv
## counting reads among .fastq and fastq.gz files
for fastq in *assembled.fastq; do echo "$fastq : `grep -c "^+$" $fastq`"; done > assembled_reads.counts
for gz in *R1_001.fastq.gz; do echo "$gz : `gunzip -c $gz | grep -c "^+$"`"; done > raw_reads_R1.counts
# remove column 3 of a .txt file
awk '!($3="")' file1 > file2.txt
cut -f1,1,3- file1.txt > file2.txt
# rearragne columns and invert postions, using tab as a comun delimiter
awk 'BEGIN {OFS="\t"}; '{ print $1,$2,$3,$4,$15 " " $14}' file1.txt > file2.txt
# primtimg linearizied fzcat *R1_ITS.fastq | echo $((`wc -l`/4))
astq
zcat raw_data.fastq.gz | paste - - - - | head
# rename part of a file name across several files
rename 's/test-this/REPLACESTRING/g' *
# If you don't have rename use a for loop as shown below:
for i in test-this*
do
mv "$i" "${i/test-this/foo}"
done
# count the number of fastq files in a repsitory
find YOUR_PATH/ -name '*.fastq' | wc -l
# Rename files inside a directory
# adding R1 at the end of the file name before the extension .fastq
for f in *fastq ; do NEW=${f%.fastq}R1.fastq; mv ${f} "${NEW}"; done
# Rename file names bu addinf prefix or suffix words
for i in *.fastq; do mv "$i" "ITS_R1_$i"; done
for i in *.fastq;do mv $i ${i//ITS_R1_/}; done
rename 's/\.fastq$/_16S_r2.fastq/' *fastq