-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgff2gaf
70 lines (56 loc) · 1.35 KB
/
gff2gaf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/perl
#Daniel Ence
my $usage="
This script takes a set of gene annotations (in gff3 format)
and a gene ontology (in obo format) and ouptuts a gene
assocation file (GAF 2.0 file). This GAF file is the input
to many GO utilities like map2slim.
";
my ($gff3_file, $obo_file) = @ARGV;
#First, get the namespaces associated to the GO terms
my %GO_hash;
my $curr_term="";
open(OBO, $obo_file);
while(<OBO>){
if(/id\:\s+(GO\:\d+)\s/){
$curr_term=$1;
}
elsif(/^namespace\:\s+(\S+)/){
$GO_hash{$curr_term}=$1;
}
}
close OBO;
#print the header for the gaf file
open(GAF, ">tmp.gaf");
print GAF "!gaf-version: 2.0\n";
#Next get the Gene IDs from the GFF3 file
open(GFF3, $gff3_file);
while(<GFF3>){
chomp;
if(/\tmRNA\t/){
#/ID\=([^\;]+).*Ontology_term\=([^\;]+)/;
$_ =~ /ID\=([^\;]+)/;
my $gene_ID=$1;
my $tmp="";
if($_ =~ /Ontology_term\=([^\;]+)/){
$tmp=$1;
}
my @ont_terms = split(/\,/,$tmp);
foreach my $term (@ont_terms){
my $namespace=$GO_hash{$term};
my $abbr="";
if($namespace == "biologial_process"){
$abbr="P";
}
elsif($namespace == "cellular_component"){
$abbr="C";
}
elsif($namespace == "molecular_function"){
$abbr="F";
}
print GAF "NULL\t$gene_ID\tNULL\tNULL\t$term\tNULL\tNULL\tNULL\t$abbr\tNULL\tNULL\tNULL\ttaxon:1\tNULL\tNULL\tNULL\tNULL\n";
}
}
}
close GFF3;
close GAF;