#! /usr/bin/perl -w
# Extract gene annotation from an Ensembl / PlantGDB GenBank flat file.
#
# USAGE: perl <this script> <GenBank file, plain or .gz> <output table>
#
# One output line is written per "gene" feature. Each line has 9 columns,
# every column (including the last) followed by a tab; "-" marks a value
# that was not found:
#   1 Gene ID (/gene)        2 Sequence name (LOCUS)   3 Start site
#   4 End site               5 Direction (+ or -)      6 Locus tag
#   7 GO terms               8 EntrezGene IDs          9 Note (/note)
# Columns 7 and 8 are ";"-terminated lists collected from db_xref
# qualifiers seen between a gene feature and the next gene feature.
use strict;
use warnings;

my $usage = "USAGE: perl $0 <DAT.gz> <gbk>\n\n";

# Both the input and the output path are required.
die $usage if @ARGV < 2;
my ( $genbank_path, $table_path ) = @ARGV;

# Record keys in output-column order.
my @COLUMNS = qw(gene chr start end strand locus_tag go entrez note);

my $in    = open_genbank($genbank_path);
my $genes = parse_genbank($in);

# For the gunzip pipe, close() also reports a non-zero exit status of the
# child ($! is then 0 and $? holds the status), e.g. for a corrupt archive.
close $in
    or die "Error while reading '$genbank_path': "
    . ( $! ? "$!" : 'gunzip exit status ' . ( $? >> 8 ) ) . "\n";

write_table( $genes, $table_path );

# Open the GenBank file for reading and return the filehandle.
# Paths ending in ".gz" are decompressed through "gunzip -dc".
sub open_genbank {
    my ($path) = @_;
    my $fh;
    if ( $path =~ /\.gz$/ ) {

        # List-form pipe open: the path is handed to gunzip as a single
        # argument and never interpreted by a shell.
        open $fh, '-|', 'gunzip', '-dc', $path
            or die "Cannot run gunzip on '$path': $!\n";
    }
    else {
        open $fh, '<', $path or die "Cannot open '$path': $!\n";
    }
    return $fh;
}

# Read GenBank text from the filehandle and return a reference to an array
# of gene records (hash refs keyed by the names in @COLUMNS).
# Dies when the location of a gene feature cannot be parsed.
sub parse_genbank {
    my ($fh) = @_;

    my @genes;
    my $chr;            # sequence name taken from the current LOCUS line
    my $gene;           # record of the most recent gene feature, if any
    my $in_gene = 0;    # true while reading qualifiers of a gene feature
    my $in_note = 0;    # true while a /note="..." value is still open

  LINE:
    while ( my $line = <$fh> ) {
        chomp $line;
        $line =~ s/\r\z//;    # tolerate CRLF line endings

        if ( $line =~ /^LOCUS/ ) {

            # The name may be separated from the keyword by several spaces.
            ($chr) = $line =~ /^LOCUS\s+(\S+)/;
            next LINE;
        }

        # Any feature-key line closes the previous feature and, with it,
        # any note that was still open.
        if ( $line =~ /^..\s{3,5}\S+\s+/ ) {
            $in_gene = 0;
            $in_note = 0;
        }

        if ( $line =~ /^..\s{3,5}gene\s+(.*)$/ ) {
            my $location = $1;

            # Only the first "start..end" range of the location is used;
            # a "<" or ">" partial marker on either end is tolerated.
            my ( $start, $end ) =
                $location =~ /([1-9][0-9]*)\.\.[<>]?([1-9][0-9]*)/
                or die "Error: cannot read gene location at input line $.: $line\n";

            $gene = {
                chr    => $chr,
                start  => $start,
                end    => $end,
                strand => ( $location =~ /complement\(/ ? '-' : '+' ),
            };
            push @genes, $gene;
            $in_gene = 1;
            next LINE;
        }

        # Nothing can be attributed before the first gene feature.
        next LINE if !defined $gene;

        if ($in_gene) {
            if ($in_note) {

                # Continuation line of a wrapped note; a trailing quote
                # terminates the note.
                if ( $line =~ /^..\s{7,}([^"]+)(")?$/ ) {
                    my ( $text, $closed ) = ( $1, $2 );
                    $gene->{note} .= ' ' if $gene->{note} ne '';
                    $gene->{note} .= $text;
                    $in_note = 0 if defined $closed;
                    next LINE;
                }

                # Not a continuation: treat the note as finished and let
                # this line be examined like any other qualifier line.
                $in_note = 0;
            }

            if ( $line =~ m{/gene="?([^"]+)"?$} ) {
                $gene->{gene} = $1;
                next LINE;
            }
            if ( $line =~ m{/locus_tag="(.+)"$} ) {
                $gene->{locus_tag} = $1;
                next LINE;
            }

            # Non-greedy capture so the optional closing quote is really
            # seen; without it the note continues on the following line(s).
            if ( $line =~ m{/note="(.*?)(")?$} ) {
                $gene->{note} = $1;
                $in_note = defined $2 ? 0 : 1;
                next LINE;
            }
        }

        # Cross references are credited to the most recent gene.
        if ( $line =~ /db_xref=".*(GO:\d{7})"/ ) {
            $gene->{go} .= "$1;";
        }
        if ( $line =~ /db_xref="(EntrezGene:\d+)"/ ) {
            $gene->{entrez} .= "$1;";
        }
    }

    return \@genes;
}

# Write one line per gene record to the given path: the @COLUMNS values in
# order, each followed by a tab, with "-" substituted for missing values.
sub write_table {
    my ( $records, $path ) = @_;

    open my $out, '>', $path or die "Cannot write '$path': $!\n";
    for my $record ( @{$records} ) {
        for my $column (@COLUMNS) {
            my $value = defined $record->{$column} ? $record->{$column} : '-';
            print {$out} "$value\t";
        }
        print {$out} "\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $out or die "Error while writing '$path': $!\n";
    return;
}