#! /usr/bin/perl -w
#
# Extract gene annotation from an Ensembl/PlantGDB GenBank flat file
# (plain or gzip-compressed) into a tab-separated table.
#
# The output has 9 columns, one row per "gene" feature:
#   Gene ID; Chromosome; Start Site; End Site; Direction; Locus tag;
#   GO terms; EntrezGene ID; NR Description
# A missing value is written as "-".

use strict;
use warnings;

# Run the conversion only when executed as a script; loading this file
# from another program (require/do) merely defines the subroutines.
main(@ARGV) unless caller;

# main($in_path, $out_path)
# Reads the GenBank file $in_path and writes the annotation table to
# $out_path. Dies with a usage message unless both paths are given, and
# dies on any I/O failure.
sub main {
    my ($in_path, $out_path) = @_;

    my $usage = "USAGE: perl $0 <genbank file[.gz]> <output file>\n\n";
    die $usage unless defined $in_path && defined $out_path;

    my $in    = open_genbank($in_path);
    my $genes = parse_genbank($in);

    # For a pipe, close() also reports a failed gunzip, which would
    # otherwise be indistinguishable from a short input file.
    close $in
        or die "Error closing $in_path: "
        . ($! ? $! : "gunzip wait status $?") . "\n";

    open my $out, '>', $out_path
        or die "Cannot open $out_path for writing: $!\n";
    print {$out} format_row($_) for @{$genes};
    close $out or die "Error closing $out_path: $!\n";

    return;
}

# open_genbank($path)
# Returns a read handle on $path. A name ending in "gz" is decompressed
# through "gunzip -dc". Dies if the file or the pipe cannot be opened.
sub open_genbank {
    my ($path) = @_;

    my $fh;
    if ($path =~ /gz$/) {
        # List-form pipe open: the file name is passed straight to gunzip
        # and is never interpreted by a shell.
        open $fh, '-|', 'gunzip', '-dc', $path
            or die "Cannot run gunzip on $path: $!\n";
    }
    else {
        open $fh, '<', $path or die "Cannot open $path: $!\n";
    }
    return $fh;
}

# parse_genbank($fh)
# Reads GenBank text from the handle $fh and returns a reference to an
# array with one array reference per gene feature, indexed as:
#   0 gene id, 1 chromosome (LOCUS name), 2 start, 3 end, 4 strand,
#   5 locus tag, 6 GO terms, 7 EntrezGene ids, 8 note
# Slots with no data stay undefined. GO/EntrezGene db_xref lines found
# outside a gene's own qualifier block are credited to the most recent
# gene, each id terminated by ";". Dies when a gene feature carries no
# start..end range.
sub parse_genbank {
    my ($fh) = @_;

    my @genes;
    my $chr;
    my $in_gene = 0;    # reading the qualifiers of a gene feature
    my $in_note = 0;    # inside a /note="..." whose closing quote is pending

    while (my $line = <$fh>) {
        chomp $line;

        # LOCUS names are separated from the keyword by padding spaces.
        if ($line =~ /^LOCUS\s+(\S+)/) {
            $chr = $1;
        }

        # Any feature-key line ends the previous feature's qualifiers,
        # including a note that was never closed.
        if ($line =~ /^..\s{3,5}\S+\s+/) {
            $in_gene = 0;
            $in_note = 0;
        }

        if ($line =~ /^..\s{3,5}gene\s+/) {
            my @row;
            if ($line =~ /complement\(([1-9][0-9]*)\.\.([1-9][0-9]*)\)/) {
                @row[2, 3, 4] = ($1, $2, '-');
            }
            elsif ($line =~ /([1-9][0-9]*)\.\.([1-9][0-9]*)/) {
                @row[2, 3, 4] = ($1, $2, '+');
            }
            else {
                die "Error encounter when read gene lenghth.";
            }
            push @genes, \@row;
            $in_gene = 1;
            next;
        }

        if ($in_gene) {
            my $gene = $genes[-1];

            if ($line =~ m{/gene="?([^"]+)"?$}) {
                # The gene id may be written with or without quotes.
                $gene->[0] = $1;
                $gene->[1] = $chr;
            }
            elsif ($line =~ m{/locus_tag="(.+)"$}) {
                $gene->[5] = $1;
            }
            elsif ($line =~ m{/note="(.+?)(")?$}) {
                # Non-greedy capture keeps the closing quote out of the
                # text and lets group 2 tell whether the note ends here.
                $gene->[8] = $1;
                $in_note = $2 ? 0 : 1;
            }
            elsif ($in_note && $line =~ /^..\s{7,}([^"]+)(")?$/) {
                # Continuation line; it is appended exactly as it appears
                # after the leading indent, no separator is inserted.
                $gene->[8] .= $1;
                $in_note = 0 if $2;
            }
            else {
                # A line that cannot continue an open note ends the gene.
                $in_gene = 0 if $in_note;
                $in_note = 0;
            }
            next;
        }

        # db_xref lines seen before the first gene have nowhere to go.
        next unless @genes;

        my $gene = $genes[-1];
        if ($line =~ /db_xref=".*(GO:[\d]{7,7})"/) {
            $gene->[6] .= $1 . ";";
        }
        if ($line =~ /db_xref="(EntrezGene:[\d]{1,})"/) {
            $gene->[7] .= $1 . ";";
        }
    }

    return \@genes;
}

# format_row($gene)
# Returns one output line for the gene record $gene: its 9 fields, each
# followed by a tab (so the line ends with a tab before the newline, the
# format this script has always written), undefined fields shown as "-".
sub format_row {
    my ($gene) = @_;

    my @fields = map { defined $gene->[$_] ? $gene->[$_] : '-' } 0 .. 8;
    return join('', map { "$_\t" } @fields) . "\n";
}