ensembl_gbk2tsv.pl 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. #! /usr/bin/perl -w
  2. #This script extracts gene annotation from an Ensembl/PlantGDB GenBank file.
  3. #The output has 9 tab-separated columns, defined as:
  4. #Gene ID; Chromosome; Start Site; End Site; Direction; Locus tag; GO terms; EntrezGene ID; NR Description
  5. #Usage: perl ensembl_gbk2tsv.pl <genebankfile[.gz]> <outputfile>
  6. use strict;
  7. use warnings;
  # ---------------------------------------------------------------------------
  # Convert the "gene" features of a GenBank-style flat file into a
  # tab-separated table, one row per gene.
  #
  #   perl <script> <input[.gz]> <output.tsv>
  #
  # Input whose name ends in "gz" is decompressed on the fly with gunzip.
  # Columns that were never filled for a gene are written as "-".
  # ---------------------------------------------------------------------------

  # Indices of the nine output columns, in the order they are printed.
  use constant {
      COL_GENE_ID   => 0,
      COL_CHR       => 1,
      COL_START     => 2,
      COL_END       => 3,
      COL_STRAND    => 4,
      COL_LOCUS_TAG => 5,
      COL_GO        => 6,
      COL_ENTREZ    => 7,
      COL_NOTE      => 8,
  };

  # Run only when executed as a script, so the subs below can be loaded
  # (for instance by a test) without consuming @ARGV.
  main(@ARGV) unless caller;

  # main(@args): entry point. Expects the input path and the output path.
  # Dies with the usage text when either is missing, and with a descriptive
  # message on any I/O or parse failure.
  sub main {
      my @args  = @_;
      my $usage = "USAGE: perl $0 <DAT[.gz]> <output.tsv>\n\n";

      # Both arguments are required: without an output path the parsed
      # table would be silently thrown away.
      die $usage if @args < 2;
      my ($in_path, $out_path) = @args;

      my $in    = open_input($in_path);
      my $genes = parse_genes($in);

      # For a pipe, close() fails with $! == 0 when gunzip exited non-zero;
      # report the exit status in that case.
      close $in
          or die "Error while reading '$in_path': "
          . ($! || 'decompressor exit status ' . ($? >> 8)) . "\n";

      write_table($out_path, $genes);
      return;
  }

  # open_input($path): return a read handle on $path. Names ending in "gz"
  # are piped through "gunzip -dc". The list form of open is used so the
  # file name is never interpreted by a shell.
  sub open_input {
      my ($path) = @_;
      my $fh;

      if ($path =~ /gz$/) {
          open $fh, '-|', 'gunzip', '-dc', '--', $path
              or die "Cannot run gunzip on '$path': $!\n";
      }
      else {
          open $fh, '<', $path
              or die "Cannot open '$path': $!\n";
      }
      return $fh;
  }

  # parse_location($text): extract (start, end, strand) from the location
  # part of a feature line. Every "a..b" span is inspected, so compound
  # locations such as complement(join(1..5,10..20)) yield the outermost
  # coordinates; the strand is "-" whenever complement( is present.
  # Returns an empty list when no span is found.
  sub parse_location {
      my ($text) = @_;
      my ($start, $end);

      # ">" may mark a partial end; single-digit ends are accepted.
      while ($text =~ /([1-9][0-9]*)\.\.>?([1-9][0-9]*)/g) {
          my ($from, $to) = ($1, $2);
          $start = $from if !defined $start || $from < $start;
          $end   = $to   if !defined $end   || $to > $end;
      }
      return unless defined $start;

      my $strand = $text =~ /complement\(/ ? '-' : '+';
      return ($start, $end, $strand);
  }

  # parse_genes($fh): read the flat file from $fh and return a reference to
  # a list of rows (array refs indexed by the COL_* constants).
  #
  # A row is started by every "gene" feature line. /gene, /locus_tag and
  # /note are taken from the gene feature itself; GO and EntrezGene db_xref
  # entries are collected from any following line until the next gene
  # feature or the next LOCUS line.
  sub parse_genes {
      my ($fh) = @_;

      my @genes;
      my $chr;            # name taken from the most recent LOCUS line
      my $gene;           # row being filled; undef before the first gene of a record
      my $in_gene = 0;    # true while reading the qualifiers of a gene feature
      my $in_note = 0;    # true while a /note value is still open

      LINE:
      while (my $line = <$fh>) {
          $line =~ s/\r?\n\z//;    # tolerate CRLF line ends as well

          # LOCUS lines are column-padded, so allow any amount of blanks.
          if ($line =~ /^LOCUS\s+(\S+)/) {
              $chr = $1;

              # A new record starts: nothing that follows may be credited
              # to the last gene of the previous record.
              $gene    = undef;
              $in_gene = 0;
              $in_note = 0;
              next LINE;
          }

          # A feature key line ends whatever feature was being read. The two
          # leading wildcard characters let the pattern match both five
          # blanks and a two-letter line code followed by three blanks.
          if (my ($key, $location) = $line =~ /^..\s{3,5}(\S+)\s+(.*)$/) {
              $in_gene = 0;
              $in_note = 0;

              if ($key eq 'gene') {
                  my ($start, $end, $strand) = parse_location($location)
                      or die "Cannot parse gene location at input line $.: $line\n";

                  $gene = [];
                  @{$gene}[COL_START, COL_END, COL_STRAND] = ($start, $end, $strand);
                  push @genes, $gene;
                  $in_gene = 1;
                  next LINE;
              }
          }

          if ($in_gene) {
              # The value may be bare or enclosed in double quotes.
              if ($line =~ /\/gene="?([^"]+)"?$/) {
                  $gene->[COL_GENE_ID] = $1;
                  $gene->[COL_CHR]     = $chr;
                  next LINE;
              }
              if ($line =~ /\/locus_tag="(.+)"$/) {
                  $gene->[COL_LOCUS_TAG] = $1;
                  next LINE;
              }

              # Non-greedy capture, so a closing quote on the same line is
              # recognised and not stored as part of the text.
              if ($line =~ /\/note="(.*?)(")?$/) {
                  $gene->[COL_NOTE] = $1;
                  $in_note = defined $2 ? 0 : 1;
                  next LINE;
              }

              # Continuation of a wrapped note: re-join with one blank.
              if ($in_note && $line =~ /^..\s{7,}([^"]+)(")?$/) {
                  $gene->[COL_NOTE] .= ($gene->[COL_NOTE] eq '' ? '' : ' ') . $1;
                  $in_note = 0 if defined $2;
                  next LINE;
              }
              $in_note = 0;
          }

          # Cross references seen before any gene have no row to go to.
          next LINE unless $gene;

          if ($line =~ /db_xref=".*(GO:[\d]{7,7})"/) {
              $gene->[COL_GO] .= $1 . ";";
          }
          if ($line =~ /db_xref="(EntrezGene:[\d]{1,})"/) {
              $gene->[COL_ENTREZ] .= $1 . ";";
          }
      }

      return \@genes;
  }

  # write_table($path, $genes): write one line per row to $path. Every cell,
  # including the last one, is followed by a tab (the historical output
  # format of this script); undefined cells are written as "-".
  sub write_table {
      my ($path, $genes) = @_;

      open my $out, '>', $path
          or die "Cannot write '$path': $!\n";

      for my $row (@{$genes}) {
          my @cells = map { defined $row->[$_] ? $row->[$_] : '-' } 0 .. COL_NOTE;
          print {$out} join('', map { "$_\t" } @cells), "\n"
              or die "Write to '$path' failed: $!\n";
      }

      # Buffered write errors only surface when the handle is closed.
      close $out
          or die "Cannot close '$path': $!\n";
      return;
  }