merge_anno.v1.pl 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. #!/usr/bin/perl -w
  2. use strict;
  3. use warnings;
  4. use ScriptManager;
  5. use Getopt::Long;
  6. use File::Basename qw(basename);
  7. my $BEGIN_TIME=time();
  8. my ($out_file,$gtf,$all_gene);
  9. GetOptions(
  10. "help|?" =>\&USAGE,
  11. "o:s"=>\$out_file,
  12. "g:s"=>\$gtf,
  13. "a"=>\$all_gene
  14. ) or &USAGE;
  15. &USAGE unless ($gtf and @ARGV);
  16. $out_file ||="Annotation.xls";
  17. my @heads=("Chromosome","Start Site","End Site","Direction","Length","Name",@ARGV);
  18. open F,$gtf or die $!;
  19. my %gene;
  20. open O,">tmp_gene_exon.bed" or die $!;
  21. while (<F>) {
  22. chomp;
  23. next if(/^#/);
  24. if(/\tgene\t/){
  25. if(! $all_gene){
  26. next unless /protein_coding/;
  27. }
  28. my @line=split/\t/;
  29. my ($gene_name)=$line[8]=~/gene_name \"([^\"]+)\"/;
  30. my ($gene)=$line[8]=~/gene \"([^\"]+)\"/;
  31. my ($Name)=$line[8]=~/Name \"([^\"]+)\"/;
  32. my ($locus_tag)=$line[8]=~/locus_tag \"([^\"]+)\"/;
  33. my $last_name=$gene_name||$Name||$gene||$locus_tag;
  34. my ($gene_id)=$line[8]=~/gene_id \"([^\"]+)\"/;
  35. $gene{$gene_id}{"Chromosome"}=$line[0];
  36. $gene{$gene_id}{"Start Site"}=$line[3];
  37. $gene{$gene_id}{"End Site"}=$line[4];
  38. $gene{$gene_id}{"Direction"}=$line[6];
  39. $gene{$gene_id}{"Name"}=$last_name;
  40. }
  41. next unless(/\texon\t/);
  42. if(/\texon\t(\d+)\t(\d+).+gene_id \"([^\"]+)\"/){
  43. my $start=$1-1;
  44. my $end=$2;
  45. my $geneID=$3;
  46. print O join("\t",$geneID,$start,$end)."\n";
  47. }
  48. }
  49. close F;
  50. close O;
  51. `bedtools sort -i tmp_gene_exon.bed > tmp_gene_exon.sort.bed && bedtools merge -i tmp_gene_exon.sort.bed >tmp_no_overlap.bed`;
  52. open I, "tmp_no_overlap.bed" or die $!;
  53. while(<I>){
  54. chomp;
  55. next if /^\s*$/;
  56. my @line=split/\t/;
  57. $gene{$line[0]}{"Length"}+=($line[2]-$line[1]);
  58. }
  59. close I;
  60. `rm -f tmp_gene_exon.bed tmp_gene_exon.sort.bed tmp_no_overlap.bed`;
  61. foreach my $anno_file (@ARGV){
  62. open IN,$anno_file or die $!;
  63. while(<IN>){
  64. chomp;
  65. next if(/^\s*$/ || /^#/);
  66. my @line=split/\t/;
  67. if($gene{$line[0]}){
  68. $gene{$line[0]}{$anno_file}=$line[1];
  69. }
  70. }
  71. close IN;
  72. }
  73. open OUT,">$out_file" or die $!;
  74. print OUT join("\t","Gene ID",@heads)."\n";
  75. foreach my $g (sort keys %gene){
  76. $gene{$g}{"Length"}=$gene{$g}{"End Site"} - $gene{$g}{"Start Site"} + 1 if( ! $gene{$g}{"Length"} );
  77. print OUT join("\t","$g",map { $gene{$g}{$_} || "-" } @heads ),"\n" if($gene{$g}{"Start Site"});
  78. }
  79. close OUT;
  80. print STDOUT "\nDone. Total elapsed time : ",time()-$BEGIN_TIME,"s\n";
  81. sub USAGE {
  82. my $usage=<<"USAGE";
  83. Usage:perl $0 -g reference.gtf -o Annotation.xls GO KEGG eggNOG ...
  84. Description:Calculate gene exon length and Megre GO KEGG NR eggNOG NR Swissprot
  85. Options:
  86. -o <outfile> output Annotation file default "Annotation.xls"
  87. -g <gtf> reference genome gtf file
  88. -a all gene ;
  89. -h help
  90. USAGE
  91. print $usage;
  92. exit;
  93. }
  94. ################################################################################################################
  95. sub GetTime {
  96. my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst)=localtime(time());
  97. return sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec);
  98. }
  99. ################################################################################################################