123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- #!/usr/bin/perl -w
- use strict;
- use warnings;
- use ScriptManager;
- use Getopt::Long;
- use File::Basename qw(basename);
- my $BEGIN_TIME=time();
- my ($out_file,$gtf,$all_gene);
- GetOptions(
- "help|?" =>\&USAGE,
- "o:s"=>\$out_file,
- "g:s"=>\$gtf,
- "a"=>\$all_gene
- ) or &USAGE;
- &USAGE unless ($gtf and @ARGV);
- $out_file ||="Annotation.xls";
- my @heads=("Chromosome","Start Site","End Site","Direction","Length","Name",@ARGV);
- open F,$gtf or die $!;
- my %gene;
- open O,">tmp_gene_exon.bed" or die $!;
- while (<F>) {
- chomp;
- next if(/^#/);
- if(/\tgene\t/){
- if(! $all_gene){
- next unless /protein_coding/;
- }
- my @line=split/\t/;
- my ($gene_name)=$line[8]=~/gene_name \"([^\"]+)\"/;
- my ($gene)=$line[8]=~/gene \"([^\"]+)\"/;
- my ($Name)=$line[8]=~/Name \"([^\"]+)\"/;
- my ($locus_tag)=$line[8]=~/locus_tag \"([^\"]+)\"/;
- my $last_name=$gene_name||$Name||$gene||$locus_tag;
- my ($gene_id)=$line[8]=~/gene_id \"([^\"]+)\"/;
- $gene{$gene_id}{"Chromosome"}=$line[0];
- $gene{$gene_id}{"Start Site"}=$line[3];
- $gene{$gene_id}{"End Site"}=$line[4];
- $gene{$gene_id}{"Direction"}=$line[6];
- $gene{$gene_id}{"Name"}=$last_name;
- }
- next unless(/\texon\t/);
- if(/\texon\t(\d+)\t(\d+).+gene_id \"([^\"]+)\"/){
- my $start=$1-1;
- my $end=$2;
- my $geneID=$3;
- print O join("\t",$geneID,$start,$end)."\n";
- }
- }
- close F;
- close O;
- `bedtools sort -i tmp_gene_exon.bed > tmp_gene_exon.sort.bed && bedtools merge -i tmp_gene_exon.sort.bed >tmp_no_overlap.bed`;
- open I, "tmp_no_overlap.bed" or die $!;
- while(<I>){
- chomp;
- next if /^\s*$/;
- my @line=split/\t/;
- $gene{$line[0]}{"Length"}+=($line[2]-$line[1]);
- }
- close I;
- `rm -f tmp_gene_exon.bed tmp_gene_exon.sort.bed tmp_no_overlap.bed`;
- foreach my $anno_file (@ARGV){
- open IN,$anno_file or die $!;
- while(<IN>){
- chomp;
- next if(/^\s*$/ || /^#/);
- my @line=split/\t/;
- if($gene{$line[0]}){
- $gene{$line[0]}{$anno_file}=$line[1];
- }
- }
- close IN;
- }
- open OUT,">$out_file" or die $!;
- print OUT join("\t","Gene ID",@heads)."\n";
- foreach my $g (sort keys %gene){
- $gene{$g}{"Length"}=$gene{$g}{"End Site"} - $gene{$g}{"Start Site"} + 1 if( ! $gene{$g}{"Length"} );
- print OUT join("\t","$g",map { $gene{$g}{$_} || "-" } @heads ),"\n" if($gene{$g}{"Start Site"});
- }
- close OUT;
- print STDOUT "\nDone. Total elapsed time : ",time()-$BEGIN_TIME,"s\n";
- sub USAGE {
- my $usage=<<"USAGE";
- Usage:perl $0 -g reference.gtf -o Annotation.xls GO KEGG eggNOG ...
- Description:Calculate gene exon length and Megre GO KEGG NR eggNOG NR Swissprot
- Options:
- -o <outfile> output Annotation file default "Annotation.xls"
- -g <gtf> reference genome gtf file
- -a all gene ;
- -h help
- USAGE
- print $usage;
- exit;
- }
- ################################################################################################################
- sub GetTime {
- my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst)=localtime(time());
- return sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec);
- }
- ################################################################################################################
|