#!/usr/bin/perl -w use strict; use warnings; use ScriptManager; use Getopt::Long; use File::Basename qw(basename); my $BEGIN_TIME=time(); my ($out_file,$gtf,$all_gene); GetOptions( "help|?" =>\&USAGE, "o:s"=>\$out_file, "g:s"=>\$gtf, "a"=>\$all_gene ) or &USAGE; &USAGE unless ($gtf and @ARGV); $out_file ||="Annotation.xls"; my @heads=("Chromosome","Start Site","End Site","Direction","Length","Name",@ARGV); open F,$gtf or die $!; my %gene; open O,">tmp_gene_exon.bed" or die $!; while () { chomp; next if(/^#/); if(/\tgene\t/){ if(! $all_gene){ next unless /protein_coding/; } my @line=split/\t/; my ($gene_name)=$line[8]=~/gene_name \"([^\"]+)\"/; my ($gene)=$line[8]=~/gene \"([^\"]+)\"/; my ($Name)=$line[8]=~/Name \"([^\"]+)\"/; my ($locus_tag)=$line[8]=~/locus_tag \"([^\"]+)\"/; my $last_name=$gene_name||$Name||$gene||$locus_tag; my ($gene_id)=$line[8]=~/gene_id \"([^\"]+)\"/; $gene{$gene_id}{"Chromosome"}=$line[0]; $gene{$gene_id}{"Start Site"}=$line[3]; $gene{$gene_id}{"End Site"}=$line[4]; $gene{$gene_id}{"Direction"}=$line[6]; $gene{$gene_id}{"Name"}=$last_name; } next unless(/\texon\t/); if(/\texon\t(\d+)\t(\d+).+gene_id \"([^\"]+)\"/){ my $start=$1-1; my $end=$2; my $geneID=$3; print O join("\t",$geneID,$start,$end)."\n"; } } close F; close O; `bedtools sort -i tmp_gene_exon.bed > tmp_gene_exon.sort.bed && bedtools merge -i tmp_gene_exon.sort.bed >tmp_no_overlap.bed`; open I, "tmp_no_overlap.bed" or die $!; while(){ chomp; next if /^\s*$/; my @line=split/\t/; $gene{$line[0]}{"Length"}+=($line[2]-$line[1]); } close I; `rm -f tmp_gene_exon.bed tmp_gene_exon.sort.bed tmp_no_overlap.bed`; foreach my $anno_file (@ARGV){ open IN,$anno_file or die $!; while(){ chomp; next if(/^\s*$/ || /^#/); my @line=split/\t/; if($gene{$line[0]}){ $gene{$line[0]}{$anno_file}=$line[1]; } } close IN; } open OUT,">$out_file" or die $!; print OUT join("\t","Gene ID",@heads)."\n"; foreach my $g (sort keys %gene){ $gene{$g}{"Length"}=$gene{$g}{"End Site"} - $gene{$g}{"Start Site"} + 1 if( ! $gene{$g}{"Length"} ); print OUT join("\t","$g",map { $gene{$g}{$_} || "-" } @heads ),"\n" if($gene{$g}{"Start Site"}); } close OUT; print STDOUT "\nDone. Total elapsed time : ",time()-$BEGIN_TIME,"s\n"; sub USAGE { my $usage=<<"USAGE"; Usage:perl $0 -g reference.gtf -o Annotation.xls GO KEGG eggNOG ... Description:Calculate gene exon length and Megre GO KEGG NR eggNOG NR Swissprot Options: -o output Annotation file default "Annotation.xls" -g reference genome gtf file -a all gene ; -h help USAGE print $usage; exit; } ################################################################################################################ sub GetTime { my ($sec, $min, $hour, $day, $mon, $year, $wday, $yday, $isdst)=localtime(time()); return sprintf("%4d-%02d-%02d %02d:%02d:%02d", $year+1900, $mon+1, $day, $hour, $min, $sec); } ################################################################################################################