#!perl -w
#
# Summarise the genes of a GTF annotation as a tab-separated table.
#
# Usage: perl <this script> annotation.gtf[.gz] > genes.tsv
#
# Input  : one GTF file, read through `gzip -dc` when its name ends in "gz".
# Output : a header line followed by one row per (gene, protein_id) pair on
#          STDOUT, with the columns
#            gene_id, locus_tag, old_locus_tag, entry_id, protein_id,
#            gene_name, gene_biotype
#          Attributes that are absent from the annotation are printed as "-".
#          A gene without any protein_id gets a single row whose protein_id
#          column is "-".
use strict;
use warnings;

# Placeholder printed for every attribute the annotation does not provide.
my $MISSING = '-';

# Return the value itself, or the placeholder when the attribute was absent.
sub value_or_placeholder {
    my ($value) = @_;
    return defined $value ? $value : $MISSING;
}

@ARGV or die "Usage: $0 <annotation.gtf[.gz]>\n";
my $gtf_path   = $ARGV[0];
my $is_gzipped = $gtf_path =~ /gz$/;

my $gtf_fh;
if ($is_gzipped) {
    # List-form pipe open: the path reaches gzip as a single argument and is
    # never interpreted by a shell; "--" keeps a leading "-" from being read
    # as an option.
    open $gtf_fh, '-|', 'gzip', '-dc', '--', $gtf_path
        or die "Cannot run gzip on '$gtf_path': $!\n";
}
else {
    open $gtf_fh, '<', $gtf_path
        or die "Cannot open '$gtf_path': $!\n";
}

# gene_id => { type, name, locus, old_locus, entry, protein => { id => 1 } }
my %allgene;

while (my $line = <$gtf_fh>) {
    chomp $line;

    # Every record is keyed by gene_id; lines without a usable one are skipped
    # instead of being filed under an empty key.
    my ($gene_id) = $line =~ /gene_id \"([^"]+)\"/;
    next unless defined $gene_id;

    # First sighting of a gene: start from placeholders so that genes which
    # never get a "gene" feature line still print complete rows.
    my $record = $allgene{$gene_id} ||= {
        type      => $MISSING,
        name      => $gene_id,
        locus     => $MISSING,
        old_locus => $MISSING,
        entry     => $MISSING,
        protein   => {},
    };

    if ($line =~ /\tgene\t/) {
        my ($gene_type)     = $line =~ /gene_biotype \"([^"]+)\"/;
        my ($locus_tag)     = $line =~ / locus_tag \"([^"]+)\"/;
        my ($old_locus_tag) = $line =~ / old_locus_tag \"([^"]+)\"/;
        my ($entry_id)      = $line =~ / \"GeneID:(\d+)/;
        my ($gene_name)     = $line =~ /gene_name \"([^"]+)\"/;
        my ($gene_attr)     = $line =~ /gene \"([^\"]+)/;
        my ($name_attr)     = $line =~ /Name \"([^\"]+)/;

        # Display name: first attribute that is actually present, in order of
        # preference, ending with the gene_id.  The candidates are still undef
        # when absent here, so the "-" placeholder can never win the fallback.
        my ($display_name) = grep { defined($_) && length($_) }
            ($gene_name, $name_attr, $gene_attr, $locus_tag, $old_locus_tag,
             $gene_id);

        $record->{type}      = value_or_placeholder($gene_type);
        $record->{name}      = $display_name;
        $record->{locus}     = value_or_placeholder($locus_tag);
        $record->{old_locus} = value_or_placeholder($old_locus_tag);
        $record->{entry}     = value_or_placeholder($entry_id);
    }

    # Every line of a gene registers its protein_id; lines without one
    # register the placeholder, which is filtered out below when the gene
    # also has real protein ids.
    my ($protein_id) = $line =~ /protein_id \"([^"]+)\"/;
    $record->{protein}{ value_or_placeholder($protein_id) } = 1;
}

# For a pipe, close() reports a non-zero exit of gzip; without this check a
# failed decompression would silently produce a truncated table.
close $gtf_fh
    or die $is_gzipped
        ? "gzip failed on '$gtf_path' (wait status $?)\n"
        : "Error closing '$gtf_path': $!\n";

print "gene_id\tlocus_tag\told_locus_tag\tentry_id\tprotein_id\tgene_name\tgene_biotype\n";

for my $gene_id (sort keys %allgene) {
    my $record = $allgene{$gene_id};

    # Show the real protein ids when there are any, otherwise a single "-".
    my @real_proteins = grep { $_ ne $MISSING } sort keys %{ $record->{protein} };
    my @shown         = @real_proteins ? @real_proteins : ($MISSING);

    for my $protein_id (@shown) {
        print join("\t",
            $gene_id,
            $record->{locus},
            $record->{old_locus},
            $record->{entry},
            $protein_id,
            $record->{name},
            $record->{type},
        ), "\n";
    }
}