gtf2id_table.pl 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  #!perl -w
  # gtf2id_table.pl - flatten a GTF annotation into a per-gene ID table.
  #
  # Usage: perl gtf2id_table.pl annotation.gtf[.gz] > id_table.tsv
  #
  # Reads the GTF named by the first argument (piped through "gzip -dc" when the
  # name ends in "gz") and prints a tab-separated table to STDOUT with columns
  #   gene_id, locus_tag, old_locus_tag, entry_id, protein_id, gene_name,
  #   gene_biotype
  # One row is printed per (gene, protein_id) pair; a gene with no protein_id
  # gets a single row.  Any value the annotation does not provide is printed
  # as "-".
  use strict;
  use warnings;

  # Placeholder printed for every field the annotation does not provide.
  use constant MISSING => '-';

  # Return $value, or the MISSING placeholder when it is undefined.
  sub or_missing {
      my ($value) = @_;
      return defined $value ? $value : MISSING;
  }

  # Open the GTF at $path for reading and return the filehandle.
  # Names ending in "gz" are decompressed through gzip.  The list form of the
  # pipe open runs gzip without a shell, so the file name is never interpreted
  # as shell syntax.  Dies if the file cannot be opened or gzip cannot be run.
  sub open_gtf {
      my ($path) = @_;
      my $fh;
      if ($path =~ /gz$/) {
          open $fh, '-|', 'gzip', '-dc', '--', $path
              or die "Cannot run gzip on '$path': $!\n";
      }
      else {
          open $fh, '<', $path
              or die "Cannot open '$path': $!\n";
      }
      return $fh;
  }

  # Extract the descriptive fields of a "gene" feature line.
  # Returns a key/value list with keys type, name, locus, old_locus and entry.
  # Attributes that are absent (or have an empty value) become MISSING.  The
  # display name is the first attribute present among gene_name, Name, gene,
  # locus_tag and old_locus_tag, falling back to $gene_id itself.
  sub gene_fields {
      my ($line, $gene_id) = @_;

      my ($type)      = $line =~ /gene_biotype "([^"]+)"/;
      # The leading space keeps "locus_tag" from matching inside "old_locus_tag".
      my ($locus)     = $line =~ / locus_tag "([^"]+)"/;
      my ($old_locus) = $line =~ / old_locus_tag "([^"]+)"/;
      my ($entry)     = $line =~ / "GeneID:(\d+)/;
      my ($gene_name) = $line =~ /gene_name "([^"]+)"/;
      my ($gene)      = $line =~ /gene "([^"]+)/;
      my ($name_attr) = $line =~ /Name "([^"]+)/;

      # Choose the name from the raw captures, before placeholders are filled
      # in, so that an absent locus_tag cannot shadow the later fallbacks.
      my ($name) = grep { defined $_ }
          $gene_name, $name_attr, $gene, $locus, $old_locus, $gene_id;

      return (
          type      => or_missing($type),
          name      => $name,
          locus     => or_missing($locus),
          old_locus => or_missing($old_locus),
          entry     => or_missing($entry),
      );
  }

  # Fold one (already chomped) GTF line into the gene table %$genes.
  # Lines without a non-empty gene_id attribute are ignored.  Every other line
  # registers its gene; a line containing a tab-delimited "gene" field
  # additionally (re)sets the gene's descriptive fields, and any protein_id on
  # the line is added to the gene's protein set.
  sub record_gtf_line {
      my ($genes, $line) = @_;

      my ($gene_id) = $line =~ /gene_id "([^"]+)"/;
      return unless defined $gene_id;

      # Register the gene even when it only ever appears on non-gene lines.
      my $gene = $genes->{$gene_id} ||= {};

      if ($line =~ /\tgene\t/) {
          my %fields = gene_fields($line, $gene_id);
          # Field-wise update so proteins collected earlier are preserved.
          @{$gene}{ keys %fields } = values %fields;
      }

      my ($protein_id) = $line =~ /protein_id "([^"]+)"/;
      $gene->{protein}{$protein_id} = 1 if defined $protein_id;

      return;
  }

  # Render the gene table as a list of tab-joined lines (no trailing newline),
  # header first, genes and their protein IDs both in string-sorted order.
  sub id_table_lines {
      my ($genes) = @_;

      my @lines = (
          join("\t", qw(gene_id locus_tag old_locus_tag entry_id
                        protein_id gene_name gene_biotype)),
      );

      for my $gene_id (sort keys %$genes) {
          my $gene = $genes->{$gene_id};

          # Genes that never had a gene line have none of these fields set.
          my ($locus, $old_locus, $entry, $name, $type) =
              map { or_missing($gene->{$_}) } qw(locus old_locus entry name type);

          my @proteins = sort keys %{ $gene->{protein} || {} };
          @proteins = (MISSING) unless @proteins;

          for my $protein_id (@proteins) {
              push @lines, join("\t", $gene_id, $locus, $old_locus, $entry,
                                      $protein_id, $name, $type);
          }
      }

      return @lines;
  }

  # Command-line entry point: the first argument is the GTF (optionally
  # gzip-compressed) to read; the table is written to STDOUT.
  sub main {
      my @args = @_;
      die "Usage: $0 <annotation.gtf[.gz]>\n"
          unless @args && defined $args[0] && length $args[0];
      my $path = $args[0];

      my $fh = open_gtf($path);
      my %genes;
      while (my $line = <$fh>) {
          chomp $line;
          record_gtf_line(\%genes, $line);
      }

      # For the gzip pipe, close() is where a failed decompression shows up:
      # it returns false with $! unset and the exit status in $?.
      unless (close $fh) {
          die $!
              ? "Error closing '$path': $!\n"
              : "gzip failed on '$path' (exit status " . ($? >> 8) . ")\n";
      }

      print "$_\n" for id_table_lines(\%genes);
      return 0;
  }

  # Run only when executed as a script, so the helpers can be loaded and
  # exercised without triggering file I/O.
  main(@ARGV) unless caller;