geneGoSummary.pl 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294
  1. #!/usr/bin/perl -w
  2. # ===========================================================================
  3. #
  4. # PUBLIC DOMAIN NOTICE
  5. # National Center for Biotechnology Information
  6. #
  7. # This software/database is a "United States Government Work" under the
  8. # terms of the United States Copyright Act. It was written as part of
  9. # the author's official duties as a United States Government employee and
  10. # thus cannot be copyrighted. This software/database is freely available
  11. # to the public for use. The National Library of Medicine and the U.S.
  12. # Government have not placed any restriction on its use or reproduction.
  13. #
  14. # Although all reasonable efforts have been taken to ensure the accuracy
  15. # and reliability of the software and data, the NLM and the U.S.
  16. # Government do not and cannot warrant the performance or results that
  17. # may be obtained by using this software or data. The NLM and the U.S.
  18. # Government disclaim all warranties, express or implied, including
  19. # warranties of performance, merchantability or fitness for any particular
  20. # purpose.
  21. #
  22. # Please cite the author in any work or product based on this material.
  23. #
  24. # ===========================================================================
  25. #
  26. # Author: Craig Wallin
  27. #
  28. # File Description:
  29. #
  30. # Accepts chromosome and genomic location range as input data.
  31. # Sample input:
  32. # chr12:1000000-2000000
  33. # Searches for genes in the specified range from Entrez Gene via e-utils.
  34. # Gets GO data from gene2go FTP file (must be downloaded and decompressed
  35. # first).
  36. # Outputs a summary of GO data for genes in the specified range, in
  37. # tab-delimited format.
  38. use strict;
  39. # ---------------------------------------------------------------------------
  40. #
  41. # use libraries
  42. use LWP::Simple; # Define library for the 'get' function.
  43. # ---------------------------------------------------------------------------
  44. #
  45. # Global variables
  46. my $arg_tax_id = 9606; # human by default
  47. my $arg_verbose = 0;
  48. my $arg_input_data_file = "-"; # STDIN by default
  49. my $arg_go_file = "";
  50. my $beginTime = time();
  51. my %gene_go_terms = ();
  52. # ---------------------------------------------------------------------------
  53. #
  54. # Main program
  55. #
  56. &processCommandLineParameters;
  57. &processInputData;
  58. &end;
  59. ################################################################################
  60. #
  61. sub processCommandLineParameters {
  62. while ( my $term = shift @ARGV ) {
  63. if ($term eq '-h') {
  64. &showUsageAndExit;
  65. } elsif ($term eq '-v') {
  66. $arg_verbose = 1;
  67. } elsif ($term eq '-i') {
  68. $arg_input_data_file = shift @ARGV;
  69. } elsif ($term eq '-g') {
  70. $arg_go_file = shift @ARGV;
  71. } elsif ($term eq '-t') {
  72. $arg_tax_id = shift @ARGV;
  73. } else {
  74. # Ignore extra input
  75. }
  76. }
  77. if ($arg_input_data_file eq "") {&showUsageAndExit}
  78. if ($arg_go_file eq "") {&showUsageAndExit}
  79. die "gene2go file does not exist: $arg_go_file\n" unless -e $arg_go_file;
  80. }
  81. ################################################################################
  82. sub showUsageAndExit {
  83. my $usage = qq/
  84. Usage: $0 [options]
  85. Options: -h Display this usage help information
  86. -v Verbose
  87. -i Input file (or - for stdin) with ranges
  88. -g gene2go file
  89. -t tax id
  90. /;
  91. print STDERR "$usage\n";
  92. &end
  93. }
  94. ################################################################################
  95. sub printHeader {
  96. print STDOUT join "\t", ( "Genomic coordinates", "Enzyme", "Receptor", "Hormone", "Structural" );
  97. print STDOUT "\n";
  98. }
  99. ################################################################################
  100. # This populates %gene_go_terms
  101. sub readGoFile {
  102. print STDERR "Reading gene2go file...\n";
  103. open IN, "<$arg_go_file" or die "Failed to open: $arg_go_file\n";
  104. while ( <IN> ) {
  105. chomp();
  106. next if m/^#/; # skip header
  107. my ( $tax_id, $GeneID, $GO_ID, $Evidence, $Qualifier, $GO_term, $PubMed, $Category ) = split /\t/;
  108. next unless $tax_id == $arg_tax_id;
  109. next if $Qualifier =~ m/^NOT/;
  110. # Accumulate terms for this gene.
  111. my $gene_terms = $gene_go_terms{ $GeneID };
  112. $gene_terms .= "enzyme " if $GO_term =~ m/enzyme/i;
  113. $gene_terms .= "receptor " if $GO_term =~ m/receptor/i;
  114. $gene_terms .= "hormone " if $GO_term =~ m/hormone/i;
  115. $gene_terms .= "structural " if $GO_term =~ m/structural/i;
  116. $gene_go_terms{ $GeneID } = $gene_terms;
  117. }
  118. print STDERR "Done reading gene2go file, read data for " . scalar(keys(%gene_go_terms)) . " genes.\n";
  119. close IN;
  120. }
  121. ################################################################################
  122. sub processInputData {
  123. readGoFile();
  124. my $maxResults = 15000;
  125. my $esearch_result;
  126. &printHeader;
  127. my @input_records = ();
  128. open IN, "<$arg_input_data_file" or die "Failed to open input: $arg_input_data_file\n";
  129. while ( my $line = <IN> ) {
  130. chomp($line);
  131. push @input_records, $line;
  132. }
  133. my $total_records = scalar @input_records;
  134. my $record_count = 0;
  135. while ( my $line = shift @input_records ) {
  136. $record_count++;
  137. if ( $record_count % 100 == 0 ) {
  138. print STDERR "Read $record_count of $total_records records\n";
  139. }
  140. $line =~ m/^chr(\w+):(\d+)-(\d+)$/ or die "Could not parse input line: $line\n";
  141. my ( $chr, $start, $stop ) = ( $1, $2, $3 );
  142. my $location_str = "chr$chr:$start\-$stop";
  143. my $terms="txid9606 AND $chr \[chr\] AND $start : $stop \[chrpos\]";
  144. my $request = "db=gene&retmax=$maxResults&term=$terms";
  145. #print "$request\n";
  146. $esearch_result = &Eutil ("esearch", $request);
  147. $esearch_result =~ m/<Count>(\d+)<\/Count>/
  148. or die "$esearch_result did not contain expected <Count>,\n for request $request\n";
  149. # Summarize counts as the number of genes in the range with the
  150. # desired keywords.
  151. my $num_enzyme = 0;
  152. my $num_receptor = 0;
  153. my $num_hormone = 0;
  154. my $num_structural = 0;
  155. while ($esearch_result =~ m/<Id>(\d+)<\/Id>/g) {
  156. my $GeneID = $1; # extract a geneId
  157. my $gene_terms = $gene_go_terms{ $GeneID };
  158. next unless $gene_terms;
  159. $num_enzyme++ if $gene_terms =~ m/enzyme/;
  160. $num_receptor++ if $gene_terms =~ m/receptor/;
  161. $num_hormone++ if $gene_terms =~ m/hormone/;
  162. $num_structural++ if $gene_terms =~ m/structural/;
  163. }
  164. print STDOUT join "\t", ( $location_str, $num_enzyme, $num_receptor, $num_hormone, $num_structural );
  165. print STDOUT "\n";
  166. }
  167. close IN;
  168. }
  169. ################################################################################
  170. # Subroutine to handle all eutil calls.
  171. # Create a BEGIN block to provide effect of static data for sub
  172. BEGIN {
  173. use Time::HiRes qw( usleep gettimeofday tv_interval );
  174. # static data
  175. my $lastEutilTime = [gettimeofday]; # init once
  176. # storing local constants here too.
  177. my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
  178. # most frequent usage allowed for eutils is 3 requests per second
  179. my $minUtilPeriod_microsec = 333333; # microseconds
  180. sub Eutil {
  181. my ($eutil, $querystringData) = @_;
  182. my $elapsed_sec = tv_interval( $lastEutilTime );
  183. my $delay_microsec = $minUtilPeriod_microsec - ( $elapsed_sec * 1000000 );
  184. if ($delay_microsec < 0) {$delay_microsec = 0}
  185. usleep ($delay_microsec);
  186. $lastEutilTime = [gettimeofday]; # save for next time
  187. my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData";
  188. if ($arg_verbose) {print STDERR "\neutilUrl: $eutilUrl\n";}
  189. my $result = get($eutilUrl); # get result of the eutil for return
  190. if ((not defined $result) or ($result eq ""))
  191. {
  192. $result = ""; # Simplify error testing on return
  193. print STDERR "$eutil failed for request: $querystringData\n\n";
  194. }
  195. if ($arg_verbose) {
  196. print STDERR "\neutil result: $result\n";
  197. my $elapsedTime = tv_interval( $lastEutilTime );
  198. print STDERR "$eutil took $elapsedTime seconds\n";
  199. }
  200. $result; # for return
  201. }
  202. }
  203. ################################################################################
  204. sub end {
  205. if ($arg_verbose) {
  206. my $elapsedTime = time() - $beginTime;
  207. print STDERR "\nElapsed time: $elapsedTime seconds\n";
  208. }
  209. exit;
  210. }