taxidToGeneNames.pl 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. #!/usr/local/bin/perl -w
  2. # ===========================================================================
  3. #
  4. # PUBLIC DOMAIN NOTICE
  5. # National Center for Biotechnology Information
  6. #
  7. # This software/database is a "United States Government Work" under the
  8. # terms of the United States Copyright Act. It was written as part of
  9. # the author's official duties as a United States Government employee and
  10. # thus cannot be copyrighted. This software/database is freely available
  11. # to the public for use. The National Library of Medicine and the U.S.
  12. # Government have not placed any restriction on its use or reproduction.
  13. #
  14. # Although all reasonable efforts have been taken to ensure the accuracy
  15. # and reliability of the software and data, the NLM and the U.S.
  16. # Government do not and cannot warrant the performance or results that
  17. # may be obtained by using this software or data. The NLM and the U.S.
  18. # Government disclaim all warranties, express or implied, including
  19. # warranties of performance, merchantability or fitness for any particular
  20. # purpose.
  21. #
  22. # Please cite the author in any work or product based on this material.
  23. #
  24. # ===========================================================================
  25. #
  26. # Author: Zev Hochberg
  27. #
  28. # File Description:
  29. # Accepts tax_id as an argument
  30. # Gets name information from gene via e-utils
  31. #
  32. use strict;
  33. # ---------------------------------------------------------------------------
  34. #
  35. use LWP::Simple; # Define library for the 'get' function.
  36. # ---------------------------------------------------------------------------
  37. #
  38. # Global variables
  39. my $verbose = 0;
  40. my $taxId;
  41. my $dopt;
  42. my $beginTime = time();
  43. my $startTag = "<eSummaryResult>";
  44. my $stopTag = "</eSummaryResult>";
  45. # ---------------------------------------------------------------------------
  46. #
  47. # Main program
  48. #
  49. &processCommandLineParameters;
  50. &verifyInput;
  51. &printHeader if $dopt eq 'tab';
  52. &process;
  53. &end;
  54. ################################################################################
  55. #
  56. sub processCommandLineParameters {
  57. $taxId = "";
  58. $dopt = "";
  59. while(defined ($_= shift @ARGV)) {
  60. if ($_ eq '-h') {
  61. &showUsageAndExit;
  62. } elsif ($_ eq '-v') {
  63. $verbose = 1;
  64. } elsif ($_ eq '-t') {
  65. $taxId = shift @ARGV;
  66. } elsif ($_ eq '-o') {
  67. $dopt = shift @ARGV;
  68. } else {
  69. # Ignore extra input
  70. }
  71. }
  72. if ($taxId eq "") {&showUsageAndExit}
  73. if ($dopt eq "") {&showUsageAndExit}
  74. }
  75. ################################################################################
  76. #
  77. sub verifyInput {
  78. unless (($taxId =~ m/^\d*$/) and ($taxId != 0)) {
  79. print STDERR "taxonomyId (-t) must be numeric and > 0\n";
  80. &showUsageAndExit;
  81. }
  82. unless (($dopt eq 'xml') or ($dopt eq 'tab')) {
  83. print STDERR "output (-o) is required and must equal 'xml' or 'tab'\n";
  84. &showUsageAndExit;
  85. }
  86. }
  87. ################################################################################
  88. #
  89. sub showUsageAndExit {
  90. my $usage = qq/
  91. Usage: $0 [option] -t taxonomyId -o xml|tab
  92. Options: -h Display this usage help information
  93. -v Verbose
  94. -o output options
  95. xml - XML
  96. tab - tab-delimited
  97. /;
  98. print STDERR "$usage\n\n";
  99. &end
  100. }
  101. ################################################################################
  102. #
  103. sub printHeader {
  104. print STDOUT "geneId\tname\tdescription\n";
  105. }
  106. ################################################################################
  107. #
  108. sub process {
  109. my $geneStart = 0;
  110. my $maxGenes = 250;
  111. my $totalGenes;
  112. my $haveTotal = 0;
  113. my $geneId = "";
  114. my $geneCount = 0;
  115. my $qs;
  116. my $esearch_result;
  117. my $esummary_result;
  118. my $first = 1;
  119. # Main loop: get up to maxGenes Gene ID's for requested taxId from Gene
  120. $totalGenes = $geneStart+1; # to get started
  121. GeneLoop:
  122. for (; $geneStart<$totalGenes; $geneStart+=$maxGenes) {
  123. if ($verbose) {print STDOUT "Processing genes $geneStart to ", $geneStart+$maxGenes-1, "\n";}
  124. #defining the query
  125. #this option looks for GeneIDs with the $taxId value of interest that are
  126. #encoded on the mitochondrion. Note use of the field restriction [properties]
  127. #and the boolean AND
  128. #$qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+AND+source_mitochondrion[properties]";
  129. #this option looks for GeneIDs with the $taxId value of interest that are
  130. #NOT encoded on the mitochondrion and do not have RefSeqs of the type model.
  131. #Note use of the boolean NOT and field restriction
  132. $qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+NOT+source_mitochondrion[properties]+NOT+srcdb_refseq_model[properties]";
  133. $esearch_result = &Eutil ("esearch", $qs);
  134. if ($esearch_result =~ m/<Count>(\d+)<\/Count>/) {
  135. if ($haveTotal) {
  136. if ($totalGenes != $1) {
  137. die "esearch reported new total genes: was $totalGenes; now $1\nFor request $qs\n";
  138. }
  139. } else {
  140. $totalGenes = $1; # extract total
  141. $haveTotal = 1;
  142. }
  143. } else {
  144. die "$esearch_result did not contain expected <Count>,\n for request $qs\n";
  145. }
  146. # Build querystring for GENE search
  147. $qs = "db=gene&id=";
  148. while ($esearch_result =~ m/<Id>(\d+)<\/Id>/g) {
  149. $geneCount++;
  150. $geneId = $1; # extract a geneId
  151. $qs .= "$geneId,";
  152. }
  153. chop($qs);
  154. # Get Gene summary
  155. $esummary_result = &Eutil ("esummary", $qs);
  156. # Extract and output information for all genes
  157. if ($dopt eq 'tab'){
  158. &extractAndOutput ($esummary_result);
  159. } else {
  160. &extractAndOutputXml ($esummary_result, $first);
  161. }
  162. $first = 0;
  163. if ($verbose) {
  164. print STDOUT "\ngeneCount: $geneCount\n";
  165. }
  166. }
  167. # Close xml output
  168. if ($dopt eq 'xml') {print STDOUT $stopTag}
  169. }
  170. ################################################################################
  171. #
  172. sub extractAndOutput {
  173. my $xml = $_[0];
  174. my $match;
  175. my $geneId;
  176. my $name;
  177. my $description;
  178. while ($xml =~ m/<DocSum>(.*?)<\/DocSum>/gs) {
  179. $match = $1;
  180. $geneId = "";
  181. $name = "";
  182. $description = "";
  183. if ($match =~ /<Id>(\d+)<\/Id>/) {$geneId = $1}
  184. if ($match =~ m/<Item Name=.Name.*>(.+)<\/Item>/) {$name = $1}
  185. if ($match =~ m/<Item Name=.Description.*>(.+)<\/Item>/) {$description = $1}
  186. print STDOUT "$geneId\t$name\t$description\n";
  187. }
  188. }
  189. ################################################################################
  190. #
  191. sub extractAndOutputXml {
  192. my $xml = shift;
  193. my $first = shift;
  194. # Strip leading <!DOCTYPE, <?xml, and bracketing <eSummaryResult> tags,
  195. # from all but first set
  196. # Strip closing <eSummaryResult> tag from all
  197. if ($first) {
  198. my $start = 0;
  199. my $stop = index ($xml, $stopTag);
  200. print STDOUT substr ($xml, $start, $stop-$start);
  201. } else {
  202. my $start = index ($xml, $startTag) + length($startTag);
  203. my $stop = index ($xml, $stopTag);
  204. print STDOUT substr ($xml, $start, $stop-$start);
  205. }
  206. }
  207. ################################################################################
  208. #
  209. # Subroutine to handle all eutil calls
  210. #
  211. # Create a BEGIN block to provide effect of static data for sub
  212. BEGIN {
  213. # static data
  214. my $lastEutilTime = 0; # init to avoid delay on first Eutil
  215. # storing local constants here too.
  216. my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
  217. my $minUtilPeriod = 3; # seconds
  218. sub Eutil {
  219. my ($eutil, $querystringData) = @_;
  220. my $elapsed = time() - $lastEutilTime;
  221. my $delay = $minUtilPeriod - $elapsed;
  222. if ($delay < 0) {$delay = 0}
  223. sleep ($delay);
  224. $lastEutilTime = time(); # save for next time
  225. my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData";
  226. if ($verbose) {print STDOUT "\neutilUrl: $eutilUrl\n";}
  227. my $result = get($eutilUrl); # get result of the eutil for return
  228. if ((not defined $result) or ($result eq ""))
  229. {
  230. $result = ""; # Simplify error testing on return
  231. print STDERR "$eutil failed for request: $querystringData\n\n";
  232. }
  233. if ($verbose) {
  234. print STDOUT "\neutil result: $result\n";
  235. my $elapsedTime = time() - $lastEutilTime;
  236. print STDOUT "$eutil took $elapsedTime seconds\n";
  237. }
  238. $result; # for return
  239. }
  240. }
  241. ################################################################################
  242. #
  243. sub end {
  244. if ($verbose) {
  245. my $elapsedTime = time() - $beginTime;
  246. print STDOUT "\nElapsed time: $elapsedTime seconds\n";
  247. }
  248. exit;
  249. }