123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
- #!/usr/local/bin/perl -w
- # ===========================================================================
- #
- # PUBLIC DOMAIN NOTICE
- # National Center for Biotechnology Information
- #
- # This software/database is a "United States Government Work" under the
- # terms of the United States Copyright Act. It was written as part of
- # the author's official duties as a United States Government employee and
- # thus cannot be copyrighted. This software/database is freely available
- # to the public for use. The National Library of Medicine and the U.S.
- # Government have not placed any restriction on its use or reproduction.
- #
- # Although all reasonable efforts have been taken to ensure the accuracy
- # and reliability of the software and data, the NLM and the U.S.
- # Government do not and cannot warrant the performance or results that
- # may be obtained by using this software or data. The NLM and the U.S.
- # Government disclaim all warranties, express or implied, including
- # warranties of performance, merchantability or fitness for any particular
- # purpose.
- #
- # Please cite the author in any work or product based on this material.
- #
- # ===========================================================================
- #
- # Author: Zev Hochberg
- #
- # File Description:
- # Accepts tax_id as an argument
- # Gets name information from gene via e-utils
- #
- use strict;
- # ---------------------------------------------------------------------------
- #
- use LWP::Simple; # Define library for the 'get' function.
- # ---------------------------------------------------------------------------
- #
- # Global variables
- my $verbose = 0;
- my $taxId;
- my $dopt;
- my $beginTime = time();
- my $startTag = "<eSummaryResult>";
- my $stopTag = "</eSummaryResult>";
- # ---------------------------------------------------------------------------
- #
- # Main program
- #
- &processCommandLineParameters;
- &verifyInput;
- &printHeader if $dopt eq 'tab';
- &process;
- &end;
- ################################################################################
- #
- sub processCommandLineParameters {
- $taxId = "";
- $dopt = "";
- while(defined ($_= shift @ARGV)) {
- if ($_ eq '-h') {
- &showUsageAndExit;
- } elsif ($_ eq '-v') {
- $verbose = 1;
- } elsif ($_ eq '-t') {
- $taxId = shift @ARGV;
- } elsif ($_ eq '-o') {
- $dopt = shift @ARGV;
- } else {
- # Ignore extra input
- }
- }
- if ($taxId eq "") {&showUsageAndExit}
- if ($dopt eq "") {&showUsageAndExit}
- }
- ################################################################################
- #
- sub verifyInput {
- unless (($taxId =~ m/^\d*$/) and ($taxId != 0)) {
- print STDERR "taxonomyId (-t) must be numeric and > 0\n";
- &showUsageAndExit;
- }
- unless (($dopt eq 'xml') or ($dopt eq 'tab')) {
- print STDERR "output (-o) is required and must equal 'xml' or 'tab'\n";
- &showUsageAndExit;
- }
- }
- ################################################################################
- #
- sub showUsageAndExit {
- my $usage = qq/
- Usage: $0 [option] -t taxonomyId -o xml|tab
- Options: -h Display this usage help information
- -v Verbose
- -o output options
- xml - XML
- tab - tab-delimited
- /;
- print STDERR "$usage\n\n";
- &end
- }
- ################################################################################
- #
- sub printHeader {
- print STDOUT "geneId\tname\tdescription\n";
- }
- ################################################################################
- #
- sub process {
- my $geneStart = 0;
- my $maxGenes = 250;
- my $totalGenes;
- my $haveTotal = 0;
- my $geneId = "";
- my $geneCount = 0;
- my $qs;
- my $esearch_result;
- my $esummary_result;
- my $first = 1;
- # Main loop: get up to maxGenes Gene ID's for requested taxId from Gene
- $totalGenes = $geneStart+1; # to get started
- GeneLoop:
- for (; $geneStart<$totalGenes; $geneStart+=$maxGenes) {
- if ($verbose) {print STDOUT "Processing genes $geneStart to ", $geneStart+$maxGenes-1, "\n";}
- #defining the query
- #this option looks for GeneIDs with the $taxId value of interest that are
- #encoded on the mitochondrion. Note use of the field restriction [properties]
- #and the boolean AND
- #$qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+AND+source_mitochondrion[properties]";
- #this option looks for GeneIDs with the $taxId value of interest that are
- #NOT encoded on the mitochondrion and do not have RefSeqs of the type model.
- #Note use of the boolean NOT and field restriction
- $qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+NOT+source_mitochondrion[properties]+NOT+srcdb_refseq_model[properties]";
-
- $esearch_result = &Eutil ("esearch", $qs);
- if ($esearch_result =~ m/<Count>(\d+)<\/Count>/) {
- if ($haveTotal) {
- if ($totalGenes != $1) {
- die "esearch reported new total genes: was $totalGenes; now $1\nFor request $qs\n";
- }
- } else {
- $totalGenes = $1; # extract total
- $haveTotal = 1;
- }
- } else {
- die "$esearch_result did not contain expected <Count>,\n for request $qs\n";
- }
- # Build querystring for GENE search
- $qs = "db=gene&id=";
- while ($esearch_result =~ m/<Id>(\d+)<\/Id>/g) {
- $geneCount++;
- $geneId = $1; # extract a geneId
- $qs .= "$geneId,";
- }
- chop($qs);
- # Get Gene summary
- $esummary_result = &Eutil ("esummary", $qs);
-
- # Extract and output information for all genes
- if ($dopt eq 'tab'){
- &extractAndOutput ($esummary_result);
- } else {
- &extractAndOutputXml ($esummary_result, $first);
- }
- $first = 0;
-
- if ($verbose) {
- print STDOUT "\ngeneCount: $geneCount\n";
- }
- }
-
- # Close xml output
- if ($dopt eq 'xml') {print STDOUT $stopTag}
- }
- ################################################################################
- #
- sub extractAndOutput {
- my $xml = $_[0];
- my $match;
-
- my $geneId;
- my $name;
- my $description;
-
- while ($xml =~ m/<DocSum>(.*?)<\/DocSum>/gs) {
- $match = $1;
-
- $geneId = "";
- $name = "";
- $description = "";
-
- if ($match =~ /<Id>(\d+)<\/Id>/) {$geneId = $1}
- if ($match =~ m/<Item Name=.Name.*>(.+)<\/Item>/) {$name = $1}
- if ($match =~ m/<Item Name=.Description.*>(.+)<\/Item>/) {$description = $1}
-
- print STDOUT "$geneId\t$name\t$description\n";
- }
- }
- ################################################################################
- #
- sub extractAndOutputXml {
- my $xml = shift;
- my $first = shift;
-
- # Strip leading <!DOCTYPE, <?xml, and bracketing <eSummaryResult> tags,
- # from all but first set
- # Strip closing <eSummaryResult> tag from all
-
- if ($first) {
- my $start = 0;
- my $stop = index ($xml, $stopTag);
- print STDOUT substr ($xml, $start, $stop-$start);
- } else {
- my $start = index ($xml, $startTag) + length($startTag);
- my $stop = index ($xml, $stopTag);
- print STDOUT substr ($xml, $start, $stop-$start);
- }
- }
- ################################################################################
- #
- # Subroutine to handle all eutil calls
- #
- # Create a BEGIN block to provide effect of static data for sub
- BEGIN {
- # static data
- my $lastEutilTime = 0; # init to avoid delay on first Eutil
-
- # storing local constants here too.
- my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
- my $minUtilPeriod = 3; # seconds
-
- sub Eutil {
- my ($eutil, $querystringData) = @_;
-
- my $elapsed = time() - $lastEutilTime;
- my $delay = $minUtilPeriod - $elapsed;
- if ($delay < 0) {$delay = 0}
- sleep ($delay);
- $lastEutilTime = time(); # save for next time
- my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData";
- if ($verbose) {print STDOUT "\neutilUrl: $eutilUrl\n";}
-
- my $result = get($eutilUrl); # get result of the eutil for return
- if ((not defined $result) or ($result eq ""))
- {
- $result = ""; # Simplify error testing on return
- print STDERR "$eutil failed for request: $querystringData\n\n";
- }
-
- if ($verbose) {
- print STDOUT "\neutil result: $result\n";
- my $elapsedTime = time() - $lastEutilTime;
- print STDOUT "$eutil took $elapsedTime seconds\n";
- }
-
- $result; # for return
- }
- }
- ################################################################################
- #
- sub end {
- if ($verbose) {
- my $elapsedTime = time() - $beginTime;
- print STDOUT "\nElapsed time: $elapsedTime seconds\n";
- }
- exit;
- }
|