#!/usr/local/bin/perl -w # =========================================================================== # # PUBLIC DOMAIN NOTICE # National Center for Biotechnology Information # # This software/database is a "United States Government Work" under the # terms of the United States Copyright Act. It was written as part of # the author's official duties as a United States Government employee and # thus cannot be copyrighted. This software/database is freely available # to the public for use. The National Library of Medicine and the U.S. # Government have not placed any restriction on its use or reproduction. # # Although all reasonable efforts have been taken to ensure the accuracy # and reliability of the software and data, the NLM and the U.S. # Government do not and cannot warrant the performance or results that # may be obtained by using this software or data. The NLM and the U.S. # Government disclaim all warranties, express or implied, including # warranties of performance, merchantability or fitness for any particular # purpose. # # Please cite the author in any work or product based on this material. # # =========================================================================== # # Author: Zev Hochberg # # File Description: # Accepts tax_id as an argument # Gets name information from gene via e-utils # use strict; # --------------------------------------------------------------------------- # use LWP::Simple; # Define library for the 'get' function. # --------------------------------------------------------------------------- # # Global variables my $verbose = 0; my $taxId; my $dopt; my $beginTime = time(); my $startTag = ""; my $stopTag = ""; # --------------------------------------------------------------------------- # # Main program # &processCommandLineParameters; &verifyInput; &printHeader if $dopt eq 'tab'; &process; &end; ################################################################################ # sub processCommandLineParameters { $taxId = ""; $dopt = ""; while(defined ($_= shift @ARGV)) { if ($_ eq '-h') { &showUsageAndExit; } elsif ($_ eq '-v') { $verbose = 1; } elsif ($_ eq '-t') { $taxId = shift @ARGV; } elsif ($_ eq '-o') { $dopt = shift @ARGV; } else { # Ignore extra input } } if ($taxId eq "") {&showUsageAndExit} if ($dopt eq "") {&showUsageAndExit} } ################################################################################ # sub verifyInput { unless (($taxId =~ m/^\d*$/) and ($taxId != 0)) { print STDERR "taxonomyId (-t) must be numeric and > 0\n"; &showUsageAndExit; } unless (($dopt eq 'xml') or ($dopt eq 'tab')) { print STDERR "output (-o) is required and must equal 'xml' or 'tab'\n"; &showUsageAndExit; } } ################################################################################ # sub showUsageAndExit { my $usage = qq/ Usage: $0 [option] -t taxonomyId -o xml|tab Options: -h Display this usage help information -v Verbose -o output options xml - XML tab - tab-delimited /; print STDERR "$usage\n\n"; &end } ################################################################################ # sub printHeader { print STDOUT "geneId\tname\tdescription\n"; } ################################################################################ # sub process { my $geneStart = 0; my $maxGenes = 250; my $totalGenes; my $haveTotal = 0; my $geneId = ""; my $geneCount = 0; my $qs; my $esearch_result; my $esummary_result; my $first = 1; # Main loop: get up to maxGenes Gene ID's for requested taxId from Gene $totalGenes = $geneStart+1; # to get started GeneLoop: for (; $geneStart<$totalGenes; $geneStart+=$maxGenes) { if ($verbose) {print STDOUT "Processing genes $geneStart to ", $geneStart+$maxGenes-1, "\n";} #defining the query #this option looks for GeneIDs with the $taxId value of interest that are #encoded on the mitochondrion. Note use of the field restriction [properties] #and the boolean AND #$qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+AND+source_mitochondrion[properties]"; #this option looks for GeneIDs with the $taxId value of interest that are #NOT encoded on the mitochondrion and do not have RefSeqs of the type model. #Note use of the boolean NOT and field restriction $qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+NOT+source_mitochondrion[properties]+NOT+srcdb_refseq_model[properties]"; $esearch_result = &Eutil ("esearch", $qs); if ($esearch_result =~ m/(\d+)<\/Count>/) { if ($haveTotal) { if ($totalGenes != $1) { die "esearch reported new total genes: was $totalGenes; now $1\nFor request $qs\n"; } } else { $totalGenes = $1; # extract total $haveTotal = 1; } } else { die "$esearch_result did not contain expected ,\n for request $qs\n"; } # Build querystring for GENE search $qs = "db=gene&id="; while ($esearch_result =~ m/(\d+)<\/Id>/g) { $geneCount++; $geneId = $1; # extract a geneId $qs .= "$geneId,"; } chop($qs); # Get Gene summary $esummary_result = &Eutil ("esummary", $qs); # Extract and output information for all genes if ($dopt eq 'tab'){ &extractAndOutput ($esummary_result); } else { &extractAndOutputXml ($esummary_result, $first); } $first = 0; if ($verbose) { print STDOUT "\ngeneCount: $geneCount\n"; } } # Close xml output if ($dopt eq 'xml') {print STDOUT $stopTag} } ################################################################################ # sub extractAndOutput { my $xml = $_[0]; my $match; my $geneId; my $name; my $description; while ($xml =~ m/(.*?)<\/DocSum>/gs) { $match = $1; $geneId = ""; $name = ""; $description = ""; if ($match =~ /(\d+)<\/Id>/) {$geneId = $1} if ($match =~ m/(.+)<\/Item>/) {$name = $1} if ($match =~ m/(.+)<\/Item>/) {$description = $1} print STDOUT "$geneId\t$name\t$description\n"; } } ################################################################################ # sub extractAndOutputXml { my $xml = shift; my $first = shift; # Strip leading tags, # from all but first set # Strip closing tag from all if ($first) { my $start = 0; my $stop = index ($xml, $stopTag); print STDOUT substr ($xml, $start, $stop-$start); } else { my $start = index ($xml, $startTag) + length($startTag); my $stop = index ($xml, $stopTag); print STDOUT substr ($xml, $start, $stop-$start); } } ################################################################################ # # Subroutine to handle all eutil calls # # Create a BEGIN block to provide effect of static data for sub BEGIN { # static data my $lastEutilTime = 0; # init to avoid delay on first Eutil # storing local constants here too. my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"; my $minUtilPeriod = 3; # seconds sub Eutil { my ($eutil, $querystringData) = @_; my $elapsed = time() - $lastEutilTime; my $delay = $minUtilPeriod - $elapsed; if ($delay < 0) {$delay = 0} sleep ($delay); $lastEutilTime = time(); # save for next time my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData"; if ($verbose) {print STDOUT "\neutilUrl: $eutilUrl\n";} my $result = get($eutilUrl); # get result of the eutil for return if ((not defined $result) or ($result eq "")) { $result = ""; # Simplify error testing on return print STDERR "$eutil failed for request: $querystringData\n\n"; } if ($verbose) { print STDOUT "\neutil result: $result\n"; my $elapsedTime = time() - $lastEutilTime; print STDOUT "$eutil took $elapsedTime seconds\n"; } $result; # for return } } ################################################################################ # sub end { if ($verbose) { my $elapsedTime = time() - $beginTime; print STDOUT "\nElapsed time: $elapsedTime seconds\n"; } exit; }