#!/usr/local/bin/perl -w
# ===========================================================================
#
# PUBLIC DOMAIN NOTICE
# National Center for Biotechnology Information
#
# This software/database is a "United States Government Work" under the
# terms of the United States Copyright Act. It was written as part of
# the author's official duties as a United States Government employee and
# thus cannot be copyrighted. This software/database is freely available
# to the public for use. The National Library of Medicine and the U.S.
# Government have not placed any restriction on its use or reproduction.
#
# Although all reasonable efforts have been taken to ensure the accuracy
# and reliability of the software and data, the NLM and the U.S.
# Government do not and cannot warrant the performance or results that
# may be obtained by using this software or data. The NLM and the U.S.
# Government disclaim all warranties, express or implied, including
# warranties of performance, merchantability or fitness for any particular
# purpose.
#
# Please cite the author in any work or product based on this material.
#
# ===========================================================================
#
# Author: Zev Hochberg
#
# File Description:
# Accepts tax_id as an argument
# Gets name information from gene via e-utils
#
use strict;
# ---------------------------------------------------------------------------
#
use LWP::Simple; # Define library for the 'get' function.
# ---------------------------------------------------------------------------
#
# Global variables
my $verbose = 0;
my $taxId;
my $dopt;
my $beginTime = time();
my $startTag = "";
my $stopTag = "";
# ---------------------------------------------------------------------------
#
# Main program
#
&processCommandLineParameters;
&verifyInput;
&printHeader if $dopt eq 'tab';
&process;
&end;
################################################################################
#
sub processCommandLineParameters {
$taxId = "";
$dopt = "";
while(defined ($_= shift @ARGV)) {
if ($_ eq '-h') {
&showUsageAndExit;
} elsif ($_ eq '-v') {
$verbose = 1;
} elsif ($_ eq '-t') {
$taxId = shift @ARGV;
} elsif ($_ eq '-o') {
$dopt = shift @ARGV;
} else {
# Ignore extra input
}
}
if ($taxId eq "") {&showUsageAndExit}
if ($dopt eq "") {&showUsageAndExit}
}
################################################################################
#
sub verifyInput {
unless (($taxId =~ m/^\d*$/) and ($taxId != 0)) {
print STDERR "taxonomyId (-t) must be numeric and > 0\n";
&showUsageAndExit;
}
unless (($dopt eq 'xml') or ($dopt eq 'tab')) {
print STDERR "output (-o) is required and must equal 'xml' or 'tab'\n";
&showUsageAndExit;
}
}
################################################################################
#
sub showUsageAndExit {
my $usage = qq/
Usage: $0 [option] -t taxonomyId -o xml|tab
Options: -h Display this usage help information
-v Verbose
-o output options
xml - XML
tab - tab-delimited
/;
print STDERR "$usage\n\n";
&end
}
################################################################################
#
sub printHeader {
print STDOUT "geneId\tname\tdescription\n";
}
################################################################################
#
sub process {
my $geneStart = 0;
my $maxGenes = 250;
my $totalGenes;
my $haveTotal = 0;
my $geneId = "";
my $geneCount = 0;
my $qs;
my $esearch_result;
my $esummary_result;
my $first = 1;
# Main loop: get up to maxGenes Gene ID's for requested taxId from Gene
$totalGenes = $geneStart+1; # to get started
GeneLoop:
for (; $geneStart<$totalGenes; $geneStart+=$maxGenes) {
if ($verbose) {print STDOUT "Processing genes $geneStart to ", $geneStart+$maxGenes-1, "\n";}
#defining the query
#this option looks for GeneIDs with the $taxId value of interest that are
#encoded on the mitochondrion. Note use of the field restriction [properties]
#and the boolean AND
#$qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+AND+source_mitochondrion[properties]";
#this option looks for GeneIDs with the $taxId value of interest that are
#NOT encoded on the mitochondrion and do not have RefSeqs of the type model.
#Note use of the boolean NOT and field restriction
$qs = "db=gene&retstart=$geneStart&retmax=$maxGenes&term=" . $taxId . "[taxid]+NOT+source_mitochondrion[properties]+NOT+srcdb_refseq_model[properties]";
$esearch_result = &Eutil ("esearch", $qs);
if ($esearch_result =~ m/(\d+)<\/Count>/) {
if ($haveTotal) {
if ($totalGenes != $1) {
die "esearch reported new total genes: was $totalGenes; now $1\nFor request $qs\n";
}
} else {
$totalGenes = $1; # extract total
$haveTotal = 1;
}
} else {
die "$esearch_result did not contain expected ,\n for request $qs\n";
}
# Build querystring for GENE search
$qs = "db=gene&id=";
while ($esearch_result =~ m/(\d+)<\/Id>/g) {
$geneCount++;
$geneId = $1; # extract a geneId
$qs .= "$geneId,";
}
chop($qs);
# Get Gene summary
$esummary_result = &Eutil ("esummary", $qs);
# Extract and output information for all genes
if ($dopt eq 'tab'){
&extractAndOutput ($esummary_result);
} else {
&extractAndOutputXml ($esummary_result, $first);
}
$first = 0;
if ($verbose) {
print STDOUT "\ngeneCount: $geneCount\n";
}
}
# Close xml output
if ($dopt eq 'xml') {print STDOUT $stopTag}
}
################################################################################
#
sub extractAndOutput {
my $xml = $_[0];
my $match;
my $geneId;
my $name;
my $description;
while ($xml =~ m/(.*?)<\/DocSum>/gs) {
$match = $1;
$geneId = "";
$name = "";
$description = "";
if ($match =~ /(\d+)<\/Id>/) {$geneId = $1}
if ($match =~ m/- (.+)<\/Item>/) {$name = $1}
if ($match =~ m/
- (.+)<\/Item>/) {$description = $1}
print STDOUT "$geneId\t$name\t$description\n";
}
}
################################################################################
#
sub extractAndOutputXml {
my $xml = shift;
my $first = shift;
# Strip leading tags,
# from all but first set
# Strip closing tag from all
if ($first) {
my $start = 0;
my $stop = index ($xml, $stopTag);
print STDOUT substr ($xml, $start, $stop-$start);
} else {
my $start = index ($xml, $startTag) + length($startTag);
my $stop = index ($xml, $stopTag);
print STDOUT substr ($xml, $start, $stop-$start);
}
}
################################################################################
#
# Subroutine to handle all eutil calls
#
# Create a BEGIN block to provide effect of static data for sub
BEGIN {
# static data
my $lastEutilTime = 0; # init to avoid delay on first Eutil
# storing local constants here too.
my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
my $minUtilPeriod = 3; # seconds
sub Eutil {
my ($eutil, $querystringData) = @_;
my $elapsed = time() - $lastEutilTime;
my $delay = $minUtilPeriod - $elapsed;
if ($delay < 0) {$delay = 0}
sleep ($delay);
$lastEutilTime = time(); # save for next time
my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData";
if ($verbose) {print STDOUT "\neutilUrl: $eutilUrl\n";}
my $result = get($eutilUrl); # get result of the eutil for return
if ((not defined $result) or ($result eq ""))
{
$result = ""; # Simplify error testing on return
print STDERR "$eutil failed for request: $querystringData\n\n";
}
if ($verbose) {
print STDOUT "\neutil result: $result\n";
my $elapsedTime = time() - $lastEutilTime;
print STDOUT "$eutil took $elapsedTime seconds\n";
}
$result; # for return
}
}
################################################################################
#
sub end {
if ($verbose) {
my $elapsedTime = time() - $beginTime;
print STDOUT "\nElapsed time: $elapsedTime seconds\n";
}
exit;
}