123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294 |
- #!/usr/bin/perl -w
- # ===========================================================================
- #
- # PUBLIC DOMAIN NOTICE
- # National Center for Biotechnology Information
- #
- # This software/database is a "United States Government Work" under the
- # terms of the United States Copyright Act. It was written as part of
- # the author's official duties as a United States Government employee and
- # thus cannot be copyrighted. This software/database is freely available
- # to the public for use. The National Library of Medicine and the U.S.
- # Government have not placed any restriction on its use or reproduction.
- #
- # Although all reasonable efforts have been taken to ensure the accuracy
- # and reliability of the software and data, the NLM and the U.S.
- # Government do not and cannot warrant the performance or results that
- # may be obtained by using this software or data. The NLM and the U.S.
- # Government disclaim all warranties, express or implied, including
- # warranties of performance, merchantability or fitness for any particular
- # purpose.
- #
- # Please cite the author in any work or product based on this material.
- #
- # ===========================================================================
- #
- # Author: Craig Wallin
- #
- # File Description:
- #
- # Accepts chromosome and genomic location range as input data.
- # Sample input:
- # chr12:1000000-2000000
- # Searches for genes in the specified range from Entrez Gene via e-utils.
- # Gets GO data from gene2go FTP file (must be downloaded and decompressed
- # first).
- # Outputs a summary of GO data for genes in the specified range, in
- # tab-delimited format.
- use strict;
- # ---------------------------------------------------------------------------
- #
- # use libraries
- use LWP::Simple; # Define library for the 'get' function.
- # ---------------------------------------------------------------------------
- #
- # Global variables
- my $arg_tax_id = 9606; # human by default
- my $arg_verbose = 0;
- my $arg_input_data_file = "-"; # STDIN by default
- my $arg_go_file = "";
- my $beginTime = time();
- my %gene_go_terms = ();
- # ---------------------------------------------------------------------------
- #
- # Main program
- #
- &processCommandLineParameters;
- &processInputData;
- &end;
- ################################################################################
- #
- sub processCommandLineParameters {
- while ( my $term = shift @ARGV ) {
- if ($term eq '-h') {
- &showUsageAndExit;
- } elsif ($term eq '-v') {
- $arg_verbose = 1;
- } elsif ($term eq '-i') {
- $arg_input_data_file = shift @ARGV;
- } elsif ($term eq '-g') {
- $arg_go_file = shift @ARGV;
- } elsif ($term eq '-t') {
- $arg_tax_id = shift @ARGV;
- } else {
- # Ignore extra input
- }
- }
- if ($arg_input_data_file eq "") {&showUsageAndExit}
- if ($arg_go_file eq "") {&showUsageAndExit}
- die "gene2go file does not exist: $arg_go_file\n" unless -e $arg_go_file;
- }
- ################################################################################
- sub showUsageAndExit {
- my $usage = qq/
- Usage: $0 [options]
- Options: -h Display this usage help information
- -v Verbose
- -i Input file (or - for stdin) with ranges
- -g gene2go file
- -t tax id
- /;
- print STDERR "$usage\n";
- &end
- }
- ################################################################################
- sub printHeader {
- print STDOUT join "\t", ( "Genomic coordinates", "Enzyme", "Receptor", "Hormone", "Structural" );
- print STDOUT "\n";
- }
- ################################################################################
- # This populates %gene_go_terms
- sub readGoFile {
- print STDERR "Reading gene2go file...\n";
- open IN, "<$arg_go_file" or die "Failed to open: $arg_go_file\n";
- while ( <IN> ) {
- chomp();
- next if m/^#/; # skip header
- my ( $tax_id, $GeneID, $GO_ID, $Evidence, $Qualifier, $GO_term, $PubMed, $Category ) = split /\t/;
- next unless $tax_id == $arg_tax_id;
- next if $Qualifier =~ m/^NOT/;
- # Accumulate terms for this gene.
- my $gene_terms = $gene_go_terms{ $GeneID };
- $gene_terms .= "enzyme " if $GO_term =~ m/enzyme/i;
- $gene_terms .= "receptor " if $GO_term =~ m/receptor/i;
- $gene_terms .= "hormone " if $GO_term =~ m/hormone/i;
- $gene_terms .= "structural " if $GO_term =~ m/structural/i;
- $gene_go_terms{ $GeneID } = $gene_terms;
- }
- print STDERR "Done reading gene2go file, read data for " . scalar(keys(%gene_go_terms)) . " genes.\n";
- close IN;
- }
- ################################################################################
- sub processInputData {
- readGoFile();
- my $maxResults = 15000;
- my $esearch_result;
- &printHeader;
- my @input_records = ();
- open IN, "<$arg_input_data_file" or die "Failed to open input: $arg_input_data_file\n";
- while ( my $line = <IN> ) {
- chomp($line);
- push @input_records, $line;
- }
- my $total_records = scalar @input_records;
- my $record_count = 0;
- while ( my $line = shift @input_records ) {
- $record_count++;
- if ( $record_count % 100 == 0 ) {
- print STDERR "Read $record_count of $total_records records\n";
- }
- $line =~ m/^chr(\w+):(\d+)-(\d+)$/ or die "Could not parse input line: $line\n";
- my ( $chr, $start, $stop ) = ( $1, $2, $3 );
- my $location_str = "chr$chr:$start\-$stop";
- my $terms="txid9606 AND $chr \[chr\] AND $start : $stop \[chrpos\]";
- my $request = "db=gene&retmax=$maxResults&term=$terms";
- #print "$request\n";
- $esearch_result = &Eutil ("esearch", $request);
- $esearch_result =~ m/<Count>(\d+)<\/Count>/
- or die "$esearch_result did not contain expected <Count>,\n for request $request\n";
- # Summarize counts as the number of genes in the range with the
- # desired keywords.
- my $num_enzyme = 0;
- my $num_receptor = 0;
- my $num_hormone = 0;
- my $num_structural = 0;
- while ($esearch_result =~ m/<Id>(\d+)<\/Id>/g) {
- my $GeneID = $1; # extract a geneId
- my $gene_terms = $gene_go_terms{ $GeneID };
- next unless $gene_terms;
- $num_enzyme++ if $gene_terms =~ m/enzyme/;
- $num_receptor++ if $gene_terms =~ m/receptor/;
- $num_hormone++ if $gene_terms =~ m/hormone/;
- $num_structural++ if $gene_terms =~ m/structural/;
- }
- print STDOUT join "\t", ( $location_str, $num_enzyme, $num_receptor, $num_hormone, $num_structural );
- print STDOUT "\n";
- }
- close IN;
- }
- ################################################################################
- # Subroutine to handle all eutil calls.
- # Create a BEGIN block to provide effect of static data for sub
- BEGIN {
- use Time::HiRes qw( usleep gettimeofday tv_interval );
- # static data
- my $lastEutilTime = [gettimeofday]; # init once
- # storing local constants here too.
- my $eutilBaseUrl = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils";
- # most frequent usage allowed for eutils is 3 requests per second
- my $minUtilPeriod_microsec = 333333; # microseconds
- sub Eutil {
- my ($eutil, $querystringData) = @_;
- my $elapsed_sec = tv_interval( $lastEutilTime );
- my $delay_microsec = $minUtilPeriod_microsec - ( $elapsed_sec * 1000000 );
- if ($delay_microsec < 0) {$delay_microsec = 0}
- usleep ($delay_microsec);
- $lastEutilTime = [gettimeofday]; # save for next time
- my $eutilUrl = "$eutilBaseUrl/$eutil.fcgi?$querystringData";
- if ($arg_verbose) {print STDERR "\neutilUrl: $eutilUrl\n";}
- my $result = get($eutilUrl); # get result of the eutil for return
- if ((not defined $result) or ($result eq ""))
- {
- $result = ""; # Simplify error testing on return
- print STDERR "$eutil failed for request: $querystringData\n\n";
- }
- if ($arg_verbose) {
- print STDERR "\neutil result: $result\n";
- my $elapsedTime = tv_interval( $lastEutilTime );
- print STDERR "$eutil took $elapsedTime seconds\n";
- }
- $result; # for return
- }
- }
- ################################################################################
- sub end {
- if ($arg_verbose) {
- my $elapsedTime = time() - $beginTime;
- print STDERR "\nElapsed time: $elapsedTime seconds\n";
- }
- exit;
- }
|