#!/bin/bash #set -e type="proteins" script_path=$(dirname `realpath $0`) outdir="." declare -A software software["CDS"]="diamond blastx" software["proteins"]="diamond blastp" usage(){ echo -e "Usage:" echo -e "\t$0 -i cds.fa -o outdir -t CDS -d uniprot.fa -g uniprot.go.tsv" echo -e "OPTIONS:" echo -e "\t-h: help information" echo -e "\t-i: fasta file" echo -e "\t-o: output dir" echo -e "\t-t: proteins or CDS default:proteins" echo -e "\t-d: uniprot species fasta" echo -e "\t-g: uniprot species go annotation" exit 1 } while getopts 'd:g:t:i:o:h' OPT;do case $OPT in h) usage;; d) uniprot_fasta="$OPTARG";; t) type="$OPTARG";; i) fasta="$OPTARG";; o) outdir="$OPTARG";; g) uniprot_go="$OPTARG";; ?) usage;; esac done if [[ ! $uniprot_fasta ]] || [[ ! -s $fasta ]] || [[ ! $uniprot_go ]] then usage fi if [[ ! ${software[${type}]} ]] then echo "ERROR:unknown -t $type" usage fi mkdir -p $outdir diamond makedb --in $uniprot_fasta -d $uniprot_fasta ${software[${type}]} -q $fasta -d ${uniprot_fasta}.dmnd -o $outdir/blast.up.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5 cut -f 1,2 -d "|" $outdir/blast.up.txt|sed 's/\t.*|/\t/'|awk -F"\t" '!a[$1]++{print}' > $outdir/gid_upid awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$3}FNR!=NR{if(a[$2]){GO=a[$2]}else{GO="-"};print $1,GO}' $uniprot_go $outdir/gid_upid > $outdir/uniprot.go