123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051 |
- #!/bin/bash
- #set -e
- type="proteins"
- script_path=$(dirname `realpath $0`)
- outdir="."
- declare -A software
- software["CDS"]="diamond blastx"
- software["proteins"]="diamond blastp"
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -i cds.fa -o outdir -t CDS -d uniprot.fa -g uniprot.go.tsv"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-i: fasta file"
- echo -e "\t-o: output dir"
- echo -e "\t-t: proteins or CDS default:proteins"
- echo -e "\t-d: uniprot species fasta"
- echo -e "\t-g: uniprot species go annotation"
- exit 1
- }
- while getopts 'd:g:t:i:o:h' OPT;do
- case $OPT in
- h) usage;;
- d) uniprot_fasta="$OPTARG";;
- t) type="$OPTARG";;
- i) fasta="$OPTARG";;
- o) outdir="$OPTARG";;
- g) uniprot_go="$OPTARG";;
- ?) usage;;
- esac
- done
- if [[ ! $uniprot_fasta ]] || [[ ! -s $fasta ]] || [[ ! $uniprot_go ]]
- then
- usage
- fi
- if [[ ! ${software[${type}]} ]]
- then
- echo "ERROR:unknown -t $type"
- usage
- fi
- mkdir -p $outdir
- diamond makedb --in $uniprot_fasta -d $uniprot_fasta
- ${software[${type}]} -q $fasta -d ${uniprot_fasta}.dmnd -o $outdir/blast.up.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
- cut -f 1,2 -d "|" $outdir/blast.up.txt|sed 's/\t.*|/\t/'|awk -F"\t" '!a[$1]++{print}' > $outdir/gid_upid
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$3}FNR!=NR{if(a[$2]){GO=a[$2]}else{GO="-"};print $1,GO}' $uniprot_go $outdir/gid_upid > $outdir/uniprot.go
|