blast_uniprot.sh 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051
  1. #!/bin/bash
  2. #set -e
  3. type="proteins"
  4. script_path=$(dirname `realpath $0`)
  5. outdir="."
  6. declare -A software
  7. software["CDS"]="diamond blastx"
  8. software["proteins"]="diamond blastp"
  9. usage(){
  10. echo -e "Usage:"
  11. echo -e "\t$0 -i cds.fa -o outdir -t CDS -d uniprot.fa -g uniprot.go.tsv"
  12. echo -e "OPTIONS:"
  13. echo -e "\t-h: help information"
  14. echo -e "\t-i: fasta file"
  15. echo -e "\t-o: output dir"
  16. echo -e "\t-t: proteins or CDS default:proteins"
  17. echo -e "\t-d: uniprot species fasta"
  18. echo -e "\t-g: uniprot species go annotation"
  19. exit 1
  20. }
  21. while getopts 'd:g:t:i:o:h' OPT;do
  22. case $OPT in
  23. h) usage;;
  24. d) uniprot_fasta="$OPTARG";;
  25. t) type="$OPTARG";;
  26. i) fasta="$OPTARG";;
  27. o) outdir="$OPTARG";;
  28. g) uniprot_go="$OPTARG";;
  29. ?) usage;;
  30. esac
  31. done
  32. if [[ ! $uniprot_fasta ]] || [[ ! -s $fasta ]] || [[ ! $uniprot_go ]]
  33. then
  34. usage
  35. fi
  36. if [[ ! ${software[${type}]} ]]
  37. then
  38. echo "ERROR:unknown -t $type"
  39. usage
  40. fi
  41. mkdir -p $outdir
  42. diamond makedb --in $uniprot_fasta -d $uniprot_fasta
  43. ${software[${type}]} -q $fasta -d ${uniprot_fasta}.dmnd -o $outdir/blast.up.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
  44. cut -f 1,2 -d "|" $outdir/blast.up.txt|sed 's/\t.*|/\t/'|awk -F"\t" '!a[$1]++{print}' > $outdir/gid_upid
  45. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$3}FNR!=NR{if(a[$2]){GO=a[$2]}else{GO="-"};print $1,GO}' $uniprot_go $outdir/gid_upid > $outdir/uniprot.go