NR.sh 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677
  1. #!/bin/bash
  2. #set -e
  3. type="proteins"
  4. script_path=$(dirname `realpath $0`)
  5. database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/NR"
  6. declare -A software database_dir
  7. software["CDS"]="diamond blastx"
  8. software["proteins"]="diamond blastp"
  9. database_dir["animal"]=$database_dir/diamond_index/Metazoa.fa.dmnd
  10. database_dir["plant"]=$database_dir/diamond_index/Viridiplantae.fa.dmnd
  11. database_dir["fungi"]=$database_dir/diamond_index/Fungi.fa.dmnd
  12. database_dir["prokaryotes"]=$database_dir/diamond_index/Bacteria.fa.dmnd
  13. database_dir["virus"]=$database_dir/diamond_index/Virus.fa.dmnd
  14. usage(){
  15. echo -e "Usage:"
  16. echo -e "\t$0 -i cds.fa -o outdir -t CDS -s animal"
  17. echo -e "OPTIONS:"
  18. echo -e "\t-h: help information"
  19. echo -e "\t-i: fasta file"
  20. echo -e "\t-o: output dir"
  21. echo -e "\t-t: proteins or CDS default:proteins"
  22. echo -e "\t-s: species [ animal plant fungi prokaryotes virus taxid(9606 10096 ..) ]"
  23. exit 1
  24. }
  25. while getopts 'hs:t:i:o:' OPT;do
  26. case $OPT in
  27. h) usage;;
  28. s) species="$OPTARG";;
  29. t) type="$OPTARG";;
  30. i) fasta="$OPTARG";;
  31. o) outdir="$OPTARG";;
  32. ?) usage;;
  33. esac
  34. done
  35. #echo $species $type $fasta $outdir
  36. if [[ ! $species ]] || [[ ! $fasta ]] || [[ ! $outdir ]] || [ $# == 0 ]
  37. then
  38. usage
  39. fi
  40. if [[ ! ${software[${type}]} ]]
  41. then
  42. echo "ERROR:unknown -t $type "
  43. exit 1
  44. fi
  45. mkdir -p $outdir
  46. case $species in
  47. animal|plant|fungi|prokaryotes|virus)
  48. ${software[${type}]} -q ${fasta} -d ${database_dir[${species}]} -o $outdir/blast.$species.txt -e 0.00001 \
  49. --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle \
  50. -p 12 --more-sensitive --max-target-seqs 5
  51. ;;
  52. [0-9]*)
  53. /Business/psn_company/t04/hudabang/bin/yaogu/bin/taxonkit list --data-dir /Business/psn_company/t04/.taxonkit \
  54. --indent "" -i $species -n -o $outdir/$species.taxid.txt
  55. sed -i 's/ /\t/' $outdir/$species.taxid.txt
  56. head -1 $outdir/$species.taxid.txt | sed "s/^/taxid and name are : /"
  57. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$3])print $2}' $outdir/$species.taxid.txt \
  58. $database_dir/prot.accession2taxid > $outdir/$species.accession.txt
  59. perl $script_path/extract_fa_by_list.pl $outdir/$species.accession.txt $database_dir/nr $outdir/$species.fa
  60. diamond makedb --in $outdir/$species.fa -d $outdir/$species.fa
  61. ${software[${type}]} -q ${fasta} -d $outdir/$species.fa.dmnd -o $outdir/blast.$species.txt -e 0.00001 \
  62. --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle \
  63. -p 12 --more-sensitive --max-target-seqs 5
  64. ;;
  65. *)
  66. echo "ERROR:unknown -s $species"
  67. usage
  68. ;;
  69. esac
  70. awk 'BEGIN{FS=OFS="\t"}{if(!a[$1]){print $1,$13;a[$1]=1}}' $outdir/blast.$species.txt > $outdir/NR.txt