KEGG.sh 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. #!/bin/bash
  2. #set -e
  3. type="proteins"
  4. script_path=$(dirname `realpath $0`)
  5. database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/KEGG"
  6. default_rate=0.5
  7. idlist=1
  8. declare -A software db_path
  9. software["CDS"]="diamond blastx"
  10. software["proteins"]="diamond blastp"
  11. db_path["animal"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Animals.pep.fasta.dmnd
  12. db_path["plant"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Plants.pep.fasta.dmnd
  13. db_path["fungi"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Fungi.pep.fasta.dmnd
  14. db_path["prokaryotes"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Bacteria.pep.fasta.dmnd
  15. usage(){
  16. echo -e "Usage:"
  17. echo -e "\t$0 -i cds.fa -o outdir -t CDS -s animal -l idlist"
  18. echo -e "OPTIONS:"
  19. echo -e "\t-h: help information"
  20. echo -e "\t-i: fasta file"
  21. echo -e "\t-o: output dir"
  22. echo -e "\t-l: idlist;col1-6 is gene_id locus_tag old_locus_tag entry_id protein_id gene_name"
  23. echo -e "\t-t: proteins or CDS default:proteins"
  24. echo -e "\t-s: kegg org [ animal plant fungi prokaryotes hsa mmu ... ]"
  25. echo -e "\t-r: rate of Matched to known KO; default:0.5"
  26. exit 1
  27. }
  28. while getopts 's:t:i:o:l:h' OPT;do
  29. case $OPT in
  30. h) usage;;
  31. s) species="$OPTARG";;
  32. t) type="$OPTARG";;
  33. i) fasta="$OPTARG";;
  34. o) outdir="$OPTARG";;
  35. l) idlist="$OPTARG";;
  36. ?) usage;;
  37. esac
  38. done
  39. if [[ ! $species ]] || [[ ! -s $fasta ]] || [[ ! $outdir ]] || [ $# == 0 ]
  40. then
  41. usage
  42. fi
  43. if [[ ! ${software[${type}]} ]]
  44. then
  45. echo "ERROR:unknown -t $type"
  46. usage
  47. fi
  48. case $species in
  49. animal|plant|fungi|prokaryotes)
  50. ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
  51. source /Business/psn_company/t01/local/anaconda3/bin/activate kobas
  52. kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt
  53. cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt
  54. ;;
  55. *)
  56. if [ -s "$database_dir/KO/$species" ] && [ -s $idlist ]
  57. then
  58. known_ko_number=`cat $database_dir/KO/$species |wc -l`
  59. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{for(i=1;i<=6;i++){if(a[$i])a[$1]=a[$i]};if(a[$1])print $1,a[$1]}' \
  60. $database_dir/KO/$species $idlist > $outdir/KEGG.txt
  61. ko_number=`cat $outdir/KEGG.txt|wc -l`
  62. rate=`awk 'BEGIN{print "'$ko_number'"/"'$known_ko_number'"}'`
  63. if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ]
  64. then
  65. echo "KEGG annotated from $database_dir/KO/$species"
  66. else
  67. echo "rate of Matched to known KO is $rate < $default_rate;KO annotated by kobas"
  68. kingdom=`grep -P "^$species\t" $database_dir/org_kindom.txt|cut -f2`
  69. [ $kingdom == "protists" ] && "echo $species is protists; not in animal plant fungi prokaryotes" && exit 1
  70. echo "blast db is : ${db_path[${kingdom}]}"
  71. ${software[${type}]} -q ${fasta} -d ${db_path[${kingdom}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
  72. source /Business/psn_company/t01/local/anaconda3/bin/activate kobas
  73. kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt
  74. cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt
  75. fi
  76. elif [ -s "$database_dir/KO/$species" ]
  77. then
  78. echo "ERROR:unknown -l;$idlist not exist"
  79. usage
  80. else
  81. echo "ERROR:$database_dir/KO/$species not exist"
  82. usage
  83. fi
  84. ;;
  85. esac