12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- #!/bin/bash
- #set -e
- type="proteins"
- script_path=$(dirname `realpath $0`)
- database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/KEGG"
- default_rate=0.5
- idlist=1
- declare -A software db_path
- software["CDS"]="diamond blastx"
- software["proteins"]="diamond blastp"
- db_path["animal"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Animals.pep.fasta.dmnd
- db_path["plant"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Plants.pep.fasta.dmnd
- db_path["fungi"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Fungi.pep.fasta.dmnd
- db_path["prokaryotes"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Bacteria.pep.fasta.dmnd
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -i cds.fa -o outdir -t CDS -s animal -l idlist"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-i: fasta file"
- echo -e "\t-o: output dir"
- echo -e "\t-l: idlist;col1-6 is gene_id locus_tag old_locus_tag entry_id protein_id gene_name"
- echo -e "\t-t: proteins or CDS default:proteins"
- echo -e "\t-s: kegg org [ animal plant fungi prokaryotes hsa mmu ... ]"
- echo -e "\t-r: rate of Matched to known KO; default:0.5"
- exit 1
- }
- while getopts 's:t:i:o:l:h' OPT;do
- case $OPT in
- h) usage;;
- s) species="$OPTARG";;
- t) type="$OPTARG";;
- i) fasta="$OPTARG";;
- o) outdir="$OPTARG";;
- l) idlist="$OPTARG";;
- ?) usage;;
- esac
- done
- if [[ ! $species ]] || [[ ! -s $fasta ]] || [[ ! $outdir ]] || [ $# == 0 ]
- then
- usage
- fi
- if [[ ! ${software[${type}]} ]]
- then
- echo "ERROR:unknown -t $type"
- usage
- fi
- case $species in
- animal|plant|fungi|prokaryotes)
- ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
- source /Business/psn_company/t01/local/anaconda3/bin/activate kobas
- kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt
- cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt
- ;;
- *)
- if [ -s "$database_dir/KO/$species" ] && [ -s $idlist ]
- then
- known_ko_number=`cat $database_dir/KO/$species |wc -l`
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{for(i=1;i<=6;i++){if(a[$i])a[$1]=a[$i]};if(a[$1])print $1,a[$1]}' \
- $database_dir/KO/$species $idlist > $outdir/KEGG.txt
- ko_number=`cat $outdir/KEGG.txt|wc -l`
- rate=`awk 'BEGIN{print "'$ko_number'"/"'$known_ko_number'"}'`
- if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ]
- then
- echo "KEGG annotated from $database_dir/KO/$species"
- else
- echo "rate of Matched to known KO is $rate < $default_rate;KO annotated by kobas"
- kingdom=`grep -P "^$species\t" $database_dir/org_kindom.txt|cut -f2`
- [ $kingdom == "protists" ] && "echo $species is protists; not in animal plant fungi prokaryotes" && exit 1
- echo "blast db is : ${db_path[${kingdom}]}"
- ${software[${type}]} -q ${fasta} -d ${db_path[${kingdom}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
- source /Business/psn_company/t01/local/anaconda3/bin/activate kobas
- kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt
- cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt
- fi
- elif [ -s "$database_dir/KO/$species" ]
- then
- echo "ERROR:unknown -l;$idlist not exist"
- usage
- else
- echo "ERROR:$database_dir/KO/$species not exist"
- usage
- fi
- ;;
- esac
|