#!/bin/bash #set -e type="proteins" script_path=$(dirname `realpath $0`) database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/KEGG" default_rate=0.5 idlist=1 declare -A software db_path software["CDS"]="diamond blastx" software["proteins"]="diamond blastp" db_path["animal"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Animals.pep.fasta.dmnd db_path["plant"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Plants.pep.fasta.dmnd db_path["fungi"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Fungi.pep.fasta.dmnd db_path["prokaryotes"]=/Business/psn_company/t10/DATA/KEGG/KOBAS/Bacteria.pep.fasta.dmnd usage(){ echo -e "Usage:" echo -e "\t$0 -i cds.fa -o outdir -t CDS -s animal -l idlist" echo -e "OPTIONS:" echo -e "\t-h: help information" echo -e "\t-i: fasta file" echo -e "\t-o: output dir" echo -e "\t-l: idlist;col1-6 is gene_id locus_tag old_locus_tag entry_id protein_id gene_name" echo -e "\t-t: proteins or CDS default:proteins" echo -e "\t-s: kegg org [ animal plant fungi prokaryotes hsa mmu ... ]" echo -e "\t-r: rate of Matched to known KO; default:0.5" exit 1 } while getopts 's:t:i:o:l:h' OPT;do case $OPT in h) usage;; s) species="$OPTARG";; t) type="$OPTARG";; i) fasta="$OPTARG";; o) outdir="$OPTARG";; l) idlist="$OPTARG";; ?) usage;; esac done if [[ ! $species ]] || [[ ! -s $fasta ]] || [[ ! $outdir ]] || [ $# == 0 ] then usage fi if [[ ! ${software[${type}]} ]] then echo "ERROR:unknown -t $type" usage fi case $species in animal|plant|fungi|prokaryotes) ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5 source /Business/psn_company/t01/local/anaconda3/bin/activate kobas kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt ;; *) if [ -s "$database_dir/KO/$species" ] && [ -s $idlist ] then known_ko_number=`cat $database_dir/KO/$species |wc -l` awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{for(i=1;i<=6;i++){if(a[$i])a[$1]=a[$i]};if(a[$1])print $1,a[$1]}' \ $database_dir/KO/$species $idlist > $outdir/KEGG.txt ko_number=`cat $outdir/KEGG.txt|wc -l` rate=`awk 'BEGIN{print "'$ko_number'"/"'$known_ko_number'"}'` if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ] then echo "KEGG annotated from $database_dir/KO/$species" else echo "rate of Matched to known KO is $rate < $default_rate;KO annotated by kobas" kingdom=`grep -P "^$species\t" $database_dir/org_kindom.txt|cut -f2` [ $kingdom == "protists" ] && "echo $species is protists; not in animal plant fungi prokaryotes" && exit 1 echo "blast db is : ${db_path[${kingdom}]}" ${software[${type}]} -q ${fasta} -d ${db_path[${kingdom}]} -o $outdir/blast.kegg.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5 source /Business/psn_company/t01/local/anaconda3/bin/activate kobas kobas-annotate -k /Business/psn_company/t01/public/software/kobas-3.0/ -i $outdir/blast.kegg.txt -t blastout:tab -s ko -o $outdir/kobas.txt cut -f 1 -d '|' $outdir/kobas.txt|grep None -v |grep -P "K\d+"> $outdir/KEGG.txt fi elif [ -s "$database_dir/KO/$species" ] then echo "ERROR:unknown -l;$idlist not exist" usage else echo "ERROR:$database_dir/KO/$species not exist" usage fi ;; esac