1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- #!/bin/bash
- #set -e
- type="proteins"
- script_path=$(dirname `realpath $0`)
- database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/NR"
- declare -A software database_dir
- software["CDS"]="diamond blastx"
- software["proteins"]="diamond blastp"
- database_dir["animal"]=$database_dir/diamond_index/Metazoa.fa.dmnd
- database_dir["plant"]=$database_dir/diamond_index/Viridiplantae.fa.dmnd
- database_dir["fungi"]=$database_dir/diamond_index/Fungi.fa.dmnd
- database_dir["prokaryotes"]=$database_dir/diamond_index/Bacteria.fa.dmnd
- database_dir["virus"]=$database_dir/diamond_index/Virus.fa.dmnd
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -i cds.fa -o outdir -t CDS -s animal"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-i: fasta file"
- echo -e "\t-o: output dir"
- echo -e "\t-t: proteins or CDS default:proteins"
- echo -e "\t-s: species [ animal plant fungi prokaryotes virus taxid(9606 10096 ..) ]"
- exit 1
- }
- while getopts 'hs:t:i:o:' OPT;do
- case $OPT in
- h) usage;;
- s) species="$OPTARG";;
- t) type="$OPTARG";;
- i) fasta="$OPTARG";;
- o) outdir="$OPTARG";;
- ?) usage;;
- esac
- done
- #echo $species $type $fasta $outdir
- if [[ ! $species ]] || [[ ! $fasta ]] || [[ ! $outdir ]] || [ $# == 0 ]
- then
- usage
- fi
- if [[ ! ${software[${type}]} ]]
- then
- echo "ERROR:unknown -t $type "
- exit 1
- fi
- mkdir -p $outdir
- case $species in
- animal|plant|fungi|prokaryotes|virus)
- ${software[${type}]} -q ${fasta} -d ${database_dir[${species}]} -o $outdir/blast.$species.txt -e 0.00001 \
- --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle \
- -p 12 --more-sensitive --max-target-seqs 5
- ;;
- [0-9]*)
- /Business/psn_company/t04/hudabang/bin/yaogu/bin/taxonkit list --data-dir /Business/psn_company/t04/.taxonkit \
- --indent "" -i $species -n -o $outdir/$species.taxid.txt
- sed -i 's/ /\t/' $outdir/$species.taxid.txt
- head -1 $outdir/$species.taxid.txt | sed "s/^/taxid and name are : /"
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$3])print $2}' $outdir/$species.taxid.txt \
- $database_dir/prot.accession2taxid > $outdir/$species.accession.txt
- perl $script_path/extract_fa_by_list.pl $outdir/$species.accession.txt $database_dir/nr $outdir/$species.fa
- diamond makedb --in $outdir/$species.fa -d $outdir/$species.fa
- ${software[${type}]} -q ${fasta} -d $outdir/$species.fa.dmnd -o $outdir/blast.$species.txt -e 0.00001 \
- --outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore stitle \
- -p 12 --more-sensitive --max-target-seqs 5
- ;;
- *)
- echo "ERROR:unknown -s $species"
- usage
- ;;
- esac
- awk 'BEGIN{FS=OFS="\t"}{if(!a[$1]){print $1,$13;a[$1]=1}}' $outdir/blast.$species.txt > $outdir/NR.txt
|