1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586 |
- #!/bin/bash
- #set -e
- type="proteins"
- script_path=$(dirname `realpath $0`)
- database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/TF"
- declare -A software db_path
- software["CDS"]="diamond blastx"
- software["proteins"]="diamond blastp"
- outdir="09_TFs"
- while read line
- do
- db_path[${line}]=${database_dir}/fasta/${line}.fa.dmnd
- done < $database_dir/species.txt
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...] -a Annotation.xls"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-i: fasta file; >geneid"
- echo -e "\t-o: output dir;default:./09_TFs"
- echo -e "\t-t: proteins or CDS default:proteins"
- echo -e "\t-s: species [ animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...];more species see $database_dir/species.txt"
- echo -e "\t-a: annotation file,col1:geneid col7:genename col13 NR or Description"
- exit 1
- }
- while getopts 's:t:i:o:a:h' OPT;do
- case $OPT in
- h) usage;;
- s) species="$OPTARG";;
- t) type="$OPTARG";;
- i) fasta="$OPTARG";;
- o) outdir="$OPTARG";;
- a) anno="$OPTARG";;
- ?) usage;;
- esac
- done
- if [[ ! $species ]] || [[ ! $fasta ]] || [[ ! $anno ]] || [ $# == 0 ]
- then
- usage
- fi
- if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]]
- then
- echo "ERROR:unknown -t $type or -s $species"
- exit 1
- fi
- mkdir -p $outdir
- des_col=`head -1 $anno | tr "\t" "\n" | grep -nPi "^nr$|^description$" |cut -f1 -d ":"|head -1`
- if [ -z "$des_col" ]
- then
- echo "ERROR:no nr or description in annotation file"
- exit 1
- fi
- if [ $species != "plant" ] && [ $species != "animal" ]
- then
- known_tf_number=`awk 'BEGIN{FS=OFS="\t"}$1=="'$species'"{print}' ${database_dir}/all_TF_list.txt|cut -f3,4|sort -u |wc -l`
- tf_number=`awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$3])print $0}' $anno ${database_dir}/all_TF_list.txt|wc -l`
- rate=`awk 'BEGIN{print "'$tf_number'"/"'$known_tf_number'"}'`
- if [ $(echo "$rate >= 0.8" | $script_path/bc) -eq 1 ]
- then
- awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$3]=$4}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
- ${database_dir}/all_TF_list.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
- else
- echo "known rate is $rate < 0.8 ;$tf_number $known_tf_number"
- ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
- awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt
- awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
- $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
- fi
- else
- ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
- awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt
- awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
- $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
- fi
- sed 1d $outdir/TF.detail.xls | awk -F"\t" '{a[$3]++}END{for(i in a){print i"\t"a[i]}}'|sort -k2 -nr | sed '1iFamily\tCount' > $outdir/TF.family.count.xls
- Rscript $script_path/TF_Family.R $outdir/TF.family.count.xls $outdir/TF.count
|