#!/bin/bash #set -e type="proteins" script_path=$(dirname `realpath $0`) database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/TF" declare -A software db_path software["CDS"]="diamond blastx" software["proteins"]="diamond blastp" outdir="09_TFs" while read line do db_path[${line}]=${database_dir}/fasta/${line}.fa.dmnd done < $database_dir/species.txt usage(){ echo -e "Usage:" echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...] -a Annotation.xls" echo -e "OPTIONS:" echo -e "\t-h: help information" echo -e "\t-i: fasta file; >geneid" echo -e "\t-o: output dir;default:./09_TFs" echo -e "\t-t: proteins or CDS default:proteins" echo -e "\t-s: species [ animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...];more species see $database_dir/species.txt" echo -e "\t-a: annotation file,col1:geneid col7:genename col13 NR or Description" exit 1 } while getopts 's:t:i:o:a:h' OPT;do case $OPT in h) usage;; s) species="$OPTARG";; t) type="$OPTARG";; i) fasta="$OPTARG";; o) outdir="$OPTARG";; a) anno="$OPTARG";; ?) usage;; esac done if [[ ! $species ]] || [[ ! $fasta ]] || [[ ! $anno ]] || [ $# == 0 ] then usage fi if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]] then echo "ERROR:unknown -t $type or -s $species" exit 1 fi mkdir -p $outdir des_col=`head -1 $anno | tr "\t" "\n" | grep -nPi "^nr$|^description$" |cut -f1 -d ":"|head -1` if [ -z "$des_col" ] then echo "ERROR:no nr or description in annotation file" exit 1 fi if [ $species != "plant" ] && [ $species != "animal" ] then known_tf_number=`awk 'BEGIN{FS=OFS="\t"}$1=="'$species'"{print}' ${database_dir}/all_TF_list.txt|cut -f3,4|sort -u |wc -l` tf_number=`awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$3])print $0}' $anno ${database_dir}/all_TF_list.txt|wc -l` rate=`awk 'BEGIN{print "'$tf_number'"/"'$known_tf_number'"}'` if [ $(echo "$rate >= 0.8" | $script_path/bc) -eq 1 ] then awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$3]=$4}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \ ${database_dir}/all_TF_list.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls else echo "known rate is $rate < 0.8 ;$tf_number $known_tf_number" ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5 awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\ awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \ $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls fi else ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5 awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\ awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \ $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls fi sed 1d $outdir/TF.detail.xls | awk -F"\t" '{a[$3]++}END{for(i in a){print i"\t"a[i]}}'|sort -k2 -nr | sed '1iFamily\tCount' > $outdir/TF.family.count.xls Rscript $script_path/TF_Family.R $outdir/TF.family.count.xls $outdir/TF.count