TF.sh 3.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586
  1. #!/bin/bash
  2. #set -e
  3. type="proteins"
  4. script_path=$(dirname `realpath $0`)
  5. database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/TF"
  6. declare -A software db_path
  7. software["CDS"]="diamond blastx"
  8. software["proteins"]="diamond blastp"
  9. outdir="09_TFs"
  10. while read line
  11. do
  12. db_path[${line}]=${database_dir}/fasta/${line}.fa.dmnd
  13. done < $database_dir/species.txt
  14. usage(){
  15. echo -e "Usage:"
  16. echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...] -a Annotation.xls"
  17. echo -e "OPTIONS:"
  18. echo -e "\t-h: help information"
  19. echo -e "\t-i: fasta file; >geneid"
  20. echo -e "\t-o: output dir;default:./09_TFs"
  21. echo -e "\t-t: proteins or CDS default:proteins"
  22. echo -e "\t-s: species [ animal plant Homo_sapiens Oryza_sativa_subsp._indica Zea_mays ...];more species see $database_dir/species.txt"
  23. echo -e "\t-a: annotation file,col1:geneid col7:genename col13 NR or Description"
  24. exit 1
  25. }
  26. while getopts 's:t:i:o:a:h' OPT;do
  27. case $OPT in
  28. h) usage;;
  29. s) species="$OPTARG";;
  30. t) type="$OPTARG";;
  31. i) fasta="$OPTARG";;
  32. o) outdir="$OPTARG";;
  33. a) anno="$OPTARG";;
  34. ?) usage;;
  35. esac
  36. done
  37. if [[ ! $species ]] || [[ ! $fasta ]] || [[ ! $anno ]] || [ $# == 0 ]
  38. then
  39. usage
  40. fi
  41. if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]]
  42. then
  43. echo "ERROR:unknown -t $type or -s $species"
  44. exit 1
  45. fi
  46. mkdir -p $outdir
  47. des_col=`head -1 $anno | tr "\t" "\n" | grep -nPi "^nr$|^description$" |cut -f1 -d ":"|head -1`
  48. if [ -z "$des_col" ]
  49. then
  50. echo "ERROR:no nr or description in annotation file"
  51. exit 1
  52. fi
  53. if [ $species != "plant" ] && [ $species != "animal" ]
  54. then
  55. known_tf_number=`awk 'BEGIN{FS=OFS="\t"}$1=="'$species'"{print}' ${database_dir}/all_TF_list.txt|cut -f3,4|sort -u |wc -l`
  56. tf_number=`awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$3])print $0}' $anno ${database_dir}/all_TF_list.txt|wc -l`
  57. rate=`awk 'BEGIN{print "'$tf_number'"/"'$known_tf_number'"}'`
  58. if [ $(echo "$rate >= 0.8" | $script_path/bc) -eq 1 ]
  59. then
  60. awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$3]=$4}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
  61. ${database_dir}/all_TF_list.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
  62. else
  63. echo "known rate is $rate < 0.8 ;$tf_number $known_tf_number"
  64. ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
  65. awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\
  66. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt
  67. awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
  68. $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
  69. fi
  70. else
  71. ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/blast.tf.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 5
  72. awk 'BEGIN{FS=OFS="\t"}$3>50{if(!a[$1]){print $1,$2;a[$1]=1}}' $outdir/blast.tf.txt |\
  73. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$4}FNR!=NR{if(a[$2])print $1,a[$2]}' ${database_dir}/all_TF_list.txt - > $outdir/gene_id_TF.txt
  74. awk -v n="$des_col" 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1])print $1,$7,a[$1],$n}' \
  75. $outdir/gene_id_TF.txt $anno|sed '1iGeneID\tSymbol\tFamily\tDescription' > $outdir/TF.detail.xls
  76. fi
  77. sed 1d $outdir/TF.detail.xls | awk -F"\t" '{a[$3]++}END{for(i in a){print i"\t"a[i]}}'|sort -k2 -nr | sed '1iFamily\tCount' > $outdir/TF.family.count.xls
  78. Rscript $script_path/TF_Family.R $outdir/TF.family.count.xls $outdir/TF.count