PPI.sh 3.8 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. #!/bin/bash
  2. #set -e
  3. type="proteins"
  4. default_rate=0.8
  5. script_path=$(dirname `realpath $0`)
  6. database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/PPI"
  7. outdir="PPI"
  8. declare -A software db_path
  9. software["CDS"]="diamond blastx"
  10. software["proteins"]="diamond blastp"
  11. while read line
  12. do
  13. db_path[${line}]=${database_dir}/sequences/${line}.protein.sequences.v12.0.fa.dmnd
  14. done < $database_dir/type_taxid.txt
  15. usage(){
  16. echo -e "Usage:"
  17. echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant fungi prokaryotes taxonid(9606) ]"
  18. echo -e "OPTIONS:"
  19. echo -e "\t-h: help information"
  20. echo -e "\t-i: fasta file; >geneid"
  21. echo -e "\t-o: output dir;default:./PPI"
  22. echo -e "\t-t: proteins or CDS default:proteins"
  23. echo -e "\t-s: species [animal plant fungi prokaryotes taxonid(9606)];"
  24. echo -e "\t more options see $database_dir/type_taxid.txt"
  25. echo -e "\t-m: geneid protein_id mapping file"
  26. echo -e "\t-r: rate of Matched to known proteins; default:0.8"
  27. exit 1
  28. }
  29. while getopts 's:t:i:o:r:m:h' OPT;do
  30. case $OPT in
  31. h) usage;;
  32. s) species="$OPTARG";;
  33. t) type="$OPTARG";;
  34. i) fasta="$OPTARG";;
  35. o) outdir="$OPTARG";;
  36. m) map="$OPTARG";;
  37. r) default_rate="$OPTARG";;
  38. ?) usage;;
  39. esac
  40. done
  41. if [[ ! $species ]] || [[ ! $fasta ]] || [ ! $map ] ||[ $# == 0 ]
  42. then
  43. usage
  44. fi
  45. if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]]
  46. then
  47. echo "ERROR:unknown -t $type or -s $species"
  48. exit 1
  49. fi
  50. mkdir -p $outdir
  51. case $species in
  52. animal|plant|fungi|prokaryotes)
  53. echo "从$species数据库中搜索$species的蛋白"
  54. awk 'BEGIN{RS=">"}{a++;gsub("\n$","",$0);s[a]=">"$0}END{srand();for(i=1;i<=1000;i++){r=int(1+rand()*a);if(s[r])print s[r]}}' $fasta > $outdir/rand.fa
  55. ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/rand.blast -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
  56. taxid=`cut -f2 $outdir/rand.blast |sed 's/\..*//'|awk '{a[$1]++}END{for(i in a){print i"\t"a[i]}}' |sort -k2 -nr | head -1 |cut -f1`
  57. ;;
  58. [0-9]*)
  59. taxid=$species
  60. ;;
  61. esac
  62. known_ppi_number=`grep -c ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa`
  63. ppi_number=`grep ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa |\
  64. sed "s/>$taxid\.//"|awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$2])print $0}' - $map |sort -u|wc -l `
  65. rate=`awk 'BEGIN{print "'$ppi_number'"/"'$known_ppi_number'"}'`
  66. less ${database_dir}/links/${taxid}.protein.links.detailed.v12.0.txt.gz|sed "1d;s/$taxid\.//g"|\
  67. awk 'BEGIN{OFS="\t"}{print $1,$2,$10/1000,$3,$4,$5,$6,$7,$8,$9}' > $outdir/$taxid.links.txt
  68. if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ]
  69. then
  70. echo "该物种的PPI数据库中已存在该物种的PPI,不进行比对"
  71. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$1}FNR!=NR{if(a[$1] && a[$2] && a[$1]!=a[$2])print a[$1],a[$2],$3,$4,$5,$6,$7,$8,$9,$10}' \
  72. $map $outdir/$taxid.links.txt |awk -f $script_path/uniq_ppi_pair.awk | \
  73. sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt
  74. else
  75. echo "该物种的PPI数据库中不存在该物种的PPI,选择物种taxid为:$taxid的PPI进行比对"
  76. ${software[${type}]} -q ${fasta} -d ${db_path[${taxid}]} -o $outdir/blast.$taxid.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
  77. sed "s/\t$taxid\./\t/" $outdir/blast.$taxid.txt |cut -f1,2 > $outdir/blast.geneid.proteinid.txt
  78. awk -f $script_path/pid2gid.awk $outdir/blast.geneid.proteinid.txt $outdir/$taxid.links.txt |\
  79. awk -f $script_path/uniq_ppi_pair.awk |\
  80. sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt
  81. fi
  82. rm $outdir/$taxid.links.txt