#!/bin/bash #set -e type="proteins" default_rate=0.8 script_path=$(dirname `realpath $0`) database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/PPI" outdir="PPI" declare -A software db_path software["CDS"]="diamond blastx" software["proteins"]="diamond blastp" while read line do db_path[${line}]=${database_dir}/sequences/${line}.protein.sequences.v12.0.fa.dmnd done < $database_dir/type_taxid.txt usage(){ echo -e "Usage:" echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant fungi prokaryotes taxonid(9606) ]" echo -e "OPTIONS:" echo -e "\t-h: help information" echo -e "\t-i: fasta file; >geneid" echo -e "\t-o: output dir;default:./PPI" echo -e "\t-t: proteins or CDS default:proteins" echo -e "\t-s: species [animal plant fungi prokaryotes taxonid(9606)];" echo -e "\t more options see $database_dir/type_taxid.txt" echo -e "\t-m: geneid protein_id mapping file" echo -e "\t-r: rate of Matched to known proteins; default:0.8" exit 1 } while getopts 's:t:i:o:r:m:h' OPT;do case $OPT in h) usage;; s) species="$OPTARG";; t) type="$OPTARG";; i) fasta="$OPTARG";; o) outdir="$OPTARG";; m) map="$OPTARG";; r) default_rate="$OPTARG";; ?) usage;; esac done if [[ ! $species ]] || [[ ! $fasta ]] || [ ! $map ] ||[ $# == 0 ] then usage fi if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]] then echo "ERROR:unknown -t $type or -s $species" exit 1 fi mkdir -p $outdir case $species in animal|plant|fungi|prokaryotes) echo "从$species数据库中搜索$species的蛋白" awk 'BEGIN{RS=">"}{a++;gsub("\n$","",$0);s[a]=">"$0}END{srand();for(i=1;i<=1000;i++){r=int(1+rand()*a);if(s[r])print s[r]}}' $fasta > $outdir/rand.fa ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/rand.blast -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1 taxid=`cut -f2 $outdir/rand.blast |sed 's/\..*//'|awk '{a[$1]++}END{for(i in a){print i"\t"a[i]}}' |sort -k2 -nr | head -1 |cut -f1` ;; [0-9]*) taxid=$species ;; esac known_ppi_number=`grep -c ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa` ppi_number=`grep ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa |\ sed "s/>$taxid\.//"|awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$2])print $0}' - $map |sort -u|wc -l ` rate=`awk 'BEGIN{print "'$ppi_number'"/"'$known_ppi_number'"}'` less ${database_dir}/links/${taxid}.protein.links.detailed.v12.0.txt.gz|sed "1d;s/$taxid\.//g"|\ awk 'BEGIN{OFS="\t"}{print $1,$2,$10/1000,$3,$4,$5,$6,$7,$8,$9}' > $outdir/$taxid.links.txt if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ] then echo "该物种的PPI数据库中已存在该物种的PPI,不进行比对" awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$1}FNR!=NR{if(a[$1] && a[$2] && a[$1]!=a[$2])print a[$1],a[$2],$3,$4,$5,$6,$7,$8,$9,$10}' \ $map $outdir/$taxid.links.txt |awk -f $script_path/uniq_ppi_pair.awk | \ sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt else echo "该物种的PPI数据库中不存在该物种的PPI,选择物种taxid为:$taxid的PPI进行比对" ${software[${type}]} -q ${fasta} -d ${db_path[${taxid}]} -o $outdir/blast.$taxid.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1 sed "s/\t$taxid\./\t/" $outdir/blast.$taxid.txt |cut -f1,2 > $outdir/blast.geneid.proteinid.txt awk -f $script_path/pid2gid.awk $outdir/blast.geneid.proteinid.txt $outdir/$taxid.links.txt |\ awk -f $script_path/uniq_ppi_pair.awk |\ sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt fi rm $outdir/$taxid.links.txt