#!/bin/bash
#set -e
type="proteins"
default_rate=0.8
script_path=$(dirname `realpath $0`)
database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/PPI"
outdir="PPI"
declare -A software db_path
software["CDS"]="diamond blastx"
software["proteins"]="diamond blastp"
while read line
do
    db_path[${line}]=${database_dir}/sequences/${line}.protein.sequences.v12.0.fa.dmnd
done < $database_dir/type_taxid.txt

usage(){
    echo -e "Usage:"
    echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant fungi prokaryotes taxonid(9606) ]"
    echo -e "OPTIONS:"
	echo -e "\t-h: help information"
    echo -e "\t-i: fasta file; >geneid"
    echo -e "\t-o: output dir;default:./PPI"
    echo -e "\t-t: proteins or CDS default:proteins"
    echo -e "\t-s: species [animal plant fungi prokaryotes taxonid(9606)];"
    echo -e "\t    more options see $database_dir/type_taxid.txt"
    echo -e "\t-m: geneid protein_id mapping file"
    echo -e "\t-r: rate of Matched to known proteins; default:0.8"
    exit 1
}

while getopts 's:t:i:o:r:m:h' OPT;do
    case $OPT in
        h) usage;;
        s) species="$OPTARG";;
		t) type="$OPTARG";;
        i) fasta="$OPTARG";;
        o) outdir="$OPTARG";;
        m) map="$OPTARG";;
        r) default_rate="$OPTARG";;
        ?) usage;;
    esac
done

if [[ ! $species ]] || [[ ! $fasta ]] || [ ! $map ] ||[ $# == 0 ]
then
	usage
fi

if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]]
then
    echo "ERROR:unknown -t $type  or -s $species" 
    exit 1
fi

mkdir -p $outdir
case $species in
    animal|plant|fungi|prokaryotes)
        echo "从$species数据库中搜索$species的蛋白"
        awk 'BEGIN{RS=">"}{a++;gsub("\n$","",$0);s[a]=">"$0}END{srand();for(i=1;i<=1000;i++){r=int(1+rand()*a);if(s[r])print s[r]}}' $fasta > $outdir/rand.fa
        ${software[${type}]} -q ${fasta}  -d ${db_path[${species}]} -o $outdir/rand.blast  -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
        taxid=`cut -f2 $outdir/rand.blast |sed 's/\..*//'|awk '{a[$1]++}END{for(i in a){print i"\t"a[i]}}' |sort -k2 -nr | head -1 |cut -f1` 
    ;;
    [0-9]*)
        taxid=$species
    ;;
esac 

known_ppi_number=`grep -c ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa`
ppi_number=`grep ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa |\
sed "s/>$taxid\.//"|awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$2])print $0}' - $map |sort -u|wc -l `
rate=`awk 'BEGIN{print "'$ppi_number'"/"'$known_ppi_number'"}'`
less ${database_dir}/links/${taxid}.protein.links.detailed.v12.0.txt.gz|sed "1d;s/$taxid\.//g"|\
awk 'BEGIN{OFS="\t"}{print $1,$2,$10/1000,$3,$4,$5,$6,$7,$8,$9}' > $outdir/$taxid.links.txt
if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ]
then
    echo "该物种的PPI数据库中已存在该物种的PPI，不进行比对"
    awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$1}FNR!=NR{if(a[$1] && a[$2] && a[$1]!=a[$2])print a[$1],a[$2],$3,$4,$5,$6,$7,$8,$9,$10}' \
    $map $outdir/$taxid.links.txt |awk -f $script_path/uniq_ppi_pair.awk | \
    sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt
else
    echo "该物种的PPI数据库中不存在该物种的PPI，选择物种taxid为：$taxid的PPI进行比对"
    ${software[${type}]} -q ${fasta}  -d ${db_path[${taxid}]} -o $outdir/blast.$taxid.txt  -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
    sed "s/\t$taxid\./\t/" $outdir/blast.$taxid.txt |cut -f1,2 > $outdir/blast.geneid.proteinid.txt
    awk -f $script_path/pid2gid.awk $outdir/blast.geneid.proteinid.txt $outdir/$taxid.links.txt |\
    awk -f $script_path/uniq_ppi_pair.awk |\
    sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt  
fi
rm $outdir/$taxid.links.txt