12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- #!/bin/bash
- #set -e
- type="proteins"
- default_rate=0.8
- script_path=$(dirname `realpath $0`)
- database_dir="/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/PPI"
- outdir="PPI"
- declare -A software db_path
- software["CDS"]="diamond blastx"
- software["proteins"]="diamond blastp"
- while read line
- do
- db_path[${line}]=${database_dir}/sequences/${line}.protein.sequences.v12.0.fa.dmnd
- done < $database_dir/type_taxid.txt
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -i pep.fa -o outdir -t proteins -s [animal plant fungi prokaryotes taxonid(9606) ]"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-i: fasta file; >geneid"
- echo -e "\t-o: output dir;default:./PPI"
- echo -e "\t-t: proteins or CDS default:proteins"
- echo -e "\t-s: species [animal plant fungi prokaryotes taxonid(9606)];"
- echo -e "\t more options see $database_dir/type_taxid.txt"
- echo -e "\t-m: geneid protein_id mapping file"
- echo -e "\t-r: rate of Matched to known proteins; default:0.8"
- exit 1
- }
- while getopts 's:t:i:o:r:m:h' OPT;do
- case $OPT in
- h) usage;;
- s) species="$OPTARG";;
- t) type="$OPTARG";;
- i) fasta="$OPTARG";;
- o) outdir="$OPTARG";;
- m) map="$OPTARG";;
- r) default_rate="$OPTARG";;
- ?) usage;;
- esac
- done
- if [[ ! $species ]] || [[ ! $fasta ]] || [ ! $map ] ||[ $# == 0 ]
- then
- usage
- fi
- if [[ ! ${db_path[${species}]} ]] || [[ ! ${software[${type}]} ]]
- then
- echo "ERROR:unknown -t $type or -s $species"
- exit 1
- fi
- mkdir -p $outdir
- case $species in
- animal|plant|fungi|prokaryotes)
- echo "从$species数据库中搜索$species的蛋白"
- awk 'BEGIN{RS=">"}{a++;gsub("\n$","",$0);s[a]=">"$0}END{srand();for(i=1;i<=1000;i++){r=int(1+rand()*a);if(s[r])print s[r]}}' $fasta > $outdir/rand.fa
- ${software[${type}]} -q ${fasta} -d ${db_path[${species}]} -o $outdir/rand.blast -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
- taxid=`cut -f2 $outdir/rand.blast |sed 's/\..*//'|awk '{a[$1]++}END{for(i in a){print i"\t"a[i]}}' |sort -k2 -nr | head -1 |cut -f1`
- ;;
- [0-9]*)
- taxid=$species
- ;;
- esac
- known_ppi_number=`grep -c ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa`
- ppi_number=`grep ">" ${database_dir}/sequences/${taxid}.protein.sequences.v12.0.fa |\
- sed "s/>$taxid\.//"|awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$2])print $0}' - $map |sort -u|wc -l `
- rate=`awk 'BEGIN{print "'$ppi_number'"/"'$known_ppi_number'"}'`
- less ${database_dir}/links/${taxid}.protein.links.detailed.v12.0.txt.gz|sed "1d;s/$taxid\.//g"|\
- awk 'BEGIN{OFS="\t"}{print $1,$2,$10/1000,$3,$4,$5,$6,$7,$8,$9}' > $outdir/$taxid.links.txt
- if [ $(echo "$rate >= $default_rate" | $script_path/bc) -eq 1 ]
- then
- echo "该物种的PPI数据库中已存在该物种的PPI,不进行比对"
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$2]=$1}FNR!=NR{if(a[$1] && a[$2] && a[$1]!=a[$2])print a[$1],a[$2],$3,$4,$5,$6,$7,$8,$9,$10}' \
- $map $outdir/$taxid.links.txt |awk -f $script_path/uniq_ppi_pair.awk | \
- sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt
- else
- echo "该物种的PPI数据库中不存在该物种的PPI,选择物种taxid为:$taxid的PPI进行比对"
- ${software[${type}]} -q ${fasta} -d ${db_path[${taxid}]} -o $outdir/blast.$taxid.txt -e 0.00001 -f 6 -p 12 --more-sensitive --max-target-seqs 1
- sed "s/\t$taxid\./\t/" $outdir/blast.$taxid.txt |cut -f1,2 > $outdir/blast.geneid.proteinid.txt
- awk -f $script_path/pid2gid.awk $outdir/blast.geneid.proteinid.txt $outdir/$taxid.links.txt |\
- awk -f $script_path/uniq_ppi_pair.awk |\
- sed '1iNode1\tNode2\tScore\tneighborhood\tfusion\tcooccurence\tcoexpression\texperimental\tdatabase\ttextmining' > $outdir/PPI.txt
- fi
- rm $outdir/$taxid.links.txt
|