#!/bin/bash # set -ex # 脚本里需要耗cpu 内存的操作;均使用了bsub ;投递本流程 推荐 -n 2 -M 4G script_path=$(dirname `realpath $0`) trap "kill 0;exit 1" 2 # 默认变量 outdir="." # 默认Annotation 只包含 protein_coding 基因 allgene=0 # 路径 database_dir=/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare # 投递最大cpu和内存参数 cpu=6 memory=50G # 解析命令行参数 # getopt usage() { echo "Usage: $0 [-h|--help] [-i|--indir ] [-o|--outdir ] [-c|--config ] [-n|--cpu ] [-M|--memory ] [-v|--version] [-a|--allgene]" echo "Options:" echo " -h, --help Show this help message and exit." echo " -i, --indir Input directory containing the files that in config file" echo " -o, --outdir Output directory" echo " -c, --config Config file for the pipeline." echo " -n, --cpu Number of CPUs in bsub;default: 6" echo " -M, --memory Memory limit in bsub;default: 50G" echo " -v, --version Show the version of the pipeline and exit." echo " -a, --allgene Include all genes, not just protein-coding genes." exit 1 } ARGS=`getopt -a -o hi:o:c:n:M:va -l help,indir:,outdir:,config:,cpu:,memory:,version,allgene -n $0 -- "$@"` [ $? != 0 ] && usage eval set -- "${ARGS}" while true do case "$1" in -h|--help) usage shift ;; -i|--indir) indir=$2 shift 2 ;; -o|--outdir) outdir=$2 shift 2 ;; -c|--config) config_file=$2 shift 2 ;; -n|--cpu) cpu=$2 shift 2 ;; -M|--memory) memory=$2 shift 2 ;; -v|--version) echo "genome prepare version 1.0" echo "Author: Hu Dabang" echo "Email: hudb@personalbio.cn" echo "run $0 -h for help" exit 1 ;; -a|--allgene) allgene=1 shift ;; --) shift break ;; esac done if [ ! -d "$indir" ] || [ ! -s "$config_file" ] then echo "ERROR: The input directory or config file does not exist." usage fi indir=`realpath $indir` config_file=`realpath $config_file` source $config_file # check file for f in `grep -P "_file=|fasta=|gtf=|table=" $config_file|sed '/=none$/d'|cut -f1 -d "="` do [ ! -s "$indir/${!f}" ] && echo "ERROR: The input file $indir/${!f} does not exist." && exit 1 done mkdir -p $outdir/{DNA,GTF,Annotation,PPI,09_TFs,annoDB,LOG} outdir=`realpath $outdir` # check long chr perl $SCRIPTMANAGER_LIB/PrepRef/create_hrds/create_hrds.pl -i $indir/$dna_fasta -o $outdir/DNA/hrds.txt -org $latin_name long_chr=`cut -f1,2 $outdir/DNA/hrds.txt|sed 's/^>//;s#/len=##'|awk -F"\t" '$2>=536870912{print $1}'|tr "\n" ";"` if [ -n "$long_chr" ] then echo -e "WARNING: The following chromosomes have length larger than 536870912, \nPlease check the length of the following chromosomes:" echo "$long_chr" exit 1 fi mv $indir/$dna_fasta $outdir/DNA/ mv $indir/$gtf $outdir/GTF/ base=`perl $SCRIPTMANAGER_LIB/PrepRef/calculate_fasta_basepair/calculate_fasta_basepair.pl -i $outdir/DNA/$dna_fasta|cut -f2 -d ":"` cat>$outdir/DNA/RefDatabase_info<//;s#/len=##' | sort -k2 -nr |\ awk 'BEGIN{FS=OFS="\t"}{split($2,a,"");print $0,length(a)}' | \ awk 'BEGIN{FS=OFS="\t"}{a[$3]++;if(length(a)<=2)print $1,$2,length(a)}' > $outdir/DNA/chr.tmp contig_n=`grep -Pc -i "contig|scaffold" $outdir/DNA/chr.tmp` no_contig_n=`grep -Pvc -i "contig|scaffold" $outdir/DNA/chr.tmp` if [ $contig_n -eq 0 ] then if [ $kindom == "animal" ] then awk 'BEGIN{FS=OFS="\t"}$3<=2{print $1}' $outdir/DNA/chr.tmp |sort -V > $outdir/DNA/chr_list else awk 'BEGIN{FS=OFS="\t"}$3==1{print $1}' $outdir/DNA/chr.tmp |sort -V > $outdir/DNA/chr_list fi else head -24 $outdir/DNA/chr.tmp |cut -f1 > $outdir/DNA/chr_list fi perl $SCRIPTMANAGER_LIB/PrepRef/create_cytoband/create_cytoband.pl -i $outdir/DNA/hrds.txt -c $outdir/DNA/chr_list -o $outdir/DNA/cytoband.txt # GTF 相关文件 /Business/psn_company/t01/software/source_pkg/YZ_data_software/hisat2-2.1.0/extract_splice_sites.py $outdir/GTF/$gtf > $outdir/GTF/genome.ss perl $SCRIPTMANAGER_LIB/PrepRef/gtf2bed/gtf2bed.pl $outdir/GTF/$gtf > $outdir/GTF/GTF.bed python $SCRIPTMANAGER_LIB/DE/exon_DEXSeq/dexseq_prepare_annotation.py $outdir/GTF/$gtf $outdir/GTF/DEXSeq.gff # lncRNA 这个脚本要优化一下 (暂未处理ncbi里的转录本没有加上gene行) perl $script_path/abstract_lncRNA_gtf.pl -i $outdir/GTF/$gtf -o $outdir/GTF/lncRNA.gtf if [ ! -s "$outdir/GTF/lncRNA.gtf" ] then touch $outdir/GTF/lncRNA.gtf fi gtfToGenePred -genePredExt $outdir/GTF/$gtf $outdir/annoDB/Anno_refGene.txt perl /Business/psn_company/t01/public/software/annovar/retrieve_seq_from_fasta.pl --format refGene --seqfile \ $outdir/DNA/$dna_fasta $outdir/annoDB/Anno_refGene.txt --out $outdir/annoDB/Anno_refGeneMrna.fa # miRNA mature_org=`grep -P "\t$taxid$" $database_dir/miRBase/organisms.taxid.txt|cut -f1` if [ -s "$database_dir/miRBase/mature/${mature_org}_mature.fa" ] then cp $database_dir/miRBase/mature/${mature_org}_mature.fa $outdir/mature.fa else touch $outdir/mature.fa fi # DNA 索引 samtools faidx $outdir/DNA/$dna_fasta bsub -q psnpublic -o $outdir/LOG/hisat2.%J.o -e $outdir/LOG/hisat2.%J.e -n $cpu -M $memory -I hisat2-build $outdir/DNA/$dna_fasta $outdir/DNA/$dna_fasta touch $outdir/Annotation/annotation_summary.txt $outdir/Annotation/Annotation.xls $outdir/PPI/PPI.txt cat >$outdir/refCfg.ini< $outdir/PPI/gid_pid if [ `grep -Pc "^$taxid$" $database_dir/PPI/type_taxid.txt` -eq 1 ] then bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/ppi.%J.o -e $outdir/LOG/ppi.%J.e -I $script_path/PPI.sh -i $fasta -t $fasta_type -s $taxid -o $outdir/PPI/ -m $outdir/PPI/gid_pid & else bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/ppi.%J.o -e $outdir/LOG/ppi.%J.e -I $script_path/PPI.sh -i $fasta -t $fasta_type -s $kindom -o $outdir/PPI/ -m $outdir/PPI/gid_pid & fi # 注释 mkdir -p $outdir/Annotation/tmp if [ $allgene -eq 0 ] then gene_number=`sed "/^gene_id/d" $indir/$id_table|grep protein_coding|cut -f1 |sort -u|wc -l` else gene_number=`sed "/^gene_id/d" $indir/$id_table|cut -f1 |sort -u|wc -l` fi do_eggnog=0 # GO if [ "$go_file" != "none" ] then sed '/^Gene[ _]ID/d' $indir/$go_file > $outdir/Annotation/tmp/GO else awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$1])print a[$1],$2}'\ $indir/$id_table $database_dir/NCBI/gene_go.tsv | awk -F"\t" '!a[$1]++{print}' > $outdir/Annotation/tmp/ncbi.go go_n=`cat $outdir/Annotation/tmp/ncbi.go |wc -l` rate=`awk 'BEGIN{print "'$go_n'"/"'$gene_number'"}'` if [ $(echo "$rate >= 0.1" | $script_path/bc) -eq 1 ] then cp $outdir/Annotation/tmp/ncbi.go $outdir/Annotation/tmp/GO else do_eggnog=1 fi fi # eggnog if [ "$eggnog_file" != "none" ] then cut -f1,2 $indir/$eggnog_file |sed '/^Gene[ _]ID/d' > $outdir/Annotation/tmp/eggNOG cut -f1,3 $indir/$eggnog_file |sed '/^Gene[ _]ID/d' > $outdir/Annotation/tmp/eggNOG_Category else awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$1])print a[$1],$2,$3}'\ $indir/$id_table $database_dir/eggNOG/Ensembl_EggNOG.v5.0 |awk -F"\t" '!a[$1]++{print}' > $outdir/Annotation/tmp/ensembl.eggnog egg_n=`cat $outdir/Annotation/tmp/ensembl.eggnog|wc -l` rate=`awk 'BEGIN{print "'$egg_n'"/"'$gene_number'"}'` if [ `echo "$rate >= 0.1" | $script_path/bc ` -eq 1 ] then cut -f1,2 $outdir/Annotation/tmp/ensembl.eggnog |sed '/^Gene[ _]ID/d' > $outdir/Annotation/tmp/eggNOG cut -f1,3 $outdir/Annotation/tmp/ensembl.eggnog |sed '/^Gene[ _]ID/d' > $outdir/Annotation/tmp/eggNOG_Category else do_eggnog=1 fi fi if [ $do_eggnog -eq 1 ] then bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/eggnog_map.%J.o -e $outdir/LOG/eggnog_map.%J.e -I $script_path/eggNOG_GO.sh -i $fasta -t $fasta_type -s $kindom -o $outdir/Annotation/blast & fi # blastuniprot if [ -s "$indir/$uniprot_fasta" ] && [ -s "$indir/$uniprot_tsv" ] then $script_path/blast_uniprot.sh -i $fasta -t $fasta_type -d $indir/$uniprot_fasta -g $indir/$uniprot_tsv -o $outdir/Annotation/blast/ fi # KEGG if [ "$kegg_file" == "none" ] then bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/kegg.%J.o -e $outdir/LOG/kegg.%J.e -I $script_path/KEGG.sh -i $fasta -o $outdir/Annotation/blast -t $fasta_type -s $org -l $indir/$id_table & else sed '/^Gene[ _]ID/d' $indir/$kegg_file > $outdir/Annotation/tmp/KEGG fi # NR if [ "$nr_file" == "none" ] then if [ "$taxid" == "none" ] then bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/nr.%J.o -e $outdir/LOG/nr.%J.e -I $script_path/NR.sh -i $fasta -o $outdir/Annotation/blast -t $fasta_type -s $kindom & else bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/nr.%J.o -e $outdir/LOG/nr.%J.e -I $script_path/NR.sh -i $fasta -o $outdir/Annotation/blast -t $fasta_type -s $taxid & fi else sed '/^Gene[ _]ID/d' $indir/$nr_file > $outdir/Annotation/tmp/NR fi # Swissprot if [ "$swissprot_file" == "none" ] then bsub -q psnpublic -n $cpu -M $memory -o $outdir/LOG/swissprot.%J.o -e $outdir/LOG/swissprot.%J.e -I $script_path/Swissprot.sh -i $fasta -o $outdir/Annotation/blast -t $fasta_type & else sed '/^Gene[ _]ID/d' $indir/$swissprot_file > $outdir/Annotation/tmp/Swissprot fi wait # 拷贝到 $outdir/Annotation/tmp/ 下合并注释 for anno in GO KEGG eggNOG eggNOG_Category Swissprot NR do if [ ! -s "$outdir/Annotation/tmp/$anno" ] then if [ $anno == "GO" ] then if [ -f "$outdir/Annotation/blast/uniprot.go" ] then up_n=`cat $outdir/Annotation/blast/uniprot.go |wc -l` egg_go_n=`cat $outdir/Annotation/blast/GO.txt |wc -l` if [ $up_n -gt $egg_go_n ] then cp $outdir/Annotation/blast/uniprot.go $outdir/Annotation/tmp/$anno else cp $outdir/Annotation/blast/$anno.txt $outdir/Annotation/tmp/$anno fi else cp $outdir/Annotation/blast/$anno.txt $outdir/Annotation/tmp/$anno fi else cp $outdir/Annotation/blast/$anno.txt $outdir/Annotation/tmp/$anno fi fi done # Pathway if [ -s "$database_dir/KEGG/pathway/$org" ] then $script_path/Pathway.sh $outdir/Annotation/tmp/KEGG $org $outdir/Annotation/tmp/Pathway else $script_path/Pathway.sh $outdir/Annotation/tmp/KEGG $kindom $outdir/Annotation/tmp/Pathway fi # BP CC MF for i in BP CC MF do grep -P "\t$i$" $database_dir/GO/GO_description_total.txt |\ awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR&&/GO:/{des="";split($2,b,";");for(i in b){if(a[b[i]])des=des";"b[i]"//"a[b[i]]};print $1,des}' \ - $outdir/Annotation/tmp/GO | grep "GO:" |sed 's/\t;/\t/' > $outdir/Annotation/tmp/$i done # SwissprotName python3 $script_path/uniprot_info.py -s $outdir/Annotation/tmp/Swissprot -o $outdir/Annotation/tmp/ # term2gene /Business/psn_company/Work/Transcriptome/T01/software/miniconda3/envs/R4.2/bin/Rscript $script_path/term2gene.R $outdir/Annotation/tmp/GO $outdir/Annotation/term2gene.RData # 是否加entryid entry_id="" if [ `cut -f4 $indir/$id_table |grep -Pcv "\t-$" ` -gt 100 ] then cut -f1,4 $indir/$id_table|sed "/^gene_id/d" |sort -u > $outdir/Annotation/tmp/Entrez_geneID entry_id="Entrez_geneID" fi # merge Annotation if [ $allgene -eq 1 ] then allgene="-a" else allgene="" fi cd $outdir/Annotation/tmp/ perl $script_path/merge_anno.v1.pl -g $outdir/GTF/$gtf $allgene -o $outdir/Annotation/Annotation.xls GO KEGG eggNOG eggNOG_Category Swissprot NR SwissprotName $entry_id BP CC MF Pathway cd - # stat annotation perl $SCRIPTMANAGER_LIB//PrepRef/stat_Annotation/stat_Annotation.pl -i $outdir/Annotation/Annotation.xls -o $outdir/Annotation/annotation_summary.txt sed -i 's/Ensembl/ALL/' $outdir/Annotation//annotation_summary.txt sed -i 's/#/__/g' $outdir/Annotation/Annotation.xls # TF if [ "$kindom" == "fungi" ] then touch $outdir/09_TFs/TF.detail.xls else tf_species=`grep -P "\t$taxid$" $database_dir/TF/species.taxid.txt |cut -f1` if [ -n "$tf_species" ] then $script_path/TF.sh -i $fasta -o $outdir/09_TFs -t $fasta_type -s $tf_species -a $outdir/Annotation/Annotation.xls else $script_path/TF.sh -i $fasta -o $outdir/09_TFs -t $fasta_type -s $kindom -a $outdir/Annotation/Annotation.xls fi fi