hdb
/
genome_prepare


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
							#!/bin/bash
script_path=$(dirname `realpath $0`)
outdir="."
database_dir=/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare

usage() {
    cat <<EOF
Usage: $0 --dna genome.fa --gtf genome.gtf --latin_name  Raphanus_sativus  --taxid 3726 --outdir ./ 
    get taxid :echo Homo_sapiens |sed 's/_/ /g' |/Business/psn_company/t04/hudabang/bin/yaogu/bin/taxonkit name2taxid --data-dir /Business/psn_company/t04/.taxonkit/ 
Options:
    -h, --help            Show this help message and exit
    -o, --outdir          Output directory
    -f, --dna             Genome sequence file
    -g, --gtf             ensembl format GTF file
    -n, --name            latin name
    -t, --taxid           taxid
    
EOF
    exit 1
}

ARGS=`getopt -a -o ho:f:g:n:t: -l help,outdir:,dna:,gtf:,name:,taxid: -n $0 -- "$@"`

[ $? != 0 ] && usage
eval set -- "${ARGS}"
while true
do
    case "$1" in
        -h|--help)
            usage
            shift
            ;;
        -o|--outdir)
            outdir=$2
            shift 2
            ;;
        -f|--dna)
            dna=$2
            shift 2
            ;;
        -g|--gtf)
            gtf=$2
            shift 2
            ;;
        -n|--name)
            name=$2
            shift 2
            ;;
        -t|--taxid)
            taxid=$2
            shift 2
            ;; 
        --)
            shift
            break
            ;;
        esac
done

for i in dna gtf taxid name 
do
    if [ -z "${!i}" ]
    then
        echo "Error: unknown -$i "
        usage
    fi
done

gffread -g $dna -x $outdir/cds.tid.fa $gtf
perl -ne 'chomp;if(/\ttranscript\t/){@a=$_=~/gene_id "([^"]+).*transcript_id \"([^"]+)/;print join("\t",@a)."\n"}' $gtf | sort -u > $outdir/gid_tid
perl $script_path/protein2geneid.pl  $outdir/gid_tid $outdir/cds.tid.fa > $outdir/cds.gid.fa
perl $script_path/get_longseq.pl  $outdir/cds.gid.fa > $outdir/cds.fa
rm -f $outdir/cds.gid.fa $outdir/gid_tid $outdir/cds.tid.fa
perl $script_path/gtf2id_table.pl $gtf > $outdir/id_table.txt
org=`awk -F"\t" '$2=="'$name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/taxid_division.txt `

if [ -z "$kindom" ]
then
    echo "物种分类不在NCBI taxid；请手动填写配置文件的kindom 和org"
elif [ -z "$org" ]
then
    org=$kindom
fi

cat > $outdir/genome_conf.ini<<EOF
latin_name=$name
taxid=$taxid
url="-"
version="-"
dna_fasta=`basename $dna`
gtf=`basename $gtf`
fasta=cds.fa
fasta_type=CDS
id_table=id_table.txt
kindom=$kindom
org=$org
go_file=none
kegg_file=none
eggnog_file=none
swissprot_file=none
nr_file=none
uniprot_tsv=none
uniprot_fasta=none
EOF