123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
- #!/bin/bash
- script_path=$(dirname `realpath $0`)
- outdir="."
- database_dir=/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare
- usage() {
- cat <<EOF
- Usage: $0 --dna genome.fa --gtf genome.gtf --latin_name Raphanus_sativus --taxid 3726 --outdir ./
- get taxid :echo Homo_sapiens |sed 's/_/ /g' |/Business/psn_company/t04/hudabang/bin/yaogu/bin/taxonkit name2taxid --data-dir /Business/psn_company/t04/.taxonkit/
- Options:
- -h, --help Show this help message and exit
- -o, --outdir Output directory
- -f, --dna Genome sequence file
- -g, --gtf ensembl format GTF file
- -n, --name latin name
- -t, --taxid taxid
-
- EOF
- exit 1
- }
- ARGS=`getopt -a -o ho:f:g:n:t: -l help,outdir:,dna:,gtf:,name:,taxid: -n $0 -- "$@"`
- [ $? != 0 ] && usage
- eval set -- "${ARGS}"
- while true
- do
- case "$1" in
- -h|--help)
- usage
- shift
- ;;
- -o|--outdir)
- outdir=$2
- shift 2
- ;;
- -f|--dna)
- dna=$2
- shift 2
- ;;
- -g|--gtf)
- gtf=$2
- shift 2
- ;;
- -n|--name)
- name=$2
- shift 2
- ;;
- -t|--taxid)
- taxid=$2
- shift 2
- ;;
- --)
- shift
- break
- ;;
- esac
- done
- for i in dna gtf taxid name
- do
- if [ -z "${!i}" ]
- then
- echo "Error: unknown -$i "
- usage
- fi
- done
- gffread -g $dna -x $outdir/cds.tid.fa $gtf
- perl -ne 'chomp;if(/\ttranscript\t/){@a=$_=~/gene_id "([^"]+).*transcript_id \"([^"]+)/;print join("\t",@a)."\n"}' $gtf | sort -u > $outdir/gid_tid
- perl $script_path/protein2geneid.pl $outdir/gid_tid $outdir/cds.tid.fa > $outdir/cds.gid.fa
- perl $script_path/get_longseq.pl $outdir/cds.gid.fa > $outdir/cds.fa
- rm -f $outdir/cds.gid.fa $outdir/gid_tid $outdir/cds.tid.fa
- perl $script_path/gtf2id_table.pl $gtf > $outdir/id_table.txt
- org=`awk -F"\t" '$2=="'$name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
- kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/taxid_division.txt `
- if [ -z "$kindom" ]
- then
- echo "物种分类不在NCBI taxid;请手动填写配置文件的kindom 和org"
- elif [ -z "$org" ]
- then
- org=$kindom
- fi
- cat > $outdir/genome_conf.ini<<EOF
- latin_name=$name
- taxid=$taxid
- url="-"
- version="-"
- dna_fasta=`basename $dna`
- gtf=`basename $gtf`
- fasta=cds.fa
- fasta_type=CDS
- id_table=id_table.txt
- kindom=$kindom
- org=$org
- go_file=none
- kegg_file=none
- eggnog_file=none
- swissprot_file=none
- nr_file=none
- uniprot_tsv=none
- uniprot_fasta=none
- EOF
|