create_conf.sh 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. #!/bin/bash
  2. script_path=$(dirname `realpath $0`)
  3. outdir="."
  4. database_dir=/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare
  5. usage() {
  6. cat <<EOF
  7. Usage: $0 --dna genome.fa --gtf genome.gtf --latin_name Raphanus_sativus --taxid 3726 --outdir ./
  8. get taxid :echo Homo_sapiens |sed 's/_/ /g' |/Business/psn_company/t04/hudabang/bin/yaogu/bin/taxonkit name2taxid --data-dir /Business/psn_company/t04/.taxonkit/
  9. Options:
  10. -h, --help Show this help message and exit
  11. -o, --outdir Output directory
  12. -f, --dna Genome sequence file
  13. -g, --gtf ensembl format GTF file
  14. -n, --name latin name
  15. -t, --taxid taxid
  16. EOF
  17. exit 1
  18. }
  19. ARGS=`getopt -a -o ho:f:g:n:t: -l help,outdir:,dna:,gtf:,name:,taxid: -n $0 -- "$@"`
  20. [ $? != 0 ] && usage
  21. eval set -- "${ARGS}"
  22. while true
  23. do
  24. case "$1" in
  25. -h|--help)
  26. usage
  27. shift
  28. ;;
  29. -o|--outdir)
  30. outdir=$2
  31. shift 2
  32. ;;
  33. -f|--dna)
  34. dna=$2
  35. shift 2
  36. ;;
  37. -g|--gtf)
  38. gtf=$2
  39. shift 2
  40. ;;
  41. -n|--name)
  42. name=$2
  43. shift 2
  44. ;;
  45. -t|--taxid)
  46. taxid=$2
  47. shift 2
  48. ;;
  49. --)
  50. shift
  51. break
  52. ;;
  53. esac
  54. done
  55. for i in dna gtf taxid name
  56. do
  57. if [ -z "${!i}" ]
  58. then
  59. echo "Error: unknown -$i "
  60. usage
  61. fi
  62. done
  63. gffread -g $dna -x $outdir/cds.tid.fa $gtf
  64. perl -ne 'chomp;if(/\ttranscript\t/){@a=$_=~/gene_id "([^"]+).*transcript_id \"([^"]+)/;print join("\t",@a)."\n"}' $gtf | sort -u > $outdir/gid_tid
  65. perl $script_path/protein2geneid.pl $outdir/gid_tid $outdir/cds.tid.fa > $outdir/cds.gid.fa
  66. perl $script_path/get_longseq.pl $outdir/cds.gid.fa > $outdir/cds.fa
  67. rm -f $outdir/cds.gid.fa $outdir/gid_tid $outdir/cds.tid.fa
  68. perl $script_path/gtf2id_table.pl $gtf > $outdir/id_table.txt
  69. org=`awk -F"\t" '$2=="'$name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
  70. kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/taxid_division.txt `
  71. if [ -z "$kindom" ]
  72. then
  73. echo "物种分类不在NCBI taxid;请手动填写配置文件的kindom 和org"
  74. elif [ -z "$org" ]
  75. then
  76. org=$kindom
  77. fi
  78. cat > $outdir/genome_conf.ini<<EOF
  79. latin_name=$name
  80. taxid=$taxid
  81. url="-"
  82. version="-"
  83. dna_fasta=`basename $dna`
  84. gtf=`basename $gtf`
  85. fasta=cds.fa
  86. fasta_type=CDS
  87. id_table=id_table.txt
  88. kindom=$kindom
  89. org=$org
  90. go_file=none
  91. kegg_file=none
  92. eggnog_file=none
  93. swissprot_file=none
  94. nr_file=none
  95. uniprot_tsv=none
  96. uniprot_fasta=none
  97. EOF