ensembl.sh 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. #!/bin/bash
  2. # ensembl 每次更新都会增加;删除或 重命名一些物种 ;注释也会有变化
  3. # 暂时下载的是 111 和 58 :/personalbio1/R_Report/Users/hudb/script/ensembl_download/all_species.txt
  4. # 如果要跟新;请运行:sh /personalbio1/R_Report/Users/hudb/script/download_genome/downloan_ensembl_species.sh
  5. script_path=$(dirname `realpath $0`)
  6. trap "kill 0;exit 1" 2
  7. # 默认变量
  8. outdir="."
  9. species_file=$script_path/ensembl/all_species.txt
  10. version="current"
  11. retry_time=3
  12. usage() {
  13. cat << EOF
  14. Usage: $0 -n homo_sapiens -v 111 -o ./ -r 3
  15. Description:
  16. Download ensembl genome;for all species to see $script_path/ensembl/all_species.txt
  17. Options:
  18. -h, --help Show this help message and exit.
  19. -v, --version ensembl genome version;default: current version
  20. -o, --outdir Output directory;default: ./
  21. -n, --name species name
  22. -r, --retry curl and wget retry times;default: 3
  23. EOF
  24. exit 1
  25. }
  26. ARGS=`getopt -a -o hn:o:v:r: -l help,name:version:,outdir:,retry: -n $0 -- "$@"`
  27. [ $? != 0 ] && usage
  28. eval set -- "${ARGS}"
  29. while true
  30. do
  31. case "$1" in
  32. -h|--help)
  33. usage
  34. shift
  35. ;;
  36. -n|--name)
  37. name=$2
  38. shift 2
  39. ;;
  40. -v|--version)
  41. version=$2
  42. shift 2
  43. ;;
  44. -o|--outdir)
  45. outdir=$2
  46. shift 2
  47. ;;
  48. -r|--retry)
  49. retry_time=$2
  50. shift 2
  51. ;;
  52. --)
  53. shift
  54. break
  55. ;;
  56. esac
  57. done
  58. if [ ! $name ]
  59. then
  60. usage
  61. fi
  62. mkdir -p $outdir
  63. outdir=`realpath $outdir`
  64. if [ `cut -f2 $species_file |sed 1d|grep -Pc "^$name$" ` -eq 0 ]
  65. then
  66. echo "$name is not in $species_file;please run below cmd to download latest all_species.txt:"
  67. echo "sh $script_path/downloan_ensembl_species.sh"
  68. exit 1
  69. fi
  70. #assembly=`awk -F"\t" '$2=="'$name'"{print $5}' $species_file`
  71. url=`awk -F"\t" '$2=="'$name'"{print $8}' $species_file`
  72. taxid=`awk -F"\t" '$2=="'$name'"{print $4}' $species_file`
  73. latin_name=`echo $name|sed 's/^_//'|sed 's/^\(.\)/\U&/' `
  74. kindom=`awk -F"\t" '$2=="'$name'"{print $3}' $species_file`
  75. org=`grep -Pi "\t$latin_name\t" $script_path/kegg_org_species_taxid.txt |cut -f1|head -1`
  76. case $kindom in
  77. EnsemblFungi)
  78. kindom=fungi
  79. ;;
  80. EnsemblMetazoa)
  81. kindom=animal
  82. ;;
  83. EnsemblPlants)
  84. kindom=plant
  85. ;;
  86. EnsemblVertebrates)
  87. kindom=animal
  88. ;;
  89. esac
  90. if [ -z "$org" ]
  91. then
  92. org=$kindom
  93. fi
  94. if [ $version == "current" ]
  95. then
  96. case $url in
  97. *ftp.ensembl.org*)
  98. version=`curl -s --retry $retry_time https://ftp.ensembl.org/pub/current_README |grep "Ensembl Release" |sed 's/Ensembl Release //;s/ .*//'`
  99. ;;
  100. *ensemblgenomes*)
  101. version=`curl -s --retry $retry_time http://ftp.ensemblgenomes.org/pub/current_README | grep "current release" |sed 's/.* //'`
  102. ;;
  103. *)
  104. exit 1
  105. ;;
  106. esac
  107. echo "current version is $version"
  108. fi
  109. outdir="$outdir/${name}_ensembl_${version}"
  110. mkdir -p $outdir
  111. assembly=`curl -s --retry $retry_time "$url/release-$version/fasta/$name/cds/" |grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name" |sed "s/$latin_name.//;s/.cds.all.fa.gz//"`
  112. cat <<eof
  113. species name: $name
  114. species taxid: $taxid
  115. species url: $url
  116. species version: $version
  117. species outdir: $outdir
  118. species latin name: $latin_name
  119. species kindom: $kindom
  120. species org: $org
  121. species assembly: $assembly
  122. eof
  123. dna_url="$url/release-$version/fasta/$name/dna/$latin_name.$assembly.dna.toplevel.fa.gz"
  124. pep_url="$url/release-$version/fasta/$name/pep/$latin_name.$assembly.pep.all.fa.gz"
  125. cds_url="$url/release-$version/fasta/$name/cds/$latin_name.$assembly.cds.all.fa.gz"
  126. gtf_url="$url/release-$version/gtf/$name/$latin_name.$assembly.$version.gtf.gz"
  127. gbk_url="$url/release-$version/genbank/$name/"
  128. tsv_url="$url/release-$version/tsv/$name/"
  129. # 下载gbk文件
  130. mkdir -p $outdir/{gbk,tsv}
  131. curl -s --retry $retry_time "$gbk_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"
  132. for g in `curl -s --retry $retry_time "$gbk_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"`
  133. do
  134. wget -c -t $retry_time -q "$gbk_url/$g" -P $outdir/gbk/
  135. if [ $? != 0 ]
  136. then
  137. echo "$gbk_url/$g failed"
  138. else
  139. echo "$gbk_url/$g success"
  140. fi
  141. sleep 1
  142. done
  143. # 下载tsv文件
  144. curl -s --retry $retry_time "$tsv_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"
  145. sleep 1
  146. for t in `curl -s --retry $retry_time "$tsv_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"`
  147. do
  148. wget -c -t $retry_time -q "$tsv_url/$t" -P $outdir/tsv/
  149. if [ $? != 0 ]
  150. then
  151. echo "wget $tsv_url/$t failed"
  152. else
  153. echo "wget $tsv_url/$t success"
  154. fi
  155. sleep 1
  156. done
  157. # 下载cds文件
  158. wget -c -t $retry_time -q "$cds_url" -P $outdir/
  159. if [ $? != 0 ]
  160. then
  161. echo "wget $cds_url failed"
  162. else
  163. echo "wget $cds_url success"
  164. fi
  165. sleep 1
  166. # 下载pep文件
  167. wget -c -t $retry_time -q "$pep_url" -P $outdir/
  168. if [ $? != 0 ]
  169. then
  170. echo "wget $pep_url failed"
  171. else
  172. echo "wget $pep_url success"
  173. fi
  174. sleep 1
  175. # 下载dna文件
  176. wget -c -t $retry_time -q "$dna_url" -P $outdir/
  177. if [ $? != 0 ]
  178. then
  179. echo "wget $dna_url failed"
  180. else
  181. echo "wget $dna_url success"
  182. fi
  183. sleep 1
  184. # 下载gtf文件
  185. wget -c -t $retry_time -q "$gtf_url" -P $outdir/
  186. if [ $? != 0 ]
  187. then
  188. echo "wget $gtf_url failed"
  189. else
  190. echo "wget $gtf_url success"
  191. fi
  192. swissprot_file="none"
  193. go_file="none"
  194. uniprot_fasta="none"
  195. uniprot_tsv="none"
  196. perl $script_path/gtf2id_table.pl $outdir/$latin_name.$assembly.$version.gtf.gz > $outdir/id_table.tmp
  197. cat $outdir/gbk/*dat.gz > $outdir/all.gbk.gz
  198. # rm -f $outdir/gbk/*dat.gz
  199. perl $script_path/ensembl_gbk2tsv.pl $outdir/all.gbk.gz $outdir/gbk.tsv
  200. cut -f1,9 $outdir/gbk.tsv |sed 's/\"$//;s/\.[0-9]\+\t/\t/' > $outdir/NR.txt
  201. cut -f1,7 $outdir/gbk.tsv |sed 's/;$//;s/\.[0-9]\+\t/\t/' |grep "GO:" | \
  202. perl -alne '@g=split(/;/,$F[1]);my %hash;@u=grep {!$hash{$_}++ } @g;print $F[0]."\t".join(";",@u)' > $outdir/GO.txt
  203. less $outdir/$latin_name.$assembly.pep.all.fa.gz | sed 's/^>.*gene:/>/;s/\.[0-9]\+.*//' > $outdir/pep.tmp
  204. perl $script_path/get_longseq.pl $outdir/pep.tmp > $outdir/pep.fa
  205. # 根据是否有GO;下载uniprot
  206. if [ `grep -c "GO:" $outdir/GO.txt` -gt 0 ]
  207. then
  208. go_file=GO.txt
  209. else
  210. uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
  211. uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
  212. if [ -n "$uniprot_id" ]
  213. then
  214. wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
  215. sleep 1
  216. wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
  217. perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz $outdir/${uniprot_id}_${taxid}.tsv
  218. gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
  219. uniprot_fasta=${uniprot_id}_${taxid}.fasta
  220. uniprot_tsv=${uniprot_id}_${taxid}.tsv
  221. fi
  222. fi
  223. if [ -s "$outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz" ]
  224. then
  225. less $outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz |cut -f1,4 |sort -u|sed '1igene_id\tentry_id' |\
  226. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1]){$4=a[$1]}else{$4="-"};print $0}' - $outdir/id_table.tmp > $outdir/id_table.txt
  227. else
  228. mv $outdir/id_table.tmp $outdir/id_table.txt
  229. fi
  230. if [ -s "$outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz" ]
  231. then
  232. less $outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz |cut -f1,4 | sed 1d |sort -u | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a) {print $_."\t".join(";",sort keys %{$a{$_}})} }' > $outdir/Swissprot.txt
  233. swissprot_file=Swissprot.txt
  234. fi
  235. # 基因组配置文件
  236. cat >$outdir/genome_conf.ini<<eof
  237. latin_name=$latin_name
  238. taxid=$taxid
  239. url=$dna_url
  240. version=$version
  241. dna_fasta=$latin_name.$assembly.dna.toplevel.fa
  242. gtf=$latin_name.$assembly.$version.gtf
  243. fasta=pep.fa
  244. fasta_type=proteins
  245. id_table=id_table.txt
  246. kindom=$kindom
  247. org=$org
  248. go_file=$go_file
  249. kegg_file=none
  250. eggnog_file=none
  251. swissprot_file=$swissprot_file
  252. nr_file=NR.txt
  253. uniprot_tsv=$uniprot_tsv
  254. uniprot_fasta=$uniprot_fasta
  255. eof