123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279 |
- #!/bin/bash
- # ensembl 每次更新都会增加;删除或 重命名一些物种 ;注释也会有变化
- # 暂时下载的是 111 和 58 :/personalbio1/R_Report/Users/hudb/script/ensembl_download/all_species.txt
- # 如果要跟新;请运行:sh /personalbio1/R_Report/Users/hudb/script/download_genome/downloan_ensembl_species.sh
- script_path=$(dirname `realpath $0`)
- trap "kill 0;exit 1" 2
- # 默认变量
- outdir="."
- species_file=$script_path/ensembl/all_species.txt
- version="current"
- retry_time=3
- usage() {
- cat << EOF
- Usage: $0 -n homo_sapiens -v 111 -o ./ -r 3
- Description:
- Download ensembl genome;for all species to see $script_path/ensembl/all_species.txt
- Options:
- -h, --help Show this help message and exit.
- -v, --version ensembl genome version;default: current version
- -o, --outdir Output directory;default: ./
- -n, --name species name
- -r, --retry curl and wget retry times;default: 3
- EOF
- exit 1
- }
- ARGS=`getopt -a -o hn:o:v:r: -l help,name:version:,outdir:,retry: -n $0 -- "$@"`
- [ $? != 0 ] && usage
- eval set -- "${ARGS}"
- while true
- do
- case "$1" in
- -h|--help)
- usage
- shift
- ;;
- -n|--name)
- name=$2
- shift 2
- ;;
- -v|--version)
- version=$2
- shift 2
- ;;
- -o|--outdir)
- outdir=$2
- shift 2
- ;;
- -r|--retry)
- retry_time=$2
- shift 2
- ;;
- --)
- shift
- break
- ;;
- esac
- done
- if [ ! $name ]
- then
- usage
- fi
- mkdir -p $outdir
- outdir=`realpath $outdir`
- if [ `cut -f2 $species_file |sed 1d|grep -Pc "^$name$" ` -eq 0 ]
- then
- echo "$name is not in $species_file;please run below cmd to download latest all_species.txt:"
- echo "sh $script_path/downloan_ensembl_species.sh"
- exit 1
- fi
- #assembly=`awk -F"\t" '$2=="'$name'"{print $5}' $species_file`
- url=`awk -F"\t" '$2=="'$name'"{print $8}' $species_file`
- taxid=`awk -F"\t" '$2=="'$name'"{print $4}' $species_file`
- latin_name=`echo $name|sed 's/^_//'|sed 's/^\(.\)/\U&/' `
- kindom=`awk -F"\t" '$2=="'$name'"{print $3}' $species_file`
- org=`grep -Pi "\t$latin_name\t" $script_path/kegg_org_species_taxid.txt |cut -f1|head -1`
- case $kindom in
- EnsemblFungi)
- kindom=fungi
- ;;
- EnsemblMetazoa)
- kindom=animal
- ;;
- EnsemblPlants)
- kindom=plant
- ;;
- EnsemblVertebrates)
- kindom=animal
- ;;
- esac
- if [ -z "$org" ]
- then
- org=$kindom
- fi
- if [ $version == "current" ]
- then
- case $url in
- *ftp.ensembl.org*)
- version=`curl -s --retry $retry_time https://ftp.ensembl.org/pub/current_README |grep "Ensembl Release" |sed 's/Ensembl Release //;s/ .*//'`
- ;;
- *ensemblgenomes*)
- version=`curl -s --retry $retry_time http://ftp.ensemblgenomes.org/pub/current_README | grep "current release" |sed 's/.* //'`
- ;;
- *)
- exit 1
- ;;
- esac
- echo "current version is $version"
- fi
- outdir="$outdir/${name}_ensembl_${version}"
- mkdir -p $outdir
- assembly=`curl -s --retry $retry_time "$url/release-$version/fasta/$name/cds/" |grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name" |sed "s/$latin_name.//;s/.cds.all.fa.gz//"`
- cat <<eof
- species name: $name
- species taxid: $taxid
- species url: $url
- species version: $version
- species outdir: $outdir
- species latin name: $latin_name
- species kindom: $kindom
- species org: $org
- species assembly: $assembly
- eof
- dna_url="$url/release-$version/fasta/$name/dna/$latin_name.$assembly.dna.toplevel.fa.gz"
- pep_url="$url/release-$version/fasta/$name/pep/$latin_name.$assembly.pep.all.fa.gz"
- cds_url="$url/release-$version/fasta/$name/cds/$latin_name.$assembly.cds.all.fa.gz"
- gtf_url="$url/release-$version/gtf/$name/$latin_name.$assembly.$version.gtf.gz"
- gbk_url="$url/release-$version/genbank/$name/"
- tsv_url="$url/release-$version/tsv/$name/"
- # 下载gbk文件
- mkdir -p $outdir/{gbk,tsv}
- curl -s --retry $retry_time "$gbk_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"
- for g in `curl -s --retry $retry_time "$gbk_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"`
- do
- wget -c -t $retry_time -q "$gbk_url/$g" -P $outdir/gbk/
- if [ $? != 0 ]
- then
- echo "$gbk_url/$g failed"
- else
- echo "$gbk_url/$g success"
- fi
- sleep 1
- done
- # 下载tsv文件
- curl -s --retry $retry_time "$tsv_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"
- sleep 1
- for t in `curl -s --retry $retry_time "$tsv_url" | grep "td><a href=" |sed 's/.*href="//;s/".*//' | grep "$latin_name"`
- do
- wget -c -t $retry_time -q "$tsv_url/$t" -P $outdir/tsv/
- if [ $? != 0 ]
- then
- echo "wget $tsv_url/$t failed"
- else
- echo "wget $tsv_url/$t success"
- fi
- sleep 1
- done
- # 下载cds文件
- wget -c -t $retry_time -q "$cds_url" -P $outdir/
- if [ $? != 0 ]
- then
- echo "wget $cds_url failed"
- else
- echo "wget $cds_url success"
- fi
- sleep 1
- # 下载pep文件
- wget -c -t $retry_time -q "$pep_url" -P $outdir/
- if [ $? != 0 ]
- then
- echo "wget $pep_url failed"
- else
- echo "wget $pep_url success"
- fi
- sleep 1
- # 下载dna文件
- wget -c -t $retry_time -q "$dna_url" -P $outdir/
- if [ $? != 0 ]
- then
- echo "wget $dna_url failed"
- else
- echo "wget $dna_url success"
- fi
- sleep 1
- # 下载gtf文件
- wget -c -t $retry_time -q "$gtf_url" -P $outdir/
- if [ $? != 0 ]
- then
- echo "wget $gtf_url failed"
- else
- echo "wget $gtf_url success"
- fi
- swissprot_file="none"
- go_file="none"
- uniprot_fasta="none"
- uniprot_tsv="none"
- perl $script_path/gtf2id_table.pl $outdir/$latin_name.$assembly.$version.gtf.gz > $outdir/id_table.tmp
- cat $outdir/gbk/*dat.gz > $outdir/all.gbk.gz
- # rm -f $outdir/gbk/*dat.gz
- perl $script_path/ensembl_gbk2tsv.pl $outdir/all.gbk.gz $outdir/gbk.tsv
- cut -f1,9 $outdir/gbk.tsv |sed 's/\"$//;s/\.[0-9]\+\t/\t/' > $outdir/NR.txt
- cut -f1,7 $outdir/gbk.tsv |sed 's/;$//;s/\.[0-9]\+\t/\t/' |grep "GO:" | \
- perl -alne '@g=split(/;/,$F[1]);my %hash;@u=grep {!$hash{$_}++ } @g;print $F[0]."\t".join(";",@u)' > $outdir/GO.txt
- less $outdir/$latin_name.$assembly.pep.all.fa.gz | sed 's/^>.*gene:/>/;s/\.[0-9]\+.*//' > $outdir/pep.tmp
- perl $script_path/get_longseq.pl $outdir/pep.tmp > $outdir/pep.fa
- # 根据是否有GO;下载uniprot
- if [ `grep -c "GO:" $outdir/GO.txt` -gt 0 ]
- then
- go_file=GO.txt
- else
- uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
- uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
- if [ -n "$uniprot_id" ]
- then
- wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
- sleep 1
- wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
- perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz $outdir/${uniprot_id}_${taxid}.tsv
- gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
- uniprot_fasta=${uniprot_id}_${taxid}.fasta
- uniprot_tsv=${uniprot_id}_${taxid}.tsv
- fi
- fi
- if [ -s "$outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz" ]
- then
- less $outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz |cut -f1,4 |sort -u|sed '1igene_id\tentry_id' |\
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1]){$4=a[$1]}else{$4="-"};print $0}' - $outdir/id_table.tmp > $outdir/id_table.txt
- else
- mv $outdir/id_table.tmp $outdir/id_table.txt
- fi
- if [ -s "$outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz" ]
- then
- less $outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz |cut -f1,4 | sed 1d |sort -u | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a) {print $_."\t".join(";",sort keys %{$a{$_}})} }' > $outdir/Swissprot.txt
- swissprot_file=Swissprot.txt
- fi
- # 基因组配置文件
- cat >$outdir/genome_conf.ini<<eof
- latin_name=$latin_name
- taxid=$taxid
- url=$dna_url
- version=$version
- dna_fasta=$latin_name.$assembly.dna.toplevel.fa
- gtf=$latin_name.$assembly.$version.gtf
- fasta=pep.fa
- fasta_type=proteins
- id_table=id_table.txt
- kindom=$kindom
- org=$org
- go_file=$go_file
- kegg_file=none
- eggnog_file=none
- swissprot_file=$swissprot_file
- nr_file=NR.txt
- uniprot_tsv=$uniprot_tsv
- uniprot_fasta=$uniprot_fasta
- eof
|