#!/bin/bash # ensembl 每次更新都会增加;删除或 重命名一些物种 ;注释也会有变化 # 暂时下载的是 111 和 58 :/personalbio1/R_Report/Users/hudb/script/ensembl_download/all_species.txt # 如果要跟新;请运行:sh /personalbio1/R_Report/Users/hudb/script/download_genome/downloan_ensembl_species.sh script_path=$(dirname `realpath $0`) trap "kill 0;exit 1" 2 # 默认变量 outdir="." species_file=$script_path/ensembl/all_species.txt version="current" retry_time=3 usage() { cat << EOF Usage: $0 -n homo_sapiens -v 111 -o ./ -r 3 Description: Download ensembl genome;for all species to see $script_path/ensembl/all_species.txt Options: -h, --help Show this help message and exit. -v, --version ensembl genome version;default: current version -o, --outdir Output directory;default: ./ -n, --name species name -r, --retry curl and wget retry times;default: 3 EOF exit 1 } ARGS=`getopt -a -o hn:o:v:r: -l help,name:version:,outdir:,retry: -n $0 -- "$@"` [ $? != 0 ] && usage eval set -- "${ARGS}" while true do case "$1" in -h|--help) usage shift ;; -n|--name) name=$2 shift 2 ;; -v|--version) version=$2 shift 2 ;; -o|--outdir) outdir=$2 shift 2 ;; -r|--retry) retry_time=$2 shift 2 ;; --) shift break ;; esac done if [ ! $name ] then usage fi mkdir -p $outdir outdir=`realpath $outdir` if [ `cut -f2 $species_file |sed 1d|grep -Pc "^$name$" ` -eq 0 ] then echo "$name is not in $species_file;please run below cmd to download latest all_species.txt:" echo "sh $script_path/downloan_ensembl_species.sh" exit 1 fi #assembly=`awk -F"\t" '$2=="'$name'"{print $5}' $species_file` url=`awk -F"\t" '$2=="'$name'"{print $8}' $species_file` taxid=`awk -F"\t" '$2=="'$name'"{print $4}' $species_file` latin_name=`echo $name|sed 's/^_//'|sed 's/^\(.\)/\U&/' ` kindom=`awk -F"\t" '$2=="'$name'"{print $3}' $species_file` org=`grep -Pi "\t$latin_name\t" $script_path/kegg_org_species_taxid.txt |cut -f1|head -1` case $kindom in EnsemblFungi) kindom=fungi ;; EnsemblMetazoa) kindom=animal ;; EnsemblPlants) kindom=plant ;; EnsemblVertebrates) kindom=animal ;; esac if [ -z "$org" ] then org=$kindom fi if [ $version == "current" ] then case $url in *ftp.ensembl.org*) version=`curl -s --retry $retry_time https://ftp.ensembl.org/pub/current_README |grep "Ensembl Release" |sed 's/Ensembl Release //;s/ .*//'` ;; *ensemblgenomes*) version=`curl -s --retry $retry_time http://ftp.ensemblgenomes.org/pub/current_README | grep "current release" |sed 's/.* //'` ;; *) exit 1 ;; esac echo "current version is $version" fi outdir="$outdir/${name}_ensembl_${version}" mkdir -p $outdir assembly=`curl -s --retry $retry_time "$url/release-$version/fasta/$name/cds/" |grep "td> $outdir/id_table.tmp cat $outdir/gbk/*dat.gz > $outdir/all.gbk.gz # rm -f $outdir/gbk/*dat.gz perl $script_path/ensembl_gbk2tsv.pl $outdir/all.gbk.gz $outdir/gbk.tsv cut -f1,9 $outdir/gbk.tsv |sed 's/\"$//;s/\.[0-9]\+\t/\t/' > $outdir/NR.txt cut -f1,7 $outdir/gbk.tsv |sed 's/;$//;s/\.[0-9]\+\t/\t/' |grep "GO:" | \ perl -alne '@g=split(/;/,$F[1]);my %hash;@u=grep {!$hash{$_}++ } @g;print $F[0]."\t".join(";",@u)' > $outdir/GO.txt less $outdir/$latin_name.$assembly.pep.all.fa.gz | sed 's/^>.*gene:/>/;s/\.[0-9]\+.*//' > $outdir/pep.tmp perl $script_path/get_longseq.pl $outdir/pep.tmp > $outdir/pep.fa # 根据是否有GO;下载uniprot if [ `grep -c "GO:" $outdir/GO.txt` -gt 0 ] then go_file=GO.txt else uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt` uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt` if [ -n "$uniprot_id" ] then wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/ sleep 1 wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/ perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz $outdir/${uniprot_id}_${taxid}.tsv gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz uniprot_fasta=${uniprot_id}_${taxid}.fasta uniprot_tsv=${uniprot_id}_${taxid}.tsv fi fi if [ -s "$outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz" ] then less $outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz |cut -f1,4 |sort -u|sed '1igene_id\tentry_id' |\ awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1]){$4=a[$1]}else{$4="-"};print $0}' - $outdir/id_table.tmp > $outdir/id_table.txt else mv $outdir/id_table.tmp $outdir/id_table.txt fi if [ -s "$outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz" ] then less $outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz |cut -f1,4 | sed 1d |sort -u | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a) {print $_."\t".join(";",sort keys %{$a{$_}})} }' > $outdir/Swissprot.txt swissprot_file=Swissprot.txt fi # 基因组配置文件 cat >$outdir/genome_conf.ini<