#!/bin/bash
# ensembl 每次更新都会增加;删除或 重命名一些物种 ;注释也会有变化
# 暂时下载的是 111 和 58 :/personalbio1/R_Report/Users/hudb/script/ensembl_download/all_species.txt
# 如果要跟新;请运行:sh /personalbio1/R_Report/Users/hudb/script/download_genome/downloan_ensembl_species.sh
script_path=$(dirname `realpath $0`)
trap "kill 0;exit 1" 2
# 默认变量
outdir="."
species_file=$script_path/ensembl/all_species.txt
version="current"
retry_time=3
usage() {
cat << EOF
Usage: $0 -n homo_sapiens -v 111 -o ./ -r 3
Description:
Download ensembl genome;for all species to see $script_path/ensembl/all_species.txt
Options:
-h, --help Show this help message and exit.
-v, --version ensembl genome version;default: current version
-o, --outdir Output directory;default: ./
-n, --name species name
-r, --retry curl and wget retry times;default: 3
EOF
exit 1
}
ARGS=`getopt -a -o hn:o:v:r: -l help,name:version:,outdir:,retry: -n $0 -- "$@"`
[ $? != 0 ] && usage
eval set -- "${ARGS}"
while true
do
case "$1" in
-h|--help)
usage
shift
;;
-n|--name)
name=$2
shift 2
;;
-v|--version)
version=$2
shift 2
;;
-o|--outdir)
outdir=$2
shift 2
;;
-r|--retry)
retry_time=$2
shift 2
;;
--)
shift
break
;;
esac
done
if [ ! $name ]
then
usage
fi
mkdir -p $outdir
outdir=`realpath $outdir`
if [ `cut -f2 $species_file |sed 1d|grep -Pc "^$name$" ` -eq 0 ]
then
echo "$name is not in $species_file;please run below cmd to download latest all_species.txt:"
echo "sh $script_path/downloan_ensembl_species.sh"
exit 1
fi
#assembly=`awk -F"\t" '$2=="'$name'"{print $5}' $species_file`
url=`awk -F"\t" '$2=="'$name'"{print $8}' $species_file`
taxid=`awk -F"\t" '$2=="'$name'"{print $4}' $species_file`
latin_name=`echo $name|sed 's/^_//'|sed 's/^\(.\)/\U&/' `
kindom=`awk -F"\t" '$2=="'$name'"{print $3}' $species_file`
org=`grep -Pi "\t$latin_name\t" $script_path/kegg_org_species_taxid.txt |cut -f1|head -1`
case $kindom in
EnsemblFungi)
kindom=fungi
;;
EnsemblMetazoa)
kindom=animal
;;
EnsemblPlants)
kindom=plant
;;
EnsemblVertebrates)
kindom=animal
;;
esac
if [ -z "$org" ]
then
org=$kindom
fi
if [ $version == "current" ]
then
case $url in
*ftp.ensembl.org*)
version=`curl -s --retry $retry_time https://ftp.ensembl.org/pub/current_README |grep "Ensembl Release" |sed 's/Ensembl Release //;s/ .*//'`
;;
*ensemblgenomes*)
version=`curl -s --retry $retry_time http://ftp.ensemblgenomes.org/pub/current_README | grep "current release" |sed 's/.* //'`
;;
*)
exit 1
;;
esac
echo "current version is $version"
fi
outdir="$outdir/${name}_ensembl_${version}"
mkdir -p $outdir
assembly=`curl -s --retry $retry_time "$url/release-$version/fasta/$name/cds/" |grep "td> $outdir/id_table.tmp
cat $outdir/gbk/*dat.gz > $outdir/all.gbk.gz
# rm -f $outdir/gbk/*dat.gz
perl $script_path/ensembl_gbk2tsv.pl $outdir/all.gbk.gz $outdir/gbk.tsv
cut -f1,9 $outdir/gbk.tsv |sed 's/\"$//;s/\.[0-9]\+\t/\t/' > $outdir/NR.txt
cut -f1,7 $outdir/gbk.tsv |sed 's/;$//;s/\.[0-9]\+\t/\t/' |grep "GO:" | \
perl -alne '@g=split(/;/,$F[1]);my %hash;@u=grep {!$hash{$_}++ } @g;print $F[0]."\t".join(";",@u)' > $outdir/GO.txt
less $outdir/$latin_name.$assembly.pep.all.fa.gz | sed 's/^>.*gene:/>/;s/\.[0-9]\+.*//' > $outdir/pep.tmp
perl $script_path/get_longseq.pl $outdir/pep.tmp > $outdir/pep.fa
# 根据是否有GO;下载uniprot
if [ `grep -c "GO:" $outdir/GO.txt` -gt 0 ]
then
go_file=GO.txt
else
uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
if [ -n "$uniprot_id" ]
then
wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
sleep 1
wget -c -t $retry_time -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz $outdir/${uniprot_id}_${taxid}.tsv
gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
uniprot_fasta=${uniprot_id}_${taxid}.fasta
uniprot_tsv=${uniprot_id}_${taxid}.tsv
fi
fi
if [ -s "$outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz" ]
then
less $outdir/tsv/$latin_name.$assembly.$version.entrez.tsv.gz |cut -f1,4 |sort -u|sed '1igene_id\tentry_id' |\
awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$1]){$4=a[$1]}else{$4="-"};print $0}' - $outdir/id_table.tmp > $outdir/id_table.txt
else
mv $outdir/id_table.tmp $outdir/id_table.txt
fi
if [ -s "$outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz" ]
then
less $outdir/tsv/$latin_name.$assembly.$version.uniprot.tsv.gz |cut -f1,4 | sed 1d |sort -u | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a) {print $_."\t".join(";",sort keys %{$a{$_}})} }' > $outdir/Swissprot.txt
swissprot_file=Swissprot.txt
fi
# 基因组配置文件
cat >$outdir/genome_conf.ini<