123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227 |
- #!/bin/bash
- # assembly_summary
- # https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt
- # https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt
- script_path=$(dirname `realpath $0`)
- refseq=$script_path/ncbi/assembly_summary_refseq.txt
- genbank=$script_path/ncbi/assembly_summary_genbank.txt
- outdir="."
- usage(){
- echo -e "Usage:"
- echo -e "\t$0 -a [GCF.. GCA..] -o outdir "
- echo -e "assembly information: 245: $script_path/ncbi/assembly_summary*txt"
- echo -e "OPTIONS:"
- echo -e "\t-h: help information"
- echo -e "\t-a: assembly_accession"
- echo -e "\t-o: output dir"
- exit 1
- }
- while getopts 'o:a:h' OPT;do
- case $OPT in
- h) usage;;
- a) assembly_accession="$OPTARG";;
- o) outdir="$OPTARG";;
- ?) usage;;
- esac
- done
- if [[ ! $assembly_accession ]]
- then
- usage
- fi
- mkdir -p $outdir
- outdir=`realpath $outdir`
- case $assembly_accession in
- GCF*)
- if [ `grep -Pc "^$assembly_accession\t" $refseq` -eq 1 ]
- then
- taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $refseq`
- organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $refseq|sed 's#(#_#g;s#)#_#g;s/ /_/g'`
- ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $refseq|sed 's#https://ftp.ncbi.nlm.nih.gov##;'`
- version_name=`echo $ftp_path |sed 's/.*\///'`
- else
- echo "unknown assembly_accession $assembly_accession"
- usage
- fi
- ;;
- GCA*)
- if [ `grep -Pc "^$assembly_accession\t" $genbank` -eq 1 ]
- then
- taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $genbank`
- organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $genbank|sed 's#(#_#g;s#)#_#g;s/ /_/g'`
- ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $genbank|sed 's#https://ftp.ncbi.nlm.nih.gov##;'`
- version_name=`echo $ftp_path |sed 's/.*\///'`
- else
- echo "unknown assembly_accession $assembly_accession"
- usage
- fi
- ;;
- *)
- echo "unknown assembly_accession $assembly_accession"
- usage
- ;;
- esac
- outdir=$outdir/${organism_name}_NCBI_$version_name
- mkdir -p $outdir
- echo "organism_name: $organism_name"
- echo "taxid: $taxid"
- echo "version_name: $version_name"
- echo "ftp_path: $ftp_path"
- echo "outdir: $outdir"
- cds_file=${version_name}_cds_from_genomic.fna.gz
- pep_file=${version_name}_protein.faa.gz
- gff_file=${version_name}_genomic.gff.gz
- gtf_file=${version_name}_genomic.gtf.gz
- dna_file=${version_name}_genomic.fna.gz
- feature_table=${version_name}_feature_table.txt.gz
- ontology_file=${version_name}_gene_ontology.gaf.gz
- md5=md5checksums.txt
- # 获取文件列表
- curl -s --retry 3 https://ftp.ncbi.nlm.nih.gov$ftp_path/ |grep "<a href=" | cut -f2 -d "\"" |grep -v "/" > $outdir/online_file_list.txt
- echo $cds_file $pep_file $gff_file $gtf_file $dna_file $feature_table $ontology_file |sed 's/ /\n/g' > $outdir/default_file_list.txt
- # ascp 下载
- cd $outdir
- # 先下载 MD5
- /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$md5 ./
- # 下载文件
- for file in `awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$1])print $1}' default_file_list.txt online_file_list.txt`
- do
- echo "downloading $file"
- /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$file ./
- if [ $? -eq 0 ]
- then
- echo "downloading $file success"
- check_md5=`grep $file $md5 |md5sum -c|grep -c OK`
- if [ $check_md5 -eq 1 ]
- then
- echo "md5 check $file success"
- else
- echo "md5 check $file failed"
- fi
- else
- echo "downloading $file failed"
- fi
- done
- # gi
- if [ "$kindom" == "prokaryotes" ]
- then
- for i in `less $dna_file| grep \>| sed 's/>//;s/ .*//'`
- do
- gi=`curl -s --retry 3 "https://www.ncbi.nlm.nih.gov/nuccore/$i?report=gilist&log$=seqview&format=text"| sed -n '/<pre>\([0-9]\+\)/p' |sed 's/<pre>//'`
- echo -e "$gi\t$i"
- done > gi
- fi
- org=`awk -F"\t" '$2=="'$organism_name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
- kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/ncbi/taxid_division.txt `
- # 处理下载后的文件
- swissprot_file="none"
- go_file="none"
- uniprot_fasta="none"
- uniprot_tsv="none"
- less $outdir/${version_name}_genomic.gtf.gz | grep -v "gene_id \"\"" | sed 's#transcript_id "unknown_transcript_1"; ##;s#transcript_id ""; ##' > $outdir/${version_name}_genomic.gtf
- less -S $outdir/$pep_file | cut -f1 -d " " > $outdir/pep.tmp
- less -S $outdir/${version_name}_genomic.gtf |perl -ne 'chomp;if(/\tCDS\t/){@a=$_=~/gene_id "([^"]+).*protein_id \"([^"]+)/;print join("\t",@a)."\n"}' | sort -u > $outdir/gid_pid
- perl $script_path/protein2geneid.pl $outdir/gid_pid $outdir/pep.tmp > $outdir/pep_geneid.fa
- perl $script_path/get_longseq.pl $outdir/pep_geneid.fa > $outdir/pep.fa
- less -S $outdir/$pep_file|grep "^>" | sed 's/^>//' | awk '{print $1"\t"$0}' |\
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{print $1,a[$2]}' - $outdir/gid_pid > $outdir/NR.txt
- rm -f $outdir/gid_pid $outdir/pep.tmp $outdir/pep_geneid.fa
- perl $script_path/gtf2id_table.pl $outdir/${version_name}_genomic.gtf > $outdir/id_table.txt
- # 下载uniprot
- uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
- uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
- if [ -n "$uniprot_id" ]
- then
- wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
- sleep 1
- wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
- perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz > $outdir/${uniprot_id}_${taxid}.tsv
- gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$2] && $2 != "-")print a[$2],$1}' \
- $id_table $outdir/${uniprot_id}_${taxid}.tsv | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach(sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}' > $outdir/Swissprot.txt
- if [ `cat $outdir/Swissprot.txt|wc -l` -gt 100 ]
- then
- swissprot_file=Swissprot.txt
- fi
- fi
- if [ -z "$org" ]
- then
- org=$kindom
- fi
- if [ -s "$outdir/$ontology_file" ]
- then
- less $outdir/$ontology_file |sed -n '/GO_ID/,$p' | grep "GO:" | cut -f2,5 | \
- perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}' |\
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$4])print $1,a[$4]}' - $outdir/id_table.txt |sort -u > $outdir/GO.txt
- go_file=GO.txt
- else
- less $outdir/$gtf_file |grep "GO:"| perl -alne '($id)=$_=~/gene_id "([^"]+)/;@g=$_=~/(GO:\d{7})/g;print "$id\t".join(";",@g)' |sort -u > $outdir/GO.txt
- if [ `cat $outdir/GO.txt|wc -l` -gt 0 ]
- then
- go_file=GO.txt
- else
- awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$1])print a[$1],$2}' \
- $outdir/id_table.txt $script_path/ncbi/gene_go.tsv | awk -F"\t" '!a[$1]++{print}' > $outdir/ncbi.go
- if [ `cat $outdir/ncbi.go|wc -l` -gt 0 ]
- then
- mv $outdir/ncbi.go $outdir/GO.txt
- go_file=GO.txt
- else
- if [ -n "$uniprot_id" ]
- then
- uniprot_fasta=${uniprot_id}_${taxid}.fasta
- uniprot_tsv=${uniprot_id}_${taxid}.tsv
- fi
- fi
- fi
- fi
- # 基因组配置文件
- cat >genome_conf.ini<<eof
- latin_name=$organism_name
- taxid=$taxid
- url=https://ftp.ncbi.nlm.nih.gov$ftp_path
- version=${version_name}
- dna_fasta=${version_name}_genomic.fna
- gtf=${version_name}_genomic.gtf
- fasta=pep.fa
- fasta_type=proteins
- id_table=id_table.txt
- kindom=$kindom
- org=$org
- go_file=$go_file
- kegg_file=none
- eggnog_file=none
- swissprot_file=$swissprot_file
- nr_file=NR.txt
- uniprot_tsv=$uniprot_tsv
- uniprot_fasta=$uniprot_fasta
- eof
- cd -
- if [ -z "$kindom" ]
- then
- echo "物种 $latin_name ;taxid: $taxid 不在 动物 植物 真菌或细菌的分类;注意修改配置文件"
- fi
|