#!/bin/bash # assembly_summary # https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt # https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt script_path=$(dirname `realpath $0`) refseq=$script_path/ncbi/assembly_summary_refseq.txt genbank=$script_path/ncbi/assembly_summary_genbank.txt outdir="." usage(){ echo -e "Usage:" echo -e "\t$0 -a [GCF.. GCA..] -o outdir " echo -e "assembly information: 245: $script_path/ncbi/assembly_summary*txt" echo -e "OPTIONS:" echo -e "\t-h: help information" echo -e "\t-a: assembly_accession" echo -e "\t-o: output dir" exit 1 } while getopts 'o:a:h' OPT;do case $OPT in h) usage;; a) assembly_accession="$OPTARG";; o) outdir="$OPTARG";; ?) usage;; esac done if [[ ! $assembly_accession ]] then usage fi mkdir -p $outdir outdir=`realpath $outdir` case $assembly_accession in GCF*) if [ `grep -Pc "^$assembly_accession\t" $refseq` -eq 1 ] then taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $refseq` organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $refseq|sed 's#(#_#g;s#)#_#g;s/ /_/g'` ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $refseq|sed 's#https://ftp.ncbi.nlm.nih.gov##;'` version_name=`echo $ftp_path |sed 's/.*\///'` else echo "unknown assembly_accession $assembly_accession" usage fi ;; GCA*) if [ `grep -Pc "^$assembly_accession\t" $genbank` -eq 1 ] then taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $genbank` organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $genbank|sed 's#(#_#g;s#)#_#g;s/ /_/g'` ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $genbank|sed 's#https://ftp.ncbi.nlm.nih.gov##;'` version_name=`echo $ftp_path |sed 's/.*\///'` else echo "unknown assembly_accession $assembly_accession" usage fi ;; *) echo "unknown assembly_accession $assembly_accession" usage ;; esac outdir=$outdir/${organism_name}_NCBI_$version_name mkdir -p $outdir echo "organism_name: $organism_name" echo "taxid: $taxid" echo "version_name: $version_name" echo "ftp_path: $ftp_path" echo "outdir: $outdir" cds_file=${version_name}_cds_from_genomic.fna.gz pep_file=${version_name}_protein.faa.gz gff_file=${version_name}_genomic.gff.gz gtf_file=${version_name}_genomic.gtf.gz dna_file=${version_name}_genomic.fna.gz feature_table=${version_name}_feature_table.txt.gz ontology_file=${version_name}_gene_ontology.gaf.gz md5=md5checksums.txt # 获取文件列表 curl -s --retry 3 https://ftp.ncbi.nlm.nih.gov$ftp_path/ |grep " $outdir/online_file_list.txt echo $cds_file $pep_file $gff_file $gtf_file $dna_file $feature_table $ontology_file |sed 's/ /\n/g' > $outdir/default_file_list.txt # ascp 下载 cd $outdir # 先下载 MD5 /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$md5 ./ # 下载文件 for file in `awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$1])print $1}' default_file_list.txt online_file_list.txt` do echo "downloading $file" /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$file ./ if [ $? -eq 0 ] then echo "downloading $file success" check_md5=`grep $file $md5 |md5sum -c|grep -c OK` if [ $check_md5 -eq 1 ] then echo "md5 check $file success" else echo "md5 check $file failed" fi else echo "downloading $file failed" fi done # gi if [ "$kindom" == "prokaryotes" ] then for i in `less $dna_file| grep \>| sed 's/>//;s/ .*//'` do gi=`curl -s --retry 3 "https://www.ncbi.nlm.nih.gov/nuccore/$i?report=gilist&log$=seqview&format=text"| sed -n '/
\([0-9]\+\)/p' |sed 's/
//'`
        echo -e "$gi\t$i"
    done > gi
fi

org=`awk -F"\t" '$2=="'$organism_name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/ncbi/taxid_division.txt `

# 处理下载后的文件
swissprot_file="none"
go_file="none"
uniprot_fasta="none"
uniprot_tsv="none"
less  $outdir/${version_name}_genomic.gtf.gz | grep -v "gene_id \"\"" | sed 's#transcript_id "unknown_transcript_1"; ##;s#transcript_id ""; ##' > $outdir/${version_name}_genomic.gtf
less -S $outdir/$pep_file | cut -f1 -d " " > $outdir/pep.tmp
less -S $outdir/${version_name}_genomic.gtf |perl -ne 'chomp;if(/\tCDS\t/){@a=$_=~/gene_id "([^"]+).*protein_id \"([^"]+)/;print join("\t",@a)."\n"}' | sort -u > $outdir/gid_pid
perl $script_path/protein2geneid.pl  $outdir/gid_pid $outdir/pep.tmp > $outdir/pep_geneid.fa
perl $script_path/get_longseq.pl  $outdir/pep_geneid.fa > $outdir/pep.fa
less -S $outdir/$pep_file|grep "^>" | sed 's/^>//' | awk '{print $1"\t"$0}' |\
awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{print $1,a[$2]}' - $outdir/gid_pid  > $outdir/NR.txt
rm -f $outdir/gid_pid $outdir/pep.tmp $outdir/pep_geneid.fa
perl $script_path/gtf2id_table.pl $outdir/${version_name}_genomic.gtf > $outdir/id_table.txt

# 下载uniprot
uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
if [ -n "$uniprot_id" ]
then
    wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
    sleep 1
    wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
    perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz > $outdir/${uniprot_id}_${taxid}.tsv
    gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
    awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$2] && $2 != "-")print a[$2],$1}' \
    $id_table $outdir/${uniprot_id}_${taxid}.tsv | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach(sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}'  > $outdir/Swissprot.txt
    if [ `cat $outdir/Swissprot.txt|wc -l` -gt 100 ]
    then
        swissprot_file=Swissprot.txt
    fi
fi




if [ -z "$org" ]
then
    org=$kindom
fi


if [ -s "$outdir/$ontology_file" ]
then
    less  $outdir/$ontology_file |sed -n '/GO_ID/,$p' | grep "GO:" | cut -f2,5 | \
    perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}' |\
    awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$4])print $1,a[$4]}' - $outdir/id_table.txt |sort -u > $outdir/GO.txt
    go_file=GO.txt
else
    less  $outdir/$gtf_file |grep "GO:"| perl -alne '($id)=$_=~/gene_id "([^"]+)/;@g=$_=~/(GO:\d{7})/g;print "$id\t".join(";",@g)' |sort -u > $outdir/GO.txt
    if [ `cat $outdir/GO.txt|wc -l` -gt 0 ]
    then
        go_file=GO.txt
    else
        awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$1])print a[$1],$2}' \
        $outdir/id_table.txt $script_path/ncbi/gene_go.tsv | awk -F"\t" '!a[$1]++{print}'  > $outdir/ncbi.go
        if [ `cat $outdir/ncbi.go|wc -l` -gt 0 ]
        then
            mv $outdir/ncbi.go $outdir/GO.txt
            go_file=GO.txt
        else
            if [ -n "$uniprot_id" ]
            then
                uniprot_fasta=${uniprot_id}_${taxid}.fasta
                uniprot_tsv=${uniprot_id}_${taxid}.tsv
            fi
        fi
    fi
fi

# 基因组配置文件
cat >genome_conf.ini<