ncbi.sh 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227
  1. #!/bin/bash
  2. # assembly_summary
  3. # https://ftp.ncbi.nlm.nih.gov/genomes/genbank/assembly_summary_genbank.txt
  4. # https://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt
  5. script_path=$(dirname `realpath $0`)
  6. refseq=$script_path/ncbi/assembly_summary_refseq.txt
  7. genbank=$script_path/ncbi/assembly_summary_genbank.txt
  8. outdir="."
  9. usage(){
  10. echo -e "Usage:"
  11. echo -e "\t$0 -a [GCF.. GCA..] -o outdir "
  12. echo -e "assembly information: 245: $script_path/ncbi/assembly_summary*txt"
  13. echo -e "OPTIONS:"
  14. echo -e "\t-h: help information"
  15. echo -e "\t-a: assembly_accession"
  16. echo -e "\t-o: output dir"
  17. exit 1
  18. }
  19. while getopts 'o:a:h' OPT;do
  20. case $OPT in
  21. h) usage;;
  22. a) assembly_accession="$OPTARG";;
  23. o) outdir="$OPTARG";;
  24. ?) usage;;
  25. esac
  26. done
  27. if [[ ! $assembly_accession ]]
  28. then
  29. usage
  30. fi
  31. mkdir -p $outdir
  32. outdir=`realpath $outdir`
  33. case $assembly_accession in
  34. GCF*)
  35. if [ `grep -Pc "^$assembly_accession\t" $refseq` -eq 1 ]
  36. then
  37. taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $refseq`
  38. organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $refseq|sed 's#(#_#g;s#)#_#g;s/ /_/g'`
  39. ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $refseq|sed 's#https://ftp.ncbi.nlm.nih.gov##;'`
  40. version_name=`echo $ftp_path |sed 's/.*\///'`
  41. else
  42. echo "unknown assembly_accession $assembly_accession"
  43. usage
  44. fi
  45. ;;
  46. GCA*)
  47. if [ `grep -Pc "^$assembly_accession\t" $genbank` -eq 1 ]
  48. then
  49. taxid=`awk -F"\t" '$1=="'$assembly_accession'"{print $6}' $genbank`
  50. organism_name=`awk -F"\t" '$1=="'$assembly_accession'"{print $8}' $genbank|sed 's#(#_#g;s#)#_#g;s/ /_/g'`
  51. ftp_path=`awk -F"\t" '$1=="'$assembly_accession'"{print $20}' $genbank|sed 's#https://ftp.ncbi.nlm.nih.gov##;'`
  52. version_name=`echo $ftp_path |sed 's/.*\///'`
  53. else
  54. echo "unknown assembly_accession $assembly_accession"
  55. usage
  56. fi
  57. ;;
  58. *)
  59. echo "unknown assembly_accession $assembly_accession"
  60. usage
  61. ;;
  62. esac
  63. outdir=$outdir/${organism_name}_NCBI_$version_name
  64. mkdir -p $outdir
  65. echo "organism_name: $organism_name"
  66. echo "taxid: $taxid"
  67. echo "version_name: $version_name"
  68. echo "ftp_path: $ftp_path"
  69. echo "outdir: $outdir"
  70. cds_file=${version_name}_cds_from_genomic.fna.gz
  71. pep_file=${version_name}_protein.faa.gz
  72. gff_file=${version_name}_genomic.gff.gz
  73. gtf_file=${version_name}_genomic.gtf.gz
  74. dna_file=${version_name}_genomic.fna.gz
  75. feature_table=${version_name}_feature_table.txt.gz
  76. ontology_file=${version_name}_gene_ontology.gaf.gz
  77. md5=md5checksums.txt
  78. # 获取文件列表
  79. curl -s --retry 3 https://ftp.ncbi.nlm.nih.gov$ftp_path/ |grep "<a href=" | cut -f2 -d "\"" |grep -v "/" > $outdir/online_file_list.txt
  80. echo $cds_file $pep_file $gff_file $gtf_file $dna_file $feature_table $ontology_file |sed 's/ /\n/g' > $outdir/default_file_list.txt
  81. # ascp 下载
  82. cd $outdir
  83. # 先下载 MD5
  84. /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$md5 ./
  85. # 下载文件
  86. for file in `awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=1}FNR!=NR{if(a[$1])print $1}' default_file_list.txt online_file_list.txt`
  87. do
  88. echo "downloading $file"
  89. /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/bin/ascp -i /personalbio1/R_Report/software/miniconda_jjl/envs/Aspera/etc/asperaweb_id_dsa.openssh -l 100M -k 1 -T anonftp@ftp.ncbi.nlm.nih.gov:$ftp_path/$file ./
  90. if [ $? -eq 0 ]
  91. then
  92. echo "downloading $file success"
  93. check_md5=`grep $file $md5 |md5sum -c|grep -c OK`
  94. if [ $check_md5 -eq 1 ]
  95. then
  96. echo "md5 check $file success"
  97. else
  98. echo "md5 check $file failed"
  99. fi
  100. else
  101. echo "downloading $file failed"
  102. fi
  103. done
  104. # gi
  105. if [ "$kindom" == "prokaryotes" ]
  106. then
  107. for i in `less $dna_file| grep \>| sed 's/>//;s/ .*//'`
  108. do
  109. gi=`curl -s --retry 3 "https://www.ncbi.nlm.nih.gov/nuccore/$i?report=gilist&log$=seqview&format=text"| sed -n '/<pre>\([0-9]\+\)/p' |sed 's/<pre>//'`
  110. echo -e "$gi\t$i"
  111. done > gi
  112. fi
  113. org=`awk -F"\t" '$2=="'$organism_name'"{print $1}' $script_path/kegg_org_species_taxid.txt `
  114. kindom=`awk -F"\t" '$1=="'$taxid'"{print $2}' $script_path/ncbi/taxid_division.txt `
  115. # 处理下载后的文件
  116. swissprot_file="none"
  117. go_file="none"
  118. uniprot_fasta="none"
  119. uniprot_tsv="none"
  120. less $outdir/${version_name}_genomic.gtf.gz | grep -v "gene_id \"\"" | sed 's#transcript_id "unknown_transcript_1"; ##;s#transcript_id ""; ##' > $outdir/${version_name}_genomic.gtf
  121. less -S $outdir/$pep_file | cut -f1 -d " " > $outdir/pep.tmp
  122. less -S $outdir/${version_name}_genomic.gtf |perl -ne 'chomp;if(/\tCDS\t/){@a=$_=~/gene_id "([^"]+).*protein_id \"([^"]+)/;print join("\t",@a)."\n"}' | sort -u > $outdir/gid_pid
  123. perl $script_path/protein2geneid.pl $outdir/gid_pid $outdir/pep.tmp > $outdir/pep_geneid.fa
  124. perl $script_path/get_longseq.pl $outdir/pep_geneid.fa > $outdir/pep.fa
  125. less -S $outdir/$pep_file|grep "^>" | sed 's/^>//' | awk '{print $1"\t"$0}' |\
  126. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{print $1,a[$2]}' - $outdir/gid_pid > $outdir/NR.txt
  127. rm -f $outdir/gid_pid $outdir/pep.tmp $outdir/pep_geneid.fa
  128. perl $script_path/gtf2id_table.pl $outdir/${version_name}_genomic.gtf > $outdir/id_table.txt
  129. # 下载uniprot
  130. uniprot_id=`awk -F"\t" '$2=="'$taxid'"{print $1}' $script_path/uniprot_id_taxid_url.txt`
  131. uniprot_url=`awk -F"\t" '$2=="'$taxid'"{print $3}' $script_path/uniprot_id_taxid_url.txt`
  132. if [ -n "$uniprot_id" ]
  133. then
  134. wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.dat.gz" -P $outdir/
  135. sleep 1
  136. wget -c -t 3 -q "$uniprot_url/$uniprot_id/${uniprot_id}_${taxid}.fasta.gz" -P $outdir/
  137. perl $script_path/uniprot_dat2tsv.pl $outdir/${uniprot_id}_${taxid}.dat.gz > $outdir/${uniprot_id}_${taxid}.tsv
  138. gunzip $outdir/${uniprot_id}_${taxid}.fasta.gz
  139. awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$2] && $2 != "-")print a[$2],$1}' \
  140. $id_table $outdir/${uniprot_id}_${taxid}.tsv | perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach(sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}' > $outdir/Swissprot.txt
  141. if [ `cat $outdir/Swissprot.txt|wc -l` -gt 100 ]
  142. then
  143. swissprot_file=Swissprot.txt
  144. fi
  145. fi
  146. if [ -z "$org" ]
  147. then
  148. org=$kindom
  149. fi
  150. if [ -s "$outdir/$ontology_file" ]
  151. then
  152. less $outdir/$ontology_file |sed -n '/GO_ID/,$p' | grep "GO:" | cut -f2,5 | \
  153. perl -alne '$a{$F[0]}{$F[1]}=1;END{foreach (sort keys %a){print $_."\t".join(";",sort keys %{$a{$_}})}}' |\
  154. awk 'BEGIN{FS=OFS="\t"}FNR==NR{a[$1]=$2}FNR!=NR{if(a[$4])print $1,a[$4]}' - $outdir/id_table.txt |sort -u > $outdir/GO.txt
  155. go_file=GO.txt
  156. else
  157. less $outdir/$gtf_file |grep "GO:"| perl -alne '($id)=$_=~/gene_id "([^"]+)/;@g=$_=~/(GO:\d{7})/g;print "$id\t".join(";",@g)' |sort -u > $outdir/GO.txt
  158. if [ `cat $outdir/GO.txt|wc -l` -gt 0 ]
  159. then
  160. go_file=GO.txt
  161. else
  162. awk 'BEGIN{FS=OFS="\t"}FNR==NR{for(i=1;i<=6;i++){a[$i]=$1}}FNR!=NR{if(a[$1])print a[$1],$2}' \
  163. $outdir/id_table.txt $script_path/ncbi/gene_go.tsv | awk -F"\t" '!a[$1]++{print}' > $outdir/ncbi.go
  164. if [ `cat $outdir/ncbi.go|wc -l` -gt 0 ]
  165. then
  166. mv $outdir/ncbi.go $outdir/GO.txt
  167. go_file=GO.txt
  168. else
  169. if [ -n "$uniprot_id" ]
  170. then
  171. uniprot_fasta=${uniprot_id}_${taxid}.fasta
  172. uniprot_tsv=${uniprot_id}_${taxid}.tsv
  173. fi
  174. fi
  175. fi
  176. fi
  177. # 基因组配置文件
  178. cat >genome_conf.ini<<eof
  179. latin_name=$organism_name
  180. taxid=$taxid
  181. url=https://ftp.ncbi.nlm.nih.gov$ftp_path
  182. version=${version_name}
  183. dna_fasta=${version_name}_genomic.fna
  184. gtf=${version_name}_genomic.gtf
  185. fasta=pep.fa
  186. fasta_type=proteins
  187. id_table=id_table.txt
  188. kindom=$kindom
  189. org=$org
  190. go_file=$go_file
  191. kegg_file=none
  192. eggnog_file=none
  193. swissprot_file=$swissprot_file
  194. nr_file=NR.txt
  195. uniprot_tsv=$uniprot_tsv
  196. uniprot_fasta=$uniprot_fasta
  197. eof
  198. cd -
  199. if [ -z "$kindom" ]
  200. then
  201. echo "物种 $latin_name ;taxid: $taxid 不在 动物 植物 真菌或细菌的分类;注意修改配置文件"
  202. fi