hdb cd53d00105 test | 8 달 전 | |
---|---|---|
script | 8 달 전 | |
Readme.md | 8 달 전 |
gff3ToGenePred $gff genpred
genePredToGtf file genpred example.gtf
# 下面这一步会自动加上gene_biotype "protein_coding";请根据gff文件里是否有基因类型的信息;去处理
perl /Business/psn_company/t04/hudabang/bin/yaogu/tools/add_gene_type_to_gtf.pl -i example.gtf -o example.add_gene.gtf
# 注意这个里面是否有 gene_biotype 信息;没有的话需要对有cds的基因加上 gene_biotype "protein_coding"
sed '/\tgene\t/s/$/ gene_biotype "protein_coding";/' example.add_gene.gtf
source /PERSONALBIO/work/transcriptome/t00/software/biosoftware/miniconda3/bin/activate AGAT_gff
export PERL5LIB=/PERSONALBIO/work/transcriptome/t00/software/biosoftware/miniconda3/envs/pfam_scan/share/pfam_scan-1.6-4
# 修复gff 缺少基因或mRNA;cds没有对应的外显子
agat_convert_sp_gxf2gxf.pl -g example.gff -o example.fixed.gff3
# gff3 转gtf ;参考上面
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/main.sh -h
Usage: /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/main.sh [-h|--help] [-i|--indir <indir>] [-o|--outdir <outdir>] [-c|--config <config>] [-n|--cpu <cpu>] [-M|--memory <memory>] [-v|--version] [-a|--allgene]
Options:
-h, --help Show this help message and exit.
-i, --indir Input directory containing the files that in config file
-o, --outdir Output directory
-c, --config Config file for the pipeline.
-n, --cpu Number of CPUs in bsub;default: 6
-M, --memory Memory limit in bsub;default: 50G
-v, --version Show the version of the pipeline and exit.
-a, --allgene Include all genes, not just protein-coding genes.
#示例路径
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/example/mus_musculus_ensembl_111
#示例命令
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/main.sh -i /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/example/mus_musculus_ensembl_111 -o /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/example/mus_musculus_ensembl_111/genome -c /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/example/mus_musculus_ensembl_111/genome_conf.ini
#详细解释见 :/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/genome_conf.ini
latin_name=Mus_musculus 物种拉丁名
taxid=10090 物种taxid
url=https://ftp.ensembl.org/pub/release-111/fasta/mus_musculus/dna/ 物种数据库下载链接
version=111 数据库上对应物种版本
dna_fasta=Mus_musculus.GRCm39.dna.toplevel.fa 基因组序列
gtf=Mus_musculus.GRCm39.111.gtf 基因组结构注释gtf
fasta=pep.fa 蛋白序列或cds序列(序列title必须是gtf里的基因id)
fasta_type=proteins 序列类型:参数 proteins 或 CDS
id_table=id_table.txt gtf里的基因id locus_tag old_locus_tag entry_id 蛋白id 基因类型
kindom=animal 物种大类:参数: animal plant fungi
org=mmu 物种kegg三字符; 没有则写物种大类
go_file=GO.txt GO注释文件
kegg_file=none KEGG注释文件
eggnog_file=none eggnog注释文件
swissprot_file=Swissprot.txt swissprot注释文件
nr_file=NR.txt NR注释文件
uniprot_tsv=none uniprot 本物种的 GO注释
uniprot_fasta=none uniprot 本物种的 蛋白序列
# 2.1 RefDatabase_info文件:
Genome Homo_sapiens.GRCh38.dna.primary_assembly.fa
Path http://asia.ensembl.org/Homo_sapiens/Info/Index
Genebuild by Ensembl
Database version 104.38
Base Pairs 3,099,750,718
统计Base Pairs可用以下命令:
perl /Business/psn_company/t01/public/ScriptManager/lib/PrepRef/calculate_fasta_basepair/calculate_fasta_basepair.pl -i reference.fa
# 2.2 对DNA建立 hisat2 索引和fai索引
hisat2-build -p 12 reference.fa reference.fa
samtools faidx reference.fa
# 2.3 chr_list文件
chr_list 主要是用于画基因组圈图的,指定哪些染色体需要画到圈图中,一般为常染色体加性染色体,不包括没拼好的 scalfold,每行写一个染色体;若全是contig或scaffold, 则全部保留;从参考基因组序列文件中获取。
grep "^>" reference.fa | grep chromosome | cut -d " " -f1 | sed 's/^>//' > chr_list
# 2.4 生成 hrds 文件(SNP分析中用到)
该文件记录了物种和每个染色体长度。
# 参数 -org 是给物种拉丁名,下面以人为例:
perl /Business/psn_company/t01/public/ScriptManager/lib/PrepRef/create_hrds/create_hrds.pl -i reference.fa -o hrds.txt -org Homo_sapiens
# 2.5 cytoBand文件(用于基因组圈图绘制)
perl /Business/psn_company/t01/public/ScriptManager/lib/PrepRef/create_cytoband/create_cytoband.pl -i hrds.txt -c chr_list -o cytoband.txt
# 3.1 生成genome.ss文件 (hisat2 比对中用于寻找剪切位点的文件):
extract_splice_sites.py reference.gtf > genome.ss
# 3.2 生成 GTF.bed 文件 (bed 文件记录了基因的位置信息,主要用于 mapQC 的几个模块)
/Business/psn_company/t01/public/ScriptManager/lib/PrepRef/gtf2bed/gtf2bed.pl reference.gtf > GTF.bed
# 3.3 DEXSeq_gff (用于差异外显子分析)
python /Business/psn_company/t01/public/ScriptManager/lib/DE/exon_DEXSeq/dexseq_prepare_annotation.py reference.gtf DEXSeq.gff
# 3.4 lncRNA.gtf (此文件是 lncRNA 的结构注释文件,从GTF中提取,如果GTF中没有,下述命令会生成一个空文件)
perl /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/abstract_lncRNA_gtf.pl -i reference.gtf -o lncRNA.gtf
mkdir annoDB
gtfToGenePred -genePredExt reference.gtf annoDB/Anno_refGene.txt
perl /Business/psn_company/t01/public/software/annovar/retrieve_seq_from_fasta.pl --format refGene --seqfile reference.fa annoDB/Anno_refGene.txt --out annoDB/Anno_refGeneMrna.fa
# 优先级1:NCBI 和ensembl 等公共数据库上 下载
# 优先级2:uniprot上下载 对应和比对
wget -c -t 3 -q https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.fasta.gz
wget -c -t 3 -q https://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/reference_proteomes/Eukaryota/UP000005640/UP000005640_9606.dat.gz
perl /personalbio1/R_Report/Users/hudb/script/download_genome/uniprot_dat2tsv.pl UP000005640_9606.dat.gz > UP000005640_9606.tsv
## 对应:
UP000005640_9606.tsv里分别是Uniprot AC ;GeneID (NCBI Entryid); GO ID
## 比对
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/blast_uniprot.sh -i pep.fa -d UP000005640_9606.fasta -g UP000005640_9606.tsv
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/blast_uniprot.sh -i cds.fa -o outdir -t CDS -d uniprot.fa -g uniprot.go.tsv
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-t: proteins or CDS default:proteins
-d: uniprot species fasta
-g: uniprot species go annotation
# 优先级3:eggnog_map 注释
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/eggNOG_GO.sh -i pep.fa -s animal -o ./ -t proteins
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/eggNOG_GO.sh -i cds.fa -o outdir -t CDS -s animal
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-t: proteins or CDS default:proteins
-s: species [ animal plant fungi prokaryotes ]
# 优先级1:脚本内会判断是否匹配上已知数据库
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/KEGG.sh -i cds.fa -o outdir -t CDS -s animal -l idlist
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-l: idlist;col1-6 is gene_id locus_tag old_locus_tag entry_id protein_id gene_name
-t: proteins or CDS default:proteins
-s: kegg org [ animal plant fungi prokaryotes hsa mmu ... ]
-r: rate of Matched to known KO; default:0.5
# 优先级1:NCBI 和ensembl 等公共数据库上 下载
# 优先级2: eggnog_map 注释
同GO注释优先级3;会同时出eggnog 和GO注释结果
# 优先级1:NCBI 和ensembl 等公共数据库上 下载
# 优先级2:比对
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/Swissprot.sh -i cds.fa -o outdir -t CDS
OPTIONS:
-h: help information
-i: fasta file
-o: output dir;default:./
-t: proteins or CDS default:proteins
# 优先级1 NCBI 和ensembl 等公共数据库上 下载
# 优先级2 比对本物种
# 优先级3 比对大库
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/NR.sh -i cds.fa -o outdir -t CDS -s animal
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-t: proteins or CDS default:proteins
-s: species [ animal plant fungi prokaryotes virus taxid(9606 10096 ..) ]
# 优先级1:脚本内会判断是否匹配上已知数据库
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/NR.sh -i cds.fa -o outdir -t CDS -s animal
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-t: proteins or CDS default:proteins
-s: species [ animal plant fungi prokaryotes virus taxid(9606 10096 ..) ]
# 优先级1:脚本内会判断是否匹配上已知数据库
Usage:
/Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/NR.sh -i cds.fa -o outdir -t CDS -s animal
OPTIONS:
-h: help information
-i: fasta file
-o: output dir
-t: proteins or CDS default:proteins
-s: species [ animal plant fungi prokaryotes virus taxid(9606 10096 ..) ]
cd Annotation/tmp
perl /Business/psn_company/Work/Transcriptome/Datum/Public/Database/genome_prepare/script/merge_anno.v1.pl -g ../../GTF/Mus_musculus.GRCm39.111.gtf GO KEGG eggNOG eggNOG_Category Swissprot NR SwissprotName Entrez_geneID BP CC MF Pathway -o ../Annotation.xls
# 统计:
perl /Business/psn_company/t01/public/ScriptManager/lib/PrepRef/stat_Annotation/stat_Annotation.pl -i Annotation.xls -o Annotation.stat
如果有文件下载失败;则重新运行下载脚本;会跳过已下载的文件
/personalbio1/R_Report/Users/hudb/script/download_genome/ncbi.sh
Usage:
/personalbio1/R_Report/Users/hudb/script/download_genome/ncbi.sh -a [GCF.. GCA..] -o outdir
assembly information: 245: /personalbio1/R_Report/Users/hudb/script/download_genome/ncbi/assembly_summary*txt
OPTIONS:
-h: help information
-a: assembly_accession
-o: output dir
。。。
说明:ensembl 数据均使用wget下载;下载均加了-c参数;如果日志里有某些文件下载失败;则重新运行整个下载脚本
Usage: /personalbio1/R_Report/Users/hudb/script/download_genome/ensembl.sh -n homo_sapiens -v 111 -o ./ -r 3
Description:
Download ensembl genome;for all species to see /personalbio1/R_Report/Users/hudb/script/download_genome/ensembl/all_species.txt
Options:
-h, --help Show this help message and exit.
-v, --version ensembl genome version;default: current version
-o, --outdir Output directory;default: ./
-n, --name species name
-r, --retry curl and wget retry times;default: 3
。。。