srWGS分析¶

硬盘挂载¶

可以磁盘信息表确定所需硬盘,将相关硬盘按照xxxx规范挂载到存储节点。

申请样本¶

这一步获取需要分析的样本软链接，详情见样本申请以 Ovis aries为例，save_path中产生了3625个样本

{
  "save_path": "/data01/idata/tmp/apply_sample/xzhang/Ovis_aries",
  "need_runs": [],
  "species": "Ovis aries",
  "tissue": [
    "all"
  ],
  "omics": "WGS",
  "depth": 5,
  "reads": 6000000,
  "custom_gs": 0,
  "sample_count": 0,
  "show_details": true,
  "worker": "storage_cu01_main"
}

Note

受当前存储方案限制，数据主要通过cu01或者cu06挂载到服务器，尽管在其他节点能够看到软链接，实际上只有在对应的挂载节点才能访问数据！

拷贝数据¶

除非数据就在挂载节点分析，否则就需要在挂载节点拷贝数据，让各节点拥有数据访问权。大多数情况我们会将数据拷贝到多瑙平台进行分析，为了降低存储成本(100TB数据大概40元)，建议分批次传输数据进行分析(数据传输--数据分析--分析结果返回--数据删除，批次颗粒度越低，存储成本越低)。

##cu01 将上述样本mv 500到该目录batch1
screen -S cp_cu01_duonao_xzhang
scp -P 18081 -r /data01/idata/tmp/apply_sample/xzhang/Ovis_aries/batch1  lxl@211.69.141.180:/share/home/lxl/08Users/yhfu/04Projects/006_idata_sheep/data/

多瑙平台的入口带宽为万兆(或者更高？)，所有除了从cu01拷贝数据，也可以通过cu06拷贝数据(二者使用的不同的网络)，这个时候需要通过隧道将数据通过cu01→cu06→多瑙平台进行传输，具体参考SSH隧道

#构建隧道(注意端口占用)
screen -S ssh_tunnel_for_cu06_xzhang
ssh -L 12366:211.69.141.180:18081 idata@cu06
#传输数据 batch2
screen -S cp_cu01_cu06_duonao_xzhang
scp -P 12366 -r /data01/idata/tmp/apply_sample/xzhang/Ovis_aries/batch2 lxl@localhost:/share/home/lxl/08Users/yhfu/04Projects/006_idata_sheep/data/

数据分析¶

环境检查¶

#检查sentieon证书
sentieon licclnt ping -s cli-ARM-01:9001
#检查API接口 (cli-ARM-01 已提前加入 allowhost)
curl cli-ARM-01:8001

配置srWGS.yaml¶

#需要导入的模块
required_rules:
  - QC_fastp
  - mapping_DNAseq
  - calling_DNAseq
  # - joint_GVCFtyper
  # - VariantFiltration_gatk_SNP
  # - VariantFiltration_gatk_INDEL
  # - VariantFiltration_bcftools_SNP
  # - VariantFiltration_bcftools_INDEL

#通配符配置(用于自动解析output,注意变量名称务必为通配符+s) ？这是注意是什么意思
wildcards:
  samples: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/data/*
  # samples: ./test/samples/*

#共用配置
global_params:
  description: "cfa_yhfu_20250104" #当前分析任务的备注 最好是物种缩写_用户名_日期
  # work_dir: ./test
  work_dir: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/analysis/work
  script_path: auto # auto为自动推断
  data_dir: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/data
  #基因组相关配置
  genomelable: "GRCm39_ensembl"
  index: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/GRCm39_ensembl/bwa_index/GRCm39_ensembl_bwa
  genome: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/GRCm39_ensembl/genome/Mus_musculus.GRCm39.dna_sm.toplevel.fa
  genome_dict: /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/GRCm39_ensembl/genome/Mus_musculus.GRCm39.dna_sm.toplevel.fa.dict
  genome_size: 2728222451
  threads: 32

#rules特有的配置
rules:
  QC_fastp:
    input:
      sample:
        value: f"{data_dir}/{{sample}}"
        args: -r
    output:
      cleaned_gz:
        value: directory(f"{work_dir}/results/QC/{{sample}}")
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      threads:
        value: 8 if threads >8 else threads
        args: -t
      fast:
        value: f""
        args: -f
        ignore: True
      upload:
        value: f""
        args: -u
    threads:
      value: 8 if threads >8 else threads
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/common/QC_fastp.py
      cmd_args: auto #自动推断参数,并添加到cmd_template后面;如果不为auto,则会忽略cmd_template，直接使用cmd_args(即这里手动写完整的命令行参数)
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: mapping_DNAseq

  mapping_DNAseq:
    input:
      sample:
        value: f"{work_dir}/results/QC/{{sample}}"
        args: -r
    output:
      bam:
        value: directory(f"{work_dir}/results/mapping/{{sample}}")
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      index:
        value: index
        args: -i
      genome:
        value: genome
        args: -g
      genome_size:
        value: genome_size
        args: -s
      genomelable:
        value: genomelable
        args: -l
      threads:
        value: threads
        args: -t
      upload:
        value: f""
        args: -u
    threads:
      value: threads
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/srWGS/mapping_DNAseq.py
      cmd_args: auto #自动推断参数并添加到cmd_template后面，如果不为auto则会忽略cmd_template，直接使用cmd_args
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: calling_DNAseq

  calling_DNAseq:
    input:
      bam:
        value: f"{work_dir}/results/mapping/{{sample}}"
        args: -b
    output:
      gvcf:
        value: directory(f"{work_dir}/results/calling/{{sample}}")
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      index:
        value: index
        args: -i
      genome:
        value: genome
        args: -g
      genomelable:
        value: genomelable
        args: -l
      threads:
        value: threads
        args: -t
      upload:
        value: f""
        args: -u
    threads:
      value: threads
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/srWGS/calling_DNAseq.py
      cmd_args: auto #自动推断参数并添加到cmd_template后面，如果不为auto则会忽略cmd_template，直接使用cmd_args
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: joint_GVCFtyper

  joint_GVCFtyper:
    input:
      gvcf:
        value: expand(f"{work_dir}/results/calling/{{sample}}", sample=SAMPLES)
        args: -i
    output:
      vcf:
        value: f"{work_dir}/results/joint/joint.raw.vcf.gz"
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      genome:
        value: genome
        args: -g
      genomelable:
        value: genomelable
        args: -l
      threads:
        value: threads
        args: -t
      upload:
        value: f""
        args: -u
    threads:
      value: threads
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/srWGS/joint_GVCFtyper.py
      cmd_args: auto #自动推断参数并添加到cmd_template后面，如果不为auto则会忽略cmd_template，直接使用cmd_args
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: VariantFiltration_gatk_SNP,VariantFiltration_gatk_INDEL

  VariantFiltration_gatk_SNP:
    input:
      gvcf:
        value: f"{work_dir}/results/joint/joint.raw.vcf.gz"
        args: -i
    output:
      vcf:
        value: f"{work_dir}/results/filtration/SNP/snp.filter.vcf"
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      variant_type:
        value: f"SNP"
        args: -s
      genome:
        value: genome
        args: -g
      threads:
        value: "1" #改配置文件禁止使用数字，在生成smk时会自动格式化为数值
        args: -t
    threads:
      value: "1"
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/srWGS/VariantFiltration_gatk.py
      cmd_args: auto #自动推断参数并添加到cmd_template后面，如果不为auto则会忽略cmd_template，直接使用cmd_args
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: None

  VariantFiltration_gatk_INDEL:
    input:
      gvcf:
        value: f"{work_dir}/results/joint/joint.raw.vcf.gz"
        args: -i
    output:
      vcf:
        value: f"{work_dir}/results/filtration/INDEL/indel.filter.vcf"
        ignore: True
    resources:
      mem_mb:
        value: threads*4096
    params:
      outdir:
        value: f"{work_dir}/results"
        args: -o #缺省或者为null代表无标识符,如果不需要该参数，请用ignore
      variant_type:
        value: f"INDEL"
        args: -s
      genome:
        value: genome
        args: -g
      threads:
        value: "1"
        args: -t
    threads:
      value: "1"
    shell: #该设计无法推断多条命令，需要自定义
      cmd_template: python3 {script_path}/srWGS/VariantFiltration_gatk.py
      cmd_args: auto #自动推断参数并添加到cmd_template后面，如果不为auto则会忽略cmd_template，直接使用cmd_args
      #"python3 {script_path}/srWGS/mapping_DNAseq.py -r {input.sample} -i {input.index} -g {params.genome} -t {threads} -o {params.outdir}"
    chain:
      next: None

提交任务¶

conda activate snakemake
cd /share/home/lxl/08Users/yhfu/04Projects/004_idata_mmu/analysis 
python3 /share/home/lxl/08Users/yhfu/07Test/IAnimal_Pipeline/pipeline_builder.py -c srWGS.yaml

#创建screen
screen -S srWGS_mmu_yhfu
cd ./work
conda activate snakemake
snakemake \
         --executor cluster-generic \
         --cluster-generic-submit-cmd "dsub -n {rule} -q default -R 'cpu={threads};mem={resources.mem_mb}' -o jobname_%J.out"\
         --keep-going --jobs 30

本文阅读量次
本站总访问量次

Authors: hliu (81.16%), Wind (18.84%)