Pipeline template template » History » Version 25
Bryan Cosca, 09/28/2015 06:39 PM
| 1 | 1 | Bryan Cosca | h1. Pipeline template template |
|---|---|---|---|
| 2 | |||
| 3 | h2. Run-Command Template |
||
| 4 | |||
| 5 | 4 | Bryan Cosca | <pre> |
| 6 | 7 | Bryan Cosca | "NAME":{ |
| 7 | 6 | Bryan Cosca | "script_version":"29009a1c1f8a9653042c5853832881aca4141cf2", |
| 8 | 4 | Bryan Cosca | "repository":"arvados", |
| 9 | "script":"run-command", |
||
| 10 | "script_parameters":{ |
||
| 11 | "input":{ |
||
| 12 | 7 | Bryan Cosca | "output_of":"OUTPUT_OF_PREVIOUS_JOB_NAME" |
| 13 | 4 | Bryan Cosca | }, |
| 14 | "reference":{ |
||
| 15 | "dataclass":"Collection", |
||
| 16 | "default":"3514b8e5da0e8d109946bc809b20a78a+5698", |
||
| 17 | "link_name":"human_g1k_v37 reference data", |
||
| 18 | 7 | Bryan Cosca | "title":"NAME Input Reference genome (FASTA)" |
| 19 | 4 | Bryan Cosca | }, |
| 20 | "command":[ |
||
| 21 | "java", |
||
| 22 | "-Xmx60g", |
||
| 23 | "-jar", |
||
| 24 | "$(dir $(gatk3))/GenomeAnalysisTK.jar", |
||
| 25 | "-T", |
||
| 26 | "PrintReads", |
||
| 27 | "-R", |
||
| 28 | "$(glob $(dir $(reference))/*.fasta)", |
||
| 29 | { |
||
| 30 | "foreach":"iterator", |
||
| 31 | "command":[ |
||
| 32 | "-I", |
||
| 33 | "$(iterator)" |
||
| 34 | ] |
||
| 35 | }, |
||
| 36 | "-BQSR", |
||
| 37 | "$(bqsr_table)", |
||
| 38 | "-nct", |
||
| 39 | "16", |
||
| 40 | "-o", |
||
| 41 | "$(outputname)" |
||
| 42 | ], |
||
| 43 | "outputname":{ |
||
| 44 | "value":{ |
||
| 45 | "list":"iterator", |
||
| 46 | "index":"0", |
||
| 47 | "command":"$(basename $(iterator)).bqsrCal.bam" |
||
| 48 | } |
||
| 49 | }, |
||
| 50 | "bqsr_table":{ |
||
| 51 | "value":{ |
||
| 52 | "list":"iterator", |
||
| 53 | "index":"0", |
||
| 54 | "command":"$(dir $(bqsr))/$(basename $(basename $(iterator))).recal_data.table" |
||
| 55 | } |
||
| 56 | }, |
||
| 57 | "input_dir":"$(dir $(input))", |
||
| 58 | "task.foreach":[ |
||
| 59 | "iterator" |
||
| 60 | ], |
||
| 61 | "iterator":{ |
||
| 62 | "value":{ |
||
| 63 | "group":"input_dir", |
||
| 64 | "regex":"(.*)\\.realigned.bam" |
||
| 65 | 1 | Bryan Cosca | } |
| 66 | 4 | Bryan Cosca | }, |
| 67 | "gatk3":{ |
||
| 68 | "dataclass":"Collection", |
||
| 69 | "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", |
||
| 70 | 1 | Bryan Cosca | "link_name":"Genome Analysis Toolkit 3.2-2", |
| 71 | 7 | Bryan Cosca | "title":"NAME Input Version of GATK3 jar" |
| 72 | 4 | Bryan Cosca | }, |
| 73 | "dbsnp":{ |
||
| 74 | "dataclass":"Collection", |
||
| 75 | "default":"8ac324bfa3dfff1ff81ed34b433869b1+6712", |
||
| 76 | "title":"Single Nucleotide Polymorphism database", |
||
| 77 | 7 | Bryan Cosca | "description":"NAME Input DBsnp" |
| 78 | 4 | Bryan Cosca | } |
| 79 | }, |
||
| 80 | "runtime_constraints":{ |
||
| 81 | "max_tasks_per_node":1, |
||
| 82 | "min_nodes":1, |
||
| 83 | "docker_image":"bcosc/arv-base-java", |
||
| 84 | 6 | Bryan Cosca | "arvados_sdk_version":"29009a1c1f8a9653042c5853832881aca4141cf2" |
| 85 | 4 | Bryan Cosca | }, |
| 86 | "output_name":false |
||
| 87 | 6 | Bryan Cosca | }, |
| 88 | 4 | Bryan Cosca | </pre> |
| 89 | 1 | Bryan Cosca | |
| 90 | 8 | Bryan Cosca | h2. Crunch Script Template Template |
| 91 | |||
| 92 | <pre> |
||
| 93 | 9 | Bryan Cosca | "NAME":{ |
| 94 | "script_version":"GIT_BRANCH_NAME", |
||
| 95 | "repository":"GIT_REPO_NAME", |
||
| 96 | "script":"GIT_SCRIPT_NAME", |
||
| 97 | 8 | Bryan Cosca | "script_parameters":{ |
| 98 | "input":{ |
||
| 99 | 9 | Bryan Cosca | "output_of":"PREVIOUS_JOB_NAME" |
| 100 | 8 | Bryan Cosca | }, |
| 101 | "samtools":{ |
||
| 102 | "required":true, |
||
| 103 | "dataclass":"Collection", |
||
| 104 | "default":"0b5dd5ad3fd555dbb9ef81a027b69dec+18147", |
||
| 105 | 9 | Bryan Cosca | "title":"NAME Input Samtools Collection" |
| 106 | 8 | Bryan Cosca | }, |
| 107 | 13 | Bryan Cosca | "bcftools":{ |
| 108 | "required":true, |
||
| 109 | "dataclass":"Collection", |
||
| 110 | "default":"6a0c51bea360b487aa5c9d130435cd00+14097", |
||
| 111 | "title":"NAME Input BCFtools Collection" |
||
| 112 | }, |
||
| 113 | 8 | Bryan Cosca | "gatk_jar":{ |
| 114 | "required":true, |
||
| 115 | "dataclass":"Collection", |
||
| 116 | 1 | Bryan Cosca | "default":"2e98fdc8e90f4c48a0714b711767c9ce+76", |
| 117 | 9 | Bryan Cosca | "title":"NAME Input GATK Collection" |
| 118 | 1 | Bryan Cosca | }, |
| 119 | "reference":{ |
||
| 120 | "required":true, |
||
| 121 | "dataclass":"Collection", |
||
| 122 | "default":"3514b8e5da0e8d109946bc809b20a78a+5698", |
||
| 123 | 9 | Bryan Cosca | "title":"NAME Input Reference Collection" |
| 124 | }, |
||
| 125 | "picard":{ |
||
| 126 | "required":true, |
||
| 127 | "dataclass":"Collection", |
||
| 128 | "default":"0eaa58017c3689414a9e644a2297df5c+165", |
||
| 129 | "title":"NAME Input Picard Collection" |
||
| 130 | }, |
||
| 131 | "bedtools_bin":{ |
||
| 132 | "required":true, |
||
| 133 | "dataclass":"Collection", |
||
| 134 | "default":"b2f86c26e05e7a0686e7f39a86d406bf+34584", |
||
| 135 | "title":"NAME Input Bedtools Collection" |
||
| 136 | }, |
||
| 137 | "bed_path":{ |
||
| 138 | "required":true, |
||
| 139 | "dataclass":"Collection", |
||
| 140 | "default":"a0e2c5057d64bfb9ce2cdefd46f71b27+516", |
||
| 141 | "title":"NAME Input Bed Collection" |
||
| 142 | 18 | Bryan Cosca | }, |
| 143 | "bwa_bin": { |
||
| 144 | "required": true, |
||
| 145 | "dataclass": "Collection", |
||
| 146 | "default": "39c6f22d40001074f4200a72559ae7eb+5745", |
||
| 147 | "title": "NAME Input BWA Binary" |
||
| 148 | }, |
||
| 149 | 19 | Bryan Cosca | "gatk_ref":{ |
| 150 | "required":true, |
||
| 151 | "dataclass":"Collection", |
||
| 152 | "default":"25b68283b442c1a921ac826296103426+9636", |
||
| 153 | "title":"NAME GATK Reference Collection" |
||
| 154 | 20 | Bryan Cosca | }, |
| 155 | "tabix": { |
||
| 156 | "required": true, |
||
| 157 | "dataclass": "Collection", |
||
| 158 | "default": "180c32253e97ab7a117f8c9c15e95e8b+1131", |
||
| 159 | "title": "NAME Input Tabix/Bgzip" |
||
| 160 | 19 | Bryan Cosca | } |
| 161 | 8 | Bryan Cosca | }, |
| 162 | "runtime_constraints":{ |
||
| 163 | "max_tasks_per_node":1, |
||
| 164 | "min_nodes":1, |
||
| 165 | "docker_image":"bcosc/arv-base-java", |
||
| 166 | 22 | Bryan Cosca | "arvados_sdk_version":"a4d63932d669acd5011a7fa5afcbeec513acfe2c" |
| 167 | } |
||
| 168 | 8 | Bryan Cosca | }, |
| 169 | </pre> |
||
| 170 | |||
| 171 | 1 | Bryan Cosca | h2. Crunch Script Template |
| 172 | <pre> |
||
| 173 | #!/usr/bin/env python |
||
| 174 | |||
| 175 | import arvados |
||
| 176 | import subprocess |
||
| 177 | import os |
||
| 178 | import sys |
||
| 179 | import re |
||
| 180 | from arvados.collection import Collection as coll |
||
| 181 | import arvados_tools |
||
| 182 | import shutil |
||
| 183 | |||
| 184 | 23 | Bryan Cosca | arvados_tools.spawn_new_task_per_file('input','.*(bam|fastq)$',if_sequence=0, and_end_task=True) |
| 185 | 1 | Bryan Cosca | |
| 186 | this_job = arvados.current_job() |
||
| 187 | this_task = arvados.current_task() |
||
| 188 | tmpdir = arvados.current_task().tmpdir |
||
| 189 | |||
| 190 | input_1 = this_task['parameters']['input_1'] |
||
| 191 | input_1_path = os.path.join(arvados.get_job_param_mount("input"),input_1) |
||
| 192 | 23 | Bryan Cosca | #tmp_input_1_path = os.path.join(tmpdir,input_1) # If we need to copy to tmpdir |
| 193 | #shutil.copyfile(input_1_path,tmp_input_1_path) # If we need to copy to tmpdir |
||
| 194 | #os.chdir(tmpdir) |
||
| 195 | |||
| 196 | NAME_args = ['','','','','','','','',''] |
||
| 197 | print NAME_args |
||
| 198 | 25 | Bryan Cosca | NAME_out_file = os.path.join(tmpdir,input_1.replace('','')) |
| 199 | 23 | Bryan Cosca | |
| 200 | NAME_out_handle = open(NAME_out_file,'w') |
||
| 201 | NAME_pipe = subprocess.Popen(NAME_args,stdout=NAME_out_handle) |
||
| 202 | NAME_pipe.wait() |
||
| 203 | print NAME_pipe.returncode |
||
| 204 | NAME_out_handle.close() |
||
| 205 | |||
| 206 | NAME_pipe = subprocess.check_output(NAME_args) |
||
| 207 | |||
| 208 | arvados_tools.write_tmpdir(tmpdir) |
||
| 209 | |||
| 210 | 1 | Bryan Cosca | </pre> |
| 211 | |||
| 212 | |||
| 213 | <pre> |
||
| 214 | samtools_path = arvados_tools.get_file_path('samtools','^samtools$') |
||
| 215 | gatk_path = arvados_tools.get_file_path('gatk_jar','^GenomeAnalysisTK.jar$') |
||
| 216 | reference_path = arvados_tools.get_file_path('reference','.*f(ast)?a(.gz)?$') |
||
| 217 | dbsnp_path = arvados_tools.get_file_path('dbsnp','^dbsnp.*vcf$') |
||
| 218 | 11 | Bryan Cosca | bgzip_path = arvados_tools.get_file_path('tabix','^bgzip$') |
| 219 | tabix_path = arvados_tools.get_file_path('tabix','^tabix$') |
||
| 220 | 24 | Bryan Cosca | bwa_path = arvados_tools.get_file_path('bwa_bin','^bwa$') |
| 221 | 12 | Bryan Cosca | |
| 222 | 17 | Bryan Cosca | tmp_picard_path = arvados_tools.get_file_path('picard','^picard.jar$') |
| 223 | # Copy picard over to tmpdir because java cannot handle "+" characters |
||
| 224 | picard_path = os.path.join(tmpdir,"picard.jar") |
||
| 225 | shutil.copyfile(tmp_picard_path,picard_path) |
||
| 226 | |||
| 227 | 12 | Bryan Cosca | others: |
| 228 | bed56Gb37_path = arvados_tools.get_file_path('bed_path','.*bed$') |
||
| 229 | 19 | Bryan Cosca | |
| 230 | 12 | Bryan Cosca | |
| 231 | 1 | Bryan Cosca | </pre> |
| 232 | |||
| 233 | <pre> |
||
| 234 | gatk_inserttool_args = [] |
||
| 235 | gatk_inserttool_output_name |
||
| 236 | gatk_inserttool_output_path |
||
| 237 | gatk_inserttool_pipe = subprocess.check_output() |
||
| 238 | </pre> |
||
| 239 | |||
| 240 | <pre> |
||
| 241 | samtools_inserttool_args = [] |
||
| 242 | </pre> |
||
| 243 | |||
| 244 | h2. Script Parameter Template |
||
| 245 | |||
| 246 | 21 | Bryan Cosca | To grab an arbitrary script parameter from the current job: |
| 247 | |||
| 248 | num_files = this_job['script_parameters']['param'] |
||
| 249 | |||
| 250 | |||
| 251 | 10 | Bryan Cosca | h2. Random stuff |
| 252 | 1 | Bryan Cosca | |
| 253 | 3 | Bryan Cosca | h2. Latest arvados_sdk_version: |
| 254 | 2 | Bryan Cosca | |
| 255 | https://arvados.org/projects/arvados/repository/revisions/master/show/sdk/python |
||
| 256 | 10 | Bryan Cosca | |
| 257 | h2. Random tools I use |
||
| 258 | |||
| 259 | os.path.join(arvados.get_job_param_mount("param"),name) |
||
| 260 | 14 | Bryan Cosca | |
| 261 | h2. Pipe through tools |
||
| 262 | |||
| 263 | bwa_pipe = subprocess.Popen(args,stdout=subprocess.PIPE) |
||
| 264 | output_file = open(output_bam_path,'w') |
||
| 265 | samtools_pipe = subprocess.Popen(sam_args, stdin=bwa_pipe.stdout, stdout=output_file) |
||
| 266 | bwa_pipe.wait() |
||
| 267 | samtools_pipe.wait() |
||
| 268 | output_file.close() |
||
| 269 | 15 | Bryan Cosca | |
| 270 | h2. Get the file name without its last n extensions |
||
| 271 | 16 | Bryan Cosca | |
| 272 | 15 | Bryan Cosca | base_input_split = re.split('(\.)',input_1) |
| 273 | base_input_list = base_input_split[0:len(base_input_split)-n*2] |
||
| 274 | base_name = ''.join(base_input_list) |
||
| 275 | print base_name |