I created per-chromosome Zarr stores from a multi-sample VCF file using scikit-allel in a Python program launched as a SLURM job array. I did this to parallelize the work so that it would complete in less time and require less memory. Can I now merge the resulting 14 Zarr stores into a single Zarr store by simply renaming them, putting them inside an enclosing directory, and adding a .zgroup file to that directory? This was the original program:
#!/usr/bin/env python3
import sys; print(sys.version)
import os
import glob
import subprocess
import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas',pd.__version__)
import allel; print('allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)
# Path to the input VCF, taken from the first command-line argument.
# The argument count is checked before indexing: sys.argv[1] raises
# IndexError when no argument is given, which would bypass the usage
# message below entirely.
if len(sys.argv) < 2 or not sys.argv[1]:
    print('Must provide input .vcf.gz as first argument')
    sys.exit(2)
INFN = sys.argv[1]
# Fields requested from the VCF; handed to allel.vcf_to_zarr via its
# `fields` argument.
FIELDS = [
    'samples',
    # Entries under the 'variants/' prefix.
    'variants/CHROM',
    'variants/POS',
    'variants/REF',
    'variants/ALT',
    'variants/QUAL',
    'variants/TYPE',
    'variants/is_snp',
    'variants/numalt',
    'variants/AF',
    'variants/DP',
    'variants/ANN',
    # Entries under the 'calldata/' prefix.
    'calldata/DP',
    'calldata/GT',
]
# Handed to allel.vcf_to_zarr via `exclude_fields`; None means no
# exclusion list is supplied.
EXCLUDE_FIELDS = None
# Name (or path) of the tabix executable used by this script.
TABIX_EXEC = 'tabix'
# Report which binary and version will be used. The lookup resolves
# TABIX_EXEC itself rather than a hard-coded 'tabix', so the printed path
# always describes the executable that is actually invoked.
_tabix_path = subprocess.check_output(
    ['which', TABIX_EXEC]).decode('utf-8').rstrip()
_tabix_version = subprocess.check_output(
    [TABIX_EXEC, '--version']).decode('utf-8')
print("Using tabix executable '{}' {} '{}'\n{}".format(
    TABIX_EXEC, "->", _tabix_path, _tabix_version))
# Index of this array task; falls back to 0 when SLURM_ARRAY_TASK_ID is
# not set in the environment.
task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))
# Sequence names reported by `tabix -l`, one per output line.
# strip().splitlines() yields an empty list (not ['']) for empty output.
chroms = subprocess.check_output([TABIX_EXEC, '-l', INFN],
                                 universal_newlines=True).strip().splitlines()
if not chroms:
    sys.exit(f"No sequence names were listed by '{TABIX_EXEC} -l' for {INFN}")
# Fail with a clear message instead of a bare IndexError (or a silent
# wrap-around for a negative index) when this task has no chromosome.
if not 0 <= task_id < len(chroms):
    sys.exit(f"Task index {task_id} is out of range: {INFN} lists "
             f"{len(chroms)} sequence(s), valid indices are "
             f"0-{len(chroms) - 1}")
# Chromosome handled by this task, and the per-chromosome output store.
ch = chroms[task_id]
OUTFN = f"{INFN}.{ch}.zarr"
# Transformer passed to allel.vcf_to_zarr; None means no transformer.
transformers = None
# FIELDS names the annotation field with its group prefix
# ('variants/ANN'), so a plain `'ANN' in FIELDS` membership test never
# matched and the ANN transformer was silently never enabled. Accept both
# the bare and the prefixed spelling.
if 'ANN' in FIELDS or 'variants/ANN' in FIELDS:
    transformers = allel.ANNTransformer()
def vcf_to_zarr_func(ch):
    """Convert one region of the input VCF into the Zarr output store.

    Calls ``allel.vcf_to_zarr`` on the module-level ``INFN`` and ``OUTFN``,
    using ``ch`` both as the region to read and as the name of the group
    to write. Field selection, the tabix executable and the transformers
    come from module-level settings, and ``sys.stderr`` is passed as the
    ``log`` target.

    ``OUTFN`` is fixed at module level; it is not recomputed from the
    ``ch`` argument.
    """
    conversion_options = dict(
        region=ch,
        group=ch,
        log=sys.stderr,
        fields=FIELDS,
        exclude_fields=EXCLUDE_FIELDS,
        tabix=TABIX_EXEC,
        transformers=transformers,
    )
    allel.vcf_to_zarr(INFN, OUTFN, **conversion_options)
# Announce the selected chromosome, then run the conversion for it.
print(f"Processing chromosome: {ch}")
vcf_to_zarr_func(ch)