Can I merge zarr stores by moving them into an enclosing directory?

13:48 13 May 2026

I created per-chromosome Zarr stores from a multi-sample VCF file using scikit-allel, in a Python program launched as a SLURM job array. I did this to parallelize the work so that it would complete in less time and require less memory. Can I now merge the resulting 14 Zarr stores into a single Zarr store by simply renaming them, moving them into an enclosing directory, and adding a .zgroup file to that directory? This was the original program:

#!/usr/bin/env python3

import sys; print(sys.version)
import os
import glob
import subprocess
import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas',pd.__version__)
import allel; print('allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)

# Path of the input .vcf.gz, taken from the first command-line argument.
# sys.argv is length-checked rather than indexed directly: indexing raised
# IndexError when no argument was given, so the usage message below was
# never reached in exactly the case it was written for.
INFN = sys.argv[1] if len(sys.argv) > 1 else ''
if not INFN:
    print('Must provide input .vcf.gz as first argument')
    sys.exit(2)

# Fields to extract from the VCF; handed to allel.vcf_to_zarr as `fields=`.
FIELDS = [
    'samples',
    # variants/* entries
    'variants/CHROM',
    'variants/POS',
    'variants/REF',
    'variants/ALT',
    'variants/QUAL',
    'variants/TYPE',
    'variants/is_snp',
    'variants/numalt',
    'variants/AF',
    'variants/DP',
    'variants/ANN',
    # calldata/* entries
    'calldata/DP',
    'calldata/GT',
]

# Handed to allel.vcf_to_zarr as `exclude_fields=`; None excludes nothing
# beyond what FIELDS already leaves out.
EXCLUDE_FIELDS = None

# Name (or path) of the tabix executable used for listing contigs and passed
# to allel.vcf_to_zarr as `tabix=`.
TABIX_EXEC = 'tabix'

# Log which tabix binary is resolved and its version, so the job output
# records the tool that was used.  `which` is run on TABIX_EXEC itself
# (previously the hard-coded string 'tabix'), so the reported path always
# matches the executable that is actually invoked.
_tabix_path = subprocess.check_output(
    ['which', TABIX_EXEC]).decode('utf-8').rstrip()
_tabix_version = subprocess.check_output(
    [TABIX_EXEC, '--version']).decode('utf-8')
print("Using tabix executable '{}' {} '{}'\n{}".format(
    TABIX_EXEC, "->", _tabix_path, _tabix_version))

# The SLURM array task ID is used as a 0-based index into the chromosome
# list; it defaults to 0 when the variable is not set.
task_id = int(os.environ.get("SLURM_ARRAY_TASK_ID", 0))

# `tabix -l` prints one sequence name per line.  splitlines() returns an
# empty list for empty output, whereas split('\n') returned [''] and would
# have let an empty region name through.
chroms = subprocess.check_output([TABIX_EXEC, '-l', INFN],
                                 universal_newlines=True).strip().splitlines()

# Stop with a clear message instead of a bare IndexError, and reject negative
# IDs, which Python indexing would silently resolve from the end of the list.
if not 0 <= task_id < len(chroms):
    sys.exit(f"SLURM_ARRAY_TASK_ID={task_id} is out of range: "
             f"'{INFN}' lists {len(chroms)} chromosome(s); "
             f"task IDs are 0-based")

ch = chroms[task_id]

# One output store per chromosome, named after the input file.
OUTFN = f"{INFN}.{ch}.zarr"

# Use the ANN transformer whenever the ANN annotation field is requested.
# FIELDS holds the fully qualified name 'variants/ANN', so the previous exact
# test `'ANN' in FIELDS` was always False and the transformer was never
# applied; both spellings are accepted here.
transformers = None
if any(field in ('ANN', 'variants/ANN') for field in FIELDS):
    transformers = allel.ANNTransformer()

def vcf_to_zarr_func(ch):
    """Convert one chromosome of INFN into the Zarr store at OUTFN.

    `ch` is passed to allel.vcf_to_zarr both as the region to read and as
    the group name inside the output store.  Every other setting comes from
    module-level names: FIELDS, EXCLUDE_FIELDS, TABIX_EXEC and transformers.
    Progress logging is sent to stderr.
    """
    conversion_options = dict(
        region=ch,
        group=ch,
        log=sys.stderr,
        fields=FIELDS,
        exclude_fields=EXCLUDE_FIELDS,
        tabix=TABIX_EXEC,
        transformers=transformers,
    )
    allel.vcf_to_zarr(INFN, OUTFN, **conversion_options)


# Driver: convert the single chromosome selected for this array task.
print(f"Processing chromosome: {ch}")

vcf_to_zarr_func(ch)

python zarr

Your Answer

Privacy & Cookie Consent