Zarr Operations Cookbook

This vignette covers common zarr array operations: persistent storage, compression, resizing, filters, and advanced indexing.

library(pizzarr)

Persistent arrays

Create an array on disk, close the session, and reopen it later.

# Location of the on-disk store, placed under the session's temp directory
path <- file.path(tempdir(), "example.zarr")

# Open the store in write mode and describe the array it should hold:
# a 5 x 10 array of "<f4" values split into 5 x 5 chunks
z <- zarr_open_array(
  store = path,
  mode = "w",
  shape = c(5, 10),
  chunks = c(5, 5),
  dtype = "<f4"
)

# Fill the whole array ("..." selects every dimension) with 1..50
values <- matrix(1:50, nrow = 5, ncol = 10)
z$set_item("...", values)
#> NULL

z$get_shape()
#> [1]  5 10

Reopen the same path in read mode:

# Second, read-only handle onto the same on-disk location
z2 <- zarr_open_array(mode = "r", store = path)

z2$get_shape()
#> [1]  5 10

# Fetch the full selection, then look at the R matrix stored in `$data`
contents <- z2$get_item("...")
contents$data
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    6   11   16   21   26   31   36   41    46
#> [2,]    2    7   12   17   22   27   32   37   42    47
#> [3,]    3    8   13   18   23   28   33   38   43    48
#> [4,]    4    9   14   19   24   29   34   39   44    49
#> [5,]    5   10   15   20   25   30   35   40   45    50

For quick save/load of an existing array:

save_path <- file.path(tempdir(), "saved.zarr")

# Wrap the built-in `volcano` matrix in a ZarrArray, then write that array
# to `save_path`
volcano_arr <- zarr_create_array(
  data = volcano,
  shape = dim(volcano),
  dtype = "<f8"
)
zarr_save_array(save_path, volcano_arr)
#> <ZarrArray> /
#>   Shape       : (87, 61)
#>   Chunks      : (87, 61)
#>   Data type   : <f8
#>   Fill value  : 0
#>   Order       : F
#>   Read-only   : FALSE
#>   Compressor  : ZstdCodec
#>   Store type  : DirectoryStore
#>   Zarr format : 2

# Open the saved store read-only and confirm the round trip is lossless
z3 <- zarr_open_array(save_path, mode = "r")

round_trip <- z3$as.array()
all.equal(round_trip, volcano)
#> [1] TRUE

Compression

By default, pizzarr uses Zstandard compression. You can choose a different compressor when creating an array.

Zstandard (default)

# Build the codec first, then hand it to zarr_create()
zstd_codec <- ZstdCodec$new(level = 3)

z_zstd <- zarr_create(
  shape = c(100, 100),
  dtype = "<f4",
  compressor = zstd_codec
)

# Ask the array for its compressor and print that codec's configuration
zstd_in_use <- z_zstd$get_compressor()
zstd_in_use$get_config()
#> $id
#> [x] "zstd"
#> 
#> $level
#> [x] 3

Gzip

Gzip compression is interoperable with zarr-python and other implementations, but is slower than Zstandard because R lacks an in-memory gzip API. For best write performance, prefer ZstdCodec.

# Gzip codec at level 5, created up front and passed in as the compressor
gzip_codec <- GzipCodec$new(level = 5)

z_gzip <- zarr_create(
  shape = c(100, 100),
  dtype = "<f4",
  compressor = gzip_codec
)

# Print the configuration reported by the array's compressor
gzip_in_use <- z_gzip$get_compressor()
gzip_in_use$get_config()
#> $id
#> [x] "gzip"
#> 
#> $level
#> [x] 5

Blosc (with algorithm selection)

# Blosc codec using the "lz4" algorithm name, clevel 5, with shuffle turned on
blosc_codec <- BloscCodec$new(
  cname = "lz4",
  clevel = 5,
  shuffle = TRUE
)

z_blosc <- zarr_create(
  shape = c(100, 100),
  dtype = "<f4",
  compressor = blosc_codec
)

# Print the configuration reported by the array's compressor
blosc_in_use <- z_blosc$get_compressor()
blosc_in_use$get_config()
#> $id
#> [x] "blosc"
#> 
#> $cname
#> [x] "lz4"
#> 
#> $clevel
#> [x] 5
#> 
#> $shuffle
#> [x] 1
#> 
#> $blocksize
#> [x] 0

No compression

# Request an array without a compressor by passing `compressor = NA`
z_none <- zarr_create(
  shape = c(100, 100), dtype = "<f4",
  compressor = NA
)

# is.na() is only defined for vectors and lists. Calling it directly on the
# value returned by get_compressor() emitted a warning and an element-wise
# result when that value was an R6 object (an environment). Guard the check so
# it always yields a single TRUE/FALSE: TRUE only for a scalar NA.
# NOTE(review): the previously recorded output showed get_compressor()
# returning an environment here even though `compressor = NA` was requested --
# confirm how this pizzarr version represents "no compressor".
cmp <- z_none$get_compressor()
is.atomic(cmp) && length(cmp) == 1L && is.na(cmp)
#> [1] FALSE

Resizing arrays

Arrays can be resized after creation. Data in the overlapping region is preserved; new regions are filled with the fill value.

# Start from a 5 x 10 "<i4" array in 5 x 5 chunks with a fill value of 0
z <- zarr_create(
  shape = c(5, 10),
  chunks = c(5, 5),
  dtype = "<i4",
  fill_value = 0L,
  compressor = "default"
)

initial <- matrix(1:50, nrow = 5, ncol = 10)
z$set_item("...", initial)
#> NULL

z$get_shape()
#> [1]  5 10

# Enlarge to 10 rows x 20 columns
z$resize(10, 20)

z$get_shape()
#> [1] 10 20

# The original 5 x 10 block still sits in the top-left corner
kept <- z[1:5, 1:10]
kept$data
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    6   11   16   21   26   31   36   41    46
#> [2,]    2    7   12   17   22   27   32   37   42    47
#> [3,]    3    8   13   18   23   28   33   38   43    48
#> [4,]    4    9   14   19   24   29   34   39   44    49
#> [5,]    5   10   15   20   25   30   35   40   45    50

# Rows 6-10 were added by the resize, so they read back as fill_value
added <- z[6:10, 1:5]
added$data
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    0    0    0    0    0
#> [2,]    0    0    0    0    0
#> [3,]    0    0    0    0    0
#> [4,]    0    0    0    0    0
#> [5,]    0    0    0    0    0

Shrinking removes chunks that fall outside the new shape:

# Shrink down to 3 rows x 4 columns
z$resize(3, 4)

z$get_shape()
#> [1] 3 4

# Only the top-left 3 x 4 corner of the earlier data is left
remaining <- z$get_item("...")
remaining$data
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    6   11   16
#> [2,]    2    7   12   17
#> [3,]    3    8   13   18

Appending data

Use append() to grow an array along an axis, adding new data at the end. This is equivalent to zarr-python’s z.append(data, axis=0), but uses R’s 1-based axis indexing (axis 1 = first dimension).

# A 3 x 4 "<i4" array held in a single 3 x 4 chunk, fill value 0
z <- zarr_create(
  shape = c(3, 4),
  chunks = c(3, 4),
  dtype = "<i4",
  fill_value = 0L
)

# Seed it with 1..12 (filled column by column)
seed <- matrix(1:12, nrow = 3, ncol = 4)
z$set_item("...", seed)
#> NULL

z$as.array()
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

Append new rows (axis 1, the default):

# Two extra rows, four columns wide to match the existing array
new_rows <- matrix(13:20, nrow = 2, ncol = 4)

# No `axis` given, so the default axis is used
z$append(new_rows)
#> NULL

z$get_shape()
#> [1] 5 4

z$as.array()
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
#> [4,]   13   15   17   19
#> [5,]   14   16   18   20

Append new columns (axis 2):

# Two extra columns, five rows tall to match the array after the row append
new_cols <- matrix(21:30, nrow = 5, ncol = 2)

# Grow along the second dimension this time
z$append(new_cols, axis = 2)
#> NULL

z$get_shape()
#> [1] 5 6

z$as.array()
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    4    7   10   21   26
#> [2,]    2    5    8   11   22   27
#> [3,]    3    6    9   12   23   28
#> [4,]   13   15   17   19   24   29
#> [5,]   14   16   18   20   25   30

Filters

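Filters transform chunk data before compression. They are codec instances passed as a list to the filters parameter. A common use case is variable-length UTF-8 string arrays, which require VLenUtf8Codec. In the example below the codec is supplied through the object_codec argument, and get_filters() then reports it as the array's filter.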

words <- c("alpha", "bravo", "charlie", "delta")
n_words <- length(words)

# One-dimensional object-dtype ("|O") array; the VLenUtf8Codec instance is
# supplied through `object_codec`
utf8_codec <- VLenUtf8Codec$new()
z_str <- zarr_create_array(
  data = array(words, dim = n_words),
  shape = n_words,
  dtype = "|O",
  object_codec = utf8_codec
)

# The strings come back unchanged
stored <- z_str$get_item("...")
stored$data
#> [1] "alpha"   "bravo"   "charlie" "delta"

# The codec is listed among the array's filters
z_str$get_filters()
#> [[1]]
#> <VLenUtf8Codec>
#>   Inherits from: <Codec>
#>   Public:
#>     clone: function (deep = FALSE) 
#>     decode: function (buf, zarr_arr) 
#>     encode: function (buf, zarr_arr) 
#>     get_config: function ()

Advanced indexing

Beyond basic slicing with slice() or [, pizzarr supports orthogonal indexing for independent selection along each dimension.

Setup

# 5 x 6 integer grid holding 1..30, filled column by column
grid <- array(1:30, dim = c(5, 6))

z <- zarr_create_array(
  data = grid,
  shape = c(5, 6),
  dtype = "<i4"
)

z$as.array()
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    2    7   12   17   22   27
#> [3,]    3    8   13   18   23   28
#> [4,]    4    9   14   19   24   29
#> [5,]    5   10   15   20   25   30

Basic slicing with `[`

The bracket operator uses orthogonal indexing internally:

# Rows 1 through 3 and columns 2 through 4, using one-based bracket indices
block <- z[1:3, 2:4]
block$data
#>      [,1] [,2] [,3]
#> [1,]    6   11   16
#> [2,]    7   12   17
#> [3,]    8   13   18

Orthogonal selection with integer arrays

Select specific rows and columns independently. Note that get_orthogonal_selection uses zero-based indices (like zarr-python), while the [ operator uses R’s one-based indexing:

# Zero-based row positions 0, 2 and 4 (the first, third and fifth rows)
row_idx <- c(0L, 2L, 4L)
# Zero-based slice that, per the output below, covers all six columns
col_range <- zb_slice(0, 6)

picked <- z$get_orthogonal_selection(list(row_idx, col_range))
picked$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    3    8   13   18   23   28
#> [3,]    5   10   15   20   25   30

Boolean (mask) dimension indexing

Select dimensions using logical vectors:

# Alternating mask of length 5: keep rows 1, 3 and 5
row_mask <- rep(c(TRUE, FALSE), length.out = 5)

# Combine the row mask with a zero-based slice for the column dimension
col_range <- zb_slice(0, 6)
masked <- z$get_orthogonal_selection(list(row_mask, col_range))
masked$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    3    8   13   18   23   28
#> [3,]    5   10   15   20   25   30

Using the OIndex object

The $get_oindex() accessor provides the same orthogonal indexing:

oi <- z$get_oindex()

# Zero-based positions: rows 0 and 4, columns 1, 3 and 5
row_idx <- c(0L, 4L)
col_idx <- c(1L, 3L, 5L)

corner_pick <- oi$get_item(list(row_idx, col_idx))
corner_pick$data
#>      [,1] [,2] [,3]
#> [1,]    6   16   26
#> [2,]   10   20   30

Slicing with step

Select every other row, every third column using seq() in bracket notation:

# Rows 1, 3, 5 (step 2) and columns 1, 4 (step 3), one-based
strided <- z[seq(1, 5, 2), seq(1, 6, 3)]
strided$data
#>      [,1] [,2]
#> [1,]    1   16
#> [2,]    3   18
#> [3,]    5   20

Ellipsis and colon shorthand

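"..." selects all remaining dimensions; ":" selects all along one dimension. These work with get_item(). Note that integer indices passed to get_item() are zero-based (as with get_orthogonal_selection), so 1 refers to the second row or column: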

# ":" keeps every row. The integer 1 is a zero-based index here, so this
# returns the second column of the array (values 6-10), not the first.
z$get_item(list(":", 1))$data
#>      [,1]
#> [1,]    6
#> [2,]    7
#> [3,]    8
#> [4,]    9
#> [5,]   10

# Zero-based index 1 picks the second row (2, 7, 12, ...); "..." keeps all
# columns.
z$get_item(list(1, "..."))$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    2    7   12   17   22   27