# Source code for ch5mpy.write

# coding: utf-8

# ====================================================
# imports
from __future__ import annotations

import pickle
from numbers import Number
from typing import TYPE_CHECKING, Any, Mapping

import numpy as np
import numpy.typing as npt
from h5py import string_dtype
from tqdm.auto import tqdm

import ch5mpy.dict
from ch5mpy.objects.dataset import Dataset
from ch5mpy.objects.group import File, Group
from ch5mpy.utils import is_sequence

if TYPE_CHECKING:
    from ch5mpy import H5Array


# ====================================================
# code
def _store_dataset(
    loc: Group | File,
    name: str,
    array: npt.NDArray[Any] | H5Array[Any] | None = None,
    shape: tuple[int, ...] | None = None,
    dtype: npt.DTypeLike | None = None,
    chunks: bool | tuple[int, ...] = True,
    maxshape: int | tuple[int | None, ...] | None = None,
    fill_value: Any = None,
) -> Dataset[Any]:
    """Store a dataset."""
    if dtype is None:
        if array is not None:
            dtype = array.dtype

    if isinstance(dtype, type):
        str_dtype = str(dtype().dtype)
    else:
        str_dtype = str(dtype)

    if np.issubdtype(dtype, np.str_):
        array = None if array is None else array.astype(object)
        dtype = string_dtype()

    if array is not None:
        if shape is None:
            shape = array.shape

        elif shape != array.shape:
            raise ValueError("array's shape does not match the shape parameter.")

    elif shape is None:
        raise ValueError("At least one of `array` or `shape` must be provided.")

    if chunks:
        if chunks is True:  # literally `True`, not a tuple
            # FIXME : causes huge lag
            # parsed_chunks = (get_size(ch5mpy.H5Array.MAX_MEM_USAGE),) + (1,) * (len(shape) - 1)
            pass

        parsed_chunks = chunks

        if maxshape is None:
            maxshape = (None,) * len(shape)

    else:
        parsed_chunks = None

    dset = loc.create_dataset(
        name,
        data=array,
        shape=shape,
        dtype=dtype,
        chunks=parsed_chunks,
        maxshape=maxshape,
        fillvalue=fill_value,
    )
    dset.attrs["dtype"] = str_dtype

    return dset


def write_dataset(
    loc: Group | File | ch5mpy.dict.H5Dict[Any],
    name: str,
    obj: Any,
    *,
    chunks: bool | tuple[int, ...] = True,
    maxshape: tuple[int, ...] | None = None,
) -> None:
    """Write an array-like object to a H5 dataset.

    Parameters
    ----------
    loc
        Destination group or file. For an `H5Dict`, its `.file` is used.
    name
        Name of the dataset (or of the group when `obj` is a mapping).
    obj
        Object to store. A mapping is written as a group holding one entry
        per item; anything without a `shape` attribute is converted with
        `np.array` first.
    chunks, maxshape
        Storage options used when a new dataset is created. They are also
        applied to every item of a mapping.
    """
    if isinstance(loc, ch5mpy.dict.H5Dict):
        loc = loc.file

    if isinstance(obj, Mapping):
        group = loc.create_group(name)
        # Iterate explicitly instead of expanding `**obj`: keys such as
        # "chunks" or "maxshape" would otherwise collide with the keyword
        # parameters, and the caller's storage options would be dropped.
        for key, value in obj.items():
            write_dataset(group, key, value, chunks=chunks, maxshape=maxshape)
        return

    # cast to np.array if needed (to get shape and dtype)
    array = obj if hasattr(obj, "shape") else np.array(obj)
    if array.dtype == object:
        array = array.astype(str)

    if name in loc:
        if loc[name] is array:
            # this exact dataset is already stored > do nothing
            return

        if loc[name].shape == array.shape and loc[name].dtype == array.dtype:
            # a similar array already exists > simply copy the data
            loc[name][()] = array
            return

        # a different array was stored, delete it before storing the new array
        del loc[name]

    _store_dataset(loc, name, array, chunks=chunks, maxshape=maxshape)


def write_datasets(
    loc: Group | File | ch5mpy.dict.H5Dict[Any],
    *,
    chunks: bool | tuple[int, ...] = True,
    maxshape: tuple[int, ...] | None = None,
    **kwargs: Any,
) -> None:
    """Write multiple array-like objects to H5 datasets.

    Each keyword argument is written with `write_dataset`, using the keyword
    as the dataset name; `chunks` and `maxshape` are applied to all of them.
    Nothing is written when no keyword argument is given.
    """
    for name, obj in kwargs.items():
        write_dataset(loc, name, obj, chunks=chunks, maxshape=maxshape)


def write_object(
    loc: Group | File | ch5mpy.dict.H5Dict[Any],
    name: str,
    obj: Any,
    *,
    chunks: bool | tuple[int, ...] = True,
    maxshape: tuple[int, ...] | None = None,
    overwrite: bool = False,
    progress: tqdm[Any] | None = None,
) -> None:
    """Write any object to a H5 file.

    The storage strategy depends on `obj`, checked in this order:

    1. objects defining `__h5_write__` write themselves into a group, which
       is then tagged with their pickled class;
    2. mappings become a group with one entry per item;
    3. sequences (per `is_sequence`) are written with `write_dataset`;
    4. numbers and strings are assigned directly to `loc[name]`;
    5. anything else is pickled and tagged with `__h5_type__ = "pickle"`.

    Parameters
    ----------
    loc
        Destination group or file. For an `H5Dict`, its `.file` is used.
    name
        Name of the entry. For cases 1 and 2 an empty name writes into `loc`
        itself instead of creating a sub-group.
    obj
        Object to store.
    chunks, maxshape
        Storage options forwarded to `write_dataset`.
    overwrite
        Forwarded to `create_group`; for scalars and pickled objects an
        existing entry called `name` is deleted first.
    progress
        Optional progress bar, updated once per call.
    """
    if isinstance(loc, ch5mpy.dict.H5Dict):
        loc = loc.file

    if hasattr(obj, "__h5_write__"):
        group = loc.create_group(name, overwrite=overwrite) if name else loc
        obj.__h5_write__(ch5mpy.dict.H5Dict(group))
        group.attrs["__h5_type__"] = "object"
        # keep the pickled class next to the data
        group.attrs["__h5_class__"] = np.void(pickle.dumps(type(obj), protocol=pickle.HIGHEST_PROTOCOL))

    elif isinstance(obj, Mapping):
        group = loc.create_group(name, overwrite=overwrite) if name else loc
        # Iterate explicitly instead of expanding `**obj`: keys such as
        # "chunks", "overwrite" or "progress" would otherwise collide with
        # the keyword parameters. `overwrite` is forwarded so it is also
        # honoured when writing straight into `loc` (empty `name`).
        for key, value in obj.items():
            write_object(
                group,
                key,
                value,
                chunks=chunks,
                maxshape=maxshape,
                overwrite=overwrite,
                progress=progress,
            )

    elif is_sequence(obj):
        write_dataset(loc, name, obj, chunks=chunks, maxshape=maxshape)

    elif isinstance(obj, (Number, str)):
        if overwrite and name in loc:
            del loc[name]

        loc[name] = obj

    else:
        # same overwrite handling as for scalars
        if overwrite and name in loc:
            del loc[name]

        loc[name] = np.void(pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL))
        loc[name].attrs["__h5_type__"] = "pickle"

    if progress is not None:
        progress.update()


def write_objects(
    loc: Group | File | ch5mpy.dict.H5Dict[Any],
    *,
    chunks: bool | tuple[int, ...] = True,
    maxshape: tuple[int, ...] | None = None,
    overwrite: bool = False,
    progress: tqdm[Any] | None = None,
    **kwargs: Any,
) -> None:
    """Write multiple objects of any type to a H5 file.

    Each keyword argument is written with `write_object`, using the keyword
    as the entry name; all other options are passed through unchanged.
    Nothing is written when no keyword argument is given.
    """
    for name, obj in kwargs.items():
        write_object(
            loc,
            name,
            obj,
            chunks=chunks,
            maxshape=maxshape,
            overwrite=overwrite,
            progress=progress,
        )