from typing import Sequence, Union
import numpy
import h5py
from . import choose_missing_placeholder as ch
from . import _utils_misc as misc
from . import _utils_string as strings
def _is_missing_scalar(x):
    """Report whether a single element counts as a missing value.

    Args:
        x: Element to inspect.

    Returns:
        True if ``x`` is None; otherwise whatever
        :py:func:`numpy.ma.is_masked` reports for ``x``.
    """
    if x is None:
        return True
    return numpy.ma.is_masked(x)
def _has_missing(x: Sequence):
    """Report whether a sequence contains any missing values.

    Args:
        x: Sequence to inspect. May be a NumPy array, a masked NumPy
            array, or any other iterable of elements.

    Returns:
        For a masked array, whether any entry of its mask is set. For any
        other NumPy array, always False. For everything else, whether any
        element is None or masked.
    """
    # NOTE(review): a plain (non-masked) ndarray is always reported as having
    # no missing values, so None entries inside an object-dtype array are not
    # detected here -- confirm that this is intended.
    if not isinstance(x, numpy.ndarray):
        return any(_is_missing_scalar(item) for item in x)

    if isinstance(x, numpy.ma.MaskedArray):
        return x.mask.any()

    return False
def _fill_with_placeholder(x, dtype, placeholder):
    """Copy a sequence into a new NumPy array, substituting missing values.

    Args:
        x: Sequence of values, possibly containing None or masked entries.
        dtype: NumPy dtype of the array to create.
        placeholder: Value written in place of each missing entry.

    Returns:
        A new 1-dimensional array with the same length as ``x`` and the
        requested ``dtype``.
    """
    # Every slot is overwritten below, so an uninitialized buffer is fine.
    filled = numpy.empty(len(x), dtype=dtype)
    for idx, value in enumerate(x):
        filled[idx] = placeholder if _is_missing_scalar(value) else value
    return filled
###################################################
###################################################
[docs]
def write_string_vector_to_hdf5(
    handle: h5py.Group,
    name: str,
    x: Sequence[str],
    placeholder_name: str = "missing-value-placeholder"
) -> h5py.Dataset:
    """
    Write a string vector to a HDF5 file as a 1-dimensional dataset with a
    fixed-length string datatype. If ``x`` contains missing values, a
    placeholder is chosen by
    :py:func:`~dolomite_base.choose_missing_placeholder.choose_missing_string_placeholder`
    and substituted for every missing value before saving. The placeholder
    itself is then stored as an attribute of the dataset.

    Args:
        handle: A handle to a HDF5 group.

        name: Name of the dataset in which to save the string vector.

        x: Sequence containing strings, Nones, and/or masked NumPy values.

        placeholder_name:
            Name of the attribute in which to store the missing value
            placeholder, if ``x`` contains None or masked values.

    Returns:
        Handle for the newly created dataset.
    """
    # Nothing to substitute: save the input as-is, with no attribute.
    if not _has_missing(x):
        return strings.save_fixed_length_strings(handle, name, x)

    placeholder = ch.choose_missing_string_placeholder(x)
    filled = [placeholder if _is_missing_scalar(value) else value for value in x]

    dset = strings.save_fixed_length_strings(handle, name, filled)
    dset.attrs[placeholder_name] = placeholder
    return dset
###################################################
###################################################
[docs]
def write_integer_vector_to_hdf5(
    handle: h5py.Group,
    name: str,
    x: Sequence[int],
    h5type: str = "i4",
    placeholder_name: str = "missing-value-placeholder",
    allow_float_promotion: bool = False
) -> h5py.Dataset:
    """
    Write an integer vector to a HDF5 file as a 1-dimensional dataset. If
    ``x`` contains missing values, a placeholder value is selected by
    :py:func:`~dolomite_base.choose_missing_placeholder.choose_missing_integer_placeholder`
    and used to replace all of the missing values in the dataset. The
    placeholder value itself is stored as an attribute of the dataset.

    Args:
        handle: A handle to a HDF5 group.

        name: Name of the dataset in which to save the integer vector.

        x: Sequence containing integers, Nones, and/or masked NumPy values.

        h5type: Integer type of the HDF5 dataset to create.

        placeholder_name:
            Name of the attribute in which to store the missing value
            placeholder, if ``x`` contains None or masked values.

        allow_float_promotion:
            Whether to save ``x`` into a 64-bit floating-point dataset if
            any value in ``x`` exceeds the range of values that can be
            represented by ``h5type``, or if no missing value placeholder
            can be found within the acceptable range of integer values.
            When promotion happens and ``x`` has missing values, NaN is
            used as the placeholder.

    Returns:
        Handle for the newly created dataset.

    Raises:
        ValueError: If ``allow_float_promotion`` is False and ``x`` either
            holds an out-of-range value or has no usable integer
            placeholder.
    """
    has_missing = _has_missing(x)
    max_dtype = numpy.dtype(h5type).type
    bounds = numpy.iinfo(max_dtype)

    # Stops at the first non-missing value outside the range of ``h5type``.
    promote = any(
        value < bounds.min or value > bounds.max
        for value in x
        if not _is_missing_scalar(value)
    )
    if promote and not allow_float_promotion:
        raise ValueError("cannot save out-of-range integers without type promotion")

    if has_missing:
        if not promote:
            placeholder = ch.choose_missing_integer_placeholder(x, max_dtype=max_dtype)
            if placeholder is None:
                # No integer placeholder is available; floats are the fallback.
                if not allow_float_promotion:
                    raise ValueError("cannot find a suitable missing value placeholder without type promotion")
                promote = True

        if promote:
            placeholder = numpy.nan
            x = _fill_with_placeholder(x, numpy.float64, placeholder)
        else:
            x = _fill_with_placeholder(x, placeholder.dtype.type, placeholder)

    if promote:
        h5type = "f8"

    dset = handle.create_dataset(name, data=x, dtype=h5type, compression="gzip", chunks=True)
    if has_missing:
        dset.attrs.create(placeholder_name, placeholder, dtype=h5type)
    return dset
###################################################
###################################################
[docs]
def write_float_vector_to_hdf5(
    handle: h5py.Group,
    name: str,
    x: Sequence[float],
    h5type: str = "f8",
    placeholder_name: str = "missing-value-placeholder"
) -> h5py.Dataset:
    """
    Write a floating-point vector to a HDF5 file as a 1-dimensional dataset.
    If ``x`` contains missing values, a placeholder value is selected by
    :py:func:`~dolomite_base.choose_missing_placeholder.choose_missing_float_placeholder`
    and used to replace all of the missing values in the dataset. The
    placeholder value itself is stored as an attribute of the dataset.

    Args:
        handle: A handle to a HDF5 group.

        name: Name of the dataset in which to save the float vector.

        x: Sequence containing floats, Nones, and/or masked NumPy values.

        h5type: Floating-point type of the HDF5 dataset to create.

        placeholder_name:
            Name of the attribute in which to store the missing value
            placeholder, if ``x`` contains None or masked values.

    Returns:
        Handle for the newly created dataset.
    """
    # Without missing values the input is written directly, with no attribute.
    if not _has_missing(x):
        return handle.create_dataset(name, data=x, dtype=h5type, compression="gzip", chunks=True)

    dtype = numpy.dtype(h5type).type
    placeholder = ch.choose_missing_float_placeholder(x, dtype=dtype)
    filled = _fill_with_placeholder(x, dtype, placeholder)

    dset = handle.create_dataset(name, data=filled, dtype=h5type, compression="gzip", chunks=True)
    dset.attrs.create(placeholder_name, placeholder, dtype=h5type)
    return dset
###################################################
###################################################
[docs]
def write_boolean_vector_to_hdf5(
    handle: h5py.Group,
    name: str,
    x: Sequence[bool],
    placeholder_name: str = "missing-value-placeholder"
) -> h5py.Dataset:
    """
    Write a boolean vector to a HDF5 file as a 1-dimensional dataset with
    an 8-bit signed integer datatype. If ``x`` contains missing values, they
    are replaced with a placeholder value of -1, which is also stored as an
    attribute of the dataset.

    Args:
        handle: A handle to a HDF5 group.

        name: Name of the dataset in which to save the boolean vector.

        x: Sequence containing booleans, Nones, and/or masked NumPy values.

        placeholder_name:
            Name of the attribute in which to store the missing value
            placeholder, if ``x`` contains None or masked values.

    Returns:
        Handle for the newly created dataset.
    """
    h5type = "i1"
    placeholder = -1

    has_missing = _has_missing(x)
    data = _fill_with_placeholder(x, numpy.int8, placeholder) if has_missing else x

    dset = handle.create_dataset(name, data=data, dtype=h5type, compression="gzip", chunks=True)
    if has_missing:
        dset.attrs.create(placeholder_name, placeholder, dtype=h5type)
    return dset