Source code for dolomite_base.load_vector_from_hdf5

from typing import Sequence, Union
import numpy
import h5py
from biocutils import StringList, IntegerList, FloatList, BooleanList

from . import _utils_string as strings


def load_vector_from_hdf5(handle: h5py.Dataset, expected_type: type, report_1darray: bool) -> Union[StringList, IntegerList, FloatList, BooleanList, numpy.ndarray]:
    """
    Load a vector from a 1-dimensional HDF5 dataset, with coercion to the
    expected type. Any missing value placeholders are used to set Nones or to
    create masks.

    Args:
        handle:
            Handle to a HDF5 dataset. If it carries a
            ``missing-value-placeholder`` attribute, entries equal to that
            placeholder are treated as missing.

        expected_type:
            Expected type of the output vector. This should be one of
            ``float``, ``int``, ``str`` or ``bool``.

        report_1darray:
            Whether to report the output as a 1-dimensional NumPy array.

    Returns:
        The contents of the dataset as a vector-like object.

        If ``report_1darray = False``, this is a ``StringList``,
        ``IntegerList``, ``FloatList`` or ``BooleanList`` (chosen from
        ``expected_type``) with missing values represented by None.

        If ``report_1darray = True``, a 1-dimensional NumPy array is returned
        instead. When a placeholder attribute is present, the array is a
        :py:class:`~numpy.ma.MaskedArray` with the missing entries masked.
    """
    if expected_type == str:
        values = strings.load_string_vector_from_hdf5(handle)

        placeholder = None
        if "missing-value-placeholder" in handle.attrs:
            placeholder = strings.load_scalar_string_attribute_from_hdf5(handle, "missing-value-placeholder")

        if report_1darray:
            values = numpy.array(values)
            if placeholder is not None:
                values = numpy.ma.MaskedArray(values, mask=(values == placeholder))
            return values

        if placeholder is not None:
            values = [None if v == placeholder else v for v in values]
        return StringList(values)

    values = handle[:]

    if "missing-value-placeholder" not in handle.attrs:
        if report_1darray:
            return _coerce_numpy_type(values, expected_type)
        return _choose_NamedList_subclass(values, expected_type)

    placeholder = handle.attrs["missing-value-placeholder"]

    # NaN never compares equal to itself, so a NaN placeholder cannot be found
    # with ``==`` and needs an explicit isnan() check. The mask is computed on
    # the values as stored, before any type coercion is applied.
    if numpy.isnan(placeholder):
        mask = numpy.isnan(values)
    else:
        mask = (values == placeholder)

    if report_1darray:
        return numpy.ma.MaskedArray(_coerce_numpy_type(values, expected_type), mask=mask)

    with_nones = [None if is_missing else v for v, is_missing in zip(values, mask)]
    return _choose_NamedList_subclass(with_nones, expected_type)


def _coerce_numpy_type(values: numpy.ndarray, expected_type: type) -> numpy.ndarray:
    """
    Coerce a NumPy array so that its dtype is compatible with the expected
    Python type.

    Args:
        values:
            Array of values to be coerced.

        expected_type:
            Expected type of the values. Only ``bool`` and ``float`` trigger a
            coercion; for any other type, ``values`` is returned untouched.

    Returns:
        For ``bool``, a boolean array that is True wherever ``values`` is
        non-zero. For ``float``, a double-precision copy if ``values`` does not
        already have a floating-point dtype. Otherwise, ``values`` itself.
    """
    if expected_type == bool:
        return values != 0
    if expected_type == float and not numpy.issubdtype(values.dtype, numpy.floating):
        return values.astype(numpy.double)
    return values


def _choose_NamedList_subclass(values: Sequence, expected_type: type) -> Union[IntegerList, FloatList, BooleanList]:
    """
    Wrap a sequence of values in the typed list class that matches the
    expected Python type.

    Args:
        values:
            Sequence of values, passed as-is to the chosen constructor.

        expected_type:
            Expected type of the values.

    Returns:
        A ``BooleanList`` for ``bool``, a ``FloatList`` for ``float``, and an
        ``IntegerList`` for any other type.
    """
    if expected_type == bool:
        return BooleanList(values)
    if expected_type == float:
        return FloatList(values)
    return IntegerList(values)