Source code for dolomite_base.read_data_frame

from biocframe import BiocFrame
import h5py
import os

from .alt_read_object import alt_read_object
from . import _utils_string as strings
from .load_vector_from_hdf5 import load_vector_from_hdf5
from ._utils_factor import load_factor_from_hdf5 
from . import _utils_misc as misc


def read_data_frame(path: str, metadata: dict, data_frame_represent_numeric_column_as_1darray : bool = True, **kwargs) -> BiocFrame:
    """Read an on-disk data frame back into a :py:class:`~biocframe.BiocFrame`.

    This is normally reached through
    :py:meth:`~dolomite_base.read_object.read_object` rather than being
    called directly.

    Args:
        path:
            Directory holding the saved object. The function reads
            ``basic_columns.h5`` from it and, when present, the
            ``other_columns``, ``other_annotations`` and
            ``column_annotations`` subdirectories.

        metadata:
            Metadata for the object. Accepted as part of the reader
            signature; the body of this function does not consult it.

        data_frame_represent_numeric_column_as_1darray:
            If true, non-string columns are requested as 1-dimensional
            NumPy arrays, which is more efficient than Python lists but
            drops the vector / 1-D array distinction. Set to ``False`` to
            get (typed) lists instead.

        kwargs:
            Extra arguments forwarded to the readers of nested objects.

    Returns:
        The reconstructed data frame.
    """
    columns = {}
    row_names = None

    h5_path = os.path.join(path, "basic_columns.h5")
    with h5py.File(h5_path, "r") as h5file:
        frame_group = h5file["data_frame"]
        expected_rows = frame_group.attrs["row-count"][()]
        column_names = strings.load_string_vector_from_hdf5(frame_group["column_names"])

        # Row names are optional in the file.
        if "row_names" in frame_group:
            row_names = strings.load_string_vector_from_hdf5(frame_group["row_names"])

        data_group = frame_group["data"]
        for index, column in enumerate(column_names):
            key = str(index)

            # A column with no dataset/group under "data" is stored as a
            # separate object in the "other_columns" directory.
            if key not in data_group:
                external = os.path.join(path, "other_columns", key)
                columns[column] = alt_read_object(external, **kwargs)
                continue

            column_handle = data_group[key]
            type_name = strings.load_scalar_string_attribute_from_hdf5(column_handle, "type")
            if type_name == "factor":
                columns[column] = load_factor_from_hdf5(column_handle)
                continue

            python_type = misc.translate_type(type_name)
            # String columns are never requested as 1-D arrays.
            as_1darray = python_type != str and data_frame_represent_numeric_column_as_1darray
            columns[column] = load_vector_from_hdf5(
                column_handle,
                python_type,
                report_1darray=as_1darray,
            )

    frame = BiocFrame(
        columns,
        number_of_rows=expected_rows,
        row_names=row_names,
        column_names=column_names,
    )

    # Optional frame-level metadata.
    metadata_dir = os.path.join(path, "other_annotations")
    if os.path.exists(metadata_dir):
        loaded_metadata = alt_read_object(metadata_dir, **kwargs)
        frame.set_metadata(loaded_metadata.as_dict(), in_place=True)

    # Optional per-column annotations.
    column_data_dir = os.path.join(path, "column_annotations")
    if os.path.exists(column_data_dir):
        loaded_column_data = alt_read_object(column_data_dir, **kwargs)
        frame.set_column_data(loaded_column_data, in_place=True)

    return frame