DataFrame.js

import { DataFrame, List, IntegerList, NumberList, BooleanList, StringList } from "bioconductor";
import { H5Group, H5DataSet } from "./h5.js";
import { readObject, readObjectFile, saveObject } from "./general.js";
import { joinPath, formatNumberArrayForHdf5, formatIntegerArrayForHdf5, formatStringArrayForHdf5, formatBooleanArrayForHdf5, jsonBuffer } from "./utils.js";
import { readAnnotatedMetadata, saveAnnotatedMetadata } from "./metadata.js";

/**
 * A data frame of columnar data.
 * @external DataFrame
 * @see {@link https://ltla.github.io/bioconductor.js/DataFrame.html}
 */

/**
 * @param {string} path - Path to the takane-formatted object directory containing the {@link external:DataFrame DataFrame}.
 * @param {object} metadata - Takane object metadata, typically generated by calling {@linkcode readObjectFile} on `path`.
 * @param {object} globals - Object satisfying the {@link GlobalsInterface}.
 * @param {object} [options={}] - Further options.
 * @param {function|boolean} [options.DataFrame_readNested=true] - How to read columns containing nested objects.
 * If `true`, {@linkcode readObject} is used, while if `false`, nested objects will be skipped.
 * If a function is provided, it should accept `nrow` (the number of rows in the data frame) as well as `path`, `metadata`, `globals` and `options` (as described above);
 * and should return an object (possibly asynchronously) for which [`NUMBER_OF_ROWS`](https://ltla.github.io/bioconductor.js/global.html#NUMBER_OF_ROWS) is equal to `nrow`. 
 * @param {function|boolean} [options.DataFrame_readMetadata=true] - How to read the metadata.
 * If `true`, {@linkcode readObject} is used, while if `false`, metadata will be skipped.
 * If a function is provided, it should accept `path`, `metadata`, `globals` and `options` (as described above), and return a {@link external:List List}.
 * @param {boolean} [options.DataFrame_toTypedArray=false] - Whether to report integer/number vectors without missing values as TypedArrays.
 * If `false`, vectors are reported as instances of an appropriately-typed {@link List} subclass.
 *
 * @return {external:DataFrame} The data frame.
 * @async
 */
export async function readDataFrame(path, metadata, globals, options = {}) {
    let read_nested = true; 
    if ("DataFrame_readNested" in options) {
        read_nested = options.DataFrame_readNested;
    } 
    let typedarray = true;
    if ("DataFrame_toTypedArray" in options) {
        typedarray = options.DataFrame_toTypedArray;
    }

    let fhandle = await globals.h5open(joinPath(path, "basic_columns.h5")); 
    let handle_stack = [fhandle];
    try {
        let ghandle = fhandle.open("data_frame");
        handle_stack.push(ghandle);
        let dhandle = ghandle.open("data");
        handle_stack.push(dhandle);

        let cnhandle = ghandle.open("column_names");
        handle_stack.push(cnhandle);
        let nrows = Number(ghandle.readAttribute("row-count").values[0]);
        let colnames = cnhandle.values();
        cnhandle.close();
        handle_stack.pop();

        let collected = {};
        let skip_nested = false;
        let kids = dhandle.children();
        for (const [i, k] of Object.entries(colnames)) {
            let iname = String(i)
            if (kids.indexOf(iname) < 0) {
                if (read_nested !== false) {
                    let nest_path = joinPath(path, "other_columns", iname);
                    let nest_meta = await readObjectFile(nest_path, globals);
                    if (read_nested === true) {
                        collected[k] = await readObject(nest_path, nest_meta, globals, options);
                    } else {
                        collected[k] = await read_nested(nrows, nest_path, nest_meta, globals, options);
                    }
                } else {
                    skip_nested = true;
                }
                continue;
            }

            let child_handle = dhandle.open(iname);
            handle_stack.push(child_handle);

            if (child_handle instanceof H5DataSet) {
                let vals;
                let rawvals = child_handle.values();
                let type = child_handle.readAttribute("type").values[0];

                let child_attrs = child_handle.attributes();
                let has_missing = child_attrs.indexOf("missing-value-placeholder") >= 0;
                let missing_attr;
                if (has_missing) {
                    missing_attr = child_handle.readAttribute("missing-value-placeholder").values[0];
                }

                if (type == "number") {
                    if (has_missing || !typedarray) {
                        vals = Array.from(rawvals)
                        if (Number.isNaN(missing_attr)) {
                            for (let i = 0; i < vals.length; i++) {
                                if (Number.isNaN(vals[i])) {
                                    vals[i] = null;
                                }
                            }
                        } else {
                            for (let i = 0; i < vals.length; i++) {
                                if (vals[i] == missing_attr) {
                                    vals[i] = null;
                                }
                            }
                        }
                        vals = new NumberList(vals);
                    } else {
                        vals = new Float64Array(rawvals); // force it to be floating-point.
                    }

                } else if (type == "boolean") {
                    vals = new Array(rawvals.length)
                    if (has_missing) {
                        for (let i = 0; i < rawvals.length; i++) {
                            if (rawvals[i] == missing_attr) {
                                vals[i] = null;
                            } else {
                                vals[i] = (rawvals[i] != 0);
                            }
                        }
                    } else {
                        for (let i = 0; i < rawvals.length; i++) {
                            vals[i] = (rawvals[i] != 0);
                        }
                    }
                    vals = new BooleanList(vals);

                } else if (type == "integer") {
                    if (has_missing || !typedarray) {
                        vals = Array.from(rawvals);
                        for (let i = 0; i < vals.length; i++) {
                            if (vals[i] == missing_attr) {
                                vals[i] = null;
                            }
                        }
                        vals = new IntegerList(vals);
                    } else {
                        vals = new Int32Array(rawvals);
                    }

                } else if (type == "string") {
                    vals = rawvals.slice(); // make a copy, to be safe.
                    if (has_missing) {
                        for (let i = 0; i < vals.length; i++) {
                            if (vals[i] == missing_attr) {
                                vals[i] = null;
                            }
                        }
                    }
                    vals = new StringList(vals);

                } else {
                    throw new Error("unknown type '" + type + "' in column '" + k + "' of a DataFrame at '" + path + "'");
                }

                collected[k] = vals;

            } else if (child_handle instanceof H5Group) {
                let type = child_handle.readAttribute("type").values[0];

                if (type == "factor") {
                    let lhandle = child_handle.open("levels");
                    handle_stack.push(lhandle);
                    let levels = lhandle.values();
                    lhandle.close();
                    handle_stack.pop();

                    let cohandle = child_handle.open("codes");
                    handle_stack.push(cohandle);
                    let codes = cohandle.values();
                    let code_attrs = cohandle.attributes();

                    // Just reading factors as string vectors here, as we don't have a separate
                    // representation in Javascript for a factor.
                    let vals = Array(codes.length);
                    if (code_attrs.indexOf("missing-value-placeholder") >= 0) {
                        let missing_attr = cohandle.readAttribute("missing-value-placeholder").values[0];
                        for (let i = 0; i < codes.length; i++) {
                            if (codes[i] == missing_attr) {
                                vals[i] = null;
                            } else {
                                vals[i] = levels[codes[i]];
                            }
                        }
                    } else {
                        for (let i = 0; i < codes.length; i++) {
                            vals[i] = levels[codes[i]];
                        }
                    }

                    collected[k] = new StringList(vals);
                    cohandle.close();
                    handle_stack.pop();

                } else if (type == "vls") {
                    let hhandle = child_handle.open("heap");
                    handle_stack.push(hhandle);
                    let heap = hhandle.values();
                    hhandle.close();
                    handle_stack.pop();

                    let phandle = child_handle.open("pointers");
                    handle_stack.push(phandle);
                    let pointers = phandle.values();
                    let pointer_attrs = phandle.attributes();

                    let vals = new Array(pointers.length);
                    let dec = new TextDecoder;
                    for (let i = 0; i < pointers.length; i++) {
                        const { offset, length } = pointers[i];
                        let current = heap.slice(Number(offset), Number(offset + length));
                        let early = current.indexOf(0);
                        if (early >= 0) {
                            current = current.slice(0, early);
                        }
                        vals[i] = dec.decode(current);
                    }

                    if (pointer_attrs.indexOf("missing-value-placeholder") >= 0) {
                        let missing_attr = phandle.readAttribute("missing-value-placeholder").values[0];
                        for (let i = 0; i < vals.length; i++) {
                            if (vals[i] == missing_attr) {
                                vals[i] = null;
                            }
                        }
                    }

                    collected[k] = new StringList(vals);
                    phandle.close();
                    handle_stack.pop();

                } else {
                    throw new Error("unknown type '" + type + "' in column '" + k + "' of a DataFrame at '" + path + "'");
                }

            } else {
                throw new Error("unknown type for column '" + k + "' at path '" + path + "'");
            }

            child_handle.close();
            handle_stack.pop();
        }

        let rownames = null;
        let gkids = ghandle.children();
        if (gkids.indexOf("row_names") >= 0) {
            let rnhandle = ghandle.open("row_names");
            handle_stack.push(rnhandle);
            rownames = rnhandle.values();
            rnhandle.close();
            handle_stack.pop();
        }

        if (skip_nested) {
            let new_colnames = [];
            for (const cn of colnames) {
                if (cn in collected) {
                    new_colnames.push(cn);
                }
            }
            colnames = new_colnames;
        }

        let metadata = await readAnnotatedMetadata(joinPath(path, "other_annotations"), globals, options, "DataFrame_readMetadata")
        return new DataFrame(collected, { columnOrder: colnames, numberOfRows: nrows, rowNames: rownames, metadata: metadata });

    } finally {
        for (const handle of handle_stack.toReversed()) {
            handle.close();
        }
        await globals.h5close(fhandle);
    }
}

/**
 * @param {external:DataFrame} x - The data frame.
 * @param {string} path - Path to the directory in which to save `x`.
 * @param {object} globals - Object satisfying the {@link GlobalsInterface}.
 * @param {object} [options={}] - Further options.
 * @param {function} [?options.DataFrame_saveOther=null] - Function to save custom class instances as columns of a data frame, without resorting to a reference to an external object.
 * This should accept `y`, an instance of a custom object; `handle`, the {@link H5Group} in which `y` is to be saved; and `name`, the name of the child of `handle` in which to save `y`. 
 * It should return `true` if `y` was saved and `false` otherwise (e.g., if it does not know how to handle the class of`y`).
 *
 * @return `x` is stored at `path`.
 * @async
 */
export async function saveDataFrame(x, path, globals, options = {}) {
    await globals.mkdir(path);
    await globals.write(joinPath(path, "OBJECT"), jsonBuffer({ type: "data_frame", data_frame: { version: "1.1" } }));

    let externals = {};
    let success = false;

    let fhandle = await globals.h5create(joinPath(path, "basic_columns.h5"));
    let handle_stack = [fhandle];
    try {
        let ghandle = fhandle.createGroup("data_frame");
        handle_stack.push(ghandle);
        ghandle.writeAttribute("row-count", "Uint64", [], [x.numberOfRows()]);
        ghandle.createDataSet("column_names", "String", [ x.numberOfColumns() ], { data: x.columnNames() }).close();
        if (x.rowNames() != null) {
            ghandle.createDataSet("row_names", "String", [ x.numberOfRows() ], { data: x.rowNames() }).close();
        }

        let dhandle = ghandle.createGroup("data");
        for (const [i, k] of Object.entries(x.columnNames())) {
            let iname = String(i);
            let col = x.column(k);

            if (col instanceof Uint8Array) {
                let chandle = dhandle.createDataSet(iname, "Uint8", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["integer"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Int8Array) {
                let chandle = dhandle.createDataSet(iname, "Int8", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["integer"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Uint16Array) {
                let chandle = dhandle.createDataSet(iname, "Uint16", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["integer"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Int16Array) {
                let chandle = dhandle.createDataSet(iname, "Int16", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["integer"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Uint32Array) {
                let chandle = dhandle.createDataSet(iname, "Uint32", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]); // only up to int32 is supported by 'integer'.
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Int32Array) {
                let chandle = dhandle.createDataSet(iname, "Int32", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["integer"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof BigUint64Array) {
                let chandle = dhandle.createDataSet(iname, "Float64", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]); // only up to int32 is supported by 'integer'.
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof BigInt64Array) {
                let chandle = dhandle.createDataSet(iname, "Float64", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]); // only up to int32 is supported by 'integer'.
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Float32Array) {
                let chandle = dhandle.createDataSet(iname, "Float32", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Float64Array) {
                let chandle = dhandle.createDataSet(iname, "Float64", [ col.length ], { data: col });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]);
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof IntegerList) {
                let formatted = formatIntegerArrayForHdf5(col.toArray());
                let dtype;
                let htype;
                if (formatted.integer) {
                    dtype = "integer";
                    htype = "Int32";
                } else {
                    dtype = "number";
                    htype = "Float64";
                }
                let chandle = dhandle.createDataSet(iname, htype, [ formatted.data.length ], { data: formatted.data });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], [dtype]);
                if (formatted.placeholder !== null) {
                    chandle.writeAttribute("missing-value-placeholder", htype, [], [formatted.placeholder]);
                }
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof NumberList) {
                let formatted = formatNumberArrayForHdf5(col.toArray());
                let chandle = dhandle.createDataSet(iname, "Float64", [ formatted.data.length ], { data: formatted.data });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["number"]);
                if (formatted.placeholder !== null) {
                    chandle.writeAttribute("missing-value-placeholder", "Float64", [], [formatted.placeholder]);
                }
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof StringList) {
                let formatted = formatStringArrayForHdf5(col.toArray());
                let chandle = dhandle.createDataSet(iname, "String", [ formatted.data.length ], { data: formatted.data });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["string"]);
                if (formatted.placeholder !== null) {
                    chandle.writeAttribute("missing-value-placeholder", "String", [], [formatted.placeholder]);
                }
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof BooleanList) {
                let formatted = formatBooleanArrayForHdf5(col.toArray());
                let chandle = dhandle.createDataSet(iname, "Int8", [ formatted.data.length ], { data: formatted.data });
                handle_stack.push(chandle);
                chandle.writeAttribute("type", "String", [], ["boolean"]);
                if (formatted.placeholder !== null) {
                    chandle.writeAttribute("missing-value-placeholder", "Int8", [], [formatted.placeholder]);
                }
                chandle.close();
                handle_stack.pop();

            } else if (col instanceof Array) {
                // Try to guess the type of everything.
                let types = new Set;
                let has_missing = false;
                for (const entry of col) {
                    if (entry == null) {
                        has_missing = true;
                    } else {
                        types.add(typeof entry);
                    }
                }

                let okay = false;
                if (types.size == 0) {
                    let chandle = dhandle.createDataSet(iname, "Uint8", [ df.numberOfRows() ], { data: new Uint8Array(df.numberOfRows()) });
                    handle_stack.push(chandle);
                    chandle.writeAttribute("type", "String", [], [ "boolean" ]);
                    chandle.close();
                    handle_stack.pop();
                    okay = true;

                } else if (types.size == 1) {
                    // Javascript doesn't have native integers, so we'll save it all as 'number'.
                    if (types.has("number")) {
                        let formatted = formatNumberArrayForHdf5(col);
                        let chandle = dhandle.createDataSet(iname, "Float64", [ col.length ], { data: formatted.data });
                        handle_stack.push(chandle);
                        chandle.writeAttribute("type", "String", [], ["number"]);
                        if (formatted.placeholder !== null) {
                            chandle.writeAttribute("missing-value-placeholder", "Float64", [], [ formatted.placeholder ]);
                        }
                        chandle.close();
                        handle_stack.pop();
                        okay = true;

                    } else if (types.has("boolean")) {
                        let formatted = formatBooleanArrayForHdf5(col);
                        let chandle = dhandle.createDataSet(iname, "Uint8", [ col.length ], { data: formatted.data });
                        handle_stack.push(chandle);
                        chandle.writeAttribute("type", "String", [], ["boolean"]);
                        if (formatted.placeholder !== null) {
                            chandle.writeAttribute("missing-value-placeholder", "Uint8", [], [ formatted.placeholder ]);
                        }
                        chandle.close();
                        handle_stack.pop();
                        okay = true;

                    } else if (types.has("string")) {
                        let formatted = formatStringArrayForHdf5(col);
                        let chandle = dhandle.createDataSet(iname, "String", [ col.length ], { data: formatted.data });
                        handle_stack.push(chandle);
                        chandle.writeAttribute("type", "String", [], ["string"]);
                        if (has_missing) {
                            chandle.writeAttribute("missing-value-placeholder", "String", [], [ formatted.placeholder ]);
                        }
                        chandle.close();
                        handle_stack.pop();
                        okay = true;
                    }
                }

                if (!okay) {
                    externals[iname] = new List(col);
                }

            } else {
                let handled = false;
                if ("DataFrame_saveOther" in options) {
                    handled = options.DataFrame_saveOther(col, dhandle, iname);
                }
                if (!handled) {
                    externals[iname] = col;
                }
            }
        }

        success = true;
    } finally {
        for (const handle of handle_stack.toReversed()) {
            handle.close();
        }
        await globals.h5finish(fhandle, !success);
    }

    let external_array = Object.entries(externals);
    if (external_array.length > 0) {
        let other_dir = joinPath(path, "other_columns");
        await globals.mkdir(other_dir);
        for (const [iname, col] of external_array) {
            await saveObject(col, joinPath(other_dir, iname), globals, options);
        }
    }

    await saveAnnotatedMetadata(x.metadata(), joinPath(path, "other_annotations"), globals, options);
}