DataFrame.js

import { DataFrame } from "bioconductor";
import { H5Group, H5DataSet } from "./h5.js";
import { readObject, readObjectFile, saveObject } from "./general.js";

/**
 * A data frame of columnar data.
 * @external DataFrame
 * @see {@link https://ltla.github.io/bioconductor.js/DataFrame.html}
 */

/**
 * Decode the raw values of an atomic column, i.e., one stored as a single HDF5 dataset.
 *
 * @param {string} type - Value of the column's `type` attribute.
 * @param {Array|TypedArray} rawvals - Values read from the dataset.
 * @param {boolean} has_missing - Whether the dataset has a `missing-value-placeholder` attribute.
 * @param {*} missing_attr - Value of the placeholder, only used if `has_missing = true`.
 *
 * @return {?(Array|TypedArray)} Decoded values where any placeholders are replaced with `null`.
 * If `type` is not recognized, `null` is returned so that the caller can report the offending column.
 * @private
 */
function decodeAtomicColumn(type, rawvals, has_missing, missing_attr) {
    const nullify = (vals, is_missing) => {
        for (let i = 0; i < vals.length; i++) {
            if (is_missing(vals[i])) {
                vals[i] = null;
            }
        }
        return vals;
    };

    // Loose equality is deliberate: the dataset and its attribute may be surfaced with
    // different numeric representations (e.g., BigInt versus Number) that should still match.
    const is_placeholder = (v) => v == missing_attr;

    switch (type) {
        case "number": {
            if (!has_missing) {
                return new Float64Array(rawvals); // force it to be floating-point.
            }
            // NaN never compares equal to itself, so a NaN placeholder needs its own check.
            const check = Number.isNaN(missing_attr) ? ((v) => Number.isNaN(v)) : is_placeholder;
            return nullify(Array.from(rawvals), check);
        }

        case "boolean": {
            let vals = new Array(rawvals.length);
            for (let i = 0; i < rawvals.length; i++) {
                if (has_missing && rawvals[i] == missing_attr) {
                    vals[i] = null;
                } else {
                    vals[i] = (rawvals[i] != 0);
                }
            }
            return vals;
        }

        case "integer": {
            if (!has_missing) {
                return rawvals.slice(); // make a copy, to be safe.
            }
            return nullify(Array.from(rawvals), is_placeholder);
        }

        case "string": {
            let vals = rawvals.slice(); // make a copy, to be safe.
            return has_missing ? nullify(vals, is_placeholder) : vals;
        }

        default:
            return null;
    }
}

/**
 * Expand factor codes into an array of level strings.
 * Factors are read as string vectors as there is no separate representation for a factor in Javascript.
 *
 * @param {Array|TypedArray} codes - Indices into `levels`.
 * @param {Array} levels - Factor levels.
 * @param {boolean} has_missing - Whether `codes` has a missing value placeholder.
 * @param {*} missing_attr - Value of the placeholder, only used if `has_missing = true`.
 *
 * @return {Array} Array with one level (or `null`, for a missing code) per entry of `codes`.
 * @private
 */
function decodeFactor(codes, levels, has_missing, missing_attr) {
    let vals = new Array(codes.length);
    for (let i = 0; i < codes.length; i++) {
        if (has_missing && codes[i] == missing_attr) {
            vals[i] = null;
        } else {
            vals[i] = levels[codes[i]];
        }
    }
    return vals;
}

/**
 * Decode a variable-length string column from its heap and pointers.
 *
 * @param {TypedArray} heap - Bytes of all strings.
 * @param {Array} pointers - Array of objects with the `offset` and `length` properties, specifying the slice of `heap` for each string.
 * @param {boolean} has_missing - Whether the column has a missing value placeholder.
 * @param {*} missing_attr - Value of the placeholder, only used if `has_missing = true`.
 * This is compared to the decoded strings.
 *
 * @return {Array} Array with one string (or `null`, for a missing value) per entry of `pointers`.
 * @private
 */
function decodeVls(heap, pointers, has_missing, missing_attr) {
    let vals = new Array(pointers.length);
    let dec = new TextDecoder;
    for (let i = 0; i < pointers.length; i++) {
        const { offset, length } = pointers[i];
        let current = heap.slice(Number(offset), Number(offset + length));
        // Strings are truncated at the first null byte within their slice.
        let early = current.indexOf(0);
        if (early >= 0) {
            current = current.slice(0, early);
        }
        vals[i] = dec.decode(current);
    }

    if (has_missing) {
        for (let i = 0; i < vals.length; i++) {
            if (vals[i] == missing_attr) {
                vals[i] = null;
            }
        }
    }
    return vals;
}

/**
 * Read a {@link external:DataFrame DataFrame} from its on-disk representation.
 * Basic columns are read from `basic_columns.h5` inside `path`, while columns containing nested objects are read from the `other_columns` subdirectory.
 *
 * @param {string} path - Path to the takane-formatted object directory containing the {@link external:DataFrame DataFrame}.
 * @param {object} metadata - Takane object metadata, typically generated by calling {@linkcode readObjectFile} on `path`.
 * @param {object} globals - Object containing `fs`, an object satisfying the {@link GlobalFsInterface}; and `h5`, an object satisfying the {@link GlobalH5Interface}.
 * @param {object} [options={}] - Further options.
 * @param {function|boolean} [options.DataFrame_readNested=true] - How to read columns containing nested objects.
 * If `true`, {@linkcode readObject} is used, while if `false`, nested objects will be skipped.
 * If a function is provided, it should accept `nrow` (the number of rows in the data frame) as well as `path`, `metadata`, `globals` and `options` (as described above);
 * and should return an object (possibly asynchronously) for which [`NUMBER_OF_ROWS`](https://ltla.github.io/bioconductor.js/global.html#NUMBER_OF_ROWS) is equal to `nrow`.
 *
 * @return {external:DataFrame} The data frame.
 * Missing values in basic columns are represented by `null`.
 * If nested columns are skipped, they are omitted from both the columns and the column order.
 * @async
 */
export async function readDataFrame(path, metadata, globals, options = {}) {
    let read_nested = true;
    if ("DataFrame_readNested" in options) {
        read_nested = options.DataFrame_readNested;
    }

    // Every open handle lives on this stack so that the 'finally' block can release
    // whatever is still open if something throws halfway through.
    let handle_stack = [];
    const openChild = (parent, name) => {
        let handle = parent.open(name);
        handle_stack.push(handle);
        return handle;
    };
    const closeLatest = () => {
        handle_stack[handle_stack.length - 1].close();
        handle_stack.pop();
    };
    const readPlaceholder = (handle) => {
        if (handle.attributes().indexOf("missing-value-placeholder") < 0) {
            return { has_missing: false, missing_attr: undefined };
        }
        return { has_missing: true, missing_attr: handle.readAttribute("missing-value-placeholder").values[0] };
    };

    let contents = await globals.fs.get(path + "/basic_columns.h5");

    try {
        let fhandle = await globals.h5.open(contents, { readOnly: true });
        handle_stack.push(fhandle);
        let ghandle = openChild(fhandle, "data_frame");
        let dhandle = openChild(ghandle, "data");

        let cnhandle = openChild(ghandle, "column_names");
        let nrows = Number(ghandle.readAttribute("row-count").values[0]);
        let colnames = cnhandle.values();
        closeLatest();

        let collected = {};
        let kept_colnames = [];
        let skip_nested = false;
        let kids = new Set(dhandle.children());

        for (let i = 0; i < colnames.length; i++) {
            let k = colnames[i];
            let iname = String(i);

            // Columns without an entry in the 'data' group are nested objects that live in 'other_columns'.
            if (!kids.has(iname)) {
                if (read_nested === false) {
                    skip_nested = true;
                    continue;
                }
                let nest_path = path + "/other_columns/" + iname;
                let nest_meta = await readObjectFile(nest_path, globals);
                if (read_nested === true) {
                    collected[k] = await readObject(nest_path, nest_meta, globals, options);
                } else {
                    collected[k] = await read_nested(nrows, nest_path, nest_meta, globals, options);
                }
                kept_colnames.push(k);
                continue;
            }

            let child_handle = openChild(dhandle, iname);

            if (child_handle instanceof H5DataSet) {
                let rawvals = child_handle.values();
                let type = child_handle.readAttribute("type").values[0];
                let { has_missing, missing_attr } = readPlaceholder(child_handle);

                let vals = decodeAtomicColumn(type, rawvals, has_missing, missing_attr);
                if (vals === null) {
                    throw new Error("unknown type '" + type + "' in column '" + k + "' of a DataFrame at '" + path + "'");
                }
                collected[k] = vals;

            } else if (child_handle instanceof H5Group) {
                let type = child_handle.readAttribute("type").values[0];

                if (type == "factor") {
                    let lhandle = openChild(child_handle, "levels");
                    let levels = lhandle.values();
                    closeLatest();

                    let cohandle = openChild(child_handle, "codes");
                    let codes = cohandle.values();
                    let { has_missing, missing_attr } = readPlaceholder(cohandle);
                    collected[k] = decodeFactor(codes, levels, has_missing, missing_attr);
                    closeLatest();

                } else if (type == "vls") {
                    let hhandle = openChild(child_handle, "heap");
                    let heap = hhandle.values();
                    closeLatest();

                    let phandle = openChild(child_handle, "pointers");
                    let pointers = phandle.values();
                    let { has_missing, missing_attr } = readPlaceholder(phandle);
                    collected[k] = decodeVls(heap, pointers, has_missing, missing_attr);
                    closeLatest();

                } else {
                    throw new Error("unknown type '" + type + "' in column '" + k + "' of a DataFrame at '" + path + "'");
                }

            } else {
                throw new Error("unknown type for column '" + k + "' at path '" + path + "'");
            }

            closeLatest(); // this is the child_handle.
            kept_colnames.push(k);
        }

        let rownames = null;
        let gkids = ghandle.children();
        if (gkids.indexOf("row_names") >= 0) {
            let rnhandle = openChild(ghandle, "row_names");
            rownames = rnhandle.values();
            closeLatest();
        }

        // The kept names are recorded as columns are collected, rather than probing 'collected' with 'in'.
        // The latter also consults the prototype chain, so a skipped column named, e.g., 'toString'
        // would be incorrectly retained in the column order.
        if (skip_nested) {
            colnames = kept_colnames;
        }

        return new DataFrame(collected, { columnOrder: colnames, numberOfRows: nrows, rowNames: rownames });

    } finally {
        // Most recently opened handles are closed first.
        for (let i = handle_stack.length - 1; i >= 0; i--) {
            handle_stack[i].close();
        }
        if (handle_stack.length > 0) {
            await globals.h5.close(handle_stack[0]);
        }
        if (typeof contents == "string") {
            await globals.fs.clean(contents);
        }
    }
}

// Typed array columns map directly onto a HDF5 datatype and a takane column type.
// Each entry is [ constructor, HDF5 datatype, takane type ].
const TYPED_COLUMN_FORMATS = [
    [Uint8Array, "Uint8", "integer"],
    [Int8Array, "Int8", "integer"],
    [Uint16Array, "Uint16", "integer"],
    [Int16Array, "Int16", "integer"],
    [Uint32Array, "Uint32", "number"], // only up to int32 is supported by 'integer'.
    [Int32Array, "Int32", "integer"],
    [BigUint64Array, "Float64", "number"], // only up to int32 is supported by 'integer'.
    [BigInt64Array, "Float64", "number"], // only up to int32 is supported by 'integer'.
    [Float32Array, "Float32", "number"],
    [Float64Array, "Float64", "number"],
];

/**
 * Write a single basic column as a dataset inside the `data` group.
 *
 * @param {object} dhandle - Handle to the `data` group.
 * @param {Array} handle_stack - Stack of open handles, used by the caller for cleanup if an error is thrown.
 * @param {string} iname - Name of the dataset, i.e., the stringified column index.
 * @param {string} h5type - HDF5 datatype of the dataset, also used for the placeholder attribute.
 * @param {Array|TypedArray} data - Values to be written.
 * @param {string} type - Value of the `type` attribute.
 * @param {*} [placeholder] - Missing value placeholder.
 * If `undefined`, no `missing-value-placeholder` attribute is written.
 * @private
 */
function writeBasicColumn(dhandle, handle_stack, iname, h5type, data, type, placeholder) {
    let chandle = dhandle.createDataSet(iname, h5type, [ data.length ], { data: data });
    handle_stack.push(chandle);
    chandle.writeAttribute("type", "String", [], [ type ]);
    if (placeholder !== undefined) {
        chandle.writeAttribute("missing-value-placeholder", h5type, [], [ placeholder ]);
    }
    chandle.close();
    handle_stack.pop();
}

/**
 * Choose a double-precision value that does not occur in `col`, to represent missing values.
 *
 * @param {Array} col - Array of numbers, possibly containing `null` or `undefined`.
 * @return {number} A value that is not present in `col`.
 * @private
 */
function chooseNumberPlaceholder(col) {
    if (!col.some((v) => Number.isNaN(v))) {
        return Number.NaN;
    }

    let present = new Set(col);
    for (const candidate of [0, Number.POSITIVE_INFINITY, Number.NEGATIVE_INFINITY, Number.MAX_VALUE, -Number.MAX_VALUE]) {
        if (!present.has(candidate)) {
            return candidate;
        }
    }

    // Fall back to the midpoint between two neighbouring values.
    // Only finite values are sorted: NaN and null would make the comparator inconsistent,
    // in which case the "neighbours" need not be adjacent and the midpoint could collide with real data.
    let sorted = Array.from(present).filter((v) => Number.isFinite(v)).sort((a, b) => a - b);
    let last = -Number.MAX_VALUE;
    for (const v of sorted) {
        let candidate = last + (v - last) / 2;
        if (candidate != last && candidate != v) {
            return candidate;
        }
        last = v;
    }

    throw new Error("failed to find a missing value placeholder for a number column");
}

/**
 * Attempt to save an `Array` column as a basic column by guessing the type of its contents.
 *
 * @param {object} dhandle - Handle to the `data` group.
 * @param {Array} handle_stack - Stack of open handles, used by the caller for cleanup if an error is thrown.
 * @param {string} iname - Name of the dataset, i.e., the stringified column index.
 * @param {Array} col - Contents of the column.
 *
 * @return {boolean} Whether `col` was saved.
 * This is `false` if the non-missing entries are not all numbers, all booleans or all strings.
 * @private
 */
function saveArrayColumn(dhandle, handle_stack, iname, col) {
    // Try to guess the type of everything.
    let types = new Set;
    let has_missing = false;
    for (const entry of col) {
        if (entry == null) {
            has_missing = true;
        } else {
            types.add(typeof entry);
        }
    }

    // Columns with no non-missing entries (empty or all-missing) have no type to guess,
    // so they are saved as booleans with every entry set to the placeholder.
    if (types.size == 0 || (types.size == 1 && types.has("boolean"))) {
        let vals = new Uint8Array(col.length);
        for (let i = 0; i < col.length; i++) {
            if (col[i] == null) {
                vals[i] = 2;
            } else {
                vals[i] = col[i];
            }
        }
        writeBasicColumn(dhandle, handle_stack, iname, "Uint8", vals, "boolean", (has_missing ? 2 : undefined));
        return true;
    }

    if (types.size != 1) {
        return false;
    }

    // Javascript doesn't have native integers, so we'll save it all as 'number'.
    if (types.has("number")) {
        let placeholder;
        let data = col;
        if (has_missing) {
            placeholder = chooseNumberPlaceholder(col);
            // Array.from() also visits holes in sparse arrays, which were counted as missing above.
            data = Array.from(col, (v) => (v == null ? placeholder : v));
        }
        writeBasicColumn(dhandle, handle_stack, iname, "Float64", data, "number", placeholder);
        return true;
    }

    if (types.has("string")) {
        let placeholder;
        let data = col;
        if (has_missing) {
            placeholder = "NA";
            while (col.indexOf(placeholder) >= 0) {
                placeholder += "_";
            }
            // Both null and undefined are replaced, consistent with how missingness was detected.
            data = Array.from(col, (v) => (v == null ? placeholder : v));
        }
        // Not saving as VLS for simplicity.
        writeBasicColumn(dhandle, handle_stack, iname, "String", data, "string", placeholder);
        return true;
    }

    return false;
}

/**
 * Save a {@link external:DataFrame DataFrame} to disk.
 * Typed arrays and arrays of numbers, booleans or strings (possibly with `null`s) are saved as basic columns in `basic_columns.h5`.
 * All other columns are saved as nested objects in the `other_columns` subdirectory via {@linkcode saveObject}.
 *
 * @param {external:DataFrame} x - The data frame.
 * @param {string} path - Path to the directory in which to save `x`.
 * @param {object} globals - Object containing `fs`, an object satisfying the {@link GlobalFsInterface}; and `h5`, an object satisfying the {@link GlobalH5Interface}.
 * @param {object} [options={}] - Further options, passed on to {@linkcode saveObject} for nested columns.
 *
 * @return `x` is stored at `path`.
 * @async
 */
export async function saveDataFrame(x, path, globals, options = {}) {
    await globals.fs.mkdir(path);
    await globals.fs.write(path + "/OBJECT", JSON.stringify({ type: "data_frame", data_frame: { version: "1.1" } }));

    let externals = {};
    let handle_stack = [];
    try {
        let fhandle = await globals.h5.open(path + "/basic_columns.h5", { readOnly: false });
        handle_stack.push(fhandle);

        let ghandle = fhandle.createGroup("data_frame");
        handle_stack.push(ghandle);
        ghandle.writeAttribute("row-count", "Uint64", [], [x.numberOfRows()]);
        ghandle.createDataSet("column_names", "String", [ x.numberOfColumns() ], { data: x.columnNames(), returnHandle: false });
        if (x.rowNames() != null) {
            ghandle.createDataSet("row_names", "String", [ x.numberOfRows() ], { data: x.rowNames(), returnHandle: false });
        }

        // Registered on the stack like every other handle, so that it is closed in the 'finally' block.
        let dhandle = ghandle.createGroup("data");
        handle_stack.push(dhandle);

        let colnames = x.columnNames();
        for (let i = 0; i < colnames.length; i++) {
            let iname = String(i);
            let col = x.column(colnames[i]);

            let format = TYPED_COLUMN_FORMATS.find(([ctor]) => col instanceof ctor);
            if (format !== undefined) {
                writeBasicColumn(dhandle, handle_stack, iname, format[1], col, format[2]);
            } else if (col instanceof Array) {
                if (!saveArrayColumn(dhandle, handle_stack, iname, col)) {
                    externals[iname] = col;
                }
            } else {
                externals[iname] = col;
            }
        }

    } finally {
        // Most recently opened handles are closed first.
        for (let i = handle_stack.length - 1; i >= 0; i--) {
            handle_stack[i].close();
        }
        if (handle_stack.length > 0) {
            await globals.h5.close(handle_stack[0]);
        }
    }

    let external_array = Object.entries(externals);
    if (external_array.length > 0) {
        await globals.fs.mkdir(path + "/other_columns");
        for (const [iname, col] of external_array) {
            await saveObject(col, path + "/other_columns/" + iname, globals, options);
        }
    }
}