# Source code for mcsas3.McHDF

import inspect
from collections.abc import Iterable
from pathlib import Path, PurePosixPath

import h5py
import numpy as np
import pandas
import pint


class ResultIndex:
    """Helper functions for HDF5 storage of items. Appears as base class of many McSAS3 methods

    Holds the number of the result an object refers to and derives from it
    the HDF5 location of that result (see :py:attr:`nxsEntryPoint`).
    """

    resultIndex = 1  # per default number 1, but can be changed.

    def __init__(self, resultIndex: int) -> None:
        """Remember which result index to use.

        :param resultIndex: non-negative index of the result.
        :raises AssertionError: if *resultIndex* is ``None`` or negative; neither
            "append a new result" (``None``) nor "use the last existing one"
            (``-1``) is implemented yet.
        """
        # resultIndex = -1 should go to the last existing one
        assert (
            resultIndex is not None
        ), "setting resultIndex to None (append new result) is not implemented yet"
        assert resultIndex >= 0, (
            'resultIndex should be positive, "set to last existing" (resultIndex = -1) is not'
            " implemented yet"
        )
        self.resultIndex = resultIndex

    @property
    def nxsEntryPoint(self) -> PurePosixPath:
        """HDF5 path ``/analyses/MCResult<resultIndex>`` for the current index."""
        return PurePosixPath(f"/analyses/MCResult{self.resultIndex}")


def loadKVPairs(filename: Path, path: PurePosixPath, keys: Iterable) -> Iterable:
    """Lazily yield ``(key, value)`` pairs for the given *keys* below *path*.

    This is a generator function: nothing runs (not even the argument checks)
    until the result is iterated, and the HDF5 file is kept open for reading
    until the iteration finishes or the generator is closed.

    :param filename: HDF5 file to read from.
    :param path: HDF5 location under which each key is looked up.
    :param keys: names of the entries to read, each is resolved as ``path / key``.
    :return: iterator over ``(key, value)`` tuples, the value being the full
        content of the respective entry (``h5f[...][()]``).
    :raises AssertionError: on first iteration, if *filename* or *path* is ``None``.
    """
    assert filename is not None
    assert path is not None
    with h5py.File(filename, "r") as h5f:
        for key in keys:
            yield key, h5f[str(path / key)][()]


def _loadGroupAsDict(group) -> dict:
    """Read the members of an open HDF5 group into a plain dict.

    Direct sub-groups are unpacked one level deep into nested dicts (their
    values are taken as stored). Plain entries that are ``bytes``, or
    non-empty arrays whose first element is ``bytes``, are decoded to str.
    """
    result = {}
    for key, node in group.items():
        if isinstance(node, h5py.Group):  # it's a group, so needs to be unpacked too.
            # This should probably be a recursive function
            result[key] = {subKey: subNode[()] for subKey, subNode in node.items()}
            continue
        item = node[()]  # read once, every [()] access hits the file again
        if isinstance(item, np.ndarray):
            # special case: array of bytes objects that should've been strings.
            # The size check keeps empty arrays from failing on item[0].
            if item.size > 0 and isinstance(item[0], bytes):
                item = np.array([i.decode() for i in item])
        elif isinstance(item, bytes):
            item = item.decode()
        result[key] = item
    return result


def loadKV(
    filename: Path, path: PurePosixPath, datatype=None, default=None, dbg=False
):  # outputs any hdf5 value type
    """Load a single entry from an HDF5 file, optionally converting it.

    :param filename: HDF5 file to read from.
    :param path: HDF5 location of the entry.
    :param datatype: selects the conversion:

        - ``None``: the value as returned by h5py,
        - ``"str"``: bytes are decoded, anything else goes through ``str()``,
        - a class (e.g. ``Path``, ``int``, ``float``): the class is called on
          the value; for ``Path`` the value is converted to str first,
        - ``"dict"``: *path* is read as a group into a dict,
        - ``"dictToPandas"``: as ``"dict"``, then the entries ``columns``,
          ``index`` and ``data`` are assembled into a ``pandas.DataFrame``.
    :param default: returned if *path* does not exist in the file.
    :param dbg: if true, print the path being loaded.
    :return: the loaded (and converted) value, or *default*.
    :raises ValueError: if *path* exists but *datatype* is none of the above.
    """
    path = str(path)  # get a h5py compatible path
    if dbg:
        print(f"loadKV({path})")

    plainValue = datatype is None or datatype == "str" or inspect.isclass(datatype)
    groupValue = not plainValue and (datatype == "dict" or datatype == "dictToPandas")

    # a single open for both the existence check and the actual read
    with h5py.File(filename, "r") as h5f:
        if path not in h5f:
            return default
        if plainValue:
            value = h5f[path][()]
        elif groupValue:
            # these *may* have to be cast into the right datatype,
            # h5py seems to assume int for much of this data
            value = _loadGroupAsDict(h5f[path])
        else:
            raise ValueError(
                f"loadKV: unsupported datatype {datatype!r} for path '{path}', expected"
                " None, 'str', 'dict', 'dictToPandas' or a class"
            )

    if plainValue:
        if (datatype == "str" or datatype == Path) and not isinstance(value, str):
            if isinstance(value, (bytes, bytearray)):
                value = value.decode()
            else:
                # try this:
                value = str(value)
        if inspect.isclass(datatype):  # assuming it is something like Path, int or float here..
            value = datatype(value)

    if datatype == "dictToPandas":
        cols, idx, vals = (
            value.pop("columns"),
            value.pop("index"),
            value.pop("data"),
        )
        value = pandas.DataFrame(data=vals, columns=cols, index=idx)
        # ensure column names are str:
        value.columns = [
            (colname.decode("utf8") if isinstance(colname, bytes) else colname)
            for colname in value.columns
        ]

    return value


def storeKVPairs(filename: Path, path: PurePosixPath, pairs: Iterable) -> None:
    """Stores a given list of pairs (or iterable) to an HDF5 output file.

    Each ``(key, value)`` pair is handed to :py:func:`storeKV` with the
    location ``path / key``.

    :param filename: HDF5 output file.
    :param path: HDF5 location below which the pairs are stored.
    :param pairs: iterable of ``(key, value)`` tuples.
    :raises AssertionError: if *filename* or *path* is ``None``.
    :raises Exception: whatever storing a pair raises, it is re-raised after
        the offending key and value have been printed.
    """
    assert filename is not None
    assert path is not None
    for key, value in pairs:
        # Only the store itself is guarded: if iterating or unpacking *pairs*
        # fails, key/value are not bound (yet) and reporting them here would
        # replace the real error with an UnboundLocalError.
        try:
            storeKV(filename=filename, path=path / key, value=value)
        except Exception:
            print(f"Error for path {key} and value '{value}' of type {type(value)}.")
            raise


# TODO: move open file to storeKVPairs for efficiency


def _storeArrayDataset(h5g, key: str, value):
    """Write array-like *value* to dataset *key* in the open group *h5g*.

    An existing dataset of matching shape and compatible dtype is reused,
    anything else stored under *key* is deleted and created anew.
    Returns the dataset written to.
    """
    existed = key in h5g
    try:
        dset = h5g.require_dataset(key, data=value, shape=value.shape, dtype=value.dtype)
    except TypeError:
        if not existed:
            raise  # creation itself failed, there is nothing to replace
        # if it exists, but isn't of the right shape or compatible dtype:
        del h5g[key]
        dset = h5g.require_dataset(key, data=value, shape=value.shape, dtype=value.dtype)
        existed = False
    if existed and value.size:
        # require_dataset() uses `data` only when it creates the dataset, a
        # reused one still holds the old content and has to be overwritten.
        dset[...] = np.asarray(value)
    return dset


def _storeScalarDataset(h5g, key: str, value):
    """Write non-array *value* to dataset *key* in the open group *h5g*.

    An existing scalar dataset is overwritten in place; if that is not
    possible, or something else is stored under *key*, it is replaced.
    Returns the dataset written to.
    """
    # try and see if the destination already exists.. This can be done by require_dataset,
    # but that requires shape and dtype to be specified. This method doesn't:
    existing = h5g.get(key, None)
    if existing is None:
        return h5g.create_dataset(key, data=value)
    # Assigning to a non-scalar dataset would broadcast the value over all
    # of its elements, and a group cannot be assigned to at all.
    if isinstance(existing, h5py.Dataset) and existing.shape == ():
        try:
            existing[()] = value
            return existing
        except (TypeError, ValueError):
            pass  # value does not fit the stored dtype, replace the dataset below
    del h5g[key]
    return h5g.create_dataset(key, data=value)


def storeKV(filename: Path, path: PurePosixPath, value=None) -> None:
    """Stores the settings in an output file (HDF5)

    Dicts and DataFrames are stored recursively, one entry below *path* for
    each of their items. Other values are stored as a dataset named by the
    last component of *path*, in the group given by its parent:

    - a ``pint.Quantity`` is stored by magnitude, its unit goes into the
      ``unit`` attribute of the dataset,
    - a ``Path`` is stored as POSIX path string,
    - a ``pandas.Timestamp`` is stored as its ``timestamp()`` value,
    - tuples and lists are converted to arrays; unicode and object arrays are
      stored as variable-length strings,
    - ``None`` is not stored at all.

    :param filename: HDF5 output file, opened in append mode.
    :param path: HDF5 location of the value.
    :param value: the value to store.
    :raises AssertionError: if *filename* or *path* is ``None``.
    """
    assert filename is not None, "filename (output filename) cannot be empty"
    assert path is not None, "HDF5 path cannot be empty"

    # entering recursive traversal of hierachical maps
    if isinstance(value, (dict, pandas.DataFrame)):
        storeKVPairs(filename, path, value.items())
        return

    path, key = path.parent, path.name
    with h5py.File(filename, "a") as h5f:
        h5g = h5f.require_group(str(path))
        dset, unit = None, None
        if isinstance(value, pint.Quantity):
            value, unit = value.m, value.u
        if isinstance(value, Path):
            value = value.as_posix()
        if isinstance(value, pandas.Timestamp):
            value = value.timestamp()
        # store arrays: convert all compatible data types to arrays:
        if isinstance(value, (tuple, list)):
            value = np.array(value)
        if isinstance(value, (np.ndarray, pandas.Series)):
            # HDF cannot store unicode string arrays, these need to be stored as a special type:
            if str(value.dtype).startswith(("<U", "object")):
                # try casting it into str class
                value = value.astype(h5py.special_dtype(vlen=str))
            # store the data in the previously defined group:
            dset = _storeArrayDataset(h5g, key, value)
        # non-array values are stored here:
        elif value is not None:
            dset = _storeScalarDataset(h5g, key, value)

        # we are skipping None values for now, that case should be caught on load.
        if unit is not None:
            dset.attrs["unit"] = str(unit)