Source code for mcsas3.McData

from pathlib import Path, PurePosixPath
from typing import Optional

import h5py
import numpy as np
import pandas

import mcsas3.McHDF as McHDF

# TODO: use attrs to @define a McData dataclass


class McData:
    """A simple base class for a data carrier object that can load from a range of sources,
    and do rebinning for too large datasets.

    This is inherited by the McData1D and McData2D classes intended for actual use;
    several methods here (``clip``, ``reBin``, ``linkMeasData``, the ``from_*`` loaders)
    are stubs that the subclasses must override.
    """

    # dataframe objects at least should contain entries for Q, I, ISigma (1D)
    # or Qx, Qy, I, ISigma (2D)
    #
    # Class-level defaults below are all re-assigned per instance in __init__;
    # they mainly document the available attributes.
    filename = None  # input filename
    _outputFilename = None  # output filename for storing
    loader = None  # can be set to one of the available loaders
    rawData = None  # as read from the file,
    rawData2D = None  # only filled if a 2D NeXus file is loaded
    clippedData = None  # clipped to range, dataframe object
    binnedData = None  # clipped and rebinned
    measData = binnedData  # measurement data dict, translated from binnedData dataframe
    # NOTE(review): at class-definition time this merely aliases None; the real link is
    # established by linkMeasData() in the subclasses.
    measDataLink = "binnedData"  # indicate what measData links to
    dataRange = None  # min-max for data range to fit. overwritten in subclass
    nbins = 100  # default, set to zero for no rebinning
    pathDict = None  # for loading HDF5 files without pointers to the data
    binning = "logarithmic"  # the only option that makes sense
    csvargs = {}  # overwritten in subclass
    qNudge = None  # can adjust/offset the q values in case of misaligned q vector,
    # in particular visible in 2D data...
    omitQRanges = None  # to skip or omit unwanted data ranges, for example with sharp XRD peaks,
    # must be a list of [[qmin, qmax], ...] pairs
    resultIndex = None
    # maybe make this behave like a dict? or maybe that's a bad idea... possible method here:
    # https://stackoverflow.com/questions/4014621/a-python-class-that-acts-like-dict
    # Q = None # links to measData
    # I = None # links to measData
    # ISigma = None # links to measData
    storeKeys = [  # keys to store in an HDF5 output file
        "filename",
        "rawData",
        "clippedData",
        "binnedData",
        "measData",
        "measDataLink",
        "nbins",
        "binning",
        "dataRange",
        "pathDict",
        "csvargs",
        "loader",
        "qNudge",
        "omitQRanges",
    ]
    loadKeys = {  # keys to read back from an HDF5 output file, values are types to cast to
        # using _HDFLoadKV.
        "filename": Path,
        "measDataLink": "str",
        "nbins": int,
        "binning": "str",
        "dataRange": None,  # not sure what this is.. array?
        "csvargs": "dict",
        "loader": "str",
        "omitQRanges": list,  # not sure if this works?
    }

    def __init__(
        self,
        df: Optional[pandas.DataFrame] = None,
        loadFromFile: Optional[Path] = None,
        resultIndex: int = 1,
        **kwargs: dict,
    ) -> None:
        """Reset all state and optionally restore from a previous optimization file.

        loadFromFile must be a previous optimization. Else, use any of the other
        'from_*' functions.

        NOTE(review): ``df`` and ``**kwargs`` are accepted but not consumed here —
        presumably the 1D/2D subclasses forward them to processKwargs/from_pandas
        after calling this; TODO confirm against the subclass implementations.
        """
        # reset everything so we're sure not to inherit anything from elsewhere:
        self.filename = None  # input filename
        self._outputFilename = None  # output filename for storing
        self.loader = None  # can be set to one of the available loaders
        self.rawData = None  # as read from the file,
        self.rawData2D = None  # only filled if a 2D NeXus file is loaded
        self.clippedData = None  # clipped to range, dataframe object
        self.binnedData = None  # clipped and rebinned
        self.measData = (
            self.binnedData
        )  # measurement data dict, translated from binnedData dataframe
        self.measDataLink = "binnedData"  # indicate what measData links to
        self.dataRange = None  # min-max for data range to fit. overwritten in subclass
        self.nbins = 100  # default, set to zero for no rebinning
        self.pathDict = None  # for loading HDF5 files without pointers to the data
        self.binning = "logarithmic"  # the only option that makes sense
        self.csvargs = {}  # overwritten in subclass
        # NOTE(review): instance default is 0 here, while the class-level default is
        # None — confirm which one downstream code expects.
        self.qNudge = 0  # can adjust/offset the q values in case of misaligned q vector,
        # in particular visible in 2D data...
        self.omitQRanges = None  # to skip or omit unwanted data ranges, for example with sharp
        # XRD peaks, must be a list of [[qmin, qmax], ...] pairs
        # make sure we store and read from the right place.
        self.resultIndex = McHDF.ResultIndex(resultIndex)  # defines the HDF5 root path
        if loadFromFile is not None:
            self.load(loadFromFile)

    def processKwargs(self, **kwargs: dict) -> None:
        """Set any attribute listed in storeKeys from keyword arguments.

        Keys not present in storeKeys trigger an AssertionError.
        """
        for key, value in kwargs.items():
            assert key in self.storeKeys, "Key {} is not a valid option".format(key)
            setattr(self, key, value)

    def linkMeasData(self, measDataLink: Optional[str] = None) -> None:
        """Point measData at the dataframe named by measDataLink (subclass duty)."""
        assert False, "defined in 1D and 2D subclasses"
        pass

    def from_file(self, filename: Optional[Path] = None) -> None:
        """Dispatch loading of ``filename`` to the appropriate ``from_*`` reader.

        The reader is chosen by file suffix (.pdh / .csv-.dat-.txt / HDF5-NeXus
        suffixes), or by a previously set ``self.loader``. Falls back to an
        AssertionError if the type cannot be determined.
        """
        if filename is None:
            assert (
                self.filename is not None
            ), "at least filename or self.filename must be set for loading from file"
        else:
            self.filename = Path(filename)
        self.filename = Path(self.filename)  # cast into pathlib if not already
        # make sure file exists
        assert self.filename.is_file(), f"input filename: {self.filename} must exist"
        if (self.filename.suffix == ".pdh") or (self.loader == "from_pdh"):
            self.loader = "from_pdh"  # ensure this is set
            self.from_pdh(self.filename)
        elif (self.filename.suffix in [".csv", ".dat", ".txt"]) or (self.loader == "from_csv"):
            self.loader = "from_csv"  # ensure this is set
            self.from_csv(self.filename)
        elif (self.filename.suffix in [".h5", ".hdf5", ".nx", ".nxs"]) or (
            self.loader == "from_nexus"
        ):
            self.loader = "from_nexus"
            self.from_nexus(self.filename)  # load first, then find out if 1D or 2D
        else:
            assert False, (
                "Input file type could not be determined. Use from_pandas to load a dataframe or"
                " use df = [DataFrame] in input, or use 'loader' = 'from_pdh' or 'from_csv' in"
                " input"
            )

    def from_pandas(self, df: Optional[pandas.DataFrame] = None) -> None:
        """Load data from an existing pandas DataFrame (subclass duty)."""
        assert False, "defined in 1D and 2D subclasses"
        pass

    def from_csv(self, filename: Optional[Path] = None, csvargs: Optional[dict] = None) -> None:
        """Load data from a delimited text file (subclass duty)."""
        assert False, "defined in 1D and 2D subclasses"
        pass

    def from_pdh(self, filename: Optional[Path] = None) -> None:
        """Load data from a PDH file (1D subclass duty)."""
        assert False, "defined in 1D subclass only"
        pass

    # def from_nexus(self, filename=None):
    #     # find out if 1D or 2D, then use 1D or 2D loaders?
    #     assert False, "defined in 1D and 2D subclasses"
    #     pass

    # universal reader for 1D and 2D!
    def from_nexus(self, filename: Optional[Path] = None) -> None:
        """Read Q, I, ISigma (and optionally a mask) from a NeXus/HDF5 file into rawData.

        If ``self.pathDict`` is set, it must be a dict with 'Q', 'I' and 'ISigma'
        HDF5 paths and is used directly. Otherwise the NeXus 'default' attribute
        chain is followed to the default NXdata group, and the 'signal', its
        uncertainty, 'mask' and 'axes' attributes are resolved from there.
        Populates ``self.rawData`` (DataFrame) and, for multi-dimensional Q,
        ``self.rawData2D``, then calls ``prepare()``.
        """

        # optionally, path can be defined as a dict to point at Q, I and ISigma entries.
        def objBytesToStr(inObject):
            # h5py may hand back attributes as bytes or as byte-string arrays;
            # normalize both to str.
            outObject = inObject
            if isinstance(inObject, bytes):
                outObject = inObject.decode("utf-8")
            if isinstance(inObject, np.ndarray):
                outObject = inObject.astype("str")
            return outObject

        if filename is None:
            assert (
                self.filename is not None
            ), "either filename or self.filename must be set to a data source"
            filename = self.filename
        else:
            self.filename = filename  # reset to new source if not already set
        self.rawData = {}
        if self.pathDict is not None:
            # explicit paths provided; no NeXus attribute walking needed.
            assert isinstance(
                self.pathDict, dict
            ), "provided path must be dict with keys 'Q', 'I', and 'ISigma'"
            assert all(
                [j in self.pathDict.keys() for j in ["Q", "I", "ISigma"]]
            ), "provided path must be dict with keys 'Q', 'I', and 'ISigma'"
            with h5py.File(filename, "r") as h5f:
                [
                    self.rawData.update({key: h5f[f"{val}"][()].squeeze()})
                    for key, val in self.pathDict.items()
                ]
        else:
            sigPath = "/"
            with h5py.File(filename, "r") as h5f:
                while "default" in h5f[sigPath].attrs:
                    # this is what we find as a new default to add to the path
                    sigPathAdd = h5f[sigPath].attrs["default"]
                    # make sure it's not a bytes string:
                    sigPathAdd = objBytesToStr(sigPathAdd)
                    # if isinstance(sigPathAdd, bytes): sigPathAdd = sigPathAdd.decode("utf-8")
                    # add to the path
                    sigPath += sigPathAdd + "/"
                # make sure we now have access to a signal:
                assert "signal" in h5f[sigPath].attrs, "no signal in default neXus path"
                signalLabel = objBytesToStr(h5f[sigPath].attrs["signal"])
                # if isinstance(signalLabel, bytes): signalLabel = signalLabel.decode("utf-8")
                sigPathI = sigPath + signalLabel
                # extract intensity along qDim... sorry, don't know how (qDim is found below):
                self.rawData.update({"I": h5f[sigPathI][()].squeeze()})
                # and ISigma:
                uncertaintiesAvailable = False
                maskAvailable = False
                if f"{signalLabel}_uncertainty" in h5f[sigPath].attrs:
                    # NXdata-style "<signal>_uncertainty" attribute on the group
                    uncLabel = objBytesToStr(h5f[sigPath].attrs[f"{signalLabel}_uncertainty"])
                    uncertaintiesAvailable = True
                elif "uncertainties" in h5f[sigPathI].attrs:
                    # "uncertainties" attribute on the signal dataset itself
                    uncLabel = objBytesToStr(h5f[sigPathI].attrs["uncertainties"])
                    uncertaintiesAvailable = True
                else:
                    # some default: 0.1% of I when no uncertainty dataset is declared
                    self.rawData.update({"ISigma": self.rawData["I"] * 0.001})
                if "mask" in h5f[sigPath].attrs:
                    maskLabel = objBytesToStr(h5f[sigPath].attrs["mask"])
                    maskAvailable = True
                if uncertaintiesAvailable:
                    # load them
                    # if isinstance(uncLabel, bytes): uncLabel = uncLabel.decode("utf-8")
                    sigPathISigma = sigPath + uncLabel
                    self.rawData.update({"ISigma": h5f[sigPathISigma][()].squeeze()})
                if maskAvailable:
                    # load them
                    sigPathMask = sigPath + maskLabel
                    self.rawData.update({"mask": h5f[sigPathMask][()].squeeze()})
                # now we have I, we search for Q in the "axes" attribute:
                axesLabel = None
                if "axes" in h5f[sigPath].attrs:
                    axesLabel = "axes"
                elif "I_axes" in h5f[sigPath].attrs:
                    axesLabel = "I_axes"
                assert (
                    axesLabel is not None
                ), "could not find axes label associated with dataset signal in HDF5 file"
                axesObj = objBytesToStr(h5f[sigPath].attrs[axesLabel])
                # q can have many names in here:
                ques = ["q", "Q"]  # q options
                # ques = ['q', 'Q', b'q', b'Q'] # q options
                # check where we may have a match:
                quesTest = [i in axesObj for i in ques]
                # assert one of them is there
                assert any(quesTest), "q (or Q) not found in signal axes description"
                # this is what our q label is in the axes attribute:
                # NOTE(review): if both 'q' and 'Q' match, squeeze() yields a
                # two-element index and this lookup would fail — confirm single match
                # is guaranteed by the data files.
                qLabel = ques[np.argwhere(np.array(quesTest)).squeeze()]
                # find out which dimension of our data this is:
                # qDim = np.argwhere([qLabel == i for i in axesObj]).squeeze()
                # back to picking out q:
                # if isinstance(qLabel, bytes): qLabel = qLabel.decode("utf-8")
                self.rawData.update({"Q": h5f[sigPath + qLabel][()].squeeze()})
        if self.rawData["Q"].ndim > 1:
            # we have a three-dimensional Q array, in the order of [dim, y, x]
            # find out which dimensions are nonzero (the remainder is Qz):
            QxyIndices = np.argwhere(
                [self.rawData["Q"][i, :, :].any() for i in range(self.rawData["Q"].shape[0])]
            )
            self.rawData["Q"] = self.rawData["Q"][QxyIndices, :, :].squeeze()
            # NOTE(review): Qx uses QxyIndices[1] and Qy uses QxyIndices[0], indexed
            # into the already-reduced Q array — verify the intended x/y ordering.
            self.rawData["Qx"] = self.rawData["Q"][QxyIndices[1], :, :].squeeze()
            self.rawData["Qy"] = self.rawData["Q"][QxyIndices[0], :, :].squeeze()
            self.rawData2D = self.rawData.copy()  # intermediate storage of original data
            # but we also need to prepare a Pandas-compatible list-format data
            del self.rawData["Q"]
            for key in self.rawData.keys():
                self.rawData[key] = self.rawData[key].flatten()
        # if not self.is2D():
        self.rawData = pandas.DataFrame(data=self.rawData)
        self.prepare()

    def is2D(self) -> bool:
        """Return True when 2D raw data was stored by from_nexus."""
        return self.rawData2D is not None

    def clip(self) -> None:
        """Clip rawData to dataRange into clippedData (subclass duty)."""
        assert False, "defined in 1D and 2D subclasses"
        pass

    def omit(self) -> None:
        """Remove omitQRanges spans from the data (subclass duty)."""
        assert False, "defined in the 1D and (maybe) 2D subclasses"
        pass

    def reBin(self) -> None:
        """Rebin clippedData into nbins bins as binnedData (subclass duty)."""
        assert False, "defined in 1D and 2D subclasses"
        pass

    def prepare(self) -> None:
        """runs the clipping and binning (in that order), populates clippedData and binnedData"""
        self.clip()
        self.omit()
        if self.nbins != 0:
            self.reBin()
        else:
            # nbins == 0 means "no rebinning": pass the clipped data through unchanged
            self.binnedData = self.clippedData.copy()
        self.linkMeasData()

    def store(self, filename: Path, path: Optional[PurePosixPath] = None) -> None:
        """stores the settings in an output file (HDF5)

        When ``path`` is omitted, the entries go under the result-index entry point
        at ".../mcdata". Values are taken from the attributes listed in storeKeys.
        """
        if path is None:
            path = self.resultIndex.nxsEntryPoint / "mcdata"
        McHDF.storeKVPairs(
            filename, path, [(key, getattr(self, key, None)) for key in self.storeKeys]
        )

    def load(self, filename: Path, path: Optional[PurePosixPath] = None) -> None:
        """Restore settings (and, if possible, data) from a previous store() file.

        Each key in loadKeys is read back and cast via McHDF.loadKV; csvargs is
        merged rather than replaced. Raw data is either rebuilt from the stored
        'rawData' group (from_pandas loader) or re-read from the original file.
        """
        if path is None:
            path = self.resultIndex.nxsEntryPoint / "mcdata"
        for key, datatype in self.loadKeys.items():
            # if key == 'csvargs':
            #     # special loading, csvargs was stored as dict.
            #     # TODO: update to use _H5loadKV for additional type checking
            #     with h5py.File(filename, "r") as h5f:
            #         [self.csvargs.update({key: val[()]})
            #          for key, val in h5f[f'{path}csvargs'].items()]
            # else:
            value = McHDF.loadKV(filename, path / key, datatype=datatype, default=None, dbg=True)
            # with h5py.File(filename, "r") as h5f:
            #     if key in h5f[f"{path}"]:
            if key == "csvargs":
                self.csvargs.update(value)
            else:
                setattr(self, key, value)
        if self.loader == "from_pandas":
            # data was supplied as a DataFrame originally; rebuild it from the
            # stored rawData group since there is no source file to re-read.
            buildDict = {}
            with h5py.File(filename, "r") as h5f:
                [
                    buildDict.update({key: val[()]})
                    for key, val in h5f[str(path / "rawData")].items()
                ]
            self.rawData = pandas.DataFrame(data=buildDict)
        else:
            self.from_file()  # try loading the data from the original file
        self.prepare()