from pathlib import Path
from typing import Optional
import numpy as np
import pandas
from .McData import McData
class McData1D(McData):
    """Subclass of McData for managing 1D (Q, I, ISigma) datasets."""

    # class-level defaults, replaced by per-instance values in __init__:
    csvargs = None  # pandas.read_csv arguments; default for 1D, overwritten in subclass
    dataRange = None  # min-max for data range to fit
    qNudge = None  # nudge in case of misaligned centers. Applied to measData
    omitQRanges = None  # to skip or omit unwanted data ranges, for example with sharp XRD peaks

    def __init__(
        self,
        df: Optional[pandas.DataFrame] = None,
        loadFromFile: Optional[Path] = None,
        resultIndex: int = 1,
        **kwargs: dict,
    ) -> None:
        """Set the 1D defaults and optionally load data.

        :param df: optional DataFrame containing 'Q', 'I' and 'ISigma' columns,
            loaded directly via from_pandas.
        :param loadFromFile: optional stored-state file; restoring is handled by
            the superclass, so no extra loading happens here.
        :param resultIndex: result index, forwarded to the superclass.
        """
        super().__init__(loadFromFile=loadFromFile, resultIndex=resultIndex, **kwargs)
        self.csvargs = {
            "sep": r"\s+",
            "header": None,
            "names": ["Q", "I", "ISigma"],
        }  # default for 1D, overwritten in subclass
        self.dataRange = [-np.inf, np.inf]  # min-max for data range to fit
        self.qNudge = 0  # nudge in case of misaligned centers. Applied to measData
        self.processKwargs(**kwargs)  # redo kwargs in case the reset values have been updated
        # load from dataframe if provided
        if df is not None:
            self.loader = "from_pandas"  # TODO: need to handle this on restore state
            self.from_pandas(df)
        elif loadFromFile is not None:
            pass  # do not try loading the file, the information is already there.
        elif self.filename is not None:  # filename has been set
            self.from_file(self.filename)

    # link measData to the requested value
    def linkMeasData(self, measDataLink: Optional[str] = None) -> None:  # measDataLink:str|None
        """Points self.measData at one of the stored stages ('rawData',
        'clippedData' or 'binnedData'), applying the qNudge offset to Q."""
        if measDataLink is None:
            measDataLink = self.measDataLink
        assert measDataLink in [
            "rawData",
            "clippedData",
            "binnedData",
        ], (
            f"measDataLink value: {measDataLink} not valid. Must be one of 'rawData', 'clippedData'"
            " or 'binnedData'"
        )
        measDataObj = getattr(self, measDataLink)
        self.measData = dict(
            Q=[measDataObj.Q.values + self.qNudge],
            I=measDataObj.I.values,
            ISigma=measDataObj.ISigma.values,
        )

    def from_pdh(self, filename: Path) -> None:
        """reads from a PDH file, re-uses Ingo Bressler's code from the notebook example"""
        assert filename is not None, "from_pdh requires an input filename of a PDH file"
        skiprows = 5  # fixed-size PDH header
        with open(filename) as fd:
            lines = fd.readlines()
        # the data block ends where the (optional) XML footer begins; fall back
        # to the end of the file when no footer is present (the original code
        # raised IndexError in that case)
        xmlStart = [ln for ln, line in enumerate(lines) if line.startswith("<?xml")]
        dataEnd = xmlStart[0] if xmlStart else len(lines)
        csvargs = self.csvargs.copy()
        csvargs.update({"skiprows": skiprows, "nrows": dataEnd - skiprows})
        self.from_pandas(pandas.read_csv(filename, **csvargs))

    def from_pandas(self, df: pandas.DataFrame) -> None:
        """uses a dataframe as input, should contain 'Q', 'I', and 'ISigma'"""
        assert isinstance(
            df, pandas.DataFrame
        ), "from_pandas requires a pandas DataFrame with 'Q', 'I', and 'ISigma'"
        # maybe add a check for the keys:
        assert all(
            [key in df.keys() for key in ["Q", "I", "ISigma"]]
        ), "from_pandas requires the dataframe to contain 'Q', 'I', and 'ISigma'"
        # all three columns must have been parsed as floats:
        assert all(
            [df[key].dtype.kind in "f" for key in ["Q", "I", "ISigma"]]
        ), "data could not be read correctly. If csv, did you supply the right csvargs?"
        self.rawData = df
        self.prepare()

    def from_csv(self, filename: Path, csvargs: Optional[dict] = None) -> None:
        """reads from a three-column csv file, takes pandas from_csv arguments

        :param csvargs: optional overrides for the default pandas.read_csv arguments.
        """
        assert filename is not None, "from_csv requires an input filename of a csv file"
        localCsvargs = self.csvargs.copy()
        # None instead of a mutable default argument; None means "no overrides"
        localCsvargs.update(csvargs or {})
        self.from_pandas(pandas.read_csv(filename, **localCsvargs))

    def clip(self) -> None:
        """Stores rawData restricted to dataRange (NaN rows dropped) in clippedData."""
        qMin, qMax = self.dataRange
        # boolean masking instead of DataFrame.query: the default infinite
        # limits (-inf/inf) are not parseable inside a query expression string
        mask = (self.rawData.Q >= qMin) & (self.rawData.Q < qMax)
        self.clippedData = self.rawData[mask].dropna().copy()
        assert len(self.clippedData) != 0, "Data clipping range too small, no datapoints found!"

    def omit(self) -> None:
        """This can skip/omit unwanted ranges of data (for example a data range with an unwanted
        XRD peak in it). Requires an "omitQRanges" list of [[qmin, qmax]]-data ranges to omit.
        """
        # nothing to do:
        if self.omitQRanges is None:
            return
        assert isinstance(self.omitQRanges, list), "omitQRanges must be a list"
        for omitQRange in self.omitQRanges:
            assert (
                len(omitQRange) == 2
            ), "each omitQRange must contain two elements: a minimum and maximum value"
            # we drop the matches:
            self.clippedData.drop(
                self.clippedData.query(f"{omitQRange[0]} <= Q < {omitQRange[1]}").index,
                inplace=True,
            )

    def reBin(
        self, nbins: Optional[int] = None, IEMin: float = 0.01, QEMin: float = 0.01
    ) -> None:  # nbins:int|None
        """Unweighted rebinning functionality with extended uncertainty estimation,
        adapted from the datamerge methods, as implemented in Paulina's notebook of spring 2020

        :param nbins: number of logarithmically spaced bins; defaults to self.nbins.
        :param IEMin: minimum relative uncertainty imposed on the binned intensity.
        :param QEMin: minimum relative uncertainty imposed on the binned Q.
        """
        if nbins is None:
            nbins = self.nbins
        qMin = self.clippedData.Q.dropna().min()
        qMax = self.clippedData.Q.dropna().max()
        # prepare bin edges:
        binEdges = np.logspace(np.log10(qMin), np.log10(qMax), num=nbins + 1)
        binDat = pandas.DataFrame(
            data={
                "Q": np.full(nbins, np.nan),  # mean Q
                "I": np.full(nbins, np.nan),  # mean intensity
                "IStd": np.full(nbins, np.nan),  # standard deviation of the mean intensity
                "ISEM": np.full(
                    nbins, np.nan
                ),  # standard error on mean of the mean intensity (maybe, but weighted is hard.)
                "IError": np.full(nbins, np.nan),  # Propagated errors of the intensity
                "ISigma": np.full(nbins, np.nan),  # Combined error estimate of the intensity
                "QStd": np.full(nbins, np.nan),  # standard deviation of the mean Q
                "QSEM": np.full(nbins, np.nan),  # standard error on the mean Q
                "QError": np.full(nbins, np.nan),  # Propagated errors on the mean Q
                "QSigma": np.full(nbins, np.nan),  # Combined error estimate on the mean Q
            }
        )
        # add a little to the end to ensure the last datapoint is captured:
        binEdges[-1] = binEdges[-1] + 1e-3 * (binEdges[-1] - binEdges[-2])
        # now do the binning per bin.
        for binN in range(len(binEdges) - 1):
            dfRange = self.clippedData.query(
                "{} <= Q < {}".format(binEdges[binN], binEdges[binN + 1])
            ).copy()
            if len(dfRange) == 0:
                # no datapoints in the range
                continue
            if len(dfRange) == 1:
                # only one datapoint: no statistics possible, fall back to the
                # propagated uncertainty and the minimum relative uncertainties.
                # .iloc[0] instead of float(Series): the latter is deprecated,
                # and .loc[row, col] assignment avoids chained-indexing writes.
                IVal = float(dfRange.I.iloc[0])
                ISig = float(dfRange.ISigma.iloc[0])
                QVal = float(dfRange.Q.iloc[0])
                binDat.loc[binN, "I"] = IVal
                binDat.loc[binN, "IStd"] = ISig
                binDat.loc[binN, "ISEM"] = ISig
                binDat.loc[binN, "IError"] = ISig
                binDat.loc[binN, "ISigma"] = np.max([ISig, IVal * IEMin])
                binDat.loc[binN, "Q"] = QVal
                if "QSigma" in dfRange.keys():
                    # use propagated Q uncertainties where available:
                    QSig = float(dfRange.QSigma.iloc[0])
                    binDat.loc[binN, "QStd"] = QSig
                    binDat.loc[binN, "QSEM"] = QSig
                    binDat.loc[binN, "QError"] = QSig
                    binDat.loc[binN, "QSigma"] = np.max([QSig, QVal * QEMin])
                else:
                    binDat.loc[binN, "QStd"] = QVal * QEMin
                    binDat.loc[binN, "QSEM"] = QVal * QEMin
                    binDat.loc[binN, "QError"] = QVal * QEMin
                    # previously left NaN here; filled now for consistency with
                    # the multi-datapoint branch below
                    binDat.loc[binN, "QSigma"] = QVal * QEMin
            else:
                # multiple datapoints in the range
                binDat.loc[binN, "I"] = dfRange.I.mean(skipna=True)
                binDat.loc[binN, "IStd"] = dfRange.I.std(ddof=1, skipna=True)
                binDat.loc[binN, "ISEM"] = dfRange.I.sem(ddof=1, skipna=True)
                # propagated uncertainty on the mean intensity:
                binDat.loc[binN, "IError"] = np.sqrt(((dfRange.ISigma) ** 2).sum()) / len(dfRange)
                binDat.loc[binN, "ISigma"] = np.max(
                    [
                        binDat.loc[binN, "ISEM"],
                        binDat.loc[binN, "IError"],
                        binDat.loc[binN, "I"] * IEMin,
                    ]
                )
                binDat.loc[binN, "Q"] = dfRange.Q.mean(skipna=True)
                binDat.loc[binN, "QStd"] = dfRange.Q.std(ddof=1, skipna=True)
                binDat.loc[binN, "QSEM"] = dfRange.Q.sem(ddof=1, skipna=True)
                binDat.loc[binN, "QError"] = binDat.loc[binN, "QSEM"]
                if "QSigma" in dfRange.keys():
                    # overwrite with propagated uncertainties if available
                    binDat.loc[binN, "QError"] = (
                        np.sqrt(((dfRange.QSigma) ** 2).sum()) / len(dfRange)
                    )
                binDat.loc[binN, "QSigma"] = np.max(
                    [
                        binDat.loc[binN, "QSEM"],
                        binDat.loc[binN, "QError"],
                        binDat.loc[binN, "Q"] * QEMin,
                    ]
                )
        # remove bins that stayed empty (fewer than 4 non-NaN values)
        binDat.dropna(thresh=4, inplace=True)
        self.binnedData = binDat