Source code for diffpy.utils.parsers.loaddata

#!/usr/bin/env python
##############################################################################
#
# diffpy.utils      by DANSE Diffraction group
#                   Simon J. L. Billinge
#                   (c) 2010 The Trustees of Columbia University
#                   in the City of New York.  All rights reserved.
#
# File coded by:    Timur Davis, Chris Farrow, Pavol Juhas
#
# See AUTHORS.txt for a list of people who contributed.
# See LICENSE_DANSE.txt for license information.
#
##############################################################################

import os

import numpy

from diffpy.utils import validators



[docs]
def loadData(
    filename, minrows=10, headers=False, hdel="=", hignore=None, **kwargs
):
    """Find and load data from a text file.

    The data block is identified as the first matrix block of at least
    minrows rows and constant number of columns. This seems to work for most
    of the datafiles including those generated by diffpy programs.

    Parameters
    ----------
    filename
        Name of the file we want to load data from.
    minrows: int
        Minimum number of rows in the first data block. All rows must have
        the same number of floating point values.
    headers: bool
        when False (default), the function returns a numpy array of the data
        in the data block. When True, the function instead returns a
        dictionary of parameters and their corresponding values parsed from
        header (information prior to the data block). See hdel and hignore
        for options to help with parsing header information.
    hdel: str
        (Only used when headers enabled.) Delimiter for parsing header
        information (default '='). e.g. using default hdel, the line '
        parameter = p_value' is put into the dictionary as
        {parameter: p_value}.
    hignore: list
        (Only used when headers enabled.) Ignore header rows beginning with
        any elements in hignore. e.g. hignore=['# ', '['] causes the
        following lines to be skipped: '# qmax=10', '[defaults]'.
    kwargs:
        Keyword arguments that are passed to numpy.loadtxt including the
        following arguments below. (See numpy.loadtxt for more details.) Only
        pass kwargs used by numpy.loadtxt.

    Useful kwargs
    =============
    comments: str, sequence of str
        The characters or list of characters used to indicate the start of a
        comment (default '#'). Comment lines are ignored.
    delimiter: str
        Delimiter for the data in the block (default use whitespace). For
        comma-separated data blocks, set delimiter to ','.
    unpack: bool
        Return data as a sequence of columns that allows tuple unpacking such
        as x, y = loadData(FILENAME, unpack=True). Note transposing the
        loaded array as loadData(FILENAME).T has the same effect.
    usecols: int or sequence of int
        Zero-based index of columns to be loaded, by default use all detected
        columns. A single integer selects one column, as in numpy.loadtxt.
        The reading skips data blocks that do not have the usecols-
        specified columns.

    Returns
    -------
    data_block: ndarray
        A numpy array containing the found data block. (This is not returned
        if headers is enabled.) An empty array is returned when no data row
        is found. If the file ends before minrows matching rows were
        collected, the last (shorter) candidate block is loaded.
    hdata: dict
        If headers are enabled, return a dictionary of parameters read from
        the header.

    Raises
    ------
    IOError
        When filename does not exist.
    """
    from numpy import array, loadtxt

    # for storing header data
    hdata = {}
    # determine the arguments
    delimiter = kwargs.get("delimiter")
    usecols = kwargs.get("usecols")
    if usecols is not None:
        # numpy.loadtxt accepts a bare integer for a single column; turn
        # whatever we got into a tuple so min/max/iteration below work.
        try:
            usecols = tuple(usecols)
        except TypeError:
            usecols = (usecols,)
        # hand the normalized value on, so one-shot iterables are not
        # passed to loadtxt already exhausted
        kwargs["usecols"] = usecols
    # required at least one column of floating point values
    mincv = (1, 1)
    # but if usecols is specified, require sufficient number of columns
    # where the used columns contain floats
    if usecols is not None:
        hiidx = max(-min(usecols), max(usecols) + 1)
        mincv = (hiidx, len(set(usecols)))

    # Check if a line consists of floats only and return their count
    # Return zero if some strings cannot be converted.
    def countcolumnsvalues(line):
        try:
            words = line.split(delimiter)
            # remove trailing blank columns
            while words and not words[-1].strip():
                words.pop()
            nc = len(words)
            if usecols is not None:
                nv = len([float(words[i]) for i in usecols])
            else:
                nv = len([float(w) for w in words])
        except (IndexError, ValueError):
            nc = nv = 0
        return nc, nv

    # Split a line into a (name, value) header pair, or return None when
    # the line is not a usable "name <hdel> value" entry.
    def headerpair(text):
        parts = text.split(hdel)
        # exactly one delimiter is required
        if len(parts) != 2:
            return None
        name, value = parts[0].strip(), parts[1].strip()
        # ignore if either side is blank
        if not name or not value:
            return None
        # ignore rows whose (stripped) name starts with an ignore tag
        if hignore is not None and name.startswith(tuple(hignore)):
            return None
        return name, value

    # Check if file exists before trying to open
    if not os.path.exists(filename):
        raise IOError(
            (
                f"File {filename} cannot be found. "
                "Please rerun the program specifying a valid filename."
            )
        )

    # make sure fid gets cleaned up
    with open(filename, "rb") as fid:
        # search for the start of datablock
        start = ncvblock = None
        # byte offsets (begin, end) of the current line within the file
        fpos = (0, 0)
        nrows = 0
        for line in fid:
            # decode line
            dline = line.decode()
            # find header information if requested
            if headers:
                pair = headerpair(dline)
                if pair is not None:
                    name, value = pair
                    # store numeric values as floats
                    if validators.is_number(value):
                        value = float(value)
                    hdata[name] = value
            # continue search for the start of datablock;
            # positions are counted in bytes since the file is binary
            fpos = (fpos[1], fpos[1] + len(line))
            ncv = countcolumnsvalues(dline)
            if ncv < mincv:
                start = None
                continue
            # ncv is acceptable here, require the same number of columns
            # throughout the datablock
            if start is None or ncv != ncvblock:
                ncvblock = ncv
                nrows = 0
                start = fpos[0]
            nrows += 1
            # block was found here!
            if nrows >= minrows:
                break

        # Return header data if requested
        if headers:
            return hdata  # Return, so do not proceed to reading datablock

        # Return an empty array when no data found.
        # loadtxt would otherwise raise an exception on loading from EOF.
        if start is None:
            data_block = array([], dtype=float)
        else:
            fid.seek(start)
            # always use usecols argument so that loadtxt does not crash
            # in case of trailing delimiters.
            kwargs.setdefault("usecols", list(range(ncvblock[0])))
            data_block = loadtxt(fid, **kwargs)
    return data_block




[docs]
class TextDataLoader:
    """Smart loading of a text data with possibly multiple datasets.

    Parameters
    ----------
    minrows: int
        Minimum number of rows in a data block. (Default 10.)
    usecols: tuple
        Which columns in our dataset to use. Ignores all other columns. If
        None (default), use all columns.
    skiprows
        Rows in dataset to skip. (Currently not functional.)

    Attributes
    ----------
    filename: str
        Name of the most recently read file object ('' when it has none).
    headers: list of str
        Text preceding each entry of datasets.
    datasets: list of ndarray
        Data blocks found in the input, one 2D array per block. When text
        remains after the last block, a final empty array is appended
        together with that text as its header.
    """

    def __init__(self, minrows=10, usecols=None, skiprows=None):
        # Always define the attributes: _findDataBlocks reads them
        # unconditionally, also when the defaults are in effect.
        self.minrows = 10 if minrows is None else minrows
        self.usecols = None if usecols is None else tuple(usecols)
        # FIXME: implement usage in _findDataBlocks
        self.skiprows = skiprows
        # data items
        self._reset()

    def _reset(self):
        """Forget all loaded results and the parsing work data."""
        self.filename = ""
        self.headers = []
        self.datasets = []
        self._resetvars()

    def _resetvars(self):
        """Clear the intermediate parsing state, keep loaded results."""
        self._filename = ""
        self._lines = None
        self._splitlines = None
        self._words = None
        self._linerecs = None
        self._wordrecs = None

    def read(self, filename):
        """Open a file and run readfp.

        Use if file is not already open for read byte.
        """
        with open(filename, "rb") as fp:
            self.readfp(fp)

    def readfp(self, fp, append=False):
        """Get file details.

        File details include:
         *  File name.
         *  All data blocks findable by loadData.
         *  Headers (if present) for each data block. (Generally the headers
            contain column name information).

        Parameters
        ----------
        fp
            Open file-like object providing readlines(). Lines may be str
            or bytes; bytes are decoded with the default codec.
        append: bool
            When False (default), discard headers and datasets loaded
            earlier. When True, add the new ones after the existing ones.
        """
        if append:
            self._resetvars()
        else:
            self._reset()
        # try to read lines from fp first
        rawlines = fp.readlines()
        # binary files yield bytes, the parsing below works on text
        self._lines = [
            line.decode() if isinstance(line, bytes) else line
            for line in rawlines
        ]
        # and if good, assign filename
        self.filename = getattr(fp, "name", "")
        self._splitlines = [line.split() for line in self._lines]
        # flatten per-line words so word counts always agree with _lines
        self._words = [w for words in self._splitlines for w in words]
        self._findDataBlocks()

    def _findDataBlocks(self):
        """Locate data blocks and append them to headers and datasets.

        A data row is a line with enough words for the requested columns
        where the used words (all of them when usecols is None) convert to
        float. Lines with fewer words, such as blank lines, are ignored.
        A block is a run of successive data rows that have the same number
        of words; only blocks of at least minrows rows are stored.
        """
        usecols = self.usecols
        mincols = 1
        if usecols:
            # negative indices count from the end and need abs(index) words
            mincols = max(mincols, max(usecols) + 1, abs(min(usecols)))
        lines = self._lines
        nlines = len(lines)
        nwords = len(self._words)
        # idx - line index, nw0, nw1 - index of the first and last word,
        # nf - number of words, ok - has data
        lr = self._linerecs = numpy.recarray(
            (nlines,),
            dtype=[
                ("idx", int),
                ("nw0", int),
                ("nw1", int),
                ("nf", int),
                ("ok", bool),
            ],
        )
        nf = numpy.array([len(sl) for sl in self._splitlines], dtype=int)
        nw1 = nf.cumsum()
        nw0 = nw1 - nf
        # word records
        lw = self._wordrecs = numpy.recarray(
            (nwords,),
            dtype=[
                ("idx", int),
                ("line", int),
                ("col", int),
                ("ok", bool),
                ("value", float),
            ],
        )
        # line index of every word; correct also with blank lines around
        wline = numpy.repeat(numpy.arange(nlines), nf)
        wcol = numpy.arange(nwords) - nw0[wline]
        wok = numpy.ones(nwords, dtype=bool)
        values = numpy.zeros(nwords, dtype=float)
        for i, w in enumerate(self._words):
            try:
                values[i] = float(w)
            except ValueError:
                wok[i] = False
        # prune lines that have non-float values in the used columns
        lok = numpy.ones(nlines, dtype=bool)
        if usecols is None:
            lok[wline[~wok]] = False
        else:
            width = nf[wline]
            for col in usecols:
                # resolve negative column indices against each line width
                incol = wcol == numpy.mod(col, width)
                lok[wline[incol & ~wok]] = False
        lr.idx = numpy.arange(nlines)
        lr.nf = nf
        lr.nw1 = nw1
        lr.nw0 = nw0
        lr.ok = lok
        lw.idx = numpy.arange(nwords)
        lw.line = wline
        lw.col = wcol
        lw.ok = wok
        lw.value = values
        # candidate lines are those wide enough for the requested columns
        cand = numpy.flatnonzero(nf >= mincols)
        if cand.size:
            cok = lok[cand]
            cnf = nf[cand]
            # True where a candidate continues the run of its predecessor
            joined = cok[1:] & cok[:-1] & (cnf[1:] == cnf[:-1])
            begs = numpy.flatnonzero(cok & numpy.r_[True, ~joined])
            ends = numpy.flatnonzero(cok & numpy.r_[~joined, True]) + 1
        else:
            begs = ends = numpy.array([], dtype=int)
        hbeg = 0
        for b, e in zip(begs, ends):
            if e - b < self.minrows:
                continue
            rows = cand[b:e]
            first = int(rows[0])
            last = int(rows[-1])
            ncols = int(nf[first])
            # gather words row by row, (nrows, ncols) indices into values
            data = values[nw0[rows][:, None] + numpy.arange(ncols)]
            if usecols is not None:
                data = data[:, list(usecols)]
            self.headers.append("".join(lines[hbeg:first]))
            self.datasets.append(data)
            hbeg = last + 1
        # finish reading to a last header and empty dataset
        if hbeg < nlines:
            self.headers.append("".join(lines[hbeg:]))
            self.datasets.append(numpy.empty(0, dtype=float))