# Source code for bumps.data

"""
Data handling utilities.
"""
from __future__ import division

import os
import gzip
from contextlib import contextmanager

import numpy as np
from numpy import inf, nan

__all__ = ["indfloat", "parse_file"]


def parse_multi(file, keysep=None, sep=None, comment='#'):
    """
    Read every part of a multi-part data file.

    *file* is a path or an open handle (see :func:`maybe_open`).  Parts are
    separated by blank lines; blank lines before the first part and after
    the last part are skipped.

    Returns a list with one (header, data) pair per part.  *header* is a
    dictionary of key: value strings and *data* is a numpy array built from
    the data rows and transposed, so that *data[0]* is the first column.
    The list is empty if the file holds no non-blank lines.

    Header lines start with the *comment* marker.  The key is split from
    the value at *keysep*, or at the first run of whitespace if *keysep*
    is None.  Data lines are split into columns at *sep*, or on whitespace
    if *sep* is None, and each column is converted with :func:`indfloat`,
    so the inf and nan spellings it knows are accepted.  Anything on a data
    line after the comment marker is ignored.  A comment line whose first
    word starts with a digit, sign or decimal point, or is exactly inf or
    nan, is treated as a commented-out data row and skipped.

    Quotes around keys are removed; quotes around values are kept, unlike
    :func:`parse_file`.  Use :func:`strip_quotes` on the values if needed.
    Keeping the raw text allows *json.loads()* on values that were written
    as *key: json.dumps(value)*.

    Binned data: when the final data row holds a single value, the first
    column is taken to be bin edges.  It is replaced by the bin centers so
    that the array is rectangular, and the edges are stored in the header
    under 'bins' unless the header already defines that key.
    """
    def next_part(stream):
        # All parts are read with the same settings.
        return _read_part(stream, key_sep=keysep, col_sep=sep,
                          comment=comment, multi_part=True)

    sections = []
    with maybe_open(file) as stream:
        header, data, bins = next_part(stream)
        # A header of None marks the end of the file.
        while header is not None:
            if bins is not None and 'bins' not in header:
                header['bins'] = bins
            sections.append((header, data))
            header, data, bins = next_part(stream)
    return sections


# [docs]
def parse_file(file, keysep=None, sep=None, comment='#'):
    """
    Read a single-part data file and return its header and data.

    *file* is a path or an open handle (see :func:`maybe_open`).  Blank
    lines are skipped, so the whole file is read as one part.

    Returns a (header, data) pair.  *header* is a dictionary of key: value
    strings and *data* is a numpy array built from the data rows and
    transposed, so that *data[0]* is the first column.

    Raises *IOError* if the file contains no non-blank lines.

    Header lines start with the *comment* marker.  The key is split from
    the value at *keysep*, or at the first run of whitespace if *keysep*
    is None.  A key that appears more than once keeps all of its values,
    joined by newlines.  Data lines are split into columns at *sep*, or on
    whitespace if *sep* is None, and each column is converted with
    :func:`indfloat`, so the inf and nan spellings it knows are accepted.
    Anything on a data line after the comment marker is ignored.  A comment
    line whose first word starts with a digit, sign or decimal point, or is
    exactly inf or nan, is treated as a commented-out data row and skipped.

    Quotes around keys are removed.  For compatibility with the old
    interface, quotes around values are removed as well.

    Binned data: when the final data row holds a single value, the first
    column is taken to be bin edges.  It is replaced by the bin centers so
    that the array is rectangular, and the edges are stored in the header
    under 'bins' unless the header already defines that key.
    """
    with maybe_open(file) as stream:
        part = _read_part(stream, key_sep=keysep, col_sep=sep,
                          comment=comment, multi_part=False)
    raw_header, data, bins = part
    if raw_header is None:
        raise IOError("data file is empty")

    # Old-interface compatibility: values are returned without quotes.
    header = {key: strip_quotes(value) for key, value in raw_header.items()}
    if bins is not None and 'bins' not in header:
        header['bins'] = bins
    return header, data


def _read_part(fh, key_sep=None, col_sep=None, comment="#", multi_part=False):
    """
    Read the next part (header lines plus data rows) from the lines of *fh*.

    *fh* is any iterable of text lines.  *key_sep*, *col_sep* and *comment*
    are passed on to :func:`_parse_line`.  If *multi_part* is True a blank
    line following some content ends the part, leaving the rest of *fh* for
    the next call; otherwise blank lines are skipped and *fh* is read to
    the end.

    Returns (header, data, bins):

    * *header* is a dictionary of key: value strings; a key that occurs
      more than once holds all of its values joined by newlines.
    * *data* is the numpy array of data rows, transposed so that *data[0]*
      is the first column.  It is an empty array if there are no data rows.
    * *bins* is the array of bin edges if the part ends with a row holding
      a lone bin edge, in which case *data[0]* holds the bin centers;
      otherwise it is None.

    Returns (None, None, None) if no non-blank line was found.
    """
    header = {}
    data = []
    iseof = True
    for line in fh:
        # Blank lines indicate a section break.
        if not line.strip():
            # Single-part files ignore blank lines everywhere.  In multi-part
            # files blank lines ahead of any content (iseof still True) are
            # skipped as well, which also swallows blank lines at the end of
            # the file; a blank line after some content closes the part.
            if not multi_part or iseof:
                continue
            break

        # Line is not blank, so process it.
        columns, key, value = _parse_line(line, comment=comment,
                                          col_sep=col_sep, key_sep=key_sep)
        if columns:
            data.append([indfloat(v) for v in columns])
        if key is not None:
            if key in header:
                header[key] = "\n".join((header[key], value))
            else:
                header[key] = value

        # We have processed some content, so this is not end of file.
        iseof = False

    if iseof:
        return None, None, None

    # A lone value on the last row is a trailing bin edge only if the row
    # before it has more than one column.  Without that check a plain
    # single-column table would be rewritten as bin centers, and a table
    # holding a single value would fail on the empty array.
    has_edge_row = (len(data) > 1 and len(data[-1]) == 1
                    and len(data[-2]) > 1)
    if has_edge_row:
        # For TOF data, the first column is the bin edge, which has one
        # more row than the remaining columns; fill that column with
        # bin centers instead
        last_edge = data[-1][0]
        data = np.array(data[:-1]).T
        edges = np.hstack((data[0], last_edge))
        data[0] = 0.5*(edges[:-1] + edges[1:])
        bins = edges
    else:
        data = np.array(data).T
        bins = None

    return header, data, bins


@contextmanager
def maybe_open(file_or_path):
    """
    A context manager for file opening, given as a file path or an open handle.

    If *file_or_path* has a *readline* attribute it is treated as an open
    handle: it is yielded as is and left open on exit, since the caller
    owns it.

    Otherwise *file_or_path* must be a string naming the file.  A leading
    "~" is expanded, the file is opened for reading as text, and it is
    closed on exit.  If the name ends in ".gz" the file is opened with gzip.

    Raises *ValueError* if *file_or_path* is neither a handle nor a string.
    """
    if hasattr(file_or_path, 'readline'):
        # If it is a file handle, yield it and return without closing.
        yield file_or_path
        return

    # Otherwise it should be a path. Make sure it is at least a string.
    if not string_like(file_or_path):
        raise ValueError('file must be a name or a file handle')

    path = os.path.expanduser(file_or_path)
    if path.endswith('.gz'):
        # gzip.open defaults to binary mode, which yields bytes lines that
        # the str-based line parser cannot handle; ask for text mode so
        # that compressed and plain files behave the same.
        opened = gzip.open(path, 'rt')
    else:
        opened = open(path)
    with opened as fh:
        yield fh


def string_like(s):
    """
    Return True if *s* can be concatenated with a string.

    The test is whether ``s + ''`` succeeds; any exception raised by the
    concatenation gives False.
    """
    acts_like_string = True
    try:
        s + ''
    except Exception:
        acts_like_string = False
    return acts_like_string


def _parse_line(line, key_sep=None, col_sep=None, comment='#'):
    """
    Split one non-blank line into data columns or a header key and value.

    Returns (columns, key, value):

    * Data line (the *comment* marker is absent or not at the start of the
      line): *columns* is the list of text fields found before any comment
      marker, split at *col_sep* or on whitespace if *col_sep* is None;
      *key* is None and *value* is ''.
    * Header line (the line starts with *comment*): *columns* is empty,
      *key* is the text before *key_sep* (before the first run of
      whitespace if *key_sep* is None) with surrounding quotes removed, and
      *value* is the remaining text, or '' if there is none.
    * Comment line with nothing after the marker, or whose key looks like
      a number (commented-out data): *columns* is empty and both *key* and
      *value* are None.
    """
    # Find location of the comment character on the line
    idx = line.find(comment)

    # If the line does not contain a comment character or if the comment
    # character is not in the first column, then this is a data line which
    # should be returned as a sequence of text columns.  The caller can turn
    # the columns into numbers or leave them as strings.  Data on the line
    # after the comment character is ignored.
    # TODO: allow quoted strings or backslash escaped spaces for text columns
    if idx != 0:
        text = line if idx < 0 else line[:idx]
        columns = text.split(col_sep)
        if col_sep is not None:
            # Whitespace splitting already trims the fields; do the same for
            # an explicit separator so the last field does not keep the
            # newline and padded fields compare equal to unpadded ones.
            columns = [c.strip() for c in columns]
        return columns, None, ''

    # Drop the whole marker, which may be longer than one character.
    body = line[len(comment):]
    if not body.strip():
        # A bare comment marker carries no key.  Splitting it on whitespace
        # gives an empty list, so it has to be handled before the split.
        return [], None, None

    # Split line on key separator
    parts = [p.strip() for p in body.split(key_sep, 1)]
    key, value = parts if len(parts) > 1 else (parts[0], '')
    key = strip_quotes(key)

    # If key is a number assume it is simply a commented out data point
    if len(key) and (key[0] in '.-+0123456789' or key == 'inf' or key == 'nan'):
        return [], None, None

    return [], key, value


def strip_quotes(s):
    """
    Remove one pair of matching quotes surrounding *s*.

    If *s* starts and ends with the same quote character, either ``'`` or
    ``"``, return the text between them; otherwise return *s* unchanged.
    A string consisting of a single quote character is returned unchanged,
    since there is no closing quote to pair it with.
    """
    if len(s) >= 2 and s[0] in "'\"" and s[0] == s[-1]:
        return s[1:-1]
    return s

# Lower-case spellings of infinity.  indfloat() looks text up here, with or
# without a leading '-', once float() has rejected it.
INF_VALUES = {'inf', '1/0', '1.#inf', 'infinity'}
# Lower-case spellings of not-a-number, looked up by indfloat() the same way.
NAN_VALUES = {'nan', '0/0', '1.#qnan', 'na', 'n/a'}

# [docs]
def indfloat(s):
    """
    Convert the string *s* to a float, accepting extra inf/nan spellings.

    Anything that *float()* accepts is returned directly.  Otherwise the
    lower-cased text is looked up in *INF_VALUES* (optionally preceded by
    '-' for negative infinity) and in *NAN_VALUES*.  If it matches neither,
    the exception raised by *float()* is re-raised.

    Example::

        >>> from numpy import isinf, isnan
        >>> print(isinf(indfloat('inf')))
        True
        >>> print(isinf(indfloat('-inf')))
        True
        >>> print(isnan(indfloat('nan')))
        True
    """
    try:
        value = float(s)
    except Exception:
        text = s.lower()
        if text in INF_VALUES:
            value = inf
        elif text[:1] == '-' and text[1:] in INF_VALUES:
            value = -inf
        elif text in NAN_VALUES:
            value = nan
        else:
            # Not a known spelling: report the original float() failure.
            raise
    return value