# Source code for h5hep.read

import h5py as h5
import numpy as np

################################################################################
def load(filename=None, verbose=False, desired_datasets=None, subset=None):
    """Read all, or a subset, of the data in an HDF5 file into a data dictionary.

    Also returns an empty event dictionary to be filled later with select
    events (see ``unpack``).

    Args:
        **filename** (string): Name of the input file

        **verbose** (boolean): True if debug output is required

        **desired_datasets** (list): Datasets to be read from input file. A
        dataset is kept when any of these strings is a substring of its name.
        A single string is treated as a one-element list.

        **subset** (int or pair of ints): Number of events to be read from
        input file, or a ``(first, last)`` range of events (``last`` is
        exclusive). The range is clamped to the ``nentries`` attribute of
        the file.

    Returns:
        **ourdata** (dict): Selected data from HDF5 file

        **event** (dict): An empty event dictionary to be filled by individual events

        ``None`` is returned instead of this pair when no filename is given.

    """

    if filename is None:
        print("No filename passed in! Can't open file.\n")
        return None

    ourdata = {
        "datasets_and_counters": {},
        "datasets_and_indices": {},
        "list_of_counters": [],
        "all_datasets": [],
    }
    event = {}

    # Nothing is written, so open read-only; the context manager guarantees
    # the file is closed even if one of the reads below raises.
    with h5.File(filename, "r") as f:
        ourdata["nentries"] = f.attrs["nentries"]
        if subset is not None:
            if isinstance(subset, (int, np.integer)):
                subset = (0, subset)
            # Clamp the request to what the file holds so that the reported
            # number of entries matches the data actually read.
            total = int(ourdata["nentries"])
            stop = max(0, min(int(subset[1]), total))
            start = max(0, min(int(subset[0]), stop))
            subset = (start, stop)
            ourdata["nentries"] = stop - start

        # Get the datasets and counters.
        # The decode is there because the names were stored as numpy.bytes
        for vals in f["datasets_and_counters"]:
            dataset = vals[0].decode()
            counter = vals[1].decode()
            ourdata["datasets_and_counters"][dataset] = counter
            ourdata["datasets_and_indices"][dataset] = "%s_INDEX" % (counter)
            ourdata["list_of_counters"].append(counter)
            # The counters are datasets in their own right.
            ourdata["all_datasets"].extend((dataset, counter))

        # We may have added some strings (like counters) multiple times.
        ourdata["list_of_counters"] = sorted(set(ourdata["list_of_counters"]))
        ourdata["all_datasets"] = sorted(set(ourdata["all_datasets"]))

        # Pull out the SINGLETON datasets: their names are stored as one
        # "__:__"-separated string.
        sg = f["_SINGLETONGROUP_"][0]
        singletons = sg[1].decode().split("__:__")
        if "INDEX" in singletons:
            singletons.remove("INDEX")
        ourdata["_SINGLETON_"] = singletons

        ########################################################
        # Only keep select data from file
        ########################################################
        entries = ourdata["all_datasets"]
        if desired_datasets is not None:
            if isinstance(desired_datasets, str):
                # list("abc") would split the name into single characters.
                desired_datasets = [desired_datasets]
            else:
                desired_datasets = list(desired_datasets)

            kept = []
            for entry in entries:
                if any(desdat in entry for desdat in desired_datasets):
                    kept.append(entry)
                else:
                    print("Not reading out %s from the file...." % (entry))
            entries = kept
            ourdata["all_datasets"] = entries
        #######################################################

        if verbose:
            print("Datasets and counters:")
            print(ourdata["datasets_and_counters"])
            print("\nDatasets and indices:")
            print(ourdata["datasets_and_indices"])

        # Pull out the counters first and build the indices
        print("Building the indices...")
        # counter name -> (first, last) object offsets covered by `subset`
        object_ranges = {}
        for name in ourdata["list_of_counters"]:
            if subset is not None:
                # The counts of the skipped events are needed to know where
                # the objects of the first requested event start in the file.
                counts = f[name][: subset[1]]
                first = int(np.sum(counts[: subset[0]]))
                counts = counts[subset[0] :].copy()
                object_ranges[name] = (first, first + int(np.sum(counts)))
            else:
                counts = f[name][:]
            ourdata[name] = counts

            # index[i] is the offset of the first object of event i, i.e.
            # the number of objects in all the events read before it.
            index = np.zeros(len(counts), dtype=int)
            index[1:] = np.cumsum(counts[:-1])
            ourdata["%s_INDEX" % (name)] = index
        print("Built the indices!")

        # Counters and singletons hold one value per event; every other
        # dataset holds a variable number of objects per event.
        per_event = set(ourdata["list_of_counters"])
        per_event.update(singletons)

        # Loop over the entries we want and pull out the data.
        for name in entries:
            data = f[name]
            if verbose:
                print(data)

            if not isinstance(data, h5.Dataset):
                continue

            if subset is None:
                ourdata[name] = data[:]
            elif name in per_event:
                ourdata[name] = data[subset[0] : subset[1]]
            else:
                # Slice by object offsets so that the data lines up with the
                # zero-based index built above.
                lo, hi = object_ranges[ourdata["datasets_and_counters"][name]]
                ourdata[name] = data[lo:hi]

            event[name] = None  # This will be filled for individual events

    print("Data is read in and input file is closed.")

    return ourdata, event


################################################################################


################################################################################
def unpack(event, data, n=0):
    """Fill the event dictionary with the values belonging to entry ``n``.

    Args:

        **event** (dict): Event dictionary to be filled; it is modified in
        place and nothing is returned

        **data** (dict): Data dictionary used to fill the event dictionary

        **n** (int): Position, within ``data``, of the entry to unpack

    """

    # Counters and singletons hold exactly one value per entry. A set makes
    # the membership test below constant-time for every key.
    per_entry_keys = set(data["list_of_counters"])
    per_entry_keys.update(data["_SINGLETON_"])

    for key in event:
        if key in per_entry_keys:
            event[key] = data[key][n]

        elif "INDEX" not in key:
            offsets = data[data["datasets_and_indices"][key]]
            counts = data[data["datasets_and_counters"][key]]

            # Nothing to unpack from an empty file; leave the entry as is.
            if len(offsets) == 0 or len(counts) == 0:
                continue

            # Cast to Python ints: adding a signed offset to an unsigned
            # numpy counter yields a float, which cannot be used in a slice.
            start = int(offsets[n])
            nobjs = int(counts[n])
            event[key] = data[key][start : start + nobjs]


################################################################################
def get_nentries(filename):
    """Get the number of entries in the file.

    Args:

        **filename** (string): Name of the input file

    Returns:

        The value of the ``nentries`` attribute of the file, or ``None``
        (after printing a message) if the file has no such attribute.

    """

    # Only an attribute is read, so open read-only; the context manager
    # closes the file on every path, including when reading raises.
    with h5.File(filename, "r") as f:
        attrs = f.attrs
        if "nentries" in attrs:
            return attrs.get("nentries")

    print('\nFile does not contain the attribute, "nentries"\n')
    return None