Source code for h5hep.read

import h5py as h5
import numpy as np

################################################################################
def load(filename=None,verbose=False,desired_datasets=None,subset=None):

    """ Reads all, or a subset of the data, from the HDF5 file to fill a data dictionary.
    Also returns an empty event dictionary to be filled later with select events.

    Args:
        **filename** (string): Name of the input file

        **verbose** (boolean): True if debug output is required

        **desired_datasets** (list): Datasets to be read from input file. An
        entry of the file is kept when any of these strings is a substring of
        its name. A single string is treated as a one-element list.

        **subset** (int or pair of ints): Number of events to be read from the
        start of the input file, or a ``(first, last)`` range of events
        (``last`` is exclusive).

    Returns:
        **ourdata** (dict): Selected data from HDF5 file

        **event** (dict): An empty event dictionary to be filled by individual events

        ``None`` is returned (instead of the pair) when no filename is given.

    """

    if filename is None:
        print("No filename passed in! Can't open file.\n")
        return None

    first = last = None
    if subset is not None:
        if isinstance(subset, (int, np.integer)):
            subset = (0,subset)
        first, last = subset[0], subset[1]

    # A bare string would otherwise be split into single characters by list().
    if desired_datasets is not None:
        if isinstance(desired_datasets, str):
            desired_datasets = [desired_datasets]
        elif not isinstance(desired_datasets, list):
            desired_datasets = list(desired_datasets)

    ourdata = {}
    ourdata['datasets_and_counters'] = {}
    ourdata['datasets_and_indices'] = {}
    ourdata['list_of_counters'] = []
    ourdata['all_datasets'] = []

    event = {}

    # The file is only read, so open it read-only; the context manager makes
    # sure it is closed even if one of the reads below raises.
    with h5.File(filename,'r') as f:

        ourdata['nentries'] = f.attrs['nentries']
        if subset is not None:
            ourdata['nentries'] = last - first

        # Get the datasets and counters
        for vals in f['datasets_and_counters']:
            # The decode is there because vals were stored as numpy.bytes
            dataset = vals[0].decode()
            counter = vals[1].decode()
            ourdata['datasets_and_counters'][dataset] = counter
            ourdata['datasets_and_indices'][dataset] = "%s_INDEX" % (counter)
            ourdata['list_of_counters'].append(counter)
            ourdata['all_datasets'].append(dataset)
            ourdata['all_datasets'].append(counter) # Get the counters as well

        # We may have added some strings (like counters) multiple times.
        ourdata['list_of_counters'] = np.unique(ourdata['list_of_counters']).tolist()
        ourdata['all_datasets'] = np.unique(ourdata['all_datasets']).tolist()

        # Pull out the SINGLETON datasets
        sg = f['_SINGLETONGROUP_'][0] # This is a numpy array of strings
        singletons = sg[1].decode().split("__:__")
        if 'INDEX' in singletons:
            singletons.remove('INDEX')
        ourdata['_SINGLETON_'] = singletons

        # This is the same list object as ourdata['all_datasets'], so the
        # filtering below is also reflected in the returned dictionary.
        entries = ourdata['all_datasets']

        ########################################################
        # Only keep select data from file
        ########################################################
        if desired_datasets is not None:
            kept = []
            for entry in entries:
                if any(desdat in entry for desdat in desired_datasets):
                    kept.append(entry)
                else:
                    print("Not reading out %s from the file...." % (entry))
            entries[:] = kept
        #######################################################

        if verbose:
            print("Datasets and counters:")
            print(ourdata['datasets_and_counters'])
            print("\nDatasets and indices:")
            print(ourdata['datasets_and_indices'])
            print("\nList of counters:")
            print(ourdata['list_of_counters'])

        # Pull out the counters first and build the indices
        print("Building the indices...")
        # For a subset: counter name -> (first object, one past last object)
        # covered by the selected events in the datasets using that counter.
        object_ranges = {}
        for name in ourdata['list_of_counters']:
            counts = f[name][:]
            if subset is not None:
                # Objects belonging to the skipped events come first in the file.
                lo = int(np.sum(counts[:first]))
                counts = counts[first:last]
                object_ranges[name] = (lo, lo + int(np.sum(counts)))
            ourdata[name] = counts

            # index[i] is where the objects of event i start: the running sum
            # of the counters of all previous (selected) events.
            index = np.zeros(len(counts),dtype=int)
            index[1:] = np.cumsum(counts[:-1])
            ourdata["%s_INDEX" % (name)] = index
        print("Built the indices!")

        counter_names = set(ourdata['list_of_counters'])
        singleton_names = set(ourdata['_SINGLETON_'])

        # Loop over the entries we want and pull out the data.
        for name in entries:

            data = f[name]
            if verbose:
                print(data)

            # Groups are listed too, but only datasets hold values.
            if not isinstance(data, h5.Dataset):
                continue

            if name in counter_names:
                pass # Already read while building the indices
            elif subset is None:
                ourdata[name] = data[:]
            elif name in singleton_names:
                # One value per event, so the event range applies directly.
                ourdata[name] = data[first:last]
            else:
                # Several objects per event: slice by object range, so that
                # it matches the indices rebuilt from the subset counters.
                lo, hi = object_ranges[ourdata['datasets_and_counters'][name]]
                ourdata[name] = data[lo:hi]

            event[name] = None # This will be filled for individual events

    print("Data is read in and input file is closed.")

    return ourdata,event
################################################################################


################################################################################
def unpack(event,data,n=0):

    """ Fills the event dictionary with the values belonging to one entry.

    Counter and singleton datasets contribute the single value stored for
    entry ``n``; every other dataset contributes the slice of objects that
    its index and counter arrays assign to entry ``n``. Keys containing
    "INDEX" are skipped. The event dictionary is modified in place and
    nothing is returned.

    Args:

        **event** (dict): Event dictionary to be filled

        **data** (dict): Data dictionary used to fill the event dictionary

        **n** (int): Position of the entry to unpack (default 0)

    """

    counters = data['list_of_counters']
    singletons = data['_SINGLETON_']

    for key in event:

        if key in counters or key in singletons:
            event[key] = data[key][n]

        elif "INDEX" not in key:
            index_array = data[data['datasets_and_indices'][key]]
            count_array = data[data['datasets_and_counters'][key]]

            # Both arrays are needed for the slice; checking them together
            # avoids using a start offset left over from a previous key
            # (or none at all) when only the index array is empty.
            if len(index_array)>0 and len(count_array)>0:
                start = index_array[n]
                event[key] = data[key][start:start+count_array[n]]

################################################################################
def get_nentries(filename):

    """ Get the number of entries in the file.

    Args:
        **filename** (string): Name of the input file

    Returns:
        The value of the file's ``nentries`` attribute, or ``None`` (after
        printing a message) if the file does not have that attribute.

    """

    # Only an attribute is read, so read-only access is enough; the context
    # manager closes the file on every path, including errors.
    with h5.File(filename,'r') as f:
        attributes = f.attrs
        if 'nentries' in attributes:
            return attributes.get('nentries')

        print("\nFile does not contain the attribute, \"nentries\"\n")
        return None