# Source code for data_processing

"""
Tools for reading and processing the sensitivity analysis data files.
Some of the files are specific to our project (`input_parameters.csv` and
`results.csv`), but the sensitivity analysis results are expected in the
standard format that SALib produces for a Sobol analysis.

Our data files are stored outside this repository because they are too large,
so users need to specify the path to their data.
"""
import os

import pandas as pd


def read_file(path, numrows=None, drop=False, sep=','):
    """
    Read a delimited text file of input parameters or model results
    and return its contents as a pandas dataframe.
    The first line of the input should contain headers
    corresponding to the column names.

    Parameters
    ----------
    path      : str
                the complete filename, including
                absolute or relative path.
    numrows   : int, optional
                number of rows of the file to read.
                If you don't specify this parameter all rows
                will be read.
    drop      : list, optional
                list of strings indicating which (if any)
                of the named columns you do not want to include
                in the resulting dataframe. (ex. ['cats', 'dogs'],
                default is not to drop any columns).  Any falsy value
                (False, None, an empty list) keeps every column.
    sep       : str
                string indicating the column separator in the
                file (optional, default = ',').

    Returns
    --------
    df : pandas dataframe
         A pandas dataframe with the contents of the file,
         limited to the number of rows specified and without the
         columns named in "drop".
    """

    df = pd.read_csv(path, sep=sep, nrows=numrows)
    # Only drop when the caller actually named columns; the default
    # (False) means "keep everything".
    if drop:
        df = df.drop(drop, axis=1)

    return df


def get_params(path='./input_parameters.csv',
               numrows=None, drop=['End_time', 'Oxygen']):
    """
    NOTE: This function is specific to our lignin modeling dataset
          and is not needed for the visualization features of savvy

    Read the parameters csv through `read_file` and return it as a
    pandas dataframe.  By default the `End_time` and `Oxygen` columns
    are requested to be left out, since they were not analyzed for
    sensitivity.  If you would like all of the parameters (even those
    not analyzed for sensitivity) then pass drop=False.

    Parameters
    ----------
    path    : str, optional
              string containing the path to the parameters csv.
    numrows : int, optional
              the number of rows of the input_parameters file to read
              (default is to read all rows).
    drop    : list, optional
              a list of strings for which parameters you do not want to
              include in the returned dataframe.  If you want all params
              then pass drop=False.

    Returns
    -------
    pandas dataframe

    """

    # The default list is shared between calls; it is only passed along
    # here and must never be modified in place.
    return read_file(path, numrows=numrows, drop=drop)


def get_results(path='./results.csv',
                numrows=None, drop=['light_aromatic_C-C',
                                    'light_aromatic_methoxyl']):
    """
    NOTE: This function is specific to our lignin modeling dataset
          and is not needed for the visualization features of savvy

    Read the results csv through `read_file` and return it as a pandas
    dataframe.  The file holds the results of running all of the
    simulations for the parameter sets in `input_parameters.csv`.  By
    default two unused functional groups are requested to be left out.

    Parameters
    ----------
    path    : str, optional
              the path to the results csv file.
    numrows : int, optional
              the number of rows of the results file to read
              (default is to read all rows).
    drop    : list, optional
              a list of strings for which output measures to drop from
              the returned dataframe.  If you want all outputs use
              drop=False.

    Returns
    -------
    pandas dataframe
    """

    # The default list is shared between calls; it is only passed along
    # here and must never be modified in place.
    return read_file(path, numrows=numrows, drop=drop)


def _second_order_start(lines):
    """
    Return the index of the blank line separating the first/total order
    table from the second order table, or None when the file holds no
    second order table.
    """
    for idx, line in enumerate(lines):
        if line.startswith('\n'):
            # The separator only counts when a first/total table precedes
            # it and there is something to read after it.
            if idx > 0 and any(rest.strip() for rest in lines[idx + 1:]):
                return idx
            return None
    return None


def _floor_negative(df, column, floor=0.0001):
    """
    Replace negative values in `column` by `floor` (in place) and shift
    the matching `<column>_conf` entries by the same amount.
    """
    negative = df[column] < 0
    if not negative.any():
        return
    conf = column + '_conf'
    # adjust confidence interval to account for shifting sensitivity value
    df.loc[negative, conf] = (df.loc[negative, conf] +
                              df.loc[negative, column] - floor)
    df.loc[negative, column] = floor


def get_sa_data(path='.'):
    """
    This function reads and processes all the sensitivity analysis results
    in a specified folder and returns a dictionary with the corresponding
    dataframes for first/total order sensitivity indices and second order
    indices (if present).

    Sensitivity analysis results should be in the default SALib output
    format and must start with the word 'analysis'.

    NOTE: the loop at the beginning of this function that removes two
    `analysis_light_aromatic-*` files is specific to our lignin modeling
    dataset.  Future users can remove or modify it to use this function
    with other datasets.

    Parameters
    -----------
    path : str, optional
           String containing the relative or absolute path of the directory
           where analysis_*.txt files are stored (with or without a
           trailing path separator).  There cannot be any
           files or folders within this directory that start with 'analysis'
           except those generated by the SALib sensitivity analysis.  All
           `analysis*` files in this path should correspond to outputs from
           one sensitivity analysis project, and if second order sensitivity
           indices are included in any of the files they should be present in
           all the others.

    Returns
    --------
    sens_dfs : dict
               Dictionary where keys are the names of the various output
               measures (one output measure per analysis file in the folder
               specified by path).  Dictionary values are a list of pandas
               dataframes.

               sens_dfs['key'][0] is a dataframe with the first and total
               order indices of all the parameters with respect to the "key"
               output variable.

               sens_dfs['key'][1] is a dataframe with the second order
               indices for pairs of parameters (if second order indices are
               present in the analysis file).  If there are no second order
               results in the analysis file then this value is a boolean,
               False.
    """

    filenames = [filename for filename in os.listdir(path)
                 if filename.startswith('analysis')]

    # These two functional groups are not present in the light oil fraction
    for unused in ('analysis_light_aromatic-C-C.txt',
                   'analysis_light_aromatic-methoxyl.txt'):
        if unused in filenames:
            filenames.remove(unused)

    # Make a dictionary where keys are the different output measures
    # (one for each analysis file) and values are lists of dataframes
    # with the first/total analysis results, and the second order results.
    sens_dfs = {}
    for filename in filenames:
        # strip the leading 'analysis_' and the '.txt' extension
        name = filename[9:].replace('.txt', '')
        # join properly so `path` works with or without a trailing separator
        filepath = os.path.join(path, filename)

        with open(filepath) as result:
            lines = result.readlines()
        # find the line number in the file where 2nd order results appear
        split_at = _second_order_start(lines)

        if split_at is not None:
            # line 0 is the header, so the first table has split_at - 1 rows
            first_total = pd.read_csv(filepath, sep=' ',
                                      nrows=(split_at - 1))
            second = pd.read_csv(filepath, sep=' ', skiprows=split_at)
        else:
            first_total = pd.read_csv(filepath, sep=' ')
            second = False

        # Deal with negative values.  All negative values appear to be close
        # to zero already; they are the result of machine precision issues or
        # setting n too low when generating parameter sets.  To properly
        # correct this issue you should re-run your model with n greater,
        # but sometimes that is too expensive so this is a hack to allow
        # display of them in a logical way.
        _floor_negative(first_total, 'S1')
        _floor_negative(first_total, 'ST')
        if isinstance(second, pd.DataFrame):
            _floor_negative(second, 'S2')

        # Change 'rxn' to 'k' for consistency with inputs file
        first_total['Parameter'] = (first_total['Parameter']
                                    .str.replace('rxn', 'k', case=False))

        sens_dfs[name] = [first_total, second]

    return sens_dfs


def find_unimportant_params(header='ST', path='.'):
    """
    This function finds which parameters have sensitivities and confidence
    intervals equal to exactly 0.0, which means those parameters have no
    role in influencing the output variance for any of the calculated output
    measures.

    These parameters could be considered for removal from the model
    (although it is possible they might play a role in other, unsaved
    outputs)

    Parameters
    -----------
    header : str, optional
             string of the column header for the sensitivity index you choose
             (must be 'ST' or 'S1').
    path   : str, optional
             string with the path to the folder where your analysis files
             are located.

    Returns
    --------
    unimportant : list
                  a sorted list of the parameters that don't matter for
                  these outputs (empty when no analysis files are found).

    Raises
    ------
    ValueError
        if header is neither 'ST' nor 'S1'.
    """

    if header not in ('ST', 'S1'):
        raise ValueError('header must be ST or S1')

    conf_header = '%s_conf' % header
    zero_params = []
    sa_dict = get_sa_data(path)
    for key in sa_dict:
        df = sa_dict[key][0]
        is_zero = (df[header] == 0.0) & (df[conf_header] == 0.0)
        zero_params.append(df.loc[is_zero, 'Parameter'].values.tolist())

    # Keep only the parameters that are zero for every output measure;
    # with no analysis files there is nothing to intersect.
    result = set(zero_params[0]) if zero_params else set()
    for s in zero_params[1:]:
        result.intersection_update(s)
    unimportant = sorted(result)

    # A single pre-formatted string keeps this valid under both the
    # Python 2 print statement and the Python 3 print function.
    print('The following %s parameters have %s==0 for all outputs:\n %s \n'
          % (len(unimportant), header, unimportant))
    return unimportant


# THIS IS A FUNCTION THAT MIGHT BE USEFUL IN THE FUTURE,
# BUT IT HAS NOT BEEN WRITTEN YET.
# def combine_sens(order):
#     """
#     STILL WORKING ON WRITING THIS FUNCTION
#
#     This function creates a pandas dataframe that has all the sensitivity
#     indices and confidence values of a specified order (first, total) from
#     every output measure.
#
#     The output of this function can be used to plot the sensitivity indices
#     of all of the output measures for a given input parameter.
#
#     Parameters
#     -----------
#     order : String indicating which order indices to combine (first, total)
#
#     Returns
#     --------
#
#     """