# Source code for data_processing

"""
Tools for reading and processing the sensitivity analysis data files.
Some of the files are specific to our project (`input_parameters.csv` and
`results.csv`), but the sensitivity analysis results are expected in the
standard format that SALib writes for a Sobol analysis.

Our data files are stored outside this repository because they are too large,
so users need to specify the path to their data.
"""
import os

import pandas as pd


def read_file(path, numrows=None, drop=False, sep=','):
    """
    Read a delimited file of input parameters or model results.

    The first line of the file should contain headers corresponding to the
    column names.

    Parameters
    ----------
    path : str
        the complete filename, including absolute or relative path.
    numrows : int, optional
        number of rows of the file to read.  If you don't specify this
        parameter all rows will be read.
    drop : list, optional
        list of strings naming the columns you do not want to include in
        the resulting dataframe (ex. ['cats', 'dogs']).  Any falsy value
        (the default ``False``, ``None`` or an empty list) keeps every
        column.
    sep : str, optional
        string indicating the column separator in the file (default = ',').

    Returns
    --------
    df : pandas dataframe
        A pandas dataframe with the contents of the file, limited to the
        number of rows specified and without the columns named in "drop".

    Raises
    ------
    KeyError
        Raised by pandas if a name in `drop` is not a column of the file.
    """
    df = pd.read_csv(path, sep=sep, nrows=numrows)
    # Only drop when columns were actually requested.  The previous test was
    # inverted (`if not drop`), which ignored a real drop list and tried to
    # drop the label `False` when nothing was requested.
    if drop:
        df = df.drop(drop, axis=1)
    return df
def get_params(path='./input_parameters.csv', numrows=None,
               drop=['End_time', 'Oxygen']):
    """
    Load the input parameter sets used for the sensitivity analysis.

    NOTE: This function is specific to our lignin modeling dataset and is
    not needed for the visualization features of savvy.

    By default the 'End_time' and 'Oxygen' columns are left out, because
    they were not analyzed for sensitivity.  Pass ``drop=False`` if you
    would like all of the parameters, even those not analyzed.

    Parameters
    ----------
    path : str, optional
        string containing the path to the parameters csv.
    numrows : int, optional
        the number of rows of the input_parameters file to read (default is
        to read all rows).
    drop : list, optional
        a list of strings naming the parameters you do not want to include
        in the returned dataframe.  Forwarded unchanged to `read_file`.

    Returns
    -------
    pandas dataframe
    """
    parameters = read_file(path, numrows=numrows, drop=drop)
    return parameters
def get_results(path='./results.csv', numrows=None,
                drop=['light_aromatic_C-C', 'light_aromatic_methoxyl']):
    """
    Load the simulation results for the parameter sets in
    `input_parameters.csv`.

    NOTE: This function is specific to our lignin modeling dataset and is
    not needed for the visualization features of savvy.

    By default two unused functional groups ('light_aromatic_C-C' and
    'light_aromatic_methoxyl') are left out of the returned dataframe.
    Use ``drop=False`` if you want all outputs.

    Parameters
    ----------
    path : str, optional
        the path to the results csv file.
    numrows : int, optional
        the number of rows of the results file to read (default is to read
        all rows).
    drop : list, optional
        a list of strings naming the output measures to drop from the
        returned dataframe.  Forwarded unchanged to `read_file`.

    Returns
    -------
    pandas dataframe
    """
    results = read_file(path, numrows=numrows, drop=drop)
    return results
def _floor_negative_indices(df, column, floor=0.0001):
    """
    Replace negative values of `column` with `floor`, modifying `df` in place.

    For the affected rows the matching ``<column>_conf`` value is reduced by
    the size of the shift (``conf + value - floor``) so the confidence
    interval accounts for the moved sensitivity value.
    """
    negative = df[column] < 0
    conf_column = column + '_conf'
    # The confidence update reads the original (negative) index value, so it
    # has to happen before that value is overwritten.
    df.loc[negative, conf_column] = df[conf_column] + df[column] - floor
    df.loc[negative, column] = floor


def get_sa_data(path='.'):
    """
    This function reads and processes all the sensitivity analysis results
    in a specified folder and returns a dictionary with the corresponding
    dataframes for first/total order sensitivity indices and second order
    indices (if present).

    Sensitivity analysis results should be in the default SALib output
    format (space separated) and the file names must start with the word
    'analysis'.  The dictionary key for a file is its name with the first
    nine characters (i.e. 'analysis_') and the '.txt' extension removed.

    Negative sensitivity indices are replaced by 0.0001 and their confidence
    values are adjusted accordingly.  All negative values appear to be close
    to zero already; they are the result of machine precision issues or of
    setting n too low when generating parameter sets.  To properly correct
    this you should re-run your model with a greater n, but sometimes that
    is too expensive, so this is a hack to allow displaying them in a
    logical way.

    NOTE: two file names ('analysis_light_aromatic-C-C.txt' and
    'analysis_light_aromatic-methoxyl.txt') are always skipped.  This is
    specific to our lignin modeling dataset; future users can remove or
    modify the `excluded` tuple to use this with other datasets.

    Parameters
    -----------
    path : str, optional
        String containing the relative or absolute path of the directory
        where analysis_*.txt files are stored.  A trailing path separator is
        not required.  There cannot be any files or folders within this
        directory that start with 'analysis' except those generated by the
        SALib sensitivity analysis.  All `analysis*` files in this path
        should correspond to outputs from one sensitivity analysis project,
        and if second order sensitivity indices are included in any of the
        files they should be present in all the others.

    Returns
    --------
    sens_dfs : dict
        Dictionary where keys are the names of the various output measures
        (one output measure per analysis file in the folder specified by
        path).  Dictionary values are a list of pandas dataframes.

        sens_dfs['key'][0] is a dataframe with the first and total order
        indices of all the parameters with respect to the "key" output
        variable.  'rxn' in its Parameter column is replaced by 'k'
        (ignoring case) for consistency with the inputs file.

        sens_dfs['key'][1] is a dataframe with the second order indices for
        pairs of parameters (if second order indices are present in the
        analysis file).  If there are no second order results in the
        analysis file then this value is a boolean, False.
    """
    # These two functional groups are not present in the light oil fraction
    excluded = ('analysis_light_aromatic-C-C.txt',
                'analysis_light_aromatic-methoxyl.txt')
    filenames = [filename for filename in os.listdir(path)
                 if filename.startswith('analysis')
                 and filename not in excluded]

    # Make a dictionary where keys are the different output measures
    # (one for each analysis file) and values are lists of dataframes
    # with the first/total analysis results, and the second order results.
    sens_dfs = {}
    for filename in filenames:
        name = filename[9:].replace('.txt', '')
        # os.path.join works whether or not `path` ends with a separator;
        # plain concatenation broke for paths such as the default '.'.
        full_path = os.path.join(path, filename)

        with open(full_path) as result:
            lines = result.readlines()

        # Index of the blank line that separates the first/total indices
        # from the second order indices (None if there is no such line).
        split_at = next((number for number, line in enumerate(lines)
                         if line.startswith('\n')), None)

        if split_at:
            # Line 0 is the header, so there are split_at - 1 data rows
            # before the blank line.
            first_total = pd.read_csv(full_path, sep=' ',
                                      nrows=split_at - 1)
            second_order = pd.read_csv(full_path, sep=' ',
                                       skiprows=split_at)
        else:
            # No second order indices in this file
            first_total = pd.read_csv(full_path, sep=' ')
            second_order = False

        # Deal with negative values (see the docstring for the rationale).
        _floor_negative_indices(first_total, 'S1')
        _floor_negative_indices(first_total, 'ST')
        if isinstance(second_order, pd.DataFrame):
            _floor_negative_indices(second_order, 'S2')

        # Change 'rxn' to 'k' for consistency with inputs file.  regex=True
        # is passed explicitly so the case-insensitive match does not depend
        # on the pandas version's default for `regex`.
        first_total['Parameter'] = first_total['Parameter'].str.replace(
            'rxn', 'k', case=False, regex=True)

        sens_dfs[name] = [first_total, second_order]
    return sens_dfs
def find_unimportant_params(header='ST', path='.'):
    """
    This function finds which parameters have sensitivities and confidence
    intervals equal to exactly 0.0 for every output measure, which means
    those parameters have no role in influencing the output variance for
    any of the calculated output measures.  These parameters could be
    considered for removal from the model (although it is possible they
    might play a role in other, unsaved outputs).

    A summary of the result is also printed to standard output.

    Parameters
    -----------
    header : str, optional
        string of the column header for the sensitivity index you choose
        ('ST' or 'S1').
    path : str, optional
        string with the path to the folder where your analysis files are
        located.

    Returns
    --------
    unimportant : list
        a sorted list of the parameters that don't matter for these
        outputs.  Empty if no analysis files are found in `path`.

    Raises
    ------
    ValueError
        If `header` is neither 'ST' nor 'S1'.
    """
    if header not in ('ST', 'S1'):
        raise ValueError('header must be ST or S1')

    conf_header = '%s_conf' % header
    sa_dict = get_sa_data(path)

    # One set per output measure, holding the parameters whose index and
    # confidence are both exactly zero for that measure.
    zero_sets = []
    for dataframes in sa_dict.values():
        df = dataframes[0]
        is_zero = (df[header] == 0.0) & (df[conf_header] == 0.0)
        zero_sets.append(set(df.loc[is_zero, 'Parameter'].values.tolist()))

    # A parameter is unimportant only if it is zero for *every* output.
    # Guard against an empty folder, which used to raise IndexError.
    if zero_sets:
        unimportant = sorted(set.intersection(*zero_sets))
    else:
        unimportant = []

    summary = ('The following %s parameters have %s==0 for all outputs:\n'
               % (len(unimportant), header))
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print('%s%s \n' % (summary, unimportant))
    return unimportant
# THIS IS A FUNCTION THAT MIGHT BE USEFUL IN THE FUTURE
# BUT I HAVEN'T WRITTEN YET.
# def combine_sens(order):
#     """
#     STILL WORKING ON WRITING THIS FUNCTION
#
#     This function creates a pandas dataframe that has all the sensitivity
#     indices and confidence values of a specified order (first, total) from
#     every output measure.
#
#     The output of this function can be used to plot the sensitivity indices
#     of all of the output measures for a given input parameter.
#
#     Parameters
#     -----------
#     order : String indicating which order indices to combine (first, total)
#
#     Returns
#     --------
#
#     """