Source code for choicemodels.mnl

from __future__ import print_function

import datetime
import logging
from collections import OrderedDict

import numpy as np
import pandas as pd
import pylogit
import scipy.optimize
import scipy.stats
from patsy import dmatrix
from statsmodels.iolib.table import SimpleTable

from .tools import MergedChoiceTable
from .tools import pmat
from .tools.pmat import PMAT


"""
#####################
NEW CLASS DEFINITIONS
#####################

"""


[docs]class MultinomialLogit(object): """ A class with methods for estimating multinomial logit discrete choice models. Each observation is a choice scenario in which a chooser selects one alternative from a choice set of two or more. The fitted parameters represent a joint optimization of utility expressions that explains observed choices based on attributes of the alternatives and of the choosers. The input data needs to be in "long" format, with one row for each combination of chooser and alternative. Columns contain relevant attributes and identifiers. (If the choice sets are large, sampling of alternatives should be carried out before data is passed to this class.) The class constructor supports two use cases: 1. The first use case is simpler and requires fewer inputs. Each choice scenario must have the same number of alternatives, and each alternative must have the same model expression (utility equation). This is typical when the alternatives are relatively numerous and homogenous, for example with travel destination choice or household location choice. The following parameters are required: 'data', 'observation_id_col', 'choice_col', 'model_expression' in Patsy format. If data is provided as a MergedChoiceTable, the observation id and choice column names can be read directly from its metadata. To fit this type of model, ChoiceModels will use its own estimation engine adapted from the UrbanSim MNL codebase. Migration from 'urbansim.urbanchoice': Note that these requirements differ from the old UrbanSim codebase in a couple of ways. (1) The chosen alternatives need to be indicated in a column of the estimation data table instead of in a separate matrix, and (2) in lieu of indicating the number of alternatives in each choice set, the estimation data table should include an observation id column. These changes make the API more consistent with other use cases. See the MergedChoiceTable() class for tools and code examples to help with migration. 2. The second use case is more flexible. Choice scenarios can have varying numbers of alternatives, and the model expression (utility equation) can be different for distinct alternatives. This is typical when there is a small number of alternatives whose salient characteristics vary, for example with travel mode choice. The following parameters are required: 'data', 'observation_id_col', 'alternative_id_col', 'choice_col', 'model_expression' in PyLogit format, 'model_labels' in PyLogit format (optional). To fit this type of model, ChoiceModels will use the PyLogit estimation engine. With either use case, the model expression can include attributes of both the choosers and the alternatives. Attributes of a particular alternative may vary for different choosers (distance, for example), but this must be set up manually in the input data. Note that prediction methods are in a separate class: see MultinomialLogitResults(). Parameters ---------- data : pd.DataFrame or choicemodels.tools.MergedChoiceTable A table of estimation data in "long" format, with one row for each combination of chooser and alternative. Column labeling must be consistent with the 'model_expression'. May include extra columns. model_expression : Patsy 'formula-like' or PyLogit 'specification' For the simpler use case where each choice scenario has the same number of alternatives and each alternative has the same model expression, this should be a Patsy formula representing the right-hand side of the single model expression. This can be a string or a number of other data types. See here: https://patsy.readthedocs.io/en/v0.1.0/API-reference.html#patsy.dmatrix For the more flexible use case where choice scenarios have varying numbers of alternatives or the model expessions vary, this should be a PyLogit OrderedDict model specification. See here: https://github.com/timothyb0912/pylogit/blob/master/pylogit/pylogit.py#L116-L130 observation_id_col : str, optional Name of column or index containing the observation id. This should uniquely identify each distinct choice scenario. Not required if data is passed as a MergedChoiceTable. choice_col : str, optional Name of column containing an indication of which alternative has been chosen in each scenario. Values should evaluate as binary: 1/0, True/False, etc. Not required if data is passed as a MergedChoiceTable. model_labels : PyLogit 'names', optional If the model expression is a PyLogit OrderedDict, you can provide a corresponding OrderedDict of labels. See here: https://github.com/timothyb0912/pylogit/blob/master/pylogit/pylogit.py#L151-L165 alternative_id_col : str, optional Name of column or index containing the alternative id. This is only required if the model expression varies for different alternatives. Not required if data is passed as a MergedChoiceTable. initial_coefs : numeric or list-like of numerics, optional Initial coefficients (beta values) to begin the optimization process with. Provide a single value for all coefficients, or an array containing a value for each one being estimated. If None, initial coefficients will be 0. weights : 1D array, optional NOT YET IMPLEMENTED - Estimation weights. """ def __init__(self, data, model_expression, observation_id_col=None, choice_col=None, model_labels=None, alternative_id_col=None, initial_coefs=None, weights=None): self._data = data self._model_expression = model_expression self._observation_id_col = observation_id_col self._alternative_id_col = alternative_id_col self._choice_col = choice_col self._model_labels = model_labels self._initial_coefs = initial_coefs self._weights = weights if isinstance(self._data, MergedChoiceTable): self._df = self._data.to_frame() self._observation_id_col = self._data.observation_id_col self._alternative_id_col = self._data.alternative_id_col self._choice_col = self._data.choice_col else: self._df = self._data if isinstance(self._model_expression, OrderedDict): self._estimation_engine = 'PyLogit' # parse initial_coefs if isinstance(self._initial_coefs, np.ndarray): pass elif isinstance(self._initial_coefs, list): self._initial_coefs = np.array(self._initial_coefs) elif (self._initial_coefs == None): self._initial_coefs = np.zeros(len(self._model_expression)) else: self._initial_coefs = np.repeat(self._initial_coefs, len(self._model_expression)) else: self._estimation_engine = 'ChoiceModels' self._numobs = self._df.reset_index()[[self._observation_id_col]].\ drop_duplicates().shape[0] self._numalts = self._df.shape[0] // self._numobs return
[docs] def fit(self): """ Fit the model using maximum likelihood estimation. Uses either the ChoiceModels or PyLogit estimation engine as appropriate. Returns ------- MultinomialLogitResults() object. """ if (self._estimation_engine == 'PyLogit'): m = pylogit.create_choice_model(data = self._df, obs_id_col = self._observation_id_col, alt_id_col = self._alternative_id_col, choice_col = self._choice_col, specification = self._model_expression, names = self._model_labels, model_type = 'MNL') m.fit_mle(init_vals = self._initial_coefs) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = m) elif (self._estimation_engine == 'ChoiceModels'): dm = dmatrix(self._model_expression, data=self._df) chosen = np.reshape(self._df[[self._choice_col]].values, (self._numobs, self._numalts)) log_lik, fit = mnl_estimate(np.array(dm), chosen, self._numalts) result_params = dict(log_likelihood = log_lik, fit_parameters = fit, x_names = dm.design_info.column_names) results = MultinomialLogitResults(estimation_engine = self._estimation_engine, model_expression = self._model_expression, results = result_params) return results
@property def estimation_engine(self): """ 'ChoiceModels' or 'PyLogit'. """ return self._estimation_engine
[docs]class MultinomialLogitResults(object): """ The results class represents a fitted model. It can report the model fit, generate choice probabilties, etc. A full-featured results object is returned by MultinomialLogit.fit(). A results object with more limited functionality can also be built directly from fitted parameters and a model expression. Parameters ---------- model_expression : str or OrderedDict Patsy 'formula-like' (str) or PyLogit 'specification' (OrderedDict). results : dict or object, optional Raw results as currently provided by the estimation engine. This should be replaced with a more consistent and comprehensive set of inputs. fitted_parameters : list of floats, optional If not provided, these will be extracted from the raw results. estimation_engine : str, optional 'ChoiceModels' (default) or 'PyLogit'. """ def __init__(self, model_expression, results=None, fitted_parameters=None, estimation_engine='ChoiceModels'): if (fitted_parameters is None) & (results is not None): if (estimation_engine == 'ChoiceModels'): fitted_parameters = results['fit_parameters']['Coefficient'].tolist() self.estimation_engine = estimation_engine self.model_expression = model_expression self.results = results self.fitted_parameters = fitted_parameters def __repr__(self): return self.report_fit() def __str__(self): return self.report_fit()
[docs] def get_raw_results(self): """ Return the raw results as provided by the estimation engine. Dict or object. """ return self.results
[docs] def probabilities(self, data): """ Generate predicted probabilities for a table of choice scenarios, using the fitted parameters stored in the results object. Parameters ---------- data : choicemodels.tools.MergedChoiceTable Long-format table of choice scenarios. TO DO - accept other data formats. Expected class parameters ------------------------- self.model_expression : patsy string self.fitted_parameters : list of floats Returns ------- pandas.Series with indexes matching the input """ # TO DO - make sure this handles pylogit case # TO DO - does MergedChoiceTable guarantee that alternatives for a single scenario # are consecutive? seems like a requirement here; should document it df = data.to_frame() numalts = data.sample_size # TO DO - make this an official MCT param dm = dmatrix(self.model_expression, data=df) # utility is sum of data values times fitted betas u = np.dot(self.fitted_parameters, np.transpose(dm)) # reshape so axis 0 lists alternatives and axis 1 lists choosers u = np.reshape(u, (numalts, u.size // numalts), order='F') # scale the utilities to make exponentiation easier # https://stats.stackexchange.com/questions/304758/softmax-overflow u = u - u.max(axis=0) exponentiated_utility = np.exp(u) sum_exponentiated_utility = np.sum(exponentiated_utility, axis=0) probs = exponentiated_utility / sum_exponentiated_utility # convert back to ordering of the input data probs = probs.flatten(order='F') df['prob'] = probs # adds indexes return df.prob
[docs] def report_fit(self): """ Print a report of the model estimation results. """ if (self.estimation_engine == 'PyLogit'): output = self.results.get_statsmodels_summary().as_text() elif (self.estimation_engine == 'ChoiceModels'): # Pull out individual results components ll = self.results['log_likelihood']['convergence'] ll_null = self.results['log_likelihood']['null'] rho_bar_squared = self.results['log_likelihood']['rho_bar_squared'] rho_squared = self.results['log_likelihood']['rho_squared'] df_model = self.results['log_likelihood']['df_model'] df_resid = self.results['log_likelihood']['df_resid'] num_obs = self.results['log_likelihood']['num_obs'] bic = self.results['log_likelihood']['bic'] aic = self.results['log_likelihood']['aic'] x_names = self.results['x_names'] coefs = self.results['fit_parameters']['Coefficient'].tolist() std_errs = self.results['fit_parameters']['Std. Error'].tolist() t_scores = self.results['fit_parameters']['T-Score'].tolist() p_values = self.results['fit_parameters']['P-Values'].tolist() def time_now(*args, **kwds): now = datetime.datetime.now() return now.strftime('%H:%M') def date_now(*args, **kwds): now = datetime.datetime.now() return now.strftime('%Y-%m-%d') (header, body) = summary_table(dep_var = 'chosen', model_name = 'Multinomial Logit', method = 'Maximum Likelihood', log_likelihood = ll, null_log_likelihood = ll_null, rho_squared = rho_squared, rho_bar_squared = rho_bar_squared, df_model = df_model, df_resid = df_resid, x_names = x_names, coefs = coefs, std_errs = std_errs, t_scores = t_scores, p_values = p_values, bic = bic, aic = aic, num_obs = num_obs, time = time_now(), date = date_now()) output = header.as_text() + '\n' + body.as_text() return output
def summary_table(title=None, dep_var='', model_name='', method='', date='', time='', aic=None, bic=None, num_obs=None, df_resid=None, df_model=None, rho_squared=None, rho_bar_squared=None, log_likelihood=None, null_log_likelihood=None, x_names=[], coefs=[], std_errs=[], t_scores=[],p_values=[], alpha=None): """ Generate a summary table of estimation results using Statsmodels SimpleTable. Still a work in progress. SimpleTable is maddening to work with, so it would be nice to find an alternative. It would need to support pretty-printing of formatted tables to plaintext and ideally also to HTML and Latex. At first it looked like we could use Statsmodels's summary table generator directly (iolib.summary.Summary), but this requires a Statsmodels results object as input and doesn't document which properties are pulled from it. PyLogit reverse engineered this for use in get_statsmodels_summary() -- so it's possible, but could be hard to maintain in the long run. We can't use PyLogit's summary table generator either. It requires a PyLogit model class as input, and we can't create one from results parameters. Oh well! """ def fmt(value, format_str): # Custom numeric->string formatter that gracefully accepts null values return '' if value is None else format_str.format(value) if (title is None): title = "CHOICEMODELS ESTIMATION RESULTS" top_left = [['Dep. Var.:', dep_var], ['Model:', model_name], ['Method:', method], ['Date:', date], ['Time:', time], ['AIC:', fmt(aic, "{:,.3f}")], ['BIC:', fmt(bic, "{:,.3f}")]] top_right = [['No. Observations:', fmt(num_obs, "{:,}")], ['Df Residuals:', fmt(df_resid, "{:,}")], ['Df Model:', fmt(df_model, "{:,}")], ['Pseudo R-squ.:', fmt(rho_squared, "{:.3f}")], ['Pseudo R-bar-squ.:', fmt(rho_bar_squared, "{:.3f}")], ['Log-Likelihood:', fmt(log_likelihood, "{:,.3f}")], ['LL-Null:', fmt(null_log_likelihood, "{:,.3f}")]] # Zip into a single table (each side needs same number of entries) header_cells = [top_left[i] + top_right[i] for i in range(len(top_left))] # See end of statsmodels.iolib.table.py for formatting options header_fmt = dict(table_dec_below = '', data_aligns = 'lrlr', colwidths = 11, colsep = ' ', empty_cell = '') header = SimpleTable(header_cells, title=title, txt_fmt=header_fmt) col_labels = ['coef', 'std err', 'z', 'P>|z|', 'Conf. Int.'] row_labels = x_names body_cells = [[fmt(coefs[i], "{:,.4f}"), fmt(std_errs[i], "{:,.3f}"), fmt(t_scores[i], "{:,.3f}"), fmt(p_values[i], "{:,.3f}"), ''] # conf int placeholder for i in range(len(x_names))] body_fmt = dict(table_dec_below = '=', header_align = 'r', data_aligns = 'r', colwidths = 7, colsep = ' ') body = SimpleTable(body_cells, headers = col_labels, stubs = row_labels, txt_fmt = body_fmt) # Ideally we'd want to append these into a single table, but I can't get it to work # without completely messing up the formatting.. return (header, body) """ ############################# ORIGINAL FUNCTION DEFINITIONS ############################# """ logger = logging.getLogger(__name__) # right now MNL can only estimate location choice models, where every equation # is the same # it might be better to use stats models for a non-location choice problem # data should be column matrix of dimensions NUMVARS x (NUMALTS*NUMOBVS) # beta is a row vector of dimensions 1 X NUMVARS def mnl_probs(data, beta, numalts): logging.debug('start: calculate MNL probabilities') clamp = data.typ == 'numpy' utilities = beta.multiply(data) if numalts == 0: raise Exception("Number of alternatives is zero") utilities.reshape(numalts, utilities.size() // numalts) # https://stats.stackexchange.com/questions/304758/softmax-overflow utilities = utilities.subtract(utilities.max(0)) exponentiated_utility = utilities.exp(inplace=True) if clamp: exponentiated_utility.inftoval(1e20) exponentiated_utility.clamptomin(1e-300) sum_exponentiated_utility = exponentiated_utility.sum(axis=0) probs = exponentiated_utility.divide_by_row( sum_exponentiated_utility, inplace=True) if clamp: probs.nantoval(1e-300) probs.clamptomin(1e-300) logging.debug('finish: calculate MNL probabilities') return probs def get_hessian(derivative): return np.linalg.inv(np.dot(derivative, np.transpose(derivative))) def get_standard_error(hessian): return np.sqrt(np.diagonal(hessian)) # data should be column matrix of dimensions NUMVARS x (NUMALTS*NUMOBVS) # beta is a row vector of dimensions 1 X NUMVARS def mnl_loglik(beta, data, chosen, numalts, weights=None, lcgrad=False, stderr=0): logger.debug('start: calculate MNL log-likelihood') numvars = beta.size numobs = data.size() // numvars // numalts beta = np.reshape(beta, (1, beta.size)) beta = PMAT(beta, data.typ) probs = mnl_probs(data, beta, numalts) # lcgrad is the special gradient for the latent class membership model if lcgrad: assert weights gradmat = weights.subtract(probs).reshape(probs.size(), 1) gradarr = data.multiply(gradmat) else: if not weights: gradmat = chosen.subtract(probs).reshape(probs.size(), 1) else: gradmat = chosen.subtract(probs).multiply_by_row( weights).reshape(probs.size(), 1) gradarr = data.multiply(gradmat) if stderr: gradmat = data.multiply_by_row(gradmat.reshape(1, gradmat.size())) gradmat.reshape(numvars, numalts * numobs) return get_standard_error(get_hessian(gradmat.get_mat())) chosen.reshape(numalts, numobs) if weights is not None: if probs.shape() == weights.shape(): loglik = ((probs.log(inplace=True) .element_multiply(weights, inplace=True) .element_multiply(chosen, inplace=True)) .sum(axis=1).sum(axis=0)) else: loglik = ((probs.log(inplace=True) .multiply_by_row(weights, inplace=True) .element_multiply(chosen, inplace=True)) .sum(axis=1).sum(axis=0)) else: loglik = (probs.log(inplace=True).element_multiply( chosen, inplace=True)).sum(axis=1).sum(axis=0) if loglik.typ == 'numpy': loglik, gradarr = loglik.get_mat(), gradarr.get_mat().flatten() else: loglik = loglik.get_mat()[0, 0] gradarr = np.reshape(gradarr.get_mat(), (1, gradarr.size()))[0] logger.debug('finish: calculate MNL log-likelihood') return -1 * loglik, -1 * gradarr def mnl_estimate(data, chosen, numalts, GPU=False, coeffrange=(-1000, 1000), weights=None, lcgrad=False, beta=None): """ Calculate coefficients of the MNL model. Parameters ---------- data : 2D array The data are expected to be in "long" form where each row is for one alternative. Alternatives are in groups of `numalts` rows per choosers. Alternatives must be in the same order for each chooser. chosen : 2D array This boolean array has a row for each chooser and a column for each alternative. The column ordering for alternatives is expected to be the same as their row ordering in the `data` array. A one (True) indicates which alternative each chooser has chosen. numalts : int The number of alternatives. GPU : bool, optional coeffrange : tuple of floats, optional Limits of (min, max) to which coefficients are clipped. weights : ndarray, optional lcgrad : bool, optional beta : 1D array, optional Any initial guess for the coefficients. Returns ------- log_likelihood : dict Dictionary of log-likelihood values describing the quality of the model fit. The following keys is entered into `log_likelihood`. aic : float Akaike information criterion for an estimated model. aic = -2 * log_likelihood + 2 * num_estimated_parameters bic : float Bayesian information criterion for an estimated model. bic = -2 * log_likelihood + log(num_observations) * num_parameters num_obs : int Number of observations in the model. df_model : int The number of parameters estimated in this model. df_resid : int The number of observations minus the number of estimated parameters. llnull : float Value of the constant-only loglikelihood ll : float Value of the loglikelihood rho_squared : float McFadden's pseudo-R-squared. rho_squared = 1.0 - final_log_likelihood / null_log_likelihood rho_bar_squared : float rho_bar_squared = 1.0 - ((final_log_likelihood - num_est_parameters) / null_log_likelihood) fit_parameters : pandas.DataFrame Table of fit parameters with columns 'Coefficient', 'Std. Error', 'T-Score', and 'P-Values. Each row corresponds to a column in `data` and are given in the same order as in `data`. See Also -------- scipy.optimize.fmin_l_bfgs_b : The optimization routine used. """ logger.debug( 'start: MNL fit with len(data)={} and numalts={}'.format( len(data), numalts)) atype = 'numpy' if not GPU else 'cuda' numvars = data.shape[1] numobs = data.shape[0] // numalts if chosen is None: chosen = np.ones((numobs, numalts)) # used for latent classes data = np.transpose(data) chosen = np.transpose(chosen) data, chosen = PMAT(data, atype), PMAT(chosen, atype) if weights is not None: weights = PMAT(np.transpose(weights), atype) if beta is None: beta = np.zeros(numvars) bounds = [coeffrange] * numvars # scipy optimization for MNL fit logger.debug('start: scipy optimization for MNL fit') args = (data, chosen, numalts, weights, lcgrad) bfgs_result = scipy.optimize.fmin_l_bfgs_b(mnl_loglik, beta, args=args, fprime=None, factr=10, approx_grad=False, bounds=bounds) logger.debug('finish: scipy optimization for MNL fit') if bfgs_result[2]['warnflag'] > 0: logger.warning("mnl did not converge correctly: %s", bfgs_result) beta = bfgs_result[0] stderr = mnl_loglik( beta, data, chosen, numalts, weights, stderr=1, lcgrad=lcgrad) l0beta = np.zeros(numvars) l0 = -1 * mnl_loglik(l0beta, *args)[0] l1 = -1 * mnl_loglik(beta, *args)[0] ll = float(l1[0][0]) ll_null = float(l0[0][0]) rho_squared = 1.0 - ll / ll_null rho_bar_squared = 1.0 - ((ll - len(beta)) /ll_null) num_obs = numobs df_resid = numobs - len(beta) p_values = 2 * scipy.stats.norm.sf(np.abs(beta / stderr)) bic = -2 * ll + np.log(numobs) * len(beta) aic = -2 * ll + 2 * len(beta) log_likelihood = { 'null': float(l0[0][0]), 'convergence': float(l1[0][0]), 'ratio': float((1 - (l1 / l0))[0][0]), 'rho_bar_squared': rho_bar_squared, 'rho_squared': rho_squared, 'df_model': len(beta), 'df_resid': df_resid, 'num_obs':numobs, 'bic':bic, 'aic':aic} fit_parameters = pd.DataFrame({ 'Coefficient': beta, 'Std. Error': stderr, 'T-Score': beta / stderr, 'P-Values' : p_values }) logger.debug('finish: MNL fit') return log_likelihood, fit_parameters