"""
Source code for urbansim.models.dcm

Use the ``MNLDiscreteChoiceModel`` class to train a choice model using
multinomial logit and make subsequent choice predictions.

"""

from __future__ import print_function, division

import abc
import logging

import numpy as np
import pandas as pd
from patsy import dmatrix
from prettytable import PrettyTable
import toolz as tz

from . import util
from ..exceptions import ModelEvaluationError
from ..urbanchoice import interaction, mnl
from ..utils import yamlio
from ..utils.logutil import log_start_finish

logger = logging.getLogger(__name__)

def unit_choice(chooser_ids, alternative_ids, probabilities):
    """
    Assign alternatives to choosers by weighted sampling without replacement.

    The choice is binary: every alternative can be handed out at most once.

    Parameters
    ----------
    chooser_ids : 1d array_like
        IDs of the agents that are making choices.
    alternative_ids : 1d array_like
        IDs of the alternatives among which agents are choosing.
    probabilities : 1d array_like
        Probability that an agent will choose each alternative.
        Must be the same shape as `alternative_ids`. Unavailable
        alternatives should carry a probability of 0.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Choosers that could not
        be given an alternative (because there were not enough available
        alternatives for everyone) map to nan.

    """
    chooser_ids = np.asanyarray(chooser_ids)
    alternative_ids = np.asanyarray(alternative_ids)
    probabilities = np.asanyarray(probabilities)

    logger.debug(
        'start: unit choice with {} choosers and {} alternatives'.format(
            len(chooser_ids), len(alternative_ids)))

    # starts out all nan; filled in below for the choosers that get a unit
    choices = pd.Series(index=chooser_ids)

    total = probabilities.sum()
    if total == 0:
        # nothing is available, so nobody gets anything
        return choices

    # np.random.choice requires weights that sum to 1
    weights = probabilities / total

    # we can hand out at most one unit per chooser and at most as many
    # units as have a non-zero weight
    n_available = np.count_nonzero(weights)
    n_to_choose = min(len(chooser_ids), n_available)

    chosen = np.random.choice(
        alternative_ids, size=n_to_choose, replace=False, p=weights)

    # when the available units are the limiting factor, draw which
    # choosers actually receive one
    if n_to_choose == n_available:
        chooser_ids = np.random.choice(
            chooser_ids, size=n_to_choose, replace=False)

    choices[chooser_ids] = chosen

    logger.debug('finish: unit choice')
    return choices
# The minimum interface a class has to provide in order to
# look the way we expect DCMs to look.
class DiscreteChoiceModel(object):
    """
    Abstract base class for discrete choice models.

    """
    __metaclass__ = abc.ABCMeta

    @staticmethod
    def _check_prob_choice_mode_compat(probability_mode, choice_mode):
        """
        Verify that the probability and choice modes can be used together.

        At present 'single_chooser' has to be paired with 'aggregate'
        and 'full_product' has to be paired with 'individual'.

        Raises
        ------
        ValueError
            If the two modes are one of the unsupported pairings.

        """
        if probability_mode == 'full_product':
            if choice_mode == 'aggregate':
                raise ValueError(
                    "'full_product' probability mode is not compatible with "
                    "'aggregate' choice mode")

        if probability_mode == 'single_chooser':
            if choice_mode == 'individual':
                raise ValueError(
                    "'single_chooser' probability mode is not compatible "
                    "with 'individual' choice mode")

    @staticmethod
    def _check_prob_mode_interaction_compat(
            probability_mode, interaction_predict_filters):
        """
        Verify that post-interaction filters are not combined with the
        'full_product' probability mode, which does not support them.

        Raises
        ------
        ValueError
            If filters are given and the mode is 'full_product'.

        """
        if interaction_predict_filters is None:
            return

        if probability_mode == 'full_product':
            raise ValueError(
                "interaction filters may not be used in "
                "'full_product' mode")

    @abc.abstractmethod
    def apply_fit_filters(self, choosers, alternatives):
        # shared implementation that subclasses may delegate to via super()
        fit_choosers = util.apply_filter_query(
            choosers, self.choosers_fit_filters)
        fit_alts = util.apply_filter_query(
            alternatives, self.alts_fit_filters)
        return fit_choosers, fit_alts

    @abc.abstractmethod
    def apply_predict_filters(self, choosers, alternatives):
        # shared implementation that subclasses may delegate to via super()
        predict_choosers = util.apply_filter_query(
            choosers, self.choosers_predict_filters)
        predict_alts = util.apply_filter_query(
            alternatives, self.alts_predict_filters)
        return predict_choosers, predict_alts

    @abc.abstractproperty
    def fitted(self):
        pass

    @abc.abstractmethod
    def probabilities(self):
        pass

    @abc.abstractmethod
    def summed_probabilities(self):
        pass

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    @abc.abstractmethod
    def choosers_columns_used(self):
        pass

    @abc.abstractmethod
    def alts_columns_used(self):
        pass

    @abc.abstractmethod
    def interaction_columns_used(self):
        pass

    @abc.abstractmethod
    def columns_used(self):
        pass
class MNLDiscreteChoiceModel(DiscreteChoiceModel):
    """
    A discrete choice model with the ability to store an estimated
    model and predict new data based on the model.
    Based on multinomial logit.

    Parameters
    ----------
    model_expression : str, iterable, or dict
        A patsy model expression. Should contain only a right-hand side.
    sample_size : int
        Number of choices to sample for estimating the model.
    probability_mode : str, optional
        Specify the method to use for calculating probabilities
        during prediction.
        Available string options are 'single_chooser' and 'full_product'.
        In "single chooser" mode one agent is chosen for calculating
        probabilities across all alternatives. In "full product" mode
        probabilities are calculated for every chooser across all
        alternatives. Currently "single chooser" mode must be used with
        a `choice_mode` of 'aggregate' and "full product" mode must be
        used with a `choice_mode` of 'individual'.
    choice_mode : str, optional
        Specify the method to use for making choices among alternatives.
        Available string options are 'individual' and 'aggregate'.
        In "individual" mode choices will be made separately for each
        chooser. In "aggregate" mode choices are made for all choosers at
        once. Aggregate mode implies that an alternative chosen by one
        agent is unavailable to other agents and that the same
        probabilities can be used for all choosers.
        Currently "individual" mode must be used with a `probability_mode`
        of 'full_product' and "aggregate" mode must be used with a
        `probability_mode` of 'single_chooser'.
    choosers_fit_filters : list of str, optional
        Filters applied to choosers table before fitting the model.
    choosers_predict_filters : list of str, optional
        Filters applied to the choosers table before calculating
        new data points.
    alts_fit_filters : list of str, optional
        Filters applied to the alternatives table before fitting the model.
    alts_predict_filters : list of str, optional
        Filters applied to the alternatives table before calculating
        new data points.
    interaction_predict_filters : list of str, optional
        Filters applied to the merged choosers/alternatives table
        before predicting agent choices.
    estimation_sample_size : int, optional
        Whether to sample choosers during estimation
        (needs to be applied after choosers_fit_filters).
    prediction_sample_size : int, optional
        Whether (and how much) to sample alternatives during prediction.
        Note that this can lead to multiple choosers picking the same
        alternative.
    choice_column : optional
        Name of the column in the `alternatives` table that choosers
        should choose. e.g. the 'building_id' column. If not provided
        the alternatives index is used.
    name : optional
        Optional descriptive name for this model that may be used
        in output.

    """
    def __init__(
            self, model_expression, sample_size,
            probability_mode='full_product', choice_mode='individual',
            choosers_fit_filters=None, choosers_predict_filters=None,
            alts_fit_filters=None, alts_predict_filters=None,
            interaction_predict_filters=None,
            estimation_sample_size=None,
            prediction_sample_size=None,
            choice_column=None, name=None):
        # reject unsupported mode/filter combinations before storing anything
        self._check_prob_choice_mode_compat(probability_mode, choice_mode)
        self._check_prob_mode_interaction_compat(
            probability_mode, interaction_predict_filters)

        self.model_expression = model_expression
        self.sample_size = sample_size
        self.probability_mode = probability_mode
        self.choice_mode = choice_mode
        self.choosers_fit_filters = choosers_fit_filters
        self.choosers_predict_filters = choosers_predict_filters
        self.alts_fit_filters = alts_fit_filters
        self.alts_predict_filters = alts_predict_filters
        self.interaction_predict_filters = interaction_predict_filters
        self.estimation_sample_size = estimation_sample_size
        self.prediction_sample_size = prediction_sample_size
        self.choice_column = choice_column
        self.name = name if name is not None else 'MNLDiscreteChoiceModel'
        # populated by predict(debug=True)
        self.sim_pdf = None

        # populated by fit() or from_yaml()
        self.log_likelihoods = None
        self.fit_parameters = None

    @classmethod
    def from_yaml(cls, yaml_str=None, str_or_buffer=None):
        """
        Create a DiscreteChoiceModel instance from a saved YAML configuration.
        Arguments are mutually exclusive.

        Parameters
        ----------
        yaml_str : str, optional
            A YAML string from which to load model.
        str_or_buffer : str or file like, optional
            File name or buffer from which to load YAML.

        Returns
        -------
        MNLDiscreteChoiceModel

        """
        cfg = yamlio.yaml_to_dict(yaml_str, str_or_buffer)

        model = cls(
            cfg['model_expression'],
            cfg['sample_size'],
            probability_mode=cfg.get('probability_mode', 'full_product'),
            choice_mode=cfg.get('choice_mode', 'individual'),
            choosers_fit_filters=cfg.get('choosers_fit_filters', None),
            choosers_predict_filters=cfg.get('choosers_predict_filters', None),
            alts_fit_filters=cfg.get('alts_fit_filters', None),
            alts_predict_filters=cfg.get('alts_predict_filters', None),
            interaction_predict_filters=cfg.get(
                'interaction_predict_filters', None),
            estimation_sample_size=cfg.get('estimation_sample_size', None),
            prediction_sample_size=cfg.get('prediction_sample_size', None),
            choice_column=cfg.get('choice_column', None),
            name=cfg.get('name', None)
        )

        # restore the results of a previous fit when the config has them
        if cfg.get('log_likelihoods', None):
            model.log_likelihoods = cfg['log_likelihoods']
        if cfg.get('fit_parameters', None):
            model.fit_parameters = pd.DataFrame(cfg['fit_parameters'])

        logger.debug('loaded LCM model {} from YAML'.format(model.name))
        return model

    @property
    def str_model_expression(self):
        """
        Model expression as a string suitable for use with patsy/statsmodels.

        """
        return util.str_model_expression(
            self.model_expression, add_constant=False)

    def apply_fit_filters(self, choosers, alternatives):
        """
        Filter `choosers` and `alternatives` for fitting.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing,
            e.g. buildings.

        Returns
        -------
        filtered_choosers, filtered_alts : pandas.DataFrame

        """
        return super(MNLDiscreteChoiceModel, self).apply_fit_filters(
            choosers, alternatives)

    def apply_predict_filters(self, choosers, alternatives):
        """
        Filter `choosers` and `alternatives` for prediction.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing,
            e.g. buildings.

        Returns
        -------
        filtered_choosers, filtered_alts : pandas.DataFrame

        """
        return super(MNLDiscreteChoiceModel, self).apply_predict_filters(
            choosers, alternatives)

    def fit(self, choosers, alternatives, current_choice):
        """
        Fit and save model parameters based on given data.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing,
            e.g. buildings.
        current_choice : pandas.Series or any
            A Series describing the `alternatives` currently chosen
            by the `choosers`. Should have an index matching `choosers`
            and values matching the index of `alternatives`.

            If a non-Series is given it should be a column in `choosers`.

        Returns
        -------
        log_likelihoods : dict
            Dict of log-likelihood values describing the quality of the
            model fit. Will have keys 'null', 'convergence', and 'ratio'.

        Raises
        ------
        ModelEvaluationError
            If the design matrix has a different number of rows than the
            merged interaction table.

        """
        logger.debug('start: fit LCM model {}'.format(self.name))

        if not isinstance(current_choice, pd.Series):
            current_choice = choosers[current_choice]

        choosers, alternatives = self.apply_fit_filters(choosers, alternatives)

        if self.estimation_sample_size:
            # sample after filtering so the sample is drawn from valid rows
            choosers = choosers.loc[np.random.choice(
                choosers.index,
                min(self.estimation_sample_size, len(choosers)),
                replace=False)]

        # keep only the choices of the choosers that survived filtering
        current_choice = current_choice.loc[choosers.index]

        _, merged, chosen = interaction.mnl_interaction_dataset(
            choosers, alternatives, self.sample_size, current_choice)
        model_design = dmatrix(
            self.str_model_expression, data=merged, return_type='dataframe')

        if len(merged) != model_design.values.shape[0]:
            raise ModelEvaluationError(
                'Estimated data does not have the same length as input.  '
                'This suggests there are null values in one or more of '
                'the input columns.')

        self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
            model_design.values, chosen, self.sample_size)
        self.fit_parameters.index = model_design.columns

        logger.debug('finish: fit LCM model {}'.format(self.name))
        return self.log_likelihoods

    @property
    def fitted(self):
        """
        True if model is ready for prediction.

        """
        return self.fit_parameters is not None

    def assert_fitted(self):
        """
        Raises `RuntimeError` if the model is not ready for prediction.

        """
        if not self.fitted:
            raise RuntimeError('Model has not been fit.')

    def report_fit(self):
        """
        Print a report of the fit results.

        """
        if not self.fitted:
            print('Model not yet fit.')
            return

        print('Null Log-liklihood: {0:.3f}'.format(
            self.log_likelihoods['null']))
        print('Log-liklihood at convergence: {0:.3f}'.format(
            self.log_likelihoods['convergence']))
        print('Log-liklihood Ratio: {0:.3f}\n'.format(
            self.log_likelihoods['ratio']))

        tbl = PrettyTable()

        tbl.add_column('Component', self.fit_parameters.index.values)
        for col in ('Coefficient', 'Std. Error', 'T-Score'):
            tbl.add_column(col, self.fit_parameters[col].values)

        tbl.align['Component'] = 'l'
        tbl.float_format = '.3'

        print(tbl)

    def probabilities(self, choosers, alternatives, filter_tables=True):
        """
        Returns the probabilities for a set of choosers to choose
        from among a set of alternatives.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing.
        filter_tables : bool, optional
            If True, filter `choosers` and `alternatives` with prediction
            filters before calculating probabilities.

        Returns
        -------
        probabilities : pandas.Series
            Probability of selection associated with each chooser
            and alternative. Index will be a MultiIndex with chooser IDs
            in the outer level ('chooser_id') and alternative IDs in the
            inner level ('alternative_id').

        Raises
        ------
        RuntimeError
            If the model has not been fit.
        ValueError
            If `probability_mode` is not a recognized option.
        ModelEvaluationError
            If the design matrix has a different number of rows than the
            merged interaction table.

        """
        logger.debug('start: calculate probabilities for LCM model {}'.format(
            self.name))
        self.assert_fitted()

        if filter_tables:
            choosers, alternatives = self.apply_predict_filters(
                choosers, alternatives)

        if self.prediction_sample_size is not None:
            sample_size = self.prediction_sample_size
        else:
            sample_size = len(alternatives)

        if self.probability_mode == 'single_chooser':
            # one representative chooser stands in for all of them
            _, merged, _ = interaction.mnl_interaction_dataset(
                choosers.head(1), alternatives, sample_size)
        elif self.probability_mode == 'full_product':
            _, merged, _ = interaction.mnl_interaction_dataset(
                choosers, alternatives, sample_size)
        else:
            raise ValueError(
                'Unrecognized probability_mode option: {}'.format(
                    self.probability_mode))

        merged = util.apply_filter_query(
            merged, self.interaction_predict_filters)
        model_design = dmatrix(
            self.str_model_expression, data=merged, return_type='dataframe')

        if len(merged) != model_design.values.shape[0]:
            raise ModelEvaluationError(
                'Simulated data does not have the same length as input.  '
                'This suggests there are null values in one or more of '
                'the input columns.')

        # get the order of the coefficients in the same order as the
        # columns in the design matrix
        coeffs = [self.fit_parameters['Coefficient'][x]
                  for x in model_design.columns]

        # probabilities are returned from mnl_simulate as a 2d array
        # with choosers along rows and alternatives along columns
        if self.probability_mode == 'single_chooser':
            # interaction filters may have dropped rows, so count what is left
            numalts = len(merged)
        else:
            numalts = sample_size

        probabilities = mnl.mnl_simulate(
            model_design.values,
            coeffs,
            numalts=numalts, returnprobs=True)

        # want to turn probabilities into a Series with a MultiIndex
        # of chooser IDs and alternative IDs.
        # indexing by chooser ID will get you the probabilities
        # across alternatives for that chooser
        mi = pd.MultiIndex.from_arrays(
            [merged['join_index'].values, merged.index.values],
            names=('chooser_id', 'alternative_id'))
        probabilities = pd.Series(probabilities.flatten(), index=mi)

        logger.debug('finish: calculate probabilities for LCM model {}'.format(
            self.name))
        return probabilities

    def summed_probabilities(self, choosers, alternatives):
        """
        Calculate total probability associated with each alternative.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing.

        Returns
        -------
        probs : pandas.Series
            Total probability associated with each alternative.

        Raises
        ------
        ValueError
            If `probability_mode` is not a recognized option.

        """
        def normalize(s):
            return s / s.sum()

        choosers, alternatives = self.apply_predict_filters(
            choosers, alternatives)
        probs = self.probabilities(choosers, alternatives, filter_tables=False)

        if self.probability_mode == 'single_chooser':
            # the single set of probabilities applies to every chooser
            return (
                normalize(probs) * len(choosers)
            ).reset_index(level=0, drop=True)
        elif self.probability_mode == 'full_product':
            # Normalize within each chooser, then sum per alternative.
            # ``transform`` keeps the original index; ``apply`` would
            # prepend the group key on newer pandas, which would make the
            # second groupby operate on the wrong level.
            per_chooser_total = probs.groupby(level=0).transform('sum')
            return (probs / per_chooser_total).groupby(level=1).sum()
        else:
            raise ValueError(
                'Unrecognized probability_mode option: {}'.format(
                    self.probability_mode))

    def predict(self, choosers, alternatives, debug=False):
        """
        Choose from among alternatives for a group of agents.

        Parameters
        ----------
        choosers : pandas.DataFrame
            Table describing the agents making choices, e.g. households.
        alternatives : pandas.DataFrame
            Table describing the things from which agents are choosing.
        debug : bool
            If debug is set to true, will set the variable "sim_pdf" on
            the object to store the probabilities for mapping of the
            outcome.

        Returns
        -------
        choices : pandas.Series
            Mapping of chooser ID to alternative ID. Some choosers
            will map to a nan value when there are not enough alternatives
            for all the choosers.

        Raises
        ------
        RuntimeError
            If the model has not been fit.
        ValueError
            If `choice_mode` is not a recognized option.

        """
        self.assert_fitted()
        logger.debug('start: predict LCM model {}'.format(self.name))

        choosers, alternatives = self.apply_predict_filters(
            choosers, alternatives)

        if len(choosers) == 0:
            return pd.Series()

        if len(alternatives) == 0:
            return pd.Series(index=choosers.index)

        probabilities = self.probabilities(
            choosers, alternatives, filter_tables=False)

        if debug:
            self.sim_pdf = probabilities

        if self.choice_mode == 'aggregate':
            choices = unit_choice(
                choosers.index.values,
                probabilities.index.get_level_values('alternative_id').values,
                probabilities.values)
        elif self.choice_mode == 'individual':
            def mkchoice(probs):
                # read the alternative IDs off the index instead of
                # resetting the group's index in place, so the object
                # handed over by groupby is never mutated
                alt_ids = probs.index.get_level_values(
                    'alternative_id').values
                return np.random.choice(
                    alt_ids, p=probs.values / probs.sum())
            choices = probabilities.groupby(level='chooser_id', sort=False)\
                .apply(mkchoice)
        else:
            raise ValueError(
                'Unrecognized choice_mode option: {}'.format(self.choice_mode))

        logger.debug('finish: predict LCM model {}'.format(self.name))
        return choices

    def to_dict(self):
        """
        Return a dict representation of an MNLDiscreteChoiceModel
        instance.

        """
        return {
            'model_type': 'discretechoice',
            'model_expression': self.model_expression,
            'sample_size': self.sample_size,
            'name': self.name,
            'probability_mode': self.probability_mode,
            'choice_mode': self.choice_mode,
            'choosers_fit_filters': self.choosers_fit_filters,
            'choosers_predict_filters': self.choosers_predict_filters,
            'alts_fit_filters': self.alts_fit_filters,
            'alts_predict_filters': self.alts_predict_filters,
            'interaction_predict_filters': self.interaction_predict_filters,
            'estimation_sample_size': self.estimation_sample_size,
            'prediction_sample_size': self.prediction_sample_size,
            'choice_column': self.choice_column,
            'fitted': self.fitted,
            'log_likelihoods': self.log_likelihoods,
            'fit_parameters': (yamlio.frame_to_yaml_safe(self.fit_parameters)
                               if self.fitted else None)
        }

    def to_yaml(self, str_or_buffer=None):
        """
        Save a model representation to YAML.

        Parameters
        ----------
        str_or_buffer : str or file like, optional
            By default a YAML string is returned. If a string is
            given here the YAML will be written to that file.
            If an object with a ``.write`` method is given the
            YAML will be written to that object.

        Returns
        -------
        j : str
            YAML as a string if `str_or_buffer` is not given.

        Raises
        ------
        TypeError
            If `probability_mode` or `choice_mode` is not a string.

        """
        logger.debug('serializing LCM model {} to YAML'.format(self.name))
        if (not isinstance(self.probability_mode, str) or
                not isinstance(self.choice_mode, str)):
            raise TypeError(
                'Cannot serialize model with non-string probability_mode '
                'or choice_mode attributes.')
        return yamlio.convert_to_yaml(self.to_dict(), str_or_buffer)

    def choosers_columns_used(self):
        """
        Columns from the choosers table that are used for filtering.

        """
        return list(tz.unique(tz.concatv(
            util.columns_in_filters(self.choosers_predict_filters),
            util.columns_in_filters(self.choosers_fit_filters))))

    def alts_columns_used(self):
        """
        Columns from the alternatives table that are used for filtering.

        """
        return list(tz.unique(tz.concatv(
            util.columns_in_filters(self.alts_predict_filters),
            util.columns_in_filters(self.alts_fit_filters))))

    def interaction_columns_used(self):
        """
        Columns from the interaction dataset used for filtering and in
        the model. These may come originally from either the choosers or
        alternatives tables.

        """
        return list(tz.unique(tz.concatv(
            util.columns_in_filters(self.interaction_predict_filters),
            util.columns_in_formula(self.model_expression))))

    def columns_used(self):
        """
        Columns from any table used in the model. May come from either
        the choosers or alternatives tables.

        """
        return list(tz.unique(tz.concatv(
            self.choosers_columns_used(),
            self.alts_columns_used(),
            self.interaction_columns_used())))

    @classmethod
    def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname,
                     outcfgname=None):
        """
        Load a model from a YAML config, fit it, and write the results back.

        Parameters
        ----------
        choosers : DataFrame
            A dataframe in which rows represent choosers.
        chosen_fname : string
            A string indicating the column in the choosers dataframe which
            gives which alternatives the choosers have chosen.
        alternatives : DataFrame
            A table of alternatives. It should include the choices
            from the choosers table as well as other alternatives from
            which to sample.  Values in choosers[chosen_fname] should index
            into the alternatives dataframe.
        cfgname : string
            The name of the yaml config file from which to read the discrete
            choice model.
        outcfgname : string, optional (default cfgname)
            The name of the output yaml config file where estimation results
            are written into.

        Returns
        -------
        lcm : MNLDiscreteChoiceModel which was used to fit

        """
        logger.debug('start: fit from configuration {}'.format(cfgname))
        lcm = cls.from_yaml(str_or_buffer=cfgname)
        lcm.fit(choosers, alternatives, choosers[chosen_fname])
        lcm.report_fit()
        # overwrite the input config unless a separate output was requested
        outcfgname = outcfgname or cfgname
        lcm.to_yaml(str_or_buffer=outcfgname)
        logger.debug('finish: fit into configuration {}'.format(outcfgname))
        return lcm

    @classmethod
    def predict_from_cfg(cls, choosers, alternatives, cfgname=None, cfg=None,
                         alternative_ratio=2.0, debug=False):
        """
        Simulate choices for the specified choosers

        Parameters
        ----------
        choosers : DataFrame
            A dataframe of agents doing the choosing.
        alternatives : DataFrame
            A dataframe of locations which the choosers are locating in and
            which have a supply.
        cfgname : string
            The name of the yaml config file from which to read the discrete
            choice model.
        cfg: string
            an ordered yaml string of the model discrete choice model
            configuration. Used to read config from memory in lieu of
            loading cfgname from disk.
        alternative_ratio : float, optional
            Above the ratio of alternatives to choosers (default of 2.0),
            the alternatives will be sampled to meet this ratio
            (for performance reasons).
        debug : boolean, optional (default False)
            Whether to generate debug information on the model.

        Returns
        -------
        choices : pandas.Series
            Mapping of chooser ID to alternative ID. Some choosers
            will map to a nan value when there are not enough alternatives
            for all the choosers.
        lcm : MNLDiscreteChoiceModel which was used to predict

        Raises
        ------
        ValueError
            If neither `cfgname` nor `cfg` is given.

        """
        logger.debug('start: predict from configuration {}'.format(cfgname))
        if cfgname:
            lcm = cls.from_yaml(str_or_buffer=cfgname)
        elif cfg:
            lcm = cls.from_yaml(yaml_str=cfg)
        else:
            msg = 'predict_from_cfg requires a configuration via the cfgname or cfg arguments'
            logger.error(msg)
            raise ValueError(msg)

        if len(alternatives) > len(choosers) * alternative_ratio:
            logger.info(
                ("Alternative ratio exceeded: %d alternatives "
                 "and only %d choosers") % (len(alternatives), len(choosers)))
            # thin out the alternatives to keep the interaction table small
            idxes = np.random.choice(
                alternatives.index,
                size=int(len(choosers) * alternative_ratio),
                replace=False)
            alternatives = alternatives.loc[idxes]
            logger.info(
                " after sampling %d alternatives are available\n" %
                len(alternatives))

        new_units = lcm.predict(choosers, alternatives, debug=debug)
        print("Assigned %d choosers to new units" % len(new_units.dropna()))
        logger.debug('finish: predict from configuration {}'.format(cfgname))
        return new_units, lcm
[docs]class MNLDiscreteChoiceModelGroup(DiscreteChoiceModel): """ Manages a group of discrete choice models that refer to different segments of choosers. Model names must match the segment names after doing a pandas groupby. Parameters ---------- segmentation_col : str Name of a column in the table of choosers. Will be used to perform a pandas groupby on the choosers table. remove_alts : bool, optional Specify how to handle alternatives between prediction for different models. If False, the alternatives table is not modified between predictions. If True, alternatives that have been chosen are removed from the alternatives table before doing another round of prediction. name : str, optional A name that may be used in places to identify this group. """ def __init__(self, segmentation_col, remove_alts=False, name=None): self.segmentation_col = segmentation_col self.remove_alts = remove_alts = name if name is not None else 'MNLDiscreteChoiceModelGroup' self.models = {}
[docs] def add_model(self, model): """ Add an MNLDiscreteChoiceModel instance. Parameters ---------- model : MNLDiscreteChoiceModel Should have a ``.name`` attribute matching one of the segments in the choosers table. """ logger.debug( 'adding model {} to LCM group {}'.format(, self.models[] = model
[docs] def add_model_from_params( self, name, model_expression, sample_size, probability_mode='full_product', choice_mode='individual', choosers_fit_filters=None, choosers_predict_filters=None, alts_fit_filters=None, alts_predict_filters=None, interaction_predict_filters=None, estimation_sample_size=None, prediction_sample_size=None, choice_column=None): """ Add a model by passing parameters through to MNLDiscreteChoiceModel. Parameters ---------- name Must match a segment in the choosers table. model_expression : str, iterable, or dict A patsy model expression. Should contain only a right-hand side. sample_size : int Number of choices to sample for estimating the model. probability_mode : str, optional Specify the method to use for calculating probabilities during prediction. Available string options are 'single_chooser' and 'full_product'. In "single chooser" mode one agent is chosen for calculating probabilities across all alternatives. In "full product" mode probabilities are calculated for every chooser across all alternatives. choice_mode : str or callable, optional Specify the method to use for making choices among alternatives. Available string options are 'individual' and 'aggregate'. In "individual" mode choices will be made separately for each chooser. In "aggregate" mode choices are made for all choosers at once. Aggregate mode implies that an alternative chosen by one agent is unavailable to other agents and that the same probabilities can be used for all choosers. choosers_fit_filters : list of str, optional Filters applied to choosers table before fitting the model. choosers_predict_filters : list of str, optional Filters applied to the choosers table before calculating new data points. alts_fit_filters : list of str, optional Filters applied to the alternatives table before fitting the model. alts_predict_filters : list of str, optional Filters applied to the alternatives table before calculating new data points. 
interaction_predict_filters : list of str, optional Filters applied to the merged choosers/alternatives table before predicting agent choices. estimation_sample_size : int, optional Whether to sample choosers during estimation (needs to be applied after choosers_fit_filters) prediction_sample_size : int, optional Whether (and how much) to sample alternatives during prediction. Note that this can lead to multiple choosers picking the same alternative. choice_column : optional Name of the column in the `alternatives` table that choosers should choose. e.g. the 'building_id' column. If not provided the alternatives index is used. """ logger.debug('adding model {} to LCM group {}'.format(name, self.models[name] = MNLDiscreteChoiceModel( model_expression, sample_size, probability_mode, choice_mode, choosers_fit_filters, choosers_predict_filters, alts_fit_filters, alts_predict_filters, interaction_predict_filters, estimation_sample_size, prediction_sample_size, choice_column, name)
def _iter_groups(self, data): """ Iterate over the groups in `data` after grouping by `segmentation_col`. Skips any groups for which there is no model stored. Yields tuples of (name, df) where name is the group key and df is the group DataFrame. Parameters ---------- data : pandas.DataFrame Must have a column with the same name as `segmentation_col`. """ groups = data.groupby(self.segmentation_col) for name, group in groups: if name not in self.models: continue logger.debug( 'returning group {} in LCM group {}'.format(name, yield name, group
[docs] def apply_fit_filters(self, choosers, alternatives): """ Filter `choosers` and `alternatives` for fitting. This is done by filtering each submodel and concatenating the results. Parameters ---------- choosers : pandas.DataFrame Table describing the agents making choices, e.g. households. alternatives : pandas.DataFrame Table describing the things from which agents are choosing, e.g. buildings. Returns ------- filtered_choosers, filtered_alts : pandas.DataFrame """ ch = [] alts = [] for name, df in self._iter_groups(choosers): filtered_choosers, filtered_alts = \ self.models[name].apply_fit_filters(df, alternatives) ch.append(filtered_choosers) alts.append(filtered_alts) return pd.concat(ch), pd.concat(alts)
[docs] def apply_predict_filters(self, choosers, alternatives): """ Filter `choosers` and `alternatives` for prediction. This is done by filtering each submodel and concatenating the results. Parameters ---------- choosers : pandas.DataFrame Table describing the agents making choices, e.g. households. alternatives : pandas.DataFrame Table describing the things from which agents are choosing, e.g. buildings. Returns ------- filtered_choosers, filtered_alts : pandas.DataFrame """ ch = [] alts = [] for name, df in self._iter_groups(choosers): filtered_choosers, filtered_alts = \ self.models[name].apply_predict_filters(df, alternatives) ch.append(filtered_choosers) alts.append(filtered_alts) filtered_choosers = pd.concat(ch) filtered_alts = pd.concat(alts) return filtered_choosers, filtered_alts.drop_duplicates()
[docs] def fit(self, choosers, alternatives, current_choice): """ Fit and save models based on given data after segmenting the `choosers` table. Parameters ---------- choosers : pandas.DataFrame Table describing the agents making choices, e.g. households. Must have a column with the same name as the .segmentation_col attribute. alternatives : pandas.DataFrame Table describing the things from which agents are choosing, e.g. buildings. current_choice Name of column in `choosers` that indicates which alternative they have currently chosen. Returns ------- log_likelihoods : dict of dict Keys will be model names and values will be dictionaries of log-liklihood values as returned by """ with log_start_finish( 'fit models in LCM group {}'.format(, logger): return { name: self.models[name].fit(df, alternatives, current_choice) for name, df in self._iter_groups(choosers)}
@property
def fitted(self):
    """
    True only when the group holds at least one model and every model
    reports itself as fitted; an empty group is never "fitted".

    """
    if not self.models:
        return False
    return all(model.fitted for model in self.models.values())
def probabilities(self, choosers, alternatives):
    """
    Returns alternative probabilities for each chooser segment as
    a dictionary keyed by segment name.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.

    Returns
    -------
    probabilities : dict of pandas.Series
        One entry per segment yielded by ``self._iter_groups``, holding
        the result of that segment's submodel ``probabilities`` call.

    """
    logger.debug(
        'start: calculate probabilities in LCM group {}'.format(
            self.name))

    probs = {
        name: self.models[name].probabilities(df, alternatives)
        for name, df in self._iter_groups(choosers)}

    logger.debug(
        'finish: calculate probabilities in LCM group {}'.format(
            self.name))
    return probs
def summed_probabilities(self, choosers, alternatives):
    """
    Returns the sum of probabilities for alternatives across all
    chooser segments.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.

    Returns
    -------
    probs : pandas.Series
        Summed probabilities from each segment added together.
        An empty float Series when either input table is empty.

    """
    if len(alternatives) == 0 or len(choosers) == 0:
        # Explicit dtype: a bare ``pd.Series()`` changes its default
        # dtype between pandas versions (and warns on some of them).
        return pd.Series(dtype='float64')

    logger.debug(
        'start: calculate summed probabilities in LCM group {}'.format(
            self.name))

    probs = [
        self.models[name].summed_probabilities(df, alternatives)
        for name, df in self._iter_groups(choosers)]

    # Segments may cover different alternatives; treat a missing
    # alternative as contributing zero rather than producing NaN.
    add = tz.curry(pd.Series.add, fill_value=0)
    probs = tz.reduce(add, probs)

    logger.debug(
        'finish: calculate summed probabilities in LCM group {}'.format(
            self.name))
    return probs
def predict(self, choosers, alternatives, debug=False):
    """
    Choose from among alternatives for a group of agents after
    segmenting the `choosers` table.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.
    debug : bool
        Passed through to each submodel's ``predict``. If debug is set
        to true, will set the variable "sim_pdf" on the object to store
        the probabilities for mapping of the outcome.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Some choosers
        will map to a nan value when there are not enough alternatives
        for all the choosers. An empty Series when there are no
        segments to predict for.

    """
    logger.debug('start: predict models in LCM group {}'.format(
        self.name))
    results = []

    for name, df in self._iter_groups(choosers):
        choices = self.models[name].predict(df, alternatives, debug=debug)
        if self.remove_alts and len(alternatives) > 0:
            # Alternatives already picked by this segment are withheld
            # from the segments that follow.
            alternatives = alternatives.loc[
                ~alternatives.index.isin(choices)]
        results.append(choices)

    logger.debug(
        'finish: predict models in LCM group {}'.format(self.name))
    return pd.concat(results) if results else pd.Series()
def choosers_columns_used(self):
    """
    Unique chooser-table columns reported by the submodels'
    ``choosers_columns_used`` methods (columns used for filtering).

    """
    per_model = (
        model.choosers_columns_used() for model in self.models.values())
    return list(tz.unique(tz.concat(per_model)))
def alts_columns_used(self):
    """
    Unique alternatives-table columns reported by the submodels'
    ``alts_columns_used`` methods (columns used for filtering).

    """
    per_model = (
        model.alts_columns_used() for model in self.models.values())
    return list(tz.unique(tz.concat(per_model)))
def interaction_columns_used(self):
    """
    Unique interaction-dataset columns reported by the submodels'
    ``interaction_columns_used`` methods. These are used for filtering
    and in the model, and may come originally from either the choosers
    or alternatives tables.

    """
    per_model = (
        model.interaction_columns_used()
        for model in self.models.values())
    return list(tz.unique(tz.concat(per_model)))
def columns_used(self):
    """
    Unique columns from any table reported by the submodels'
    ``columns_used`` methods. May come from either the choosers or
    alternatives tables.

    """
    per_model = (model.columns_used() for model in self.models.values())
    return list(tz.unique(tz.concat(per_model)))
class SegmentedMNLDiscreteChoiceModel(DiscreteChoiceModel):
    """
    An MNL LCM group that allows segments to have different model
    expressions but otherwise share configurations.

    Parameters
    ----------
    segmentation_col
        Name of column in the choosers table that will be used for
        groupby.
    sample_size : int
        Number of choices to sample for estimating the model.
    probability_mode : str, optional
        Specify the method to use for calculating probabilities
        during prediction.
        Available string options are 'single_chooser' and 'full_product'.
        In "single chooser" mode one agent is chosen for calculating
        probabilities across all alternatives. In "full product" mode
        probabilities are calculated for every chooser across all
        alternatives.
        Currently "single chooser" mode must be used with a `choice_mode`
        of 'aggregate' and "full product" mode must be used with a
        `choice_mode` of 'individual'.
    choice_mode : str, optional
        Specify the method to use for making choices among alternatives.
        Available string options are 'individual' and 'aggregate'.
        In "individual" mode choices will be made separately for each
        chooser. In "aggregate" mode choices are made for all choosers at
        once. Aggregate mode implies that an alternative chosen by one
        agent is unavailable to other agents and that the same
        probabilities can be used for all choosers.
        Currently "individual" mode must be used with a `probability_mode`
        of 'full_product' and "aggregate" mode must be used with a
        `probability_mode` of 'single_chooser'.
    choosers_fit_filters : list of str, optional
        Filters applied to choosers table before fitting the model.
    choosers_predict_filters : list of str, optional
        Filters applied to the choosers table before calculating
        new data points.
    alts_fit_filters : list of str, optional
        Filters applied to the alternatives table before fitting the
        model.
    alts_predict_filters : list of str, optional
        Filters applied to the alternatives table before calculating
        new data points.
    interaction_predict_filters : list of str, optional
        Filters applied to the merged choosers/alternatives table
        before predicting agent choices.
    estimation_sample_size : int, optional
        Whether to sample choosers during estimation
        (needs to be applied after choosers_fit_filters)
    prediction_sample_size : int, optional
        Whether (and how much) to sample alternatives during prediction.
        Note that this can lead to multiple choosers picking the same
        alternative.
    choice_column : optional
        Name of the column in the `alternatives` table that choosers
        should choose. e.g. the 'building_id' column. If not provided
        the alternatives index is used.
    default_model_expr : str, iterable, or dict, optional
        A patsy model expression. Should contain only a right-hand side.
    remove_alts : bool, optional
        Specify how to handle alternatives between prediction for
        different models. If False, the alternatives table is not
        modified between predictions. If True, alternatives that have
        been chosen are removed from the alternatives table before
        doing another round of prediction.
    name : str, optional
        An optional string used to identify the model in places.
        Defaults to 'SegmentedMNLDiscreteChoiceModel'.

    """
    def __init__(
            self, segmentation_col, sample_size,
            probability_mode='full_product', choice_mode='individual',
            choosers_fit_filters=None, choosers_predict_filters=None,
            alts_fit_filters=None, alts_predict_filters=None,
            interaction_predict_filters=None,
            estimation_sample_size=None, prediction_sample_size=None,
            choice_column=None, default_model_expr=None, remove_alts=False,
            name=None):
        # Validate the mode combination before storing any state.
        self._check_prob_choice_mode_compat(probability_mode, choice_mode)
        self._check_prob_mode_interaction_compat(
            probability_mode, interaction_predict_filters)

        self.segmentation_col = segmentation_col
        self.sample_size = sample_size
        self.probability_mode = probability_mode
        self.choice_mode = choice_mode
        self.choosers_fit_filters = choosers_fit_filters
        self.choosers_predict_filters = choosers_predict_filters
        self.alts_fit_filters = alts_fit_filters
        self.alts_predict_filters = alts_predict_filters
        self.interaction_predict_filters = interaction_predict_filters
        self.estimation_sample_size = estimation_sample_size
        self.prediction_sample_size = prediction_sample_size
        self.choice_column = choice_column
        self.default_model_expr = default_model_expr
        self.remove_alts = remove_alts
        # Per-segment submodels live in this group.
        self._group = MNLDiscreteChoiceModelGroup(
            segmentation_col, remove_alts=remove_alts)
        # The assignment target was lost in the extracted source.
        self.name = (name if name is not None else
                     'SegmentedMNLDiscreteChoiceModel')
@classmethod
def from_yaml(cls, yaml_str=None, str_or_buffer=None):
    """
    Create a SegmentedMNLDiscreteChoiceModel instance from a saved YAML
    configuration. Arguments are mutually exclusive.

    Parameters
    ----------
    yaml_str : str, optional
        A YAML string from which to load model.
    str_or_buffer : str or file like, optional
        File name or buffer from which to load YAML.

    Returns
    -------
    SegmentedMNLDiscreteChoiceModel

    """
    cfg = yamlio.yaml_to_dict(yaml_str, str_or_buffer)

    default_model_expr = cfg['default_config']['model_expression']

    seg = cls(
        cfg['segmentation_col'],
        cfg['sample_size'],
        cfg['probability_mode'],
        cfg['choice_mode'],
        cfg['choosers_fit_filters'],
        cfg['choosers_predict_filters'],
        cfg['alts_fit_filters'],
        cfg['alts_predict_filters'],
        cfg['interaction_predict_filters'],
        cfg['estimation_sample_size'],
        cfg['prediction_sample_size'],
        cfg['choice_column'],
        default_model_expr,
        cfg['remove_alts'],
        cfg['name'])

    # A configuration without a "models" section simply has no segments.
    for name, m in cfg.get('models', {}).items():
        # Per-segment entries only store what differs from the shared
        # configuration; fill the shared settings back in before handing
        # the dict to the single-model loader.
        m['model_expression'] = m.get(
            'model_expression', default_model_expr)
        m['sample_size'] = cfg['sample_size']
        m['probability_mode'] = cfg['probability_mode']
        m['choice_mode'] = cfg['choice_mode']
        # Chooser/alternative filters are left unset on the submodels.
        m['choosers_fit_filters'] = None
        m['choosers_predict_filters'] = None
        m['alts_fit_filters'] = None
        m['alts_predict_filters'] = None
        m['interaction_predict_filters'] = \
            cfg['interaction_predict_filters']
        m['estimation_sample_size'] = cfg['estimation_sample_size']
        m['prediction_sample_size'] = cfg['prediction_sample_size']
        m['choice_column'] = cfg['choice_column']

        model = MNLDiscreteChoiceModel.from_yaml(
            yamlio.convert_to_yaml(m, None))
        seg._group.add_model(model)

    logger.debug(
        'loaded segmented LCM model {} from YAML'.format(seg.name))
    return seg
def add_segment(self, name, model_expression=None):
    """
    Add a new segment with its own model expression.

    Parameters
    ----------
    name
        Segment name. Must match a segment in the groupby of the data.
    model_expression : str or dict, optional
        A patsy model expression containing only a right-hand side
        (the same form as `default_model_expr`).
        If not given the default model will be used, which must not be
        None.

    Raises
    ------
    ValueError
        If no expression is given and there is no default expression.

    """
    logger.debug('adding LCM model {} to segmented model {}'.format(
        name, self.name))
    if not model_expression:
        if not self.default_model_expr:
            raise ValueError(
                'No default model available, '
                'you must supply a model expression.')
        model_expression = self.default_model_expr

    # we'll take care of some of the filtering this side before
    # segmentation
    # NOTE(review): prediction_sample_size is not forwarded here although
    # from_yaml does set it on loaded submodels -- confirm whether
    # add_model_from_params accepts it and whether this is intentional.
    self._group.add_model_from_params(
        name=name,
        model_expression=model_expression,
        sample_size=self.sample_size,
        probability_mode=self.probability_mode,
        choice_mode=self.choice_mode,
        choosers_fit_filters=None,
        choosers_predict_filters=None,
        alts_fit_filters=None,
        alts_predict_filters=None,
        interaction_predict_filters=self.interaction_predict_filters,
        estimation_sample_size=self.estimation_sample_size,
        choice_column=self.choice_column)
def apply_fit_filters(self, choosers, alternatives):
    """
    Filter `choosers` and `alternatives` for fitting by delegating to
    the parent class implementation.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing,
        e.g. buildings.

    Returns
    -------
    filtered_choosers, filtered_alts : pandas.DataFrame

    """
    parent = super(SegmentedMNLDiscreteChoiceModel, self)
    return parent.apply_fit_filters(choosers, alternatives)
def apply_predict_filters(self, choosers, alternatives):
    """
    Filter `choosers` and `alternatives` for prediction by delegating
    to the parent class implementation.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing,
        e.g. buildings.

    Returns
    -------
    filtered_choosers, filtered_alts : pandas.DataFrame

    """
    parent = super(SegmentedMNLDiscreteChoiceModel, self)
    return parent.apply_predict_filters(choosers, alternatives)
def fit(self, choosers, alternatives, current_choice):
    """
    Fit and save models based on given data after segmenting
    the `choosers` table. Segments that have not already been explicitly
    added will be automatically added with default model.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column with the same name as the .segmentation_col
        attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing,
        e.g. buildings.
    current_choice
        Name of column in `choosers` that indicates which alternative
        they have currently chosen.

    Returns
    -------
    log_likelihoods : dict of dict
        Keys will be model names and values will be whatever the
        underlying group's ``fit`` returns for each segment
        (dictionaries of log-likelihood values).

    """
    logger.debug('start: fit models in segmented LCM {}'.format(
        self.name))

    choosers, alternatives = self.apply_fit_filters(choosers, alternatives)
    unique = choosers[self.segmentation_col].unique()

    # Remove any existing segments that may no longer have counterparts
    # in the data. This can happen when loading a saved model and then
    # calling this method with data that no longer has segments that
    # were there the last time this was called.
    gone = set(self._group.models) - set(unique)
    for g in gone:
        del self._group.models[g]

    # Segments seen in the data but never configured get the default
    # model expression.
    for x in unique:
        if x not in self._group.models:
            self.add_segment(x)

    results = self._group.fit(choosers, alternatives, current_choice)
    logger.debug(
        'finish: fit models in segmented LCM {}'.format(self.name))
    return results
@property
def fitted(self):
    """
    Fitted state as reported by the underlying model group.

    """
    group = self._group
    return group.fitted


def _filter_choosers_alts(self, choosers, alternatives):
    """
    Run the predict filters over both tables.

    Returns a ``(choosers, alternatives)`` tuple where each table has
    been passed through ``util.apply_filter_query`` with
    ``choosers_predict_filters`` and ``alts_predict_filters``
    respectively.

    """
    kept_choosers = util.apply_filter_query(
        choosers, self.choosers_predict_filters)
    kept_alts = util.apply_filter_query(
        alternatives, self.alts_predict_filters)
    return (kept_choosers, kept_alts)
def probabilities(self, choosers, alternatives):
    """
    Returns alternative probabilities for each chooser segment as
    a dictionary keyed by segment name.

    The predict filters are applied to both tables before the
    underlying group computes the probabilities.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.

    Returns
    -------
    probabilities : dict of pandas.Series

    """
    logger.debug(
        'start: calculate probabilities in segmented LCM {}'.format(
            self.name))
    choosers, alternatives = self.apply_predict_filters(
        choosers, alternatives)
    result = self._group.probabilities(choosers, alternatives)
    logger.debug(
        'finish: calculate probabilities in segmented LCM {}'.format(
            self.name))
    return result
def summed_probabilities(self, choosers, alternatives):
    """
    Returns the sum of probabilities for alternatives across all
    chooser segments.

    The predict filters are applied to both tables before the
    underlying group computes the sums.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.

    Returns
    -------
    probs : pandas.Series
        Summed probabilities from each segment added together.

    """
    logger.debug(
        'start: calculate summed probabilities in segmented LCM {}'.format(
            self.name))
    choosers, alternatives = self.apply_predict_filters(
        choosers, alternatives)
    result = self._group.summed_probabilities(choosers, alternatives)
    logger.debug(
        ('finish: calculate summed probabilities in segmented LCM {}'
         ).format(self.name))
    return result
def predict(self, choosers, alternatives, debug=False):
    """
    Choose from among alternatives for a group of agents after
    segmenting the `choosers` table.

    Parameters
    ----------
    choosers : pandas.DataFrame
        Table describing the agents making choices, e.g. households.
        Must have a column matching the .segmentation_col attribute.
    alternatives : pandas.DataFrame
        Table describing the things from which agents are choosing.
    debug : bool
        Passed through to the underlying group. If debug is set to
        true, will set the variable "sim_pdf" on the object to store
        the probabilities for mapping of the outcome.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Some choosers
        will map to a nan value when there are not enough alternatives
        for all the choosers.

    """
    logger.debug(
        'start: predict models in segmented LCM {}'.format(self.name))
    # Only the shared chooser/alternative predict filters are applied
    # here, via _filter_choosers_alts.
    choosers, alternatives = self._filter_choosers_alts(
        choosers, alternatives)

    results = self._group.predict(choosers, alternatives, debug=debug)
    logger.debug(
        'finish: predict models in segmented LCM {}'.format(self.name))
    return results
def _process_model_dict(self, d):
    """
    Strip from a submodel's configuration dict the entries that are
    stored on the segmented model instead.

    Parameters
    ----------
    d : dict
        Modified in place.

    Returns
    -------
    dict
        The same `d`, with the shared keys deleted, the model
        expression dropped when it equals the default expression, and
        the name converted with ``yamlio.to_scalar_safe``.

    """
    shared_keys = (
        'model_type',
        'sample_size',
        'probability_mode',
        'choice_mode',
        'choosers_fit_filters',
        'choosers_predict_filters',
        'alts_fit_filters',
        'alts_predict_filters',
        'interaction_predict_filters',
        'estimation_sample_size',
        'prediction_sample_size',
        'choice_column',
    )
    for key in shared_keys:
        del d[key]

    uses_default = d['model_expression'] == self.default_model_expr
    if uses_default:
        del d['model_expression']

    d["name"] = yamlio.to_scalar_safe(d["name"])

    return d
def to_dict(self):
    """
    Returns a dict representation of this instance suitable for
    conversion to YAML.

    Shared settings are stored once at the top level; each entry under
    'models' is the submodel's own dict after ``_process_model_dict``
    has removed the settings duplicated here.

    """
    return {
        'model_type': 'segmented_discretechoice',
        'name': self.name,
        'segmentation_col': self.segmentation_col,
        'sample_size': self.sample_size,
        'probability_mode': self.probability_mode,
        'choice_mode': self.choice_mode,
        'choosers_fit_filters': self.choosers_fit_filters,
        'choosers_predict_filters': self.choosers_predict_filters,
        'alts_fit_filters': self.alts_fit_filters,
        'alts_predict_filters': self.alts_predict_filters,
        'interaction_predict_filters': self.interaction_predict_filters,
        'estimation_sample_size': self.estimation_sample_size,
        'prediction_sample_size': self.prediction_sample_size,
        'choice_column': self.choice_column,
        'default_config': {
            'model_expression': self.default_model_expr,
        },
        'remove_alts': self.remove_alts,
        'fitted': self.fitted,
        'models': {
            yamlio.to_scalar_safe(name):
                self._process_model_dict(m.to_dict())
            for name, m in self._group.models.items()
        }
    }
def to_yaml(self, str_or_buffer=None):
    """
    Save a model representation to YAML.

    Parameters
    ----------
    str_or_buffer : str or file like, optional
        By default a YAML string is returned. If a string is given here
        the YAML will be written to that file. If an object with
        a ``.write`` method is given the YAML will be written to that
        object.

    Returns
    -------
    j : str
        YAML string if `str_or_buffer` is not given.

    """
    logger.debug('serializing segmented LCM {} to YAML'.format(
        self.name))
    return yamlio.convert_to_yaml(self.to_dict(), str_or_buffer)
def choosers_columns_used(self):
    """
    Unique choosers-table columns referenced by the chooser predict
    and fit filters.

    """
    predict_cols = util.columns_in_filters(self.choosers_predict_filters)
    fit_cols = util.columns_in_filters(self.choosers_fit_filters)
    return list(tz.unique(tz.concatv(predict_cols, fit_cols)))
def alts_columns_used(self):
    """
    Unique alternatives-table columns referenced by the alternatives
    predict and fit filters.

    """
    predict_cols = util.columns_in_filters(self.alts_predict_filters)
    fit_cols = util.columns_in_filters(self.alts_fit_filters)
    return list(tz.unique(tz.concatv(predict_cols, fit_cols)))
def interaction_columns_used(self):
    """
    Interaction-dataset columns used for filtering and in the model,
    as reported by the underlying model group. These may come
    originally from either the choosers or alternatives tables.

    """
    group = self._group
    return group.interaction_columns_used()
def columns_used(self):
    """
    Unique columns from any table used in the model: the chooser and
    alternative filter columns, the interaction columns, the columns in
    the default model expression, and the segmentation column.

    """
    chooser_cols = self.choosers_columns_used()
    alt_cols = self.alts_columns_used()
    interaction_cols = self.interaction_columns_used()
    default_expr_cols = util.columns_in_formula(self.default_model_expr)
    segmentation = [self.segmentation_col]

    combined = tz.concatv(
        chooser_cols, alt_cols, interaction_cols,
        default_expr_cols, segmentation)
    return list(tz.unique(combined))
@classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname,
                 outcfgname=None):
    """
    Load a model from a YAML configuration, fit it, print a fit report
    for every segment and write the fitted configuration back to YAML.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of rows of agents that have made choices.
    chosen_fname : string
        A string indicating the column in the choosers dataframe which
        gives which alternative the choosers have chosen.
    alternatives : DataFrame
        A dataframe of alternatives. It should include the current
        choices from the choosers dataframe as well as some other
        alternatives from which to sample. Values in
        choosers[chosen_fname] should index into the alternatives
        dataframe.
    cfgname : string
        The name of the yaml config file from which to read the
        discrete choice model.
    outcfgname : string, optional (default cfgname)
        The name of the output yaml config file where estimation
        results are written into.

    Returns
    -------
    lcm : SegmentedMNLDiscreteChoiceModel which was used to fit

    """
    logger.debug('start: fit from configuration {}'.format(cfgname))
    lcm = cls.from_yaml(str_or_buffer=cfgname)
    # The chosen-alternative column itself is passed as current_choice.
    lcm.fit(choosers, alternatives, choosers[chosen_fname])
    for k, v in lcm._group.models.items():
        print("LCM RESULTS FOR SEGMENT %s\n" % str(k))
        v.report_fit()
    # Overwrite the input configuration unless told otherwise.
    outcfgname = outcfgname or cfgname
    lcm.to_yaml(str_or_buffer=outcfgname)
    logger.debug('finish: fit into configuration {}'.format(outcfgname))
    return lcm
@classmethod
def predict_from_cfg(cls, choosers, alternatives, cfgname=None, cfg=None,
                     alternative_ratio=2.0, debug=False):
    """
    Simulate the discrete choices for the specified choosers.

    Parameters
    ----------
    choosers : DataFrame
        A dataframe of agents doing the choosing.
    alternatives : DataFrame
        A dataframe of alternatives which the choosers are locating in
        and which have a supply.
    cfgname : string
        The name of the yaml config file from which to read the
        discrete choice model. Takes precedence over `cfg`.
    cfg : string
        An ordered yaml string of the discrete choice model
        configuration. Used to read config from memory in lieu of
        loading cfgname from disk.
    alternative_ratio : float
        Above the ratio of alternatives to choosers (default of 2.0),
        the alternatives will be sampled to meet this ratio
        (for performance reasons).
    debug : bool
        Passed through to the model's ``predict`` method.

    Returns
    -------
    choices : pandas.Series
        Mapping of chooser ID to alternative ID. Some choosers
        will map to a nan value when there are not enough alternatives
        for all the choosers.
    lcm : SegmentedMNLDiscreteChoiceModel which was used to predict

    Raises
    ------
    ValueError
        If neither `cfgname` nor `cfg` is provided.

    """
    logger.debug('start: predict from configuration {}'.format(cfgname))
    if cfgname:
        lcm = cls.from_yaml(str_or_buffer=cfgname)
    elif cfg:
        lcm = cls.from_yaml(yaml_str=cfg)
    else:
        msg = ('predict_from_cfg requires a configuration via the '
               'cfgname or cfg arguments')
        logger.error(msg)
        raise ValueError(msg)

    if len(alternatives) > len(choosers) * alternative_ratio:
        logger.info(
            ("Alternative ratio exceeded: %d alternatives "
             "and only %d choosers") %
            (len(alternatives), len(choosers)))
        # Randomly keep just enough alternatives to meet the ratio.
        idxes = np.random.choice(
            alternatives.index,
            size=int(np.floor(len(choosers) * alternative_ratio)),
            replace=False)
        alternatives = alternatives.loc[idxes]
        logger.info(
            " after sampling %d alternatives are available\n" %
            len(alternatives))

    new_units = lcm.predict(choosers, alternatives, debug=debug)
    print("Assigned %d choosers to new units" % len(new_units.dropna()))
    logger.debug('finish: predict from configuration {}'.format(cfgname))
    return new_units, lcm