"""
Use the ``MNLDiscreteChoiceModel`` class to train a choice module using
multinomial logit and make subsequent choice predictions.
"""
from __future__ import print_function, division
import abc
import logging
import numpy as np
import pandas as pd
from patsy import dmatrix
from prettytable import PrettyTable
import toolz as tz
from . import util
from ..exceptions import ModelEvaluationError
from ..urbanchoice import interaction, mnl
from ..utils import yamlio
from ..utils.logutil import log_start_finish
logger = logging.getLogger(__name__)
[docs]def unit_choice(chooser_ids, alternative_ids, probabilities):
"""
Have a set of choosers choose from among alternatives according
to a probability distribution. Choice is binary: each
alternative can only be chosen once.
Parameters
----------
chooser_ids : 1d array_like
Array of IDs of the agents that are making choices.
alternative_ids : 1d array_like
Array of IDs of alternatives among which agents are making choices.
probabilities : 1d array_like
The probability that an agent will choose an alternative.
Must be the same shape as `alternative_ids`. Unavailable
alternatives should have a probability of 0.
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
"""
chooser_ids = np.asanyarray(chooser_ids)
alternative_ids = np.asanyarray(alternative_ids)
probabilities = np.asanyarray(probabilities)
logger.debug(
'start: unit choice with {} choosers and {} alternatives'.format(
len(chooser_ids), len(alternative_ids)))
choices = pd.Series(index=chooser_ids)
if probabilities.sum() == 0:
# return all nan if there are no available units
return choices
# probabilities need to sum to 1 for np.random.choice
probabilities = probabilities / probabilities.sum()
# need to see if there are as many available alternatives as choosers
n_available = np.count_nonzero(probabilities)
n_choosers = len(chooser_ids)
n_to_choose = n_choosers if n_choosers < n_available else n_available
chosen = np.random.choice(
alternative_ids, size=n_to_choose, replace=False, p=probabilities)
# if there are fewer available units than choosers we need to pick
# which choosers get a unit
if n_to_choose == n_available:
chooser_ids = np.random.choice(
chooser_ids, size=n_to_choose, replace=False)
choices[chooser_ids] = chosen
logger.debug('finish: unit choice')
return choices
# define the minimum interface a class must have in order to
# look like we expect DCMs to look
[docs]class DiscreteChoiceModel(object):
"""
Abstract base class for discrete choice models.
"""
__metaclass__ = abc.ABCMeta
@staticmethod
def _check_prob_choice_mode_compat(probability_mode, choice_mode):
"""
Check that the probability and choice modes are compatibly with
each other. Currently 'single_chooser' must be paired with
'aggregate' and 'full_product' must be paired with 'individual'.
"""
if (probability_mode == 'full_product' and
choice_mode == 'aggregate'):
raise ValueError(
"'full_product' probability mode is not compatible with "
"'aggregate' choice mode")
if (probability_mode == 'single_chooser' and
choice_mode == 'individual'):
raise ValueError(
"'single_chooser' probability mode is not compatible with "
"'individual' choice mode")
@staticmethod
def _check_prob_mode_interaction_compat(
probability_mode, interaction_predict_filters):
"""
The 'full_product' probability mode is currently incompatible with
post-interaction prediction filters, so make sure we don't have
both of those.
"""
if (interaction_predict_filters is not None and
probability_mode == 'full_product'):
raise ValueError(
"interaction filters may not be used in "
"'full_product' mode")
@abc.abstractmethod
def apply_fit_filters(self, choosers, alternatives):
choosers = util.apply_filter_query(choosers, self.choosers_fit_filters)
alternatives = util.apply_filter_query(
alternatives, self.alts_fit_filters)
return choosers, alternatives
@abc.abstractmethod
def apply_predict_filters(self, choosers, alternatives):
choosers = util.apply_filter_query(
choosers, self.choosers_predict_filters)
alternatives = util.apply_filter_query(
alternatives, self.alts_predict_filters)
return choosers, alternatives
@abc.abstractproperty
def fitted(self):
pass
@abc.abstractmethod
def probabilities(self):
pass
@abc.abstractmethod
def summed_probabilities(self):
pass
@abc.abstractmethod
def fit(self):
pass
@abc.abstractmethod
def predict(self):
pass
@abc.abstractmethod
def choosers_columns_used(self):
pass
@abc.abstractmethod
def alts_columns_used(self):
pass
@abc.abstractmethod
def interaction_columns_used(self):
pass
@abc.abstractmethod
def columns_used(self):
pass
[docs]class MNLDiscreteChoiceModel(DiscreteChoiceModel):
"""
A discrete choice model with the ability to store an estimated
model and predict new data based on the model.
Based on multinomial logit.
Parameters
----------
model_expression : str, iterable, or dict
A patsy model expression. Should contain only a right-hand side.
sample_size : int
Number of choices to sample for estimating the model.
probability_mode : str, optional
Specify the method to use for calculating probabilities
during prediction.
Available string options are 'single_chooser' and 'full_product'.
In "single chooser" mode one agent is chosen for calculating
probabilities across all alternatives. In "full product" mode
probabilities are calculated for every chooser across all alternatives.
Currently "single chooser" mode must be used with a `choice_mode`
of 'aggregate' and "full product" mode must be used with a
`choice_mode` of 'individual'.
choice_mode : str, optional
Specify the method to use for making choices among alternatives.
Available string options are 'individual' and 'aggregate'.
In "individual" mode choices will be made separately for each chooser.
In "aggregate" mode choices are made for all choosers at once.
Aggregate mode implies that an alternative chosen by one agent
is unavailable to other agents and that the same probabilities
can be used for all choosers.
Currently "individual" mode must be used with a `probability_mode`
of 'full_product' and "aggregate" mode must be used with a
`probability_mode` of 'single_chooser'.
choosers_fit_filters : list of str, optional
Filters applied to choosers table before fitting the model.
choosers_predict_filters : list of str, optional
Filters applied to the choosers table before calculating
new data points.
alts_fit_filters : list of str, optional
Filters applied to the alternatives table before fitting the model.
alts_predict_filters : list of str, optional
Filters applied to the alternatives table before calculating
new data points.
interaction_predict_filters : list of str, optional
Filters applied to the merged choosers/alternatives table
before predicting agent choices.
estimation_sample_size : int, optional
Whether to sample choosers during estimation
(needs to be applied after choosers_fit_filters).
prediction_sample_size : int, optional
Whether (and how much) to sample alternatives during prediction.
Note that this can lead to multiple choosers picking the same
alternative.
choice_column : optional
Name of the column in the `alternatives` table that choosers
should choose. e.g. the 'building_id' column. If not provided
the alternatives index is used.
name : optional
Optional descriptive name for this model that may be used
in output.
"""
def __init__(
self, model_expression, sample_size,
probability_mode='full_product', choice_mode='individual',
choosers_fit_filters=None, choosers_predict_filters=None,
alts_fit_filters=None, alts_predict_filters=None,
interaction_predict_filters=None,
estimation_sample_size=None,
prediction_sample_size=None,
choice_column=None, name=None):
self._check_prob_choice_mode_compat(probability_mode, choice_mode)
self._check_prob_mode_interaction_compat(
probability_mode, interaction_predict_filters)
self.model_expression = model_expression
self.sample_size = sample_size
self.probability_mode = probability_mode
self.choice_mode = choice_mode
self.choosers_fit_filters = choosers_fit_filters
self.choosers_predict_filters = choosers_predict_filters
self.alts_fit_filters = alts_fit_filters
self.alts_predict_filters = alts_predict_filters
self.interaction_predict_filters = interaction_predict_filters
self.estimation_sample_size = estimation_sample_size
self.prediction_sample_size = prediction_sample_size
self.choice_column = choice_column
self.name = name if name is not None else 'MNLDiscreteChoiceModel'
self.sim_pdf = None
self.log_likelihoods = None
self.fit_parameters = None
[docs] @classmethod
def from_yaml(cls, yaml_str=None, str_or_buffer=None):
"""
Create a DiscreteChoiceModel instance from a saved YAML configuration.
Arguments are mutally exclusive.
Parameters
----------
yaml_str : str, optional
A YAML string from which to load model.
str_or_buffer : str or file like, optional
File name or buffer from which to load YAML.
Returns
-------
MNLDiscreteChoiceModel
"""
cfg = yamlio.yaml_to_dict(yaml_str, str_or_buffer)
model = cls(
cfg['model_expression'],
cfg['sample_size'],
probability_mode=cfg.get('probability_mode', 'full_product'),
choice_mode=cfg.get('choice_mode', 'individual'),
choosers_fit_filters=cfg.get('choosers_fit_filters', None),
choosers_predict_filters=cfg.get('choosers_predict_filters', None),
alts_fit_filters=cfg.get('alts_fit_filters', None),
alts_predict_filters=cfg.get('alts_predict_filters', None),
interaction_predict_filters=cfg.get(
'interaction_predict_filters', None),
estimation_sample_size=cfg.get('estimation_sample_size', None),
prediction_sample_size=cfg.get('prediction_sample_size', None),
choice_column=cfg.get('choice_column', None),
name=cfg.get('name', None)
)
if cfg.get('log_likelihoods', None):
model.log_likelihoods = cfg['log_likelihoods']
if cfg.get('fit_parameters', None):
model.fit_parameters = pd.DataFrame(cfg['fit_parameters'])
logger.debug('loaded LCM model {} from YAML'.format(model.name))
return model
@property
def str_model_expression(self):
"""
Model expression as a string suitable for use with patsy/statsmodels.
"""
return util.str_model_expression(
self.model_expression, add_constant=False)
[docs] def apply_fit_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for fitting.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
return super(MNLDiscreteChoiceModel, self).apply_fit_filters(
choosers, alternatives)
[docs] def apply_predict_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for prediction.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
return super(MNLDiscreteChoiceModel, self).apply_predict_filters(
choosers, alternatives)
[docs] def fit(self, choosers, alternatives, current_choice):
"""
Fit and save model parameters based on given data.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
current_choice : pandas.Series or any
A Series describing the `alternatives` currently chosen
by the `choosers`. Should have an index matching `choosers`
and values matching the index of `alternatives`.
If a non-Series is given it should be a column in `choosers`.
Returns
-------
log_likelihoods : dict
Dict of log-liklihood values describing the quality of the
model fit. Will have keys 'null', 'convergence', and 'ratio'.
"""
logger.debug('start: fit LCM model {}'.format(self.name))
if not isinstance(current_choice, pd.Series):
current_choice = choosers[current_choice]
choosers, alternatives = self.apply_fit_filters(choosers, alternatives)
if self.estimation_sample_size:
choosers = choosers.loc[np.random.choice(
choosers.index,
min(self.estimation_sample_size, len(choosers)),
replace=False)]
current_choice = current_choice.loc[choosers.index]
_, merged, chosen = interaction.mnl_interaction_dataset(
choosers, alternatives, self.sample_size, current_choice)
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')
if len(merged) != model_design.values.shape[0]:
raise ModelEvaluationError(
'Estimated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')
self.log_likelihoods, self.fit_parameters = mnl.mnl_estimate(
model_design.values, chosen, self.sample_size)
self.fit_parameters.index = model_design.columns
logger.debug('finish: fit LCM model {}'.format(self.name))
return self.log_likelihoods
@property
def fitted(self):
"""
True if model is ready for prediction.
"""
return self.fit_parameters is not None
[docs] def assert_fitted(self):
"""
Raises `RuntimeError` if the model is not ready for prediction.
"""
if not self.fitted:
raise RuntimeError('Model has not been fit.')
[docs] def report_fit(self):
"""
Print a report of the fit results.
"""
if not self.fitted:
print('Model not yet fit.')
return
print('Null Log-liklihood: {0:.3f}'.format(
self.log_likelihoods['null']))
print('Log-liklihood at convergence: {0:.3f}'.format(
self.log_likelihoods['convergence']))
print('Log-liklihood Ratio: {0:.3f}\n'.format(
self.log_likelihoods['ratio']))
tbl = PrettyTable(
['Component', ])
tbl = PrettyTable()
tbl.add_column('Component', self.fit_parameters.index.values)
for col in ('Coefficient', 'Std. Error', 'T-Score'):
tbl.add_column(col, self.fit_parameters[col].values)
tbl.align['Component'] = 'l'
tbl.float_format = '.3'
print(tbl)
[docs] def probabilities(self, choosers, alternatives, filter_tables=True):
"""
Returns the probabilities for a set of choosers to choose
from among a set of alternatives.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
filter_tables : bool, optional
If True, filter `choosers` and `alternatives` with prediction
filters before calculating probabilities.
Returns
-------
probabilities : pandas.Series
Probability of selection associated with each chooser
and alternative. Index will be a MultiIndex with alternative
IDs in the inner index and chooser IDs in the out index.
"""
logger.debug('start: calculate probabilities for LCM model {}'.format(
self.name))
self.assert_fitted()
if filter_tables:
choosers, alternatives = self.apply_predict_filters(
choosers, alternatives)
if self.prediction_sample_size is not None:
sample_size = self.prediction_sample_size
else:
sample_size = len(alternatives)
if self.probability_mode == 'single_chooser':
_, merged, _ = interaction.mnl_interaction_dataset(
choosers.head(1), alternatives, sample_size)
elif self.probability_mode == 'full_product':
_, merged, _ = interaction.mnl_interaction_dataset(
choosers, alternatives, sample_size)
else:
raise ValueError(
'Unrecognized probability_mode option: {}'.format(
self.probability_mode))
merged = util.apply_filter_query(
merged, self.interaction_predict_filters)
model_design = dmatrix(
self.str_model_expression, data=merged, return_type='dataframe')
if len(merged) != model_design.values.shape[0]:
raise ModelEvaluationError(
'Simulated data does not have the same length as input. '
'This suggests there are null values in one or more of '
'the input columns.')
# get the order of the coefficients in the same order as the
# columns in the design matrix
coeffs = [self.fit_parameters['Coefficient'][x]
for x in model_design.columns]
# probabilities are returned from mnl_simulate as a 2d array
# with choosers along rows and alternatives along columns
if self.probability_mode == 'single_chooser':
numalts = len(merged)
else:
numalts = sample_size
probabilities = mnl.mnl_simulate(
model_design.values,
coeffs,
numalts=numalts, returnprobs=True)
# want to turn probabilities into a Series with a MultiIndex
# of chooser IDs and alternative IDs.
# indexing by chooser ID will get you the probabilities
# across alternatives for that chooser
mi = pd.MultiIndex.from_arrays(
[merged['join_index'].values, merged.index.values],
names=('chooser_id', 'alternative_id'))
probabilities = pd.Series(probabilities.flatten(), index=mi)
logger.debug('finish: calculate probabilities for LCM model {}'.format(
self.name))
return probabilities
[docs] def summed_probabilities(self, choosers, alternatives):
"""
Calculate total probability associated with each alternative.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
Returns
-------
probs : pandas.Series
Total probability associated with each alternative.
"""
def normalize(s):
return s / s.sum()
choosers, alternatives = self.apply_predict_filters(
choosers, alternatives)
probs = self.probabilities(choosers, alternatives, filter_tables=False)
# groupby the the alternatives ID and sum
if self.probability_mode == 'single_chooser':
return (
normalize(probs) * len(choosers)
).reset_index(level=0, drop=True)
elif self.probability_mode == 'full_product':
return probs.groupby(level=0).apply(normalize)\
.groupby(level=1).sum()
else:
raise ValueError(
'Unrecognized probability_mode option: {}'.format(
self.probability_mode))
[docs] def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, will set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
"""
self.assert_fitted()
logger.debug('start: predict LCM model {}'.format(self.name))
choosers, alternatives = self.apply_predict_filters(
choosers, alternatives)
if len(choosers) == 0:
return pd.Series()
if len(alternatives) == 0:
return pd.Series(index=choosers.index)
probabilities = self.probabilities(
choosers, alternatives, filter_tables=False)
if debug:
self.sim_pdf = probabilities
if self.choice_mode == 'aggregate':
choices = unit_choice(
choosers.index.values,
probabilities.index.get_level_values('alternative_id').values,
probabilities.values)
elif self.choice_mode == 'individual':
def mkchoice(probs):
probs.reset_index(0, drop=True, inplace=True)
return np.random.choice(
probs.index.values, p=probs.values / probs.sum())
choices = probabilities.groupby(level='chooser_id', sort=False)\
.apply(mkchoice)
else:
raise ValueError(
'Unrecognized choice_mode option: {}'.format(self.choice_mode))
logger.debug('finish: predict LCM model {}'.format(self.name))
return choices
[docs] def to_dict(self):
"""
Return a dict respresentation of an MNLDiscreteChoiceModel
instance.
"""
return {
'model_type': 'discretechoice',
'model_expression': self.model_expression,
'sample_size': self.sample_size,
'name': self.name,
'probability_mode': self.probability_mode,
'choice_mode': self.choice_mode,
'choosers_fit_filters': self.choosers_fit_filters,
'choosers_predict_filters': self.choosers_predict_filters,
'alts_fit_filters': self.alts_fit_filters,
'alts_predict_filters': self.alts_predict_filters,
'interaction_predict_filters': self.interaction_predict_filters,
'estimation_sample_size': self.estimation_sample_size,
'prediction_sample_size': self.prediction_sample_size,
'choice_column': self.choice_column,
'fitted': self.fitted,
'log_likelihoods': self.log_likelihoods,
'fit_parameters': (yamlio.frame_to_yaml_safe(self.fit_parameters)
if self.fitted else None)
}
[docs] def to_yaml(self, str_or_buffer=None):
"""
Save a model respresentation to YAML.
Parameters
----------
str_or_buffer : str or file like, optional
By default a YAML string is returned. If a string is
given here the YAML will be written to that file.
If an object with a ``.write`` method is given the
YAML will be written to that object.
Returns
-------
j : str
YAML is string if `str_or_buffer` is not given.
"""
logger.debug('serializing LCM model {} to YAML'.format(self.name))
if (not isinstance(self.probability_mode, str) or
not isinstance(self.choice_mode, str)):
raise TypeError(
'Cannot serialize model with non-string probability_mode '
'or choice_mode attributes.')
return yamlio.convert_to_yaml(self.to_dict(), str_or_buffer)
[docs] def choosers_columns_used(self):
"""
Columns from the choosers table that are used for filtering.
"""
return list(tz.unique(tz.concatv(
util.columns_in_filters(self.choosers_predict_filters),
util.columns_in_filters(self.choosers_fit_filters))))
[docs] def alts_columns_used(self):
"""
Columns from the alternatives table that are used for filtering.
"""
return list(tz.unique(tz.concatv(
util.columns_in_filters(self.alts_predict_filters),
util.columns_in_filters(self.alts_fit_filters))))
[docs] def interaction_columns_used(self):
"""
Columns from the interaction dataset used for filtering and in
the model. These may come originally from either the choosers or
alternatives tables.
"""
return list(tz.unique(tz.concatv(
util.columns_in_filters(self.interaction_predict_filters),
util.columns_in_formula(self.model_expression))))
[docs] def columns_used(self):
"""
Columns from any table used in the model. May come from either
the choosers or alternatives tables.
"""
return list(tz.unique(tz.concatv(
self.choosers_columns_used(),
self.alts_columns_used(),
self.interaction_columns_used())))
[docs] @classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname, outcfgname=None):
"""
Parameters
----------
choosers : DataFrame
A dataframe in which rows represent choosers.
chosen_fname : string
A string indicating the column in the choosers dataframe which
gives which alternatives the choosers have chosen.
alternatives : DataFrame
A table of alternatives. It should include the choices
from the choosers table as well as other alternatives from
which to sample. Values in choosers[chosen_fname] should index
into the alternatives dataframe.
cfgname : string
The name of the yaml config file from which to read the discrete
choice model.
outcfgname : string, optional (default cfgname)
The name of the output yaml config file where estimation results are written into.
Returns
-------
lcm : MNLDiscreteChoiceModel which was used to fit
"""
logger.debug('start: fit from configuration {}'.format(cfgname))
lcm = cls.from_yaml(str_or_buffer=cfgname)
lcm.fit(choosers, alternatives, choosers[chosen_fname])
lcm.report_fit()
outcfgname = outcfgname or cfgname
lcm.to_yaml(str_or_buffer=outcfgname)
logger.debug('finish: fit into configuration {}'.format(outcfgname))
return lcm
[docs] @classmethod
def predict_from_cfg(cls, choosers, alternatives, cfgname=None, cfg=None,
alternative_ratio=2.0, debug=False):
"""
Simulate choices for the specified choosers
Parameters
----------
choosers : DataFrame
A dataframe of agents doing the choosing.
alternatives : DataFrame
A dataframe of locations which the choosers are locating in and
which have a supply.
cfgname : string
The name of the yaml config file from which to read the discrete
choice model.
cfg: string
an ordered yaml string of the model discrete choice model configuration.
Used to read config from memory in lieu of loading cfgname from disk.
alternative_ratio : float, optional
Above the ratio of alternatives to choosers (default of 2.0),
the alternatives will be sampled to meet this ratio
(for performance reasons).
debug : boolean, optional (default False)
Whether to generate debug information on the model.
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
lcm : MNLDiscreteChoiceModel which was used to predict
"""
logger.debug('start: predict from configuration {}'.format(cfgname))
if cfgname:
lcm = cls.from_yaml(str_or_buffer=cfgname)
elif cfg:
lcm = cls.from_yaml(yaml_str=cfg)
else:
msg = 'predict_from_cfg requires a configuration via the cfgname or cfg arguments'
logger.error(msg)
raise ValueError(msg)
if len(alternatives) > len(choosers) * alternative_ratio:
logger.info(
("Alternative ratio exceeded: %d alternatives "
"and only %d choosers") %
(len(alternatives), len(choosers)))
idxes = np.random.choice(
alternatives.index, size=int(len(choosers) *
alternative_ratio),
replace=False)
alternatives = alternatives.loc[idxes]
logger.info(
" after sampling %d alternatives are available\n" %
len(alternatives))
new_units = lcm.predict(choosers, alternatives, debug=debug)
print("Assigned %d choosers to new units" % len(new_units.dropna()))
logger.debug('finish: predict from configuration {}'.format(cfgname))
return new_units, lcm
[docs]class MNLDiscreteChoiceModelGroup(DiscreteChoiceModel):
"""
Manages a group of discrete choice models that refer to different
segments of choosers.
Model names must match the segment names after doing a pandas groupby.
Parameters
----------
segmentation_col : str
Name of a column in the table of choosers. Will be used to perform
a pandas groupby on the choosers table.
remove_alts : bool, optional
Specify how to handle alternatives between prediction for different
models. If False, the alternatives table is not modified between
predictions. If True, alternatives that have been chosen
are removed from the alternatives table before doing another
round of prediction.
name : str, optional
A name that may be used in places to identify this group.
"""
def __init__(self, segmentation_col, remove_alts=False, name=None):
self.segmentation_col = segmentation_col
self.remove_alts = remove_alts
self.name = name if name is not None else 'MNLDiscreteChoiceModelGroup'
self.models = {}
[docs] def add_model(self, model):
"""
Add an MNLDiscreteChoiceModel instance.
Parameters
----------
model : MNLDiscreteChoiceModel
Should have a ``.name`` attribute matching one of the segments
in the choosers table.
"""
logger.debug(
'adding model {} to LCM group {}'.format(model.name, self.name))
self.models[model.name] = model
[docs] def add_model_from_params(
self, name, model_expression, sample_size,
probability_mode='full_product', choice_mode='individual',
choosers_fit_filters=None, choosers_predict_filters=None,
alts_fit_filters=None, alts_predict_filters=None,
interaction_predict_filters=None, estimation_sample_size=None,
prediction_sample_size=None, choice_column=None):
"""
Add a model by passing parameters through to MNLDiscreteChoiceModel.
Parameters
----------
name
Must match a segment in the choosers table.
model_expression : str, iterable, or dict
A patsy model expression. Should contain only a right-hand side.
sample_size : int
Number of choices to sample for estimating the model.
probability_mode : str, optional
Specify the method to use for calculating probabilities
during prediction.
Available string options are 'single_chooser' and 'full_product'.
In "single chooser" mode one agent is chosen for calculating
probabilities across all alternatives. In "full product" mode
probabilities are calculated for every chooser across all
alternatives.
choice_mode : str or callable, optional
Specify the method to use for making choices among alternatives.
Available string options are 'individual' and 'aggregate'.
In "individual" mode choices will be made separately for each
chooser. In "aggregate" mode choices are made for all choosers at
once. Aggregate mode implies that an alternative chosen by one
agent is unavailable to other agents and that the same
probabilities can be used for all choosers.
choosers_fit_filters : list of str, optional
Filters applied to choosers table before fitting the model.
choosers_predict_filters : list of str, optional
Filters applied to the choosers table before calculating
new data points.
alts_fit_filters : list of str, optional
Filters applied to the alternatives table before fitting the model.
alts_predict_filters : list of str, optional
Filters applied to the alternatives table before calculating
new data points.
interaction_predict_filters : list of str, optional
Filters applied to the merged choosers/alternatives table
before predicting agent choices.
estimation_sample_size : int, optional
Whether to sample choosers during estimation
(needs to be applied after choosers_fit_filters)
prediction_sample_size : int, optional
Whether (and how much) to sample alternatives during prediction.
Note that this can lead to multiple choosers picking the same
alternative.
choice_column : optional
Name of the column in the `alternatives` table that choosers
should choose. e.g. the 'building_id' column. If not provided
the alternatives index is used.
"""
logger.debug('adding model {} to LCM group {}'.format(name, self.name))
self.models[name] = MNLDiscreteChoiceModel(
model_expression, sample_size,
probability_mode, choice_mode,
choosers_fit_filters, choosers_predict_filters,
alts_fit_filters, alts_predict_filters,
interaction_predict_filters, estimation_sample_size,
prediction_sample_size, choice_column, name)
def _iter_groups(self, data):
"""
Iterate over the groups in `data` after grouping by
`segmentation_col`. Skips any groups for which there
is no model stored.
Yields tuples of (name, df) where name is the group key
and df is the group DataFrame.
Parameters
----------
data : pandas.DataFrame
Must have a column with the same name as `segmentation_col`.
"""
groups = data.groupby(self.segmentation_col)
for name, group in groups:
if name not in self.models:
continue
logger.debug(
'returning group {} in LCM group {}'.format(name, self.name))
yield name, group
[docs] def apply_fit_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for fitting.
This is done by filtering each submodel and concatenating
the results.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
ch = []
alts = []
for name, df in self._iter_groups(choosers):
filtered_choosers, filtered_alts = \
self.models[name].apply_fit_filters(df, alternatives)
ch.append(filtered_choosers)
alts.append(filtered_alts)
return pd.concat(ch), pd.concat(alts)
[docs] def apply_predict_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for prediction.
This is done by filtering each submodel and concatenating
the results.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
ch = []
alts = []
for name, df in self._iter_groups(choosers):
filtered_choosers, filtered_alts = \
self.models[name].apply_predict_filters(df, alternatives)
ch.append(filtered_choosers)
alts.append(filtered_alts)
filtered_choosers = pd.concat(ch)
filtered_alts = pd.concat(alts)
return filtered_choosers, filtered_alts.drop_duplicates()
[docs] def fit(self, choosers, alternatives, current_choice):
"""
Fit and save models based on given data after segmenting
the `choosers` table.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column with the same name as the .segmentation_col
attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
current_choice
Name of column in `choosers` that indicates which alternative
they have currently chosen.
Returns
-------
log_likelihoods : dict of dict
Keys will be model names and values will be dictionaries of
log-liklihood values as returned by MNLDiscreteChoiceModel.fit.
"""
with log_start_finish(
'fit models in LCM group {}'.format(self.name), logger):
return {
name: self.models[name].fit(df, alternatives, current_choice)
for name, df in self._iter_groups(choosers)}
@property
def fitted(self):
"""
Whether all models in the group have been fitted.
"""
return (all(m.fitted for m in self.models.values())
if self.models else False)
[docs] def probabilities(self, choosers, alternatives):
"""
Returns alternative probabilties for each chooser segment as
a dictionary keyed by segment name.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
Returns
-------
probabilties : dict of pandas.Series
"""
logger.debug(
'start: calculate probabilities in LCM group {}'.format(self.name))
probs = {}
for name, df in self._iter_groups(choosers):
probs[name] = self.models[name].probabilities(df, alternatives)
logger.debug(
'finish: calculate probabilities in LCM group {}'.format(
self.name))
return probs
[docs] def summed_probabilities(self, choosers, alternatives):
"""
Returns the sum of probabilities for alternatives across all
chooser segments.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
Returns
-------
probs : pandas.Series
Summed probabilities from each segment added together.
"""
if len(alternatives) == 0 or len(choosers) == 0:
return pd.Series()
logger.debug(
'start: calculate summed probabilities in LCM group {}'.format(
self.name))
probs = []
for name, df in self._iter_groups(choosers):
probs.append(
self.models[name].summed_probabilities(df, alternatives))
add = tz.curry(pd.Series.add, fill_value=0)
probs = tz.reduce(add, probs)
logger.debug(
'finish: calculate summed probabilities in LCM group {}'.format(
self.name))
return probs
[docs] def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents after
segmenting the `choosers` table.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, will set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
"""
logger.debug('start: predict models in LCM group {}'.format(self.name))
results = []
for name, df in self._iter_groups(choosers):
choices = self.models[name].predict(df, alternatives, debug=debug)
if self.remove_alts and len(alternatives) > 0:
alternatives = alternatives.loc[
~alternatives.index.isin(choices)]
results.append(choices)
logger.debug(
'finish: predict models in LCM group {}'.format(self.name))
return pd.concat(results) if results else pd.Series()
[docs] def choosers_columns_used(self):
"""
Columns from the choosers table that are used for filtering.
"""
return list(tz.unique(tz.concat(
m.choosers_columns_used() for m in self.models.values())))
[docs] def alts_columns_used(self):
"""
Columns from the alternatives table that are used for filtering.
"""
return list(tz.unique(tz.concat(
m.alts_columns_used() for m in self.models.values())))
[docs] def interaction_columns_used(self):
"""
Columns from the interaction dataset used for filtering and in
the model. These may come originally from either the choosers or
alternatives tables.
"""
return list(tz.unique(tz.concat(
m.interaction_columns_used() for m in self.models.values())))
[docs] def columns_used(self):
"""
Columns from any table used in the model. May come from either
the choosers or alternatives tables.
"""
return list(tz.unique(tz.concat(
m.columns_used() for m in self.models.values())))
[docs]class SegmentedMNLDiscreteChoiceModel(DiscreteChoiceModel):
"""
An MNL LCM group that allows segments to have different model expressions
but otherwise share configurations.
Parameters
----------
segmentation_col
Name of column in the choosers table that will be used for groupby.
sample_size : int
Number of choices to sample for estimating the model.
probability_mode : str, optional
Specify the method to use for calculating probabilities
during prediction.
Available string options are 'single_chooser' and 'full_product'.
In "single chooser" mode one agent is chosen for calculating
probabilities across all alternatives. In "full product" mode
probabilities are calculated for every chooser across all alternatives.
Currently "single chooser" mode must be used with a `choice_mode`
of 'aggregate' and "full product" mode must be used with a
`choice_mode` of 'individual'.
choice_mode : str, optional
Specify the method to use for making choices among alternatives.
Available string options are 'individual' and 'aggregate'.
In "individual" mode choices will be made separately for each chooser.
In "aggregate" mode choices are made for all choosers at once.
Aggregate mode implies that an alternative chosen by one agent
is unavailable to other agents and that the same probabilities
can be used for all choosers.
Currently "individual" mode must be used with a `probability_mode`
of 'full_product' and "aggregate" mode must be used with a
`probability_mode` of 'single_chooser'.
choosers_fit_filters : list of str, optional
Filters applied to choosers table before fitting the model.
choosers_predict_filters : list of str, optional
Filters applied to the choosers table before calculating
new data points.
alts_fit_filters : list of str, optional
Filters applied to the alternatives table before fitting the model.
alts_predict_filters : list of str, optional
Filters applied to the alternatives table before calculating
new data points.
interaction_predict_filters : list of str, optional
Filters applied to the merged choosers/alternatives table
before predicting agent choices.
estimation_sample_size : int, optional
Whether to sample choosers during estimation
(needs to be applied after choosers_fit_filters)
prediction_sample_size : int, optional
Whether (and how much) to sample alternatives during prediction.
Note that this can lead to multiple choosers picking the same
alternative.
choice_column : optional
Name of the column in the `alternatives` table that choosers
should choose. e.g. the 'building_id' column. If not provided
the alternatives index is used.
default_model_expr : str, iterable, or dict, optional
A patsy model expression. Should contain only a right-hand side.
remove_alts : bool, optional
Specify how to handle alternatives between prediction for different
models. If False, the alternatives table is not modified between
predictions. If True, alternatives that have been chosen
are removed from the alternatives table before doing another
round of prediction.
name : str, optional
An optional string used to identify the model in places.
"""
def __init__(
self, segmentation_col, sample_size,
probability_mode='full_product', choice_mode='individual',
choosers_fit_filters=None, choosers_predict_filters=None,
alts_fit_filters=None, alts_predict_filters=None,
interaction_predict_filters=None,
estimation_sample_size=None, prediction_sample_size=None,
choice_column=None, default_model_expr=None, remove_alts=False,
name=None):
self._check_prob_choice_mode_compat(probability_mode, choice_mode)
self._check_prob_mode_interaction_compat(
probability_mode, interaction_predict_filters)
self.segmentation_col = segmentation_col
self.sample_size = sample_size
self.probability_mode = probability_mode
self.choice_mode = choice_mode
self.choosers_fit_filters = choosers_fit_filters
self.choosers_predict_filters = choosers_predict_filters
self.alts_fit_filters = alts_fit_filters
self.alts_predict_filters = alts_predict_filters
self.interaction_predict_filters = interaction_predict_filters
self.estimation_sample_size = estimation_sample_size
self.prediction_sample_size = prediction_sample_size
self.choice_column = choice_column
self.default_model_expr = default_model_expr
self.remove_alts = remove_alts
self._group = MNLDiscreteChoiceModelGroup(
segmentation_col, remove_alts=remove_alts)
self.name = (name if name is not None else
'SegmentedMNLDiscreteChoiceModel')
[docs] @classmethod
def from_yaml(cls, yaml_str=None, str_or_buffer=None):
"""
Create a SegmentedMNLDiscreteChoiceModel instance from a saved YAML
configuration. Arguments are mutally exclusive.
Parameters
----------
yaml_str : str, optional
A YAML string from which to load model.
str_or_buffer : str or file like, optional
File name or buffer from which to load YAML.
Returns
-------
SegmentedMNLDiscreteChoiceModel
"""
cfg = yamlio.yaml_to_dict(yaml_str, str_or_buffer)
default_model_expr = cfg['default_config']['model_expression']
seg = cls(
cfg['segmentation_col'],
cfg['sample_size'],
cfg['probability_mode'],
cfg['choice_mode'],
cfg['choosers_fit_filters'],
cfg['choosers_predict_filters'],
cfg['alts_fit_filters'],
cfg['alts_predict_filters'],
cfg['interaction_predict_filters'],
cfg['estimation_sample_size'],
cfg['prediction_sample_size'],
cfg['choice_column'],
default_model_expr,
cfg['remove_alts'],
cfg['name'])
if "models" not in cfg:
cfg["models"] = {}
for name, m in cfg['models'].items():
m['model_expression'] = m.get(
'model_expression', default_model_expr)
m['sample_size'] = cfg['sample_size']
m['probability_mode'] = cfg['probability_mode']
m['choice_mode'] = cfg['choice_mode']
m['choosers_fit_filters'] = None
m['choosers_predict_filters'] = None
m['alts_fit_filters'] = None
m['alts_predict_filters'] = None
m['interaction_predict_filters'] = \
cfg['interaction_predict_filters']
m['estimation_sample_size'] = cfg['estimation_sample_size']
m['prediction_sample_size'] = cfg['prediction_sample_size']
m['choice_column'] = cfg['choice_column']
model = MNLDiscreteChoiceModel.from_yaml(
yamlio.convert_to_yaml(m, None))
seg._group.add_model(model)
logger.debug(
'loaded segmented LCM model {} from YAML'.format(seg.name))
return seg
[docs] def add_segment(self, name, model_expression=None):
"""
Add a new segment with its own model expression.
Parameters
----------
name
Segment name. Must match a segment in the groupby of the data.
model_expression : str or dict, optional
A patsy model expression that can be used with statsmodels.
Should contain both the left- and right-hand sides.
If not given the default model will be used, which must not be
None.
"""
logger.debug('adding LCM model {} to segmented model {}'.format(
name, self.name))
if not model_expression:
if not self.default_model_expr:
raise ValueError(
'No default model available, '
'you must supply a model expression.')
model_expression = self.default_model_expr
# we'll take care of some of the filtering this side before
# segmentation
self._group.add_model_from_params(
name=name,
model_expression=model_expression,
sample_size=self.sample_size,
probability_mode=self.probability_mode,
choice_mode=self.choice_mode,
choosers_fit_filters=None,
choosers_predict_filters=None,
alts_fit_filters=None,
alts_predict_filters=None,
interaction_predict_filters=self.interaction_predict_filters,
estimation_sample_size=self.estimation_sample_size,
choice_column=self.choice_column)
[docs] def apply_fit_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for fitting.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
return super(SegmentedMNLDiscreteChoiceModel, self).apply_fit_filters(
choosers, alternatives)
[docs] def apply_predict_filters(self, choosers, alternatives):
"""
Filter `choosers` and `alternatives` for prediction.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
Returns
-------
filtered_choosers, filtered_alts : pandas.DataFrame
"""
return super(
SegmentedMNLDiscreteChoiceModel, self
).apply_predict_filters(choosers, alternatives)
[docs] def fit(self, choosers, alternatives, current_choice):
"""
Fit and save models based on given data after segmenting
the `choosers` table. Segments that have not already been explicitly
added will be automatically added with default model.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column with the same name as the .segmentation_col
attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing,
e.g. buildings.
current_choice
Name of column in `choosers` that indicates which alternative
they have currently chosen.
Returns
-------
log_likelihoods : dict of dict
Keys will be model names and values will be dictionaries of
log-liklihood values as returned by MNLDiscreteChoiceModel.fit.
"""
logger.debug('start: fit models in segmented LCM {}'.format(self.name))
choosers, alternatives = self.apply_fit_filters(choosers, alternatives)
unique = choosers[self.segmentation_col].unique()
# Remove any existing segments that may no longer have counterparts
# in the data. This can happen when loading a saved model and then
# calling this method with data that no longer has segments that
# were there the last time this was called.
gone = set(self._group.models) - set(unique)
for g in gone:
del self._group.models[g]
for x in unique:
if x not in self._group.models:
self.add_segment(x)
results = self._group.fit(choosers, alternatives, current_choice)
logger.debug(
'finish: fit models in segmented LCM {}'.format(self.name))
return results
@property
def fitted(self):
"""
Whether models for all segments have been fit.
"""
return self._group.fitted
def _filter_choosers_alts(self, choosers, alternatives):
"""
Apply filters to the choosers and alts tables.
"""
return (
util.apply_filter_query(
choosers, self.choosers_predict_filters),
util.apply_filter_query(
alternatives, self.alts_predict_filters))
[docs] def probabilities(self, choosers, alternatives):
"""
Returns alternative probabilties for each chooser segment as
a dictionary keyed by segment name.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
Returns
-------
probabilties : dict of pandas.Series
"""
logger.debug(
'start: calculate probabilities in segmented LCM {}'.format(
self.name))
choosers, alternatives = self.apply_predict_filters(
choosers, alternatives)
result = self._group.probabilities(choosers, alternatives)
logger.debug(
'finish: calculate probabilities in segmented LCM {}'.format(
self.name))
return result
[docs] def summed_probabilities(self, choosers, alternatives):
"""
Returns the sum of probabilities for alternatives across all
chooser segments.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
Returns
-------
probs : pandas.Series
Summed probabilities from each segment added together.
"""
logger.debug(
'start: calculate summed probabilities in segmented LCM {}'.format(
self.name))
choosers, alternatives = self.apply_predict_filters(
choosers, alternatives)
result = self._group.summed_probabilities(choosers, alternatives)
logger.debug(
('finish: calculate summed probabilities in segmented LCM {}'
).format(self.name))
return result
[docs] def predict(self, choosers, alternatives, debug=False):
"""
Choose from among alternatives for a group of agents after
segmenting the `choosers` table.
Parameters
----------
choosers : pandas.DataFrame
Table describing the agents making choices, e.g. households.
Must have a column matching the .segmentation_col attribute.
alternatives : pandas.DataFrame
Table describing the things from which agents are choosing.
debug : bool
If debug is set to true, will set the variable "sim_pdf" on
the object to store the probabilities for mapping of the
outcome.
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
"""
logger.debug(
'start: predict models in segmented LCM {}'.format(self.name))
choosers, alternatives = self._filter_choosers_alts(
choosers, alternatives)
results = self._group.predict(choosers, alternatives, debug=debug)
logger.debug(
'finish: predict models in segmented LCM {}'.format(self.name))
return results
def _process_model_dict(self, d):
"""
Remove redundant items from a model's configuration dict.
Parameters
----------
d : dict
Modified in place.
Returns
-------
dict
Modified `d`.
"""
del d['model_type']
del d['sample_size']
del d['probability_mode']
del d['choice_mode']
del d['choosers_fit_filters']
del d['choosers_predict_filters']
del d['alts_fit_filters']
del d['alts_predict_filters']
del d['interaction_predict_filters']
del d['estimation_sample_size']
del d['prediction_sample_size']
del d['choice_column']
if d['model_expression'] == self.default_model_expr:
del d['model_expression']
d["name"] = yamlio.to_scalar_safe(d["name"])
return d
[docs] def to_dict(self):
"""
Returns a dict representation of this instance suitable for
conversion to YAML.
"""
return {
'model_type': 'segmented_discretechoice',
'name': self.name,
'segmentation_col': self.segmentation_col,
'sample_size': self.sample_size,
'probability_mode': self.probability_mode,
'choice_mode': self.choice_mode,
'choosers_fit_filters': self.choosers_fit_filters,
'choosers_predict_filters': self.choosers_predict_filters,
'alts_fit_filters': self.alts_fit_filters,
'alts_predict_filters': self.alts_predict_filters,
'interaction_predict_filters': self.interaction_predict_filters,
'estimation_sample_size': self.estimation_sample_size,
'prediction_sample_size': self.prediction_sample_size,
'choice_column': self.choice_column,
'default_config': {
'model_expression': self.default_model_expr,
},
'remove_alts': self.remove_alts,
'fitted': self.fitted,
'models': {
yamlio.to_scalar_safe(name):
self._process_model_dict(m.to_dict())
for name, m in self._group.models.items()
}
}
[docs] def to_yaml(self, str_or_buffer=None):
"""
Save a model respresentation to YAML.
Parameters
----------
str_or_buffer : str or file like, optional
By default a YAML string is returned. If a string is
given here the YAML will be written to that file.
If an object with a ``.write`` method is given the
YAML will be written to that object.
Returns
-------
j : str
YAML is string if `str_or_buffer` is not given.
"""
logger.debug('serializing segmented LCM {} to YAML'.format(self.name))
return yamlio.convert_to_yaml(self.to_dict(), str_or_buffer)
[docs] def choosers_columns_used(self):
"""
Columns from the choosers table that are used for filtering.
"""
return list(tz.unique(tz.concatv(
util.columns_in_filters(self.choosers_predict_filters),
util.columns_in_filters(self.choosers_fit_filters))))
[docs] def alts_columns_used(self):
"""
Columns from the alternatives table that are used for filtering.
"""
return list(tz.unique(tz.concatv(
util.columns_in_filters(self.alts_predict_filters),
util.columns_in_filters(self.alts_fit_filters))))
[docs] def interaction_columns_used(self):
"""
Columns from the interaction dataset used for filtering and in
the model. These may come originally from either the choosers or
alternatives tables.
"""
return self._group.interaction_columns_used()
[docs] def columns_used(self):
"""
Columns from any table used in the model. May come from either
the choosers or alternatives tables.
"""
return list(tz.unique(tz.concatv(
self.choosers_columns_used(),
self.alts_columns_used(),
self.interaction_columns_used(),
util.columns_in_formula(self.default_model_expr),
[self.segmentation_col])))
[docs] @classmethod
def fit_from_cfg(cls, choosers, chosen_fname, alternatives, cfgname, outcfgname=None):
"""
Parameters
----------
choosers : DataFrame
A dataframe of rows of agents that have made choices.
chosen_fname : string
A string indicating the column in the choosers dataframe which
gives which alternative the choosers have chosen.
alternatives : DataFrame
A dataframe of alternatives. It should include the current choices
from the choosers dataframe as well as some other alternatives from
which to sample. Values in choosers[chosen_fname] should index
into the alternatives dataframe.
cfgname : string
The name of the yaml config file from which to read the discrete
choice model.
outcfgname : string, optional (default cfgname)
The name of the output yaml config file where estimation results are written into.
Returns
-------
lcm : SegmentedMNLDiscreteChoiceModel which was used to fit
"""
logger.debug('start: fit from configuration {}'.format(cfgname))
lcm = cls.from_yaml(str_or_buffer=cfgname)
lcm.fit(choosers, alternatives, choosers[chosen_fname])
for k, v in lcm._group.models.items():
print("LCM RESULTS FOR SEGMENT %s\n" % str(k))
v.report_fit()
outcfgname = outcfgname or cfgname
lcm.to_yaml(str_or_buffer=outcfgname)
logger.debug('finish: fit into configuration {}'.format(outcfgname))
return lcm
[docs] @classmethod
def predict_from_cfg(cls, choosers, alternatives, cfgname=None, cfg=None,
alternative_ratio=2.0, debug=False):
"""
Simulate the discrete choices for the specified choosers
Parameters
----------
choosers : DataFrame
A dataframe of agents doing the choosing.
alternatives : DataFrame
A dataframe of alternatives which the choosers are locating in and
which have a supply.
cfgname : string
The name of the yaml config file from which to read the discrete
choice model.
cfg: string
an ordered yaml string of the model discrete choice model configuration.
Used to read config from memory in lieu of loading cfgname from disk.
alternative_ratio : float
Above the ratio of alternatives to choosers (default of 2.0),
the alternatives will be sampled to meet this ratio
(for performance reasons).
Returns
-------
choices : pandas.Series
Mapping of chooser ID to alternative ID. Some choosers
will map to a nan value when there are not enough alternatives
for all the choosers.
lcm : SegmentedMNLDiscreteChoiceModel which was used to predict
"""
logger.debug('start: predict from configuration {}'.format(cfgname))
if cfgname:
lcm = cls.from_yaml(str_or_buffer=cfgname)
elif cfg:
lcm = cls.from_yaml(yaml_str=cfg)
else:
msg = 'predict_from_cfg requires a configuration via the cfgname or cfg arguments'
logger.error(msg)
raise ValueError(msg)
if len(alternatives) > len(choosers) * alternative_ratio:
logger.info(
("Alternative ratio exceeded: %d alternatives "
"and only %d choosers") %
(len(alternatives), len(choosers)))
idxes = np.random.choice(
alternatives.index,
size=int(np.floor(len(choosers) * alternative_ratio)),
replace=False)
alternatives = alternatives.loc[idxes]
logger.info(
" after sampling %d alternatives are available\n"
% len(alternatives))
new_units = lcm.predict(choosers, alternatives, debug=debug)
print("Assigned %d choosers to new units" % len(new_units.dropna()))
logger.debug('finish: predict from configuration {}'.format(cfgname))
return new_units, lcm