"""
Utilities used within the ``urbansim.models`` package.
"""
import collections
import logging
import numbers
from tokenize import generate_tokens, NAME

try:
    from collections.abc import Mapping
except ImportError:  # Python 2
    from collections import Mapping

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

import numpy as np
import pandas as pd
import patsy
import toolz as tz

from ..utils.logutil import log_start_finish
logger = logging.getLogger(__name__)
def apply_filter_query(df, filters=None):
    """
    Filter a table down to the desired rows via ``DataFrame.query``.

    Parameters
    ----------
    df : pandas.DataFrame
    filters : list of str or str, optional
        Filters to apply. A list is joined together with ' and ' and
        the result passed to ``DataFrame.query``. A string is passed
        straight to ``DataFrame.query``.
        If not supplied (or empty) no filtering is done.

    Returns
    -------
    filtered_df : pandas.DataFrame
        The result of ``df.query``, or `df` itself when there is
        nothing to apply.

    """
    with log_start_finish('apply filter query: {!r}'.format(filters), logger):
        # Nothing to apply: hand the table back untouched.
        if not filters:
            return df

        query = filters if isinstance(filters, str) else ' and '.join(filters)
        return df.query(query)
def _filterize(name, value):
    """
    Turn a `name` and `value` into a string expression compatible
    with the ``DataFrame.query`` method.

    Parameters
    ----------
    name : str
        Should be the name of a column in the table to which the
        filter will be applied.

        A suffix of '_max' will result in a "less than" filter,
        a suffix of '_min' will result in a "greater than or equal to" filter,
        and no recognized suffix will result in an "equal to" filter.
    value : any
        Value side of filter for comparison to column values.
        NumPy scalars are converted to the equivalent Python scalar
        before being formatted.

    Returns
    -------
    filter_exp : str

    """
    if name.endswith('_min'):
        name = name[:-4]
        comp = '>='
    elif name.endswith('_max'):
        name = name[:-4]
        comp = '<'
    else:
        comp = '=='

    # NumPy >= 2 reprs scalars as e.g. ``np.float64(5.0)``, which
    # ``DataFrame.query`` cannot evaluate. Unwrapping to the plain Python
    # scalar gives a literal that query understands on every NumPy version.
    if isinstance(value, np.generic):
        value = value.item()

    result = '{} {} {!r}'.format(name, comp, value)
    # Note: ``name`` has already had any '_min'/'_max' suffix stripped here.
    logger.debug(
        'converted name=%s and value=%s to filter %s', name, value, result)
    return result
def filter_table(table, filter_series, ignore=None):
    """
    Filter a table based on a set of restrictions given in
    Series of column name / filter parameter pairs. The column
    names can have suffixes `_min` and `_max` to indicate
    "greater than or equal to" and "less than" constraints.

    Parameters
    ----------
    table : pandas.DataFrame
        Table to filter.
    filter_series : pandas.Series
        Series of column name / value pairs of filter constraints.
        Columns that end with '_max' will be used to create
        "less than" filters, columns that end with '_min' will be
        used to create "greater than or equal to" filters.
        A column with no suffix will be used to make an 'equal to' filter.
        Entries whose value is a numeric NaN are skipped.
    ignore : sequence of str, optional
        List of column names that should not be used for filtering.

    Returns
    -------
    filtered : pandas.DataFrame

    """
    with log_start_finish('filter table', logger):
        ignore = ignore if ignore else set()

        # ``Series.iteritems`` was removed in pandas 2.0; ``items`` is the
        # supported spelling and yields the same (label, value) pairs.
        filters = [_filterize(name, val)
                   for name, val in filter_series.items()
                   if not (name in ignore or
                           (isinstance(val, numbers.Number) and
                            np.isnan(val)))]

        return apply_filter_query(table, filters)
def concat_indexes(indexes):
    """
    Join several pandas Indexes end-to-end into a single Index.

    Parameters
    ----------
    indexes : sequence of pandas.Index

    Returns
    -------
    pandas.Index
        Built from the concatenated values of `indexes`, in order.

    """
    combined_values = np.concatenate(indexes)
    return pd.Index(combined_values)
def has_constant_expr(expr):
    """
    Report whether a model expression has a constant-specific term.
    That is, a term explicitly specifying whether the model should or
    should not include a constant. (e.g. '+ 1' or '- 1'.)

    Parameters
    ----------
    expr : str
        Model expression to check.

    Returns
    -------
    has_constant : bool

    """
    def mentions_constant(node):
        # A node counts if it is itself of type 'ONE' or if any node
        # beneath it (reached through ``args``) is.
        return node.type == 'ONE' or any(
            mentions_constant(child) for child in node.args)

    parse_tree = patsy.parse_formula.parse_formula(expr)
    return mentions_constant(parse_tree)
def str_model_expression(expr, add_constant=True):
    """
    We support specifying model expressions as strings, lists, or dicts;
    but for use with patsy and statsmodels we need a string.
    This function will take any of those as input and return a string.

    Parameters
    ----------
    expr : str, iterable, or dict
        A string will be returned unmodified except to add or remove
        a constant.
        An iterable sequence will be joined together with ' + '.
        A dictionary should have ``right_side`` and, optionally,
        ``left_side`` keys. The ``right_side`` can be a list or a string
        and will be handled as above. If ``left_side`` is present it will
        be joined with ``right_side`` with ' ~ '.
    add_constant : bool, optional
        Whether to add a ' + 1' (if True) or ' - 1' (if False) to the model.
        If the expression already has a '+ 1' or '- 1' this option will be
        ignored.

    Returns
    -------
    model_expression : str
        A string model expression suitable for use with statsmodels and patsy.

    """
    if isinstance(expr, str):
        model_expression = expr
    else:
        # ``collections.Mapping`` was removed in Python 3.10; ``Mapping``
        # is imported at module level from ``collections.abc``.
        if isinstance(expr, Mapping):
            left_side = expr.get('left_side')
            right_side = str_model_expression(expr['right_side'], add_constant)
        else:
            # some kind of iterable like a list
            left_side = None
            right_side = ' + '.join(expr)

        if left_side:
            model_expression = ' ~ '.join((left_side, right_side))
        else:
            model_expression = right_side

    # Only append a constant term when the expression does not already
    # state one explicitly.
    if not has_constant_expr(model_expression):
        if add_constant:
            model_expression += ' + 1'
        else:
            model_expression += ' - 1'

    logger.debug(
        'converted expression: {!r} to model: {!r}'.format(
            expr, model_expression))
    return model_expression
def sorted_groupby(df, groupby):
    """
    Perform a groupby on a DataFrame using a specific column
    and assuming that that column is sorted.

    Parameters
    ----------
    df : pandas.DataFrame
    groupby : object
        Column name on which to groupby. This column must be sorted.

    Returns
    -------
    generator
        Yields pairs of group_name, DataFrame. Nothing is yielded
        when `df` has no rows.

    """
    keys = df[groupby]

    # With no rows there is no first key to seed the scan with; without
    # this guard ``iloc[0]`` raises IndexError.
    if keys.empty:
        return

    start = 0
    prev = keys.iloc[start]

    # Each change in key value closes the group that started at ``start``.
    for i, x in enumerate(keys):
        if x != prev:
            yield prev, df.iloc[start:i]
            prev = x
            start = i

    # need to send back the last group
    yield prev, df.iloc[start:]
def columns_in_filters(filters):
    """
    Returns a list of the columns used in a set of query filters.

    Parameters
    ----------
    filters : list of str or str
        List of the filters as passed to ``apply_filter_query``.

    Returns
    -------
    columns : list of str
        The distinct names mentioned in the filters, in order of first
        appearance. Boolean operators and the literals ``True``,
        ``False`` and ``None`` are not reported.

    """
    if not filters:
        return []

    if not isinstance(filters, str):
        filters = ' '.join(filters)

    # Names to skip. Seeded with the words that tokenize as NAME but are
    # not columns (operators and literal constants); every column found is
    # added too, so the same set also handles de-duplication.
    skip = {'and', 'or', 'in', 'not', 'True', 'False', 'None'}
    columns = []

    for toknum, tokval, _, _, _ in generate_tokens(StringIO(filters).readline):
        if toknum == NAME and tokval not in skip:
            skip.add(tokval)
            columns.append(tokval)

    return columns
def _tokens_from_patsy(node):
    """
    Yields all the individual tokens from within a patsy formula
    as parsed by patsy.parse_formula.parse_formula.

    Tokens of a node's ``args`` are yielded (left to right) before the
    node's own token; nodes with a falsy token contribute nothing.

    Parameters
    ----------
    node : patsy.parse_formula.ParseNode

    """
    # Explicit-stack post-order walk. Each entry pairs a node with a flag
    # saying whether its children have already been scheduled.
    pending = [(node, False)]

    while pending:
        current, children_scheduled = pending.pop()

        if children_scheduled:
            if current.token:
                yield current.token
            continue

        # Revisit this node after its children; push children reversed so
        # they are popped in their original left-to-right order.
        pending.append((current, True))
        pending.extend(
            (child, False) for child in reversed(list(current.args)))