Source code for choicemodels.tools.distancematrix

"""
Utilities for constructing pairwise distance matrices and calculating which
geographies are within various distance bands of some reference geography
"""




import numpy as np
import pandas as pd
from itertools import tee
from scipy.spatial.distance import squareform, pdist




def great_circle_vec(lat1, lng1, lat2, lng2, earth_radius=6371009):
    """
    Vectorized function to calculate the great-circle distance between two
    points or between vectors of points. This function is borrowed from OSMnx:
    https://github.com/gboeing/osmnx

    Parameters
    ----------
    lat1 : float or vector of floats
    lng1 : float or vector of floats
    lat2 : float or vector of floats
    lng2 : float or vector of floats
    earth_radius : numeric
        radius of earth in units in which distance will be returned (default is
        meters)

    Returns
    -------
    distance : float or vector of floats
        distance or vector of distances from (lat1, lng1) to (lat2, lng2) in
        units of earth_radius
    """

    phi1 = np.deg2rad(90 - lat1)
    phi2 = np.deg2rad(90 - lat2)

    theta1 = np.deg2rad(lng1)
    theta2 = np.deg2rad(lng2)

    cos = (np.sin(phi1) * np.sin(phi2) * np.cos(theta1 - theta2) + np.cos(phi1) * np.cos(phi2))
    arc = np.arccos(cos)

    # return distance in units of earth_radius
    distance = arc * earth_radius
    return distance




[docs]def great_circle_distance_matrix(df, x, y, earth_radius=6371009, return_int=True): """ Calculate a pairwise great-circle distance matrix from a DataFrame of points. Distances returned are in units of earth_radius (default is meters). Parameters ---------- df : pandas DataFrame a DataFrame of points, uniquely indexed by place identifier (e.g., tract ID or parcel ID), represented by x and y coordinate columns x : str label of the x coordinate column in the DataFrame y : str label of the y coordinate column in the DataFrame earth_radius : numeric radius of earth in units in which distance will be returned (default is meters) return_int : bool if True, convert all distances to integers Returns ------- pandas Series Multi-indexed distance vector in units of df's values, with top-level index representing "from" and second-level index representing "to". """ # calculate pairwise great-circle distances between each row and every # other row df_dist_matrix = df.apply(lambda row: great_circle_vec(row[y], row[x], df[y], df[x], earth_radius=earth_radius), axis='columns') # convert all distances to integers, if so configured if return_int: df_dist_matrix = df_dist_matrix.fillna(0).astype(int) # convert the distance matrix to a multi-indexed vector and return it labels = df.index.values df_dist_matrix.columns = labels df_dist_matrix.index = labels return df_dist_matrix.stack()
[docs]def euclidean_distance_matrix(df): """ Calculate a pairwise euclidean distance matrix from a DataFrame of points. Distances returned are in units of x and y columns. Parameters ---------- df : pandas DataFrame a DataFrame of points, uniquely indexed by place identifier (e.g., tract ID or parcel ID), represented by x and y coordinate columns Returns ------- pandas Series Multi-indexed distance vector in units of df's values, with top-level index representing "from" and second-level index representing "to". """ # calculate pairwise euclidean distances between all rows distance_vector = pdist(X=df, metric='euclidean') # convert the distance vector to a square distance matrix dist_matrix = squareform(distance_vector) # convert the distance matrix to a multi-indexed vector and return it labels = df.index.values df_dist_matrix = pd.DataFrame(data=dist_matrix, columns=labels, index=labels) return df_dist_matrix.stack()
def network_distance_matrix(df, x, y): """ Calculate a pairwise network distance matrix from a DataFrame of points. Parameters ---------- df : pandas DataFrame a DataFrame of points, uniquely indexed by place identifier (e.g., tract ID or parcel ID), represented by x and y coordinate columns x : str label of the x coordinate column in the DataFrame y : str label of the y coordinate column in the DataFrame Returns ------- None """ # placeholder until implementation of network distance matrix calculation raise NotImplementedError('network_distance_matrix is not currently implemented')
[docs]def distance_matrix(df, method='euclidean', x='lng', y='lat', earth_radius=6371009, return_int=True): """ Calculate a pairwise distance matrix from a DataFrame of two-dimensional points. Parameters ---------- df : pandas DataFrame a DataFrame of points, uniquely indexed by place identifier (e.g., tract ID or parcel ID), represented by x and y coordinate columns method : str {'euclidean', 'greatcircle', 'network'} which algorithm to use for calculating pairwise distances x : str if method='greatcircle' or 'network', label of the x coordinate column in the DataFrame y : str if method='greatcircle' or 'network', label of the y coordinate column in the DataFrame earth_radius : numeric if method='greatcircle', radius of earth in units in which distance will be returned (default is meters) return_int : bool if method='greatcircle', if True, convert all distances to integers Returns ------- pandas Series Multi-indexed distance vector in units of df's values, with top-level index representing "from" and second-level index representing "to". """ if not df.index.is_unique: raise ValueError('The passed-in DataFrame must have a unique index') if method == 'euclidean': return euclidean_distance_matrix(df=df) elif method == 'greatcircle': return great_circle_distance_matrix(df=df, x=x, y=y, earth_radius=earth_radius, return_int=return_int) elif method == 'network': return network_distance_matrix(df=df, x=x, y=y) else: raise ValueError('argument `method` must be one of "euclidean", "greatcircle", or "network"')
def pairwise(iterable): """ Iterate through a list, pairwise. Parameters ---------- iterable : list-like the list-like object to iterate through pairwise Returns ------- zip a zipped iterable of pairwise tuples """ a, b = tee(iterable) next(b, None) return zip(a, b)
[docs]def distance_bands(dist_vector, distances): """ Identify all geographies located within each distance band of each geography. The list of distances is treated pairwise to create distance bands, with the first element of each pair forming the band's inclusive lower limit and the second element of each pair forming the band's exclusive upper limit. For example, if distances=[0, 10, 30], band 0 will contain all geographies with a distance >= 0 and < 10 units (e.g., meters) from the reference geography, and band 1 will contain all geographies with a distance >= 10 and < 30 units from the reference geography. To make the final distance band include all geographies beyond a certain distance, make the final value in the distances list np.inf. Parameters ---------- dist_vector : pandas Series Multi-indexed distance vector in units of df's values, with top-level index representing "from" and second-level index representing "to". distances : list a list of distance band increments Returns ------- pandas Series a series multi-indexed by geography ID and distance band number, with values of arrays of geography IDs with the corresponding distances from that ID """ # loop through each row in distance matrix, identifying all geographies # within each distance band of the row's geography bands = {} dist_matrix = dist_vector.unstack() for _, row in dist_matrix.iterrows(): bands[row.name] = {} # for each distance band for band_number, (dist1, dist2) in enumerate(pairwise(distances)): # find all the geographies within this distance band mask = (row >= dist1) & (row < dist2) place_ids = row[mask].index.values # store value as array of geography IDs keyed by reference geography # ID and distance band number bands[row.name][band_number] = place_ids # convert geography bands to a series multi-indexed by geography ID and # distance band number return pd.DataFrame(bands).T.stack()