
Application Programming Interface

aggregate_zones(mgra_gdf, method='kmeans', n_zones=2000, random_state=0, cluster_factors=None, cluster_factors_onehot=None, use_xy=True, explicit_agg=(), explicit_col='mgra', agg_instruction=None, start_cluster_ids=13)

Aggregate zones.

Parameters

mgra_gdf : GeoDataFrame
    Geometry and attributes of the MGRAs.
method : str
    One of {'kmeans', 'agglom', 'agglom_adj'}, default 'kmeans'.
n_zones : int
    Target number of aggregated zones.
random_state : RandomState or int
    Seed or random state for reproducible clustering.
cluster_factors : dict
    Mapping of numeric attribute column names to weights used in clustering.
cluster_factors_onehot : dict
    Mapping of categorical attribute column names to weights; each is one-hot encoded before clustering.
use_xy : bool or float
    Use X and Y coordinates as a cluster factor; use a float to scale the x-y coordinates from the CRS if needed.
explicit_agg : list[int or list]
    A list containing integers (individual MGRAs that should not be aggregated) or lists of integers (groups of MGRAs that should be aggregated exactly as given, with no less and no more).
explicit_col : str
    The name of the column containing the IDs from explicit_agg, usually 'taz'.
agg_instruction : dict
    Dictionary passed to pandas agg that says how to aggregate data columns.
start_cluster_ids : int, default 13
    Cluster IDs start at this value. Can be 1, but SANDAG typically reserves the smallest IDs for external zones, so starting at a greater value is typical.

Returns

GeoDataFrame
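
A minimal usage sketch (the shapefile name and the "pop"/"emp" attribute columns are assumptions for illustration, not part of the documented API):

import geopandas as gpd
from rsm.zone_agg import aggregate_zones

mgra_gdf = gpd.read_file("mgra.shp")  # hypothetical MGRA geometry + attributes

agg_gdf = aggregate_zones(
    mgra_gdf,
    method="kmeans",
    n_zones=2000,
    cluster_factors={"pop": 1.0, "emp": 1.0},      # weighted numeric attributes
    use_xy=0.01,                                   # scale CRS x-y coordinates
    explicit_agg=[101, [102, 103, 104]],           # keep 101 alone; merge 102-104 exactly
    explicit_col="taz",                            # assumes a 'taz' id column exists
    agg_instruction={"pop": "sum", "emp": "sum"},  # passed to pandas agg
    start_cluster_ids=13,
)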

Source code in rsm/zone_agg.py
def aggregate_zones(
    mgra_gdf,
    method="kmeans",
    n_zones=2000,
    random_state=0,
    cluster_factors=None,
    cluster_factors_onehot=None,
    use_xy=True,
    explicit_agg=(),
    explicit_col="mgra",
    agg_instruction=None,
    start_cluster_ids=13,
):
    """
    Aggregate zones.

    Parameters
    ----------
    mgra_gdf : GeoDataFrame
        Geometry and attributes of MGRAs
    method : str
        One of {'kmeans', 'agglom', 'agglom_adj'}, default 'kmeans'
    n_zones : int
    random_state : RandomState or int
    cluster_factors : dict
    cluster_factors_onehot : dict
    use_xy : bool or float
        Use X and Y coordinates as a cluster factor; use a float to scale the
        x-y coordinates from the CRS if needed.
    explicit_agg : list[int or list]
        A list containing integers (individual MGRAs that should not be aggregated)
        or lists of integers (groups of MGRAs that should be aggregated exactly as
        given, with no less and no more)
    explicit_col : str
        The name of the column containing the ID's from `explicit_agg`, usually 'taz'
    agg_instruction : dict
        Dictionary passed to pandas `agg` that says how to aggregate data columns.
    start_cluster_ids : int, default 13
        Cluster id's start at this value.  Can be 1, but typically SANDAG has the
        smallest id's reserved for external zones, so starting at a greater value
        is typical.

    Returns
    -------
    GeoDataFrame
    """

    if cluster_factors is None:
        cluster_factors = {}

    n = start_cluster_ids
    if explicit_agg:
        explicit_agg_ids = {}
        for i in explicit_agg:
            if isinstance(i, Number):
                explicit_agg_ids[i] = n
            else:
                for j in i:
                    explicit_agg_ids[j] = n
            n += 1
        if explicit_col == mgra_gdf.index.name:
            mgra_gdf = mgra_gdf.reset_index()
            mgra_gdf.index = mgra_gdf[explicit_col]
        in_explicit = mgra_gdf[explicit_col].isin(explicit_agg_ids)
        mgra_gdf_algo = mgra_gdf.loc[~in_explicit].copy()
        mgra_gdf_explicit = mgra_gdf.loc[in_explicit].copy()
        mgra_gdf_explicit["cluster_id"] = mgra_gdf_explicit[explicit_col].map(
            explicit_agg_ids
        )
        n_zones_algorithm = n_zones - len(
            mgra_gdf_explicit["cluster_id"].value_counts()
        )
    else:
        mgra_gdf_algo = mgra_gdf.copy()
        mgra_gdf_explicit = None
        n_zones_algorithm = n_zones

    if use_xy:
        geometry = mgra_gdf_algo.centroid
        X = list(geometry.apply(lambda p: p.x))
        Y = list(geometry.apply(lambda p: p.y))
        factors = [np.asarray(X) * use_xy, np.asarray(Y) * use_xy]
    else:
        factors = []
    for cf, cf_wgt in cluster_factors.items():
        factors.append(cf_wgt * mgra_gdf_algo[cf].values.astype(np.float32))
    if cluster_factors_onehot:
        for cf, cf_wgt in cluster_factors_onehot.items():
            factors.append(cf_wgt * OneHotEncoder().fit_transform(mgra_gdf_algo[[cf]]))
        from scipy.sparse import hstack

        factors2d = []
        for j in factors:
            if j.ndim < 2:
                factors2d.append(np.expand_dims(j, -1))
            else:
                factors2d.append(j)
        data = hstack(factors2d).toarray()
    else:
        data = np.array(factors).T

    if method == "kmeans":
        kmeans = KMeans(n_clusters=n_zones_algorithm, random_state=random_state)
        kmeans.fit(data)
        cluster_id = kmeans.labels_
    elif method == "agglom":
        agglom = AgglomerativeClustering(
            n_clusters=n_zones_algorithm, affinity="euclidean", linkage="ward"
        )
        agglom.fit_predict(data)
        cluster_id = agglom.labels_
    elif method == "agglom_adj":
        from libpysal.weights import Rook

        w_rook = Rook.from_dataframe(mgra_gdf_algo)
        adj_mat = nx.adjacency_matrix(w_rook.to_networkx())
        agglom = AgglomerativeClustering(
            n_clusters=n_zones_algorithm,
            affinity="euclidean",
            linkage="ward",
            connectivity=adj_mat,
        )
        agglom.fit_predict(data)
        cluster_id = agglom.labels_
    else:
        raise NotImplementedError(method)
    mgra_gdf_algo["cluster_id"] = cluster_id

    if mgra_gdf_explicit is None or len(mgra_gdf_explicit) == 0:
        combined = merge_zone_data(
            mgra_gdf_algo,
            agg_instruction,
            cluster_id="cluster_id",
        )
        combined["cluster_id"] = list(range(n, n + n_zones_algorithm))
    else:
        pending = []
        for df in [mgra_gdf_algo, mgra_gdf_explicit]:
            logger.info(f"... merging {len(df)}")
            pending.append(
                merge_zone_data(
                    df,
                    agg_instruction,
                    cluster_id="cluster_id",
                ).reset_index()
            )

        pending[0]["cluster_id"] = list(range(n, n + n_zones_algorithm))

        pending[0] = pending[0][
            [c for c in pending[1].columns if c in pending[0].columns]
        ]
        pending[1] = pending[1][
            [c for c in pending[0].columns if c in pending[1].columns]
        ]
        combined = pd.concat(pending, ignore_index=False)
    combined = combined.reset_index(drop=True)

    return combined

agg_input_files(model_dir='.', rsm_dir='.', taz_cwk_file='taz_crosswalk.csv', mgra_cwk_file='mgra_crosswalk.csv', agg_zones=2000, ext_zones=12, input_files=['microMgraEquivMinutes.csv', 'microMgraTapEquivMinutes.csv', 'walkMgraTapEquivMinutes.csv', 'walkMgraEquivMinutes.csv', 'bikeTazLogsum.csv', 'bikeMgraLogsum.csv', 'zone.term', 'zones.park', 'tap.ptype', 'accessam.csv', 'ParkLocationAlts.csv', 'CrossBorderDestinationChoiceSoaAlternatives.csv', 'TourDcSoaDistanceAlts.csv', 'DestinationChoiceAlternatives.csv', 'SoaTazDistAlts.csv', 'TripMatrices.csv', 'transponderModelAccessibilities.csv', 'crossBorderTours.csv', 'internalExternalTrips.csv', 'visitorTours.csv', 'visitorTrips.csv', 'householdAVTrips.csv', 'crossBorderTrips.csv', 'TNCTrips.csv', 'airport_out.SAN.csv', 'airport_out.CBX.csv', 'TNCtrips.csv'])

Parameters

model_dir : path_like
    Path to the full model run, default "."
rsm_dir : path_like
    Path to the RSM run, default "."
taz_cwk_file : csv file
    Default taz_crosswalk.csv; the TAZ to aggregated zones file. Should be located in the RSM input folder.
mgra_cwk_file : csv file
    Default mgra_crosswalk.csv; the MGRA to aggregated zones file. Should be located in the RSM input folder.
agg_zones : int
    Number of aggregated internal zones, default 2000.
ext_zones : int
    Number of external zones, default 12.
input_files : list
    List of input files to be aggregated. Should include the following files: "microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv", "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv", "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv", "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv", "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv", "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv", "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv", "TNCtrips.csv"

Returns

Aggregated files in the RSM input/output/uec directory
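
A minimal invocation sketch (all paths are placeholders; the crosswalk CSVs are expected in the RSM input folder, and aggregated files are written back under rsm_dir):

from rsm.input_agg import agg_input_files

agg_input_files(
    model_dir="path/to/full_model_run",   # placeholder path
    rsm_dir="path/to/rsm_run",            # placeholder path
    taz_cwk_file="taz_crosswalk.csv",
    mgra_cwk_file="mgra_crosswalk.csv",
    agg_zones=2000,
    ext_zones=12,
)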

Source code in rsm/input_agg.py
def agg_input_files(
    model_dir = ".", 
    rsm_dir = ".",
    taz_cwk_file = "taz_crosswalk.csv",
    mgra_cwk_file = "mgra_crosswalk.csv",
    agg_zones=2000,
    ext_zones=12,
    input_files = ["microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", 
    "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv",
    "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv",
    "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv", 
    "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv",
    "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv", 
    "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv", 
    "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv", 
    "TNCtrips.csv"]
    ):

    """
        Parameters
        ----------
        model_dir : model_dir (path_like)
            path to full model run, default "."
        rsm_dir : rsm_dir (path_like)
            path to RSM, default "."
        taz_cwk_file : taz_cwk_file (csv file)
            default taz_crosswalk.csv
            taz to aggregated zones file. Should be located in RSM input folder
        mgra_cwk_file : mgra_cwk_file (csv file)
            default mgra_crosswalk.csv
            mgra to aggregated zones file. Should be located in RSM input folder
        input_files : input_files (csv + other files)
            list of input files to be aggregated. 
            Should include the following files
                "microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", 
                "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv",
                "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv",
                "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv",
                "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv",
                "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv",
                "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv",
                "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv",
                "TNCtrips.csv"

        Returns
        -------
        Aggregated files in the RSM input/output/uec directory
    """

    df_clusters = pd.read_csv(os.path.join(rsm_dir, "input", taz_cwk_file))
    df_clusters.columns= df_clusters.columns.str.strip().str.lower()
    dict_clusters = dict(zip(df_clusters['taz'], df_clusters['cluster_id']))

    mgra_cwk = pd.read_csv(os.path.join(rsm_dir, "input", mgra_cwk_file))
    mgra_cwk.columns= mgra_cwk.columns.str.strip().str.lower()
    mgra_cwk = dict(zip(mgra_cwk['mgra'], mgra_cwk['cluster_id']))

    taz_zones = int(agg_zones) + int(ext_zones)
    mgra_zones = int(agg_zones)

    # aggregating microMgraEquivMinutes.csv
    if "microMgraEquivMinutes.csv" in input_files:
        logging.info("Aggregating - microMgraEquivMinutes.csv")
        df_mm_eqmin = pd.read_csv(os.path.join(model_dir, "output", "microMgraEquivMinutes.csv"))
        df_mm_eqmin['i_new'] = df_mm_eqmin['i'].map(mgra_cwk)
        df_mm_eqmin['j_new'] = df_mm_eqmin['j'].map(mgra_cwk)

        df_mm_eqmin_agg = df_mm_eqmin.groupby(['i_new', 'j_new'])[
            ['walkTime', 'dist', 'mmTime', 'mmCost', 'mtTime', 'mtCost',
             'mmGenTime', 'mtGenTime', 'minTime']
        ].mean().reset_index()

        df_mm_eqmin_agg = df_mm_eqmin_agg.rename(columns = {'i_new' : 'i', 'j_new' : 'j'})
        df_mm_eqmin_agg.to_csv(os.path.join(rsm_dir, "input", "microMgraEquivMinutes.csv"), index = False)

    else:
        raise FileNotFoundError("microMgraEquivMinutes.csv")


    # aggregating microMgraTapEquivMinutes.csv
    if "microMgraTapEquivMinutes.csv" in input_files:
        logging.info("Aggregating - microMgraTapEquivMinutes.csv")
        df_mm_tap = pd.read_csv(os.path.join(model_dir, "output", "microMgraTapEquivMinutes.csv"))
        df_mm_tap['mgra'] = df_mm_tap['mgra'].map(mgra_cwk)

        df_mm_tap_agg = df_mm_tap.groupby(['mgra', 'tap'])[
            ['walkTime', 'dist', 'mmTime', 'mmCost', 'mtTime',
             'mtCost', 'mmGenTime', 'mtGenTime', 'minTime']
        ].mean().reset_index()

        df_mm_tap_agg.to_csv(os.path.join(rsm_dir, "input", "microMgraTapEquivMinutes.csv"), index = False)

    else:
        raise FileNotFoundError("microMgraTapEquivMinutes.csv")

    # aggregating walkMgraTapEquivMinutes.csv
    if "walkMgraTapEquivMinutes.csv" in input_files:
        logging.info("Aggregating - walkMgraTapEquivMinutes.csv")
        df_wlk_mgra_tap = pd.read_csv(os.path.join(model_dir, "output", "walkMgraTapEquivMinutes.csv"))
        df_wlk_mgra_tap["mgra"] = df_wlk_mgra_tap["mgra"].map(mgra_cwk)

        df_wlk_mgra_agg = df_wlk_mgra_tap.groupby(["mgra", "tap"])[
            ["boardingPerceived", "boardingActual", "alightingPerceived",
             "alightingActual", "boardingGain", "alightingGain"]
        ].mean().reset_index()
        df_wlk_mgra_agg.to_csv(os.path.join(rsm_dir, "input", "walkMgraTapEquivMinutes.csv"), index = False)

    else:
        FileNotFoundError("walkMgraTapEquivMinutes.csv")

    # aggregating walkMgraEquivMinutes.csv
    if "walkMgraEquivMinutes.csv" in input_files:
        logging.info("Aggregating - walkMgraEquivMinutes.csv")
        df_wlk_min = pd.read_csv(os.path.join(model_dir, "output", "walkMgraEquivMinutes.csv"))
        df_wlk_min["i"] = df_wlk_min["i"].map(mgra_cwk)
        df_wlk_min["j"] = df_wlk_min["j"].map(mgra_cwk)

        df_wlk_min_agg = df_wlk_min.groupby(["i", "j"])[["percieved", "actual", "gain"]].mean().reset_index()

        df_wlk_min_agg.to_csv(os.path.join(rsm_dir, "input", "walkMgraEquivMinutes.csv"), index = False)

    else:
        FileNotFoundError("walkMgraEquivMinutes.csv")

    # aggregating biketazlogsum
    if "bikeTazLogsum.csv" in input_files:
        logging.info("Aggregating - bikeTazLogsum.csv")
        bike_taz = pd.read_csv(os.path.join(model_dir, "output", "bikeTazLogsum.csv"))

        bike_taz["i"] = bike_taz["i"].map(dict_clusters)
        bike_taz["j"] = bike_taz["j"].map(dict_clusters)

        bike_taz_agg = bike_taz.groupby(["i", "j"])[["logsum", "time"]].mean().reset_index()
        bike_taz_agg.to_csv(os.path.join(rsm_dir, "input", "bikeTazLogsum.csv"), index = False)

    else:
        raise FileNotFoundError("bikeTazLogsum.csv")

    # aggregating bikeMgraLogsum.csv
    if "bikeMgraLogsum.csv" in input_files:
        logging.info("Aggregating - bikeMgraLogsum.csv")
        bike_mgra = pd.read_csv(os.path.join(model_dir, "output", "bikeMgraLogsum.csv"))
        bike_mgra["i"] = bike_mgra["i"].map(mgra_cwk)
        bike_mgra["j"] = bike_mgra["j"].map(mgra_cwk)

        bike_mgra_agg = bike_mgra.groupby(["i", "j"])[["logsum", "time"]].mean().reset_index()
        bike_mgra_agg.to_csv(os.path.join(rsm_dir, "input", "bikeMgraLogsum.csv"), index = False)
    else:
        raise FileNotFoundError("bikeMgraLogsum.csv")

    # aggregating zone.term
    if "zone.term" in input_files:
        logging.info("Aggregating - zone.term")
        df_zone_term = pd.read_fwf(os.path.join(model_dir, "input", "zone.term"), header = None)
        df_zone_term.columns = ["taz", "terminal_time"]

        df_agg = pd.merge(df_zone_term, df_clusters, on = "taz", how = 'left')
        df_zones_agg = df_agg.groupby(["cluster_id"])['terminal_time'].max().reset_index()

        df_zones_agg.columns = ["taz", "terminal_time"]
        df_zones_agg.to_fwf(os.path.join(rsm_dir, "input", "zone.term"))

    else:
        raise FileNotFoundError("zone.term")

    # aggregating zones.park
    if "zones.park" in input_files:
        logging.info("Aggregating - zone.park")
        df_zones_park = pd.read_fwf(os.path.join(model_dir, "input", "zone.park"), header = None)
        df_zones_park.columns = ["taz", "park_zones"]

        df_zones_park_agg = pd.merge(df_zones_park, df_clusters, on = "taz", how = 'left')
        df_zones_park_agg = df_zones_park_agg.groupby(["cluster_id"])['park_zones'].max().reset_index()
        df_zones_park_agg.columns = ["taz", "park_zones"]
        df_zones_park_agg.to_fwf(os.path.join(rsm_dir, "input", "zone.park"))

    else:
        raise FileNotFoundError("zone.park")


    # aggregating tap.ptype 
    if "tap.ptype" in input_files:
        logging.info("Aggregating - tap.ptype")
        df_tap_ptype = pd.read_fwf(os.path.join(model_dir, "input", "tap.ptype"), header = None)
        df_tap_ptype.columns = ["tap", "lot id", "parking type", "taz", "capacity", "distance", "transit mode"]

        df_tap_ptype = pd.merge(df_tap_ptype, df_clusters, on = "taz", how = 'left')

        df_tap_ptype = df_tap_ptype[["tap", "lot id", "parking type", "cluster_id", "capacity", "distance", "transit mode"]]
        df_tap_ptype = df_tap_ptype.rename(columns = {"cluster_id": "taz"})
        #df_tap_ptype.to_fwf(os.path.join(rsm_dir, "input", "tap.ptype"))

        widths = [5, 6, 6, 5, 5, 5, 3]

        with open(os.path.join(rsm_dir, "input", "tap.ptype"), 'w') as f:
            for index, row in df_tap_ptype.iterrows():
                field1 = str(row[0]).rjust(widths[0])
                field2 = str(row[1]).rjust(widths[1])
                field3 = str(row[2]).rjust(widths[2])
                field4 = str(row[3]).rjust(widths[3])
                field5 = str(row[4]).rjust(widths[4])
                field6 = str(row[5]).rjust(widths[5])
                field7 = str(row[6]).rjust(widths[6])
                f.write(f'{field1}{field2}{field3}{field4}{field5}{field6}{field7}\n')

    else:
        raise FileNotFoundError("tap.ptype")

    #aggregating accessam.csv
    if "accessam.csv" in input_files:
        logging.info("Aggregating - accessam.csv")
        df_acc = pd.read_csv(os.path.join(model_dir, "input", "accessam.csv"), header = None)
        df_acc.columns = ['TAZ', 'TAP', 'TIME', 'DISTANCE', 'MODE']

        df_acc['TAZ'] = df_acc['TAZ'].map(dict_clusters)
        df_acc_agg = df_acc.groupby(['TAZ', 'TAP', 'MODE'])[['TIME', 'DISTANCE']].mean().reset_index()
        df_acc_agg = df_acc_agg[["TAZ", "TAP", "TIME", "DISTANCE", "MODE"]]

        df_acc_agg.to_csv(os.path.join(rsm_dir, "input", "accessam.csv"), index = False, header =False)
    else:
        raise FileNotFoundError("accessam.csv")

    # aggregating ParkLocationAlts.csv
    if "ParkLocationAlts.csv" in input_files:
        logging.info("Aggregating - ParkLocationAlts.csv")
        df_park = pd.read_csv(os.path.join(model_dir, "uec", "ParkLocationAlts.csv"))
        df_park['mgra_new'] = df_park["mgra"].map(mgra_cwk)
        df_park_agg = df_park.groupby(["mgra_new"])["parkarea"].min().reset_index() # assuming 1 is "parking" and 2 is "no parking"
        df_park_agg['a'] = [i + 1 for i in range(len(df_park_agg))]

        # reorder before renaming so each label matches its column
        df_park_agg = df_park_agg[["a", "mgra_new", "parkarea"]]
        df_park_agg.columns = ["a", "mgra", "parkarea"]
        df_park_agg.to_csv(os.path.join(rsm_dir, "uec", "ParkLocationAlts.csv"), index = False)

    else:
        FileNotFoundError("ParkLocationAlts.csv")

    # aggregating CrossBorderDestinationChoiceSoaAlternatives.csv
    if "CrossBorderDestinationChoiceSoaAlternatives.csv" in input_files:
        logging.info("Aggregating - CrossBorderDestinationChoiceSoaAlternatives.csv")
        df_cb = pd.read_csv(os.path.join(model_dir, "uec","CrossBorderDestinationChoiceSoaAlternatives.csv"))

        df_cb["mgra_entry"] = df_cb["mgra_entry"].map(mgra_cwk)
        df_cb["mgra_return"] = df_cb["mgra_return"].map(mgra_cwk)
        df_cb["a"] = df_cb["a"].map(mgra_cwk)

        df_cb = pd.merge(df_cb, df_clusters, left_on = "dest", right_on = "taz", how = 'left')
        df_cb = df_cb.drop(columns = ["dest", "taz"])
        df_cb = df_cb.rename(columns = {'cluster_id' : 'dest'})

        df_cb_final  = df_cb.drop_duplicates()

        df_cb_final = df_cb_final[["a", "dest", "poe", "mgra_entry", "mgra_return", "poe_taz"]]
        df_cb_final.to_csv(os.path.join(rsm_dir, "uec", "CrossBorderDestinationChoiceSoaAlternatives.csv"), index = False)

    else:
        FileNotFoundError("CrossBorderDestinationChoiceSoaAlternatives.csv")

    # aggregating households.csv
    if "households.csv" in input_files:
        logging.info("Aggregating - households.csv")
        df_hh = pd.read_csv(os.path.join(model_dir, "input", "households.csv"))
        df_hh["mgra"] = df_hh["mgra"].map(mgra_cwk)
        df_hh["taz"] = df_hh["taz"].map(dict_clusters)

        df_hh.to_csv(os.path.join(rsm_dir, "input", "households.csv"), index = False)

    else:
        FileNotFoundError("households.csv")

    # aggregating ShadowPricingOutput_school_9.csv
    if "ShadowPricingOutput_school_9.csv" in input_files:
        logging.info("Aggregating - ShadowPricingOutput_school_9.csv")
        df_sp_sch = pd.read_csv(os.path.join(model_dir, "input", "ShadowPricingOutput_school_9.csv"))

        agg_instructions = {}
        for col in df_sp_sch.columns:
            if "size" in col:
                agg_instructions.update({col: "sum"})

            if "shadowPrices" in col:
                agg_instructions.update({col: "max"})

            if "_origins" in col:
                agg_instructions.update({col: "sum"})

            if "_modeledDests" in col:
                agg_instructions.update({col: "sum"})

        df_sp_sch['mgra'] = df_sp_sch['mgra'].map(mgra_cwk)
        df_sp_sch_agg = df_sp_sch.groupby(['mgra']).agg(agg_instructions).reset_index()

        alt = list(df_sp_sch_agg['mgra'])
        df_sp_sch_agg.insert(loc=0, column="alt", value=alt)
        df_sp_sch_agg.loc[len(df_sp_sch_agg.index)] = 0

        df_sp_sch_agg.to_csv(os.path.join(rsm_dir, "input", "ShadowPricingOutput_school_9.csv"), index=False)

    else:
        FileNotFoundError("ShadowPricingOutput_school_9.csv")

    # aggregating ShadowPricingOutput_work_9.csv
    if "ShadowPricingOutput_work_9.csv" in input_files:
        logging.info("Aggregating - ShadowPricingOutput_work_9.csv")
        df_sp_wrk = pd.read_csv(os.path.join(model_dir, "input", "ShadowPricingOutput_work_9.csv"))

        agg_instructions = {}
        for col in df_sp_wrk.columns:
            if "size" in col:
                agg_instructions.update({col: "sum"})

            if "shadowPrices" in col:
                agg_instructions.update({col: "max"})

            if "_origins" in col:
                agg_instructions.update({col: "sum"})

            if "_modeledDests" in col:
                agg_instructions.update({col: "sum"})

        df_sp_wrk['mgra'] = df_sp_wrk['mgra'].map(mgra_cwk)

        df_sp_wrk_agg = df_sp_wrk.groupby(['mgra']).agg(agg_instructions).reset_index()

        alt = list(df_sp_wrk_agg['mgra'])
        df_sp_wrk_agg.insert(loc=0, column="alt", value=alt)

        df_sp_wrk_agg.loc[len(df_sp_wrk_agg.index)] = 0

        df_sp_wrk_agg.to_csv(os.path.join(rsm_dir, "input", "ShadowPricingOutput_work_9.csv"), index=False)

    else:
        FileNotFoundError("ShadowPricingOutput_work_9.csv")

    if "TourDcSoaDistanceAlts.csv" in input_files:
        logging.info("Aggregating - TourDcSoaDistanceAlts.csv")
        df_TourDcSoaDistanceAlts = pd.DataFrame({"a" : range(1,taz_zones+1), "dest" : range(1, taz_zones+1)})
        df_TourDcSoaDistanceAlts.to_csv(os.path.join(rsm_dir, "uec", "TourDcSoaDistanceAlts.csv"), index=False)

    if "DestinationChoiceAlternatives.csv" in input_files:
        logging.info("Aggregating - DestinationChoiceAlternatives.csv")
        df_DestinationChoiceAlternatives = pd.DataFrame({"a" : range(1,mgra_zones+1), "mgra" : range(1, mgra_zones+1)})
        df_DestinationChoiceAlternatives.to_csv(os.path.join(rsm_dir, "uec", "DestinationChoiceAlternatives.csv"), index=False)

    if "SoaTazDistAlts.csv" in input_files:
        logging.info("Aggregating - SoaTazDistAlts.csv")
        df_SoaTazDistAlts = pd.DataFrame({"a" : range(1,taz_zones+1), "dest" : range(1, taz_zones+1)})
        df_SoaTazDistAlts.to_csv(os.path.join(rsm_dir, "uec", "SoaTazDistAlts.csv"), index=False)

    if "TripMatrices.csv" in input_files:
        logging.info("Aggregating - TripMatrices.csv")
        trips = pd.read_csv(os.path.join(model_dir,"output", "TripMatrices.csv"))
        trips['i'] = trips['i'].map(dict_clusters)
        trips['j'] = trips['j'].map(dict_clusters)

        cols = list(trips.columns)
        cols.remove("i")
        cols.remove("j")

        trips_df = trips.groupby(['i', 'j'])[cols].sum().reset_index()
        trips_df.to_csv(os.path.join(rsm_dir, "output", "TripMatrices.csv"), index = False)

    else:
        FileNotFoundError("TripMatrices.csv")

    if "transponderModelAccessibilities.csv" in input_files:
        logging.info("Aggregating - transponderModelAccessibilities.csv")
        tran_access = pd.read_csv(os.path.join(model_dir, "output", "transponderModelAccessibilities.csv"))
        tran_access['TAZ'] = tran_access['TAZ'].map(dict_clusters)

        tran_access_agg = tran_access.groupby(['TAZ'])[['DIST', 'AVGTTS', 'PCTDETOUR']].mean().reset_index()
        tran_access_agg.to_csv(os.path.join(rsm_dir, "output","transponderModelAccessibilities.csv"), index = False)

    else:
        raise FileNotFoundError("transponderModelAccessibilities.csv")

    if "crossBorderTours.csv" in input_files:
        logging.info("Aggregating - crossBorderTours.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "crossBorderTours.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "crossBorderTours.csv"), index = False)

    else:
        raise FileNotFoundError("crossBorderTours.csv")

    if "crossBorderTrips.csv" in input_files:
        logging.info("Aggregating - crossBorderTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "crossBorderTrips.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "crossBorderTrips.csv"), index = False)

    else:
        raise FileNotFoundError("crossBorderTrips.csv")

    if "internalExternalTrips.csv" in input_files:
        logging.info("Aggregating - internalExternalTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "internalExternalTrips.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "internalExternalTrips.csv"), index = False)

    else:
        raise FileNotFoundError("internalExternalTrips.csv")

    if "visitorTours.csv" in input_files:
        logging.info("Aggregating - visitorTours.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "visitorTours.csv"))

        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df.to_csv(os.path.join(rsm_dir, "output", "visitorTours.csv"), index = False)

    else:
        raise FileNotFoundError("visitorTours.csv")

    if "visitorTrips.csv" in input_files:
        logging.info("Aggregating - visitorTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "visitorTrips.csv"))

        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df.to_csv(os.path.join(rsm_dir, "output", "visitorTrips.csv"), index = False)

    else:
        raise FileNotFoundError("visitorTrips.csv")

    if "householdAVTrips.csv" in input_files:
        logging.info("Aggregating - householdAVTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "householdAVTrips.csv"))
        #print(os.path.join(model_dir, "output", "householdAVTrips.csv"))
        df['orig_mgra'] = df['orig_mgra'].map(mgra_cwk)
        df['dest_gra'] = df['dest_gra'].map(mgra_cwk)

        df['trip_orig_mgra'] = df['trip_orig_mgra'].map(mgra_cwk)
        df['trip_dest_mgra'] = df['trip_dest_mgra'].map(mgra_cwk)
        df.to_csv(os.path.join(rsm_dir, "output", "householdAVTrips.csv"), index = False)

    else:
        raise FileNotFoundError("householdAVTrips.csv")

    if "airport_out.CBX.csv" in input_files:
        logging.info("Aggregating - airport_out.CBX.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "airport_out.CBX.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "airport_out.CBX.csv"), index = False)

    else:
        raise FileNotFoundError("airport_out.CBX.csv")

    if "airport_out.SAN.csv" in input_files:
        logging.info("Aggregating - airport_out.SAN.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "airport_out.SAN.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "airport_out.SAN.csv"), index = False)

    else:
        raise FileNotFoundError("airport_out.SAN.csv")

    if "TNCtrips.csv" in input_files:
        logging.info("Aggregating - TNCtrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "TNCtrips.csv"))
        df['originMgra'] = df['originMgra'].map(mgra_cwk)
        df['destinationMgra'] = df['destinationMgra'].map(mgra_cwk)

        df['originTaz'] = df['originTaz'].map(dict_clusters)
        df['destinationTaz'] = df['destinationTaz'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "TNCtrips.csv"), index = False)

    else:
        raise FileNotFoundError("TNCtrips.csv")

    files = ["Trip" + "_" + i + "_" + j + ".csv" for i, j in
                itertools.product(["FA", "GO", "IN", "RE", "SV", "TH", "WH"],
                                   ["OE", "AM", "MD", "PM", "OL"])]

    for file in files:
        logging.info(f"Aggregating - {file}")
        df = pd.read_csv(os.path.join(model_dir, "output", file))
        df['I'] = df['I'].map(dict_clusters)
        df['J'] = df['J'].map(dict_clusters)
        df['HomeZone'] = df['HomeZone'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output",file), index = False)

copy_transit_demand(matrix_names, input_dir='.', output_dir='.')

Copies the OMX transit demand matrices to the RSM directory.

Parameters

matrix_names : list
    OMX matrix filenames to copy.
input_dir : Path-like
    Default "."
output_dir : Path-like
    Default "."
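
A short usage sketch (matrix names and paths are illustrative; ".omx" is appended to names that lack it):

from rsm.translate import copy_transit_demand

copy_transit_demand(
    matrix_names=["transit_AM", "transit_PM"],  # hypothetical OMX file names
    input_dir="path/to/full_model/output",      # placeholder path
    output_dir="path/to/rsm/input",             # placeholder path
)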

Returns

Source code in rsm/translate.py
def copy_transit_demand(
    matrix_names,
    input_dir=".",
    output_dir="."
):
    """
    Copies the OMX transit demand matrices to the RSM directory.

    Parameters
    ----------
    matrix_names : list
        OMX matrix filenames to copy
    input_dir : Path-like
        default "."
    output_dir : Path-like
        default "."

    Returns
    -------

    """


    for mat_name in matrix_names:
        if not mat_name.endswith(".omx"):
            mat_name = mat_name + ".omx"

        input_file_dir = os.path.join(input_dir, mat_name)
        output_file_dir = os.path.join(output_dir, mat_name)

        shutil.copy(input_file_dir, output_file_dir)

translate_emmebank_demand(input_databank, output_databank, cores_to_aggregate, agg_zone_mapping)

Aggregates the demand matrix cores from one Emme databank and loads them into another databank.

Parameters

input_databank : Emme databank
    The source Emme databank.
output_databank : Emme databank
    The destination Emme databank.
cores_to_aggregate : list
    Matrix core names to aggregate.
agg_zone_mapping : Path-like or pandas.DataFrame
    Zone number mapping between original and aggregated zones; columns are the original zones as 'taz' and the aggregated zones as 'cluster_id'.

Returns

None. Loads the trip matrices into emmebank.
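
A usage sketch, assuming the INRO Emme Python API is available to open the databanks; the matrix core names are hypothetical:

import inro.emme.database.emmebank as _eb  # Emme API, availability assumed
from rsm.translate import translate_emmebank_demand

input_bank = _eb.Emmebank("path/to/full_model/emmebank")  # placeholder path
output_bank = _eb.Emmebank("path/to/rsm/emmebank")        # placeholder path

translate_emmebank_demand(
    input_bank,
    output_bank,
    cores_to_aggregate=["SOV_AM", "HOV_AM"],  # hypothetical core names
    agg_zone_mapping="taz_crosswalk.csv",     # columns: 'taz', 'cluster_id'
)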

Source code in rsm/translate.py
def translate_emmebank_demand(
    input_databank,
    output_databank,
    cores_to_aggregate,
    agg_zone_mapping,
): 
    """
    Aggregates the demand matrix cores from one Emme databank and loads them into another databank.

    Parameters
    ----------
    input_databank : Emme databank
        The source Emme databank
    output_databank : Emme databank
        The destination Emme databank
    cores_to_aggregate : list
        matrix core names to aggregate
    agg_zone_mapping : Path-like or pandas.DataFrame
        zone number mapping between original and aggregated zones.
        columns: original zones as 'taz' and aggregated zones as 'cluster_id'

    Returns
    -------
    None. Loads the trip matrices into emmebank.

    """

    agg_zone_mapping_df = pd.read_csv(agg_zone_mapping)
    agg_zone_mapping_df = agg_zone_mapping_df.sort_values('taz')

    agg_zone_mapping_df.columns= agg_zone_mapping_df.columns.str.strip().str.lower()
    zone_mapping = dict(zip(agg_zone_mapping_df['taz'], agg_zone_mapping_df['cluster_id']))

    for core in cores_to_aggregate: 
        matrix = input_databank.matrix(core).get_data()
        matrix_array = matrix.to_numpy()

        matrix_agg = _aggregate_matrix(matrix_array, zone_mapping)

        output_matrix = output_databank.matrix(core)
        output_matrix.set_numpy_data(matrix_agg)

translate_omx_demand(matrix_names, agg_zone_mapping, input_dir='.', output_dir='.')

Aggregates the OMX demand matrices to the aggregated zone system.

Parameters

matrix_names : list
    OMX matrix filenames to aggregate.
agg_zone_mapping : path_like or pandas.DataFrame
    Zone number mapping between original and aggregated zones; columns are the original zones as 'taz' and the aggregated zones as 'cluster_id'.
input_dir : path_like
    Default "."
output_dir : path_like
    Default "."
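
A usage sketch (matrix names and paths are placeholders; the crosswalk CSV must contain 'taz' and 'cluster_id' columns):

from rsm.translate import translate_omx_demand

translate_omx_demand(
    matrix_names=["autoTrips_AM", "autoTrips_PM"],  # hypothetical OMX files
    agg_zone_mapping="taz_crosswalk.csv",
    input_dir="path/to/full_model/output",          # placeholder path
    output_dir="path/to/rsm/input",                 # placeholder path
)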

Returns

Source code in rsm/translate.py
def translate_omx_demand(
    matrix_names,
    agg_zone_mapping,
    input_dir=".",
    output_dir="."
): 
    """
    Aggregates the OMX demand matrices to the aggregated zone system.

    Parameters
    ----------
    matrix_names : list
        omx matrix filenames to aggregate
    agg_zone_mapping : path_like or pandas.DataFrame
        zone number mapping between original and aggregated zones.
        columns: original zones as 'taz' and aggregated zones as 'cluster_id'
    input_dir : path_like
        default "."
    output_dir : path_like
        default "."

    Returns
    -------

    """

    agg_zone_mapping_df = pd.read_csv(agg_zone_mapping)
    agg_zone_mapping_df = agg_zone_mapping_df.sort_values('taz')

    agg_zone_mapping_df.columns= agg_zone_mapping_df.columns.str.strip().str.lower()
    zone_mapping = dict(zip(agg_zone_mapping_df['taz'], agg_zone_mapping_df['cluster_id']))
    agg_zones = sorted(agg_zone_mapping_df['cluster_id'].unique())

    for mat_name in matrix_names:
        if not mat_name.endswith(".omx"):
            mat_name = mat_name + ".omx"

        #logger.info("Aggregating Matrix: " + mat_name + " ...")

        input_skim_file = os.path.join(input_dir, mat_name)
        print(input_skim_file)
        output_skim_file = os.path.join(output_dir, mat_name)

        assert os.path.isfile(input_skim_file)

        input_matrix = omx.open_file(input_skim_file, mode="r") 
        input_mapping_name = input_matrix.list_mappings()[0]
        input_cores = input_matrix.list_matrices()

        output_matrix = omx.open_file(output_skim_file, mode="w")

        for core in input_cores:
            matrix = input_matrix[core]
            matrix_array = matrix.read()
            matrix_agg = _aggregate_matrix(matrix_array, zone_mapping)
            output_matrix[core] = matrix_agg

        output_matrix.create_mapping(title=input_mapping_name, entries=agg_zones)

        input_matrix.close()
        output_matrix.close()

rsm_household_sampler(input_dir='.', output_dir='.', prev_iter_access=None, curr_iter_access=None, study_area=None, input_household='households.csv', input_person='persons.csv', taz_crosswalk='taz_crosswalk.csv', mgra_crosswalk='mgra_crosswalk.csv', compare_access_columns=('NONMAN_AUTO', 'NONMAN_TRANSIT', 'NONMAN_NONMOTOR', 'NONMAN_SOV_0'), default_sampling_rate=0.25, lower_bound_sampling_rate=0.15, upper_bound_sampling_rate=1.0, random_seed=42, output_household='sampled_households.csv', output_person='sampled_person.csv')

Take an intelligent sampling of households.

Parameters

input_dir : path_like
    Default "."
output_dir : path_like
    Default "."
prev_iter_access : Path-like or pandas.DataFrame
    Accessibility in an old (default, no treatment, etc.) run is given (preloaded) or read in from here. Give as a relative path (from input_dir) or an absolute path.
curr_iter_access : Path-like or pandas.DataFrame
    Accessibility in the latest run is given (preloaded) or read in from here. Give as a relative path (from input_dir) or an absolute path.
study_area : array-like
    Array of RSM zones (these are numbered 1 to N in the RSM) in the study area. These zones are sampled at 100% if differential sampling is also turned on.
input_household : Path-like or pandas.DataFrame
    Complete synthetic household file. This data will be filtered to match the sampling of households and written out to a new CSV file.
input_person : Path-like or pandas.DataFrame
    Complete synthetic persons file. This data will be filtered to match the sampling of households and written out to a new CSV file.
taz_crosswalk : Path-like or pandas.DataFrame
    Crosswalk mapping original TAZ ids ('taz') to aggregated zone ids ('cluster_id'), default taz_crosswalk.csv.
mgra_crosswalk : Path-like or pandas.DataFrame
    Crosswalk mapping original MGRA ids ('mgra') to aggregated zone ids ('cluster_id'), default mgra_crosswalk.csv.
compare_access_columns : Collection[str]
    Column names in the accessibility file to use for comparing accessibility. Only changes in the values in these columns will be evaluated.
default_sampling_rate : float
    The default sampling rate, in the range (0, 1].
lower_bound_sampling_rate : float
    Sampling rates by zone will be truncated so they are never lower than this.
upper_bound_sampling_rate : float
    Sampling rates by zone will be truncated so they are never higher than this.

Returns

sample_households_df, sample_persons_df : pandas.DataFrame
    These are the sampled population to resimulate. They are also written to the output_dir.
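
A usage sketch (the accessibility file names are placeholders). When both accessibility inputs are supplied, per-zone sampling rates scale with the change in accessibility; when either is missing, every zone is sampled at default_sampling_rate:

from rsm.sampler import rsm_household_sampler

hh_df, persons_df = rsm_household_sampler(
    input_dir="path/to/rsm/input",       # placeholder path
    output_dir="path/to/rsm/input",      # placeholder path
    prev_iter_access="access_prev.csv",  # hypothetical file name
    curr_iter_access="access_curr.csv",  # hypothetical file name
    default_sampling_rate=0.25,
)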

Source code in rsm/sampler.py
def rsm_household_sampler(
    input_dir=".",
    output_dir=".",
    prev_iter_access=None,
    curr_iter_access=None,
    study_area=None,
    input_household="households.csv",
    input_person="persons.csv",
    taz_crosswalk="taz_crosswalk.csv",
    mgra_crosswalk="mgra_crosswalk.csv",
    compare_access_columns=(
        "NONMAN_AUTO",
        "NONMAN_TRANSIT",
        "NONMAN_NONMOTOR",
        "NONMAN_SOV_0",
    ),
    default_sampling_rate=0.25,  # fix the values of this after some testing
    lower_bound_sampling_rate=0.15,  # fix the values of this after some testing
    upper_bound_sampling_rate=1.0,  # fix the values of this after some testing
    random_seed=42,
    output_household="sampled_households.csv",
    output_person="sampled_person.csv",
):
    """
    Take an intelligent sampling of households.

    Parameters
    ----------
    input_dir : path_like
        default "."
    output_dir : path_like
        default "."
    prev_iter_access : Path-like or pandas.DataFrame
        Accessibility in an old (default, no treatment, etc) run is given (preloaded)
        or read in from here. Give as a relative path (from `input_dir`) or an
        absolute path.
    curr_iter_access : Path-like or pandas.DataFrame
        Accessibility in the latest run is given (preloaded) or read in from here.
        Give as a relative path (from `input_dir`) or an absolute path.
    study_area : array-like
        Array of RSM zones (these are numbered 1 to N in the RSM) in the study area.
        These zones are sampled at 100% if differential sampling is also turned on.
    input_household : Path-like or pandas.DataFrame
        Complete synthetic household file.  This data will be filtered to match the
        sampling of households and written out to a new CSV file.
    input_person : Path-like or pandas.DataFrame
        Complete synthetic persons file.  This data will be filtered to match the
        sampling of households and written out to a new CSV file.
    compare_access_columns : Collection[str]
        Column names in the accessibility file to use for comparing accessibility.
        Only changes in the values in these columns will be evaluated.
    default_sampling_rate : float
        The default sampling rate, in the range (0,1]
    lower_bound_sampling_rate : float
        Sampling rates by zone will be truncated so they are never lower than this.
    upper_bound_sampling_rate : float
        Sampling rates by zone will be truncated so they are never higher than this.

    Returns
    -------
    sample_households_df, sample_persons_df : pandas.DataFrame
        These are the sampled population to resimulate.  They are also written to
        the output_dir
    """

    input_dir = Path(input_dir or ".")
    output_dir = Path(output_dir or ".")

    logger.debug("CALL rsm_household_sampler")
    logger.debug(f"  {input_dir=}")
    logger.debug(f"  {output_dir=}")

    def _resolve_df(x, directory, make_index=None):
        if isinstance(x, (str, Path)):
            # read in the file to a pandas DataFrame
            x = Path(x).expanduser()
            if not x.is_absolute():
                x = Path(directory or ".").expanduser().joinpath(x)
            try:
                result = pd.read_csv(x)
            except FileNotFoundError:
                raise
        elif isinstance(x, pd.DataFrame):
            result = x
        elif x is None:
            result = None
        else:
            raise TypeError("must be path-like or DataFrame")
        if (
            result is not None
            and make_index is not None
            and make_index in result.columns
        ):
            result = result.set_index(make_index)
        return result

    def _resolve_out_filename(x):
        x = Path(x).expanduser()
        if not x.is_absolute():
            x = Path(output_dir).expanduser().joinpath(x)
        x.parent.mkdir(parents=True, exist_ok=True)
        return x

    prev_iter_access_df = _resolve_df(
        prev_iter_access, input_dir, make_index="MGRA"
    )
    curr_iter_access_df = _resolve_df(
        curr_iter_access, input_dir, make_index="MGRA"
    )
    rsm_zones = _resolve_df(taz_crosswalk, input_dir)
    dict_clusters = dict(zip(rsm_zones["taz"], rsm_zones["cluster_id"]))

    rsm_mgra_zones = _resolve_df(mgra_crosswalk, input_dir)
    rsm_mgra_zones.columns = rsm_mgra_zones.columns.str.strip().str.lower()
    dict_clusters_mgra = dict(zip(rsm_mgra_zones["mgra"], rsm_mgra_zones["cluster_id"]))

    # changing the taz and mgra to new cluster ids
    input_household_df = _resolve_df(input_household, input_dir)
    input_household_df["taz"] = input_household_df["taz"].map(dict_clusters)
    input_household_df["mgra"] = input_household_df["mgra"].map(dict_clusters_mgra)
    input_household_df["count"] = 1

    mgra_hh = input_household_df.groupby(["mgra"]).size().rename("n_hh").to_frame()

    if curr_iter_access_df is None or prev_iter_access_df is None:

        if curr_iter_access_df is None:
            logger.warning(f"missing curr_iter_access_df from {curr_iter_access}")
        if prev_iter_access_df is None:
            logger.warning(f"missing prev_iter_access_df from {prev_iter_access}")
        # true when sampler is turned off. default_sampling_rate should be set to 1

        mgra_hh["sampling_rate"] = default_sampling_rate
        if study_area is not None:
            mgra_hh.loc[mgra_hh.index.isin(study_area), "sampling_rate"] = 1

        sample_households = []

        for mgra_id, row in mgra_hh.iterrows():
            df = input_household_df.loc[input_household_df["mgra"] == mgra_id]
            sampling_rate = row["sampling_rate"]
            logger.info(f"Sampling rate of RSM zone {mgra_id}: {sampling_rate}")
            df = df.sample(frac=sampling_rate, random_state=mgra_id + random_seed)
            sample_households.append(df)

        # combine study area and non-study area households into single dataframe
        sample_households_df = pd.concat(sample_households)

    else:
        # restrict to rows only where TAZs have households
        prev_iter_access_df = prev_iter_access_df[
            prev_iter_access_df.index.isin(mgra_hh.index)
        ].copy()
        curr_iter_access_df = curr_iter_access_df[
            curr_iter_access_df.index.isin(mgra_hh.index)
        ].copy()

        # compare accessibility columns
        compare_results = pd.DataFrame()

        for column in compare_access_columns:
            compare_results[column] = (
                curr_iter_access_df[column] - prev_iter_access_df[column]
            ).abs()  # take absolute difference
        compare_results["MGRA"] = prev_iter_access_df.index

        compare_results = compare_results.set_index("MGRA")

        # Take row sums of all difference
        compare_results["Total"] = compare_results[list(compare_access_columns)].sum(
            axis=1
        )

        # TODO: potentially adjust this later after we figure out a better approach
        wgts = compare_results["Total"] + 0.01
        wgts /= wgts.mean() / default_sampling_rate
        compare_results["sampling_rate"] = np.clip(
            wgts, lower_bound_sampling_rate, upper_bound_sampling_rate
        )

        sample_households = []
        sample_rate_df = compare_results[["sampling_rate"]].copy()
        if study_area is not None:
            sample_rate_df.loc[
                sample_rate_df.index.isin(study_area), "sampling_rate"
            ] = 1

        for mgra_id, row in sample_rate_df.iterrows():
            df = input_household_df.loc[input_household_df["mgra"] == mgra_id]
            sampling_rate = row["sampling_rate"]
            logger.info(f"Sampling rate of RSM zone {mgra_id}: {sampling_rate}")
            df = df.sample(frac=sampling_rate, random_state=mgra_id + random_seed)
            sample_households.append(df)

        # combine study area and non-study area households into single dataframe
        sample_households_df = pd.concat(sample_households)

    sample_households_df = sample_households_df.sort_values(by=["hhid"])
    sample_households_df.to_csv(_resolve_out_filename(output_household), index=False)

    # select persons belonging to sampled households
    sample_hhids = sample_households_df["hhid"].to_numpy()

    persons_df = _resolve_df(input_person, input_dir)
    sample_persons_df = persons_df.loc[persons_df["hhid"].isin(sample_hhids)]
    sample_persons_df.to_csv(_resolve_out_filename(output_person), index=False)

    global_sample_rate = round(len(sample_households_df) / len(input_household_df),2)
    logger.info(f"Total Sampling Rate : {global_sample_rate}")

    return sample_households_df, sample_persons_df

rsm_assemble(orig_indiv, orig_joint, rsm_indiv, rsm_joint, households, mgra_crosswalk=None, taz_crosswalk=None, sample_rate=0.25, study_area_taz=None, run_assembler=1)

Assemble and evaluate RSM trip making.

Parameters

orig_indiv : path_like
    Trips table from the "original" model run; should be a comprehensive simulation of all individual trips for all synthetic households.
orig_joint : path_like
    Joint trips table from the "original" model run; should be a comprehensive simulation of all joint trips for all synthetic households.
rsm_indiv : path_like
    Trips table from the RSM model run; should be a simulation of all individual trips for potentially only a subset of all synthetic households.
rsm_joint : path_like
    Trips table from the RSM model run; should be a simulation of all joint trips for potentially only a subset of all synthetic households (the same sampled households as in rsm_indiv).
households : path_like
    Synthetic household file, used to get home zones for households.
mgra_crosswalk : path_like, optional
    Crosswalk from the original MGRA to clustered zone ids. Provide this crosswalk if the orig_indiv and orig_joint files reference the original MGRA system and those ids need to be converted to aggregated values before merging.
sample_rate : float
    Default/fixed sample rate if the sampler was turned off; this is used to scale the trips if run_assembler is 0.
run_assembler : boolean
    Flag to indicate whether to run the RSM assembler. 1 runs the assembler, 0 turns it off; setting this to 0 is only an option if the sampler is turned off.
study_area_taz : list, optional
    List of study area RSM zones.

Returns

final_trips_rsm : final_ind_trips (pd.DataFrame)
    Assembled trip table for the RSM run, filling in archived trip values for non-resimulated households.
combined_trips_by_zone : final_jnt_trips (pd.DataFrame)
    Summary table of changes in trips by mode, by household home zone. Used to check whether undersampled zones have stable travel behavior.

Separate tables for individual and joint trips, as required by Java.
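
A usage sketch with placeholder paths, assuming the documented two-table return; sample_rate only matters when run_assembler is 0 (trips from the partial run are scaled by 1/sample_rate):

from rsm.assembler import rsm_assemble

final_ind_trips, final_jnt_trips = rsm_assemble(
    orig_indiv="path/to/full_model/output/indivTrips.csv",  # hypothetical name
    orig_joint="path/to/full_model/output/jointTrips.csv",  # hypothetical name
    rsm_indiv="path/to/rsm/output/indivTrips.csv",
    rsm_joint="path/to/rsm/output/jointTrips.csv",
    households="path/to/rsm/input/households.csv",
    mgra_crosswalk="path/to/rsm/input/mgra_crosswalk.csv",
    run_assembler=1,
)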

Source code in rsm/assembler.py
def rsm_assemble(
    orig_indiv,
    orig_joint,
    rsm_indiv,
    rsm_joint,
    households,
    mgra_crosswalk=None,
    taz_crosswalk=None,
    sample_rate=0.25,
    study_area_taz=None,
    run_assembler=1,
):
    """
    Assemble and evaluate RSM trip making.

    Parameters
    ----------
    orig_indiv : path_like
        Trips table from "original" model run, should be comprehensive simulation
        of all individual trips for all synthetic households.
    orig_joint : path_like
        Joint trips table from "original" model run, should be comprehensive simulation
        of all joint trips for all synthetic households.
    rsm_indiv : path_like
        Trips table from RSM model run, should be a simulation of all individual
        trips for potentially only a subset of all synthetic households.
    rsm_joint : path_like
        Trips table from RSM model run, should be a simulation of all joint
        trips for potentially only a subset of all synthetic households (the
        same sampled households as in `rsm_indiv`).
    households : path_like
        Synthetic household file, used to get home zones for households.
    mgra_crosswalk : path_like, optional
        Crosswalk from original MGRA to clustered zone ids.  Provide this crosswalk
        if the `orig_indiv` and `orig_joint` files reference the original MGRA system
        and those id's need to be converted to aggregated values before merging.
    sample_rate : float
        Default/fixed sample rate if the sampler was turned off;
        this is used to scale the trips if `run_assembler` is 0.
    run_assembler : boolean
        Flag to indicate whether to run the RSM assembler or not.
        1 is to run the assembler, 0 is to turn it off;
        setting this to 0 is only an option if the sampler is turned off.
    study_area_taz : list, optional
        List of study area RSM zones.

    Returns
    -------
    final_trips_rsm : final_ind_trips (pd.DataFrame)
        Assembled trip table for RSM run, filling in archived trip values for
        non-resimulated households.
    combined_trips_by_zone : final_jnt_trips (pd.DataFrame)
        Summary table of changes in trips by mode, by household home zone.
        Used to check whether undersampled zones have stable travel behavior.

    Separate tables for individual and joint trips, as required by java.


    """
    orig_indiv = Path(orig_indiv).expanduser()
    orig_joint = Path(orig_joint).expanduser()
    rsm_indiv = Path(rsm_indiv).expanduser()
    rsm_joint = Path(rsm_joint).expanduser()
    households = Path(households).expanduser()

    assert os.path.isfile(orig_indiv)
    assert os.path.isfile(orig_joint)
    assert os.path.isfile(rsm_indiv)
    assert os.path.isfile(rsm_joint)
    assert os.path.isfile(households)

    if mgra_crosswalk is not None:
        mgra_crosswalk = Path(mgra_crosswalk).expanduser()
        assert os.path.isfile(mgra_crosswalk)

    if taz_crosswalk is not None:
        taz_crosswalk = Path(taz_crosswalk).expanduser()
        assert os.path.isfile(taz_crosswalk)

    # load trip data - partial simulation of RSM model
    logger.info("reading ind_trips_rsm")
    ind_trips_rsm = pd.read_csv(rsm_indiv)
    logger.info("reading jnt_trips_rsm")
    jnt_trips_rsm = pd.read_csv(rsm_joint)

    if run_assembler == 1:
        # load trip data - full simulation of residual/source model
        logger.info("reading ind_trips_full")
        ind_trips_full = pd.read_csv(orig_indiv)
        logger.info("reading jnt_trips_full")
        jnt_trips_full = pd.read_csv(orig_joint)

        if mgra_crosswalk is not None:
            logger.info("applying mgra_crosswalk to original data")
            mgra_crosswalk = pd.read_csv(mgra_crosswalk).set_index("MGRA")["cluster_id"]
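        # keep sentinel zone values (-1 and 0) intact through the mapping
        # (presumably placeholders for "no zone" in the trip tables)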
            mgra_crosswalk[-1] = -1
            mgra_crosswalk[0] = 0
            for col in [c for c in ind_trips_full.columns if c.lower().endswith("_mgra")]:
                ind_trips_full[col] = ind_trips_full[col].map(mgra_crosswalk)
            for col in [c for c in jnt_trips_full.columns if c.lower().endswith("_mgra")]:
                jnt_trips_full[col] = jnt_trips_full[col].map(mgra_crosswalk)

        # convert to rsm trips
        logger.info("convert to common table platform")
        rsm_trips = _merge_joint_and_indiv_trips(ind_trips_rsm, jnt_trips_rsm)
        original_trips = _merge_joint_and_indiv_trips(ind_trips_full, jnt_trips_full)

        logger.info("get all hhids in trips produced by RSM")
        hh_ids_rsm = rsm_trips["hh_id"].unique()

        logger.info("remove orginal model trips made by households chosen in RSM trips")
        original_trips_not_resimulated = original_trips.loc[
            ~original_trips["hh_id"].isin(hh_ids_rsm)
        ]
        original_ind_trips_not_resimulated = ind_trips_full[
            ~ind_trips_full["hh_id"].isin(hh_ids_rsm)
        ]
        original_jnt_trips_not_resimulated = jnt_trips_full[
            ~jnt_trips_full["hh_id"].isin(hh_ids_rsm)
        ]

        logger.info("concatenate trips from rsm and original model")
        # ignore_index already yields a fresh RangeIndex, so no reset is needed
        final_trips_rsm = pd.concat(
            [rsm_trips, original_trips_not_resimulated], ignore_index=True
        )
        final_ind_trips = pd.concat(
            [ind_trips_rsm, original_ind_trips_not_resimulated], ignore_index=True
        )
        final_jnt_trips = pd.concat(
            [jnt_trips_rsm, original_jnt_trips_not_resimulated], ignore_index=True
        )

        # Get percentage change in total trips by mode for each home zone

        # extract trips made by households in RSM and Original model
        original_trips_that_were_resimulated = original_trips.loc[
            original_trips["hh_id"].isin(hh_ids_rsm)
        ]

        def _agg_by_hhid_and_tripmode(df, name):
            return df.groupby(["hh_id", "trip_mode"]).size().rename(name).reset_index()

        # combining trips by hhid and trip mode
        combined_trips = pd.merge(
            _agg_by_hhid_and_tripmode(original_trips_that_were_resimulated, "n_trips_orig"),
            _agg_by_hhid_and_tripmode(rsm_trips, "n_trips_rsm"),
            on=["hh_id", "trip_mode"],
            how="outer",
            sort=True,
        ).fillna(0)

        # aggregating by Home zone
        hh_rsm = pd.read_csv(households)
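        # detect the household id and home zone columns from common naming
        # variants; the for/else pattern raises the KeyError only if the loop
        # finishes without a `break`, i.e. no candidate column matched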
        hh_id_col_names = ["hhid", "hh_id", "household_id"]
        for hhid in hh_id_col_names:
            if hhid in hh_rsm.columns:
                break
        else:
            raise KeyError(f"none of {hh_id_col_names!r} in household file")
        homezone_col_names = ["mgra", "home_mgra"]
        for zoneid in homezone_col_names:
            if zoneid in hh_rsm.columns:
                break
        else:
            raise KeyError(f"none of {homezone_col_names!r} in household file")
        hh_rsm = hh_rsm[[hhid, zoneid]]

        # attach home zone id
        combined_trips = pd.merge(
            combined_trips, hh_rsm, left_on="hh_id", right_on=hhid, how="left"
        )

        combined_trips_by_zone = (
            combined_trips.groupby([zoneid, "trip_mode"])[["n_trips_orig", "n_trips_rsm"]]
            .sum()
            .reset_index()
        )

        combined_trips_by_zone = combined_trips_by_zone.eval(
            "net_change = (n_trips_rsm - n_trips_orig)"
        )

        combined_trips_by_zone["max_trips"] = np.fmax(
            combined_trips_by_zone.n_trips_rsm, combined_trips_by_zone.n_trips_orig
        )
        combined_trips_by_zone = combined_trips_by_zone.eval(
            "pct_change = net_change / max_trips * 100"
        )
        combined_trips_by_zone = combined_trips_by_zone.drop(columns="max_trips")
    else:
        # assembler is turned off: scale up the trips in the trip lists by the
        # fixed sample rate so that the final trip lists represent 100% of trips
        scale_factor = int(1.0 / sample_rate)

        # pass the study area zones through only when provided
        sa_rsm = study_area_taz if study_area_taz else None

        # replicating rows with pd.concat is slow
        # (https://stackoverflow.com/questions/50788508/how-can-i-replicate-rows-of-a-pandas-dataframe),
        # so scaleup_to_rsm_samplingrate is used instead of:
        # final_ind_trips = pd.concat([ind_trips_rsm] * scale_factor, ignore_index=True)
        # final_jnt_trips = pd.concat([jnt_trips_rsm] * scale_factor, ignore_index=True)

        final_ind_trips = scaleup_to_rsm_samplingrate(
            ind_trips_rsm, households, taz_crosswalk, scale_factor, study_area_tazs=sa_rsm
        )
        final_jnt_trips = scaleup_to_rsm_samplingrate(
            jnt_trips_rsm, households, taz_crosswalk, scale_factor, study_area_tazs=sa_rsm
        )

    return final_ind_trips, final_jnt_trips
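As a self-contained illustration of the per-zone stability diagnostic built internally (combined_trips_by_zone), the following sketch reproduces the net_change and pct_change arithmetic from the function body on invented toy numbers.

import numpy as np
import pandas as pd

# toy per-zone, per-mode trip counts (values invented for illustration)
by_zone = pd.DataFrame({
    "mgra": [101, 101, 102],
    "trip_mode": [1, 2, 1],
    "n_trips_orig": [40.0, 10.0, 25.0],
    "n_trips_rsm": [44.0, 9.0, 25.0],
})
# same arithmetic as in rsm_assemble above
by_zone["net_change"] = by_zone["n_trips_rsm"] - by_zone["n_trips_orig"]
max_trips = np.fmax(by_zone["n_trips_rsm"], by_zone["n_trips_orig"])
by_zone["pct_change"] = by_zone["net_change"] / max_trips * 100
print(by_zone)  # a large |pct_change| flags zones with unstable travel behavior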