
Application Programming Interface

aggregate_zones(mgra_gdf, method='kmeans', n_zones=2000, random_state=0, cluster_factors=None, cluster_factors_onehot=None, use_xy=True, explicit_agg=(), explicit_col='mgra', agg_instruction=None, start_cluster_ids=13)

Aggregate zones.

Parameters

mgra_gdf : GeoDataFrame
    Geometry and attributes of the MGRAs.
method : str
    One of {'kmeans', 'agglom', 'agglom_adj'}, default 'kmeans'.
n_zones : int
    Target number of aggregated zones.
random_state : RandomState or int
    Seed or random state for reproducible clustering.
cluster_factors : dict
    Mapping of numeric attribute column names to weights used in clustering.
cluster_factors_onehot : dict
    Mapping of categorical attribute column names to weights; each is one-hot encoded before clustering.
use_xy : bool or float
    Use X and Y coordinates as a cluster factor; use a float to scale the x-y coordinates from the CRS if needed.
explicit_agg : list[int or list]
    A list containing integers (individual MGRAs that should not be aggregated) or lists of integers (groups of MGRAs that should be aggregated exactly as given, with no less and no more).
explicit_col : str
    The name of the column containing the IDs from explicit_agg, usually 'taz'.
agg_instruction : dict
    Dictionary passed to pandas agg that says how to aggregate data columns.
start_cluster_ids : int, default 13
    Cluster IDs start at this value. Can be 1, but SANDAG typically reserves the smallest IDs for external zones, so starting at a greater value is typical.

Returns

GeoDataFrame
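
A minimal usage sketch (the shapefile name and the "pop"/"emp" attribute columns are assumptions for illustration, not part of the documented API):

import geopandas as gpd
from rsm.zone_agg import aggregate_zones

mgra_gdf = gpd.read_file("mgra.shp")  # hypothetical MGRA geometry + attributes

agg_gdf = aggregate_zones(
    mgra_gdf,
    method="kmeans",
    n_zones=2000,
    cluster_factors={"pop": 1.0, "emp": 1.0},      # weighted numeric attributes
    use_xy=0.01,                                   # scale CRS x-y coordinates
    explicit_agg=[101, [102, 103, 104]],           # keep 101 alone; merge 102-104 exactly
    explicit_col="taz",                            # assumes a 'taz' id column exists
    agg_instruction={"pop": "sum", "emp": "sum"},  # passed to pandas agg
    start_cluster_ids=13,
)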

Source code in rsm/zone_agg.py
def aggregate_zones(
    mgra_gdf,
    method="kmeans",
    n_zones=2000,
    random_state=0,
    cluster_factors=None,
    cluster_factors_onehot=None,
    use_xy=True,
    explicit_agg=(),
    explicit_col="mgra",
    agg_instruction=None,
    start_cluster_ids=13,
):
    """
    Aggregate zones.

    Parameters
    ----------
    mgra_gdf : GeoDataFrame
        Geometry and attributes of MGRAs
    method : str
        One of {'kmeans', 'agglom', 'agglom_adj'}, default 'kmeans'
    n_zones : int
    random_state : RandomState or int
    cluster_factors : dict
    cluster_factors_onehot : dict
    use_xy : bool or float
        Use X and Y coordinates as a cluster factor; use a float to scale the
        x-y coordinates from the CRS if needed.
    explicit_agg : list[int or list]
        A list containing integers (individual MGRAs that should not be aggregated)
        or lists of integers (groups of MGRAs that should be aggregated exactly as
        given, with no less and no more)
    explicit_col : str
        The name of the column containing the ID's from `explicit_agg`, usually 'taz'
    agg_instruction : dict
        Dictionary passed to pandas `agg` that says how to aggregate data columns.
    start_cluster_ids : int, default 13
        Cluster id's start at this value.  Can be 1, but typically SANDAG has the
        smallest id's reserved for external zones, so starting at a greater value
        is typical.

    Returns
    -------
    GeoDataFrame
    """

    if cluster_factors is None:
        cluster_factors = {}

    n = start_cluster_ids
    if explicit_agg:
        explicit_agg_ids = {}
        for i in explicit_agg:
            if isinstance(i, Number):
                explicit_agg_ids[i] = n
            else:
                for j in i:
                    explicit_agg_ids[j] = n
            n += 1
        if explicit_col == mgra_gdf.index.name:
            mgra_gdf = mgra_gdf.reset_index()
            mgra_gdf.index = mgra_gdf[explicit_col]
        in_explicit = mgra_gdf[explicit_col].isin(explicit_agg_ids)
        mgra_gdf_algo = mgra_gdf.loc[~in_explicit].copy()
        mgra_gdf_explicit = mgra_gdf.loc[in_explicit].copy()
        mgra_gdf_explicit["cluster_id"] = mgra_gdf_explicit[explicit_col].map(
            explicit_agg_ids
        )
        n_zones_algorithm = n_zones - len(
            mgra_gdf_explicit["cluster_id"].value_counts()
        )
    else:
        mgra_gdf_algo = mgra_gdf.copy()
        mgra_gdf_explicit = None
        n_zones_algorithm = n_zones

    if use_xy:
        geometry = mgra_gdf_algo.centroid
        X = list(geometry.apply(lambda p: p.x))
        Y = list(geometry.apply(lambda p: p.y))
        factors = [np.asarray(X) * use_xy, np.asarray(Y) * use_xy]
    else:
        factors = []
    for cf, cf_wgt in cluster_factors.items():
        factors.append(cf_wgt * mgra_gdf_algo[cf].values.astype(np.float32))
    if cluster_factors_onehot:
        for cf, cf_wgt in cluster_factors_onehot.items():
            factors.append(cf_wgt * OneHotEncoder().fit_transform(mgra_gdf_algo[[cf]]))
        from scipy.sparse import hstack

        factors2d = []
        for j in factors:
            if j.ndim < 2:
                factors2d.append(np.expand_dims(j, -1))
            else:
                factors2d.append(j)
        data = hstack(factors2d).toarray()
    else:
        data = np.array(factors).T

    if method == "kmeans":
        kmeans = KMeans(n_clusters=n_zones_algorithm, random_state=random_state)
        kmeans.fit(data)
        cluster_id = kmeans.labels_
    elif method == "agglom":
        agglom = AgglomerativeClustering(
            n_clusters=n_zones_algorithm, affinity="euclidean", linkage="ward"
        )
        agglom.fit_predict(data)
        cluster_id = agglom.labels_
    elif method == "agglom_adj":
        from libpysal.weights import Rook

        w_rook = Rook.from_dataframe(mgra_gdf_algo)
        adj_mat = nx.adjacency_matrix(w_rook.to_networkx())
        agglom = AgglomerativeClustering(
            n_clusters=n_zones_algorithm,
            affinity="euclidean",
            linkage="ward",
            connectivity=adj_mat,
        )
        agglom.fit_predict(data)
        cluster_id = agglom.labels_
    else:
        raise NotImplementedError(method)
    mgra_gdf_algo["cluster_id"] = cluster_id

    if mgra_gdf_explicit is None or len(mgra_gdf_explicit) == 0:
        combined = merge_zone_data(
            mgra_gdf_algo,
            agg_instruction,
            cluster_id="cluster_id",
        )
        combined["cluster_id"] = list(range(n, n + n_zones_algorithm))
    else:
        pending = []
        for df in [mgra_gdf_algo, mgra_gdf_explicit]:
            logger.info(f"... merging {len(df)}")
            pending.append(
                merge_zone_data(
                    df,
                    agg_instruction,
                    cluster_id="cluster_id",
                ).reset_index()
            )

        pending[0]["cluster_id"] = list(range(n, n + n_zones_algorithm))

        pending[0] = pending[0][
            [c for c in pending[1].columns if c in pending[0].columns]
        ]
        pending[1] = pending[1][
            [c for c in pending[0].columns if c in pending[1].columns]
        ]
        combined = pd.concat(pending, ignore_index=False)
    combined = combined.reset_index(drop=True)

    return combined

agg_input_files(model_dir='.', rsm_dir='.', taz_cwk_file='taz_crosswalk.csv', mgra_cwk_file='mgra_crosswalk.csv', agg_zones=2000, ext_zones=12, input_files=['microMgraEquivMinutes.csv', 'microMgraTapEquivMinutes.csv', 'walkMgraTapEquivMinutes.csv', 'walkMgraEquivMinutes.csv', 'bikeTazLogsum.csv', 'bikeMgraLogsum.csv', 'zone.term', 'zones.park', 'tap.ptype', 'accessam.csv', 'ParkLocationAlts.csv', 'CrossBorderDestinationChoiceSoaAlternatives.csv', 'TourDcSoaDistanceAlts.csv', 'DestinationChoiceAlternatives.csv', 'SoaTazDistAlts.csv', 'TripMatrices.csv', 'transponderModelAccessibilities.csv', 'crossBorderTours.csv', 'internalExternalTrips.csv', 'visitorTours.csv', 'visitorTrips.csv', 'householdAVTrips.csv', 'crossBorderTrips.csv', 'TNCTrips.csv', 'airport_out.SAN.csv', 'airport_out.CBX.csv', 'TNCtrips.csv'])

Parameters

model_dir : path_like
    Path to the full model run, default "."
rsm_dir : path_like
    Path to the RSM run, default "."
taz_cwk_file : csv file
    Default taz_crosswalk.csv; the TAZ to aggregated zones file. Should be located in the RSM input folder.
mgra_cwk_file : csv file
    Default mgra_crosswalk.csv; the MGRA to aggregated zones file. Should be located in the RSM input folder.
agg_zones : int
    Number of aggregated internal zones, default 2000.
ext_zones : int
    Number of external zones, default 12.
input_files : list
    List of input files to be aggregated. Should include the following files: "microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv", "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv", "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv", "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv", "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv", "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv", "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv", "TNCtrips.csv"

Returns

Aggregated files in the RSM input/output/uec directory
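
A minimal invocation sketch (all paths are placeholders; the crosswalk CSVs are expected in the RSM input folder, and aggregated files are written back under rsm_dir):

from rsm.input_agg import agg_input_files

agg_input_files(
    model_dir="path/to/full_model_run",   # placeholder path
    rsm_dir="path/to/rsm_run",            # placeholder path
    taz_cwk_file="taz_crosswalk.csv",
    mgra_cwk_file="mgra_crosswalk.csv",
    agg_zones=2000,
    ext_zones=12,
)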

Source code in rsm/input_agg.py
def agg_input_files(
    model_dir = ".", 
    rsm_dir = ".",
    taz_cwk_file = "taz_crosswalk.csv",
    mgra_cwk_file = "mgra_crosswalk.csv",
    agg_zones=2000,
    ext_zones=12,
    input_files = ["microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", 
    "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv",
    "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv",
    "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv", 
    "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv",
    "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv", 
    "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv", 
    "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv", 
    "TNCtrips.csv"]
    ):

    """
        Parameters
        ----------
        model_dir : model_dir (path_like)
            path to full model run, default "."
        rsm_dir : rsm_dir (path_like)
            path to RSM, default "."
        taz_cwk_file : taz_cwk_file (csv file)
            default taz_crosswalk.csv
            taz to aggregated zones file. Should be located in RSM input folder
        mgra_cwk_file : mgra_cwk_file (csv file)
            default mgra_crosswalk.csv
            mgra to aggregated zones file. Should be located in RSM input folder
        input_files : input_files (csv + other files)
            list of input files to be aggregated. 
            Should include the following files
                "microMgraEquivMinutes.csv", "microMgraTapEquivMinutes.csv", 
                "walkMgraTapEquivMinutes.csv", "walkMgraEquivMinutes.csv", "bikeTazLogsum.csv",
                "bikeMgraLogsum.csv", "zone.term", "zones.park", "tap.ptype", "accessam.csv",
                "ParkLocationAlts.csv", "CrossBorderDestinationChoiceSoaAlternatives.csv",
                "TourDcSoaDistanceAlts.csv", "DestinationChoiceAlternatives.csv", "SoaTazDistAlts.csv",
                "TripMatrices.csv", "transponderModelAccessibilities.csv", "crossBorderTours.csv",
                "internalExternalTrips.csv", "visitorTours.csv", "visitorTrips.csv", "householdAVTrips.csv",
                "crossBorderTrips.csv", "TNCTrips.csv", "airport_out.SAN.csv", "airport_out.CBX.csv",
                "TNCtrips.csv"

        Returns
        -------
        Aggregated files in the RSM input/output/uec directory
    """

    df_clusters = pd.read_csv(os.path.join(rsm_dir, "input", taz_cwk_file))
    df_clusters.columns= df_clusters.columns.str.strip().str.lower()
    dict_clusters = dict(zip(df_clusters['taz'], df_clusters['cluster_id']))

    mgra_cwk = pd.read_csv(os.path.join(rsm_dir, "input", mgra_cwk_file))
    mgra_cwk.columns= mgra_cwk.columns.str.strip().str.lower()
    mgra_cwk = dict(zip(mgra_cwk['mgra'], mgra_cwk['cluster_id']))

    taz_zones = int(agg_zones) + int(ext_zones)
    mgra_zones = int(agg_zones)

    # aggregating microMgraEquivMinutes.csv
    if "microMgraEquivMinutes.csv" in input_files:
        logging.info("Aggregating - microMgraEquivMinutes.csv")
        df_mm_eqmin = pd.read_csv(os.path.join(model_dir, "output", "microMgraEquivMinutes.csv"))
        df_mm_eqmin['i_new'] = df_mm_eqmin['i'].map(mgra_cwk)
        df_mm_eqmin['j_new'] = df_mm_eqmin['j'].map(mgra_cwk)

        df_mm_eqmin_agg = df_mm_eqmin.groupby(['i_new', 'j_new'])[
            ['walkTime', 'dist', 'mmTime', 'mmCost', 'mtTime', 'mtCost',
             'mmGenTime', 'mtGenTime', 'minTime']
        ].mean().reset_index()

        df_mm_eqmin_agg = df_mm_eqmin_agg.rename(columns = {'i_new' : 'i', 'j_new' : 'j'})
        df_mm_eqmin_agg.to_csv(os.path.join(rsm_dir, "input", "microMgraEquivMinutes.csv"), index = False)

    else:
        raise FileNotFoundError("microMgraEquivMinutes.csv")


    # aggregating microMgraTapEquivMinutes.csv
    if "microMgraTapEquivMinutes.csv" in input_files:
        logging.info("Aggregating - microMgraTapEquivMinutes.csv")
        df_mm_tap = pd.read_csv(os.path.join(model_dir, "output", "microMgraTapEquivMinutes.csv"))
        df_mm_tap['mgra'] = df_mm_tap['mgra'].map(mgra_cwk)

        df_mm_tap_agg = df_mm_tap.groupby(['mgra', 'tap'])[
            ['walkTime', 'dist', 'mmTime', 'mmCost', 'mtTime',
             'mtCost', 'mmGenTime', 'mtGenTime', 'minTime']
        ].mean().reset_index()

        df_mm_tap_agg.to_csv(os.path.join(rsm_dir, "input", "microMgraTapEquivMinutes.csv"), index = False)

    else:
        raise FileNotFoundError("microMgraTapEquivMinutes.csv")

    # aggregating walkMgraTapEquivMinutes.csv
    if "walkMgraTapEquivMinutes.csv" in input_files:
        logging.info("Aggregating - walkMgraTapEquivMinutes.csv")
        df_wlk_mgra_tap = pd.read_csv(os.path.join(model_dir, "output", "walkMgraTapEquivMinutes.csv"))
        df_wlk_mgra_tap["mgra"] = df_wlk_mgra_tap["mgra"].map(mgra_cwk)

        df_wlk_mgra_agg = df_wlk_mgra_tap.groupby(["mgra", "tap"])[
            ["boardingPerceived", "boardingActual", "alightingPerceived",
             "alightingActual", "boardingGain", "alightingGain"]
        ].mean().reset_index()
        df_wlk_mgra_agg.to_csv(os.path.join(rsm_dir, "input", "walkMgraTapEquivMinutes.csv"), index = False)

    else:
        FileNotFoundError("walkMgraTapEquivMinutes.csv")

    # aggregating walkMgraEquivMinutes.csv
    if "walkMgraEquivMinutes.csv" in input_files:
        logging.info("Aggregating - walkMgraEquivMinutes.csv")
        df_wlk_min = pd.read_csv(os.path.join(model_dir, "output", "walkMgraEquivMinutes.csv"))
        df_wlk_min["i"] = df_wlk_min["i"].map(mgra_cwk)
        df_wlk_min["j"] = df_wlk_min["j"].map(mgra_cwk)

        df_wlk_min_agg = df_wlk_min.groupby(["i", "j"])[["percieved", "actual", "gain"]].mean().reset_index()

        df_wlk_min_agg.to_csv(os.path.join(rsm_dir, "input", "walkMgraEquivMinutes.csv"), index = False)

    else:
        FileNotFoundError("walkMgraEquivMinutes.csv")

    # aggregating biketazlogsum
    if "bikeTazLogsum.csv" in input_files:
        logging.info("Aggregating - bikeTazLogsum.csv")
        bike_taz = pd.read_csv(os.path.join(model_dir, "output", "bikeTazLogsum.csv"))

        bike_taz["i"] = bike_taz["i"].map(dict_clusters)
        bike_taz["j"] = bike_taz["j"].map(dict_clusters)

        bike_taz_agg = bike_taz.groupby(["i", "j"])[["logsum", "time"]].mean().reset_index()
        bike_taz_agg.to_csv(os.path.join(rsm_dir, "input", "bikeTazLogsum.csv"), index = False)

    else:
        raise FileNotFoundError("bikeTazLogsum.csv")

    # aggregating bikeMgraLogsum.csv
    if "bikeMgraLogsum.csv" in input_files:
        logging.info("Aggregating - bikeMgraLogsum.csv")
        bike_mgra = pd.read_csv(os.path.join(model_dir, "output", "bikeMgraLogsum.csv"))
        bike_mgra["i"] = bike_mgra["i"].map(mgra_cwk)
        bike_mgra["j"] = bike_mgra["j"].map(mgra_cwk)

        bike_mgra_agg = bike_mgra.groupby(["i", "j"])[["logsum", "time"]].mean().reset_index()
        bike_mgra_agg.to_csv(os.path.join(rsm_dir, "input", "bikeMgraLogsum.csv"), index = False)
    else:
        raise FileNotFoundError("bikeMgraLogsum.csv")

    # aggregating zone.term
    if "zone.term" in input_files:
        logging.info("Aggregating - zone.term")
        df_zone_term = pd.read_fwf(os.path.join(model_dir, "input", "zone.term"), header = None)
        df_zone_term.columns = ["taz", "terminal_time"]

        df_agg = pd.merge(df_zone_term, df_clusters, on = "taz", how = 'left')
        df_zones_agg = df_agg.groupby(["cluster_id"])['terminal_time'].max().reset_index()

        df_zones_agg.columns = ["taz", "terminal_time"]
        df_zones_agg.to_fwf(os.path.join(rsm_dir, "input", "zone.term"))

    else:
        raise FileNotFoundError("zone.term")

    # aggregating zones.park
    if "zones.park" in input_files:
        logging.info("Aggregating - zone.park")
        df_zones_park = pd.read_fwf(os.path.join(model_dir, "input", "zone.park"), header = None)
        df_zones_park.columns = ["taz", "park_zones"]

        df_zones_park_agg = pd.merge(df_zones_park, df_clusters, on = "taz", how = 'left')
        df_zones_park_agg = df_zones_park_agg.groupby(["cluster_id"])['park_zones'].max().reset_index()
        df_zones_park_agg.columns = ["taz", "park_zones"]
        df_zones_park_agg.to_fwf(os.path.join(rsm_dir, "input", "zone.park"))

    else:
        raise FileNotFoundError("zone.park")


    # aggregating tap.ptype 
    if "tap.ptype" in input_files:
        logging.info("Aggregating - tap.ptype")
        df_tap_ptype = pd.read_fwf(os.path.join(model_dir, "input", "tap.ptype"), header = None)
        df_tap_ptype.columns = ["tap", "lot id", "parking type", "taz", "capacity", "distance", "transit mode"]

        df_tap_ptype = pd.merge(df_tap_ptype, df_clusters, on = "taz", how = 'left')

        df_tap_ptype = df_tap_ptype[["tap", "lot id", "parking type", "cluster_id", "capacity", "distance", "transit mode"]]
        df_tap_ptype = df_tap_ptype.rename(columns = {"cluster_id": "taz"})
        #df_tap_ptype.to_fwf(os.path.join(rsm_dir, "input", "tap.ptype"))

        widths = [5, 6, 6, 5, 5, 5, 3]

        with open(os.path.join(rsm_dir, "input", "tap.ptype"), 'w') as f:
            for index, row in df_tap_ptype.iterrows():
                field1 = str(row[0]).rjust(widths[0])
                field2 = str(row[1]).rjust(widths[1])
                field3 = str(row[2]).rjust(widths[2])
                field4 = str(row[3]).rjust(widths[3])
                field5 = str(row[4]).rjust(widths[4])
                field6 = str(row[5]).rjust(widths[5])
                field7 = str(row[6]).rjust(widths[6])
                f.write(f'{field1}{field2}{field3}{field4}{field5}{field6}{field7}\n')

    else:
        raise FileNotFoundError("tap.ptype")

    #aggregating accessam.csv
    if "accessam.csv" in input_files:
        logging.info("Aggregating - accessam.csv")
        df_acc = pd.read_csv(os.path.join(model_dir, "input", "accessam.csv"), header = None)
        df_acc.columns = ['TAZ', 'TAP', 'TIME', 'DISTANCE', 'MODE']

        df_acc['TAZ'] = df_acc['TAZ'].map(dict_clusters)
        df_acc_agg = df_acc.groupby(['TAZ', 'TAP', 'MODE'])[['TIME', 'DISTANCE']].mean().reset_index()
        df_acc_agg = df_acc_agg[["TAZ", "TAP", "TIME", "DISTANCE", "MODE"]]

        df_acc_agg.to_csv(os.path.join(rsm_dir, "input", "accessam.csv"), index = False, header =False)
    else:
        raise FileNotFoundError("accessam.csv")

    # aggregating ParkLocationAlts.csv
    if "ParkLocationAlts.csv" in input_files:
        logging.info("Aggregating - ParkLocationAlts.csv")
        df_park = pd.read_csv(os.path.join(model_dir, "uec", "ParkLocationAlts.csv"))
        df_park['mgra_new'] = df_park["mgra"].map(mgra_cwk)
        df_park_agg = df_park.groupby(["mgra_new"])["parkarea"].min().reset_index() # assuming 1 is "parking" and 2 is "no parking"
        df_park_agg['a'] = [i + 1 for i in range(len(df_park_agg))]

        # reorder before renaming so each label matches its column
        df_park_agg = df_park_agg[["a", "mgra_new", "parkarea"]]
        df_park_agg.columns = ["a", "mgra", "parkarea"]
        df_park_agg.to_csv(os.path.join(rsm_dir, "uec", "ParkLocationAlts.csv"), index = False)

    else:
        FileNotFoundError("ParkLocationAlts.csv")

    # aggregating CrossBorderDestinationChoiceSoaAlternatives.csv
    if "CrossBorderDestinationChoiceSoaAlternatives.csv" in input_files:
        logging.info("Aggregating - CrossBorderDestinationChoiceSoaAlternatives.csv")
        df_cb = pd.read_csv(os.path.join(model_dir, "uec","CrossBorderDestinationChoiceSoaAlternatives.csv"))

        df_cb["mgra_entry"] = df_cb["mgra_entry"].map(mgra_cwk)
        df_cb["mgra_return"] = df_cb["mgra_return"].map(mgra_cwk)
        df_cb["a"] = df_cb["a"].map(mgra_cwk)

        df_cb = pd.merge(df_cb, df_clusters, left_on = "dest", right_on = "taz", how = 'left')
        df_cb = df_cb.drop(columns = ["dest", "taz"])
        df_cb = df_cb.rename(columns = {'cluster_id' : 'dest'})

        df_cb_final  = df_cb.drop_duplicates()

        df_cb_final = df_cb_final[["a", "dest", "poe", "mgra_entry", "mgra_return", "poe_taz"]]
        df_cb_final.to_csv(os.path.join(rsm_dir, "uec", "CrossBorderDestinationChoiceSoaAlternatives.csv"), index = False)

    else:
        FileNotFoundError("CrossBorderDestinationChoiceSoaAlternatives.csv")

    # aggregating households.csv
    if "households.csv" in input_files:
        logging.info("Aggregating - households.csv")
        df_hh = pd.read_csv(os.path.join(model_dir, "input", "households.csv"))
        df_hh["mgra"] = df_hh["mgra"].map(mgra_cwk)
        df_hh["taz"] = df_hh["taz"].map(dict_clusters)

        df_hh.to_csv(os.path.join(rsm_dir, "input", "households.csv"), index = False)

    else:
        FileNotFoundError("households.csv")

    # aggregating ShadowPricingOutput_school_9.csv
    if "ShadowPricingOutput_school_9.csv" in input_files:
        logging.info("Aggregating - ShadowPricingOutput_school_9.csv")
        df_sp_sch = pd.read_csv(os.path.join(model_dir, "input", "ShadowPricingOutput_school_9.csv"))

        agg_instructions = {}
        for col in df_sp_sch.columns:
            if "size" in col:
                agg_instructions.update({col: "sum"})

            if "shadowPrices" in col:
                agg_instructions.update({col: "max"})

            if "_origins" in col:
                agg_instructions.update({col: "sum"})

            if "_modeledDests" in col:
                agg_instructions.update({col: "sum"})

        df_sp_sch['mgra'] = df_sp_sch['mgra'].map(mgra_cwk)
        df_sp_sch_agg = df_sp_sch.groupby(['mgra']).agg(agg_instructions).reset_index()

        alt = list(df_sp_sch_agg['mgra'])
        df_sp_sch_agg.insert(loc=0, column="alt", value=alt)
        df_sp_sch_agg.loc[len(df_sp_sch_agg.index)] = 0

        df_sp_sch_agg.to_csv(os.path.join(rsm_dir, "input", "ShadowPricingOutput_school_9.csv"), index=False)

    else:
        FileNotFoundError("ShadowPricingOutput_school_9.csv")

    # aggregating ShadowPricingOutput_work_9.csv
    if "ShadowPricingOutput_work_9.csv" in input_files:
        logging.info("Aggregating - ShadowPricingOutput_work_9.csv")
        df_sp_wrk = pd.read_csv(os.path.join(model_dir, "input", "ShadowPricingOutput_work_9.csv"))

        agg_instructions = {}
        for col in df_sp_wrk.columns:
            if "size" in col:
                agg_instructions.update({col: "sum"})

            if "shadowPrices" in col:
                agg_instructions.update({col: "max"})

            if "_origins" in col:
                agg_instructions.update({col: "sum"})

            if "_modeledDests" in col:
                agg_instructions.update({col: "sum"})

        df_sp_wrk['mgra'] = df_sp_wrk['mgra'].map(mgra_cwk)

        df_sp_wrk_agg = df_sp_wrk.groupby(['mgra']).agg(agg_instructions).reset_index()

        alt = list(df_sp_wrk_agg['mgra'])
        df_sp_wrk_agg.insert(loc=0, column="alt", value=alt)

        df_sp_wrk_agg.loc[len(df_sp_wrk_agg.index)] = 0

        df_sp_wrk_agg.to_csv(os.path.join(rsm_dir, "input", "ShadowPricingOutput_work_9.csv"), index=False)

    else:
        FileNotFoundError("ShadowPricingOutput_work_9.csv")

    if "TourDcSoaDistanceAlts.csv" in input_files:
        logging.info("Aggregating - TourDcSoaDistanceAlts.csv")
        df_TourDcSoaDistanceAlts = pd.DataFrame({"a" : range(1,taz_zones+1), "dest" : range(1, taz_zones+1)})
        df_TourDcSoaDistanceAlts.to_csv(os.path.join(rsm_dir, "uec", "TourDcSoaDistanceAlts.csv"), index=False)

    if "DestinationChoiceAlternatives.csv" in input_files:
        logging.info("Aggregating - DestinationChoiceAlternatives.csv")
        df_DestinationChoiceAlternatives = pd.DataFrame({"a" : range(1,mgra_zones+1), "mgra" : range(1, mgra_zones+1)})
        df_DestinationChoiceAlternatives.to_csv(os.path.join(rsm_dir, "uec", "DestinationChoiceAlternatives.csv"), index=False)

    if "SoaTazDistAlts.csv" in input_files:
        logging.info("Aggregating - SoaTazDistAlts.csv")
        df_SoaTazDistAlts = pd.DataFrame({"a" : range(1,taz_zones+1), "dest" : range(1, taz_zones+1)})
        df_SoaTazDistAlts.to_csv(os.path.join(rsm_dir, "uec", "SoaTazDistAlts.csv"), index=False)

    if "TripMatrices.csv" in input_files:
        logging.info("Aggregating - TripMatrices.csv")
        trips = pd.read_csv(os.path.join(model_dir,"output", "TripMatrices.csv"))
        trips['i'] = trips['i'].map(dict_clusters)
        trips['j'] = trips['j'].map(dict_clusters)

        cols = list(trips.columns)
        cols.remove("i")
        cols.remove("j")

        trips_df = trips.groupby(['i', 'j'])[cols].sum().reset_index()
        trips_df.to_csv(os.path.join(rsm_dir, "output", "TripMatrices.csv"), index = False)

    else:
        FileNotFoundError("TripMatrices.csv")

    if "transponderModelAccessibilities.csv" in input_files:
        logging.info("Aggregating - transponderModelAccessibilities.csv")
        tran_access = pd.read_csv(os.path.join(model_dir, "output", "transponderModelAccessibilities.csv"))
        tran_access['TAZ'] = tran_access['TAZ'].map(dict_clusters)

        tran_access_agg = tran_access.groupby(['TAZ'])[['DIST', 'AVGTTS', 'PCTDETOUR']].mean().reset_index()
        tran_access_agg.to_csv(os.path.join(rsm_dir, "output","transponderModelAccessibilities.csv"), index = False)

    else:
        raise FileNotFoundError("transponderModelAccessibilities.csv")

    if "crossBorderTours.csv" in input_files:
        logging.info("Aggregating - crossBorderTours.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "crossBorderTours.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "crossBorderTours.csv"), index = False)

    else:
        raise FileNotFoundError("crossBorderTours.csv")

    if "crossBorderTrips.csv" in input_files:
        logging.info("Aggregating - crossBorderTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "crossBorderTrips.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "crossBorderTrips.csv"), index = False)

    else:
        raise FileNotFoundError("crossBorderTrips.csv")

    if "internalExternalTrips.csv" in input_files:
        logging.info("Aggregating - internalExternalTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "internalExternalTrips.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "internalExternalTrips.csv"), index = False)

    else:
        raise FileNotFoundError("internalExternalTrips.csv")

    if "visitorTours.csv" in input_files:
        logging.info("Aggregating - visitorTours.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "visitorTours.csv"))

        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df.to_csv(os.path.join(rsm_dir, "output", "visitorTours.csv"), index = False)

    else:
        raise FileNotFoundError("visitorTours.csv")

    if "visitorTrips.csv" in input_files:
        logging.info("Aggregating - visitorTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "visitorTrips.csv"))

        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df.to_csv(os.path.join(rsm_dir, "output", "visitorTrips.csv"), index = False)

    else:
        raise FileNotFoundError("visitorTrips.csv")

    if "householdAVTrips.csv" in input_files:
        logging.info("Aggregating - householdAVTrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "householdAVTrips.csv"))
        #print(os.path.join(model_dir, "output", "householdAVTrips.csv"))
        df['orig_mgra'] = df['orig_mgra'].map(mgra_cwk)
        df['dest_gra'] = df['dest_gra'].map(mgra_cwk)

        df['trip_orig_mgra'] = df['trip_orig_mgra'].map(mgra_cwk)
        df['trip_dest_mgra'] = df['trip_dest_mgra'].map(mgra_cwk)
        df.to_csv(os.path.join(rsm_dir, "output", "householdAVTrips.csv"), index = False)

    else:
        raise FileNotFoundError("householdAVTrips.csv")

    if "airport_out.CBX.csv" in input_files:
        logging.info("Aggregating - airport_out.CBX.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "airport_out.CBX.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "airport_out.CBX.csv"), index = False)

    else:
        raise FileNotFoundError("airport_out.CBX.csv")

    if "airport_out.SAN.csv" in input_files:
        logging.info("Aggregating - airport_out.SAN.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "airport_out.SAN.csv"))
        df['originMGRA'] = df['originMGRA'].map(mgra_cwk)
        df['destinationMGRA'] = df['destinationMGRA'].map(mgra_cwk)

        df['originTAZ'] = df['originTAZ'].map(dict_clusters)
        df['destinationTAZ'] = df['destinationTAZ'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "airport_out.SAN.csv"), index = False)

    else:
        raise FileNotFoundError("airport_out.SAN.csv")

    if "TNCtrips.csv" in input_files:
        logging.info("Aggregating - TNCtrips.csv")
        df = pd.read_csv(os.path.join(model_dir, "output", "TNCtrips.csv"))
        df['originMgra'] = df['originMgra'].map(mgra_cwk)
        df['destinationMgra'] = df['destinationMgra'].map(mgra_cwk)

        df['originTaz'] = df['originTaz'].map(dict_clusters)
        df['destinationTaz'] = df['destinationTaz'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output", "TNCtrips.csv"), index = False)

    else:
        raise FileNotFoundError("TNCtrips.csv")

    files = ["Trip" + "_" + i + "_" + j + ".csv" for i, j in
                itertools.product(["FA", "GO", "IN", "RE", "SV", "TH", "WH"],
                                   ["OE", "AM", "MD", "PM", "OL"])]

    for file in files:
        logging.info(f"Aggregating - {file}")
        df = pd.read_csv(os.path.join(model_dir, "output", file))
        df['I'] = df['I'].map(dict_clusters)
        df['J'] = df['J'].map(dict_clusters)
        df['HomeZone'] = df['HomeZone'].map(dict_clusters)
        df.to_csv(os.path.join(rsm_dir, "output",file), index = False)

copy_transit_demand(matrix_names, input_dir='.', output_dir='.')

Copies the OMX transit demand matrices to the RSM directory.

Parameters

matrix_names : list
    OMX matrix filenames to copy.
input_dir : Path-like
    Default "."
output_dir : Path-like
    Default "."
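
A short usage sketch (matrix names and paths are illustrative; ".omx" is appended to names that lack it):

from rsm.translate import copy_transit_demand

copy_transit_demand(
    matrix_names=["transit_AM", "transit_PM"],  # hypothetical OMX file names
    input_dir="path/to/full_model/output",      # placeholder path
    output_dir="path/to/rsm/input",             # placeholder path
)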

Returns

Source code in rsm/translate.py
def copy_transit_demand(
    matrix_names,
    input_dir=".",
    output_dir="."
):
    """
    Copies the OMX transit demand matrices to the RSM directory.

    Parameters
    ----------
    matrix_names : list
        OMX matrix filenames to copy
    input_dir : Path-like
        default "."
    output_dir : Path-like
        default "."

    Returns
    -------

    """


    for mat_name in matrix_names:
        if not mat_name.endswith(".omx"):
            mat_name = mat_name + ".omx"

        input_file_dir = os.path.join(input_dir, mat_name)
        output_file_dir = os.path.join(output_dir, mat_name)

        shutil.copy(input_file_dir, output_file_dir)

translate_emmebank_demand(input_databank, output_databank, cores_to_aggregate, agg_zone_mapping)

Aggregates the demand matrix cores from one Emme databank and loads them into another databank.

Parameters

input_databank : Emme databank
    The source Emme databank.
output_databank : Emme databank
    The destination Emme databank.
cores_to_aggregate : list
    Matrix core names to aggregate.
agg_zone_mapping : Path-like or pandas.DataFrame
    Zone number mapping between original and aggregated zones; columns are the original zones as 'taz' and the aggregated zones as 'cluster_id'.

Returns

None. Loads the trip matrices into emmebank.
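
A usage sketch, assuming the INRO Emme Python API is available to open the databanks; the matrix core names are hypothetical:

import inro.emme.database.emmebank as _eb  # Emme API, availability assumed
from rsm.translate import translate_emmebank_demand

input_bank = _eb.Emmebank("path/to/full_model/emmebank")  # placeholder path
output_bank = _eb.Emmebank("path/to/rsm/emmebank")        # placeholder path

translate_emmebank_demand(
    input_bank,
    output_bank,
    cores_to_aggregate=["SOV_AM", "HOV_AM"],  # hypothetical core names
    agg_zone_mapping="taz_crosswalk.csv",     # columns: 'taz', 'cluster_id'
)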

Source code in rsm/translate.py
def translate_emmebank_demand(
    input_databank,
    output_databank,
    cores_to_aggregate,
    agg_zone_mapping,
): 
    """
    Aggregates the demand matrix cores from one Emme databank and loads them into another databank.

    Parameters
    ----------
    input_databank : Emme databank
        The source Emme databank
    output_databank : Emme databank
        The destination Emme databank
    cores_to_aggregate : list
        matrix core names to aggregate
    agg_zone_mapping : Path-like or pandas.DataFrame
        zone number mapping between original and aggregated zones.
        columns: original zones as 'taz' and aggregated zones as 'cluster_id'

    Returns
    -------
    None. Loads the trip matrices into emmebank.

    """

    agg_zone_mapping_df = pd.read_csv(agg_zone_mapping)
    agg_zone_mapping_df = agg_zone_mapping_df.sort_values('taz')

    agg_zone_mapping_df.columns= agg_zone_mapping_df.columns.str.strip().str.lower()
    zone_mapping = dict(zip(agg_zone_mapping_df['taz'], agg_zone_mapping_df['cluster_id']))

    for core in cores_to_aggregate: 
        matrix = input_databank.matrix(core).get_data()
        matrix_array = matrix.to_numpy()

        matrix_agg = _aggregate_matrix(matrix_array, zone_mapping)

        output_matrix = output_databank.matrix(core)
        output_matrix.set_numpy_data(matrix_agg)

translate_omx_demand(matrix_names, agg_zone_mapping, input_dir='.', output_dir='.')

Aggregates the OMX demand matrices to the aggregated zone system.

Parameters

matrix_names : list
    OMX matrix filenames to aggregate.
agg_zone_mapping : path_like or pandas.DataFrame
    Zone number mapping between original and aggregated zones; columns are the original zones as 'taz' and the aggregated zones as 'cluster_id'.
input_dir : path_like
    Default "."
output_dir : path_like
    Default "."
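
A usage sketch (matrix names and paths are placeholders; the crosswalk CSV must contain 'taz' and 'cluster_id' columns):

from rsm.translate import translate_omx_demand

translate_omx_demand(
    matrix_names=["autoTrips_AM", "autoTrips_PM"],  # hypothetical OMX files
    agg_zone_mapping="taz_crosswalk.csv",
    input_dir="path/to/full_model/output",          # placeholder path
    output_dir="path/to/rsm/input",                 # placeholder path
)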

Returns

Source code in rsm/translate.py
def translate_omx_demand(
    matrix_names,
    agg_zone_mapping,
    input_dir=".",
    output_dir="."
): 
    """
    Aggregates the OMX demand matrices to the aggregated zone system.

    Parameters
    ----------
    matrix_names : list
        omx matrix filenames to aggregate
    agg_zone_mapping : path_like or pandas.DataFrame
        zone number mapping between original and aggregated zones.
        columns: original zones as 'taz' and aggregated zones as 'cluster_id'
    input_dir : path_like
        default "."
    output_dir : path_like
        default "."

    Returns
    -------

    """

    agg_zone_mapping_df = pd.read_csv(agg_zone_mapping)
    agg_zone_mapping_df = agg_zone_mapping_df.sort_values('taz')

    agg_zone_mapping_df.columns= agg_zone_mapping_df.columns.str.strip().str.lower()
    zone_mapping = dict(zip(agg_zone_mapping_df['taz'], agg_zone_mapping_df['cluster_id']))
    agg_zones = sorted(agg_zone_mapping_df['cluster_id'].unique())

    for mat_name in matrix_names:
        if not mat_name.endswith(".omx"):
            mat_name = mat_name + ".omx"

        #logger.info("Aggregating Matrix: " + mat_name + " ...")

        input_skim_file = os.path.join(input_dir, mat_name)
        print(input_skim_file)
        output_skim_file = os.path.join(output_dir, mat_name)

        assert os.path.isfile(input_skim_file)

        input_matrix = omx.open_file(input_skim_file, mode="r") 
        input_mapping_name = input_matrix.list_mappings()[0]
        input_cores = input_matrix.list_matrices()

        output_matrix = omx.open_file(output_skim_file, mode="w")

        for core in input_cores:
            matrix = input_matrix[core]
            matrix_array = matrix.read()
            matrix_agg = _aggregate_matrix(matrix_array, zone_mapping)
            output_matrix[core] = matrix_agg

        output_matrix.create_mapping(title=input_mapping_name, entries=agg_zones)

        input_matrix.close()
        output_matrix.close()

rsm_household_sampler(input_dir='.', output_dir='.', prev_iter_access=None, curr_iter_access=None, study_area=None, input_household='households.csv', input_person='persons.csv', taz_crosswalk='taz_crosswalk.csv', mgra_crosswalk='mgra_crosswalk.csv', compare_access_columns=('NONMAN_AUTO', 'NONMAN_TRANSIT', 'NONMAN_NONMOTOR', 'NONMAN_SOV_0'), default_sampling_rate=0.25, lower_bound_sampling_rate=0.15, upper_bound_sampling_rate=1.0, random_seed=42, output_household='sampled_households.csv', output_person='sampled_person.csv')

Take an intelligent sampling of households.

Parameters

input_dir : path_like
    Default "."
output_dir : path_like
    Default "."
prev_iter_access : Path-like or pandas.DataFrame
    Accessibility in an old (default, no treatment, etc.) run is given (preloaded) or read in from here. Give as a relative path (from input_dir) or an absolute path.
curr_iter_access : Path-like or pandas.DataFrame
    Accessibility in the latest run is given (preloaded) or read in from here. Give as a relative path (from input_dir) or an absolute path.
study_area : array-like
    Array of RSM zones (these are numbered 1 to N in the RSM) in the study area. These zones are sampled at 100% if differential sampling is also turned on.
input_household : Path-like or pandas.DataFrame
    Complete synthetic household file. This data will be filtered to match the sampling of households and written out to a new CSV file.
input_person : Path-like or pandas.DataFrame
    Complete synthetic persons file. This data will be filtered to match the sampling of households and written out to a new CSV file.
taz_crosswalk : Path-like or pandas.DataFrame
    Crosswalk mapping original TAZ ids ('taz') to aggregated zone ids ('cluster_id'), default taz_crosswalk.csv.
mgra_crosswalk : Path-like or pandas.DataFrame
    Crosswalk mapping original MGRA ids ('mgra') to aggregated zone ids ('cluster_id'), default mgra_crosswalk.csv.
compare_access_columns : Collection[str]
    Column names in the accessibility file to use for comparing accessibility. Only changes in the values in these columns will be evaluated.
default_sampling_rate : float
    The default sampling rate, in the range (0, 1].
lower_bound_sampling_rate : float
    Sampling rates by zone will be truncated so they are never lower than this.
upper_bound_sampling_rate : float
    Sampling rates by zone will be truncated so they are never higher than this.

Returns

sample_households_df, sample_persons_df : pandas.DataFrame
    These are the sampled population to resimulate. They are also written to the output_dir.
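
A usage sketch (the accessibility file names are placeholders). When both accessibility inputs are supplied, per-zone sampling rates scale with the change in accessibility; when either is missing, every zone is sampled at default_sampling_rate:

from rsm.sampler import rsm_household_sampler

hh_df, persons_df = rsm_household_sampler(
    input_dir="path/to/rsm/input",       # placeholder path
    output_dir="path/to/rsm/input",      # placeholder path
    prev_iter_access="access_prev.csv",  # hypothetical file name
    curr_iter_access="access_curr.csv",  # hypothetical file name
    default_sampling_rate=0.25,
)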

Source code in rsm/sampler.py
def rsm_household_sampler(
    input_dir=".",
    output_dir=".",
    prev_iter_access=None,
    curr_iter_access=None,
    study_area=None,
    input_household="households.csv",
    input_person="persons.csv",
    taz_crosswalk="taz_crosswalk.csv",
    mgra_crosswalk="mgra_crosswalk.csv",
    compare_access_columns=(
        "NONMAN_AUTO",
        "NONMAN_TRANSIT",
        "NONMAN_NONMOTOR",
        "NONMAN_SOV_0",
    ),
    default_sampling_rate=0.25,  # fix the values of this after some testing
    lower_bound_sampling_rate=0.15,  # fix the values of this after some testing
    upper_bound_sampling_rate=1.0,  # fix the values of this after some testing
    random_seed=42,
    output_household="sampled_households.csv",
    output_person="sampled_person.csv",
):
    """
    Take an intelligent sampling of households.

    Parameters
    ----------
    input_dir : path_like
        default "."
    output_dir : path_like
        default "."
    prev_iter_access : Path-like or pandas.DataFrame
        Accessibility in an old (default, no treatment, etc) run is given (preloaded)
        or read in from here. Give as a relative path (from `input_dir`) or an
        absolute path.
    curr_iter_access : Path-like or pandas.DataFrame
        Accessibility in the latest run is given (preloaded) or read in from here.
        Give as a relative path (from `input_dir`) or an absolute path.
    study_area : array-like
        Array of RSM zones (these are numbered 1 to N in the RSM) in the study area.
        These zones are sampled at 100% if differential sampling is also turned on.
    input_household : Path-like or pandas.DataFrame
        Complete synthetic household file.  This data will be filtered to match the
        sampling of households and written out to a new CSV file.
    input_person : Path-like or pandas.DataFrame
        Complete synthetic persons file.  This data will be filtered to match the
        sampling of households and written out to a new CSV file.
    compare_access_columns : Collection[str]
        Column names in the accessibility file to use for comparing accessibility.
        Only changes in the values in these columns will be evaluated.
    default_sampling_rate : float
        The default sampling rate, in the range (0,1]
    lower_bound_sampling_rate : float
        Sampling rates by zone will be truncated so they are never lower than this.
    upper_bound_sampling_rate : float
        Sampling rates by zone will be truncated so they are never higher than this.

    Returns
    -------
    sample_households_df, sample_persons_df : pandas.DataFrame
        These are the sampled population to resimulate.  They are also written to
        the output_dir
    """

    input_dir = Path(input_dir or ".")
    output_dir = Path(output_dir or ".")

    logger.debug("CALL rsm_household_sampler")
    logger.debug(f"  {input_dir=}")
    logger.debug(f"  {output_dir=}")

    def _resolve_df(x, directory, make_index=None):
        if isinstance(x, (str, Path)):
            # read in the file to a pandas DataFrame
            x = Path(x).expanduser()
            if not x.is_absolute():
                x = Path(directory or ".").expanduser().joinpath(x)
            try:
                result = pd.read_csv(x)
            except FileNotFoundError:
                raise
        elif isinstance(x, pd.DataFrame):
            result = x
        elif x is None:
            result = None
        else:
            raise TypeError("must be path-like or DataFrame")
        if (
            result is not None
            and make_index is not None
            and make_index in result.columns
        ):
            result = result.set_index(make_index)
        return result

    def _resolve_out_filename(x):
        x = Path(x).expanduser()
        if not x.is_absolute():
            x = Path(output_dir).expanduser().joinpath(x)
        x.parent.mkdir(parents=True, exist_ok=True)
        return x

    prev_iter_access_df = _resolve_df(
        prev_iter_access, input_dir, make_index="MGRA"
    )
    curr_iter_access_df = _resolve_df(
        curr_iter_access, input_dir, make_index="MGRA"
    )
    rsm_zones = _resolve_df(taz_crosswalk, input_dir)
    dict_clusters = dict(zip(rsm_zones["taz"], rsm_zones["cluster_id"]))

    rsm_mgra_zones = _resolve_df(mgra_crosswalk, input_dir)
    rsm_mgra_zones.columns = rsm_mgra_zones.columns.str.strip().str.lower()
    dict_clusters_mgra = dict(zip(rsm_mgra_zones["mgra"], rsm_mgra_zones["cluster_id"]))

    # changing the taz and mgra to new cluster ids
    input_household_df = _resolve_df(input_household, input_dir)
    input_household_df["taz"] = input_household_df["taz"].map(dict_clusters)
    input_household_df["mgra"] = input_household_df["mgra"].map(dict_clusters_mgra)
    input_household_df["count"] = 1

    mgra_hh = input_household_df.groupby(["mgra"]).size().rename("n_hh").to_frame()

    if curr_iter_access_df is None or prev_iter_access_df is None:

        if curr_iter_access_df is None:
            logger.warning(f"missing curr_iter_access_df from {curr_iter_access}")
        if prev_iter_access_df is None:
            logger.warning(f"missing prev_iter_access_df from {prev_iter_access}")
        # true when sampler is turned off. default_sampling_rate should be set to 1

        mgra_hh["sampling_rate"] = default_sampling_rate
        if study_area is not None:
            mgra_hh.loc[mgra_hh.index.isin(study_area), "sampling_rate"] = 1

        sample_households = []

        for mgra_id, row in mgra_hh.iterrows():
            df = input_household_df.loc[input_household_df["mgra"] == mgra_id]
            sampling_rate = row["sampling_rate"]
            logger.info(f"Sampling rate of RSM zone {mgra_id}: {sampling_rate}")
            df = df.sample(frac=sampling_rate, random_state=mgra_id + random_seed)
            sample_households.append(df)

        # combine study area and non-study area households into single dataframe
        sample_households_df = pd.concat(sample_households)

    else:
        # restrict to rows only where TAZs have households
        prev_iter_access_df = prev_iter_access_df[
            prev_iter_access_df.index.isin(mgra_hh.index)
        ].copy()
        curr_iter_access_df = curr_iter_access_df[
            curr_iter_access_df.index.isin(mgra_hh.index)
        ].copy()

        # compare accessibility columns
        compare_results = pd.DataFrame()

        for column in compare_access_columns:
            compare_results[column] = (
                curr_iter_access_df[column] - prev_iter_access_df[column]
            ).abs()  # take absolute difference
        compare_results["MGRA"] = prev_iter_access_df.index

        compare_results = compare_results.set_index("MGRA")

        # Take row sums of all difference
        compare_results["Total"] = compare_results[list(compare_access_columns)].sum(
            axis=1
        )

        # TODO: potentially adjust this later after we figure out a better approach
        wgts = compare_results["Total"] + 0.01
        wgts /= wgts.mean() / default_sampling_rate
        compare_results["sampling_rate"] = np.clip(
            wgts, lower_bound_sampling_rate, upper_bound_sampling_rate
        )

        sample_households = []
        sample_rate_df = compare_results[["sampling_rate"]].copy()
        if study_area is not None:
            sample_rate_df.loc[
                sample_rate_df.index.isin(study_area), "sampling_rate"
            ] = 1

        for mgra_id, row in sample_rate_df.iterrows():
            df = input_household_df.loc[input_household_df["mgra"] == mgra_id]
            sampling_rate = row["sampling_rate"]
            logger.info(f"Sampling rate of RSM zone {mgra_id}: {sampling_rate}")
            df = df.sample(frac=sampling_rate, random_state=mgra_id + random_seed)
            sample_households.append(df)

        # combine study area and non-study area households into single dataframe
        sample_households_df = pd.concat(sample_households)

    sample_households_df = sample_households_df.sort_values(by=["hhid"])
    sample_households_df.to_csv(_resolve_out_filename(output_household), index=False)

    # select persons belonging to sampled households
    sample_hhids = sample_households_df["hhid"].to_numpy()

    persons_df = _resolve_df(input_person, input_dir)
    sample_persons_df = persons_df.loc[persons_df["hhid"].isin(sample_hhids)]
    sample_persons_df.to_csv(_resolve_out_filename(output_person), index=False)

    global_sample_rate = round(len(sample_households_df) / len(input_household_df),2)
    logger.info(f"Total Sampling Rate : {global_sample_rate}")

    return sample_households_df, sample_persons_df

rsm_assemble(orig_indiv, orig_joint, rsm_indiv, rsm_joint, households, mgra_crosswalk=None, taz_crosswalk=None, sample_rate=0.25, study_area_taz=None, run_assembler=1)

Assemble and evaluate RSM trip making.

Parameters

orig_indiv : path_like
    Trips table from the "original" model run; should be a comprehensive simulation of all individual trips for all synthetic households.
orig_joint : path_like
    Joint trips table from the "original" model run; should be a comprehensive simulation of all joint trips for all synthetic households.
rsm_indiv : path_like
    Trips table from the RSM model run; should be a simulation of all individual trips for potentially only a subset of all synthetic households.
rsm_joint : path_like
    Trips table from the RSM model run; should be a simulation of all joint trips for potentially only a subset of all synthetic households (the same sampled households as in rsm_indiv).
households : path_like
    Synthetic household file, used to get home zones for households.
mgra_crosswalk : path_like, optional
    Crosswalk from the original MGRA to clustered zone ids. Provide this crosswalk if the orig_indiv and orig_joint files reference the original MGRA system and those ids need to be converted to aggregated values before merging.
sample_rate : float
    Default/fixed sample rate if the sampler was turned off; this is used to scale the trips if run_assembler is 0.
run_assembler : boolean
    Flag to indicate whether to run the RSM assembler. 1 runs the assembler, 0 turns it off; setting this to 0 is only an option if the sampler is turned off.
study_area_taz : list, optional
    List of study area RSM zones.

Returns

final_trips_rsm : final_ind_trips (pd.DataFrame)
    Assembled trip table for the RSM run, filling in archived trip values for non-resimulated households.
combined_trips_by_zone : final_jnt_trips (pd.DataFrame)
    Summary table of changes in trips by mode, by household home zone. Used to check whether undersampled zones have stable travel behavior.

Separate tables for individual and joint trips, as required by Java.
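
A usage sketch with placeholder paths, assuming the documented two-table return; sample_rate only matters when run_assembler is 0 (trips from the partial run are scaled by 1/sample_rate):

from rsm.assembler import rsm_assemble

final_ind_trips, final_jnt_trips = rsm_assemble(
    orig_indiv="path/to/full_model/output/indivTrips.csv",  # hypothetical name
    orig_joint="path/to/full_model/output/jointTrips.csv",  # hypothetical name
    rsm_indiv="path/to/rsm/output/indivTrips.csv",
    rsm_joint="path/to/rsm/output/jointTrips.csv",
    households="path/to/rsm/input/households.csv",
    mgra_crosswalk="path/to/rsm/input/mgra_crosswalk.csv",
    run_assembler=1,
)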

Source code in rsm/assembler.py
def rsm_assemble(
    orig_indiv,
    orig_joint,
    rsm_indiv,
    rsm_joint,
    households,
    mgra_crosswalk=None,
    taz_crosswalk=None,
    sample_rate=0.25,
    study_area_taz=None,
    run_assembler=1,
):
    """
    Assemble and evaluate RSM trip making.

    Parameters
    ----------
    orig_indiv : path_like
        Trips table from "original" model run, should be comprehensive simulation
        of all individual trips for all synthetic households.
    orig_joint : path_like
        Joint trips table from "original" model run, should be comprehensive simulation
        of all joint trips for all synthetic households.
    rsm_indiv : path_like
        Trips table from RSM model run, should be a simulation of all individual
        trips for potentially only a subset of all synthetic households.
    rsm_joint : path_like
        Trips table from RSM model run, should be a simulation of all joint
        trips for potentially only a subset of all synthetic households (the
        same sampled households as in `rsm_indiv`).
    households : path_like
        Synthetic household file, used to get home zones for households.
    mgra_crosswalk : path_like, optional
        Crosswalk from original MGRA to clustered zone ids.  Provide this crosswalk
        if the `orig_indiv` and `orig_joint` files reference the original MGRA system
        and those id's need to be converted to aggregated values before merging.
    sample_rate : float
        Default/fixed sample rate if the sampler was turned off;
        this is used to scale the trips if `run_assembler` is 0.
    run_assembler : boolean
        Flag to indicate whether to run the RSM assembler or not.
        1 is to run the assembler, 0 is to turn it off;
        setting this to 0 is only an option if the sampler is turned off.
    study_area_taz : list, optional
        List of study area RSM zones.

    Returns
    -------
    final_trips_rsm : final_ind_trips (pd.DataFrame)
        Assembled trip table for RSM run, filling in archived trip values for
        non-resimulated households.
    combined_trips_by_zone : final_jnt_trips (pd.DataFrame)
        Summary table of changes in trips by mode, by household home zone.
        Used to check whether undersampled zones have stable travel behavior.

    Separate tables for individual and joint trips, as required by java.


    """
    orig_indiv = Path(orig_indiv).expanduser()
    orig_joint = Path(orig_joint).expanduser()
    rsm_indiv = Path(rsm_indiv).expanduser()
    rsm_joint = Path(rsm_joint).expanduser()
    households = Path(households).expanduser()

    assert os.path.isfile(orig_indiv)
    assert os.path.isfile(orig_joint)
    assert os.path.isfile(rsm_indiv)
    assert os.path.isfile(rsm_joint)
    assert os.path.isfile(households)

    if mgra_crosswalk is not None:
        mgra_crosswalk = Path(mgra_crosswalk).expanduser()
        assert os.path.isfile(mgra_crosswalk)

    if taz_crosswalk is not None:
        taz_crosswalk = Path(taz_crosswalk).expanduser()
        assert os.path.isfile(taz_crosswalk)

    # load trip data - partial simulation of RSM model
    logger.info("reading ind_trips_rsm")
    ind_trips_rsm = pd.read_csv(rsm_indiv)
    logger.info("reading jnt_trips_rsm")
    jnt_trips_rsm = pd.read_csv(rsm_joint)

    if run_assembler == 1:
        # load trip data - full simulation of residual/source model
        logger.info("reading ind_trips_full")
        ind_trips_full = pd.read_csv(orig_indiv)
        logger.info("reading jnt_trips_full")
        jnt_trips_full = pd.read_csv(orig_joint)

        if mgra_crosswalk is not None:
            logger.info("applying mgra_crosswalk to original data")
            mgra_crosswalk = pd.read_csv(mgra_crosswalk).set_index("MGRA")["cluster_id"]
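        # keep sentinel zone values (-1 and 0) intact through the mapping
        # (presumably placeholders for "no zone" in the trip tables)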
            mgra_crosswalk[-1] = -1
            mgra_crosswalk[0] = 0
            for col in [c for c in ind_trips_full.columns if c.lower().endswith("_mgra")]:
                ind_trips_full[col] = ind_trips_full[col].map(mgra_crosswalk)
            for col in [c for c in jnt_trips_full.columns if c.lower().endswith("_mgra")]:
                jnt_trips_full[col] = jnt_trips_full[col].map(mgra_crosswalk)

        # convert to rsm trips
        logger.info("convert to common table platform")
        rsm_trips = _merge_joint_and_indiv_trips(ind_trips_rsm, jnt_trips_rsm)
        original_trips = _merge_joint_and_indiv_trips(ind_trips_full, jnt_trips_full)

        logger.info("get all hhids in trips produced by RSM")
        hh_ids_rsm = rsm_trips["hh_id"].unique()

        logger.info("remove orginal model trips made by households chosen in RSM trips")
        original_trips_not_resimulated = original_trips.loc[
            ~original_trips["hh_id"].isin(hh_ids_rsm)
        ]
        original_ind_trips_not_resimulated = ind_trips_full[
            ~ind_trips_full["hh_id"].isin(hh_ids_rsm)
        ]
        original_jnt_trips_not_resimulated = jnt_trips_full[
            ~jnt_trips_full["hh_id"].isin(hh_ids_rsm)
        ]

        logger.info("concatenate trips from rsm and original model")
        # ignore_index already yields a fresh RangeIndex, so no reset is needed
        final_trips_rsm = pd.concat(
            [rsm_trips, original_trips_not_resimulated], ignore_index=True
        )
        final_ind_trips = pd.concat(
            [ind_trips_rsm, original_ind_trips_not_resimulated], ignore_index=True
        )
        final_jnt_trips = pd.concat(
            [jnt_trips_rsm, original_jnt_trips_not_resimulated], ignore_index=True
        )

        # Get percentage change in total trips by mode for each home zone

        # extract trips made by households in RSM and Original model
        original_trips_that_were_resimulated = original_trips.loc[
            original_trips["hh_id"].isin(hh_ids_rsm)
        ]

        def _agg_by_hhid_and_tripmode(df, name):
            return df.groupby(["hh_id", "trip_mode"]).size().rename(name).reset_index()

        # combining trips by hhid and trip mode
        combined_trips = pd.merge(
            _agg_by_hhid_and_tripmode(original_trips_that_were_resimulated, "n_trips_orig"),
            _agg_by_hhid_and_tripmode(rsm_trips, "n_trips_rsm"),
            on=["hh_id", "trip_mode"],
            how="outer",
            sort=True,
        ).fillna(0)

        # aggregating by Home zone
        hh_rsm = pd.read_csv(households)
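        # detect the household id and home zone columns from common naming
        # variants; the for/else pattern raises the KeyError only if the loop
        # finishes without a `break`, i.e. no candidate column matched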
        hh_id_col_names = ["hhid", "hh_id", "household_id"]
        for hhid in hh_id_col_names:
            if hhid in hh_rsm.columns:
                break
        else:
            raise KeyError(f"none of {hh_id_col_names!r} in household file")
        homezone_col_names = ["mgra", "home_mgra"]
        for zoneid in homezone_col_names:
            if zoneid in hh_rsm.columns:
                break
        else:
            raise KeyError(f"none of {homezone_col_names!r} in household file")
        hh_rsm = hh_rsm[[hhid, zoneid]]

        # attach home zone id
        combined_trips = pd.merge(
            combined_trips, hh_rsm, left_on="hh_id", right_on=hhid, how="left"
        )

        combined_trips_by_zone = (
            combined_trips.groupby([zoneid, "trip_mode"])[["n_trips_orig", "n_trips_rsm"]]
            .sum()
            .reset_index()
        )

        combined_trips_by_zone = combined_trips_by_zone.eval(
            "net_change = (n_trips_rsm - n_trips_orig)"
        )

        combined_trips_by_zone["max_trips"] = np.fmax(
            combined_trips_by_zone.n_trips_rsm, combined_trips_by_zone.n_trips_orig
        )
        combined_trips_by_zone = combined_trips_by_zone.eval(
            "pct_change = net_change / max_trips * 100"
        )
        combined_trips_by_zone = combined_trips_by_zone.drop(columns="max_trips")
    else:
        # assembler is turned off: scale up the trips in the trip lists by the
        # fixed sample rate so that the final trip lists represent 100% of trips
        scale_factor = int(1.0 / sample_rate)

        # pass the study area zones through only when provided
        sa_rsm = study_area_taz if study_area_taz else None

        # replicating rows with pd.concat is slow
        # (https://stackoverflow.com/questions/50788508/how-can-i-replicate-rows-of-a-pandas-dataframe),
        # so scaleup_to_rsm_samplingrate is used instead of:
        # final_ind_trips = pd.concat([ind_trips_rsm] * scale_factor, ignore_index=True)
        # final_jnt_trips = pd.concat([jnt_trips_rsm] * scale_factor, ignore_index=True)

        final_ind_trips = scaleup_to_rsm_samplingrate(
            ind_trips_rsm, households, taz_crosswalk, scale_factor, study_area_tazs=sa_rsm
        )
        final_jnt_trips = scaleup_to_rsm_samplingrate(
            jnt_trips_rsm, households, taz_crosswalk, scale_factor, study_area_tazs=sa_rsm
        )

    return final_ind_trips, final_jnt_trips
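As a self-contained illustration of the per-zone stability diagnostic built internally (combined_trips_by_zone), the following sketch reproduces the net_change and pct_change arithmetic from the function body on invented toy numbers.

import numpy as np
import pandas as pd

# toy per-zone, per-mode trip counts (values invented for illustration)
by_zone = pd.DataFrame({
    "mgra": [101, 101, 102],
    "trip_mode": [1, 2, 1],
    "n_trips_orig": [40.0, 10.0, 25.0],
    "n_trips_rsm": [44.0, 9.0, 25.0],
})
# same arithmetic as in rsm_assemble above
by_zone["net_change"] = by_zone["n_trips_rsm"] - by_zone["n_trips_orig"]
max_trips = np.fmax(by_zone["n_trips_rsm"], by_zone["n_trips_orig"])
by_zone["pct_change"] = by_zone["net_change"] / max_trips * 100
print(by_zone)  # a large |pct_change| flags zones with unstable travel behavior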