Exploratory analysis (v0.12)¶
This notebook presents an exploratory analysis of the embeddings generated for the city of Leicester by our model v0.12 trained on a randomly sampled 1% of the nodes from 138 cities in the UK. See gnnuf_train_model_v0_12.py
for further detail on the model training.
Table of contents:
Setup ¶
# Base libraries
import math
import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
# NetworkX
import networkx as nx
import osmnx as ox
# OS environment setup
from local_directories import *
# Seed value kept for reproducibility.
# NOTE(review): this only defines the value; no visible cell passes it to a
# random number generator — confirm it is applied where randomness is used.
random_seed = 2674
# Other analysis parameters.
# NOTE(review): neither value is referenced in the visible part of this
# notebook; max_distance is presumably in metres — confirm against the caller.
neighbourhood_min_nodes = 8
max_distance = 500
Load data ¶
Leicester OSMnx graph data ¶
We used the data made available by Boeing, which include simplified street networks of 138 cities in the UK derived from OpenStreetMap.
# Load Leicester's street graph from GraphML and keep a projected copy,
# which is the version used by the plots and the node table below.
leicester_osmnx_graph = ox.io.load_graphml(bulk_storage_directory + "/osmnx/raw/leicester-1864.graphml")
leicester_osmnx_graph_prj = ox.project_graph(leicester_osmnx_graph)
# Number of nodes (street junctions) in the unprojected graph
len(leicester_osmnx_graph)
13293
# Overview of the projected street network: black nodes and hairline black
# edges drawn on a white background.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    figsize=(16, 16),
    bgcolor="#ffffff",
    node_color="#000000",
    node_size=5,
    edge_color="#000000",
    edge_linewidth=0.1,
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
The code below extracts the tabular data from the OSMnx format into a dataframe.
# Convert the graph's node attributes to a dataframe, one row per node.
#
# The records are collected first and the frame is built once: growing a
# dataframe with pd.concat inside the loop copies every previous row on each
# iteration and leaves every row with index label 0. Building from the list
# yields the same columns with a unique 0..n-1 index.
node_records = []
for node in leicester_osmnx_graph_prj:
    node_dict = leicester_osmnx_graph_prj.nodes[node]
    # As before, the id is written onto the graph's own attribute dict, so the
    # graph nodes also carry "osmnx_node_id" after this cell has run.
    node_dict["osmnx_node_id"] = int(node)
    node_records.append(node_dict)
# Attributes missing on some nodes (e.g. "ref", "highway") become NaN.
leicester_osmnx_graph_prj_df = pd.DataFrame(node_records)
leicester_osmnx_graph_prj_df.head()
y | x | street_count | elevation | elevation_aster | elevation_srtm | lon | lat | osmnx_node_id | ref | highway | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.829804e+06 | 622151.977595 | 3 | 72.0 | 35 | 72 | -1.196195 | 52.604506 | 194739 | NaN | NaN |
0 | 5.829991e+06 | 622098.041002 | 3 | 72.0 | 45 | 72 | -1.196922 | 52.606196 | 1551014281 | NaN | NaN |
0 | 5.828827e+06 | 622259.813792 | 2 | 79.0 | 57 | 79 | -1.194965 | 52.595696 | 326312 | 21 | motorway_junction |
0 | 5.830107e+06 | 622077.742140 | 3 | 79.0 | 43 | 79 | -1.197179 | 52.607245 | 326320 | 21 | motorway_junction |
0 | 5.829673e+06 | 622220.645785 | 3 | 74.0 | 35 | 74 | -1.195230 | 52.603314 | 2627867454 | NaN | NaN |
Leicester embeddings ¶
Load the pre-computed embeddings for Leicester. See gnnuf_embedding_model_v0_12_Leicester.py
for further details.
# Load Leicester's node embeddings: one row per node, keyed by osmnx_node_id,
# with the two embedding dimensions in EMB000 and EMB001.
leicester_emb_df = pd.read_csv(this_repo_directory + "/data/leicester-1864_emb_gnnuf_model_v0-12.csv")
leicester_emb_df.head()
osmnx_node_id | EMB000 | EMB001 | |
---|---|---|---|
0 | 337976 | 0.700673 | -0.058294 |
1 | 337979 | 1.052401 | -0.071909 |
2 | 337983 | 1.176129 | -0.014825 |
3 | 337985 | 1.200868 | 0.031910 |
4 | 337986 | 0.967397 | 0.003360 |
# Load Leicester's pooled embeddings: same layout as the node embeddings
# (osmnx_node_id, EMB000, EMB001), read from the "emb-pool" file.
leicester_emb_pool_df = pd.read_csv(this_repo_directory + "/data/leicester-1864_emb-pool_gnnuf_model_v0-12.csv")
leicester_emb_pool_df.head()
osmnx_node_id | EMB000 | EMB001 | |
---|---|---|---|
0 | 337976 | 0.929014 | -0.045372 |
1 | 337979 | 0.911989 | -0.045298 |
2 | 337983 | 0.929369 | -0.041823 |
3 | 337985 | 0.930489 | -0.040748 |
4 | 337986 | 0.929369 | -0.041823 |
Preliminary embeddings plots ¶
Node embeddings ¶
Let's start with a simple scatterplot showing the embeddings obtained for street junctions in Leicester, encoding the first embedding on the x-axis and the second embedding on the y-axis.
def bounded_min_max(x, min_val, max_val):
    """Rescale ``x`` linearly from ``[min_val, max_val]`` onto ``[0, 1]``.

    Values below ``min_val`` saturate at 0 and values above ``max_val``
    saturate at 1; anything in between is mapped proportionally.
    """
    # Guard clauses for the two saturated ends of the range
    if x < min_val:
        return 0
    if x > max_val:
        return 1
    span = max_val - min_val
    return (x - min_val) / span
# Polar-style description of each node embedding.
# EMB_dist: Euclidean distance from the origin, rescaled so that 0.75 maps to
# 0 and 1.5 maps to 1 (saturating outside that range).
leicester_emb_df["EMB_dist"] = [
    bounded_min_max(math.sqrt(emb000**2 + emb001**2), 0.75, 1.5)
    for emb000, emb001 in zip(leicester_emb_df["EMB000"], leicester_emb_df["EMB001"])
]
# EMB_angl: sine of the angle of the embedding vector, a value in [-1, 1].
leicester_emb_df["EMB_angl"] = [
    math.sin(math.atan2(emb001, emb000))
    for emb000, emb001 in zip(leicester_emb_df["EMB000"], leicester_emb_df["EMB001"])
]
def embeddings_colour(emb000, emb001):
    """Return a hex colour for a 2D embedding from its distance and angle.

    The distance from the origin selects a ring: closer than 0.7 is always
    black, closer than 1.35 uses the inner palette, anything else the outer
    palette. Within a ring, the sine of the vector's angle selects one of six
    colours using the cut points -0.8660, -0.5, 0.0, 0.5 and 0.8660.
    """
    dist = math.sqrt(emb000**2 + emb001**2)
    angl = math.sin(math.atan2(emb001, emb000))
    if dist < 0.7:
        return "#000000"

    # Upper bounds of the first five angular bands; the sixth band is open-ended
    band_limits = (-0.8660, -0.5, 0.0, 0.5, 0.8660)
    inner_ring = ("#fde725", "#addc30", "#5ec962", "#28ae80", "#21918c", "#2c728e")
    outer_ring = ("#f9cb35", "#f98e09", "#e45a31", "#bc3754", "#8a226a", "#57106e")
    ring = inner_ring if dist < 1.35 else outer_ring

    # The first band whose upper bound exceeds the sine wins
    for limit, colour in zip(band_limits, ring):
        if angl < limit:
            return colour
    return ring[-1]
# Categorical colour per node, combining distance ring and angular band
leicester_emb_df["EMB_colr"] = [
    embeddings_colour(emb000, emb001)
    for emb000, emb001 in zip(leicester_emb_df["EMB000"], leicester_emb_df["EMB001"])
]
# Copy the embedding values onto the graph nodes so they can drive the maps.
#
# The embedding table is indexed by node id once, so each node costs a single
# dictionary lookup rather than several boolean filters over the whole table.
# drop_duplicates keeps the first row per id, which is the row the previous
# `.values[0]` access selected.
emb_by_node = (
    leicester_emb_df
    .drop_duplicates(subset="osmnx_node_id")
    .set_index("osmnx_node_id")[["EMB000", "EMB001", "EMB_dist", "EMB_angl", "EMB_colr"]]
    .to_dict(orient="index")
)
for node in leicester_osmnx_graph_prj.nodes:
    node_attrs = leicester_osmnx_graph_prj.nodes[node]
    node_emb = emb_by_node.get(node)
    if node_emb is None:
        # No embedding available for this node: blank values, light grey colour
        node_attrs["EMB000"] = None
        node_attrs["EMB001"] = None
        node_attrs["EMB_dist"] = None
        node_attrs["EMB_angl"] = None
        node_attrs["EMB_colr"] = "#cccccc"
    else:
        node_attrs["EMB000"] = float(node_emb["EMB000"])
        node_attrs["EMB001"] = float(node_emb["EMB001"])
        node_attrs["EMB_dist"] = float(node_emb["EMB_dist"])
        node_attrs["EMB_angl"] = float(node_emb["EMB_angl"])
        node_attrs["EMB_colr"] = node_emb["EMB_colr"]
# Map of the first embedding dimension: one colour value per node, listed in
# the graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB000"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
# Map of the second embedding dimension: one colour value per node, listed in
# the graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB001"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
We can then explore the values in more detail by looking at each node's position relative to the origin, expressed as a distance and an angle.
# Scatter of the node embeddings (leicester_emb_df), coloured by EMB_dist,
# the rescaled distance of each embedding from the origin.
plt.figure(figsize=(7,7))
ax = plt.axes()
ax.set_facecolor("white")
plt.scatter(
    x=leicester_emb_df.EMB000,
    y=leicester_emb_df.EMB001,
    c=leicester_emb_df.EMB_dist,
    s=10, edgecolors='black', linewidth=0.1)
# Axis labels corrected: the data plotted here are the per-node embeddings,
# not the pooled embeddings.
plt.xlabel("Embeddings first dimension")
plt.ylabel("Embeddings second dimension")
plt.show()
# Interactive scatter of the node embeddings coloured by EMB_dist; hovering a
# point shows its osmnx_node_id.
fig = px.scatter(
    leicester_emb_df,
    x="EMB000", y="EMB001",
    color="EMB_dist",
    color_continuous_scale='viridis',
    hover_data=['osmnx_node_id'],
    width=800, height=800,
)
# White plot area; the x axis is anchored to y with ratio 1 so both
# dimensions share the same scale.
fig.update_layout(
    {"plot_bgcolor": "#ffffff"},
    xaxis=dict(scaleanchor="y", scaleratio=1),
)
# Same light-grey grid and zero line on both axes (the update methods return
# the figure, so the calls are chained).
(
    fig
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
)
fig.show()
# Map of the rescaled distance from the origin (EMB_dist), one colour value
# per node in the graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB_dist"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
# Scatter of the node embeddings (leicester_emb_df), coloured by EMB_angl,
# the sine of the angle of each embedding vector.
plt.figure(figsize=(7,7))
ax = plt.axes()
ax.set_facecolor("white")
plt.scatter(
    x=leicester_emb_df.EMB000,
    y=leicester_emb_df.EMB001,
    c=leicester_emb_df.EMB_angl,
    s=10, edgecolors='black', linewidth=0.1)
# Axis labels corrected: the data plotted here are the per-node embeddings,
# not the pooled embeddings.
plt.xlabel("Embeddings first dimension")
plt.ylabel("Embeddings second dimension")
plt.show()
# Interactive scatter of the node embeddings coloured by EMB_angl; hovering a
# point shows its osmnx_node_id.
fig = px.scatter(
    leicester_emb_df,
    x="EMB000", y="EMB001",
    color="EMB_angl",
    color_continuous_scale='viridis',
    hover_data=['osmnx_node_id'],
    width=800, height=800,
)
# White plot area; the x axis is anchored to y with ratio 1 so both
# dimensions share the same scale.
fig.update_layout(
    {"plot_bgcolor": "#ffffff"},
    xaxis=dict(scaleanchor="y", scaleratio=1),
)
# Same light-grey grid and zero line on both axes (the update methods return
# the figure, so the calls are chained).
(
    fig
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
)
fig.show()
# Map of the angular component (EMB_angl), one colour value per node in the
# graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB_angl"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
Combined angle and distance plot ¶
# Scatter of the node embeddings (leicester_emb_df), coloured by EMB_colr,
# the categorical colour combining distance ring and angular band.
plt.figure(figsize=(7,7))
ax = plt.axes()
ax.set_facecolor("white")
plt.scatter(
    x=leicester_emb_df.EMB000,
    y=leicester_emb_df.EMB001,
    c=leicester_emb_df.EMB_colr,
    s=10, edgecolors='black', linewidth=0.1)
# Axis labels corrected: the data plotted here are the per-node embeddings,
# not the pooled embeddings.
plt.xlabel("Embeddings first dimension")
plt.ylabel("Embeddings second dimension")
plt.show()
# Interactive scatter of the node embeddings, each marker drawn in its
# EMB_colr hex colour.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=leicester_emb_df.EMB000,
        y=leicester_emb_df.EMB001,
        mode='markers',
        marker=dict(color=leicester_emb_df.EMB_colr),
    )
)
fig.update_layout({"plot_bgcolor": "#ffffff"}, width=800, height=800)
# Same light-grey grid and zero line on both axes (the update methods return
# the figure, so the calls are chained).
(
    fig
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
)
fig.show()
# Map of the combined distance/angle colour (EMB_colr); nodes without an
# embedding carry the light-grey "#cccccc" assigned when the values were
# attached to the graph.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB_colr"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
Pooled embeddings ¶
# Interactive scatter of the pooled embeddings; hovering a point shows its
# osmnx_node_id.
fig = px.scatter(
    leicester_emb_pool_df,
    x="EMB000", y="EMB001",
    hover_data=['osmnx_node_id'],
    width=800, height=800,
)
fig.update_layout({"plot_bgcolor": "#ffffff"})
# Same light-grey grid and zero line on both axes (the update methods return
# the figure, so the calls are chained).
(
    fig
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
)
fig.show()
# Copy the pooled embedding values onto the graph nodes as EMB000pool and
# EMB001pool.
#
# The pooled table is indexed by node id once, so each node costs a single
# dictionary lookup rather than repeated boolean filters over the whole
# table. drop_duplicates keeps the first row per id, which is the row the
# previous `.values[0]` access selected.
pooled_by_node = (
    leicester_emb_pool_df
    .drop_duplicates(subset="osmnx_node_id")
    .set_index("osmnx_node_id")[["EMB000", "EMB001"]]
    .to_dict(orient="index")
)
for node in leicester_osmnx_graph_prj.nodes:
    node_attrs = leicester_osmnx_graph_prj.nodes[node]
    node_pooled = pooled_by_node.get(node)
    if node_pooled is None:
        # No pooled embedding available for this node
        node_attrs["EMB000pool"] = None
        node_attrs["EMB001pool"] = None
    else:
        node_attrs["EMB000pool"] = float(node_pooled["EMB000"])
        node_attrs["EMB001pool"] = float(node_pooled["EMB001"])
# Map of the first pooled embedding dimension, one colour value per node in
# the graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB000pool"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
# Map of the second pooled embedding dimension, one colour value per node in
# the graph's node order.
ox.plot_graph(
    leicester_osmnx_graph_prj,
    node_color=[attrs["EMB001pool"] for _, attrs in leicester_osmnx_graph_prj.nodes(data=True)],
    node_size=10,
    bgcolor="#ffffff",
    figsize=(16, 16),
)
(<Figure size 1600x1600 with 1 Axes>, <Axes: >)
Exploring embedding patterns ¶
In this section, we further explore the patterns in the embedding values and their spatial distribution.
Clusters (node embeddings) ¶
We then illustrate the clusters of node embeddings obtained using HDBSCAN (seven clusters plus a noise group labelled -1, i.e. eight distinct labels) and how the related nodes are spatially distributed.
# One table with both embedding flavours per node: the node embeddings keep
# their EMB000/EMB001 names, while the ego-graph pooled embeddings are
# renamed with a "pooled" suffix before the (default inner) join on node id.
leicester_emb_patters_df = pd.merge(
    leicester_emb_df,
    leicester_emb_pool_df.rename(columns={"EMB000":"EMB000pooled", "EMB001":"EMB001pooled"}),
    on="osmnx_node_id",
)
# Work on a copy of the graph so the cluster attributes added later do not
# touch the graph used for the earlier maps.
leicester_osmnx_patters = leicester_osmnx_graph_prj.copy()
# Cluster the node embeddings with HDBSCAN (an earlier variant used
# sklearn's DBSCAN with eps=0.07, min_samples=100).
import hdbscan
clust = hdbscan.HDBSCAN(min_cluster_size=200, min_samples=10)
# Only rows with both embedding values can be clustered
leicester_emb_df_clust = leicester_emb_patters_df[["EMB000", "EMB001"]].dropna()
# Assign labels by index rather than by position: if dropna() removed any
# row, a positional assignment of the shorter label array to the full frame
# would fail. Rows that were not clustered keep -1, the label HDBSCAN itself
# uses for noise, so the column stays integer-typed for the steps that follow.
leicester_emb_patters_df["clust"] = -1
leicester_emb_patters_df.loc[leicester_emb_df_clust.index, "clust"] = clust.fit_predict(leicester_emb_df_clust)
# Number of distinct labels, noise included
leicester_emb_patters_df["clust"].nunique()
8
# Renumber the clusters by decreasing size (0 = largest); noise keeps -1.
# value_counts() returns the labels ordered from most to least frequent.
clust_sizes = leicester_emb_patters_df.loc[leicester_emb_patters_df['clust'] > -1, 'clust'].value_counts()
clust_mapping = dict(zip(clust_sizes.index, range(len(clust_sizes))))
clust_mapping[-1] = -1
leicester_emb_patters_df['clust'] = leicester_emb_patters_df['clust'].map(clust_mapping)
# Size of every group after relabelling
print(leicester_emb_patters_df.groupby('clust').size().reset_index(name='counts'))
clust counts 0 -1 1652 1 0 6723 2 1 1410 3 2 897 4 3 782 5 4 656 6 5 480 7 6 384
# Cluster palette. The list is indexed directly by the cluster label, so the
# noise label -1 resolves to the last entry ("#999999") through negative
# indexing; later cells rely on colorbrewer_set1[-1] to recognise noise.
colorbrewer_set1 = [
    "#377eb8", "#e41a1c", "#4daf4a",
    "#984ea3", "#ff7f00", "#ffff33",
    "#a65628", "#f781bf", "#999999",
]
# Alternative, larger palette (not used in the cells visible here)
colorbrewer_paired12 = [
    "#a6cee3", "#1f78b4", "#b2df8a", "#33a02c",
    "#fb9a99", "#e31a1c", "#fdbf6f", "#ff7f00",
    "#cab2d6", "#6a3d9a", "#ffff99", "#b15928",
    "#cccccc",
]
leicester_emb_patters_df["clust_colour"] = leicester_emb_patters_df["clust"].map(
    lambda label: colorbrewer_set1[label]
)
leicester_emb_patters_df.head()
osmnx_node_id | EMB000 | EMB001 | EMB_dist | EMB_angl | EMB_colr | EMB000pooled | EMB001pooled | clust | clust_colour | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 337976 | 0.700673 | -0.058294 | 0.000000 | -0.082911 | #5ec962 | 0.929014 | -0.045372 | -1 | #999999 |
1 | 337979 | 1.052401 | -0.071909 | 0.406472 | -0.068169 | #5ec962 | 0.911989 | -0.045298 | 2 | #4daf4a |
2 | 337983 | 1.176129 | -0.014825 | 0.568296 | -0.012604 | #5ec962 | 0.929369 | -0.041823 | 2 | #4daf4a |
3 | 337985 | 1.200868 | 0.031910 | 0.601723 | 0.026563 | #28ae80 | 0.930489 | -0.040748 | 2 | #4daf4a |
4 | 337986 | 0.967397 | 0.003360 | 0.289871 | 0.003474 | #28ae80 | 0.929369 | -0.041823 | 5 | #ffff33 |
# Scatter of the node embeddings with each point drawn in its cluster colour,
# using the Axes methods directly on the freshly created axes.
plt.figure(figsize=(7, 7))
ax = plt.axes()
ax.set_facecolor("white")
ax.scatter(
    x=leicester_emb_patters_df.EMB000,
    y=leicester_emb_patters_df.EMB001,
    c=leicester_emb_patters_df.clust_colour,
    s=5,
    edgecolors='black',
    linewidth=0.1,
)
ax.set_xlabel("Embeddings first dimension")
ax.set_ylabel("Embeddings second dimension")
plt.show()
# Interactive scatter of the node embeddings, each marker drawn in its
# cluster colour.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=leicester_emb_patters_df.EMB000,
        y=leicester_emb_patters_df.EMB001,
        mode='markers',
        marker=dict(color=leicester_emb_patters_df.clust_colour),
    )
)
fig.update_layout({"plot_bgcolor": "#ffffff"}, width=800, height=800)
# Same light-grey grid and zero line on both axes (the update methods return
# the figure, so the calls are chained).
(
    fig
    .update_xaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
    .update_yaxes(showgrid=True, gridwidth=1, gridcolor='#cccccc', zeroline=True, zerolinewidth=1, zerolinecolor='#cccccc')
)
fig.show()
# Attach each node's cluster colour and marker size to the graph copy.
#
# The colour column is turned into an id-keyed dictionary once, so each node
# costs a single lookup rather than a boolean filter over the whole table.
# drop_duplicates keeps the first row per id, which is the row the previous
# `.values[0]` access selected.
clust_colour_by_node = (
    leicester_emb_patters_df
    .drop_duplicates(subset="osmnx_node_id")
    .set_index("osmnx_node_id")["clust_colour"]
    .to_dict()
)
for node in leicester_osmnx_patters.nodes:
    node_attrs = leicester_osmnx_patters.nodes[node]
    if node in clust_colour_by_node:
        node_attrs["clust_colour"] = clust_colour_by_node[node]
        node_attrs["node_size"] = 7
    else:
        # Node absent from the embedding table: small black marker
        node_attrs["clust_colour"] = "#000000"
        node_attrs["node_size"] = 1
# Map of the clusters. Nodes in the noise colour (last palette entry) are
# shrunk to size 1; every other node uses the size stored on the graph.
ox.plot_graph(
    leicester_osmnx_patters,
    node_color=[attrs["clust_colour"] for _, attrs in leicester_osmnx_patters.nodes(data=True)],
    node_size=[
        1 if attrs["clust_colour"] == colorbrewer_set1[-1] else attrs["node_size"]
        for _, attrs in leicester_osmnx_patters.nodes(data=True)
    ],
    bgcolor="#ffffff",
    edge_color="#000000",
    edge_linewidth=0.1,
    figsize=(12, 12),
)
(<Figure size 1200x1200 with 1 Axes>, <Axes: >)
Prepare geopandas dataframe for interactive maps.
# Point GeoDataFrame of all graph nodes, built from their lon/lat columns
# (EPSG:4326), then left-joined with the embedding/cluster table so nodes
# without embeddings are kept with NaN values.
leicester_gdf = gpd.GeoDataFrame(
    leicester_osmnx_graph_prj_df,
    geometry=gpd.points_from_xy(
        leicester_osmnx_graph_prj_df["lon"],
        leicester_osmnx_graph_prj_df["lat"],
    ),
    crs="EPSG:4326",
).merge(
    leicester_emb_patters_df,
    how='left',
    on='osmnx_node_id',
)
leicester_gdf.head()
y | x | street_count | elevation | elevation_aster | elevation_srtm | lon | lat | osmnx_node_id | ref | ... | geometry | EMB000 | EMB001 | EMB_dist | EMB_angl | EMB_colr | EMB000pooled | EMB001pooled | clust | clust_colour | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5.829804e+06 | 622151.977595 | 3 | 72.0 | 35 | 72 | -1.196195 | 52.604506 | 194739 | NaN | ... | POINT (-1.1962 52.60451) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 5.829991e+06 | 622098.041002 | 3 | 72.0 | 45 | 72 | -1.196922 | 52.606196 | 1551014281 | NaN | ... | POINT (-1.19692 52.6062) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 5.828827e+06 | 622259.813792 | 2 | 79.0 | 57 | 79 | -1.194965 | 52.595696 | 326312 | 21 | ... | POINT (-1.19496 52.5957) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | 5.830107e+06 | 622077.742140 | 3 | 79.0 | 43 | 79 | -1.197179 | 52.607245 | 326320 | 21 | ... | POINT (-1.19718 52.60724) | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | 5.829673e+06 | 622220.645785 | 3 | 74.0 | 35 | 74 | -1.195230 | 52.603314 | 2627867454 | NaN | ... | POINT (-1.19523 52.60331) | 1.00017 | -0.058451 | 0.335835 | -0.058341 | #5ec962 | 0.946935 | -0.034518 | 5.0 | #ffff33 |
5 rows × 21 columns
Interactive clusters map ¶
# Interactive map of the clustered nodes: rows in the noise colour are
# filtered out first, then rows without an embedding are dropped, and each
# remaining point is drawn using its clust_colour value.
(
    leicester_gdf
    .loc[leicester_gdf["clust_colour"] != colorbrewer_set1[-1]]
    .dropna(subset=["EMB000"])
    .explore(
        color="clust_colour",
        marker_kwds={"radius": 7},
        style_kwds={"stroke": False},
        tiles="CartoDB positron",
    )
)