Source code for bgd.real.pennsylvania_road_dataset

import os
import networkx as nx
from networkx.algorithms.planarity import is_planar
import pandas as pd
import torch
from tqdm import tqdm
import torch_geometric as pyg
from torch_geometric.data import InMemoryDataset, Data
import pickle
import wget
import numpy as np
import gzip
import shutil
from ..utils import ESWR, describe_one_dataset
from littleballoffur.exploration_sampling import *

print(os.getcwd())

def four_cycles(g):
    """
    Returns the number of 4-cycles in a graph, normalised by the number of nodes
    """
    cycles = nx.simple_cycles(g, 4)
    return len(list(cycles)) / g.order()


def download_roads(visualise = False):
    zip_url = "https://snap.stanford.edu/data/roadNet-PA.txt.gz"

    start_dir = os.getcwd()
    os.chdir("bgd_files")


    if "roads" not in os.listdir():
        os.mkdir("roads")

    os.chdir("roads")
    # print(os.listdir())
    # quit()
    if "road_graph.npz" in os.listdir():
        with open("road_graph.npz", "rb") as f:
            graph = pickle.load(f)
        os.chdir(start_dir)
        return graph
    else:


        print("Downloading roads graph")
        # os.chdir("roads")

        _ = wget.download(zip_url)

        with gzip.open('roadNet-PA.txt.gz', 'rb') as f_in:
            with open('roadNet-PA.txt', 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

        os.remove("roadNet-PA.txt.gz")
    # else:
    #     os.chdir("roads")

    #     if "road-graph.npz" in os.listdir():
    #         with open("road-graph.npz", "rb") as f:
    #             graph = pickle.load(f)
    #         os.chdir('../')
    #         return graph


    edgelist = pd.read_csv("roadNet-PA.txt", delimiter = "\t", header=3)

    graph = nx.Graph()

    sources = edgelist["# FromNodeId"].to_numpy().astype("int")
    targets = edgelist["ToNodeId"].to_numpy().astype("int")

    edges_to_add = [(sources[i], targets[i]) for i in tqdm(range(sources.shape[0]))]

    graph.add_edges_from(edges_to_add)

    del edges_to_add


    graph = nx.convert_node_labels_to_integers(graph)

    CGs = [graph.subgraph(c) for c in nx.connected_components(graph)]
    CGs = sorted(CGs, key=lambda x: x.number_of_nodes(), reverse=True)
    graph = CGs[0]
    graph = nx.convert_node_labels_to_integers(graph)
    graph.remove_edges_from(nx.selfloop_edges(graph))

    print("Sub-sampling road graph")
    sampler = MetropolisHastingsRandomWalkSampler(number_of_nodes=50000, seed=42)
    graph = sampler.sample(graph)

    with open("road_graph.npz", "wb") as f:
        pickle.dump(graph, f)

    os.chdir(start_dir)
    return graph


def individual_sample(graph):
    sampler = DiffusionSampler(number_of_nodes=np.random.randint(12, 48))
    return nx.convert_node_labels_to_integers(sampler.sample(graph))

def add_attrs_to_graph(g):
    dummy_label = torch.Tensor([1])
    nx.set_edge_attributes(g, dummy_label, "attrs")
    nx.set_node_attributes(g, dummy_label, "attrs")


    return g


def get_road_dataset(num = 2000, targets = False):
    fb_graph = download_roads()
    nx_graph_list = ESWR(fb_graph, num, 96)


    data_objects = [pyg.utils.from_networkx(g) for g in nx_graph_list]

    for i, data in enumerate(tqdm(data_objects, desc="Calculating diameter values for roads", leave=False)):
        data.y = torch.tensor(int(is_planar(nx_graph_list[i])))

        nx_graph_list[i] = data

    return data_objects# loader

[docs] class PennsylvaniaRoadDataset(InMemoryDataset): r""" Contributor: Alex O. Davies Contributor email: `alexander.davies@bristol.ac.uk` NOTE: This is a big graph (1M nodes) so subsampling many small graphs from it can be very slow. To alleviate this we pre-sample a graph of 100k nodes from the original. Road graphs from Pennsylvania, sampled from a large original graph using ESWR. The original graph is sourced from: `J. Leskovec, K. Lang, A. Dasgupta, M. Mahoney. Community Structure in Large Networks: Natural Cluster Sizes and the Absence of Large Well-Defined Clusters. Internet Mathematics 6(1) 29--123, 2009.` The task is predicting whether a given graph is planar (can be laid out with no crossing edges). - Task: Graph classification - Num node features: None - Num edge features: None - Num target values: 1 - Target shape: 1 - Num graphs: Parameterised by `num` Args: root (str): Root directory where the dataset should be saved. stage (str): The stage of the dataset to load. One of "train", "val", "test". (default: :obj:`"train"`) transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before every access. (default: :obj:`None`) pre_transform (callable, optional): A function/transform that takes in an :obj:`torch_geometric.data.Data` object and returns a transformed version. The data object will be transformed before being saved to disk. (default: :obj:`None`) pre_filter (callable, optional): A function that takes in an :obj:`torch_geometric.data.Data` object and returns a boolean value, indicating whether the data object should be included in the final dataset. (default: :obj:`None`) num (int): The number of samples to take from the original dataset. (default: :obj:`2000`). """
[docs] def __init__(self, root, stage="train", transform=None, pre_transform=None, pre_filter=None, num = 2000): self.num = num self.stage = stage self.stage_to_index = {"train":0, "val":1, "test":2} _ = download_roads() del _ super().__init__(root, transform, pre_transform, pre_filter) self.data, self.slices = torch.load(self.processed_paths[self.stage_to_index[self.stage]])
@property def raw_file_names(self): return [] @property def processed_file_names(self): return ['train.pt', 'val.pt', 'test.pt']
[docs] def process(self): # Read data into huge `Data` list. if os.path.isfile(self.processed_paths[self.stage_to_index[self.stage]]): print("Road files exist") return data_list = get_road_dataset(num=self.num, targets=self.stage != "train") # if self.stage == "train": # print("Found stage train, dropping targets") # new_data_list = [] # for i, item in enumerate(data_list): # n_nodes, n_edges = item.x.shape[0], item.edge_index.shape[1] # data = Data(x = torch.ones(n_nodes).to(torch.int).reshape((-1, 1)), # edge_index=item.edge_index, # edge_attr=torch.ones(n_edges).to(torch.int).reshape((-1,1)), # y = None) # new_data_list.append(data) # data_list = new_data_list # else: # new_data_list = [] # for i, item in enumerate(data_list): # n_nodes, n_edges = item.x.shape[0], item.edge_index.shape[1] # data = Data(x = item.x, # edge_index=item.edge_index, # edge_attr=torch.ones(n_edges).to(torch.int).reshape((-1,1)), # y = item.y) # new_data_list.append(data) # data_list = new_data_list if self.pre_filter is not None: data_list = [data for data in data_list if self.pre_filter(data)] if self.pre_transform is not None: data_list = [self.pre_transform(data) for data in data_list] data, slices = self.collate(data_list) torch.save((data, slices), self.processed_paths[self.stage_to_index[self.stage]]) del data_list
if __name__ == "__main__": dataset = PennsylvaniaRoadDataset(os.getcwd()+'/bgd_files/'+'roads', stage = "train") describe_one_dataset(dataset)