# 第五次作业

在本次作业中，大家需要完成GraphSAGE模型的训练，重点体会Neighbor Sampling的方式及用法，并在cora数据集上进行测试。

本作业需要安装[CogDL](https://github.com/THUDM/cogdl)：`pip install cogdl`

如需使用GPU版，请先安装GPU版本的[PyTorch](https://pytorch.org/get-started/locally/)，再安装CogDL。

本作业由智谱GNN中心及课程团队筹备，由CogDL团队提供技术支持。


### 1. GraphSAGE模型
仔细阅读GraphSAGE模型的实现，特别是forward/inference函数。

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

from cogdl.layers import SAGELayer


class GraphSAGE(nn.Module):
    """GraphSAGE node classifier built from a stack of ``SAGELayer`` modules.

    ``forward`` consumes one sampled mini-batch (one sampled graph per layer),
    whereas ``inference`` processes the whole loader one layer at a time.
    """

    def __init__(self, num_features, num_classes, hidden_size, num_layers, dropout, aggr="mean"):
        """Create ``num_layers`` SAGE layers.

        Args:
            num_features: Input feature dimension of the first layer.
            num_classes: Output dimension of the last layer.
            hidden_size: Dimension of every intermediate layer.
            num_layers: Number of ``SAGELayer`` modules to stack.
            dropout: Dropout probability applied after each non-final layer
                in ``forward``.
            aggr: Aggregator name forwarded unchanged to ``SAGELayer``.
        """
        super().__init__()
        # Not read by any method of this class; kept as a public attribute.
        self.adjlist = {}
        self.num_features = num_features
        self.num_classes = num_classes
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = dropout
        # Layer widths: input -> hidden * (num_layers - 1) -> classes.
        shapes = [num_features] + [hidden_size] * (num_layers - 1) + [num_classes]
        self.convs = nn.ModuleList(
            [SAGELayer(shapes[layer], shapes[layer + 1], aggr=aggr) for layer in range(num_layers)]
        )

    def forward(self, x, adjs):
        """Run the layers over one sampled mini-batch.

        Args:
            x: Features of the nodes feeding the first layer.
            adjs: Iterable of ``(src_id, graph, size)`` triples, one per layer;
                only ``graph`` and ``size[1]`` are used here.

        Returns:
            The first ``size[1]`` rows of the last layer's output (no
            activation is applied after the final layer).
        """
        device = next(self.parameters()).device
        # Keep the features on the same device as the parameters and the
        # sampled graphs, as inference() already does for its inputs.
        x = x.to(device)
        for i, (_src_id, graph, size) in enumerate(adjs):
            graph = graph.to(device)
            # Only the leading size[1] rows are passed on to the next layer.
            x = self.convs[i](graph, x)[: size[1]]
            if i != self.num_layers - 1:
                x = F.relu(x)
                x = F.dropout(x, p=self.dropout, training=self.training)
        return x

    def inference(self, x_all, data_loader):
        """Compute outputs layer by layer over every batch in ``data_loader``.

        For each layer, every batch ``(src_id, graph, size)`` of the loader is
        processed and the per-batch results are concatenated (on CPU) to form
        the input of the next layer. No dropout is applied.

        NOTE(review): indexing the concatenated result with ``src_id`` in the
        next layer presumably requires the loader to visit nodes in index
        order and to cover all of them — confirm against the loader used.

        Args:
            x_all: Input features, indexed by each batch's ``src_id``.
            data_loader: Re-iterable yielding ``(src_id, graph, size)``.

        Returns:
            CPU tensor with the concatenated outputs of the last layer.
        """
        device = next(self.parameters()).device
        for i, conv in enumerate(self.convs):
            output = []
            for src_id, graph, size in data_loader:
                x = x_all[src_id].to(device)
                x = conv(graph.to(device), x)[: size[1]]
                if i != self.num_layers - 1:
                    x = F.relu(x)
                # Collect on CPU so per-batch results do not pile up on the device.
                output.append(x.cpu())
            x_all = torch.cat(output, dim=0)
        return x_all


### 2. 从cogdl中加载cora数据集（x表示特征，y表示标签，mask表示训练/验证/测试集的划分）

In [2]:
from cogdl.datasets import build_dataset_from_name

# Load the Cora dataset by name and take its first element, which the printed
# output shows to be a single Graph object.
dataset = build_dataset_from_name("cora")
data = dataset[0]
# Print a summary of the graph's fields.
print(data)

Graph(x=[2708, 1433], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_index=[2, 10556])


### 3. 使用GraphSAGE模型进行训练

#### 3.1 测试CogDL提供的NeighborSampler

In [3]:
from cogdl.data.sampler import NeighborSampler

# Build a sampler over the training nodes with a small batch so the printed
# batch stays readable. `sizes` has one entry per sampled layer
# (presumably the number of neighbors drawn per node — confirm in the sampler source).
data_loader = NeighborSampler(
    dataset=dataset,
    mask=dataset.data.train_mask,
    sizes=[2, 2],
    shuffle=True,
    batch_size=8,
)

###################
##### Homework: fill in the blanks #####
###################
# Read the sampler source: https://github.com/THUDM/cogdl/blob/master/cogdl/data/sampler.py
# In your own words, describe the form and meaning of each element of a batch returned by NeighborSampler
# target_id: 
# n_id: 
# adjs: 

# Inspect the first batch only, then stop iterating.
for batch in data_loader:
    target_id, n_id, adjs = batch
    print(target_id)
    print(n_id)
    print(adjs)
    break

tensor([0, 1, 2, 3, 4, 5, 6, 7])
tensor([   0,    1,    2,    3,    4,    5,    6,    7, 1862,  633,  652, 1454,
        2544, 2175, 1016, 1659, 1629, 1042,  373,  208, 1986,  332, 1761, 1416,
         926, 1701,  470, 2176,  982,  561, 1711, 1517, 1047, 1025])
[(tensor([   0,    1,    2,    3,    4,    5,    6,    7, 1862,  633,  652, 1454,
        2544, 2175, 1016, 1659, 1629, 1042,  373,  208, 1986,  332, 1761, 1416,
         926, 1701,  470, 2176,  982,  561, 1711, 1517, 1047, 1025]), Graph(col=[35]), (34, 20)), (tensor([   0,    1,    2,    3,    4,    5,    6,    7, 1862,  633,  652, 1454,
        2544, 2175, 1016, 1659, 1629, 1042,  373,  208]), Graph(col=[14]), (20, 8))]


#### 3.2 使用NeighborSampler来训练GraphSAGE模型

In [4]:
import math
import copy
import numpy as np
from tqdm import tqdm
from cogdl.data.sampler import NeighborSampler


def accuracy(y_pred, y_true):
    """Return the fraction of rows of ``y_pred`` whose argmax equals ``y_true``.

    Args:
        y_pred: 2-D tensor of per-class scores, one row per sample.
        y_true: Tensor of integer class labels, one per sample; a trailing
            singleton dimension (shape ``[N, 1]``) is accepted.

    Returns:
        Accuracy as a Python float in ``[0, 1]``.
    """
    # Flatten instead of squeeze(): squeeze() turns a single-sample batch into
    # a 0-d tensor, on which len() raises TypeError.
    y_true = y_true.reshape(-1).long()
    preds = y_pred.argmax(dim=1).type_as(y_true)
    correct = preds.eq(y_true).sum().item()
    return correct / y_true.numel()

def train(hidden_size=64, batch_size=32, sizes=(10, 10)):
    """Train GraphSAGE with neighbor sampling and report val/test accuracy.

    Uses the module-level ``dataset`` and ``data`` objects. Trains for 100
    epochs with Adam (lr=0.01), keeps a copy of the model with the lowest
    sampled validation loss, then evaluates that copy with layer-wise
    ``inference`` and prints validation and test accuracy.

    The two ``model(...)`` calls below are homework placeholders and must be
    completed before this function can run.

    Args:
        hidden_size: Hidden dimension of the GraphSAGE layers.
        batch_size: Number of target nodes per sampled batch.
        sizes: One sampling size per layer, passed to the train and
            validation samplers; its length also sets the number of layers.
    """
    # Copy into a list so the samplers never share the caller's object.
    sizes = list(sizes)
    model = GraphSAGE(
        data.x.shape[1],
        int(data.y.max()) + 1,  # number of classes as a plain int
        hidden_size,
        num_layers=len(sizes),  # forward() consumes one sampled graph per layer
        dropout=0.5,
    )

    train_loader = NeighborSampler(
        dataset=dataset,
        mask=dataset.data.train_mask,
        sizes=sizes,
        shuffle=True,
        batch_size=batch_size,
    )
    val_loader = NeighborSampler(
        dataset=dataset,
        mask=dataset.data.val_mask,
        sizes=sizes,
        shuffle=False,
        batch_size=batch_size,
    )
    # Loader consumed by model.inference(): no mask and a single size of -1
    # (presumably "all nodes, all neighbors" — confirm in the sampler source).
    test_loader = NeighborSampler(
        dataset=dataset,
        mask=None,
        sizes=[-1],
        shuffle=False,
        batch_size=batch_size,
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    epoch_iter = tqdm(range(100), position=0, leave=True)
    best_model = None
    best_loss = float("inf")
    for epoch in epoch_iter:
        model.train()
        train_loss = []
        for batch in train_loader:
            target_id, n_id, adjs = batch
            # Reset gradients for every mini-batch; otherwise they accumulate
            # across all batches of the epoch.
            optimizer.zero_grad()
            ###################
            ##### Homework: fill in the blank #####
            ###################
            # Arrange each batch's data into the form the model's forward expects (you may need data.x)
            logits = model(...)

            y = data.y[target_id]
            loss = F.cross_entropy(logits, y)
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
        train_loss = np.mean(train_loss)

        model.eval()
        val_loss = []
        val_acc = []
        with torch.no_grad():
            val_count = 0
            for batch in val_loader:
                target_id, n_id, adjs = batch
                ###################
                ##### Homework: fill in the blank #####
                ###################
                # Arrange each batch's data into the form the model's forward expects (same as above)
                logits = model(...)

                y = data.y[target_id]
                # Weight per-batch means by batch size so the epoch value is a
                # per-node average even when the last batch is smaller.
                val_loss.append(F.cross_entropy(logits, y).item() * y.shape[0])
                val_acc.append(accuracy(logits, y) * y.shape[0])
                val_count += y.shape[0]

            val_loss = np.sum(val_loss) / val_count
            val_acc = np.sum(val_acc) / val_count
            if val_loss < best_loss:
                best_loss = val_loss
                best_model = copy.deepcopy(model)

        epoch_iter.set_description(f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    if best_model is None:
        # Validation loss never improved (e.g. it was NaN); fall back to the
        # final model instead of failing on None below.
        best_model = model

    with torch.no_grad():
        logits = best_model.inference(data.x, test_loader)
        final_val_acc = accuracy(logits[data.val_mask], data.y[data.val_mask])
        test_acc = accuracy(logits[data.test_mask], data.y[data.test_mask])
    print("Val Acc", final_val_acc)
    print("Test Acc", test_acc)

# Run training with the default hyperparameters.
train()

Epoch: 099, Train Loss: 0.0003, Val Loss: 0.8542, Val Acc: 0.7740: 100%|██████████| 100/100 [00:23<00:00,  4.31it/s]


Val Acc 0.79
Test Acc 0.817


### 4. 实验总结
基于Neighbor Sampling的GraphSAGE模型训练与之前的GCN模型有何不同？GraphSAGE模型中的inference与forward有什么不同？
