# 第六次作业

在本次作业中，大家需要完成GRAND模型的训练，重点体会一致性正则化（Consistency Regularization）的用法，并在cora数据集上进行测试。

本作业需要安装[CogDL](https://github.com/THUDM/cogdl)，安装命令为：`pip install cogdl`

如需使用GPU版本，请先安装GPU版本的[PyTorch](https://pytorch.org/get-started/locally/)，再安装CogDL。

本作业由智谱GNN中心及课程团队筹备，由CogDL团队提供技术支持。


### 1. GRAND模型
阅读GRAND模型的实现。

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from cogdl.utils import spmm


class Grand(nn.Module):
    """
    Implementation of GRAND in paper `"Graph Random Neural Networks for Semi-Supervised Learning on Graphs"`
    <https://arxiv.org/abs/2005.11079>

    The forward pass row-normalizes the node features, randomly drops whole
    nodes, averages the features over ``order`` propagation steps, and feeds
    the result to a two-layer MLP.

    Parameters
    ----------
    nfeat : int
        Size of each input features.
    nhid : int
        Size of hidden features.
    nclass : int
        Number of output classes.
    input_droprate : float
        Dropout rate of input features.
    hidden_droprate : float
        Dropout rate of hidden features.
    use_bn : bool
        Using batch normalization.
    dropnode_rate : float
        Probability of zeroing a whole node, i.e. an entire row of the
        feature matrix, during training.
    order : int
        Number of propagation steps; the propagated features are the mean
        of the 0-th to ``order``-th step.
    alpha : float
        Stored on the module but not read by any method of this class.
    """

    def __init__(
        self, nfeat, nhid, nclass, input_droprate, hidden_droprate, use_bn, dropnode_rate, order, alpha,
    ):
        super().__init__()
        self.layer1 = nn.Linear(nfeat, nhid)
        self.layer2 = nn.Linear(nhid, nclass)
        self.input_droprate = input_droprate
        self.hidden_droprate = hidden_droprate
        # The batch-norm layers are registered even when use_bn is False, so
        # they always contribute to the module's parameters.
        self.bn1 = nn.BatchNorm1d(nfeat)
        self.bn2 = nn.BatchNorm1d(nhid)
        self.use_bn = use_bn
        self.order = order
        self.dropnode_rate = dropnode_rate
        self.alpha = alpha

    def drop_node(self, x):
        """
        Randomly zero whole rows of ``x`` in training mode.

        In evaluation mode no row is dropped; ``x`` is instead scaled by
        ``1 - dropnode_rate``. Always returns a new tensor.
        """
        if not self.training:
            return x * (1.0 - self.dropnode_rate)
        # Sample the per-row keep mask directly on x's device instead of
        # building it on the CPU and copying it over on every forward pass.
        keep_prob = torch.full((x.shape[0], 1), 1.0 - self.dropnode_rate, device=x.device)
        return torch.bernoulli(keep_prob) * x

    def rand_prop(self, graph, x):
        """
        Apply ``drop_node`` and return the mean of the features over
        propagation steps 0..``order``, detached from the autograd graph.
        """
        x = self.drop_node(x)
        # drop_node returns a fresh tensor, so accumulating in place below
        # never modifies the tensor passed in by the caller.
        y = x
        for _ in range(self.order):
            x = spmm(graph, x).detach_()
            y.add_(x)
        return y.div_(self.order + 1.0).detach_()

    def normalize_x(self, x):
        """
        Divide every row of ``x`` by its sum; rows whose sum is zero come
        out as all zeros instead of inf/NaN.
        """
        row_inv = x.sum(1).pow_(-1)
        # A zero row sum yields an infinite reciprocal; replace it with 0.
        row_inv.masked_fill_(torch.isinf(row_inv), 0)
        return x * row_inv[:, None]

    def forward(self, graph):
        """
        Compute class logits for every node of ``graph``.

        Calls ``graph.sym_norm()`` and reads the node features from
        ``graph.x``.
        """
        graph.sym_norm()
        x = graph.x
        x = self.normalize_x(x)
        x = self.rand_prop(graph, x)
        if self.use_bn:
            x = self.bn1(x)
        x = F.dropout(x, self.input_droprate, training=self.training)
        x = F.relu(self.layer1(x))
        if self.use_bn:
            x = self.bn2(x)
        x = F.dropout(x, self.hidden_droprate, training=self.training)
        x = self.layer2(x)
        return x

### 2. 从cogdl中加载cora数据集（x表示特征，y表示标签，mask表示训练/验证/测试集的划分）

In [2]:
from cogdl.datasets import build_dataset_from_name

# Build the dataset registered under the name "cora" and take its first
# element, which is the graph used by the rest of the notebook.
dataset = build_dataset_from_name("cora")
data = dataset[0]
# The return value is ignored, so this presumably updates `data` in place;
# judging by the method name it adds self-loops to nodes that lack one
# (TODO confirm against the CogDL Graph documentation).
data.add_remaining_self_loops()
# Show a summary of the loaded graph.
print(data)

Graph(x=[2708, 1433], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_index=[2, 13264], edge_weight=[13264])


### 3. 使用GRAND模型进行训练

#### 3.1 GRAND的训练逻辑

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class GrandModelWrapper(object):
    """
    Training logic for GRAND: supervised loss averaged over several
    stochastic forward passes plus a consistency regularization term.

    Parameters
    ----------
    model : callable
        Called as ``model(graph)``; its output is indexed with
        ``graph.train_mask`` and treated as per-node class logits.
    sample : int
        Number of augmentations for consistency loss
    temperature : float
        Temperature to sharpen predictions.
    lmbda : float
         Proportion of consistency loss of unlabelled data
    """

    def __init__(self, model, sample=2, temperature=0.5, lmbda=0.5):
        super(GrandModelWrapper, self).__init__()
        self.model = model
        self.sample = sample
        self.temperature = temperature
        self.lmbda = lmbda

    def train_step(self, graph):
        """
        Run ``sample`` forward passes and return the total training loss.

        The result is the mean cross-entropy on the training nodes plus the
        weighted consistency loss on the remaining nodes.
        """
        output_list = [self.model(graph) for _ in range(self.sample)]

        labels = graph.y[graph.train_mask]
        loss_train = sum(F.cross_entropy(output[graph.train_mask], labels) for output in output_list)
        loss_train = loss_train / self.sample

        log_probs = [F.log_softmax(output, dim=-1) for output in output_list]
        loss_consis = self.consistency_loss(log_probs, graph.train_mask)

        return loss_train + loss_consis

    def consistency_loss(self, logps, train_mask):
        """
        Consistency regularization over the non-training nodes.

        Parameters
        ----------
        logps : list of Tensor
            Log-probabilities of every node, one tensor per forward pass.
        train_mask : Tensor
            Boolean mask of the training nodes; only the nodes where it is
            False take part in this loss.

        Returns
        -------
        Tensor
            ``lmbda`` times the mean, over passes and nodes, of the squared
            L2 distance between each prediction and the sharpened average.
        """
        temp = self.temperature
        # Predicted class probabilities of the non-training nodes, per pass.
        ps = [torch.exp(p)[~train_mask] for p in logps]

        # 1) Average the predictions of all passes.
        avg_p = torch.stack(ps).mean(dim=0)

        # 2) Sharpen the average with the temperature and renormalize. The
        #    result is a fixed target, so no gradient flows through it.
        powered = avg_p.pow(1.0 / temp)
        sharp_p = (powered / powered.sum(dim=1, keepdim=True)).detach()

        # 3) Squared L2 distance of every pass to the sharpened target.
        loss = sum((p - sharp_p).pow(2).sum(dim=1).mean() for p in ps) / len(ps)

        return self.lmbda * loss

#### 3.2 训练GRAND模型

In [4]:
import copy
from tqdm import tqdm


def accuracy(y_pred, y_true):
    """
    Fraction of rows of ``y_pred`` whose highest-scoring class equals the label.

    Parameters
    ----------
    y_pred : Tensor
        Class scores, one row per sample; the prediction is the argmax over
        dimension 1.
    y_true : Tensor
        Integer labels, one per sample; flattened to one dimension.

    Returns
    -------
    float
        Accuracy in [0, 1]; 0.0 when there are no samples.
    """
    # Flatten instead of squeeze: squeeze turns a single-label batch into a
    # 0-d tensor, which has no len().
    y_true = y_true.reshape(-1).long()
    total = y_true.numel()
    if total == 0:
        return 0.0
    preds = y_pred.argmax(dim=1).type_as(y_true)
    return preds.eq(y_true).sum().item() / total

def train(hidden_size=32, epochs=100, lr=0.01):
    """
    Train GRAND on the module-level ``data`` graph and print the validation
    and test accuracy of the best snapshot.

    Parameters
    ----------
    hidden_size : int
        Size of the hidden layer of the model.
    epochs : int
        Number of training epochs (one optimizer step per epoch).
    lr : float
        Learning rate of the AdamW optimizer.

    Notes
    -----
    When CUDA is available, the model and ``data`` are moved to the GPU;
    ``data`` is a module-level object, so that move is visible to the caller.
    """
    # The layer size must be a plain int rather than a 0-d tensor.
    num_classes = int(data.y.max()) + 1
    model = Grand(
        data.x.shape[1], hidden_size, num_classes,
        input_droprate=0.5, hidden_droprate=0.5, use_bn=False,
        dropnode_rate=0.5, order=5, alpha=0.2,
    )
    print("Model Parameters:", sum(p.numel() for p in model.parameters()))

    if torch.cuda.is_available():
        device = torch.device("cuda")
        model = model.to(device)
        data.apply(lambda x: x.to(device))

    mw = GrandModelWrapper(model, sample=4, temperature=0.5, lmbda=1.0)

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    epoch_iter = tqdm(range(epochs), position=0, leave=True)
    best_model = None
    # Start below any reachable accuracy so the first epoch always stores a
    # snapshot, even if its validation accuracy is 0.
    best_acc = float("-inf")
    for epoch in epoch_iter:
        model.train()
        optimizer.zero_grad()

        loss = mw.train_step(data)
        loss.backward()

        optimizer.step()
        train_loss = loss.item()

        model.eval()
        with torch.no_grad():
            logits = model(data)
            val_loss = F.cross_entropy(logits[data.val_mask], data.y[data.val_mask]).item()
            val_acc = accuracy(logits[data.val_mask], data.y[data.val_mask])
            if val_acc > best_acc:
                best_acc = val_acc
                best_model = copy.deepcopy(model)

        epoch_iter.set_description(f"Epoch: {epoch:03d}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    if best_model is None:
        # No epoch ran (epochs == 0): evaluate the untrained model.
        best_model = model
    # Make sure dropout and DropNode are disabled for the final evaluation.
    best_model.eval()
    with torch.no_grad():
        logits = best_model(data)
        val_acc = accuracy(logits[data.val_mask], data.y[data.val_mask])
        test_acc = accuracy(logits[data.test_mask], data.y[data.test_mask])
    print("Val Acc", val_acc)
    print("Test Acc", test_acc)

# Run the training loop with the default hidden size.
train()

  0%|          | 0/100 [00:00<?, ?it/s]

Model Parameters: 49049


Epoch: 099, Train Loss: 0.8854, Val Loss: 0.8261, Val Acc: 0.8100: 100%|██████████| 100/100 [07:21<00:00,  4.42s/it]


Val Acc 0.81
Test Acc 0.825


### 4. 开放讨论
谈谈你对一致性正则化(Consistency Regularization)的理解。你觉得在什么情况下一致性正则化能够带来效果提升？（可以将你的思考发到讨论区中：https://discuss.cogdl.ai/t/topic/83 ）
