vai_q_pytorch QAT - 3.5 简体中文

Vitis AI 用户指南 (UG1414)

Document ID
UG1414
Release Date
2023-09-28
Version
3.5 简体中文

假设有预定义的模型架构,请按照以下步骤,以来自 torchvision 的 ResNet18 为例,执行量化感知训练 (QAT)。完整模型定义可在此处找到。

  1. 检查是否存在要量化的非模块运算。ResNet18 使用 '+' 来添加 2 个张量。将其替换为 pytorch_nndct.nn.modules.functional.Add
  2. 检查是否存在要多次调用的模块。通常,此类模块并无权重,最常见的是 torch.nn.ReLu 模块。定义多个此类模块,然后在单次前传中逐一调用这些模块。修改后满足要求的定义如下:
    class BasicBlock(nn.Module):
      expansion = 1
    
      def __init__(self,
                   inplanes,
                   planes,
                   stride=1,
                   downsample=None,
                   groups=1,
                   base_width=64,
                   dilation=1,
                   norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
          norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
          raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
          raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride
    
        # Use a functional module to replace '+'
        self.skip_add = functional.Add()
    
        # Additional defined module
        self.relu2 = nn.ReLU(inplace=True)
    
      def forward(self, x):
        identity = x
    
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu1(out)
    
        out = self.conv2(out)
        out = self.bn2(out)
    
        if self.downsample is not None:
          identity = self.downsample(x)
        
        # Use function module instead of '+'
        # out += identity
        out = self.skip_add(out, identity)
        out = self.relu2(out)
    
        return out
    
  3. 插入 QuantStubDeQuantStub

    使用 QuantStub 来量化网络的输入,使用 DeQuantStub 来反量化网络的输出。这样就会对单次前传中从 QuantStubDeQuantStub 的任意子网络进行量化。允许存在多对 QuantStub-DeQuantStub:

    class ResNet(nn.Module):
    
      def __init__(self,
                   block,
                   layers,
                   num_classes=1000,
                   zero_init_residual=False,
                   groups=1,
                   width_per_group=64,
                   replace_stride_with_dilation=None,
                   norm_layer=None):
        super(ResNet, self).__init__()
        if norm_layer is None:
          norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
    
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation iEachne:
          # each element in the tuple indicates if we should replace
          # the 2x2 stride with a dilated convolution instead
          replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
          raise ValueError(
              "replace_stride_with_dilation should be None "
              "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(
            3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(
            block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(
            block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(
            block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
    
        self.quant_stub = nndct_nn.QuantStub()
        self.dequant_stub = nndct_nn.DeQuantStub()
    
        for m in self.modules():
          if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
          elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
    
        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
          for m in self.modules():
            if isinstance(m, Bottleneck):
              nn.init.constant_(m.bn3.weight, 0)
            elif isinstance(m, BasicBlock):
              nn.init.constant_(m.bn2.weight, 0)
    
      def forward(self, x):
        x = self.quant_stub(x)
    
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
    
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
    
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = self.dequant_stub(x)
        return x
    
  4. 使用 QAT API 创建量化器并训练模型:
    def _resnet(arch, block, layers, pretrained, progress, **kwargs):
      model = ResNet(block, layers, **kwargs)
      if pretrained:
        #state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
        state_dict = torch.load(model_urls[arch])
        model.load_state_dict(state_dict)
      return model
    
    def resnet18(pretrained=False, progress=True, **kwargs):
      r"""ResNet-18 model from
        `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>'_
    
        Args:
            pretrained (bool): If True, returns a model pre-trained on ImageNet
            progress (bool): If True, displays a progress bar of the download to stderr
        """
      return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress,
                     **kwargs)
    
    model = resnet18(pretrained=True)
    
    # Generate dummy inputs.
    input = torch.randn([batch_size, 3, 224, 224], dtype=torch.float32)
    
    # Create a quantizer
    from pytorch_nndct import QatProcessor
    qat_processor = QatProcessor(model, inputs, bitwidth=8)
    quantized_model = qat_processor.trainable_model()optimizer = torch.optim.Adam(
            quantized_model.parameters(), 
            lr, 
            weight_decay=weight_decay)
    
    # Use the optimizer to train del, just like a normal float model.
    …
    
  5. 获取可部署模型并进行测试。

    完成 QAT 后,将量化模型转换为可部署模型。可部署模型的精度可能与量化模型的精度略有不同:

    output_dir = 'qat_result'
    deployable_model = qat_processor.to_deployable(quantized_model, output_dir)
    validate(val_loader, deployable_model, criterion, gpu)
  6. 从可部署模型导出 XMODEL。

    batch size=1 是 XMODEL 编译的必备条件:

    # Use CPU mode to export xmodel.
    deployable_model.cpu()
    val_subset = torch.utils.data.Subset(val_dataset, list(range(1)))
    subset_loader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=1,
        shuffle=False,
        num_workers=8,
        pin_memory=True)
    # Must forward deployable model at least 1 iteration with batch_size=1
    for images, _ in subset_loader:
      deployable_model(images)
    qat_processor.export_xmodel(output_dir)