VGG: Deeper Networks, Regular Design

  • Core idea: go deeper with more convolutional layers, built from standardized, repeated convolution blocks

  • VGG design rules:

    1. All conv are 3x3 stride 1 pad 1
    2. All max pool are 2x2 stride 2
    3. After pool, double #channels, preserving the amount of information while the spatial resolution is halved
  • VGG's 5 stages:

    1. Stage 1: conv-conv-pool
    2. Stage 2: conv-conv-pool
    3. Stage 3: conv-conv-conv-[conv]-pool
    4. Stage 4: conv-conv-conv-[conv]-pool
    5. Stage 5: conv-conv-conv-[conv]-pool
  • VGG19 has one more conv layer than VGG16 in each of stages 3, 4, and 5

image.png

  • Why 3x3 layers?

    1. Standardized, uniform layers are simple to implement and easy to optimize and accelerate
    2. Stacked 3x3 convolutions cover the receptive field of a larger kernel: two 3x3 layers – 5x5 receptive field; three 3x3 layers – 7x7 receptive field
    3. Stacking several 3x3 convolutions inserts more nonlinearities, giving greater representational power
    4. Fewer parameters for the same receptive field: Conv(5x5, C → C): 25C² parameters; two Conv(3x3, C → C): 2 × 9C² = 18C² parameters (see the quick check after this list)
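
  • A quick sanity check of the parameter claim (a minimal sketch; the channel count C = 64 is an arbitrary choice for illustration):

import torch
import torch.nn as nn

C = 64  # arbitrary channel count, for illustration only
conv5 = nn.Conv2d(C, C, kernel_size=5, padding=2, bias=False)
stack3 = nn.Sequential(                      # two 3x3 convs cover a 5x5 receptive field
    nn.Conv2d(C, C, kernel_size=3, padding=1, bias=False),
    nn.ReLU(inplace=True),
    nn.Conv2d(C, C, kernel_size=3, padding=1, bias=False),
)
print(sum(p.numel() for p in conv5.parameters()))   # 25 * C^2 = 102400
print(sum(p.numel() for p in stack3.parameters()))  # 18 * C^2 = 73728
x = torch.randn(1, C, 32, 32)
print(conv5(x).shape, stack3(x).shape)              # same spatial size: [1, 64, 32, 32]
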
  • VGG16:

import torch
import torch.nn as nn

class VGG16(nn.Module):
    def __init__(self, num_classes=1000):
        super(VGG16, self).__init__()

        # Convolutional part (13 conv layers)
        self.features = nn.Sequential(
            # Conv group 1
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv group 2
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv group 3
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv group 4
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Conv group 5
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )

        # Fully connected part (3 FC layers)
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),   # input is 512*7*7, output 4096
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),          # second FC layer, output 4096
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),   # output the number of classes
        )

        self.init_parameters()

    def init_parameters(self):
        # Initialize conv layer weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

            # Initialize fully connected layer weights
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, mean=0, std=0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        x = self.features(x)       # convolutional part
        x = torch.flatten(x, 1)    # flatten
        x = self.classifier(x)     # fully connected part
        return x
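
  • A minimal smoke test of the class above (assumes the standard 224x224 ImageNet input size):

model = VGG16(num_classes=1000)
x = torch.randn(1, 3, 224, 224)   # 224x224 input -> 7x7 feature map after 5 poolings
print(model(x).shape)             # torch.Size([1, 1000])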

GoogLeNet

  • Core ideas: multi-branch modules; global average pooling; classifiers at intermediate layers

image.png

GoogLeNet #1: Inception Module

  • Inception module: a local unit with parallel branches. Since the best convolution size cannot be determined in advance, different branches use different kernel sizes, and the network effectively learns which branches to rely on

  • Use 1x1 "bottleneck" convolutions: they add extra nonlinearity and shrink the number of channels fed into the expensive 3x3/5x5 convolutions, reducing computation (see the quick check below)
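
  • A rough sense of the savings (a hedged sketch with made-up channel numbers, not the exact GoogLeNet configuration): reducing 256 input channels to 64 with a 1x1 conv before a 3x3 conv cuts the weight count of that path by roughly 4x.

import torch.nn as nn

def n_params(m):
    return sum(p.numel() for p in m.parameters())

direct = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)   # 3x3 conv at full width
bottleneck = nn.Sequential(                                          # 1x1 reduce, then 3x3
    nn.Conv2d(256, 64, kernel_size=1, bias=False),
    nn.ReLU(inplace=True),
    nn.Conv2d(64, 256, kernel_size=3, padding=1, bias=False),
)
print(n_params(direct))      # 9*256*256          = 589824
print(n_params(bottleneck))  # 256*64 + 9*64*256  = 163840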

image.png

GoogLeNet #2: Global Average Pooling

  • No large FC layers at the end, which greatly reduces the parameter count

  • Global average pooling collapses the spatial dimensions, and a single linear layer then produces the class scores (see the comparison below)
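
  • Parameter comparison between a VGG/AlexNet-style flatten+FC head and a GoogLeNet-style global-average-pooling head (a minimal sketch; the 7x7x1024 feature-map size matches GoogLeNet's final stage):

import torch.nn as nn

flatten_fc = nn.Linear(1024 * 7 * 7, 1000)              # flatten the 7x7x1024 map, then FC
gap_fc = nn.Sequential(
    nn.AdaptiveAvgPool2d((1, 1)),                       # global average pooling: 7x7 -> 1x1
    nn.Flatten(),
    nn.Linear(1024, 1000),                              # single small linear layer
)
print(sum(p.numel() for p in flatten_fc.parameters()))  # ~50.2M parameters
print(sum(p.numel() for p in gap_fc.parameters()))      # ~1.0M parameters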

GoogLeNet #3: Auxiliary Classifiers

  • Attach global average pooling plus a classifier at intermediate layers so that gradients can flow from deep positions back to the shallow layers

  • Training with a loss only at the very end of the network works poorly: the network is too deep and gradients do not propagate well

  • Deep supervision: attach "auxiliary classifiers" at several intermediate points in the network; each tries to classify the image and receives its own loss (a sketch of such a head appears below)

  • With BatchNorm, this trick is no longer necessary
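
  • A sketch of what one auxiliary head can look like (modeled loosely on torchvision's InceptionAux; the 4x4 pooled size, 128-channel reduce, and 1024 hidden width are assumptions, not taken from the notes above):

import torch
import torch.nn as nn
import torch.nn.functional as F

class AuxClassifier(nn.Module):
    """Attached to an intermediate feature map; produces its own class scores and loss."""
    def __init__(self, in_channels, num_classes=1000):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, 128, kernel_size=1)   # 1x1 reduce
        self.fc1 = nn.Linear(128 * 4 * 4, 1024)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = F.adaptive_avg_pool2d(x, (4, 4))     # pool the intermediate map down to 4x4
        x = F.relu(self.conv(x))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=0.7, training=self.training)
        return self.fc2(x)                       # auxiliary logits; its loss is added to the total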

  • GoogLeNet:

import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicConv2d(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv(x)
        x = self.relu(x)
        return x

class Inception(nn.Module):
    def __init__(self, in_channels, c1x1, c3x3_reduce, c3x3, c5x5_reduce, c5x5, pool_proj):
        super(Inception, self).__init__()
        # 1x1 convolution branch
        self.branch1 = BasicConv2d(in_channels, c1x1, kernel_size=1)
        # 1x1 convolution followed by 3x3 convolution branch
        self.branch2 = nn.Sequential(
            BasicConv2d(in_channels, c3x3_reduce, kernel_size=1),
            BasicConv2d(c3x3_reduce, c3x3, kernel_size=3, padding=1)
        )
        # 1x1 convolution followed by 5x5 convolution branch
        self.branch3 = nn.Sequential(
            BasicConv2d(in_channels, c5x5_reduce, kernel_size=1),
            BasicConv2d(c5x5_reduce, c5x5, kernel_size=5, padding=2)
        )
        # 3x3 max pooling followed by 1x1 convolution branch
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            BasicConv2d(in_channels, pool_proj, kernel_size=1)
        )

    def forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)
        return torch.cat([branch1, branch2, branch3, branch4], 1)

class GoogLeNet(nn.Module):
    def __init__(self, num_classes=1000):
        super(GoogLeNet, self).__init__()
        self.conv1 = BasicConv2d(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.conv2 = nn.Sequential(
            BasicConv2d(64, 64, kernel_size=1),
            BasicConv2d(64, 192, kernel_size=3, padding=1)
        )
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        x = self.inception3a(x)
        x = self.inception3b(x)
        x = self.maxpool3(x)
        x = self.inception4a(x)
        x = self.inception4b(x)
        x = self.inception4c(x)
        x = self.inception4d(x)
        x = self.inception4e(x)
        x = self.maxpool4(x)
        x = self.inception5a(x)
        x = self.inception5b(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x
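
  • A minimal smoke test (assuming a 224x224 input; this simplified version omits the auxiliary classifiers discussed above):

model = GoogLeNet(num_classes=1000)
x = torch.randn(1, 3, 224, 224)
print(model(x).shape)   # torch.Size([1, 1000]); the final feature map is 7x7x1024 before avgpool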

Deep Networks are Hard to Train

  • Networks still cannot be made arbitrarily deep: vanishing and exploding gradients appear

Weight Initialization

Gradient Vanishing

  • Multiplying by weights with too-small variance drives the values toward 0; in a deep network the later layers' activations are all (near) zero, so the gradients are zero as well — the vanishing-gradient problem

image.png

image.png

Gradient Exploding

  • Too-large variance means large values keep multiplying together, pushing activations into the saturated region of the nonlinearity (see the figures below) or making the gradients explode (see the toy experiment below)

image.png

image.png
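
  • A toy experiment illustrating both failure modes (a rough sketch; the depth of 50, width of 512, and tanh activation are arbitrary choices): with a too-small weight scale the activations collapse toward 0 layer by layer, while a too-large scale saturates tanh near ±1; in both regimes the gradients die.

import torch

def forward_stats(weight_std, depth=50, dim=512):
    x = torch.randn(1000, dim)
    for _ in range(depth):
        w = torch.randn(dim, dim) * weight_std
        x = torch.tanh(x @ w)
    return x.std().item()

print(forward_stats(0.01))                # tiny scale  -> activations shrink toward 0 (vanishing)
print(forward_stats(0.5))                 # large scale -> tanh saturates near ±1, gradients ~0
print(forward_stats((1.0 / 512) ** 0.5))  # 1/sqrt(N) scale keeps a healthy spread (next section)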

Xavier Initialization

  • "Xavier" initialization: Var(w) = 1 / N, where N is the dimension of x (the fan-in); derived below using a fully connected layer as the example:

image.png

  • The Xavier derivation does not account for the activation function; it works best with tanh, but other activations can be used as well

  • For convolutional layers, treat the convolution as many small fully connected layers; in that case N = kernel_size² × (input_channels + output_channels) / 2

image.png

image.png

Kaiming Initialization

  • With many layers and ReLU activations, Xavier initialization still leads to gradient problems

  • Kaiming initialization: Var(w) = 2 / N, where N is the dimension of x (for ReLU); derived below using a fully connected layer as the example:

image.png

  • Kaiming initialization is best matched to the ReLU activation (see the toy experiment below)
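
  • The same kind of toy experiment, now with ReLU instead of tanh (a rough sketch; depth and width are arbitrary): with Xavier's Var(w) = 1/N the activation scale shrinks a little at every ReLU layer, while Kaiming's Var(w) = 2/N compensates for the half of the units that ReLU zeroes out.

import torch

def relu_stats(var, depth=50, dim=512):
    x = torch.randn(1000, dim)
    for _ in range(depth):
        w = torch.randn(dim, dim) * (var / dim) ** 0.5   # Var(w) = var / N
        x = torch.relu(x @ w)
    return x.std().item()

print(relu_stats(1.0))   # Xavier-style Var(w) = 1/N: the std decays toward 0 with depth
print(relu_stats(2.0))   # Kaiming Var(w) = 2/N: the std stays roughly constant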

Batch Normalization

  • Core idea: normalize each layer's outputs "on the fly" to zero mean and unit variance; normalizing the internal features stabilizes training and reduces vanishing and exploding gradients

  • Normalizing with the mean and variance of the entire dataset is too expensive; instead, normalize a mini-batch of activations like this:

image.png

  • This formula is differentiable, so it can be used as a layer in the network

  • Input: x ∈ R^(N×D); compute the per-batch mean and variance on the fly:

image.png

  • Add learnable scale and shift parameters in R^D so the layer can represent any output distribution

  • Keep running averages of the μ and σ seen during training; per-batch statistics cannot be used at test time, so these running averages are used instead (a minimal implementation sketch follows)
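
  • A minimal BatchNorm1d sketch that makes the train/test difference explicit (simplified; nn.BatchNorm1d is the real thing):

import torch
import torch.nn as nn

class SimpleBatchNorm1d(nn.Module):
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))    # learnable scale
        self.beta = nn.Parameter(torch.zeros(dim))    # learnable shift
        self.register_buffer('running_mean', torch.zeros(dim))
        self.register_buffer('running_var', torch.ones(dim))
        self.eps, self.momentum = eps, momentum

    def forward(self, x):                             # x: (N, D)
        if self.training:
            mean, var = x.mean(dim=0), x.var(dim=0, unbiased=False)
            with torch.no_grad():                     # track running stats for test time
                self.running_mean.lerp_(mean, self.momentum)
                self.running_var.lerp_(var, self.momentum)
        else:                                         # test time: use the running averages
            mean, var = self.running_mean, self.running_var
        x_hat = (x - mean) / torch.sqrt(var + self.eps)
        return self.gamma * x_hat + self.beta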

Batch Normalization for CNNs

image.png

  • For convolutional layers we want the normalization to respect the convolutional property, so that different elements of the same feature map, at different spatial locations, are normalized in the same way

  • BN therefore computes its statistics per channel, over the batch and spatial dimensions (see the check below)

  • Usually inserted after a fully connected or convolutional layer and before the nonlinearity

  • Makes deep networks much easier to train; allows higher learning rates and faster convergence

  • Makes the network more robust to initialization

  • Acts as a regularizer during training

  • Downsides: poorly understood theoretically; behaves differently in training and test modes, which is a common source of bugs

image.png
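
  • A quick check that BatchNorm2d really normalizes per channel over (N, H, W) (a sketch using affine=False so only the normalization itself is compared):

import torch
import torch.nn as nn

x = torch.randn(8, 16, 32, 32)                     # (N, C, H, W)
bn = nn.BatchNorm2d(16, affine=False).train()
out = bn(x)

mean = x.mean(dim=(0, 2, 3), keepdim=True)         # one mean/var per channel
var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True)
manual = (x - mean) / torch.sqrt(var + bn.eps)
print(torch.allclose(out, manual, atol=1e-5))      # True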

Group Normalization

  • BN's error increases rapidly as the batch size shrinks

  • GN's computation is independent of the batch size, and its accuracy stays stable over a wide range of batch sizes, even a batch size of 2

image.png

  • GN divides the channels into groups and computes the mean and variance within each group (see the usage sketch below)
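
  • GroupNorm in PyTorch (a small usage sketch; splitting 32 channels into 8 groups is an arbitrary choice): statistics are computed per sample within each group of channels, so a batch size of 2 (or even 1) is fine.

import torch
import torch.nn as nn

gn = nn.GroupNorm(num_groups=8, num_channels=32)   # 32 channels split into 8 groups of 4
x = torch.randn(2, 32, 56, 56)                     # tiny batch: GN does not care
print(gn(x).shape)                                 # torch.Size([2, 32, 56, 56])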

Deep Residual Learning

  • With Batch Normalization and proper initialization, we can train networks with 10+ layers

  • Problem: the deeper model does worse than a shallow model

  • As network depth increases, both training and test error tend to first decrease and then rise again — the so-called "degradation problem". This is not caused by overfitting, since the error increases with depth even on the training set. ResNet addresses this by introducing residual (shortcut) connections

image.png

Residual Networks

  • A deeper model can emulate a shallower one: copy the layers from the shallower model and set the extra layers to the identity mapping f(x) = x. A deeper model should therefore be at least as good as the shallower one

  • Hypothesis: this is an optimization problem. Deeper models are harder to optimize, and in particular fail to learn the identity functions needed to emulate a shallow model

  • Solution: change the network so that learning identity functions with the extra layers is easy

  • In a traditional deep network, the stacked layers must learn the full mapping directly; as depth grows, learning such a complex mapping becomes hard and performance can degrade. ResNet proposes a different formulation: decompose the target mapping into the input plus a residual, i.e. H(x) = F(x) + x

  • Here H(x) is the desired mapping, F(x) is the residual function to be learned, and x is the input. The network only needs to learn the difference between input and output rather than the full mapping, which makes very deep networks easier to train and avoids the degradation problem

  • Because the residual connection adds the input directly to the output, gradients can flow straight back to earlier layers during backpropagation, alleviating vanishing and exploding gradients

image.png

  • A residual network is a stack of many residual blocks

  • Regular design, like VGG: each residual block has two 3x3 convolutions

  • The network is divided into stages: the first block of each stage halves the resolution (with a stride-2 conv) and doubles the number of channels

image.png

  • The skip connection is the identity when shapes match; when the first block of a stage changes the number of channels, it is implemented as a 1x1 convolution to match shapes

  • The bottleneck residual block (1x1 → 3x3 → 1x1) is deeper yet uses fewer parameters and less computation

  • Batch Normalization is applied between each conv and its ReLU

image.png

  • ResNet:
import torch
import torch.nn as nn
import torch.nn.functional as F

def conv3x3(in_channels, out_channels, stride=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)

def conv1x1(in_channels, out_channels, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)

class BasicBlock(nn.Module):
    expansion = 1  # multiplier for the number of output channels

    def __init__(self, in_channels, out_channels, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride)
        self.bn1 = nn.BatchNorm2d(out_channels)  # batch normalization
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(out_channels, out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)  # batch normalization
        self.downsample = downsample

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out

class ResNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNet, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)  # batch normalization
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.in_channels, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),  # batch normalization
            )
        layers = []
        layers.append(block(self.in_channels, planes, stride, downsample))
        self.in_channels = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x
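
  • The standard depths follow from the layers argument (a minimal sketch; ResNet-50/101/152 would need the three-conv Bottleneck block, which is not defined above):

resnet18 = ResNet(BasicBlock, [2, 2, 2, 2])   # 2 blocks per stage
resnet34 = ResNet(BasicBlock, [3, 4, 6, 3])

x = torch.randn(1, 3, 224, 224)
print(resnet18(x).shape)                      # torch.Size([1, 1000])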

Improving ResNet: ResNeXt

  • FLOPs: the number of floating-point operations

  • Perform the convolution in parallel groups (grouped convolution), which is more efficient

image.png

  • Efficiency improves by a factor of G, where G is the number of groups (see the parameter check below)
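
  • Grouped convolution in a nutshell (a sketch; the channel counts are arbitrary): each group only connects its own slice of input channels to its own slice of output channels, so the weight count and FLOPs drop by a factor of G.

import torch.nn as nn

dense = nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False)
grouped = nn.Conv2d(256, 256, kernel_size=3, padding=1, groups=32, bias=False)
print(sum(p.numel() for p in dense.parameters()))    # 9*256*256      = 589824
print(sum(p.numel() for p in grouped.parameters()))  # 9*256*256 / 32 = 18432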

  • ResNeXt:

import torch
import torch.nn as nn
import torch.nn.functional as F

def conv3x3(in_channels, out_channels, stride=1, groups=1):
    """3x3 convolution with padding and optional grouping"""
    return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, groups=groups, bias=False)

def conv1x1(in_channels, out_channels, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)

class Bottleneck(nn.Module):
    expansion = 4  # multiplier for the number of output channels

    def __init__(self, in_channels, out_channels, stride=1, downsample=None, groups=32, width_per_group=4):
        super(Bottleneck, self).__init__()
        width = int(out_channels * (width_per_group / 64.)) * groups  # total width of the grouped 3x3 conv

        self.conv1 = conv1x1(in_channels, width)
        self.bn1 = nn.BatchNorm2d(width)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(width, width, stride, groups)
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = conv1x1(width, out_channels * self.expansion)
        self.bn3 = nn.BatchNorm2d(out_channels * self.expansion)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)
        return out

class ResNeXt(nn.Module):
    def __init__(self, block, layers, num_classes=1000):
        super(ResNeXt, self).__init__()
        self.in_channels = 64
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.in_channels != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.in_channels, planes * block.expansion, stride),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.in_channels, planes, stride, downsample))
        self.in_channels = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.in_channels, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x
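
  • With the defaults above (groups=32, width_per_group=4), the standard configuration is (a minimal sketch):

resnext50_32x4d = ResNeXt(Bottleneck, [3, 4, 6, 3])   # ResNeXt-50 (32x4d)
x = torch.randn(1, 3, 224, 224)
print(resnext50_32x4d(x).shape)                       # torch.Size([1, 1000])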

DenseNet

  • In DenseNet, every layer is connected to all subsequent layers

  • For each layer, the feature maps of all preceding layers are concatenated (whereas ResNet sums them) and used as input, and its own feature maps are passed on as input to all subsequent layers; these concatenated connections also give gradients direct paths back to earlier layers (see the sketch below)

image.png

image.png
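
  • The core operation is concatenation rather than addition (a minimal dense-block sketch; the growth rate of 32 and the BN-ReLU-Conv ordering follow the DenseNet paper, everything else is simplified):

import torch
import torch.nn as nn

class DenseLayer(nn.Module):
    def __init__(self, in_channels, growth_rate=32):
        super().__init__()
        self.bn = nn.BatchNorm2d(in_channels)
        self.conv = nn.Conv2d(in_channels, growth_rate, kernel_size=3, padding=1, bias=False)

    def forward(self, x):
        new_features = self.conv(torch.relu(self.bn(x)))
        return torch.cat([x, new_features], dim=1)    # concatenate, don't add: channels grow each layer

block = nn.Sequential(DenseLayer(64), DenseLayer(64 + 32), DenseLayer(64 + 64))
print(block(torch.randn(1, 64, 56, 56)).shape)        # torch.Size([1, 160, 56, 56])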

Squeeze-and-Excitation Networks (SENet)

  • Recalibrate ("excite") the channels so that each channel responds to global context, adding a global view to every ResNet block (see the sketch below)

image.png
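
  • A minimal SE block (a sketch; the reduction ratio of 16 follows the SENet paper, the rest is simplified): squeeze with global average pooling, excite with a two-layer MLP and a sigmoid, then rescale each channel.

import torch
import torch.nn as nn

class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc = nn.Sequential(
            nn.Linear(channels, channels // reduction),
            nn.ReLU(inplace=True),
            nn.Linear(channels // reduction, channels),
            nn.Sigmoid(),
        )

    def forward(self, x):                       # x: (N, C, H, W)
        n, c, _, _ = x.shape
        s = x.mean(dim=(2, 3))                  # squeeze: global average pooling -> (N, C)
        w = self.fc(s).view(n, c, 1, 1)         # excitation: per-channel weights in (0, 1)
        return x * w                            # recalibrate the channels

se = SEBlock(256)
print(se(torch.randn(2, 256, 14, 14)).shape)    # torch.Size([2, 256, 14, 14])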