argparse模塊
argparse 是 Python 內(nèi)置的一個用于命令行選項與參數(shù)解析的模塊,只要在程序中定義好我們需要的參數(shù),它就能幫我們完成解析。
1.解析器的創(chuàng)建argparse.ArgumentParser
parser=argparse.ArgumentParser(description='YOLO Detection')
2.添加參數(shù).add_argument
parser.add_argument('-v', '--version', default='yolo',help='yolo.')
這句話的意思是給 parser 添加 -v/--version 選項,其默認(rèn)值為 'yolo'。
參數(shù)列表ArgumentParser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
name or flags - 選項字符串的名字或者列表,例如 foo 或者 -f, --foo。
action - 命令行遇到參數(shù)時的動作,默認值是 store。
store_const,表示賦值為const;
append,將遇到的值存儲成列表,也就是如果參數(shù)重復(fù)則會保存多個值;
append_const,將參數(shù)規(guī)范中定義的一個值保存到一個列表;
count,存儲遇到的次數(shù);此外,也可以繼承 argparse.Action 自定義參數(shù)解析;
nargs - 應(yīng)該讀取的命令行參數(shù)個數(shù),可以是具體的數(shù)字,或者是?號,當(dāng)不指定值時對于 Positional argument 使用 default,對于 Optional argument 使用 const;或者是 * 號,表示 0 或多個參數(shù);或者是 + 號表示 1 或多個參數(shù)。
const - action 和 nargs 所需要的常量值。
default - 不指定參數(shù)時的默認值。
type - 命令行參數(shù)應(yīng)該被轉(zhuǎn)換成的類型。
choices - 參數(shù)可允許的值的一個容器。
required - 可選參數(shù)是否可以省略 (僅針對可選參數(shù))。
help - 參數(shù)的幫助信息,當(dāng)指定為 argparse.SUPPRESS 時表示不顯示該參數(shù)的幫助信息.
metavar - 在 usage 說明中的參數(shù)名稱,對于必選參數(shù)默認就是參數(shù)名稱,對于可選參數(shù)默認是全大寫的參數(shù)名稱.
dest - 解析后的參數(shù)名稱,默認情況下,對于可選參數(shù)選取最長的名稱,中劃線轉(zhuǎn)換為下劃線.
3.實例化parser.parse_args()
# Thin wrapper so train() can fetch the parsed command-line options.
# NOTE(review): relies on a module-level `parser` built earlier — confirm in full source.
def parse_args():
return parser.parse_args()
# Training entry point (excerpt): first step is reading the CLI arguments.
def train():
args = parse_args()
隨機數(shù)種子
設(shè)置隨機數(shù)種子,在gpu或cpu上固定每一次的訓(xùn)練結(jié)果,隨機數(shù)種子seed確定時,模型的訓(xùn)練結(jié)果將始終保持一致。
下面的五個語句都是設(shè)置隨機數(shù)種子,最后一個是 cuDNN 的確定性開關(guān),將這個數(shù)值置為 True 的話,每次返回的卷積算法將是確定的。
def setup_seed(seed):
    """Seed every RNG used during training (PyTorch CPU/GPU, NumPy, stdlib
    random) and force cuDNN to choose deterministic convolution algorithms,
    so repeated runs with the same seed reproduce the same results."""
    random.seed(seed)                    # Python built-in RNG
    np.random.seed(seed)                 # NumPy RNG
    torch.manual_seed(seed)              # PyTorch CPU RNG
    torch.cuda.manual_seed_all(seed)     # PyTorch RNG on every visible GPU
    # Deterministic cuDNN: always pick the same convolution algorithm.
    torch.backends.cudnn.deterministic = True

setup_seed(20)
調(diào)用cuda
# Pick the compute device: CUDA when requested via the --cuda flag, else CPU.
if args.cuda:
print('use cuda')
# benchmark mode lets cuDNN auto-tune conv algorithms for fixed input sizes
cudnn.benchmark = True
device = torch.device("cuda")
else:
print('use cpu')
device = torch.device("cpu")
顯存不夠,跑不了 GPU,于是用 CPU 跑的。
數(shù)據(jù)讀取
# Build the VOC training set: images at input_size, SSD-style augmentation,
# optional mosaic augmentation. VOCDetection presumably comes from voc0712.py
# — confirm against the imports in the full source.
dataset = VOCDetection(root=VOC_ROOT, img_size=input_size[0],
transform=SSDAugmentation(input_size),
mosaic=args.mosaic)
VOCDetection 這個數(shù)據(jù)集類是從 voc0712.py 中調(diào)用的。
網(wǎng)絡(luò)模型定義
# Build the detector (excerpt of myYOLO's __init__; the full class is below).
yolo_net = myYOLO(device, input_size=input_size, num_classes=args.num_classes, trainable=True, hr=hr)
class myYOLO(nn.Module):
# backbone: ResNet-18 with ImageNet-pretrained weights
self.backbone = resnet18(pretrained=True)
# neck: 1x1 conv -> SPP (4x channels via concat) -> CSP bottleneck back to 512
self.SPP = nn.Sequential(
Conv(512, 256, k=1),
SPP(),
BottleneckCSP(256*4, 512, n=1, shortcut=False)
)
# spatial attention over the 512-channel feature map
self.SAM = SAM(512)
self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)
# head: 1x1 conv predicting objectness(1) + class scores + box offsets(4)
self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)
骨干網(wǎng)絡(luò)是 resnet18,權(quán)重從 https://download.pytorch.org/models/resnet18-5c106cde.pth 下載。

neck部分是spp(Spatial Pyramid Pooling 空間金字塔池化)和 Bottleneck 瓶頸層
SPPNet的能夠接受任意尺寸圖片的輸入

Bottleneck使用的是1*1的卷積神經(jīng)網(wǎng)絡(luò),可以大幅減少計算量

然后用二維卷積預(yù)測.
用sgd做優(yōu)化器,epoch = 10,最大學(xué)習(xí)率0.001
開始訓(xùn)練
動態(tài)學(xué)習(xí)率,在訓(xùn)練伊始學(xué)習(xí)率很低,然后慢慢提到最大學(xué)習(xí)率0.001
# Per-epoch learning-rate schedule (training-loop excerpt).
# With --cos: cosine-anneal from base_lr toward 1e-5 between epoch 20 and
# max_epoch-20, then hold 1e-5; otherwise step-decay (x0.1) at the epochs
# listed in cfg['lr_epoch'].
if args.cos and epoch > 20 and epoch <= max_epoch - 20:
# use cos lr: cosine anneal between the warm and final phases
tmp_lr = 0.00001 + 0.5*(base_lr-0.00001)*(1+math.cos(math.pi*(epoch-20)*1./ (max_epoch-20)))
set_lr(optimizer, tmp_lr)
elif args.cos and epoch > max_epoch - 20:
# final phase: keep the minimum learning rate
tmp_lr = 0.00001
set_lr(optimizer, tmp_lr)
else:
# step schedule: decay by 10x at the configured epochs
if epoch in cfg['lr_epoch']:
tmp_lr = tmp_lr * 0.1
set_lr(optimizer, tmp_lr)
for iter_i, (images, targets) in enumerate(data_loader):
# WarmUp strategy for learning rate: during the first wp_epoch epochs the
# lr ramps up as base_lr * progress^4; at the first iteration after warmup
# it snaps to base_lr.
if not args.no_warm_up:
if epoch < args.wp_epoch:
tmp_lr = base_lr * pow((iter_i+epoch*epoch_size)*1. / (args.wp_epoch*epoch_size), 4)
# tmp_lr = 1e-6 + (base_lr-1e-6) * (iter_i+epoch*epoch_size) / (epoch_size * (args.wp_epoch))
set_lr(optimizer, tmp_lr)
elif epoch == args.wp_epoch and iter_i == 0:
tmp_lr = base_lr
set_lr(optimizer, tmp_lr)
Multi-scale Training 多尺度訓(xùn)練:預(yù)先定義幾個固定的尺度,訓(xùn)練時每隔 10 次迭代隨機選擇一個尺度。
# Multi-scale training: every 10 iterations pick a new square size in
# {320, 352, ..., 608} (random.randint(10, 19) * 32) and resize the batch.
if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale:
# randomly choose a new size
size = random.randint(10, 19) * 32
input_size = [size, size]
model.set_grid(input_size)
if args.multi_scale:
# interpolate: bilinear resize of the whole batch to the current input size
images = torch.nn.functional.interpolate(images, size=input_size, mode='bilinear', align_corners=False)

在降低batch和epoch后終于可以成功運行了。跑了兩天最后得到了模型,在測試的時候ap一直是-1,可能測試集設(shè)置得不對,測試的代碼還沒看懂,周末繼續(xù)。
resnet18代碼閱讀
殘差塊BasicBlock和Bottleneck
兩個差不多,一個是兩層卷積一個是三層

class BasicBlock(nn.Module):
    """Two-layer residual block used by ResNet-18/34.

    Computes relu(F(x) + shortcut), where F is conv3x3-BN-ReLU-conv3x3-BN and
    the shortcut is x itself, or downsample(x) (a 1x1-conv projection) when
    the spatial size or channel count changes.
    """

    expansion = 1  # output channels = planes * expansion

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        # The first 3x3 conv carries the stride (spatial downsampling, if any).
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        shortcut = x  # keep the input for the residual connection

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # When shapes differ, project x with the 1x1-conv branch before adding.
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut  # H(x) = F(x) + x (or + Wx)
        return self.relu(out)
感覺原理很簡單,就是在最后一層的relu之前,把中間結(jié)果再加上原始輸入數(shù)據(jù)(可能帶權(quán)重)
當(dāng)有1x1卷積核的時候,我們叫bottleneck,當(dāng)沒有1x1卷積核時,我們稱其為BasicBlock
第一階段
# ResNet trunk, stage 1 of 5 (excerpt; __init__ continues below): a 7x7
# stride-2 conv + BN + ReLU + stride-2 max-pool takes the 3-channel image
# to 64 channels at 1/4 resolution.
class ResNet(nn.Module):
def __init__(self, block, layers, zero_init_residual=False):
super(ResNet, self).__init__()
self.inplanes = 64
self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
bias=False)
self.bn1 = nn.BatchNorm2d(64)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
余下4個階段,這里的make layer是resnet內(nèi)部編寫的函數(shù),下面會說
# Stages 2-5: four residual stages built by _make_layer (defined below);
# layers[i] is the block count per stage, stages 3-5 stride-2 downsample.
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
何愷明提出了針對于relu的初始化方法,這個部分就是對每一層進行初始化,詳解看注釋
# Kaiming (He) initialization for conv layers, constant init for BatchNorm.
for m in self.modules():# walk every submodule of the network
if isinstance(m, nn.Conv2d):# conv layers: He normal init, suited to ReLU
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):# BN keeps pre-ReLU activations well-scaled
nn.init.constant_(m.weight, 1)# BN scale (weight) -> 1, shift (bias) -> 0
nn.init.constant_(m.bias, 0)


在每個殘差分支中初始化最后一個BN,Bottleneck和BasicBlock分別初始化
# Optionally zero the last BN's scale in every residual branch so each block
# starts as an identity mapping (Bottleneck's last BN is bn3, BasicBlock's bn2).
if zero_init_residual:# only when requested via the constructor flag
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0)
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0)
實現(xiàn)一個stage(卷積階段),block參數(shù)指定是兩層殘差塊或三層殘差塊,planes參數(shù)為該階段的基礎(chǔ)(輸出)通道數(shù),blocks說明該階段有幾個殘差塊
def _make_layer(self, block, planes, blocks, stride=1):
    """Build one ResNet stage: `blocks` residual blocks of type `block`.

    Args:
        block: residual block class (BasicBlock or Bottleneck).
        planes: base channel count of the stage.
        blocks: how many residual blocks to stack.
        stride: stride of the first block (2 => spatial downsampling).

    Only the first block may change resolution/channel count; when it does,
    a 1x1-conv + BN `downsample` branch projects the shortcut to match.
    """
    downsample = None
    if stride != 1 or self.inplanes != planes * block.expansion:
        # Shortcut must be projected so it can be added to the block output.
        downsample = nn.Sequential(
            conv1x1(self.inplanes, planes * block.expansion, stride),
            nn.BatchNorm2d(planes * block.expansion),
        )

    stage = [block(self.inplanes, planes, stride, downsample)]
    # All remaining blocks see the widened channel count, stride 1.
    self.inplanes = planes * block.expansion
    stage.extend(block(self.inplanes, planes) for _ in range(blocks - 1))
    return nn.Sequential(*stage)
從forward里最能直觀的觀察網(wǎng)絡(luò)的結(jié)構(gòu)了
def forward(self, x):
    """Run the ResNet trunk and return the (C_3, C_4, C_5) feature maps —
    the outputs of the last three residual stages — for the detection head."""
    # Stage 1: 7x7 stride-2 conv + BN + ReLU + stride-2 max-pool.
    stem = self.maxpool(self.relu(self.bn1(self.conv1(x))))
    # Stages 2-5 (layer1..layer4), each feeding the next.
    c2 = self.layer1(stem)
    c3 = self.layer2(c2)
    c4 = self.layer3(c3)
    c5 = self.layer4(c4)
    return c3, c4, c5
resnet共有五個階段,其中第一階段為一個7x7的卷積處理,stride為2,然后經(jīng)過池化處理,此時特征圖的尺寸已成為輸入的1/4,接下來是四個階段,也就是代碼中的layer1234

只有l(wèi)ayer2,layer3,layer4只在第一個BasicBlock進行downsample下采樣,layer1沒有下采樣。每個layer包含2個BasicBlock,1個BasicBlock中有2次卷積。
def resnet18(pretrained=False, **kwargs):
    """Construct a ResNet-18 model.

    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    # ResNet-18 = BasicBlock with two blocks in each of the four stages.
    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
    if not pretrained:
        return model
    # Download (or load from cache) the ImageNet weights and apply them.
    model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
    return model

ResNet的參數(shù)很直接 def init(self, block, layers, num_classes=1000, zero_init_residual=False):參數(shù)block指明殘差塊是兩層或三層,參數(shù)layers指明每個卷積層需要的殘差塊數(shù)量,num_classes指明分類數(shù),zero_init_residual是否初始化為0,其中resnet18中[2, 2, 2, 2]對應(yīng)的就是layer1234的參數(shù),resnet18的block是BasicBlock,resnet50以上的網(wǎng)絡(luò)使用了Bottleneck。
yolo檢測器
先要提前定義幾個模塊
1.空間注意力
class SAM(nn.Module):
    """Parallel CBAM-style spatial attention.

    A 1x1 conv + sigmoid produces an attention map with the same shape as
    the input, which then re-weights the input element-wise.
    """

    def __init__(self, in_ch):
        super(SAM, self).__init__()
        # 1x1 conv keeps the channel count; sigmoid squashes to (0, 1) weights.
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, in_ch, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Spatial Attention Module: return x scaled by its attention map."""
        weights = self.conv(x)
        return x * weights
從x * x_attention很直觀的就能看出,就是來算feature map權(quán)重x_attention
2.空間金字塔
class SPP(nn.Module):
    """Spatial Pyramid Pooling.

    Concatenates the input with three stride-1 max-pools (kernels 5, 9, 13;
    padding k//2 preserves H x W), multiplying the channel count by 4 while
    leaving the spatial size unchanged.
    """

    def __init__(self):
        super(SPP, self).__init__()

    def forward(self, x):
        # Each pool keeps the input's spatial size thanks to padding k // 2.
        pools = [
            torch.nn.functional.max_pool2d(x, k, stride=1, padding=k // 2)
            for k in (5, 9, 13)
        ]
        return torch.cat([x] + pools, dim=1)
先算三個尺度下的池化,然后 torch.cat([x, x_1, x_2, x_3], dim=1) 表示沿維度1(通道維)拼接起來
3.BottleneckCSP CrossStagePartial跨階段瓶頸
class BottleneckCSP(nn.Module):
# CSP Bottleneck https://github.com/WongKinYiu/CrossStagePartialNetworks
# __init__ excerpt (forward not shown here). cv1 feeds the Bottleneck stack
# `m` then cv3; cv2 is the parallel partial branch.
# NOTE(review): the cv2/cv3 outputs are presumably concatenated, BN+act'd,
# then fused by cv4 back to c2 channels — confirm against the full source.
def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5): # ch_in, ch_out, number, shortcut, groups, expansion
super(BottleneckCSP, self).__init__()
c_ = int(c2 * e) # hidden channels
self.cv1 = Conv(c1, c_, k=1)
self.cv2 = nn.Conv2d(c1, c_, kernel_size=1, bias=False)
self.cv3 = nn.Conv2d(c_, c_, kernel_size=1, bias=False)
self.cv4 = Conv(2 * c_, c2, k=1)
self.bn = nn.BatchNorm2d(2 * c_) # applied to cat(cv2, cv3)
self.act = nn.LeakyReLU(0.1, inplace=True)
self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])
這個模塊出自 yolov4 和 cspnet 的作者。
本文中YOLO結(jié)構(gòu)
# Single-scale YOLO detector: ResNet-18 backbone -> SPP/SAM/CSP neck ->
# 1x1-conv head. forward() returns losses when trainable, decoded
# detections otherwise.
class myYOLO(nn.Module):
def __init__(self, device, input_size=None, num_classes=20, trainable=False, conf_thresh=0.001, nms_thresh=0.5, hr=False):
super(myYOLO, self).__init__()
self.device = device # compute device the model and tensors live on
# hyper-parameters and inference thresholds
self.num_classes = num_classes
self.trainable = trainable
self.conf_thresh = conf_thresh
self.nms_thresh = nms_thresh
self.stride = 32
# pre-computed grid-cell coordinates used when decoding bbox predictions
self.grid_cell = self.create_grid(input_size)
self.input_size = input_size
# [w, h, w, h] scale for normalizing decoded boxes to [0, 1]
self.scale = np.array([[[input_size[1], input_size[0], input_size[1], input_size[0]]]])
self.scale_torch = torch.tensor(self.scale.copy(), device=device).float()
# we use resnet18 as backbone
self.backbone = resnet18(pretrained=True)
# neck: 1x1 conv -> SPP (4x channels via concat) -> CSP bottleneck to 512
self.SPP = nn.Sequential(
Conv(512, 256, k=1),
SPP(),
BottleneckCSP(256*4, 512, n=1, shortcut=False)
)
self.SAM = SAM(512)
self.conv_set = BottleneckCSP(512, 512, n=3, shortcut=False)
# head: objectness(1) + class scores + box offsets(4) per grid cell
self.pred = nn.Conv2d(512, 1 + self.num_classes + 4, 1)
def forward(self, x, target=None):
# backbone: only the deepest feature map C_5 is used
_, _, C_5 = self.backbone(x)
# head (neck): SPP -> spatial attention -> CSP conv block
C_5 = self.SPP(C_5)
C_5 = self.SAM(C_5)
C_5 = self.conv_set(C_5)
# pred: flatten the spatial grid to [B, H*W, 1 + num_classes + 4]
prediction = self.pred(C_5)
prediction = prediction.view(C_5.size(0), 1 + self.num_classes + 4, -1).permute(0, 2, 1)
B, HW, C = prediction.size()
# Divide prediction to obj_pred, txtytwth_pred and cls_pred
# [B, H*W, 1]
conf_pred = prediction[:, :, :1]
# [B, H*W, num_cls]
cls_pred = prediction[:, :, 1 : 1 + self.num_classes]
# [B, H*W, 4]
txtytwth_pred = prediction[:, :, 1 + self.num_classes:]
# test (inference) path: decode, normalize and post-process detections
if not self.trainable:
with torch.no_grad():
# batch size = 1
all_conf = torch.sigmoid(conf_pred)[0] # index 0: inference assumes a single-image batch
all_bbox = torch.clamp((self.decode_boxes(txtytwth_pred) / self.scale_torch)[0], 0., 1.)
all_class = (torch.softmax(cls_pred[0, :, :], 1) * all_conf)
# separate box pred and class conf, move to CPU for post-processing
all_conf = all_conf.to('cpu').numpy()
all_class = all_class.to('cpu').numpy()
all_bbox = all_bbox.to('cpu').numpy()
bboxes, scores, cls_inds = self.postprocess(all_bbox, all_class)
return bboxes, scores, cls_inds
else:
# training path: delegate loss computation to tools.loss
conf_loss, cls_loss, txtytwth_loss, total_loss = tools.loss(pred_conf=conf_pred, pred_cls=cls_pred,
pred_txtytwth=txtytwth_pred,
label=target)
return conf_loss, cls_loss, txtytwth_loss, total_loss