deeplab
Encoder-Decoder with Atrous convolution(空洞卷積)
空洞卷積能夠在深度卷積神經(jīng)網(wǎng)絡(luò)中控制特征響應(yīng)的分辨率,并調(diào)節(jié)卷積核的視野域(感受野)以獲取多尺度的信息。對于二維信號,有 $y[i]=\sum_{k} x[i+r\cdot k]\, w[k]$,
其中空洞率 r 是采樣輸入信號的步長。
深度可分離卷積是depthwise conv(每個輸入通道的單獨的空間卷積)加上pointwise(結(jié)合depthwise的輸出)。
Deeplabv3 as encoder用Logit前面一個特征作為encoder的輸出,這個特征包括256個通道和豐富的語義特征。
decoder:encoder的輸出首先經(jīng)過因子為4的雙線性插值上采樣,然后與相應(yīng)的低級特征連接,之后經(jīng)過幾個的卷積層。
Atrous Spatial Pyramid Pooling
有四個平行的不同的空洞卷積相連,不同的空洞率獲取不同尺度的信息。
ASPP包括兩個部分:多尺度空洞卷積和圖像級特征。多尺度空洞卷積包括1×1的普通卷積,
空洞率為6的空洞卷積,
空洞率為12的空洞卷積,
空洞率為18的空洞卷積;圖像級特征,對輸入在[1,2]維上求均值,經(jīng)過1×1的普通卷積,
再使用雙線性插值resize到輸入圖像的大小,最后將4個卷積和image level feature 連接起來,最后再經(jīng)過一個1×1
的卷積得到網(wǎng)絡(luò)的輸出。
def ASPP(inputs, depth=256, atrous_rates=(6, 12, 18)):
    """Atrous Spatial Pyramid Pooling.

    Runs four parallel convolutions (one 1x1 and three 3x3 atrous
    convolutions) plus image-level (globally average-pooled) features,
    concatenates all five branches along the channel axis, and fuses
    them with a final 1x1 convolution.

    Args:
        inputs: 4-D feature tensor, NHWC layout (assumed from the use of
            spatial axes [1, 2] below -- TODO confirm against base_model).
        depth: number of output channels for every branch. Was a free
            variable in the original; defaults to the paper's 256.
        atrous_rates: dilation rates for the three 3x3 branches; the
            rates are doubled when output stride = 8 (per the paper).

    Returns:
        A tensor with `depth` channels and the same spatial size as `inputs`.
    """
    # NOTE(review): the original signature was `def ASPP(input):` while the
    # body consistently used `inputs` -- fixed to `inputs`.
    inputs_size = tf.shape(inputs)[1:3]
    # (a) one 1x1 convolution and three 3x3 atrous convolutions.
    conv_1x1 = layers_lib.conv2d(inputs, depth, [1, 1], stride=1, scope="conv_1x1")
    conv_3x3_1 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[0], scope='conv_3x3_1')
    conv_3x3_2 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[1], scope='conv_3x3_2')
    conv_3x3_3 = layers_lib.conv2d(inputs, depth, [3, 3], stride=1, rate=atrous_rates[2], scope='conv_3x3_3')
    # (b) image-level features: global average pool over the spatial dims;
    # keepdims=True keeps the result 4-D so it can be convolved and resized.
    image_level_features = tf.reduce_mean(inputs, [1, 2], name='global_average_pooling', keepdims=True)
    # 1x1 convolution with `depth` filters (batch norm presumably comes from
    # an enclosing arg scope -- not visible here).
    image_level_features = layers_lib.conv2d(image_level_features, depth, [1, 1], stride=1, scope='conv_1x1')
    # Bilinearly upsample the pooled features back to the input spatial size.
    image_level_features = tf.image.resize_bilinear(image_level_features, inputs_size, name='upsample')
    # Concatenate the five branches on the channel axis and fuse with 1x1 conv.
    net = tf.concat([conv_1x1, conv_3x3_1, conv_3x3_2, conv_3x3_3, image_level_features], axis=3, name='concat')
    net = layers_lib.conv2d(net, depth, [1, 1], stride=1, scope='conv_1x1_concat')
    return net
損失函數(shù)
網(wǎng)絡(luò)輸出的是 pixel-wise 的 softmax,即為 $p_k(x)=\exp(a_k(x))/\sum_{k'=1}^{K}\exp(a_{k'}(x))$,
其中,x 為二維平面上的像素位置,$a_k(x)$ 表示網(wǎng)絡(luò)最后輸出層中 pixel x 對應(yīng)的第 k 個通道的值,$p_k(x)$
表示像素 x 屬于第 k 類的概率。
損失函數(shù)使用交叉熵(負對數(shù)似然),即為 $E=-\sum_{x}\log\big(p_{\ell(x)}(x)\big)$,
其中 $p_{\ell(x)}(x)$ 表示 x 在真實 label 所在通道上的輸出概率。
步驟
1、將數(shù)據(jù)集存為tfrecord文件
# Serialize one (image, label) pair into a TFRecord file.
# NOTE(review): `output_filename`, `image_path`, `label_path`, `height`
# and `width` are assumed to come from the surrounding script -- not
# defined in this excerpt.
writer = tf.python_io.TFRecordWriter(output_filename)
### read the image
with tf.gfile.GFile(image_path, 'rb') as fid:
    encoded_jpg = fid.read()
# was io.BytestIO(encode_jpg): both the class name and the variable
# name were misspelled.
encoded_jpg_io = io.BytesIO(encoded_jpg)
image = PIL.Image.open(encoded_jpg_io)
### read the mask
with tf.gfile.GFile(label_path, 'rb') as fid:
    encoded_label = fid.read()
encoded_label_io = io.BytesIO(encoded_label)  # was io.BytestIO(encode_label)
label = PIL.Image.open(encoded_label_io)
example = tf.train.Example(features=tf.train.Features(feature={
    'image/height': dataset_util.int64_feature(height),
    'image/width': dataset_util.int64_feature(width),
    'image/encoded': dataset_util.bytes_feature(encoded_jpg),
    'image/format': dataset_util.bytes_feature('jpeg'.encode('utf8')),
    'label/encoded': dataset_util.bytes_feature(encoded_label),
    'label/format': dataset_util.bytes_feature('png'.encode('utf8')),
}))
# was writer.write(tf_example...): `tf_example` is undefined, the
# variable built above is `example`.
writer.write(example.SerializeToString())
writer.close()
2、定義模型
def deeplabv3_plus_model_fn(features, labels, mode, params):
    """DeepLab v3+ model: backbone encoder + ASPP, then a light decoder.

    Args:
        features: batch of input images, NHWC (assumed -- TODO confirm).
        labels: ground-truth masks; unused here, kept for the Estimator
            model_fn signature.
        mode: tf.estimator.ModeKeys value (unused in this excerpt).
        params: hyper-parameter dict (unused in this excerpt).

    Returns:
        Predicted class ids per pixel, shape [batch, H, W, 1], int32.
    """
    # NOTE(review): the original comment said "subtract mean" but the code
    # calls mean_image_addition -- confirm which preprocessing is intended.
    images = tf.cast(tf.map_fn(preprocessing.mean_image_addition, features), tf.uint8)
    # Transpose NHWC -> NCHW; presumably base_model expects channels-first.
    inputs = tf.transpose(images, [0, 3, 1, 2])
    logits, end_points = base_model(inputs)
    # was `inout_size` (typo): it was never read, while `inputs_size`
    # below would have raised NameError.
    inputs_size = tf.shape(images)[1:3]
    net = end_points['/block4']
    encoder_output = ASPP(net)
    ### decoder
    # Extract low-level features from an early backbone block.
    # NOTE(review): 'unit_s' in this key looks like a typo (e.g. 'unit_3')
    # -- verify against the actual end_points keys.
    low_level_features = end_points['/block1/unit_s/bottleneck_v2/conv1']
    # Low-level features carry many channels (256 or 512); reduce them with
    # a 1x1 convolution so they do not dominate the concatenation.
    # was a bare conv2d(..., s=1): made consistent with layers_lib.conv2d
    # used everywhere else in this file.
    low_level_features = layers_lib.conv2d(low_level_features, 48, [1, 1], stride=1)
    # was `low_level_feature_size`, but `low_level_features_size` is read below.
    low_level_features_size = tf.shape(low_level_features)[1:3]
    # Upsample the encoder output and concatenate with the low-level features.
    net = tf.image.resize_bilinear(encoder_output, low_level_features_size, name='upsample_1')
    net = tf.concat([net, low_level_features], axis=3, name='concat')
    # A few 3x3 convolutions to refine features before the final upsampling.
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_1')
    net = layers_lib.conv2d(net, 256, [3, 3], stride=1, scope='conv_3x3_2')
    net = layers_lib.conv2d(net, num_classes, [1, 1], activation_fn=None, normalizer_fn=None, scope='conv_1x1')
    # Per-class logits at the original image resolution.
    logits = tf.image.resize_bilinear(net, inputs_size, name='upsample_2')
    pred_classes = tf.expand_dims(tf.argmax(logits, axis=3, output_type=tf.int32), axis=3)
    return pred_classes
3、開始訓(xùn)練
#訓(xùn)練
def train():
    """Build the loss, polynomial learning-rate schedule, and train op.

    NOTE(review): this excerpt references `labels`, `logits`,
    `pred_classes`, `params`, `predictions`, `metrics` and `mode` from
    an enclosing scope -- they are not defined here; verify at call site.

    Returns:
        A tf.estimator.EstimatorSpec carrying loss, train_op and metrics.
    """
    ### loss
    labels = tf.squeeze(labels, axis=3)  # drop the trailing channel dim
    logits_by_num_classes = tf.reshape(logits, [-1, params['num_classes']])
    labels_flat = tf.reshape(labels, [-1, ])
    # Keep only pixels whose label is a valid class id (ignores e.g. the
    # "void"/ignore label). valid_indices is 0/1, so dynamic_partition
    # puts the valid pixels into partition [1].
    valid_indices = tf.to_int32(labels_flat <= params['num_classes'] - 1)
    valid_logits = tf.dynamic_partition(logits_by_num_classes, valid_indices, num_partitions=2)[1]
    valid_labels = tf.dynamic_partition(labels_flat, valid_indices, num_partitions=2)[1]
    # was assigned to `pred_flat` but read as `preds_flat` below (NameError).
    preds_flat = tf.reshape(pred_classes, [-1, ])
    valid_preds = tf.dynamic_partition(preds_flat, valid_indices, num_partitions=2)[1]
    confusion_matrix = tf.confusion_matrix(valid_labels, valid_preds, num_classes=params['num_classes'])
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=valid_logits, labels=valid_labels)
    train_var_list = [v for v in tf.trainable_variables()]
    # cross-entropy loss + L2 weight decay over all trainable variables
    loss = cross_entropy + params.get('weight_decay', _WEIGHT_DECAY) * tf.add_n(
        [tf.nn.l2_loss(v) for v in train_var_list])
    ### global step and polynomial learning-rate decay
    global_step = tf.train.get_or_create_global_step()
    learning_rate = tf.train.polynomial_decay(
        # was params['initial_learning_rate', -- missing closing bracket
        # (syntax error).
        params['initial_learning_rate'],
        tf.cast(global_step, tf.int32) - params['initial_global_step'],
        params['max_iter'],
        params['end_learning_rate'],
        power=params['power'])
    tf.identity(learning_rate, name='learning_rate')  # expose for logging hooks
    optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params['momentum'])
    # Run batch-norm moving-average updates with each train step; the
    # original collected update_ops but never attached them, so the
    # moving averages would never have been updated.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_op = optimizer.minimize(loss, global_step, var_list=train_var_list)
    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=metrics)
4、測試
def predict(img):
    """Run the trained DeepLab model on a single HxWx3 image array.

    Args:
        img: an HxWx3 numpy image (uint8-compatible values assumed --
            TODO confirm caller contract).

    Returns:
        A uint8 HxW mask with classes 0 and 1 swapped by the remapping
        at the end of this function.
    """
    image = tf.convert_to_tensor(img)
    image = tf.to_float(tf.image.convert_image_dtype(image, dtype=tf.uint8))
    image.set_shape([None, None, 3])
    images = preprocessing.mean_image_subtraction(image)
    # Batch of one: the images' spatial size may differ between calls.
    images = tf.reshape(images, [1, tf.shape(image)[0], tf.shape(image)[1], 3])
    # Dummy all-zero labels with matching spatial size; EVAL mode needs them.
    labels = tf.zeros([1, tf.shape(image)[0], tf.shape(image)[1], 1])
    labels = tf.to_int32(tf.image.convert_image_dtype(labels, dtype=tf.uint8))
    predictions = deeplab_model.deeplabv3_plus_model_fn(
        images,
        labels,
        tf.estimator.ModeKeys.EVAL,
        params={
            'output_stride': FLAGS.output_stride,
            'batch_size': 1,  # batch size must be 1: image sizes may differ
            'base_architecture': FLAGS.base_architecture,
            'pre_trained_model': None,
            'batch_norm_decay': None,
            'num_classes': _NUM_CLASSES,
            'freeze_batch_norm': True
        }).predictions
    saver = tf.train.Saver()
    with tf.Session(config=config) as sess:
        # Prefer the latest checkpoint in model_dir; the original fetched
        # the checkpoint state but never used it and always restored a
        # hard-coded path, which is kept as the fallback.
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            saver.restore(sess, 'model/model.ckpt-73536')
        preds = sess.run(predictions)
    pred = preds['classes'].astype(np.float32)
    # Swap classes 0 and 1: 1 -> -1 -> 0 and 0 -> 0 -> 1.
    pred[pred == 1] = -1
    pred += 1
    return pred[0, :, :, 0].astype(np.uint8)
評價標準
IOU,模型產(chǎn)生的目標窗口和原來標記窗口的交疊率,即檢測結(jié)果與ground truth的交集比上他們的并集。
訓(xùn)練過程效果



問題及注意事項
- 訓(xùn)練圖很大,大部分超過了
,如果crop_size設(shè)置為
時會使大部分訓(xùn)練圖全部包括天空或者前景。因此使用
較好。另外,不用padding效果較好。
參考文獻
附錄
resnet101結(jié)構(gòu)
conv1 (7, 7, 64, s2) out:112*112
conv2_x (1, 1, 64)
(3, 3, 64) *3 out:56*56
(1, 1, 256)
conv3_x (1, 1, 128)
(3, 3, 128) *4 out:28*28
(1, 1, 512)
conv4_x (1, 1, 256)
(3, 3, 256) *23 out:14*14
(1, 1, 1024)
conv5_x (1, 1, 512)
(3, 3, 512) *3 out:7*7
(1, 1, 2048)
average pool, 1000-d fc, softmax