人臉檢測——滑動窗口篇(訓練和實現)

人臉檢測:cascade cnn、mtcnn,都可以通過下面代碼復現。但是下面的實現是比較low的,后面更新FCN的方法。
注意mtcnn的標籤加了回歸框,訓練時候的輸出層要作修改:(回歸框的作用還是很大的)
compute bbox reg label,其中x1,x2,y1,y2為真實的人臉坐標,x_left,x_right,y_top,y_bottom,width,height為預測的人臉坐標,
如果是在準備人臉和非人臉樣本的時候,x_left,x_right,y_top,y_bottom,width,height就是你的滑動窗與真實人臉的IOU>0.6(根據你的定義)的滑動窗坐標。
offset_x1 = (x1 - x_left) / float(width)
offset_y1 = (y1 - y_top) / float(height)
offset_x2 = (x2 - x_right) / float(width)
offset_y2 = (y2 - y_bottom ) / float(height)

tensorflow:12-net訓練

2016年9月份的代碼,比較亂
可憐
>.<,僅供參考,需要的話自己閱讀整理吧。

train_net_12.py : face_AFLW文件夾下包含人臉和非人臉兩個文件夾。

import tensorflow as tf
import cv2
import os
import csv
from pandas import read_csv
import random
import numpy as np
import utils


# Root folder of the dataset; each non-hidden sub-folder is treated as one class.
filename = '/Users/liupeng/Desktop/anaconda/Dlib/face_AFLW'
# This listing could be cached in a txt file to save loading time; labels could
# also be derived from the folder/file names instead of the running counter.
text_data = []
label = 0
for label, class_entry in enumerate(os.listdir(filename), 1):
    # The counter advances for hidden entries too, so the label written below
    # (label - 2) depends on what os.listdir returns and in which order.
    # NOTE(review): this looks tuned to one hidden entry preceding two class
    # folders -- confirm before reusing on another machine.
    if class_entry.startswith('.'):
        continue
    class_dir = filename + '/' + class_entry
    for image_entry in os.listdir(class_dir):
        if image_entry.startswith('.'):
            continue
        image_path = class_dir + '/' + image_entry
        # Keep only files that OpenCV can actually decode.
        if cv2.imread(image_path) is None:
            continue
        record = image_path + ' ' + str(label - 2)
        text_data.append(record.split(' '))

random.shuffle(text_data)
train_image = [record[0] for record in text_data]
train_label = [record[1] for record in text_data]
print(train_label)

# Number of samples per batch and side length of the square network input.
batch_size = 128
IMAGE_SIZE = 12

def get_next_batch(pointer):
    """Load batch number ``pointer`` from the shuffled training lists.

    Reads ``batch_size`` consecutive entries of ``train_image`` starting at
    ``pointer * batch_size``, resizes each image to IMAGE_SIZE x IMAGE_SIZE and
    scales pixels with ``(x - 127.5) * 0.0078125``.

    Returns
    -------
    (images, one_hot) : tuple of np.ndarray
        ``images`` has shape [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3];
        ``one_hot`` has shape [batch_size, 2] with column 0 set for the label
        string '0' and column 1 set for every other label.
    """
    offset = pointer * batch_size
    images = np.zeros([batch_size, IMAGE_SIZE, IMAGE_SIZE, 3])
    one_hot = np.zeros([batch_size, 2])

    for row in range(batch_size):
        sample = offset + row
        pixels = cv2.imread(train_image[sample])
        pixels = cv2.resize(pixels, (IMAGE_SIZE, IMAGE_SIZE))
        # Fixed scaling; a per-image mean/std standardisation was used here
        # previously and has been disabled.
        pixels = (pixels - 127.5) * 0.0078125
        images[row, :] = pixels.astype('float32')

        hot_column = 0 if train_label[sample] == '0' else 1
        one_hot[row, hot_column] = 1

    return images, one_hot

# The network could be made deeper, e.g.: 3 -> 16 3*3(SAME) pooling -> 32 3*3(SAME) pooling -> 32 3*3(VALID) -> 2
def fcn_12_detect(threshold, dropout=False, activation=tf.nn.relu):
    """Build the 12-net graph (placeholders, layers, loss and metrics).

    Parameters
    ----------
    threshold : float
        Cut-off applied to the sigmoid outputs to produce 'thresholding'
        (and, through it, 'recall').
    dropout : bool, optional
        When True, dropout driven by the 'keep_prob' placeholder is applied
        after the ip1 layer.
    activation : callable, optional
        Non-linearity applied after conv1 and ip1.

    Returns
    -------
    dict
        Keys: 'imgs', 'labels', 'keep_prob' (placeholders), 'cost', 'pred',
        'accuracy', 'features' (the ip1 activations), 'recall' and
        'thresholding'.
    """
    
    imgs = tf.placeholder(tf.float32, [None, IMAGE_SIZE, IMAGE_SIZE, 3])
    labels = tf.placeholder(tf.float32, [None, 2])
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    with tf.variable_scope('net_12'):
        # 3x3 stride-1 convolution with 16 filters (padding left at utils.conv2d's default).
        conv1,_ = utils.conv2d(x=imgs, n_output=16, k_w=3, k_h=3, d_w=1, d_h=1, name="conv1")
        conv1 = activation(conv1)
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding="SAME", name="pool1")
        # 6x6 VALID convolution; presumably collapses the pooled map of a 12x12 input to 1x1 -- TODO confirm.
        ip1,W1 = utils.conv2d(x=pool1, n_output=16, k_w=6, k_h=6, d_w=1, d_h=1, padding="VALID", name="ip1")
        ip1 = activation(ip1)
        if dropout:
            ip1 = tf.nn.dropout(ip1, keep_prob)
        # 1x1 convolution producing the two class scores.
        ip2,W2 = utils.conv2d(x=ip1, n_output=2, k_w=1, k_h=1, d_w=1, d_h=1, name="ip2")

        pred = tf.nn.sigmoid(utils.flatten(ip2))
        target = utils.flatten(labels)

        # L2 penalty on the ip1 and ip2 kernels; the ip2 term is weighted 100x.
        regularizer = 8e-3 * (tf.nn.l2_loss(W1)+100*tf.nn.l2_loss(W2))

        # Per-sample binary cross-entropy summed over the two outputs, halved,
        # averaged over the batch, plus the regularizer (1e-9 guards log(0)).
        loss = tf.reduce_mean(tf.div(tf.add(-tf.reduce_sum(target * tf.log(pred + 1e-9),1), -tf.reduce_sum((1-target) * tf.log(1-pred + 1e-9),1)),2)) + regularizer
        # loss is already reduced by reduce_mean above; this second mean is applied to that result.
        cost = tf.reduce_mean(loss)
        
        # Accuracy: fraction of samples whose arg-max output matches the arg-max label.
        predict = pred
        max_idx_p = tf.argmax(predict, 1)  
        max_idx_l = tf.argmax(target, 1)  
        correct_pred = tf.equal(max_idx_p, max_idx_l)  
        acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))  
        
        # Recall: entries where both the thresholded prediction and the target are 1,
        # divided by the sum of the targets.
        # NOTE(review): pred is a sigmoid output, so with threshold=0.0 (as passed by
        # train() and test()) nearly every entry passes the cut-off -- confirm intent.
        thresholding_12 = tf.cast(tf.greater(pred, threshold), "float")
        recall_12 = tf.reduce_sum(tf.cast(tf.logical_and(tf.equal(thresholding_12, tf.constant([1.0])), tf.equal(target, tf.constant([1.0]))), "float")) / tf.reduce_sum(target)

        '''
        correct_prediction = tf.equal(tf.cast(tf.greater(pred, threshold), tf.int32), tf.cast(target, tf.int32))
        acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))'''

        return {'imgs': imgs, 'labels': labels, 'keep_prob': keep_prob,
            'cost': cost, 'pred': pred, 'accuracy': acc, 'features': ip1,
            'recall': recall_12, 'thresholding': thresholding_12}

def train():
    """Continue training the 12-net from a saved checkpoint.

    Builds the graph with fcn_12_detect(0.0), restores
    'model/model_net_12-123100', then runs 2000 passes over batches 0..699.
    Before every training step one of the batches 700..749 is evaluated and
    its cost/accuracy/recall printed; a checkpoint is written when the batch
    index is 300 or 600.
    """


    net_output = fcn_12_detect(0.0)
    # NOTE(review): in TF 1.x the second positional parameter of tf.Variable is
    # presumably `trainable`, not `dtype`, so tf.int32 here may not do what the
    # author intended -- confirm against the installed TensorFlow version.
    global_step = tf.Variable(0, tf.int32)
    starter_learning_rate = 0.00001
    # NOTE(review): decay_rate=1.0 presumably keeps the learning rate constant -- confirm.
    learning_rate = tf.train.exponential_decay(
        learning_rate=starter_learning_rate,
        global_step=global_step,
        decay_steps=1000,
        decay_rate=1.0,
        staircase=True,
        name=None)
    train_step = tf.train.AdamOptimizer(learning_rate).minimize(net_output['cost'], global_step=global_step)


    sess = tf.Session()
    # Only the trainable variables are saved/restored.
    saver = tf.train.Saver(tf.trainable_variables())
    # import pdb; pdb.set_trace()
    sess.run(tf.initialize_all_variables())

    # Start from previously trained weights (hard-coded checkpoint path).
    saver.restore(sess, 'model/model_net_12-123100')
    for j in range(2000):
        for i in range(700):

            imgs, labels = get_next_batch(i)
            # labels = labels.reshape((labels.shape[0]))

            if i%300==0 and i!=0:

                saver.save(sess, 'model/model_net_12', global_step=global_step, write_meta_graph=False)
                
            # i%1 is always 0, so this evaluation runs on every step.
            if i%1==0:

                # Batches 700..749 serve as the held-out evaluation set.
                img, label = get_next_batch(700+i%50)
                
                cost, accuracy, recall, lr, pre = sess.run(
                    [net_output['cost'], net_output['accuracy'], net_output['recall'], learning_rate, net_output['pred']],
                    feed_dict={net_output['imgs']: img, net_output['labels']: label})

                print("Step %d, cost: %f, acc: %f, recall: %f, lr: %f"%(i, cost, accuracy, recall, lr))
                # Show the first five predictions next to their labels.
                print (pre[0], label[0])
                print (pre[1], label[1])
                print (pre[2], label[2])
                print (pre[3], label[3])
                print (pre[4], label[4])
                # print("target: ", target)
                # print("pred: ", pred)

            # train
            sess.run(train_step, feed_dict={net_output['imgs']: imgs, net_output['labels']: labels})

    sess.close()

def test():  
    """Run the 12-net on the single image 'images/8.jpg' and return its output.

    The image is resized to IMAGE_SIZE x IMAGE_SIZE, standardised, and fed
    through a graph restored from 'model/model_net_12-71400'.

    Returns
    -------
    predict
        The value of the 'pred' tensor for the one-image batch.
    """
    
    image = cv2.imread('images/8.jpg')
    image = cv2.resize(image, (IMAGE_SIZE, IMAGE_SIZE))  
    
    # Per-image standardisation; min_s bounds the divisor away from zero.
    # NOTE(review): get_next_batch scales with (x - 127.5) * 0.0078125 instead,
    # so this input is preprocessed differently from the training data -- confirm
    # which scheme the restored checkpoint was trained with.
    m = image.mean()
    s = image.std()
    min_s = 1.0/(np.sqrt(image.shape[0]*image.shape[1]*image.shape[2]))
    std = max(min_s, s)
    image = (image-m)/std
    image = image.astype('float32') #/ 255
    
    net_12 = fcn_12_detect(0.0)   
   
    saver = tf.train.Saver()  
    sess = tf.Session()  
    # saver.restore(sess, tf.train.latest_checkpoint('/Users/liupeng/Desktop/anaconda/i_code', 'checkpoint')) 
    sess.run(tf.initialize_all_variables())
    
    print ('start restore model')
    saver.restore(sess, 'model/model_net_12-71400')
    print ('ok')
    # saver.restore(sess, tf.train.latest_checkpoint('.'))  
   
    # predict = tf.argmax(tf.reshape(output, [-1, MAX_CAPTCHA, CHAR_SET_LEN]), 2)  
    # Feed the image as a batch of one; the session is not closed before returning.
    predict = sess.run(net_12['pred'], feed_dict={net_12['imgs']: [image]})
    print ("predict:", predict)
    return predict  

# Entry point: training runs by default; call test() here instead to score a single image.
if __name__ == '__main__':
    train()

utils.py

import tensorflow as tf
def conv2d(x, n_output,
           k_h=5, k_w=5, d_h=2, d_w=2,
           padding='SAME', name='conv2d', reuse=None):
    """Create a 2-D convolution layer with a bias term.

    Parameters
    ----------
    x : tf.Tensor
        Input tensor; its last dimension is used as the input channel count.
    n_output : int
        Number of filters.
    k_h, k_w : int, optional
        Kernel height and width.
    d_h, d_w : int, optional
        Stride along height and width.
    padding : str, optional
        Padding type: "SAME" or "VALID".
    name : str, optional
        Variable scope; 'conv2d' is used when falsy.
    reuse : bool or None, optional
        Forwarded to tf.variable_scope.

    Returns
    -------
    h, W : tf.Tensor, tf.Tensor
        The biased convolution output and the kernel variable.
    """
    scope_name = name or 'conv2d'
    with tf.variable_scope(scope_name, reuse=reuse):
        in_channels = x.get_shape()[-1]
        kernel_shape = [k_h, k_w, in_channels, n_output]
        W = tf.get_variable(
            name='W',
            shape=kernel_shape,
            initializer=tf.contrib.layers.xavier_initializer_conv2d())

        step = [1, d_h, d_w, 1]
        conv = tf.nn.conv2d(
            input=x, filter=W, strides=step, padding=padding, name='conv')

        b = tf.get_variable(
            name='b',
            shape=[n_output],
            initializer=tf.constant_initializer(0.0))

        h = tf.nn.bias_add(value=conv, bias=b, name='h')

    return h, W

def linear(x, n_output, name=None, activation=None, reuse=None):
    """Fully connected layer.

    Parameters
    ----------
    x : tf.Tensor
        Input tensor to connect; flattened first when it is not 2-D.
    n_output : int
        Number of output neurons.
    name : None, optional
        Variable scope to apply; "fc" is used when falsy.
    activation : callable, optional
        Applied to the biased product when given.
    reuse : bool or None, optional
        Forwarded to tf.variable_scope.

    Returns
    -------
    h, W : tf.Tensor, tf.Tensor
        Output of fully connected layer and the weight matrix.
    """
    if len(x.get_shape()) != 2:
        x = flatten(x, reuse=reuse)

    n_input = x.get_shape().as_list()[1]

    with tf.variable_scope(name or "fc", reuse=reuse):
        # Fixed: the initializer was referenced as `tf.tf.contrib...`, which
        # raises AttributeError as soon as the layer is built.
        W = tf.get_variable(
            name='W',
            shape=[n_input, n_output],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())

        b = tf.get_variable(
            name='b',
            shape=[n_output],
            dtype=tf.float32,
            initializer=tf.constant_initializer(0.0))

        h = tf.nn.bias_add(
            name='h',
            value=tf.matmul(x, W),
            bias=b)

        if activation:
            h = activation(h)

        return h, W

def flatten(x, name=None, reuse=None):
    """Collapse a 4-D tensor to 2-D; pass 1-D and 2-D tensors through.

    Parameters
    ----------
    x : tf.Tensor
        Input tensor to flatten.
    name : None, optional
        Accepted for interface compatibility; the scope is always 'flatten'.
    reuse : None, optional
        Accepted for interface compatibility; not used.

    Returns
    -------
    flattened : tf.Tensor
        ``x`` reshaped to [-1, d1 * d2 * d3] when it has 4 dimensions,
        otherwise ``x`` itself.

    Raises
    ------
    ValueError
        If ``x`` has a number of dimensions other than 1, 2 or 4.
    """
    with tf.variable_scope('flatten'):
        dims = x.get_shape().as_list()
        rank = len(dims)

        if rank in (1, 2):
            return x
        if rank != 4:
            raise ValueError('Expected n dimensions of 1, 2 or 4.  Found:',
                             rank)

        features_per_sample = dims[1] * dims[2] * dims[3]
        return tf.reshape(x, shape=[-1, features_per_sample])

def lrelu(features, leak=0.2):
    """Leaky rectifier written without a branch.

    Computes ``0.5 * (1 + leak) * features + 0.5 * (1 - leak) * abs(features)``,
    which equals ``features`` for non-negative inputs and ``leak * features``
    for negative ones.

    Parameters
    ----------
    features : tf.Tensor
        Input to apply leaky rectifier to (anything supporting ``*``, ``+``
        and ``abs`` works).
    leak : float, optional
        Slope applied to negative inputs.

    Returns
    -------
    op : tf.Tensor
        Result of the leaky rectifier activation.
    """
    linear_gain = 0.5 * (1 + leak)
    magnitude_gain = 0.5 * (1 - leak)
    identity_part = linear_gain * features
    rectified_part = magnitude_gain * abs(features)
    return identity_part + rectified_part

train_net_24.py 參考train_net_12.py,加深一下網絡,自己寫吧。。。。
下面是滑動窗人臉檢測的流程:
(1)確定最小檢測人臉,對原圖img縮放,縮放比例為(滑動窗大小/最小人臉大小)。
(2)縮放后的圖片,構建金字塔。
(3)對金字塔的每一層,通過滑動窗獲取patch,對patch歸一化處理,之后給訓練好的人臉檢測器識別,將識別為人臉的窗口位置和概率保存。
(4)將人臉窗口映射到原圖img中的人臉位置,概率不變。
(5)NMS處理重疊窗口。
(6)級聯的方式提高準確率。
(7)在原圖畫出人臉位置。
*****調節的參數有:
# 步長
stride = 2
# 最小人臉大小
F = 40
# 構建金字塔的比例
ff = 0.8
# 概率多大時判定為人臉?
p = 0.8
# nms
overlapThresh_12 = 0.7
overlapThresh_24 = 0.7
下面不是完整代碼,需要自己添加訓練好的model,稍作修改就可以。

import cv2
import numpy as np
import tensorflow as tf

from model import fcn_12_detect


def py_nms(dets, thresh, mode="Union"):
    """Greedy non-maximum suppression.

    Boxes are visited in order of decreasing score; each visited box is kept
    and every remaining box whose overlap with it exceeds ``thresh`` is
    discarded.

    :param dets: array-like of rows [x1, y1, x2, y2, score]
    :param thresh: boxes with overlap <= thresh are retained
    :param mode: "Union" divides the intersection by the union area (IoU);
        "Minimum" divides it by the smaller of the two box areas
    :return: the kept rows of ``dets`` ordered by decreasing score, or an
        empty list when ``dets`` is empty
    :raises ValueError: if ``mode`` is neither "Union" nor "Minimum"
    """
    if len(dets) == 0:
        return []
    if mode not in ("Union", "Minimum"):
        # Previously an unknown mode surfaced as an unbound-variable error
        # inside the loop; fail early with a clear message instead.
        raise ValueError("mode must be 'Union' or 'Minimum', got %r" % (mode,))

    # Accept plain nested lists as well as arrays.
    dets = np.asarray(dets)
    x1 = dets[:, 0]
    y1 = dets[:, 1]
    x2 = dets[:, 2]
    y2 = dets[:, 3]
    scores = dets[:, 4]

    # The +1 treats coordinates as inclusive pixel indices.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best = order[0]
        rest = order[1:]
        keep.append(best)

        # Intersection of the best box with every remaining box.
        xx1 = np.maximum(x1[best], x1[rest])
        yy1 = np.maximum(y1[best], y1[rest])
        xx2 = np.minimum(x2[best], x2[rest])
        yy2 = np.minimum(y2[best], y2[rest])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        if mode == "Union":
            ovr = inter / (areas[best] + areas[rest] - inter)
        else:
            ovr = inter / np.minimum(areas[best], areas[rest])

        # Indices in `ovr` are relative to `rest`.
        order = rest[np.where(ovr <= thresh)[0]]

    return dets[keep]

def image_preprocess(img):
    """Scale pixel values with ``(img - 127.5) * 0.0078125``.

    0.0078125 is 1/128, so inputs in [0, 255] land just inside (-1, 1).
    A per-image mean/std standardisation was used here previously and has
    been disabled.
    """
    centered = img - 127.5
    return centered * 0.0078125



def slide_window(img, window_size, stride):
    """Enumerate sliding-window boxes over one pyramid level.

    Parameters
    ----------
    img : np.ndarray
        Image of shape (height, width, channels).
    window_size : int
        Side length of the square window.
    stride : int
        Step between neighbouring windows.

    Returns
    -------
    (img, boxes) or None
        ``boxes`` is an integer array with one row
        [row_start, col_start, row_end, col_end] per window, matching the
        ``img[b[0]:b[2], b[1]:b[3]]`` slicing convention. None is returned
        when ``img`` is not 3-dimensional or when its width or height is not
        larger than ``window_size + stride``.
    """
    # Check the rank first so that non-image input yields None instead of
    # failing on the shape lookups below.
    if len(img.shape) != 3:
        return None

    h, w = img.shape[0], img.shape[1]
    if w <= window_size + stride or h <= window_size + stride:
        return None

    # NOTE(review): these counts stop one step short of the last offset that
    # would still fit inside the image; kept unchanged to preserve the set of
    # windows produced.
    n_cols = int((w - window_size) / stride)
    n_rows = int((h - window_size) / stride)

    window_list = [
        [r * stride, c * stride, r * stride + window_size, c * stride + window_size]
        for c in range(n_cols)
        for r in range(n_rows)
    ]

    # Fixed: this return was dedented to module level, which is a SyntaxError.
    return img, np.asarray(window_list)


def pyramid(image, f, window_size):
    """Build an image pyramid for multi-scale sliding-window detection.

    Parameters
    ----------
    image : np.ndarray
        Input image; becomes the first (largest) level.
    f : float
        Scale factor between consecutive levels; must satisfy 0 < f < 1.
    window_size : int
        Sliding-window size; only levels whose width and height both exceed
        it are produced.

    Returns
    -------
    list of np.ndarray
        Pyramid levels from largest to smallest (empty when the input itself
        is not larger than the window).

    Raises
    ------
    ValueError
        If ``f`` is not strictly between 0 and 1 (a factor >= 1 would never
        terminate).
    """
    if not 0 < f < 1:
        raise ValueError("pyramid scale f must be in (0, 1), got %r" % (f,))

    w = image.shape[1]
    h = image.shape[0]
    img_ls = []
    while w > window_size and h > window_size:
        img_ls.append(image)
        w = int(w * f)
        h = int(h * f)
        # Stop before resizing a level that would be discarded anyway; this
        # also avoids asking OpenCV for a zero-sized image.
        if w <= window_size or h <= window_size:
            break
        image = cv2.resize(image, (w, h))
    return img_ls

def min_face(img, F, window_size, stride):
    """Rescale ``img`` so that a face of size ``F`` becomes ``window_size``.

    Parameters: img is the input image, F the minimum face size,
    window_size the sliding-window size and stride the sliding-window step.
    The width and height are multiplied by ``window_size / F``. When the
    result is not larger than ``window_size + stride`` in either direction,
    ``None`` is printed as a warning and the image is still resized and
    returned.
    """
    rows, cols, _channels = img.shape
    target_w = int(float(cols) * window_size / F)
    target_h = int(float(rows) * window_size / F)

    limit = window_size + stride
    too_small = target_w <= limit or target_h <= limit
    if too_small:
        print (None)

    # Mind the argument order when resizing: the size is (width, height).
    # Scale the image according to the minimum face size.
    return cv2.resize(img, (target_w, target_h))



if __name__ == "__main__":
    # Sliding-window cascade demo: 12-net proposals over an image pyramid,
    # per-level NMS, 24-net verification, final NMS, then drawing.
    # Boxes are stored row-first throughout: [row_start, col_start, row_end,
    # col_end, probability].
    # The "......" markers are the original author's placeholders for code
    # that was left out of the listing (kept here as comments so the file
    # parses).

    image = cv2.imread('images/1.jpg')
    if image is None:
        raise IOError("failed to read 'images/1.jpg'")
    h, w, _ = image.shape

    # ......
    # Tunable parameters
    IMAGE_SIZE = 12
    # Sliding-window stride
    stride = 2
    # Minimum face size
    F = 40
    # Scale factor between pyramid levels
    ff = 0.8
    # Probability above which a window is judged to be a face
    p_12 = 0.8
    p_24 = 0.8
    # NMS overlap thresholds
    overlapThresh_12 = 0.7
    overlapThresh_24 = 0.3
    # ......
    # Load the model
    # NOTE(review): fcn_12_detect as listed in train_net_12.py requires a
    # `threshold` argument -- confirm the signature exported by model.py.
    net_12 = fcn_12_detect()
    net_12_vars = [v for v in tf.trainable_variables() if v.name.startswith('net_12')]
    saver_net_12 = tf.train.Saver(net_12_vars)
    sess = tf.Session()
    sess.run(tf.initialize_all_variables())
    saver_net_12.restore(sess, 'model/12-net/model_net_12-123200')
    # net_24...
    # ......
    # Rescale so the minimum face matches the window size
    image_ = min_face(image, F, IMAGE_SIZE, stride)
    # ......
    # Pyramid
    pyd = pyramid(np.array(image_), ff, IMAGE_SIZE)
    # ......
    # net-12
    window_after_12 = []
    for img in pyd:
        # Sliding windows; levels only get smaller, so stop at the first
        # level that is too small.
        slide_return = slide_window(img, IMAGE_SIZE, stride)
        if slide_return is None:
            break
        img_12, window_net_12 = slide_return
        w_12 = img_12.shape[1]
        h_12 = img_12.shape[0]

        # Normalise every patch the same way as during training.
        patch_net_12 = np.array([
            image_preprocess(img_12[box[0]:box[2], box[1]:box[3], :])
            for box in window_net_12
        ])

        # Predict faces
        pred_cal_12 = sess.run(net_12['pred'], feed_dict={net_12['imgs']: patch_net_12})
        # print (pred_cal_12)
        # Keep window position and probability where the face score exceeds p_12.
        windows = [
            [box[0], box[1], box[2], box[3], pred[1]]
            for box, pred in zip(window_net_12, pred_cal_12)
            if pred[1] > p_12
        ]

        # py_nms visits boxes from highest to lowest probability.
        windows = py_nms(np.asarray(windows), overlapThresh_12, 'Union')
        for box in windows:
            # Map back to the original image. Fixed: row coordinates
            # (box[0], box[2]) are scaled by the height ratio and column
            # coordinates (box[1], box[3]) by the width ratio; previously the
            # two ratios were swapped.
            top = int(float(box[0]) * h / h_12)
            left = int(float(box[1]) * w / w_12)
            bottom = int(float(box[2]) * h / h_12)
            right = int(float(box[3]) * w / w_12)
            window_after_12.append([top, left, bottom, right, box[4]])
    # window_after_12 = np.asarray(window_after_12)
    # window_net = py_nms(window_after_12, overlapThresh_12, 'Union')
    window_net = window_after_12
    print(window_net)

    # net-24
    windows_24 = []
    # len() works for both the plain list produced above and the ndarray
    # returned by py_nms, unlike comparing against [].
    if len(window_net) == 0:
        print("windows is None!")
    else:
        img_24 = image
        patch_net_24 = []
        for box in window_net:
            patch = img_24[box[0]:box[2], box[1]:box[3], :]
            patch = cv2.resize(patch, (24, 24))
            # Normalise
            patch = image_preprocess(patch)
            patch_net_24.append(patch)
        # Predict faces
        # NOTE(review): net_24 is not defined in the visible listing;
        # presumably it is built in the omitted section above.
        pred_net_24 = sess.run(net_24['pred'], feed_dict={net_24['imgs']: patch_net_24})
        print(pred_net_24)
        for box, pred in zip(window_net, pred_net_24):
            if not pred[1] > p_24:
                continue
            windows_24.append([box[0], box[1], box[2], box[3], pred[1]])
        windows_24 = np.asarray(windows_24)
        #window_net = nms_max(windows_24, overlapThresh=0.7)
        window_net = py_nms(windows_24, overlapThresh_24, 'Union')

    if len(window_net) == 0:
        print("windows is None!")
    else:
        print(window_net.shape)
        for box in window_net:
            #ImageDraw.Draw(image).rectangle((box[1], box[0], box[3], box[2]), outline = "red")
            # OpenCV points are (x, y) = (col, row).
            cv2.rectangle(image, (int(box[1]), int(box[0])), (int(box[3]), int(box[2])), (0, 255, 0), 2)
    cv2.imwrite("images/face_img.jpg", image)
    cv2.imshow("face detection", image)
    cv2.waitKey(10000)
    cv2.destroyAllWindows()

    # NOTE(review): coord and threads are not defined in the visible listing;
    # presumably they come from the omitted setup code -- confirm or remove.
    coord.request_stop()
    coord.join(threads)

    sess.close()

檢測結果:(下面的重疊窗口可以通過設置overlapThresh去除)

20170903113941123.jpg
20170903130732564.jpg
最后編輯于
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

相關(guān)閱讀更多精彩內(nèi)容

友情鏈接更多精彩內(nèi)容