由于業(yè)務(wù)需要,所以搜索了一些相關(guān)的隨機算法
代碼是參考維基百科進行編寫的:https://en.wikipedia.org/wiki/Reservoir_sampling
注意點:
Chao算法會有缺陷,因為它一開始就把所需要的數(shù)據(jù)全部扔池子里了。 如果權(quán)重存在0的的數(shù)據(jù),且數(shù)據(jù)量較少,可能會出現(xiàn)在最終結(jié)果里。(實驗中,Chao算法結(jié)果不正確)
Res算法,我使用了遍歷求最小值,所以在處理大量數(shù)據(jù)時,可能會存在性能瓶頸。
代碼:
-- 蓄水池采樣算法
function reserviorSampling(tbSequence, dNeed, dSequenceSize, funcRandom)
dSequenceSize = dSequenceSize or #tbSequence
funcRandom = funcRandom or math.random
if dNeed > dSequenceSize then
return tbSequence
end
local tbSample = {}
for i=1, dNeed do
table.insert(tbSample, tbSequence[i])
end
for i=dNeed+1, dSequenceSize do
local j = funcRandom(1,i)
if dNeed >= j then
tbSample[j] = tbSequence[i]
end
end
return tbSample
end
-- 加權(quán)蓄水池采樣算法: Algorithm A-Chao
function weightedReserviorSampling_Chao(tbSequence, dNeed, dSequenceSize, funcRandom)
dSequenceSize = dSequenceSize or #tbSequence
funcRandom = funcRandom or math.random
if dNeed > dSequenceSize then
return tbSequence
end
local dWeightSum = 0
local tbSample = {}
for i=1, dNeed do
table.insert(tbSample, tbSequence[i])
dWeightSum = dWeightSum + tbSequence[i].weight
end
for i=dNeed+1, dSequenceSize do
dWeightSum = dWeightSum + tbSequence[i].weight
local p = tbSequence[i].weight / dWeightSum
local j = funcRandom()
if j <= p then
tbSample[funcRandom(1, dNeed)] = tbSequence[i]
end
end
return tbSample
end
-- 加權(quán)蓄水池采樣算法: Algorithm A-Res
function weightedReserviorSampling_Res(tbSequence, dNeed, dSequenceSize, funcRandom)
dSequenceSize = dSequenceSize or #tbSequence
funcRandom = funcRandom or math.random
if dNeed > dSequenceSize then
return tbSequence
end
local dWeightSum = 0
local tbSample = {}
local dMinIndex = nil
for i=1, dSequenceSize do
local r = funcRandom()^(1/tbSequence[i].weight)
tbSequence[i].__reservior_r = r
if i <= dNeed then
table.insert(tbSample, tbSequence[i])
else
if not dMinIndex then
local dMin = 9999
for i,v in ipairs(tbSample) do
if v.__reservior_r < dMin then
dMin = v.__reservior_r
dMinIndex = i
end
end
end
assert(dMinIndex)
if r > tbSample[dMinIndex].__reservior_r then
table.remove(tbSample,dMinIndex)
dMinIndex = nil
table.insert(tbSample, tbSequence[i])
end
end
end
return tbSample
end