要使用余弦相似度或Levenshtein距離來(lái)計(jì)算字符串之間的相似度,我們需要對(duì)字符串進(jìn)行一些預(yù)處理。余弦相似度通常用于比較文檔或句子,而Levenshtein距離用于計(jì)算兩個(gè)字符串之間的編輯距離。下面我將分別展示如何使用這兩種方法來(lái)對(duì)字符串?dāng)?shù)組進(jìn)行分組。
使用Levenshtein距離
Levenshtein距離是指將一個(gè)字符串轉(zhuǎn)換為另一個(gè)字符串所需的最少編輯操作次數(shù)(包括插入、刪除和替換字符)。
function levenshteinDistance(a, b) {
if (a.length === 0) return b.length;
if (b.length === 0) return a.length;
const matrix = [];
// 初始化矩陣
for (let i = 0; i <= b.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= a.length; j++) {
matrix[0][j] = j;
}
// 計(jì)算Levenshtein距離
for (let i = 1; i <= b.length; i++) {
for (let j = 1; j <= a.length; j++) {
if (b.charAt(i - 1) === a.charAt(j - 1)) {
matrix[i][j] = matrix[i - 1][j - 1];
} else {
matrix[i][j] = Math.min(
matrix[i - 1][j - 1] + 1, // 替換
Math.min(
matrix[i][j - 1] + 1, // 插入
matrix[i - 1][j] + 1 // 刪除
)
);
}
}
}
return matrix[b.length][a.length];
}
function groupByLevenshtein(strings, distanceThreshold) {
const groups = [];
strings.forEach((str, index) => {
let grouped = false;
for (let i = 0; i < groups.length; i++) {
const group = groups[i];
if (levenshteinDistance(str, group[0]) <= distanceThreshold) {
group.push(str);
grouped = true;
break;
}
}
if (!grouped) {
groups.push([str]);
}
});
return groups;
}
// 測(cè)試數(shù)據(jù)
const strings = ["apple", "apply", "appale", "orange", "orango", "banana", "bananb"];
const distanceThreshold = 2;
// 調(diào)用函數(shù)并打印結(jié)果
const groups = groupByLevenshtein(strings, distanceThreshold);
console.log(groups);
使用余弦相似度
余弦相似度通常用于比較文檔或句子,它基于向量空間模型。為了使用余弦相似度,我們需要將字符串轉(zhuǎn)換為向量。這通常涉及到將字符串分割成單詞或字符,并創(chuàng)建一個(gè)表示每個(gè)單詞或字符出現(xiàn)次數(shù)的向量。
function termFrequency(str) {
const terms = str.split('');
const frequency = {};
terms.forEach(term => {
frequency[term] = (frequency[term] || 0) + 1;
});
return frequency;
}
function cosineSimilarity(vecA, vecB) {
let dotProduct = 0;
let magnitudeA = 0;
let magnitudeB = 0;
for (let key in vecA) {
dotProduct += vecA[key] * (vecB[key] || 0);
magnitudeA += vecA[key] * vecA[key];
}
for (let key in vecB) {
magnitudeB += vecB[key] * vecB[key];
}
return dotProduct / (Math.sqrt(magnitudeA) * Math.sqrt(magnitudeB));
}
function groupByCosineSimilarity(strings, similarityThreshold) {
const groups = [];
const vectors = strings.map(termFrequency);
vectors.forEach((vector, index) => {
let grouped = false;
for (let i = 0; i < groups.length; i++) {
const group = groups[i];
if (cosineSimilarity(vector, vectors[group[0]]) >= similarityThreshold) {
group.push(index);
grouped = true;
break;
}
}
if (!grouped) {
groups.push([index]);
}
});
return groups.map(group => group.map(index => strings[index]));
}
// 測(cè)試數(shù)據(jù)
const strings = ["apple", "apply", "appale", "orange", "orango", "banana", "bananb"];
const similarityThreshold = 0.8; // 余弦相似度的范圍是-1到1,接近1表示更相似
// 調(diào)用函數(shù)并打印結(jié)果
const groups = groupByCosineSimilarity(strings, similarityThreshold)
console.log(groups);