存儲
以構建文章內容涉及人物間關系為例(此處人物即實體)
如下圖,先使用AI來提取實體(人物)-關系,將實體-關系存入圖數據庫,再將文檔向量化后關聯實體存入向量數據庫

查詢
如下圖所示,用戶輸入一段查詢文本,將其向量化后選取關聯最緊密的文章相關聯的實體,再利用此實體來查詢圖數據庫中的實體關系。
(ps:理論上這種方法要查詢的精確,需要將文章切分為足夠小且完整的段落,否則一個文章關聯實體過多就沒有意義了。若文章文本巨大,可嘗試先利用AI進行文本切分再提取實體關系,向量化存儲;或者要求AI對每個實體寫一個簡介,向量化此簡介;又或是將實體向量化后直接存儲)

AI
市面上的ChatAI模型,或本地部署也可,此處采用智譜AI為例
向量化
# Request an embedding for the text "賈寶玉" from the ZhiPu AI embeddings endpoint,
# using the "embedding-2" model. "zhipu_ai_tokens" is presumably a placeholder for a real API token.
curl --location --request POST 'https://open.bigmodel.cn/api/paas/v4/embeddings' \
--header 'Content-Type: application/json' \
--header 'Authorization: Bearer zhipu_ai_tokens' \
--data-raw '{
"input": "賈寶玉",
"model": "embedding-2"
}'

實體關系提取
例如下圖,通過提問讓AI提取實體-關系,代碼層面自然是使用AI產品提供的API實現。
(ps: 市面上有許多專注于 實體識別(NER)、關系抽取(RE)、事件抽取(EE)的產品,例如 KnowLM/README_ZH.md at main · zjunlp/KnowLM · GitHub)

向量數據
以 chromadb為例
Running Chroma - ChromaDB Cookbook | The Unofficial Guide to ChromaDB
docker安裝
# Start the chromadb/chroma:latest image detached as container "chromadb":
# host port 8100 is mapped to container port 8000, host directory /root/chromadb/db_data is mounted
# at /chroma/chroma, and IS_PERSISTENT / ANONYMIZED_TELEMETRY are both set to TRUE.
docker run -d --name chromadb -p 8100:8000 -v /root/chromadb/db_data:/chroma/chroma -e IS_PERSISTENT=TRUE -e ANONYMIZED_TELEMETRY=TRUE chromadb/chroma:latest
Java SDK
https://github.com/amikos-tech/chromadb-java-client
chroma有多種語言的sdk(Chroma Ecosystem Clients - ChromaDB Cookbook | The Unofficial Guide to ChromaDB),
此處使用java
- 導入chromadb-sdk依賴,pom.xml
<dependency>
<groupId>io.github.amikos-tech</groupId>
<artifactId>chromadb-java-client</artifactId>
<version>0.1.5</version>
</dependency>
- 由于使用智譜AI向量化接口,還需要導入智譜SDK
<dependency>
<groupId>cn.bigmodel.openapi</groupId>
<artifactId>oapi-java-sdk</artifactId>
<version>release-V4-2.1.0</version>
</dependency>
- 先簡單包裝下向量化接口
package com.jenson;
import com.zhipu.oapi.ClientV4;
import com.zhipu.oapi.service.v4.embedding.EmbeddingApiResponse;
import com.zhipu.oapi.service.v4.embedding.EmbeddingRequest;
import com.zhipu.oapi.service.v4.embedding.EmbeddingResult;
import java.util.concurrent.TimeUnit;
/**
 * Static helper that wraps the ZhiPu AI embeddings call.
 *
 * @author Jenson
 * @version 1.0
 * @date 2024/8/12 10:40 AM
 */
public class ZhiPuUtils {

    /**
     * Embeds a piece of text with ZhiPu AI.
     *
     * <p>A fresh {@link ClientV4} is built on every call; each of the four numeric
     * network-config values is 300 and the unit is seconds.
     *
     * @param apiKey  ZhiPu API key
     * @param model   model name
     * @param content text to embed
     * @return the {@code data} part of the API response, passed through unchanged
     */
    public static EmbeddingResult embedding(String apiKey, String model, String content) {
        // Five minutes, used for every network-config slot.
        int fiveMinutes = 60 * 5;

        ClientV4.Builder clientBuilder = new ClientV4.Builder(apiKey);
        ClientV4 zhiPuClient = clientBuilder
                .networkConfig(fiveMinutes, fiveMinutes, fiveMinutes, fiveMinutes, TimeUnit.SECONDS)
                .build();

        EmbeddingRequest embeddingRequest = EmbeddingRequest.builder()
                .model(model)
                .input(content)
                .build();

        return zhiPuClient.invokeEmbeddingsApi(embeddingRequest).getData();
    }
}
- 實現
tech.amikos.chromadb.EmbeddingFunction
chroma-sdk 的現有實現不包含國產AI的,得自己寫一下,簡單弄一下就好
package com.jenson;
import com.zhipu.oapi.service.v4.embedding.Embedding;
import com.zhipu.oapi.service.v4.embedding.EmbeddingResult;
import tech.amikos.chromadb.EmbeddingFunction;
import java.util.ArrayList;
import java.util.List;
/**
 * Chroma {@link EmbeddingFunction} implementation backed by the ZhiPu AI embeddings API.
 *
 * <p>Each document is embedded with its own call to {@code ZhiPuUtils.embedding}.
 *
 * @author Jenson
 * @version 1.0
 */
public class ZhiPuAIEmbeddingFunction implements EmbeddingFunction {

    /** Common prefix of every failure message raised by this class. */
    private static final String FAILURE_PREFIX = "智譜AI向量化失敗,";

    private final String apiKey;
    private final String defaultModel;

    /**
     * @param apiKey       ZhiPu API key used for every request
     * @param defaultModel model used when the caller does not name one
     */
    public ZhiPuAIEmbeddingFunction(String apiKey,
                                    String defaultModel) {
        this.apiKey = apiKey;
        this.defaultModel = defaultModel;
    }

    /**
     * Embeds the documents with the default model.
     *
     * @param documents texts to embed
     * @return one vector per document, in input order
     */
    @Override
    public List<List<Float>> createEmbedding(List<String> documents) {
        return this.createEmbedding(documents, this.defaultModel);
    }

    /**
     * Embeds the documents with the given model.
     *
     * @param documents texts to embed; {@code null} or empty yields an empty result
     * @param model     model name
     * @return one vector per document, in input order
     * @throws RuntimeException if ZhiPu reports an error or returns no embedding for a document
     */
    @Override
    public List<List<Float>> createEmbedding(List<String> documents, String model) {
        if (documents == null || documents.isEmpty()) {
            return new ArrayList<>();
        }
        List<List<Float>> vectors = new ArrayList<>(documents.size());
        for (String document : documents) {
            EmbeddingResult embeddingResult = ZhiPuUtils.embedding(this.apiKey, model, document);
            // The helper returns the response payload as-is, so guard against a missing payload
            // before dereferencing it.
            if (embeddingResult == null) {
                throw new RuntimeException(FAILURE_PREFIX + "no result returned");
            }
            if (embeddingResult.getError() != null) {
                throw new RuntimeException(FAILURE_PREFIX + embeddingResult.getError().toString());
            }
            List<Embedding> embeddings = embeddingResult.getData();
            if (embeddings == null || embeddings.isEmpty()) {
                throw new RuntimeException(FAILURE_PREFIX + "no embedding data returned");
            }
            List<Double> values = embeddings.get(0).getEmbedding();
            List<Float> vector = new ArrayList<>(values.size());
            for (Double value : values) {
                // Direct narrowing conversion; no need to round-trip through a String.
                vector.add(value.floatValue());
            }
            vectors.add(vector);
        }
        return vectors;
    }
}
- 包裝chromadb的增刪改查,此例只簡單包裝了下新增和查詢
package com.jenson;
import com.google.gson.internal.LinkedTreeMap;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.codec.digest.DigestUtils;
import tech.amikos.chromadb.Client;
import tech.amikos.chromadb.Collection;
import tech.amikos.chromadb.EmbeddingFunction;
import tech.amikos.chromadb.handler.ApiException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
/**
* Chroma Db Client
*
* @author Jenson
* @version 1.0
*/
@Slf4j
public class ChromaClient {
private Client client;
private EmbeddingFunction embeddingFunction;
public ChromaClient(String basePath, EmbeddingFunction embeddingFunction) {
this.client = new Client(basePath);
this.embeddingFunction = embeddingFunction;
}
/**
* 保存數(shù)據(jù)
*
* @param collectionName 集合
* @param document 文檔
* @param metadata 元數(shù)據(jù)
*/
public void save(String collectionName, String document, Map<String, String> metadata) {
Collection collection = this.getCollection(collectionName);
List<Map<String, String>> metadataList = new ArrayList<>();
if (metadata != null && !metadata.isEmpty()) {
metadataList.add(metadata);
}
try {
String id = DigestUtils.md5Hex((document).getBytes(StandardCharsets.UTF_8));
collection.add(
null,
metadataList,
Collections.singletonList(document),
Collections.singletonList(id));
} catch (ApiException e) {
log.error("wrong to add chroma document, collection : " + collectionName + ",", e);
throw new RuntimeException("wrong to add chroma document, collection : " + collectionName);
}
}
/**
* 獲取集合
*
* @param collectionName 集合
* @return 集合
*/
public Collection getCollection(String collectionName) {
if (collectionName == null || collectionName.isEmpty()) {
throw new RuntimeException("function getCollection param collectionName is required");
}
Collection collection = null;
try {
Map<String, String> metadata = new LinkedTreeMap<>();
metadata.put("hnsw:space", "cosine");
metadata.put("embedding_function", embeddingFunction.getClass().getName());
collection = client.createCollection(
collectionName,
// 自定義向量空間的距離計(jì)算方法
metadata,
true,
this.embeddingFunction);
} catch (ApiException e) {
log.error("wrong to get chroma collection : " + collectionName + ",", e);
throw new RuntimeException("wrong to get chroma collection : " + collectionName);
}
return collection;
}
}
存儲
package com.jenson;
import tech.amikos.chromadb.handler.ApiException;
import java.util.HashMap;
/**
 * Demo entry point: stores one article in the "test" Chroma collection, with the list of
 * people (entities) it mentions attached as metadata under the key "names".
 *
 * @author Jenson
 * @version 1.0
 * @date 2024/8/12 10:38 AM
 */
public class Main {
    public static void main(String[] args) throws ApiException {
        ChromaClient chromaClient = new ChromaClient(
                "http://localhost:8100",
                new ZhiPuAIEmbeddingFunction("zhipu_apikey", "embedding-2")
        );

        // Article text
        String article = "《紅樓夢(mèng)》書敘西方靈河岸上三生石畔的絳珠仙子,為了酬報(bào)神瑛侍者的灌溉之恩,要將畢生的淚水償還,就隨其下凡歷劫。寶玉為神瑛侍者轉(zhuǎn)世,林黛玉為絳珠仙子轉(zhuǎn)世,這段姻緣稱為“木石前盟”。遠(yuǎn)古女媧煉石補(bǔ)天遺下的頑石,通靈性,為賈寶玉出世時(shí)所銜的“通靈寶玉”,“通靈寶玉”歷盡世間辛酸悲歡的故事,就是《石頭記》,亦即《紅樓夢(mèng)》。\n《紅樓夢(mèng)》故事紛紜復(fù)雜,其較大的事件有:黛玉喪母,進(jìn)京依附外祖母;寶玉母姨及其子薛蟠、女薛寶釵進(jìn)駐賈府;寶玉在秦可卿臥房午覺,夢(mèng)游太虛幻境,看了”金陵十二釵正冊(cè)”;王熙鳳毒設(shè)相思局,致賈瑞命歸黃泉;秦可卿病亡,公公哭得如淚人一般;賈元春加封賢德妃,獲準(zhǔn)省親,元春題名別院為“大觀園”;寶、黛二人于沁芳閘共賞 《會(huì)真記》,寶玉以張生、鶯鶯喻己喻人,黛玉感極生嗔;王夫人怒逐金釧,金釧不堪受辱,投井身亡;寶玉事發(fā),賈政痛笞寶玉;探春發(fā)起組織海棠詩社,此時(shí)邢岫煙、李紋、薛寶琴等同時(shí)入駐賈府,彼等均具詩才,大觀園比前更加熱鬧;劉姥姥攜外孫板兒進(jìn)榮府,深得賈母歡心;紫鵑戲說黛玉將回蘇州,寶玉呆癥大發(fā);賈璉垂涎尤氏姐妹,偷娶尤二姐;尤二姐為鳳姐所害,誤服虎狼藥,吞金自盡;尤三姐殉情飲劍身亡;賈赦欲討鴛鴦為妾,鴛鴦哭訴賈母,賈母申斥賈赦夫婦;王夫人、鳳姐夜抄大觀園,司棋、晴雯被攆;晴雯病亡,寶玉心痛如絞,作《芙蓉女兒誄》以祭;迎春嫁了“中山狼”孫紹祖,受盡凌辱而死;薛蟠吃酒打死酒店當(dāng)槽被擒拿;夏金桂誤飲毒藥湯,自取滅亡;元妃薨逝,通靈寶玉丟失,寶玉喪魂失魄;鳳姐奇設(shè)調(diào)包計(jì),黛玉聞知寶玉娶了寶釵,魂歸離恨天,寶玉于瀟湘館痛祭黛玉,紫鵑細(xì)訴黛玉臨終情景;薛寶琴史湘云相繼出嫁;錦衣軍奉旨查抄賈府;賈母逝世,鴛鴦上吊身亡;鳳姐病重,臨終托劉姥姥照看巧姐;寶玉魂魄隨和尚重游太虛幻境,見到眾多已離人世的姐妹;寶玉、賈蘭叔侄赴考,出考場(chǎng),寶玉旋即迷失;賈政途遇寶玉與一僧一道飄然而去,圣上賜寶玉“文妙真人”道號(hào);襲人嫁與蔣玉菡;賈雨村和甄士隱執(zhí)手?jǐn)⑴f,言榮寧二府,將會(huì)蘭桂齊芳,家道復(fù)初;僧道攜寶玉到青埂峰下,仍將玉放在女媧補(bǔ)天之處,各自云游。";

        // People (entity) list.
        // A plain HashMap is used instead of double-brace initialization: the latter creates an
        // anonymous subclass, and Gson documents that anonymous classes are skipped during
        // serialization. NOTE(review): the Chroma client appears to serialize with Gson - confirm.
        HashMap<String, String> metadata = new HashMap<>();
        metadata.put("names", "賈璉,夏金桂,秦可卿,蔣玉菡,賈雨村,和尚,王夫人,迎春,寶玉,劉姥姥,王熙鳳,公公,尤二姐,甄士隱,金釧,賢德妃,探春,鴛鴦,賈府,外祖母,賈政,寶釵,薛寶琴,通靈寶玉,自取滅亡,賈寶玉,司棋,絳珠仙子,賈元春,邢岫煙,黛玉,史湘云,賈蘭,孫紹祖,李紋,鳳姐,板兒,賈瑞,酒店當(dāng)槽,薛寶釵,薛蟠,紫鵑,晴雯,賈赦,林黛玉,襲人,錦衣軍,神瑛侍者");

        chromaClient.save("test", article, metadata);
    }
}
查詢
package com.jenson;
import tech.amikos.chromadb.Collection;
import tech.amikos.chromadb.handler.ApiException;
import tech.amikos.chromadb.model.QueryEmbedding;
import java.util.ArrayList;
import java.util.Collections;
/**
 * Demo entry point: queries the "test" Chroma collection with a piece of user text and prints
 * the response (up to 10 results with metadata, documents and distances).
 *
 * @author Jenson
 * @version 1.0
 * @date 2024/8/12 10:38 AM
 */
public class Main {
    public static void main(String[] args) throws ApiException {
        ChromaClient chromaClient = new ChromaClient(
                "http://localhost:8100",
                new ZhiPuAIEmbeddingFunction("zhipu_apikey", "embedding-2")
        );

        // A plain ArrayList is used instead of double-brace initialization: the latter creates an
        // anonymous subclass, and Gson documents that anonymous classes are skipped during
        // serialization. NOTE(review): the client appears to use Gson (the error quoted below is
        // a Gson message) - confirm.
        ArrayList<QueryEmbedding.IncludeEnum> include = new ArrayList<>();
        // EMBEDDINGS is deliberately not requested; including it fails with:
        // Expected a double but was BEGIN_ARRAY at line 1 column 300 path $.embeddings[0][0]
        include.add(QueryEmbedding.IncludeEnum.METADATAS);
        include.add(QueryEmbedding.IncludeEnum.DOCUMENTS);
        // The smaller the distance, the closer the match.
        include.add(QueryEmbedding.IncludeEnum.DISTANCES);

        Collection.QueryResponse queryResponse = chromaClient.getCollection("test").query(
                // Text entered by the user
                Collections.singletonList("夢(mèng)游太虛幻境"),
                10,
                null,
                null,
                include
        );
        System.out.println(queryResponse);
    }
}
圖數據庫
圖數據庫使用neo4j為例
安裝
安裝社區版,Neo4j Deployment Center - Graph Database & Analytics

我安裝的是Windows版的,解壓后執行 bin 下的啟動文件
> neo4j console
日志中會(huì)包含以下兩條
2024-08-12 08:14:30.383+0000 INFO Bolt enabled on localhost:7687.
2024-08-12 08:14:30.880+0000 INFO HTTP enabled on localhost:7474.
數據庫鏈接端口為 7687
web端口為 7474
默認賬號密碼為 neo4j / neo4j
web端初次登錄會被要求修改密碼
Java 連接
使用java連接,安裝驅動依賴
<dependency>
<groupId>org.neo4j.driver</groupId>
<artifactId>neo4j-java-driver</artifactId>
<version>5.23.0</version>
</dependency>
創建人物(實體) - 關系
若要節點不重復,可以創建唯一性約束(以某個屬性為唯一)
CREATE CONSTRAINT 約束名稱 FOR (n:Label) REQUIRE n.屬性 IS UNIQUE
例:
// Uniqueness constraint: no two Person nodes may share the same name.
CREATE CONSTRAINT FOR (n:Person) REQUIRE n.name IS UNIQUE
使用了拼接CQL語句的方式
使用MERGE來建立關系,可以避免創建重復關系
package com.jenson;
import org.neo4j.driver.*;
import org.neo4j.driver.Record;
import org.neo4j.driver.exceptions.ClientException;
import java.util.*;
/**
 * Demo entry point: parses "[a,b,relation]" triples separated by ';' and writes them to Neo4j
 * as Person nodes connected by RELATION relationships.
 *
 * <p>Both nodes and relationships are written with MERGE, so running the program again does not
 * create duplicates. All names are passed as query parameters rather than spliced into the
 * Cypher text, so quotes or other special characters in AI-extracted names cannot break or
 * alter the statement.
 *
 * @author Jenson
 * @version 1.0
 * @date 2024/8/2 10:36 AM
 */
public class Main {
    public static void main(String[] args) {
        // Connection URI, user name and password
        String uri = "bolt://127.0.0.1:7687";
        String user = "neo4j";
        String password = "Aa123456";
        String entityLabel = "Person";
        String relateLabel = "RELATION";

        String s = "[絳珠仙子,神瑛侍者,恩人];[寶玉,神瑛侍者,轉(zhuǎn)世];[林黛玉,絳珠仙子,轉(zhuǎn)世];[寶玉,林黛玉,姻緣];[賈寶玉,通靈寶玉,擁有者];[黛玉,外祖母,依附];[薛蟠,薛寶釵,兄妹];[寶玉,秦可卿,夢(mèng)中相遇];[王熙鳳,賈瑞,致死];[秦可卿,公公,親情];[賈元春,賢德妃,身份];[寶玉,黛玉,共賞];[王夫人,金釧,主仆];[寶玉,賈政,父子];[探春,邢岫煙,同住];[探春,李紋,同住];[探春,薛寶琴,同住];[劉姥姥,板兒,祖孫];[紫鵑,黛玉,主仆];[賈璉,尤二姐,婚姻];[鳳姐,尤二姐,致死];[賈赦,鴛鴦,求娶];[王夫人,鳳姐,夜抄];[司棋,晴雯,被攆];[寶玉,晴雯,祭奠];[迎春,孫紹祖,婚姻];[薛蟠,酒店當(dāng)槽,致死];[夏金桂,自取滅亡,結(jié)果];[寶玉,通靈寶玉,丟失];[寶玉,寶釵,婚姻];[黛玉,寶玉,得知娶寶釵];[寶玉,紫鵑,詢問];[薛寶琴,史湘云,出嫁];[錦衣軍,賈府,查抄];[鳳姐,劉姥姥,托付];[寶玉,和尚,重游];[寶玉,賈蘭,叔侄];[賈政,寶玉,途遇];[襲人,蔣玉菡,婚姻];[賈雨村,甄士隱,敘舊]";

        // Parse the triples. A list (rather than a map keyed by the two names) keeps every
        // relation, even when the same pair of people appears with more than one relation.
        Set<String> entities = new LinkedHashSet<>();
        List<String[]> triples = new ArrayList<>();
        for (String rawTriple : s.split(";")) {
            String[] parts = rawTriple.replace("[", "").replace("]", "").split(",");
            if (parts.length != 3) {
                throw new IllegalArgumentException(
                        "malformed triple, expected [a,b,relation] but got : " + rawTriple);
            }
            for (int i = 0; i < parts.length; i++) {
                parts[i] = parts[i].trim();
            }
            entities.add(parts[0]);
            entities.add(parts[1]);
            triples.add(parts);
        }

        // Labels and relationship types cannot be Cypher parameters, so those two fixed values
        // are concatenated; everything data-derived goes in as a parameter.
        String mergeNode = "MERGE (n:" + entityLabel + " {name: $name})";
        String mergeRelation = "MATCH (a:" + entityLabel + " {name: $a}), (b:" + entityLabel + " {name: $b}) "
                + "MERGE (a)-[r:" + relateLabel + " {name: $r}]->(b)";

        // try-with-resources closes the session and the driver even if a statement fails.
        try (Driver driver = GraphDatabase.driver(uri, AuthTokens.basic(user, password));
             Session session = driver.session()) {
            // Create nodes (MERGE creates the node only if it does not exist yet).
            for (String name : entities) {
                session.run(mergeNode, Values.parameters("name", name)).consume();
            }
            // Create relationships.
            for (String[] triple : triples) {
                session.run(mergeRelation,
                        Values.parameters("a", triple[0], "b", triple[1], "r", triple[2])).consume();
            }
        }
    }
}
查詢
- 查詢全部
// Return every node labelled Person.
match(n:Person) return *

- 查詢指定人物的n層關系
// From each Person whose name is in the list, follow 1 to 2 outgoing RELATION relationships
// and return the start node, the traversed relationships and the node reached.
match(a:Person)-[r1:RELATION*1..2]->(x) where a.name in ['林黛玉','宋江'] return a,r1,x
