如何開發一個elasticsearch分詞插件

參考IK插件,介紹如何開發一款簡單的ES分詞插件。github地址:https://github.com/tenlee2012/elasticsearch-analysis-demo

項目配置

1. 創建pom項目

pom文件大概如下:

<?xml version="1.0" encoding="UTF-8"?>

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <name>elasticsearch-analysis-demo</name>
    <modelVersion>4.0.0</modelVersion>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-analysis-demo</artifactId>
    <version>${elasticsearch.version}</version>
    <packaging>jar</packaging>
    <description>Demo Custom Analyzer for Elasticsearch</description>
    <inceptionYear>2020</inceptionYear>

    <properties>
        <elasticsearch.version>7.7.1</elasticsearch.version>
        <jackson.version>2.10.4</jackson.version>
        <maven.compiler.target>1.8</maven.compiler.target>
        <elasticsearch.plugin.name>analysis-demo</elasticsearch.plugin.name>
        <elasticsearch.plugin.classname>org.elasticsearch.plugin.analysis.demo.AnalysisDemoPlugin</elasticsearch.plugin.classname>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.elasticsearch</groupId>
            <artifactId>elasticsearch</artifactId>
            <version>${elasticsearch.version}</version>
           <!-- 此處的scope只用在provided就可以了,不參與打包 -->
            <scope> provided </scope>
        </dependency>
       
        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-api</artifactId>
            <version>2.3</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <resources>
            <resource>
                <directory>src/main/resources</directory>
                <filtering>false</filtering>
                <excludes>
                    <!-- 這個文件是es插件描述文件,不用打包到jar包裡面 -->
                    <exclude>plugin-descriptor.properties</exclude>
                </excludes>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.7.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- 把package生成的jar包再打包成zip文件,es插件是zip文件格式 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.1.0</version>
                <configuration>
                    <finalName>analysis-http</finalName>
                    <appendAssemblyId>false</appendAssemblyId>
                    <outputDirectory>target</outputDirectory>
                    <descriptors>
                        <!-- 該插件的配置文件 -->
                        <descriptor>src/main/assembly/plugin.xml</descriptor>
                    </descriptors>
                </configuration>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- 配置文件複製,用於把 config目錄下的文件複製到target目錄下 -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-resources-plugin</artifactId>
                <version>2.4.3</version>
                <configuration>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
                <executions>
                    <execution>
                        <id>copy-spring-boot-resources</id>
                        <!-- here the phase you need -->
                        <phase>validate</phase>
                        <goals>
                            <!-- 資源文件配置 -->
                            <goal>copy-resources</goal>
                        </goals>
                        <configuration>
                            <encoding>utf-8</encoding>
                            <outputDirectory>${basedir}/target/config</outputDirectory>
                            <resources>
                                <resource>
                                    <directory>${basedir}/config</directory>
                                    <includes>
                                        <include>*</include>
                                    </includes>
                                </resource>
                            </resources>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

2. assembly 插件配置

文件路徑src/main/assembly/plugin.xml

<?xml version="1.0"?>
<assembly>
    <id>analysis-http-release</id>
    <formats>
        <format>zip</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <fileSets>
        <fileSet>
            <directory>${project.basedir}/config</directory>
            <outputDirectory>config</outputDirectory>
        </fileSet>
    </fileSets>

    <files>
        <file>
            <source>${project.basedir}/src/main/resources/plugin-descriptor.properties</source>
            <outputDirectory/>
            <filtered>true</filtered>
        </file>
        <file>
            <source>${project.basedir}/src/main/resources/plugin-security.policy</source>
            <outputDirectory/>
            <filtered>true</filtered>
        </file>
    </files>
    <dependencySets>
        <dependencySet>
            <outputDirectory/>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <excludes>
                <exclude>org.elasticsearch:elasticsearch</exclude>
            </excludes>
        </dependencySet>
        <dependencySet>
            <outputDirectory/>
            <useProjectArtifact>true</useProjectArtifact>
            <useTransitiveFiltering>true</useTransitiveFiltering>
            <includes>
                <include>com.fasterxml.jackson.core:jackson-databind</include>
            </includes>
            <excludes>
                <!-- 會和 es 自帶的衝突 -->
                <exclude>com.fasterxml.jackson.core:jackson-core</exclude>
            </excludes>
        </dependencySet>
    </dependencySets>
</assembly>

3. 插件描述文件

resources目錄下,plugin-descriptor.properties文件。

該描述文件供es校驗和使用,參考 https://www.elastic.co/guide/en/elasticsearch/plugins/master/plugin-authors.html#_plugin_descriptor_file

字段 類型 描述
description String simple summary of the plugin
version String plugin’s version
name String the plugin name
classname String the name of the class to load, fully-qualified.
java.version String version of java the code is built against. Use the system property java.specification.version. Version string must be a sequence of nonnegative decimal integers separated by "."'s and may have leading zeros.
elasticsearch.version String 對應的 Elasticsearch 版本。

ik插件plugin-descriptor.properties的配置是從pom.xml的properties中讀取的,這樣維護更方便,打包時會替換掉占位符,
如下:

# Elasticsearch plugin descriptor file
# This file must exist as 'plugin-descriptor.properties' at
# the root directory of all plugins.
#
# A plugin can be 'site', 'jvm', or both.
#
### example site plugin for "foo":
#
# foo.zip <-- zip file for the plugin, with this structure:
#   _site/ <-- the contents that will be served
#   plugin-descriptor.properties <-- example contents below:
#
# site=true
# description=My cool plugin
# version=1.0
#
### example jvm plugin for "foo"
#
# foo.zip <-- zip file for the plugin, with this structure:
#   <arbitrary name1>.jar <-- classes, resources, dependencies
#   <arbitrary nameN>.jar <-- any number of jars
#   plugin-descriptor.properties <-- example contents below:
#
# jvm=true
# classname=foo.bar.BazPlugin
# description=My cool plugin
# version=2.0.0-rc1
# elasticsearch.version=2.0
# java.version=1.7
#
### mandatory elements for all plugins:
#
# 'description': simple summary of the plugin
description=${project.description}
#
# 'version': plugin's version
version=${project.version}
#
# 'name': the plugin name
name=${elasticsearch.plugin.name}
#
# 'classname': the name of the class to load, fully-qualified.
classname=${elasticsearch.plugin.classname}
#
# 'java.version' version of java the code is built against
# use the system property java.specification.version
# version string must be a sequence of nonnegative decimal integers
# separated by "."'s and may have leading zeros
java.version=${maven.compiler.target}
#
# 'elasticsearch.version' version of elasticsearch compiled against
# You will have to release a new version of the plugin for each new
# elasticsearch release. This version is checked when the plugin
# is loaded so Elasticsearch will refuse to start in the presence of
# plugins with the incorrect elasticsearch.version.
elasticsearch.version=${elasticsearch.version}

4. 權限聲明文件

resources目錄下,文件名為plugin-security.policy。

受jdk安全策略限制,必須聲明項目使用的權限

grant {
  // needed because of the hot reload functionality
  permission java.net.SocketPermission "*", "accept,connect,resolve"; // network access
  permission java.lang.RuntimePermission "getClassLoader"; // required by some libraries, e.g. okhttp, fastjson
  permission java.net.NetPermission "getProxySelector"; // network access
  permission java.lang.RuntimePermission "accessDeclaredMembers"; // serialization and deserialization, e.g. jackson, fastjson
  permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; // serialization and deserialization, e.g. jackson, fastjson
};

開發

1. 插件入口類

plugin-descriptor.properties文件的classname屬性配置的就是插件入口類。
ik的入口類是org.elasticsearch.plugin.analysis.ik.AnalysisIkPlugin,比較簡單,繼承Plugin並實現了AnalysisPlugin接口,主要聲明了tokenizer和analyzer的名稱,可參考ik代碼。

/**
 * Plugin entry point: registers the demo tokenizer and the demo analyzer.
 *
 * <p>The {@code classname} entry of {@code plugin-descriptor.properties} must be the
 * fully-qualified name of this class.
 * NOTE(review): the pom property {@code elasticsearch.plugin.classname} shown in the project
 * configuration names {@code ...demo.AnalysisDemoPlugin}; confirm it matches this class.
 */
public class MyAnalysisPlugin extends Plugin implements AnalysisPlugin {

    /**
     * Registers the tokenizers contributed by this plugin.
     *
     * @return map from tokenizer name ({@code demo_tokenizer}) to the provider of its factory
     */
    @Override
    public Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> getTokenizers() {
        Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> extra = new HashMap<>();
        extra.put("demo_tokenizer", MyTokenizerFactory::getTokenizerFactory);
        return extra;
    }

    /**
     * Registers the analyzers contributed by this plugin.
     *
     * @return map from analyzer name ({@code demo_analyzer}) to the provider of its
     *         {@code AnalyzerProvider}
     */
    @Override
    public Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> getAnalyzers() {
        Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<? extends Analyzer>>> extra = new HashMap<>();
        // Typed method reference instead of a raw AnalysisProvider returning Object.
        // Assumes MyAnalyzerProvider.getAnalyzerProvider returns an AnalyzerProvider subtype -- TODO confirm.
        extra.put("demo_analyzer", MyAnalyzerProvider::getAnalyzerProvider);
        return extra;
    }
}

2. tokenizer怎么寫

MyTokenizerFactory

/**
 * Factory that builds {@code MyTokenizer} instances from one shared configuration.
 */
public class MyTokenizerFactory extends AbstractTokenizerFactory {

    // NOTE(review): MyTokenizer's constructor takes a Configuration; presumably
    // MyConfiguration is a subtype of it -- confirm.
    private final MyConfiguration configuration;

    /**
     * Creates the factory and the configuration handed to every tokenizer it creates.
     *
     * @param indexSettings index settings
     * @param name          analyzer or tokenizer name; for a custom analyzer, the custom analyzer's name
     * @param env           Elasticsearch environment
     * @param settings      settings of the custom analyzer
     */
    public MyTokenizerFactory(IndexSettings indexSettings, String name, Environment env, Settings settings) {
        super(indexSettings, settings, name);
        this.configuration = new MyConfiguration(indexSettings, name, env, settings);
    }

    /**
     * Static factory with the parameter order of {@code AnalysisModule.AnalysisProvider#get},
     * so it can be registered from the plugin entry class.
     *
     * @return a new tokenizer factory
     */
    public static TokenizerFactory getTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
        // The former trailing ".setSmart(false)" was removed: this class declares no such method.
        return new MyTokenizerFactory(indexSettings, name, environment, settings);
    }

    /**
     * @return a new tokenizer backed by this factory's configuration
     */
    @Override
    public Tokenizer create() {
        return new MyTokenizer(configuration);
    }
}

核心來了,它就是MyTokenizer
MyTokenizer繼承org.apache.lucene.analysis.Tokenizer,同時必須是final類型,不然啟動會報錯。
代碼示例如下

/**
 * Lucene tokenizer that emits the terms produced by {@link AnalyzeContext}.
 *
 * <p>The class is declared {@code final}; the accompanying article states that
 * Elasticsearch fails at startup otherwise.
 */
public final class MyTokenizer extends Tokenizer {
    // Term text attribute.
    private final CharTermAttribute termAtt;
    // Term offset attribute.
    private final OffsetAttribute offsetAtt;
    // Position increment attribute.
    private final PositionIncrementAttribute positionAttr;

    /**
     * Running offset added to every term offset. It grows by the context's consumed
     * length each time the input is exhausted, is left untouched by {@link #reset()},
     * and is zeroed in {@link #end()}.
     */
    private int totalOffset = 0;

    private final AnalyzeContext analyzeContext;

    /**
     * @param configuration segmentation configuration passed on to the {@link AnalyzeContext}
     */
    public MyTokenizer(Configuration configuration) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        positionAttr = addAttribute(PositionIncrementAttribute.class);

        // The real reader is supplied later through reset().
        analyzeContext = new AnalyzeContext(input, configuration);
    }

    /**
     * Advances to the next non-blank term.
     *
     * @return {@code true} if a term was emitted, {@code false} when the input is exhausted
     * @throws IOException if reading the input fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        // Skip blank terms so that whitespace is never indexed.
        Term term = analyzeContext.next();
        while (term != null && TextUtility.isBlank(term.getText())) {
            term = analyzeContext.next();
        }

        if (term == null) {
            totalOffset += analyzeContext.offset;
            return false;
        }

        // Skipped blank terms do not widen the gap: the increment is always 1.
        positionAttr.setPositionIncrement(1);
        termAtt.setEmpty().append(term.getText());
        int start = totalOffset + term.getOffset();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + term.getText().length()));
        return true;
    }

    /**
     * Publishes the final offset and clears the running offset.
     *
     * @throws IOException propagated from {@code super.end()}
     */
    @Override
    public void end() throws IOException {
        super.end();
        // Route the final offset through correctOffset, exactly like the term offsets
        // in incrementToken, so it stays valid when a CharFilter altered the input.
        int finalOffset = correctOffset(totalOffset);
        offsetAtt.setOffset(finalOffset, finalOffset);
        totalOffset = 0;
    }

    /**
     * Points the analyze context at the current input. Author's note: this override is
     * mandatory, otherwise bulk indexing of documents fails.
     *
     * @throws IOException propagated from {@code super.reset()}
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        analyzeContext.reset(new BufferedReader(this.input));
    }
}

分詞類AnalyzeContext

分詞類負責讀取文本,並將文本分詞:

/**
 * Reads the input in chunks of at most {@code BUFFER_SIZE} chars, cuts each full chunk
 * at the last sentence delimiter, segments the chunk and hands out the resulting
 * terms one at a time.
 */
public class AnalyzeContext {

    /** Buffer size in chars. */
    private static final int BUFFER_SIZE = 4096;

    /**
     * Sentence delimiters. The original listing added the ASCII forms of '!', ',', '?'
     * and ';' twice; the duplicates are presumed to be the full-width forms lost in
     * transcription, so both forms are registered here -- TODO confirm.
     */
    private static final Set<Character> delimiterCharSet;

    static {
        Set<Character> delimiters = new HashSet<>();
        delimiters.add('\r');
        delimiters.add('\n');
        delimiters.add('。');
        delimiters.add('!');
        delimiters.add('!');
        delimiters.add(',');
        delimiters.add(',');
        delimiters.add('?');
        delimiters.add('?');
        delimiters.add(';');
        delimiters.add(';');
        delimiterCharSet = java.util.Collections.unmodifiableSet(delimiters);
    }

    /** Input being tokenized. */
    private Reader input;
    /** Segmentation configuration. */
    private final Configuration configuration;
    /** Terms of the chunk currently being handed out. */
    private Iterator<Term> iterator;
    /**
     * Number of chars consumed so far. Term offsets are relative to their chunk, so this
     * value is added to each of them. Package-private: MyTokenizer reads it.
     */
    int offset;
    /** Read buffer. */
    private final char[] buffer = new char[BUFFER_SIZE];
    /** Number of unprocessed chars carried over at the start of {@link #buffer}. */
    private int remainSize = 0;

    /**
     * @param reader        initial input
     * @param configuration segmentation configuration
     */
    public AnalyzeContext(Reader reader, Configuration configuration) {
        this.input = reader;
        this.configuration = configuration;
    }

    /**
     * Switches to a new input and discards all state of the previous one.
     *
     * @param reader the new input
     */
    public void reset(Reader reader) {
        input = reader;
        offset = 0;
        iterator = null;
        // Drop carry-over text too; otherwise the tail of an input that was not read
        // to the end would be prepended to the new one.
        remainSize = 0;
    }

    /**
     * Returns the next term, reading and segmenting further chunks as needed.
     *
     * @return the next term, or {@code null} when the input is exhausted
     * @throws IOException if reading the input fails
     */
    public Term next() throws IOException {
        // Keep draining the current chunk first.
        if (iterator != null && iterator.hasNext()) {
            return iterator.next();
        }

        String line;
        while ((line = readLine()) != null) {
            List<Term> termList = segment(line);
            int lineStart = offset;
            offset += line.length();
            // A chunk without terms is not the end of the input: keep reading, and
            // still count its length so later offsets stay correct.
            if (termList == null || termList.isEmpty()) {
                continue;
            }
            for (Term term : termList) {
                term.setOffset(term.getOffset() + lineStart);
            }
            iterator = termList.iterator();
            return iterator.next();
        }
        return null;
    }

    /**
     * Segments one chunk of text.
     *
     * @param line the chunk to segment
     * @return the terms of the chunk, with offsets relative to the chunk
     */
    private List<Term> segment(String line) {
        // TODO: replace the placeholder below with your own segmentation algorithm.
        return [你的分詞算法].getTextTokenizer(line, configuration);
    }

    /**
     * Reads the next chunk: fills the buffer behind the carried-over chars, cuts at the
     * position chosen by {@link #lastIndexOfEos} and carries the rest over.
     *
     * @return the next chunk, or {@code null} when nothing is left
     * @throws IOException if reading the input fails
     */
    private String readLine() throws IOException {
        int filled = remainSize;
        // Fill the whole buffer: a short read is not proof of end of input, and
        // cutting on it could split a sentence at an arbitrary place.
        while (filled < BUFFER_SIZE) {
            int read = input.read(buffer, filled, BUFFER_SIZE - filled);
            if (read <= 0) {
                // -1 is end of input; a reader returning 0 is treated as exhausted too.
                break;
            }
            filled += read;
        }
        if (filled == 0) {
            return null;
        }

        int eos = lastIndexOfEos(buffer, filled);
        String line = new String(buffer, 0, eos);
        remainSize = filled - eos;
        if (remainSize > 0) {
            // Move the unprocessed tail to the front of the buffer.
            System.arraycopy(buffer, eos, buffer, 0, remainSize);
        }
        return line;
    }

    /**
     * Chooses where to cut the buffered text.
     *
     * @param buffer the chars read so far
     * @param length number of valid chars in {@code buffer}
     * @return exclusive end index of the text to process now
     */
    private int lastIndexOfEos(char[] buffer, int length) {
        // A partially filled buffer is the last chunk: take all of it.
        if (length < BUFFER_SIZE) {
            return length;
        }
        for (int i = length - 1; i > 0; i--) {
            if (delimiterCharSet.contains(buffer[i])) {
                return i + 1;
            }
        }
        // No delimiter: cut at the buffer end, but never between the two halves of a
        // surrogate pair.
        if (Character.isHighSurrogate(buffer[length - 1])) {
            return length - 1;
        }
        return length;
    }
}

Term

/**
 * A single term (token) produced by the segmenter.
 */
public class Term implements Serializable {
    private static final long serialVersionUID = 1L;

    // Start offset of the term.
    private int offset;
    // NOTE(review): the original comment called this the term's "relative start
    // position", although the field is named end -- confirm the intended meaning.
    private int end;
    // Term text.
    private String text;
    // Term type.
    private String lexemeType;

    /** @return the start offset of the term */
    public int getOffset() {
        return offset;
    }

    /** @param offset the start offset of the term */
    public void setOffset(int offset) {
        this.offset = offset;
    }

    /** @return the value of the {@code end} field */
    public int getEnd() {
        return end;
    }

    /** @param end the value of the {@code end} field */
    public void setEnd(int end) {
        this.end = end;
    }

    /** @return the term text */
    public String getText() {
        return text;
    }

    /** @param text the term text */
    public void setText(String text) {
        this.text = text;
    }

    /** @return the term type */
    public String getLexemeType() {
        return lexemeType;
    }

    /** @param lexemeType the term type */
    public void setLexemeType(String lexemeType) {
        this.lexemeType = lexemeType;
    }
}

打包&安裝

打包

執行命令mvn clean package進行打包,target目錄下會生成zip包。

安裝

執行命令 bin/elasticsearch-plugin install [plugin_name],同意權限授權,即可。安裝本地zip包時需使用 file:// 路徑,例如 bin/elasticsearch-plugin install file:///path/to/target/analysis-http.zip。

最后編輯于
©著作權歸作者所有,轉載或內容合作請聯繫作者
【社區內容提示】社區部分內容疑似由AI輔助生成,瀏覽時請結合常識與多方信息審慎甄別。
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳並發布,文章內容僅代表作者本人觀點,簡書係信息發布平臺,僅提供信息存儲服務。

友情鏈接更多精彩內容