diff --git a/README.md b/README.md index 0268123..00b429d 100644 --- a/README.md +++ b/README.md @@ -8,26 +8,9 @@ Currently, only baseforms for german and english are implemented. Example: the german base form of `zurückgezogen` is `zurückziehen`. -## Versions - -| Plugin | Elasticsearch | Release date | -| --------- | --------------- | -------------| -| 2.2.1.1 | 2.2.1 | Jun 22, 2016 | -| 2.2.1.0 | 2.2.1 | Apr 23, 2016 | -| 1.4.0.0 | 1.4.0 | Feb 19, 2015 | -| 1.3.0.0 | 1.3.1 | Jul 30, 2014 | ## Installation -### Elasticsearch 2.x - - ./bin/plugin install http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-analysis-baseform/2.2.1.1/elasticsearch-analysis-baseform-2.2.1.1-plugin.zip -### Elasticsearch 1.x - - ./bin/plugin -install analysis-baseform -url http://xbib.org/repository/org/xbib/elasticsearch/plugin/elasticsearch-analysis-baseform/1.4.0.0/elasticsearch-analysis-baseform-1.4.0.0-plugin.zip -Do not forget to restart the node after installing. +Use Gradle to build the plugin, then install it with the `elasticsearch-plugin` command. Check `gradle.properties` for the supported Elasticsearch version. ## Project docs @@ -61,11 +44,14 @@ In the settings, set up a token filter of type "baseform" and language "de":: } By using such a tokenizer, the sentence -"Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet" + + "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet" + will be tokenized into -"Die", "Die", "Jahresfeier", "Jahresfeier", "der", "der", "Rechtsanwaltskanzleien", "Rechtsanwaltskanzlei", -"auf", "auf", "dem", "der", "Donaudampfschiff", "Donaudampfschiff", "hat", "haben", "viel", "viel", -"Ökosteuer", "Ökosteuer", "gekostet", "kosten" + + "Die", "Die", "Jahresfeier", "Jahresfeier", "der", "der", "Rechtsanwaltskanzleien", "Rechtsanwaltskanzlei", + "auf", "auf", "dem", "der", "Donaudampfschiff", "Donaudampfschiff", "hat", "haben", "viel", "viel", + "Ökosteuer", "Ökosteuer", "gekostet", "kosten" It is recommended to add the [Unique token filter](http://www.elasticsearch.org/guide/reference/index-modules/analysis/unique-tokenfilter.html) to skip tokens that occur more than once. @@ -115,6 +101,18 @@ this token stream will be produced:: As an alternative, separate dictionaries for `en-verbs` and `en-nouns` are available. +## Caching + +Baseform computation may increase your overall indexing time drastically when applied to billions of tokens. To speed this up, the plugin caches the mapping from a token to its array of baseform tokens; the cache size (in number of entries) is configurable. +When the size limit is reached, the cache is cleared and starts filling anew. Both the setting and the cache apply per node, so configure it in the `elasticsearch.yml` file: + +``` +# default: 8388608 entries +# minimum: 131072 entries +# baseform_max_cache_size: 8388608 +``` + + # License Elasticsearch Baseform Analysis Plugin @@ -148,3 +146,5 @@ and is distributed under CC-BY-SA http://creativecommons.org/licenses/by-sa/3.0/ The english baseforms are a modified version of the english.dict file of http://languagetool.org/download/snapshots/LanguageTool-20131115-snapshot.zip which is licensed under LGPL http://www.fsf.org/licensing/licenses/lgpl.html#SEC1 + +Thanks to GBI-Genios Deutsche Wirtschaftsdatenbank GmbH for adding the caching functionality.
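The caching strategy this patch adds to `BaseformTokenFilter` (see the Java changes further down) boils down to the pattern sketched here. This is a condensed illustration, not the plugin's literal code: the class and method names are made up, `lookup` stands in for the FSA dictionary lookup, and the real filter additionally shares one static cache per node and stores an empty sentinel for terms without a distinct baseform.

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;

// Clear-on-limit cache: instead of evicting entry by entry, drop the whole map
// once the entry counter passes the configured maximum and start over.
final class ClearOnLimitCache {
    private final long maxSize;
    private final AtomicLong count = new AtomicLong();
    private final AtomicBoolean needsClear = new AtomicBoolean(false);
    private volatile ConcurrentHashMap<CharSequence, CharSequence> cache = new ConcurrentHashMap<>();

    ClearOnLimitCache(long maxSize) {
        this.maxSize = maxSize;
    }

    CharSequence baseformOf(CharSequence term, Function<CharSequence, CharSequence> lookup) {
        if (needsClear.compareAndSet(true, false)) {
            cache = new ConcurrentHashMap<>(); // limit was reached: throw everything away
            count.set(0);
        }
        return cache.computeIfAbsent(term, t -> {
            if (count.incrementAndGet() > maxSize) {
                needsClear.set(true); // the cache is cleared on the next call
            }
            CharSequence baseform = lookup.apply(t);
            return baseform == null ? t : baseform; // cache negative results too
        });
    }
}
```

Clearing wholesale keeps the hot path to a single `computeIfAbsent` call; the trade-off is that the whole working set is occasionally recomputed.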
diff --git a/build.gradle b/build.gradle index 0091f1e..02573d6 100644 --- a/build.gradle +++ b/build.gradle @@ -1,60 +1,40 @@ -group = 'org.xbib.elasticsearch.plugin' -version = '2.2.0.0' - -ext { - pluginName = 'baseform' - pluginClassname = 'org.xbib.elasticsearch.plugin.baseform.AnalysisBaseformPlugin' - pluginDescription = 'Baseform plugin for Elasticsearch' - user = 'jprante' - name = 'elasticsearch-analysis-baseform' - scmUrl = 'https://github.com/' + user + '/' + name - scmConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' - scmDeveloperConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' - versions = [ - 'elasticsearch' : '2.2.0', - 'log4j': '2.5', - 'junit' : '4.12' - ] -} -println "Host: " + java.net.InetAddress.getLocalHost() -println "Gradle: " + gradle.gradleVersion + " JVM: " + org.gradle.internal.jvm.Jvm.current() + " Groovy: " + GroovySystem.getVersion() -println "Build: group: '${project.group}', name: '${project.name}', version: '${project.version}'" -println "Timestamp: " + java.time.Instant.now().atZone(java.time.ZoneId.systemDefault()).toString() - -buildscript { - repositories { - mavenLocal() - mavenCentral() - jcenter() - maven { - url "http://xbib.org/repository" - } - } - dependencies { - classpath 'org.ajoberstar:gradle-git:1.4.2' - classpath 'co.riiid:gradle-github-plugin:0.4.2' - classpath 'io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.5.3' - } +plugins { + id "org.sonarqube" version "2.5" + id "org.xbib.gradle.plugin.asciidoctor" version "1.5.4.1.0" + id "io.codearte.nexus-staging" version "0.7.0" } + +printf "Host: %s\nOS: %s %s %s\nJVM: %s %s %s %s\nGroovy: %s\nGradle: %s\n" + + "Build: group: ${project.group} name: ${project.name} version: ${project.version}\n", + InetAddress.getLocalHost(), + System.getProperty("os.name"), + System.getProperty("os.arch"), + System.getProperty("os.version"), + System.getProperty("java.version"), + System.getProperty("java.vm.version"), + System.getProperty("java.vm.vendor"), + System.getProperty("java.vm.name"), + GroovySystem.getVersion(), + gradle.gradleVersion + apply plugin: 'java' apply plugin: 'maven' apply plugin: 'signing' -apply plugin: 'co.riiid.gradle' +apply plugin: 'findbugs' +apply plugin: 'pmd' +apply plugin: 'checkstyle' +apply plugin: "jacoco" +apply plugin: 'org.xbib.gradle.plugin.asciidoctor' repositories { mavenCentral() - mavenLocal() - jcenter() - maven { - url "http://xbib.org/repository" - } } configurations { wagon - releaseJars { + distJars { extendsFrom runtime exclude group: 'org.elasticsearch' exclude module: 'lucene-core' @@ -63,27 +43,41 @@ configurations { exclude module: 'jackson-core' exclude module: 'jackson-dataformat-smile' exclude module: 'jackson-dataformat-yaml' + exclude module: 'log4j-api' } } +apply from: 'gradle/ext.gradle' +apply from: 'gradle/publish.gradle' +apply from: 'gradle/sonarqube.gradle' + dependencies { - compile "org.elasticsearch:elasticsearch:${versions.elasticsearch}" - testCompile "junit:junit:${versions.junit}" - testCompile "org.apache.logging.log4j:log4j-slf4j-impl:${versions.log4j}" - testCompile "org.apache.logging.log4j:log4j-core:${versions.log4j}" - releaseJars "${project.group}:${project.name}:${project.version}" - wagon 'org.apache.maven.wagon:wagon-ssh-external:2.10' + def without_hamcrest = { + exclude group: 'org.hamcrest', module: 'hamcrest-core' + } + compile "org.elasticsearch:elasticsearch:${project.property('elasticsearch.version')}" + compile 
"org.apache.logging.log4j:log4j-api:${project.property('log4j.version')}" + testCompile "junit:junit:${project.property('junit.version')}", without_hamcrest + testCompile "org.apache.logging.log4j:log4j-core:${project.property('log4j.version')}" + testCompile "org.elasticsearch.plugin:transport-netty4-client:${project.property('elasticsearch.version')}" + testCompile "org.elasticsearch.test:framework:${project.property('elasticsearch.version')}" + testCompile "org.codelibs.elasticsearch.module:analysis-common:${project.property('elasticsearch.version')}" + distJars "${project.group}:${project.name}:${project.version}" + wagon "org.apache.maven.wagon:wagon-ssh:${project.property('wagon.version')}" } -sourceCompatibility = 1.7 -targetCompatibility = 1.7 +sourceCompatibility = JavaVersion.VERSION_1_8 +targetCompatibility = JavaVersion.VERSION_1_8 +[compileJava, compileTestJava]*.options*.encoding = 'UTF-8' tasks.withType(JavaCompile) { - options.compilerArgs << "-Xlint:unchecked,deprecation" + options.compilerArgs << "-Xlint:all" << "-profile" << "compact2" } test { - systemProperty 'path.home', projectDir.absolutePath + systemProperties['path.home'] = System.getProperty("user.dir") + systemProperties['tests.security.manager'] = false + testLogging { showStandardStreams = false exceptionFormat = 'full' @@ -95,28 +89,26 @@ task makePluginDescriptor(type: Copy) { into 'build/tmp/plugin' expand([ 'descriptor': [ - 'name': pluginName, - 'classname': pluginClassname, - 'description': pluginDescription, - 'jvm': true, - 'site': false, - 'isolated': true, - 'version': project.property('version'), - 'javaVersion': project.property('targetCompatibility'), - 'elasticsearchVersion' : versions.elasticsearch + 'name': project.property('pluginName'), + 'classname': project.property('pluginClassname'), + 'description': project.property('pluginDescription'), + 'version': project.property('version'), + 'javaVersion': project.property('targetCompatibility'), + 'elasticsearchVersion' : project.property('elasticsearch.version') ] ]) } task buildPluginZip(type: Zip, dependsOn: [':jar', ':makePluginDescriptor']) { - from configurations.releaseJars + from configurations.distJars from 'build/tmp/plugin' + //into 'elasticsearch' classifier = 'plugin' } task unpackPlugin(type: Copy, dependsOn: [':buildPluginZip']) { delete "plugins" - from configurations.releaseJars + from configurations.distJars from 'build/tmp/plugin' into "plugins/${pluginName}" } @@ -140,7 +132,7 @@ task sourcesJar(type: Jar, dependsOn: classes) { } artifacts { - archives javadocJar, sourcesJar, buildPluginZip + archives sourcesJar, javadocJar, buildPluginZip } if (project.hasProperty('signing.keyId')) { @@ -148,8 +140,3 @@ if (project.hasProperty('signing.keyId')) { sign configurations.archives } } - -ext.grgit = org.ajoberstar.grgit.Grgit.open() - -apply from: 'gradle/git.gradle' -apply from: 'gradle/publish.gradle' diff --git a/config/checkstyle/checkstyle.xml b/config/checkstyle/checkstyle.xml new file mode 100644 index 0000000..52fe33c --- /dev/null +++ b/config/checkstyle/checkstyle.xml @@ -0,0 +1,323 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 0000000..b099adb --- /dev/null +++ b/gradle.properties @@ -0,0 +1,12 @@ +pluginName = baseform +pluginClassname = org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin +pluginDescription = Baseform plugin for Elasticsearch + +group = org.xbib.elasticsearch.plugin +name = elasticsearch-analysis-baseform +version = 6.6.1-02 + +elasticsearch.version = 6.6.1 +log4j.version = 2.9.1 +junit.version = 4.12 +wagon.version = 2.12 diff --git a/gradle/ext.gradle b/gradle/ext.gradle new file mode 100644 index 0000000..0c623b9 --- /dev/null +++ b/gradle/ext.gradle @@ -0,0 +1,7 @@ + +ext { + user = 'jprante' + scmUrl = 'https://github.com/' + user + '/' + name + scmConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' + scmDeveloperConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' +} diff --git a/gradle/git.gradle b/gradle/git.gradle deleted file mode 100644 index 6043d07..0000000 --- a/gradle/git.gradle +++ /dev/null @@ -1,9 +0,0 @@ - -task gitRelease(dependsOn: build) << { - grgit.add(patterns: ['.'], update: true) - grgit.commit(message: "release of ${project.version}") - grgit.tag.remove(names: [project.version]) - grgit.tag.add(name: project.version) - grgit.push() - grgit.push(tags: true) -} \ No newline at end of file diff --git a/gradle/publish.gradle b/gradle/publish.gradle index 28d91c2..0e648cb 100644 --- a/gradle/publish.gradle +++ b/gradle/publish.gradle @@ -1,19 +1,3 @@ -apply plugin: 'io.codearte.nexus-staging' - -/* -nexus { - attachJavadoc = true - attachSources = true - attachTests = true - sign = true - repositoryUrl = 'https://oss.sonatype.org/service/local/staging/deploy/maven2' - snapshotRepositoryUrl = 'https://oss.sonatype.org/content/repositories/snapshots' -} -*/ - -nexusStaging { - packageGroup = "org.xbib" -} task xbibUpload(type: Upload) { configuration = configurations.archives @@ -22,27 +6,25 @@ task xbibUpload(type: Upload) { if (project.hasProperty("xbibUsername")) { mavenDeployer { configuration = configurations.wagon - repository(url: 'scpexe://xbib.org/repository') { - authentication(userName: xbibUsername, privateKey: xbibPrivateKey) + repository(url: uri('sftp://xbib.org/repository')) { + authentication(userName: xbibUsername, privateKey: xbibPrivateKey) } } } } } -task mavenCentralUpload(type: Upload) { +task sonaTypeUpload(type: Upload) { configuration = configurations.archives uploadDescriptor = true repositories { if (project.hasProperty('ossrhUsername')) { mavenDeployer { - beforeDeployment { - MavenDeployment deployment -> signing.signPom(deployment) - } - repository(url: 'https://oss.sonatype.org/service/local/staging/deploy/maven2') { + beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } + repository(url: uri(ossrhReleaseUrl)) { authentication(userName: ossrhUsername, password: ossrhPassword) } - snapshotRepository(url: 'https://oss.sonatype.org/content/repositories/snapshots') { + snapshotRepository(url: uri(ossrhSnapshotUrl)) { authentication(userName: ossrhUsername, password: ossrhPassword) } pom.project { @@ -79,20 +61,3 @@ task mavenCentralUpload(type: Upload) { } } } - -if (project.hasProperty('githubToken')) { - github { - owner = user - token = githubToken - repo = project.name - name = project.version - tagName = project.version - targetCommitish = 'master' - assets = [ - "build/distributions/${project.name}-${project.version}-plugin.zip" - ] - } - githubRelease { - dependsOn 
gitRelease, buildPluginZip - } -} \ No newline at end of file diff --git a/gradle/sonarqube.gradle b/gradle/sonarqube.gradle new file mode 100644 index 0000000..3985a4f --- /dev/null +++ b/gradle/sonarqube.gradle @@ -0,0 +1,39 @@ +tasks.withType(FindBugs) { + ignoreFailures = true + reports { + xml.enabled = false + html.enabled = true + } +} +tasks.withType(Pmd) { + ignoreFailures = true + reports { + xml.enabled = true + html.enabled = true + } +} +tasks.withType(Checkstyle) { + ignoreFailures = true + reports { + xml.enabled = true + html.enabled = true + } +} + +jacocoTestReport { + reports { + xml.enabled = true + csv.enabled = false + } +} + +sonarqube { + properties { + property "sonar.projectName", "${project.group} ${project.name}" + property "sonar.sourceEncoding", "UTF-8" + property "sonar.tests", "src/test/java" + property "sonar.scm.provider", "git" + property "sonar.java.coveragePlugin", "jacoco" + property "sonar.junit.reportsPath", "build/test-results/test/" + } +} diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar index d3b8398..7a3265e 100644 Binary files a/gradle/wrapper/gradle-wrapper.jar and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 120a40f..c30d4c0 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,5 @@ -#Wed Jun 22 12:35:29 CEST 2016 distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-2.14-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-4.7-bin.zip diff --git a/gradlew b/gradlew index 27309d9..cccdd3d 100755 --- a/gradlew +++ b/gradlew @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env sh ############################################################################## ## @@ -33,11 +33,11 @@ DEFAULT_JVM_OPTS="" # Use the maximum available, or set MAX_FD != -1 to use that value. 
MAX_FD="maximum" -warn ( ) { +warn () { echo "$*" } -die ( ) { +die () { echo echo "$*" echo @@ -154,11 +154,19 @@ if $cygwin ; then esac fi -# Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules -function splitJvmOpts() { - JVM_OPTS=("$@") +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " } -eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS -JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" +APP_ARGS=$(save "$@") -exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat index f6d5974..e95643d 100644 --- a/gradlew.bat +++ b/gradlew.bat @@ -49,7 +49,6 @@ goto fail @rem Get command-line arguments, handling Windows variants if not "%OS%" == "Windows_NT" goto win9xME_args -if "%@eval[2+2]" == "4" goto 4NT_args :win9xME_args @rem Slurp the command line arguments. @@ -60,11 +59,6 @@ set _SKIP=2 if "x%~1" == "x" goto execute set CMD_LINE_ARGS=%* -goto execute - -:4NT_args -@rem Get arguments from the 4NT Shell from JP Software -set CMD_LINE_ARGS=%$ :execute @rem Setup the command line diff --git a/settings.gradle b/settings.gradle deleted file mode 100644 index a11be14..0000000 --- a/settings.gradle +++ /dev/null @@ -1 +0,0 @@ -rootProject.name = 'elasticsearch-analysis-baseform' diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformAnalysisBinderProcessor.java b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformAnalysisBinderProcessor.java deleted file mode 100644 index ab8e100..0000000 --- a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformAnalysisBinderProcessor.java +++ /dev/null @@ -1,11 +0,0 @@ -package org.xbib.elasticsearch.index.analysis.baseform; - -import org.elasticsearch.index.analysis.AnalysisModule; - -public class BaseformAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor { - - @Override - public void processTokenFilters(TokenFiltersBindings tokenFiltersBindings) { - tokenFiltersBindings.processTokenFilter("baseform", BaseformTokenFilterFactory.class); - } -} diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilter.java b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilter.java index 06dc5a8..2fb209c 100644 --- a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilter.java +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilter.java @@ -1,33 +1,60 @@ package org.xbib.elasticsearch.index.analysis.baseform; +import java.io.IOException; +import java.nio.charset.CharacterCodingException; +import java.util.LinkedList; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import 
org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl; +import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.util.AttributeSource; +import org.apache.lucene.util.BytesRef; import org.xbib.elasticsearch.common.fsa.Dictionary; -import java.io.IOException; -import java.nio.charset.CharacterCodingException; -import java.util.LinkedList; - public class BaseformTokenFilter extends TokenFilter { + private static final Logger LOG = LogManager.getLogger(BaseformTokenFilter.class); + + private static final byte ORIGINAL_TYPE = 1; + private static final byte BASEFORM_TYPE = 4; + + private static ConcurrentHashMap<CharSequence, CharSequence> TERM_CACHE; + + private static final AtomicLong termCacheCount = new AtomicLong(0); + + private static final AtomicBoolean needsClearCache = new AtomicBoolean(false); + + private static long MAX_CACHE_SIZE; + + private static final CharSequence NO_TERMS = ""; + private final LinkedList<PackedTokenAttributeImpl> tokens; private final Dictionary dictionary; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); + + private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class); private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class); private AttributeSource.State current; - protected BaseformTokenFilter(TokenStream input, Dictionary dictionary) { + protected BaseformTokenFilter(TokenStream input, Dictionary dictionary, long maxCacheSize) { super(input); this.tokens = new LinkedList<>(); - this.dictionary = dictionary; + this.dictionary = dictionary; if (TERM_CACHE == null) { + TERM_CACHE = new ConcurrentHashMap<>(); + MAX_CACHE_SIZE = maxCacheSize; + } } @Override @@ -38,6 +65,7 @@ public final boolean incrementToken() throws IOException { restoreState(current); termAtt.setEmpty().append(token); posIncAtt.setPositionIncrement(0); + setPayload(BASEFORM_TYPE); return true; } if (input.incrementToken()) { @@ -45,6 +73,7 @@ if (!tokens.isEmpty()) { current = captureState(); } + setPayload(ORIGINAL_TYPE); return true; } else { return false; @@ -53,7 +82,24 @@ protected void baseform() throws CharacterCodingException { CharSequence term = new String(termAtt.buffer(), 0, termAtt.length()); - CharSequence s = dictionary.lookup(term); + if (needsClearCache.get()) { + checkCacheSize(); + } + CharSequence s = TERM_CACHE.computeIfAbsent(term, t -> { + if (termCacheCount.incrementAndGet() > MAX_CACHE_SIZE) { + needsClearCache.set(true); + } + + try { + CharSequence baseform = dictionary.lookup(t); + if (baseform == null || term.equals(baseform)) { + return NO_TERMS; + } + return baseform; + } catch (CharacterCodingException e) { + return NO_TERMS; + } + }); if (s != null && s.length() > 0) { PackedTokenAttributeImpl impl = new PackedTokenAttributeImpl(); impl.append(s); @@ -61,6 +107,32 @@ } } + private void setPayload(byte tokenType) { + BytesRef payload = payloadAtt.getPayload(); + if (tokenType == ORIGINAL_TYPE) { + if (payload == null) { + payload = new BytesRef(); + } + } else { + if (payload != null && payload.length > 0) { + payload = BytesRef.deepCopyOf(payload); + } else { + payload = new BytesRef(new byte[1]); + } + payload.bytes[payload.offset] |= tokenType; + } + payloadAtt.setPayload(payload); + } + + private void checkCacheSize() { + needsClearCache.set(false); + final Runtime runtime = Runtime.getRuntime(); + long memoryUsage = runtime.totalMemory() - runtime.freeMemory(); + TERM_CACHE = new ConcurrentHashMap<>(); + termCacheCount.set(0); + LOG.warn("Clearing term cache for baseform, memory usage: " + memoryUsage); + } + @Override public void reset() throws IOException { super.reset();
diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterAnalysisProvider.java b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterAnalysisProvider.java new file mode 100644 index 0000000..be25dc0 --- /dev/null +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterAnalysisProvider.java @@ -0,0 +1,25 @@ +package org.xbib.elasticsearch.index.analysis.baseform; + +import java.io.IOException; + +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.IndexSettings; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; + +public class BaseformTokenFilterAnalysisProvider implements AnalysisProvider<TokenFilterFactory> { + + private final long maxBaseformEntries; + + public BaseformTokenFilterAnalysisProvider(long maxBaseformEntries) { + this.maxBaseformEntries = maxBaseformEntries; + } + + @Override + public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) + throws IOException { + return new BaseformTokenFilterFactory(indexSettings, name, settings, maxBaseformEntries); + } + +}
diff --git a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterFactory.java b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterFactory.java index 6589730..3bb76bd 100644 --- a/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterFactory.java +++ b/src/main/java/org/xbib/elasticsearch/index/analysis/baseform/BaseformTokenFilterFactory.java @@ -1,34 +1,33 @@ package org.xbib.elasticsearch.index.analysis.baseform; +import java.io.IOException; +import java.io.InputStreamReader; + import org.apache.lucene.analysis.TokenStream; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.assistedinject.Assisted; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.Index; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; -import org.elasticsearch.index.settings.IndexSettingsService; import org.xbib.elasticsearch.common.fsa.Dictionary; -import java.io.IOException; -import java.io.InputStreamReader; - public class BaseformTokenFilterFactory extends AbstractTokenFilterFactory { private final Dictionary dictionary; + private final long maxCacheSize; + @Inject - public BaseformTokenFilterFactory(Index index, - IndexSettingsService indexSettingsService, - @Assisted String name, - @Assisted Settings settings) { - super(index, indexSettingsService.indexSettings(), name, settings); + public BaseformTokenFilterFactory(IndexSettings indexSettings, @Assisted String name, @Assisted Settings settings, long maxCacheSize) { + super(indexSettings, name, settings); this.dictionary = createDictionary(settings); + this.maxCacheSize = maxCacheSize; } @Override public TokenStream create(TokenStream tokenStream) { - return new BaseformTokenFilter(tokenStream, dictionary); + return new BaseformTokenFilter(tokenStream, dictionary, maxCacheSize); } private Dictionary createDictionary(Settings settings) {
diff --git a/src/main/java/org/xbib/elasticsearch/plugin/analysis/baseform/AnalysisBaseformPlugin.java b/src/main/java/org/xbib/elasticsearch/plugin/analysis/baseform/AnalysisBaseformPlugin.java index 7a52e4a..841b086 100644 --- a/src/main/java/org/xbib/elasticsearch/plugin/analysis/baseform/AnalysisBaseformPlugin.java +++ b/src/main/java/org/xbib/elasticsearch/plugin/analysis/baseform/AnalysisBaseformPlugin.java @@ -1,33 +1,45 @@ package org.xbib.elasticsearch.plugin.analysis.baseform; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.elasticsearch.common.inject.Inject; +import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.AnalysisModule; +import org.elasticsearch.index.analysis.TokenFilterFactory; +import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider; +import org.elasticsearch.plugins.AnalysisPlugin; import org.elasticsearch.plugins.Plugin; -import org.xbib.elasticsearch.index.analysis.baseform.BaseformAnalysisBinderProcessor; +import org.xbib.elasticsearch.index.analysis.baseform.BaseformTokenFilterAnalysisProvider; + +public class AnalysisBaseformPlugin extends Plugin implements AnalysisPlugin { -public class AnalysisBaseformPlugin extends Plugin { + private static final Logger LOG = LogManager.getLogger(AnalysisBaseformPlugin.class); - private final Settings settings; + public static final Setting<Long> SETTING_MAX_CACHE_SIZE = + Setting.longSetting("baseform_max_cache_size", 8388608, 131072, Setting.Property.NodeScope); + private final long maxCacheSize; + @Inject public AnalysisBaseformPlugin(Settings settings) { - this.settings = settings; + this.maxCacheSize = SETTING_MAX_CACHE_SIZE.get(settings); + LOG.info("Maximum Cache Size AnalysisBaseformPlugin: " + this.maxCacheSize); + } @Override - public String name() { - return "analysis-baseform"; + public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() { + return Collections.singletonMap("baseform", new BaseformTokenFilterAnalysisProvider(this.maxCacheSize)); } @Override - public String description() { - return "A baseform token filter for german and other languages"; - } - - public void onModule(AnalysisModule module) { - if (settings.getAsBoolean("plugins.baseform.enabled", true)) { - module.addProcessor(new BaseformAnalysisBinderProcessor()); - } - } + public List<Setting<?>> getSettings() { + return Stream.of(SETTING_MAX_CACHE_SIZE).collect(Collectors.toList()); + } }
diff --git a/src/main/templates/plugin-descriptor.properties b/src/main/templates/plugin-descriptor.properties index d599e0c..0b9867c 100644 --- a/src/main/templates/plugin-descriptor.properties +++ b/src/main/templates/plugin-descriptor.properties @@ -1,9 +1,6 @@ classname=${descriptor.classname} name=${descriptor.name} description=${descriptor.description} -jvm=${descriptor.jvm} -site=${descriptor.site} -isolated=${descriptor.isolated} version=${descriptor.version} java.version=${descriptor.javaVersion} elasticsearch.version=${descriptor.elasticsearchVersion}
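The `setPayload` method introduced above tags every emitted token with a type byte: `ORIGINAL_TYPE` (1) for tokens passed through and `BASEFORM_TYPE` (4) for generated baseforms. A downstream consumer could read that flag as sketched below; this is a hypothetical example (the class and `dump` method are made up), not part of the plugin.

```java
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.util.BytesRef;

public final class BaseformPayloadDump {

    // Prints each token and whether its payload carries the baseform flag.
    public static void dump(TokenStream stream) throws IOException {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            BytesRef payload = payloadAtt.getPayload();
            byte type = (payload == null || payload.length == 0) ? 0 : payload.bytes[payload.offset];
            boolean baseform = (type & 4) != 0; // BASEFORM_TYPE
            System.out.println(termAtt + (baseform ? " [baseform]" : " [original]"));
        }
        stream.end();
        stream.close();
    }
}
```

diff --git 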
a/src/test/java/org/xbib/elasticsearch/index/analysis/BaseformTokenFilterTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/BaseformTokenFilterTests.java index 45d729a..14a1916 100644 --- a/src/test/java/org/xbib/elasticsearch/index/analysis/BaseformTokenFilterTests.java +++ b/src/test/java/org/xbib/elasticsearch/index/analysis/BaseformTokenFilterTests.java @@ -1,20 +1,22 @@ package org.xbib.elasticsearch.index.analysis; +import java.io.IOException; +import java.io.StringReader; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.Version; +import org.elasticsearch.cluster.metadata.IndexMetaData; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.TokenFilterFactory; - -import org.junit.Assert; +import org.elasticsearch.test.ESTestCase; import org.junit.Test; +import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin; -import java.io.IOException; -import java.io.StringReader; - -public class BaseformTokenFilterTests extends Assert { - +public class BaseformTokenFilterTests extends ESTestCase { + @Test public void testOne() throws IOException { @@ -22,31 +24,24 @@ public void testOne() throws IOException { String[] expected = { "Die", - "Die", - "Jahresfeier", "Jahresfeier", "der", - "der", "Rechtsanwaltskanzleien", "Rechtsanwaltskanzlei", "auf", - "auf", "dem", "der", "Donaudampfschiff", - "Donaudampfschiff", "hat", "haben", "viel", - "viel", - "Ökosteuer", "Ökosteuer", "gekostet", "kosten" }; - AnalysisService analysisService = MapperTestUtils.analysisService(); - TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform"); - Tokenizer tokenizer = analysisService.tokenizer("standard").create(); + TestAnalysis analysis = createTestAnalysis(); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform"); + Tokenizer tokenizer = analysis.tokenizer.get("standard").create(); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); } @@ -58,8 +53,6 @@ public void testTwo() throws IOException { String[] expected = { "Das", - "Das", - "sind", "sind", "Autos", "Auto", @@ -67,12 +60,11 @@ public void testTwo() throws IOException { "der", "Nudeln", "Nudel", - "transportieren", "transportieren" }; - AnalysisService analysisService = MapperTestUtils.analysisService(); - TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform"); - Tokenizer tokenizer = analysisService.tokenizer("standard").create(); + TestAnalysis analysis = createTestAnalysis(); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform"); + Tokenizer tokenizer = analysis.tokenizer.get("standard").create(); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); } @@ -87,18 +79,33 @@ public void testThree() throws IOException { "wurde", "werden", "zum", - "zum", - "tollen", "tollen", "gemacht", "machen" }; - AnalysisService analysisService = MapperTestUtils.analysisService(); - TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform"); - Tokenizer tokenizer = analysisService.tokenizer("standard").create(); + TestAnalysis analysis = createTestAnalysis(); + TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform"); + Tokenizer tokenizer = 
analysis.tokenizer.get("standard").create(); tokenizer.setReader(new StringReader(source)); assertSimpleTSOutput(tokenFilter.create(tokenizer), expected); } + + private TestAnalysis createTestAnalysis() throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .build(); + IndexMetaData indexMetaData = IndexMetaData.builder("test") + .settings(settings) + .numberOfShards(1) + .numberOfReplicas(1) + .build(); + Settings nodeSettings = Settings.builder() + .put(AnalysisBaseformPlugin.SETTING_MAX_CACHE_SIZE.getKey(), 131072) + .put("path.home", System.getProperty("path.home", "/tmp")) + .build(); + TestAnalysis analysis = createTestAnalysis(new IndexSettings(indexMetaData, nodeSettings), nodeSettings, new AnalysisBaseformPlugin(nodeSettings)); + return analysis; + } private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { stream.reset(); diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/DictionaryTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/DictionaryTests.java index b69768e..d71496d 100644 --- a/src/test/java/org/xbib/elasticsearch/index/analysis/DictionaryTests.java +++ b/src/test/java/org/xbib/elasticsearch/index/analysis/DictionaryTests.java @@ -1,14 +1,14 @@ package org.xbib.elasticsearch.index.analysis; -import org.junit.Assert; -import org.junit.Test; -import org.xbib.elasticsearch.common.fsa.Dictionary; - import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; import java.nio.charset.CharacterCodingException; +import org.junit.Assert; +import org.junit.Test; +import org.xbib.elasticsearch.common.fsa.Dictionary; + public class DictionaryTests extends Assert { @Test diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/EnglishBaseformTokenFilterTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/EnglishBaseformTokenFilterTests.java index 0f13f1c..f3c4ba2 100644 --- a/src/test/java/org/xbib/elasticsearch/index/analysis/EnglishBaseformTokenFilterTests.java +++ b/src/test/java/org/xbib/elasticsearch/index/analysis/EnglishBaseformTokenFilterTests.java @@ -1,23 +1,20 @@ package org.xbib.elasticsearch.index.analysis; +import java.io.IOException; + import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.elasticsearch.Version; +import org.elasticsearch.analysis.common.CommonAnalysisPlugin; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.NamedAnalyzer; +import org.elasticsearch.test.ESTestCase; import org.junit.Test; +import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin; -import java.io.IOException; -import java.io.InputStreamReader; - -import static org.elasticsearch.common.io.Streams.copyToString; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - -public class EnglishBaseformTokenFilterTests { +public class EnglishBaseformTokenFilterTests extends ESTestCase { @Test public void test1() throws IOException { @@ -122,22 +119,28 @@ public void test1() throws IOException { "character", "today" }; - Settings settings = Settings.settingsBuilder() - 
.loadFromSource(copyToStringFromClasspath("/org/xbib/elasticsearch/index/analysis/baseform_en.json")) - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("path.home", System.getProperty("path.home")) - .put("client.type", "node") - .build(); - AnalysisService analysisService = MapperTestUtils.analysisService(settings); - - NamedAnalyzer analyzer = analysisService.analyzer("baseform"); - + TestAnalysis analysis = createTestAnalysis("org/xbib/elasticsearch/index/analysis/baseform_en.json"); + NamedAnalyzer analyzer = analysis.indexAnalyzers.get("baseform"); assertSimpleTSOutput(analyzer.tokenStream("content", source), expected); } - private static String copyToStringFromClasspath(String path) throws IOException { - return copyToString(new InputStreamReader(EnglishBaseformTokenFilterTests.class.getResourceAsStream(path), "UTF-8")); + private TestAnalysis createTestAnalysis(String resource) throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .loadFromStream(resource, ClassLoader.getSystemClassLoader().getResourceAsStream(resource), false) + .build(); + IndexMetaData indexMetaData = IndexMetaData.builder("test") + .settings(settings) + .numberOfShards(1) + .numberOfReplicas(1) + .build(); + Settings nodeSettings = Settings.builder() + .put(AnalysisBaseformPlugin.SETTING_MAX_CACHE_SIZE.getKey(), 131072) + .put("path.home", System.getProperty("path.home", "/tmp")) + .build(); + TestAnalysis analysis = createTestAnalysis(new IndexSettings(indexMetaData, nodeSettings), nodeSettings, new AnalysisBaseformPlugin(nodeSettings), new CommonAnalysisPlugin()); + return analysis; } private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/GermanBaseformTokenFilterTests.java b/src/test/java/org/xbib/elasticsearch/index/analysis/GermanBaseformTokenFilterTests.java index 514cfc9..6df15bc 100644 --- a/src/test/java/org/xbib/elasticsearch/index/analysis/GermanBaseformTokenFilterTests.java +++ b/src/test/java/org/xbib/elasticsearch/index/analysis/GermanBaseformTokenFilterTests.java @@ -1,34 +1,28 @@ package org.xbib.elasticsearch.index.analysis; import java.io.IOException; -import java.io.InputStreamReader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - +import org.elasticsearch.Version; +import org.elasticsearch.analysis.common.CommonAnalysisPlugin; +import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.index.analysis.AnalysisService; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.analysis.NamedAnalyzer; - +import org.elasticsearch.test.ESTestCase; import org.junit.BeforeClass; import org.junit.Test; +import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin; -import static org.elasticsearch.common.io.Streams.copyToString; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertTrue; - -public class GermanBaseformTokenFilterTests { +public class GermanBaseformTokenFilterTests extends ESTestCase { static NamedAnalyzer analyzer; @BeforeClass public static void create() throws IOException { - Settings settings = Settings.settingsBuilder() - .loadFromSource(copyToStringFromClasspath(("/org/xbib/elasticsearch/index/analysis/baseform_de.json"))) - 
.build(); - AnalysisService analysisService = MapperTestUtils.analysisService(settings); - analyzer = analysisService.analyzer("baseform"); + TestAnalysis analysis = createTestAnalysis("org/xbib/elasticsearch/index/analysis/baseform_de.json"); + analyzer = analysis.indexAnalyzers.get("baseform"); } @Test @@ -87,8 +81,22 @@ public void test3() throws IOException { assertSimpleTSOutput(analyzer.tokenStream("content", source), expected); } - private static String copyToStringFromClasspath(String path) throws IOException { - return copyToString(new InputStreamReader(EnglishBaseformTokenFilterTests.class.getResourceAsStream(path), "UTF-8")); + private static TestAnalysis createTestAnalysis(String resource) throws IOException { + Settings settings = Settings.builder() + .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) + .loadFromStream(resource, ClassLoader.getSystemClassLoader().getResourceAsStream(resource), false) + .build(); + IndexMetaData indexMetaData = IndexMetaData.builder("test") + .settings(settings) + .numberOfShards(1) + .numberOfReplicas(1) + .build(); + Settings nodeSettings = Settings.builder() + .put(AnalysisBaseformPlugin.SETTING_MAX_CACHE_SIZE.getKey(), 131072) + .put("path.home", System.getProperty("path.home", "/tmp")) + .build(); + TestAnalysis analysis = createTestAnalysis(new IndexSettings(indexMetaData, nodeSettings), nodeSettings, new AnalysisBaseformPlugin(nodeSettings), new CommonAnalysisPlugin()); + return analysis; } private void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException { diff --git a/src/test/java/org/xbib/elasticsearch/index/analysis/MapperTestUtils.java b/src/test/java/org/xbib/elasticsearch/index/analysis/MapperTestUtils.java deleted file mode 100644 index bc93967..0000000 --- a/src/test/java/org/xbib/elasticsearch/index/analysis/MapperTestUtils.java +++ /dev/null @@ -1,204 +0,0 @@ -package org.xbib.elasticsearch.index.analysis; - -import org.elasticsearch.Version; -import org.elasticsearch.client.Client; -import org.elasticsearch.cluster.metadata.IndexMetaData; -import org.elasticsearch.common.inject.Injector; -import org.elasticsearch.common.inject.ModulesBuilder; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.settings.SettingsModule; -import org.elasticsearch.env.Environment; -import org.elasticsearch.env.EnvironmentModule; -import org.elasticsearch.index.Index; -import org.elasticsearch.index.IndexNameModule; -import org.elasticsearch.index.analysis.AnalysisModule; -import org.elasticsearch.index.analysis.AnalysisService; -import org.elasticsearch.index.mapper.DocumentMapperParser; -import org.elasticsearch.index.mapper.Mapper; -import org.elasticsearch.index.mapper.MapperService; -import org.elasticsearch.index.mapper.MetadataFieldMapper; -import org.elasticsearch.index.mapper.core.BinaryFieldMapper; -import org.elasticsearch.index.mapper.core.BooleanFieldMapper; -import org.elasticsearch.index.mapper.core.ByteFieldMapper; -import org.elasticsearch.index.mapper.core.CompletionFieldMapper; -import org.elasticsearch.index.mapper.core.DateFieldMapper; -import org.elasticsearch.index.mapper.core.DoubleFieldMapper; -import org.elasticsearch.index.mapper.core.FloatFieldMapper; -import org.elasticsearch.index.mapper.core.IntegerFieldMapper; -import org.elasticsearch.index.mapper.core.LongFieldMapper; -import org.elasticsearch.index.mapper.core.ShortFieldMapper; -import org.elasticsearch.index.mapper.core.StringFieldMapper; -import 
org.elasticsearch.index.mapper.core.TokenCountFieldMapper; -import org.elasticsearch.index.mapper.core.TypeParsers; -import org.elasticsearch.index.mapper.geo.GeoPointFieldMapper; -import org.elasticsearch.index.mapper.internal.AllFieldMapper; -import org.elasticsearch.index.mapper.internal.IdFieldMapper; -import org.elasticsearch.index.mapper.internal.IndexFieldMapper; -import org.elasticsearch.index.mapper.internal.ParentFieldMapper; -import org.elasticsearch.index.mapper.internal.RoutingFieldMapper; -import org.elasticsearch.index.mapper.internal.SourceFieldMapper; -import org.elasticsearch.index.mapper.internal.TTLFieldMapper; -import org.elasticsearch.index.mapper.internal.TimestampFieldMapper; -import org.elasticsearch.index.mapper.internal.TypeFieldMapper; -import org.elasticsearch.index.mapper.internal.UidFieldMapper; -import org.elasticsearch.index.mapper.internal.VersionFieldMapper; -import org.elasticsearch.index.mapper.ip.IpFieldMapper; -import org.elasticsearch.index.mapper.object.ObjectMapper; -import org.elasticsearch.index.settings.IndexSettingsModule; -import org.elasticsearch.index.similarity.SimilarityLookupService; -import org.elasticsearch.indices.analysis.IndicesAnalysisService; -import org.elasticsearch.indices.mapper.MapperRegistry; -import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin; - -import java.util.LinkedHashMap; -import java.util.Map; - -public class MapperTestUtils { - - public static AnalysisService newAnalysisService(Settings indexSettings) { - Injector parentInjector = new ModulesBuilder().add(new SettingsModule(indexSettings), - new EnvironmentModule(new Environment(indexSettings))).createInjector(); - Index index = new Index("test"); - Injector injector = new ModulesBuilder().add( - new IndexSettingsModule(index, indexSettings), - new IndexNameModule(index), - new AnalysisModule(indexSettings, parentInjector.getInstance(IndicesAnalysisService.class))).createChildInjector(parentInjector); - - return injector.getInstance(AnalysisService.class); - } - - public static SimilarityLookupService newSimilarityLookupService(Settings indexSettings) { - return new SimilarityLookupService(new Index("test"), indexSettings); - } - - public static DocumentMapperParser newDocumentMapperParser() { - return newDocumentMapperParser(Settings.builder() - .put("path.home", System.getProperty("path.home")) - .build()); - } - - public static DocumentMapperParser newDocumentMapperParser(Settings settings) { - Settings forcedSettings = Settings.builder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put(settings) - .build(); - SimilarityLookupService similarityLookupService = newSimilarityLookupService(forcedSettings); - Map mappers = registerBuiltInMappers(); - Map metadataMappers = registerBuiltInMetadataMappers(); - MapperRegistry mapperRegistry = new MapperRegistry(mappers, metadataMappers); - MapperService mapperService = new MapperService(new Index("test"), - forcedSettings, - newAnalysisService(forcedSettings), - similarityLookupService, - null, - mapperRegistry); - return new DocumentMapperParser( - forcedSettings, - mapperService, - newAnalysisService(forcedSettings), - similarityLookupService, - null, - mapperRegistry); - } - - public static MapperService newMapperService(Settings settings, Client client) { - Settings indexSettings = Settings.builder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("path.home", System.getProperty("path.home")) - .put("client.type", "node") - .put(settings) - .build(); 
- Index index = new Index("test"); - Injector parentInjector = new ModulesBuilder() - .add(new SettingsModule(indexSettings), - new EnvironmentModule(new Environment(indexSettings))) - .createInjector(); - AnalysisModule analysisModule = new AnalysisModule(indexSettings, - parentInjector.getInstance(IndicesAnalysisService.class)); - new AnalysisBaseformPlugin(settings).onModule(analysisModule); - Injector injector = new ModulesBuilder().add(new IndexSettingsModule(index, indexSettings), - new IndexNameModule(index), - analysisModule) - .createChildInjector(parentInjector); - AnalysisService analysisService = injector.getInstance(AnalysisService.class); - SimilarityLookupService similarityLookupService = new SimilarityLookupService(index, indexSettings); - Map mappers = registerBuiltInMappers(); - Map metadataMappers = registerBuiltInMetadataMappers(); - MapperRegistry mapperRegistry = new MapperRegistry(mappers, metadataMappers); - return new MapperService(new Index("test"), - indexSettings, - analysisService, - similarityLookupService, - null, - mapperRegistry); - } - - public static AnalysisService analysisService() { - Settings settings = Settings.settingsBuilder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("path.home", System.getProperty("path.home")) - .put("client.type", "node") - .build(); - return newMapperService(settings, null).analysisService(); - } - - public static AnalysisService analysisService(Settings settings) { - Settings newSettings = Settings.settingsBuilder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("path.home", System.getProperty("path.home")) - .put("client.type", "node") - .put(settings) - .build(); - return newMapperService(newSettings, null).analysisService(); - } - - public static AnalysisService analysisService(String resource) { - Settings settings = Settings.settingsBuilder() - .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) - .put("path.home", System.getProperty("path.home")) - .put("client.type", "node") - .loadFromStream(resource, MapperTestUtils.class.getResourceAsStream(resource)) - .build(); - return newMapperService(settings, null).analysisService(); - } - - // copy from org.elasticsearch.indices.IndicesModule - private static Map registerBuiltInMappers() { - Map mapperParsers = new LinkedHashMap<>(); - mapperParsers.put(ByteFieldMapper.CONTENT_TYPE, new ByteFieldMapper.TypeParser()); - mapperParsers.put(ShortFieldMapper.CONTENT_TYPE, new ShortFieldMapper.TypeParser()); - mapperParsers.put(IntegerFieldMapper.CONTENT_TYPE, new IntegerFieldMapper.TypeParser()); - mapperParsers.put(LongFieldMapper.CONTENT_TYPE, new LongFieldMapper.TypeParser()); - mapperParsers.put(FloatFieldMapper.CONTENT_TYPE, new FloatFieldMapper.TypeParser()); - mapperParsers.put(DoubleFieldMapper.CONTENT_TYPE, new DoubleFieldMapper.TypeParser()); - mapperParsers.put(BooleanFieldMapper.CONTENT_TYPE, new BooleanFieldMapper.TypeParser()); - mapperParsers.put(BinaryFieldMapper.CONTENT_TYPE, new BinaryFieldMapper.TypeParser()); - mapperParsers.put(DateFieldMapper.CONTENT_TYPE, new DateFieldMapper.TypeParser()); - mapperParsers.put(IpFieldMapper.CONTENT_TYPE, new IpFieldMapper.TypeParser()); - mapperParsers.put(StringFieldMapper.CONTENT_TYPE, new StringFieldMapper.TypeParser()); - mapperParsers.put(TokenCountFieldMapper.CONTENT_TYPE, new TokenCountFieldMapper.TypeParser()); - mapperParsers.put(ObjectMapper.CONTENT_TYPE, new ObjectMapper.TypeParser()); - mapperParsers.put(ObjectMapper.NESTED_CONTENT_TYPE, new 
ObjectMapper.TypeParser()); - mapperParsers.put(TypeParsers.MULTI_FIELD_CONTENT_TYPE, TypeParsers.multiFieldConverterTypeParser); - mapperParsers.put(CompletionFieldMapper.CONTENT_TYPE, new CompletionFieldMapper.TypeParser()); - mapperParsers.put(GeoPointFieldMapper.CONTENT_TYPE, new GeoPointFieldMapper.TypeParser()); - return mapperParsers; - } - - // copy from org.elasticsearch.indices.IndicesModule - private static Map registerBuiltInMetadataMappers() { - Map metadataMapperParsers = new LinkedHashMap<>(); - metadataMapperParsers.put(UidFieldMapper.NAME, new UidFieldMapper.TypeParser()); - metadataMapperParsers.put(IdFieldMapper.NAME, new IdFieldMapper.TypeParser()); - metadataMapperParsers.put(RoutingFieldMapper.NAME, new RoutingFieldMapper.TypeParser()); - metadataMapperParsers.put(IndexFieldMapper.NAME, new IndexFieldMapper.TypeParser()); - metadataMapperParsers.put(SourceFieldMapper.NAME, new SourceFieldMapper.TypeParser()); - metadataMapperParsers.put(TypeFieldMapper.NAME, new TypeFieldMapper.TypeParser()); - metadataMapperParsers.put(AllFieldMapper.NAME, new AllFieldMapper.TypeParser()); - metadataMapperParsers.put(TimestampFieldMapper.NAME, new TimestampFieldMapper.TypeParser()); - metadataMapperParsers.put(TTLFieldMapper.NAME, new TTLFieldMapper.TypeParser()); - metadataMapperParsers.put(VersionFieldMapper.NAME, new VersionFieldMapper.TypeParser()); - metadataMapperParsers.put(ParentFieldMapper.NAME, new ParentFieldMapper.TypeParser()); - return metadataMapperParsers; - } -} diff --git a/src/test/java/org/xbib/elasticsearch/plugin/baseform/BaseformPluginTest.java b/src/test/java/org/xbib/elasticsearch/plugin/baseform/BaseformPluginTest.java deleted file mode 100644 index 584ae84..0000000 --- a/src/test/java/org/xbib/elasticsearch/plugin/baseform/BaseformPluginTest.java +++ /dev/null @@ -1,19 +0,0 @@ -package org.xbib.elasticsearch.plugin.baseform; - -import org.elasticsearch.client.Client; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.ESLoggerFactory; -import org.junit.Test; - -public class BaseformPluginTest extends NodeTestUtils { - - private final static ESLogger logger = ESLoggerFactory.getLogger(BaseformPluginTest.class.getName()); - - @Test - public void test() { - Client client = client("1"); - // TODO - client.close(); - } - -} diff --git a/src/test/java/org/xbib/elasticsearch/plugin/baseform/MockNode.java b/src/test/java/org/xbib/elasticsearch/plugin/baseform/MockNode.java deleted file mode 100644 index 631e9c6..0000000 --- a/src/test/java/org/xbib/elasticsearch/plugin/baseform/MockNode.java +++ /dev/null @@ -1,35 +0,0 @@ -package org.xbib.elasticsearch.plugin.baseform; - -import org.elasticsearch.Version; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.node.Node; -import org.elasticsearch.node.internal.InternalSettingsPreparer; -import org.elasticsearch.plugins.Plugin; - -import java.util.ArrayList; -import java.util.Collection; - -public class MockNode extends Node { - - public MockNode(Settings settings, Collection> classpathPlugins) { - super(InternalSettingsPreparer.prepareEnvironment(settings, null), Version.CURRENT, classpathPlugins); - } - - public MockNode(Settings settings, Class classpathPlugin) { - this(settings, list(classpathPlugin)); - } - - public MockNode(Settings settings) { - this(settings, list()); - } - - private static Collection> list() { - return new ArrayList<>(); - } - - private static Collection> list(Class classpathPlugin) { - Collection> list = new ArrayList<>(); - 
list.add(classpathPlugin); - return list; - } -} diff --git a/src/test/java/org/xbib/elasticsearch/plugin/baseform/NodeTestUtils.java b/src/test/java/org/xbib/elasticsearch/plugin/baseform/NodeTestUtils.java deleted file mode 100644 index da46c44..0000000 --- a/src/test/java/org/xbib/elasticsearch/plugin/baseform/NodeTestUtils.java +++ /dev/null @@ -1,196 +0,0 @@ -package org.xbib.elasticsearch.plugin.baseform; - -import org.elasticsearch.ElasticsearchTimeoutException; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthAction; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; -import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; -import org.elasticsearch.action.admin.cluster.node.info.NodesInfoRequest; -import org.elasticsearch.action.admin.cluster.node.info.NodesInfoResponse; -import org.elasticsearch.client.support.AbstractClient; -import org.elasticsearch.cluster.health.ClusterHealthStatus; -import org.elasticsearch.common.logging.ESLogger; -import org.elasticsearch.common.logging.ESLoggerFactory; -import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.transport.InetSocketTransportAddress; -import org.elasticsearch.common.unit.TimeValue; -import org.elasticsearch.node.Node; -import org.junit.After; -import org.junit.Before; -import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin; - -import java.io.IOException; -import java.nio.file.FileVisitResult; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.SimpleFileVisitor; -import java.nio.file.attribute.BasicFileAttributes; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.elasticsearch.common.settings.Settings.settingsBuilder; - -public class NodeTestUtils { - - protected final static ESLogger logger = ESLoggerFactory.getLogger("test"); - - private Map nodes = new HashMap<>(); - - private Map clients = new HashMap<>(); - - private AtomicInteger counter = new AtomicInteger(); - - private String cluster; - - private String host; - - private int port; - - @Before - public void startNodes() { - try { - logger.info("starting"); - setClusterName(); - startNode("1"); - findNodeAddress(); - try { - ClusterHealthResponse healthResponse = client("1").execute(ClusterHealthAction.INSTANCE, - new ClusterHealthRequest().waitForStatus(ClusterHealthStatus.GREEN).timeout(TimeValue.timeValueSeconds(30))).actionGet(); - if (healthResponse != null && healthResponse.isTimedOut()) { - throw new IOException("cluster state is " + healthResponse.getStatus().name() - + ", from here on, everything will fail!"); - } - } catch (ElasticsearchTimeoutException e) { - throw new IOException("timeout, cluster does not respond to health request, cowardly refusing to continue with operations"); - } - } catch (Throwable t) { - logger.error("startNodes failed", t); - } - } - - @After - public void stopNodes() { - try { - closeNodes(); - } catch (Exception e) { - logger.error("can not close nodes", e); - } finally { - try { - deleteFiles(); - logger.info("data files wiped"); - Thread.sleep(2000L); - } catch (IOException e) { - logger.error(e.getMessage(), e); - } catch (InterruptedException e) { - // ignore - } - } - } - - protected void setClusterName() { - this.cluster = "test-analysis-baseform-" - + "-" + System.getProperty("user.name") - + "-" + counter.incrementAndGet(); - } - - protected String getClusterName() { - return cluster; - 
} - - protected Settings getSettings() { - return settingsBuilder() - .put("host", host) - .put("port", port) - .put("cluster.name", cluster) - .put("path.home", getHome()) - .build(); - } - - protected Settings getNodeSettings() { - return settingsBuilder() - .put("cluster.name", cluster) - .put("cluster.routing.schedule", "50ms") - .put("cluster.routing.allocation.disk.threshold_enabled", false) - .put("discovery.zen.multicast.enabled", true) - .put("discovery.zen.multicast.ping_timeout", "5s") - .put("http.enabled", true) - .put("threadpool.bulk.size", Runtime.getRuntime().availableProcessors()) - .put("threadpool.bulk.queue_size", 16 * Runtime.getRuntime().availableProcessors()) // default is 50, too low - .put("index.number_of_replicas", 0) - .put("path.home", getHome()) - .build(); - } - - protected String getHome() { - return System.getProperty("path.home"); - } - - public void startNode(String id) throws IOException { - buildNode(id).start(); - } - - public AbstractClient client(String id) { - return clients.get(id); - } - - private void closeNodes() throws IOException { - logger.info("closing all clients"); - for (AbstractClient client : clients.values()) { - client.close(); - } - clients.clear(); - logger.info("closing all nodes"); - for (Node node : nodes.values()) { - if (node != null) { - node.close(); - } - } - nodes.clear(); - logger.info("all nodes closed"); - } - - protected void findNodeAddress() { - NodesInfoRequest nodesInfoRequest = new NodesInfoRequest().transport(true); - NodesInfoResponse response = client("1").admin().cluster().nodesInfo(nodesInfoRequest).actionGet(); - Object obj = response.iterator().next().getTransport().getAddress() - .publishAddress(); - if (obj instanceof InetSocketTransportAddress) { - InetSocketTransportAddress address = (InetSocketTransportAddress) obj; - host = address.address().getHostName(); - port = address.address().getPort(); - } - } - - private Node buildNode(String id) throws IOException { - Settings nodeSettings = settingsBuilder() - .put(getNodeSettings()) - .put("name", id) - .build(); - logger.info("settings={}", nodeSettings.getAsMap()); - // ES 2.1 renders NodeBuilder as useless - Node node = new MockNode(nodeSettings, AnalysisBaseformPlugin.class); - AbstractClient client = (AbstractClient)node.client(); - nodes.put(id, node); - clients.put(id, client); - logger.info("clients={}", clients); - return node; - } - - private static void deleteFiles() throws IOException { - Path directory = Paths.get(System.getProperty("path.home") + "/data"); - Files.walkFileTree(directory, new SimpleFileVisitor() { - @Override - public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { - Files.delete(file); - return FileVisitResult.CONTINUE; - } - - @Override - public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { - Files.delete(dir); - return FileVisitResult.CONTINUE; - } - }); - } -}
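A closing note on the new node setting: because `SETTING_MAX_CACHE_SIZE` is declared via `Setting.longSetting("baseform_max_cache_size", 8388608, 131072, Setting.Property.NodeScope)`, Elasticsearch enforces the minimum when the setting is parsed. A small hypothetical snippet illustrating the effect (the demo class is made up; only the plugin constant is real):

```java
import org.elasticsearch.common.settings.Settings;
import org.xbib.elasticsearch.plugin.analysis.baseform.AnalysisBaseformPlugin;

public class CacheSizeSettingDemo {
    public static void main(String[] args) {
        // Absent key: get() falls back to the default of 8388608 entries.
        Settings empty = Settings.builder().build();
        System.out.println(AnalysisBaseformPlugin.SETTING_MAX_CACHE_SIZE.get(empty)); // 8388608

        // Values below the declared minimum of 131072 are rejected.
        Settings tooSmall = Settings.builder().put("baseform_max_cache_size", 1024).build();
        AnalysisBaseformPlugin.SETTING_MAX_CACHE_SIZE.get(tooSmall); // throws IllegalArgumentException
    }
}
```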