OAK-11133: lucene type filter does not get correctly converted into the elastic counterpart (#1725)

* OAK-11133: lucene type filter does not get correctly converted into the elastic counterpart

* OAK-11133: improve log message
fabriziofortino committed Sep 19, 2024
1 parent bc410d8 commit 3e31573
Showing 3 changed files with 64 additions and 4 deletions.
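
For readers comparing the two analyzer models: Lucene's TypeTokenFilterFactory is configured with a types file and an optional useWhitelist flag, while the equivalent Elasticsearch keep_types token filter expects a mode of include or exclude. Below is a minimal, self-contained sketch of the parameter rewrite this commit adds (illustrative only; the class and method names are invented, not Oak's):

// Minimal sketch (not the Oak implementation) of the parameter rewrite this commit introduces.
// Class and method names are invented for illustration only.
import java.util.HashMap;
import java.util.Map;

public class TypeFilterParamSketch {

    // Lucene's TypeTokenFilterFactory uses "useWhitelist"; Elasticsearch's keep_types filter uses "mode".
    static Map<String, Object> toElasticParams(Map<String, ?> luceneParams) {
        Map<String, Object> params = new HashMap<>(luceneParams);
        Object useWhitelist = params.remove("useWhitelist");
        // useWhitelist=true -> keep only the listed token types; otherwise drop them
        params.put("mode", useWhitelist != null && Boolean.parseBoolean(useWhitelist.toString())
                ? "include" : "exclude");
        return params;
    }

    public static void main(String[] args) {
        // {types=stopTypes.txt, useWhitelist=true} -> {types=stopTypes.txt, mode=include}
        System.out.println(toElasticParams(Map.of("types", "stopTypes.txt", "useWhitelist", "true")));
    }
}

In the actual change, the transformer is registered per Lucene factory class, while the rename of the filter itself (type to keep_types) is handled separately in the name mapping shown further down.
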
@@ -191,8 +191,10 @@ private static <FD> LinkedHashMap<String, FD> loadFilters(NodeState state,
.map(Map.Entry::getValue)
.collect(Collectors.toList());
} catch (Exception e) {
LOG.warn("Unable introspect lucene internal factories to perform transformations. " +
"Current configuration will be used", e);
LOG.warn("Unable to introspect Lucene internal factories for transformations. " +
"If using an Elasticsearch-specific factory, consider using a Lucene-compatible one for backward compatibility. " +
"Current configuration will be used. Error: {}", e.getMessage());
LOG.debug("Error details: ", e);
name = normalize(t.getName());
transformers = List.of();
}
@@ -22,6 +22,7 @@
import org.apache.lucene.analysis.cjk.CJKBigramFilterFactory;
import org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory;
import org.apache.lucene.analysis.core.TypeTokenFilterFactory;
import org.apache.lucene.analysis.en.AbstractWordsFileFilterFactory;
import org.apache.lucene.analysis.miscellaneous.KeepWordFilterFactory;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilterFactory;
@@ -129,6 +130,17 @@ protected interface ParameterTransformer {
));
});

LUCENE_ELASTIC_TRANSFORMERS.put(TypeTokenFilterFactory.class, luceneParams -> {
Object useWhitelist = luceneParams.remove("useWhitelist");
if (useWhitelist != null && Boolean.parseBoolean(useWhitelist.toString())) {
luceneParams.put("mode", "include");
} else {
luceneParams.put("mode", "exclude");
}

return luceneParams;
});

LUCENE_ELASTIC_TRANSFORMERS.put(PatternCaptureGroupFilterFactory.class, luceneParams ->
reKey.apply(luceneParams, Map.of("pattern", "patterns"))
);
@@ -234,6 +246,7 @@ protected interface ParameterTransformer {
Map.entry("pattern_capture_group", "pattern_capture"),
Map.entry("reverse_string", "reverse"),
Map.entry("snowball_porter", "snowball"),
Map.entry("dictionary_compound_word", "dictionary_decompounder")
Map.entry("dictionary_compound_word", "dictionary_decompounder"),
Map.entry("type", "keep_types")
);
}
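
The Map.entry("type", "keep_types") addition covers the naming side of the conversion: Lucene registers this filter under the SPI name type, while Elasticsearch calls it keep_types. As a hedged illustration (not Oak code; it only assumes Elasticsearch's documented keep_types options, types and mode), the analyzer configured in the tests below would correspond to a filter definition roughly like:

import java.util.List;
import java.util.Map;

public class KeepTypesFilterSketch {
    public static void main(String[] args) {
        // Combined effect of the rename ("type" -> "keep_types") and the
        // parameter rewrite ("useWhitelist" -> "mode") on the test configuration.
        Map<String, Object> keepTypesFilter = Map.of(
                "type", "keep_types",                    // Elasticsearch token filter name
                "types", List.of("<NUM>", "<SYNONYM>"),  // token types listed in stopTypes.txt
                "mode", "exclude"                        // "include" when useWhitelist=true
        );
        System.out.println(keepTypesFilter);
    }
}
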
@@ -809,7 +809,52 @@ public void badIndexDefinitionShouldLetQEWork() throws Exception {
}

@Test
public void testSynonyms() throws Exception {
public void fullTextSearchWithTypeTokenFilter() throws Exception {
setup(List.of("foo"), idx -> {
Tree anl = idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME, "Standard");

Tree type = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Type");
type.setProperty("types", "stopTypes.txt");
type.addChild("stopTypes.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "<NUM>\n<SYNONYM>");
});

Tree content = root.getTree("/").addChild("content");
content.addChild("bar").setProperty("foo", "foo 123");
root.commit();

assertEventually(() -> {
assertQuery("select * from [nt:base] where CONTAINS(*, 'foo')", List.of("/content/bar"));
assertQuery("select * from [nt:base] where CONTAINS(*, '123')", List.of());
});
}

@Test
public void fullTextSearchWithWhitelistedTypeTokenFilter() throws Exception {
setup(List.of("foo"), idx -> {
Tree anl = idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME, "Standard");

Tree type = addFilter(anl.addChild(FulltextIndexConstants.ANL_FILTERS), "Type");
type.setProperty("types", "stopTypes.txt");
type.setProperty("useWhitelist", "true");
type.addChild("stopTypes.txt").addChild(JcrConstants.JCR_CONTENT)
.setProperty(JcrConstants.JCR_DATA, "<NUM>\n<SYNONYM>");
});

Tree content = root.getTree("/").addChild("content");
content.addChild("bar").setProperty("foo", "foo 123");
root.commit();

assertEventually(() -> {
assertQuery("select * from [nt:base] where CONTAINS(*, 'foo')", List.of());
assertQuery("select * from [nt:base] where CONTAINS(*, '123')", List.of("/content/bar"));
});
}

@Test
public void synonyms() throws Exception {
setup(List.of("foo"), idx -> {
Tree anl = idx.addChild(FulltextIndexConstants.ANALYZERS).addChild(FulltextIndexConstants.ANL_DEFAULT);
anl.addChild(FulltextIndexConstants.ANL_TOKENIZER).setProperty(FulltextIndexConstants.ANL_NAME, "Standard");