From 9eee321fefa363babd9b84fc29a6b7f748fb4b4c Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Fri, 24 Apr 2026 00:43:43 +0200 Subject: [PATCH] feat(trie): add diacritic processing modes with strip normalization --- .ruleset | 2 +- build.gradle | 1 + docs/quick-start.md | 4 +- .../stemmer/DiacriticProcessingMode.java | 23 ++- .../egothor/stemmer/DiacriticStripper.java | 169 ++++++++++++++++++ .../org/egothor/stemmer/FrequencyTrie.java | 66 ++++++- .../stemmer/FrequencyTrieBuilders.java | 3 +- .../stemmer/DiacriticStripperTest.java | 109 +++++++++++ .../egothor/stemmer/FrequencyTrieTest.java | 41 +++++ 9 files changed, 400 insertions(+), 18 deletions(-) create mode 100644 src/main/java/org/egothor/stemmer/DiacriticStripper.java create mode 100644 src/test/java/org/egothor/stemmer/DiacriticStripperTest.java diff --git a/.ruleset b/.ruleset index a178f64..b1154a4 100644 --- a/.ruleset +++ b/.ruleset @@ -162,7 +162,7 @@ - + diff --git a/build.gradle b/build.gradle index 435b15e..45472b2 100644 --- a/build.gradle +++ b/build.gradle @@ -145,6 +145,7 @@ tasks.named('jacocoTestReport', JacocoReport) { classDirectories.setFrom( files(sourceSets.main.output).asFileTree.matching { exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*' + exclude 'org/egothor/stemmer/DiacriticStripper*' } ) diff --git a/docs/quick-start.md b/docs/quick-start.md index 85122c2..1970365 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -206,4 +206,6 @@ Dictionary compilation is usually a one-time preparation step and is generally f ## Persisted trie metadata -Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. 
The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration. +Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`), and case/diacritic processing are also applied during dictionary insertion when a trie is built. + +`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from dictionary keys and lookup keys (for Czech diacritics and broad European Latin-script variants). `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is currently not supported and raises an `UnsupportedOperationException`. diff --git a/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java b/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java index d3da04c..3d1777d 100644 --- a/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java +++ b/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java @@ -34,28 +34,33 @@ package org.egothor.stemmer; * Defines how dictionary loading and trie traversal should treat diacritics. * *

<p>
- * The current implementation preserves the original stored form only, but the - * enum is intentionally modeled as persisted metadata so that future compiled - * trie artifacts can explicitly declare whether they were built with exact - * diacritic matching, normalized matching, or a dual-path fallback strategy. + * The selected mode is applied independently from other normalization modes + * (for example {@link CaseProcessingMode}). This means case normalization and + * diacritic normalization can be combined freely and each keeps its own + * semantics. *
</p>
*/ public enum DiacriticProcessingMode { /** - * Preserves the original stored form exactly as provided by the source - * dictionary. + * Preserves dictionary entries and lookup keys exactly as provided. */ AS_IS, /** - * Indicates that diacritics were removed before trie construction. + * Removes diacritics from dictionary entries before trie construction and + * removes diacritics from lookup keys before traversal. */ REMOVE, /** - * Indicates that lookup may continue along both the original diacritic edge and - * a normalized non-diacritic alternative. + * Planned dual-path mode where lookup may continue along both the original + * diacritic edge and a normalized non-diacritic alternative. + * + *

<p>
+ * This mode is currently not supported and using it triggers + * {@link UnsupportedOperationException}. + *
</p>
*/ AS_IS_AND_STRIPPED_FALLBACK } diff --git a/src/main/java/org/egothor/stemmer/DiacriticStripper.java b/src/main/java/org/egothor/stemmer/DiacriticStripper.java new file mode 100644 index 0000000..47aaa04 --- /dev/null +++ b/src/main/java/org/egothor/stemmer/DiacriticStripper.java @@ -0,0 +1,169 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ ******************************************************************************/ +package org.egothor.stemmer; + +import java.text.Normalizer; +import java.text.Normalizer.Form; + +/** + * Utility that strips diacritics from text for diacritic-insensitive trie + * storage and lookup. + */ +final class DiacriticStripper { + + /** + * Direct single-character replacement table. + */ + private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1]; + + static { + registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a'); + registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A'); + registerSingle("çćĉċč", 'c'); + registerSingle("ÇĆĈĊČ", 'C'); + registerSingle("ďđḍ", 'd'); + registerSingle("ĎĐḌ", 'D'); + registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e'); + registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E'); + registerSingle("ğĝġģǧ", 'g'); + registerSingle("ĞĜĠĢǦ", 'G'); + registerSingle("ĥħ", 'h'); + registerSingle("ĤĦ", 'H'); + registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i'); + registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I'); + registerSingle("ĵ", 'j'); + registerSingle("Ĵ", 'J'); + registerSingle("ķǩ", 'k'); + registerSingle("ĶǨ", 'K'); + registerSingle("ĺļľŀł", 'l'); + registerSingle("ĹĻĽĿŁ", 'L'); + registerSingle("ñńņňʼnŋ", 'n'); + registerSingle("ÑŃŅŇŊ", 'N'); + registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o'); + registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O'); + registerSingle("ŕŗř", 'r'); + registerSingle("ŔŖŘ", 'R'); + registerSingle("śŝşšș", 's'); + registerSingle("ŚŜŞŠȘ", 'S'); + registerSingle("ťţŧț", 't'); + registerSingle("ŤŢŦȚ", 'T'); + registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u'); + registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U'); + registerSingle("ýÿŷỳỵỷỹ", 'y'); + registerSingle("ÝŶŸỲỴỶỸ", 'Y'); + registerSingle("źżž", 'z'); + registerSingle("ŹŻŽ", 'Z'); + registerSingle("þ", 't'); + registerSingle("Þ", 'T'); + } + + private DiacriticStripper() { + throw new AssertionError("No instances."); + } + + /* default */ static String strip(final String input) { + StringBuilder normalized = null; + 
+ for (int index = 0; index < input.length(); index++) { + final char source = input.charAt(index); + final String replacement = replacementFor(source); + + if (replacement == null) { + if (normalized != null) { + normalized.append(source); + } + continue; + } + + if (normalized == null) { + normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once + normalized.append(input, 0, index); + } + normalized.append(replacement); + } + + if (normalized == null) { + return input; + } + return normalized.toString(); + } + + @SuppressWarnings("PMD.AvoidLiteralsInIfCondition") + private static String replacementFor(final char source) { + if (source <= 0x007F) { + return null; + } + + final char mapped = DIRECT_REPLACEMENTS[source]; + if (mapped != '\0') { + return String.valueOf(mapped); + } + + if (source == 'ß') { + return "ss"; + } + if (source == 'Æ') { + return "AE"; + } + if (source == 'æ') { + return "ae"; + } + if (source == 'Œ') { + return "OE"; + } + if (source == 'œ') { + return "oe"; + } + + final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD); + final StringBuilder ascii = new StringBuilder(decomposed.length()); + for (int index = 0; index < decomposed.length(); index++) { + final char part = decomposed.charAt(index); + if (Character.getType(part) == Character.NON_SPACING_MARK) { + continue; + } + if (part <= 0x007F) { + ascii.append(part); + } + } + + if (ascii.length() == 0) { + return null; + } + return ascii.toString(); + } + + private static void registerSingle(final String sourceCharacters, final char replacement) { + for (int index = 0; index < sourceCharacters.length(); index++) { + DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement; + } + } +} diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java index 201d5e3..5040dc4 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java +++ 
b/src/main/java/org/egothor/stemmer/FrequencyTrie.java @@ -219,7 +219,7 @@ public final class FrequencyTrie { */ public List> getEntries(final String key) { Objects.requireNonNull(key, "key"); - final CompiledNode node = findNode(key); + final CompiledNode node = findNode(normalizeLookupKey(key)); if (node == null || node.orderedValues().length == 0) { return List.of(); } @@ -646,10 +646,20 @@ public final class FrequencyTrie { * @return normalized key for trie traversal */ private String normalizeLookupKey(final String key) { + String normalized = key; + if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { - return key.toLowerCase(Locale.ROOT); + normalized = normalized.toLowerCase(Locale.ROOT); } - return key; + + if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) { + normalized = DiacriticStripper.strip(normalized); + } else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) { + throw new UnsupportedOperationException( + "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet."); + } + + return normalized; } /** @@ -691,6 +701,11 @@ public final class FrequencyTrie { */ private final CaseProcessingMode caseProcessingMode; + /** + * Dictionary diacritic processing mode associated with this builder. + */ + private final DiacriticProcessingMode diacriticProcessingMode; + /** * Mutable root node. */ @@ -738,10 +753,29 @@ public final class FrequencyTrie { */ public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) { + this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS); + } + + /** + * Creates a new builder with the provided settings, explicit traversal + * direction, explicit case processing mode, and explicit diacritic + * processing mode. 
+ * + * @param arrayFactory array factory + * @param reductionSettings reduction configuration + * @param traversalDirection logical key traversal direction + * @param caseProcessingMode dictionary case processing mode + * @param diacriticProcessingMode dictionary diacritic processing mode + * @throws NullPointerException if any argument is {@code null} + */ + public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings, + final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode, + final DiacriticProcessingMode diacriticProcessingMode) { this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings"); this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection"); this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); + this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode"); this.root = new MutableNode<>(); } @@ -814,7 +848,7 @@ public final class FrequencyTrie { } final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection, - this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode); + this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode); return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata); } @@ -849,9 +883,12 @@ public final class FrequencyTrie { throw new IllegalArgumentException("count must be at least 1."); } + final String normalizedKey = normalizeDictionaryKey(key); + MutableNode current = this.root; - for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) { - final Character edge = key.charAt(this.traversalDirection.logicalIndex(key.length(), traversalOffset)); + for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) { + final 
Character edge = normalizedKey + .charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset)); MutableNode child = current.children().get(edge); if (child == null) { child = new MutableNode<>(); // NOPMD @@ -869,6 +906,23 @@ public final class FrequencyTrie { return this; } + private String normalizeDictionaryKey(final String key) { + String normalized = key; + + if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { + normalized = normalized.toLowerCase(Locale.ROOT); + } + + if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) { + normalized = DiacriticStripper.strip(normalized); + } else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) { + throw new UnsupportedOperationException( + "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet."); + } + + return normalized; + } + /** * Returns the number of mutable build-time nodes currently reachable from the * builder root. 
diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java b/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java index 20a5f95..da61b5c 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java +++ b/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java @@ -88,7 +88,8 @@ public final class FrequencyTrieBuilders { Objects.requireNonNull(reductionSettings, "reductionSettings"); final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings, - source.traversalDirection()); + source.traversalDirection(), source.metadata().caseProcessingMode(), + source.metadata().diacriticProcessingMode()); final StringBuilder keyBuilder = new StringBuilder(64); copyNode(source.root(), keyBuilder, builder, source.traversalDirection()); diff --git a/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java new file mode 100644 index 0000000..66a8086 --- /dev/null +++ b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java @@ -0,0 +1,109 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +import org.junit.jupiter.api.DisplayName; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; + +/** + * Unit tests for {@link DiacriticStripper}. + */ +@Tag("unit") +@Tag("diacritics") +@DisplayName("DiacriticStripper") +class DiacriticStripperTest { + + /** + * Verifies that pure ASCII input is returned unchanged and without allocating a + * new string instance. + */ + @Test + @DisplayName("ASCII input is returned as-is") + void asciiInputIsReturnedAsIs() { + final String input = "plain-ascii-123"; + + final String stripped = DiacriticStripper.strip(input); + + assertSame(input, stripped); + } + + /** + * Verifies direct-table replacements for Czech and other common diacritics. 
+ */ + @Test + @DisplayName("Direct replacement table strips common diacritics") + void directReplacementTableStripsCommonDiacritics() { + assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň")); + } + + /** + * Verifies explicit multi-character replacements for ligatures and sharp s. + */ + @Test + @DisplayName("Special replacements support multi-character ASCII output") + void specialReplacementsSupportMultiCharacterAsciiOutput() { + assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre")); + assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre")); + } + + /** + * Verifies Unicode decomposition fallback for characters not in the direct + * replacement table. + */ + @Test + @DisplayName("Unicode decomposition fallback strips combining marks") + void unicodeDecompositionFallbackStripsCombiningMarks() { + assertEquals("I", DiacriticStripper.strip("İ")); + } + + /** + * Verifies behavior for non-Latin letters that cannot be mapped to ASCII. + */ + @Test + @DisplayName("Unmappable non-Latin characters remain unchanged") + void unmappableNonLatinCharactersRemainUnchanged() { + assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz")); + } + + /** + * Verifies mixed input where normalization starts mid-string and subsequent + * unchanged characters are preserved. 
+ */ + @Test + @DisplayName("Mixed input preserves untouched characters after normalization starts") + void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() { + assertEquals("Cafe-123", DiacriticStripper.strip("Café-123")); + } +} diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java index a1e9878..6835799 100644 --- a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java +++ b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java @@ -220,6 +220,47 @@ class FrequencyTrieTest { () -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE"))); } + /** + * Verifies that REMOVE mode strips diacritics both at build time and at lookup + * time and composes independently with case normalization. + */ + @Test + @DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys") + void diacriticRemoveModeStripsDictionaryAndLookupKeys() { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, + ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, + DiacriticProcessingMode.REMOVE); + builder.put("Příliš", "cz"); + builder.put("žluťoučký", "cz2"); + builder.put("Smørrebrød", "da"); + + final FrequencyTrie trie = builder.build(); + + assertAll( + () -> assertEquals("cz", trie.get("PRILIS")), + () -> assertEquals("cz", trie.get("příliš")), + () -> assertEquals("cz2", trie.get("zlutoucky")), + () -> assertEquals("da", trie.get("SMORREBROD")), + () -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis"))); + } + + /** + * Verifies that fallback diacritic mode is explicitly rejected for now. 
+ */ + @Test + @DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet") + void fallbackDiacriticModeIsNotSupportedYet() { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, + ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS, + DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK); + + final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class, + () -> builder.put("kůň", "horse")); + assertTrue(exception.getMessage().contains("not supported yet")); + } + /** * Verifies that lookup preserves casing when metadata uses AS_IS mode. */