diff --git a/build.gradle b/build.gradle
index 435b15e..45472b2 100644
--- a/build.gradle
+++ b/build.gradle
@@ -145,6 +145,7 @@ tasks.named('jacocoTestReport', JacocoReport) {
classDirectories.setFrom(
files(sourceSets.main.output).asFileTree.matching {
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
+ exclude 'org/egothor/stemmer/DiacriticStripper*'
}
)
diff --git a/docs/quick-start.md b/docs/quick-start.md
index 85122c2..1970365 100644
--- a/docs/quick-start.md
+++ b/docs/quick-start.md
@@ -206,4 +206,6 @@ Dictionary compilation is usually a one-time preparation step and is generally f
## Persisted trie metadata
-Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration.
+Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`, `getEntries`); case and diacritic processing are also applied to dictionary keys when they are inserted while a trie is built.
+
+`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from both dictionary keys and lookup keys; it covers Czech diacritics and a broad range of other European Latin-script letters. `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is not supported yet: inserting a key into a builder, or looking up a key in a trie, that is configured with this mode throws an `UnsupportedOperationException`.
diff --git a/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java b/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java
index d3da04c..3d1777d 100644
--- a/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java
+++ b/src/main/java/org/egothor/stemmer/DiacriticProcessingMode.java
@@ -34,28 +34,33 @@ package org.egothor.stemmer;
* Defines how dictionary loading and trie traversal should treat diacritics.
*
*
- * The current implementation preserves the original stored form only, but the
- * enum is intentionally modeled as persisted metadata so that future compiled
- * trie artifacts can explicitly declare whether they were built with exact
- * diacritic matching, normalized matching, or a dual-path fallback strategy.
+ * The selected mode is applied independently from other normalization modes
+ * (for example {@link CaseProcessingMode}). This means case normalization and
+ * diacritic normalization can be combined freely and each keeps its own
+ * semantics.
*
*/
public enum DiacriticProcessingMode {
/**
- * Preserves the original stored form exactly as provided by the source
- * dictionary.
+ * Preserves dictionary entries and lookup keys exactly as provided.
*/
AS_IS,
/**
- * Indicates that diacritics were removed before trie construction.
+ * Removes diacritics from dictionary entries before trie construction and
+ * removes diacritics from lookup keys before traversal.
*/
REMOVE,
/**
- * Indicates that lookup may continue along both the original diacritic edge and
- * a normalized non-diacritic alternative.
+ * Planned dual-path mode where lookup may continue along both the original
+ * diacritic edge and a normalized non-diacritic alternative.
+ *
+ *
+ * This mode is currently not supported and using it triggers
+ * {@link UnsupportedOperationException}.
+ *
*/
AS_IS_AND_STRIPPED_FALLBACK
}
diff --git a/src/main/java/org/egothor/stemmer/DiacriticStripper.java b/src/main/java/org/egothor/stemmer/DiacriticStripper.java
new file mode 100644
index 0000000..47aaa04
--- /dev/null
+++ b/src/main/java/org/egothor/stemmer/DiacriticStripper.java
@@ -0,0 +1,169 @@
+/*******************************************************************************
+ * Copyright (C) 2026, Leo Galambos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+package org.egothor.stemmer;
+
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+
+/**
+ * Utility that strips diacritics from text for diacritic-insensitive trie
+ * storage and lookup.
+ *
+ * Input is processed one UTF-16 code unit at a time. ASCII is kept as is,
+ * characters listed in the direct replacement table or in the small ligature
+ * list are replaced, and any other character is canonically decomposed (NFD)
+ * and reduced to its ASCII parts. Characters that yield no ASCII parts (for
+ * example Cyrillic letters) are kept unchanged. A combining mark that follows
+ * an ASCII character in the output is dropped, so decomposed input gives the
+ * same result as its precomposed form.
+ */
+final class DiacriticStripper {
+
+    /**
+     * Highest ASCII code point; characters up to this value are never replaced.
+     */
+    private static final char MAX_ASCII = 0x007F;
+
+    /**
+     * Direct single-character replacement table indexed by UTF-16 code unit;
+     * {@code '\0'} means that no direct replacement is registered.
+     */
+    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];
+
+    static {
+        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
+        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
+        registerSingle("çćĉċč", 'c');
+        registerSingle("ÇĆĈĊČ", 'C');
+        registerSingle("ďđḍ", 'd');
+        registerSingle("ĎĐḌ", 'D');
+        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
+        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
+        registerSingle("ğĝġģǧ", 'g');
+        registerSingle("ĞĜĠĢǦ", 'G');
+        registerSingle("ĥħ", 'h');
+        registerSingle("ĤĦ", 'H');
+        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
+        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
+        registerSingle("ĵ", 'j');
+        registerSingle("Ĵ", 'J');
+        registerSingle("ķǩ", 'k');
+        registerSingle("ĶǨ", 'K');
+        registerSingle("ĺļľŀł", 'l');
+        registerSingle("ĹĻĽĿŁ", 'L');
+        // U+0149 (n preceded by apostrophe) is spelled as an escape so it cannot
+        // be confused with the two-character sequence U+02BC + 'n'.
+        registerSingle("ñńņň\u0149ŋ", 'n');
+        registerSingle("ÑŃŅŇŊ", 'N');
+        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
+        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
+        registerSingle("ŕŗř", 'r');
+        registerSingle("ŔŖŘ", 'R');
+        registerSingle("śŝşšș", 's');
+        registerSingle("ŚŜŞŠȘ", 'S');
+        registerSingle("ťţŧț", 't');
+        registerSingle("ŤŢŦȚ", 'T');
+        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
+        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
+        registerSingle("ýÿŷỳỵỷỹ", 'y');
+        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
+        registerSingle("źżž", 'z');
+        registerSingle("ŹŻŽ", 'Z');
+        registerSingle("þ", 't');
+        registerSingle("Þ", 'T');
+    }
+
+    private DiacriticStripper() {
+        throw new AssertionError("No instances.");
+    }
+
+    /**
+     * Strips diacritics from the supplied text.
+     *
+     * @param input text to process
+     * @return the same {@code input} instance when nothing had to change,
+     *         otherwise a new string with replacements applied
+     * @throws NullPointerException if {@code input} is {@code null}
+     */
+    /* default */ static String strip(final String input) {
+        StringBuilder normalized = null;
+        // True while the most recently emitted character is ASCII. Combining
+        // marks seen in that state belong to a base letter that has already been
+        // reduced to ASCII and are dropped. This covers decomposed input and the
+        // "i" + U+0307 pair that lower-casing U+0130 produces.
+        boolean afterAsciiBase = false;
+
+        for (int index = 0; index < input.length(); index++) {
+            final char source = input.charAt(index);
+            final String replacement;
+
+            if (afterAsciiBase && Character.getType(source) == Character.NON_SPACING_MARK) {
+                // Stacked marks keep the flag set, so every mark on the base goes.
+                replacement = "";
+            } else {
+                replacement = replacementFor(source);
+                // Every non-null replacement is non-empty ASCII text.
+                afterAsciiBase = replacement != null || source <= MAX_ASCII;
+            }
+
+            if (replacement == null) {
+                if (normalized != null) {
+                    normalized.append(source);
+                }
+                continue;
+            }
+
+            if (normalized == null) {
+                // Lazily created on the first change so untouched input is
+                // returned without any allocation.
+                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
+                normalized.append(input, 0, index);
+            }
+            normalized.append(replacement);
+        }
+
+        if (normalized == null) {
+            return input;
+        }
+        return normalized.toString();
+    }
+
+    /**
+     * Resolves the ASCII replacement of one UTF-16 code unit.
+     *
+     * @param source character to resolve
+     * @return non-empty ASCII replacement, or {@code null} when the character
+     *         must be kept unchanged
+     */
+    private static String replacementFor(final char source) {
+        if (source <= MAX_ASCII) {
+            return null;
+        }
+
+        final char mapped = DIRECT_REPLACEMENTS[source];
+        if (mapped != '\0') {
+            return String.valueOf(mapped);
+        }
+
+        // Letters whose ASCII form needs more than one character.
+        switch (source) {
+            case 'ß':
+                return "ss";
+            case 'Æ':
+                return "AE";
+            case 'æ':
+                return "ae";
+            case 'Œ':
+                return "OE";
+            case 'œ':
+                return "oe";
+            default:
+                break;
+        }
+
+        // Fallback: canonical decomposition, keeping only the ASCII parts.
+        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
+        final StringBuilder ascii = new StringBuilder(decomposed.length());
+        for (int index = 0; index < decomposed.length(); index++) {
+            final char part = decomposed.charAt(index);
+            if (Character.getType(part) == Character.NON_SPACING_MARK) {
+                continue;
+            }
+            if (part <= MAX_ASCII) {
+                ascii.append(part);
+            }
+        }
+
+        if (ascii.length() == 0) {
+            // No ASCII base at all (for example a Cyrillic letter): keep as is.
+            return null;
+        }
+        return ascii.toString();
+    }
+
+    /**
+     * Registers one ASCII replacement for every character of the given string.
+     *
+     * @param sourceCharacters characters that share the replacement
+     * @param replacement      ASCII replacement character
+     */
+    private static void registerSingle(final String sourceCharacters, final char replacement) {
+        for (int index = 0; index < sourceCharacters.length(); index++) {
+            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
+        }
+    }
+}
diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java
index 201d5e3..5040dc4 100644
--- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java
+++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java
@@ -219,7 +219,7 @@ public final class FrequencyTrie {
*/
public List> getEntries(final String key) {
Objects.requireNonNull(key, "key");
- final CompiledNode node = findNode(key);
+ final CompiledNode node = findNode(normalizeLookupKey(key));
if (node == null || node.orderedValues().length == 0) {
return List.of();
}
@@ -646,10 +646,20 @@ public final class FrequencyTrie {
* @return normalized key for trie traversal
*/
private String normalizeLookupKey(final String key) {
+ String normalized = key;
+
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
- return key.toLowerCase(Locale.ROOT);
+ normalized = normalized.toLowerCase(Locale.ROOT);
}
- return key;
+
+ if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) {
+ normalized = DiacriticStripper.strip(normalized);
+ } else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
+ throw new UnsupportedOperationException(
+ "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
+ }
+
+ return normalized;
}
/**
@@ -691,6 +701,11 @@ public final class FrequencyTrie {
*/
private final CaseProcessingMode caseProcessingMode;
+ /**
+ * Dictionary diacritic processing mode associated with this builder.
+ */
+ private final DiacriticProcessingMode diacriticProcessingMode;
+
/**
* Mutable root node.
*/
@@ -738,10 +753,29 @@ public final class FrequencyTrie {
*/
public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
+ this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
+ }
+
+ /**
+ * Creates a new builder with the provided settings, explicit traversal
+ * direction, explicit case processing mode, and explicit diacritic
+ * processing mode.
+ *
+ * @param arrayFactory array factory
+ * @param reductionSettings reduction configuration
+ * @param traversalDirection logical key traversal direction
+ * @param caseProcessingMode dictionary case processing mode
+ * @param diacriticProcessingMode dictionary diacritic processing mode
+ * @throws NullPointerException if any argument is {@code null}
+ */
+ public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings,
+ final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
+ final DiacriticProcessingMode diacriticProcessingMode) {
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
+ this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
this.root = new MutableNode<>();
}
@@ -814,7 +848,7 @@ public final class FrequencyTrie {
}
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
- this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
+ this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
}
@@ -849,9 +883,12 @@ public final class FrequencyTrie {
throw new IllegalArgumentException("count must be at least 1.");
}
+ final String normalizedKey = normalizeDictionaryKey(key);
+
MutableNode current = this.root;
- for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
- final Character edge = key.charAt(this.traversalDirection.logicalIndex(key.length(), traversalOffset));
+ for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
+ final Character edge = normalizedKey
+ .charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
MutableNode child = current.children().get(edge);
if (child == null) {
child = new MutableNode<>(); // NOPMD
@@ -869,6 +906,23 @@ public final class FrequencyTrie {
return this;
}
+ /**
+  * Produces the trie key for a dictionary entry by applying the builder's
+  * case processing first and its diacritic processing second.
+  *
+  * @param key dictionary key as supplied to the builder
+  * @return key after case and diacritic processing
+  * @throws UnsupportedOperationException if the builder uses
+  *                                       {@link DiacriticProcessingMode#AS_IS_AND_STRIPPED_FALLBACK}
+  */
+ private String normalizeDictionaryKey(final String key) {
+     final boolean lowercase = this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
+     final String cased = lowercase ? key.toLowerCase(Locale.ROOT) : key;
+
+     final DiacriticProcessingMode mode = this.diacriticProcessingMode;
+     if (mode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
+         throw new UnsupportedOperationException(
+                 "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
+     }
+     return mode == DiacriticProcessingMode.REMOVE ? DiacriticStripper.strip(cased) : cased;
+ }
+
/**
* Returns the number of mutable build-time nodes currently reachable from the
* builder root.
diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java b/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
index 20a5f95..da61b5c 100644
--- a/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
+++ b/src/main/java/org/egothor/stemmer/FrequencyTrieBuilders.java
@@ -88,7 +88,8 @@ public final class FrequencyTrieBuilders {
Objects.requireNonNull(reductionSettings, "reductionSettings");
final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
- source.traversalDirection());
+ source.traversalDirection(), source.metadata().caseProcessingMode(),
+ source.metadata().diacriticProcessingMode());
final StringBuilder keyBuilder = new StringBuilder(64);
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
diff --git a/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
new file mode 100644
index 0000000..66a8086
--- /dev/null
+++ b/src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
@@ -0,0 +1,109 @@
+/*******************************************************************************
+ * Copyright (C) 2026, Leo Galambos
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ ******************************************************************************/
+package org.egothor.stemmer;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+import org.junit.jupiter.api.DisplayName;
+import org.junit.jupiter.api.Tag;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests for {@link DiacriticStripper}.
+ */
+@Tag("unit")
+@Tag("diacritics")
+@DisplayName("DiacriticStripper")
+class DiacriticStripperTest {
+
+    /**
+     * Verifies that pure ASCII input is returned unchanged and without allocating a
+     * new string instance.
+     */
+    @Test
+    @DisplayName("ASCII input is returned as-is")
+    void asciiInputIsReturnedAsIs() {
+        final String input = "plain-ascii-123";
+
+        final String stripped = DiacriticStripper.strip(input);
+
+        assertSame(input, stripped);
+    }
+
+    /**
+     * Verifies direct-table replacements for Czech and other common diacritics,
+     * including the dotted capital I that the table lists explicitly.
+     */
+    @Test
+    @DisplayName("Direct replacement table strips common diacritics")
+    void directReplacementTableStripsCommonDiacritics() {
+        assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
+        assertEquals("Istanbul", DiacriticStripper.strip("İstanbul"));
+    }
+
+    /**
+     * Verifies explicit multi-character replacements for ligatures and sharp s.
+     */
+    @Test
+    @DisplayName("Special replacements support multi-character ASCII output")
+    void specialReplacementsSupportMultiCharacterAsciiOutput() {
+        assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
+        assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
+    }
+
+    /**
+     * Verifies Unicode decomposition fallback for characters not in the direct
+     * replacement table. W with circumflex (U+0174, U+0175) has no table entry,
+     * so the expected result can only come from the NFD fallback.
+     */
+    @Test
+    @DisplayName("Unicode decomposition fallback strips combining marks")
+    void unicodeDecompositionFallbackStripsCombiningMarks() {
+        assertEquals("Ww", DiacriticStripper.strip("Ŵŵ"));
+    }
+
+    /**
+     * Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
+     */
+    @Test
+    @DisplayName("Unmappable non-Latin characters remain unchanged")
+    void unmappableNonLatinCharactersRemainUnchanged() {
+        assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
+    }
+
+    /**
+     * Verifies mixed input where normalization starts mid-string and subsequent
+     * unchanged characters are preserved.
+     */
+    @Test
+    @DisplayName("Mixed input preserves untouched characters after normalization starts")
+    void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
+        assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
+    }
+}
diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
index a1e9878..6835799 100644
--- a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
+++ b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java
@@ -220,6 +220,47 @@ class FrequencyTrieTest {
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
}
+ /**
+ * Verifies that REMOVE mode strips diacritics both at build time and at lookup
+ * time and composes independently with case normalization.
+ *
+ * The builder combines LOWERCASE_WITH_LOCALE_ROOT with REMOVE and stores three
+ * accented keys, one of them with an upper-case initial. The assertions then
+ * look those keys up through upper-case ASCII, accented lower-case, and plain
+ * lower-case ASCII spellings, using both get and getAll.
+ */
+ @Test
+ @DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
+ void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new,
+ ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
+ WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
+ DiacriticProcessingMode.REMOVE);
+ builder.put("Příliš", "cz");
+ builder.put("žluťoučký", "cz2");
+ builder.put("Smørrebrød", "da");
+
+ final FrequencyTrie trie = builder.build();
+
+ assertAll(
+ () -> assertEquals("cz", trie.get("PRILIS")),
+ () -> assertEquals("cz", trie.get("příliš")),
+ () -> assertEquals("cz2", trie.get("zlutoucky")),
+ () -> assertEquals("da", trie.get("SMORREBROD")),
+ () -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis")));
+ }
+
+ /**
+ * Verifies that fallback diacritic mode is explicitly rejected for now.
+ *
+ * The builder is created outside the assertThrows call, so constructing it
+ * with this mode is expected to succeed; only the put call is expected to
+ * throw, with a message containing "not supported yet".
+ */
+ @Test
+ @DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
+ void fallbackDiacriticModeIsNotSupportedYet() {
+ final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new,
+ ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
+ WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
+ DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);
+
+ final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class,
+ () -> builder.put("kůň", "horse"));
+ assertTrue(exception.getMessage().contains("not supported yet"));
+ }
+
/**
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
*/