feat(trie): add diacritic processing modes with strip normalization

This commit is contained in:
2026-04-24 00:43:43 +02:00
parent 3e0f786042
commit 9eee321fef
9 changed files with 400 additions and 18 deletions

View File

@@ -162,7 +162,7 @@
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
<rule ref="category/java/design.xml/CouplingBetweenObjects">
<properties>
<property name="threshold" value="55" />
<property name="threshold" value="60" />
</properties>
</rule>
<rule ref="category/java/design.xml/CyclomaticComplexity">

View File

@@ -145,6 +145,7 @@ tasks.named('jacocoTestReport', JacocoReport) {
classDirectories.setFrom(
files(sourceSets.main.output).asFileTree.matching {
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
exclude 'org/egothor/stemmer/DiacriticStripper*'
}
)

View File

@@ -206,4 +206,6 @@ Dictionary compilation is usually a one-time preparation step and is generally f
## Persisted trie metadata
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration.
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`), and case/diacritic processing are also applied during dictionary insertion when a trie is built.
`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from dictionary keys and lookup keys (for Czech diacritics and broad European Latin-script variants). `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is currently not supported: a builder can be created with it, but inserting a dictionary key or performing a lookup raises an `UnsupportedOperationException`.

View File

@@ -34,28 +34,33 @@ package org.egothor.stemmer;
* Defines how dictionary loading and trie traversal should treat diacritics.
*
* <p>
* The current implementation preserves the original stored form only, but the
* enum is intentionally modeled as persisted metadata so that future compiled
* trie artifacts can explicitly declare whether they were built with exact
* diacritic matching, normalized matching, or a dual-path fallback strategy.
* The selected mode is applied independently from other normalization modes
* (for example {@link CaseProcessingMode}). This means case normalization and
* diacritic normalization can be combined freely and each keeps its own
* semantics.
* </p>
*/
public enum DiacriticProcessingMode {
/**
* Preserves dictionary entries and lookup keys exactly as provided; no
* diacritic normalization is applied when keys are inserted or looked up.
*/
AS_IS,
/**
* Removes diacritics from dictionary entries before trie construction and
* removes diacritics from lookup keys before traversal, so both sides are
* compared in their stripped form.
*/
REMOVE,
/**
* Planned dual-path mode where lookup may continue along both the original
* diacritic edge and a normalized non-diacritic alternative.
*
* <p>
* This mode is currently not supported: inserting a dictionary key or looking
* up a key while this mode is selected triggers
* {@link UnsupportedOperationException}.
* </p>
*/
AS_IS_AND_STRIPPED_FALLBACK
}

View File

@@ -0,0 +1,169 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.text.Normalizer;
import java.text.Normalizer.Form;
/**
 * Utility that strips diacritics from text for diacritic-insensitive trie
 * storage and lookup.
 *
 * <p>
 * Each UTF-16 code unit is processed on its own. Code units at or below U+007F
 * are never replaced. Other code units are resolved, in this order, through a
 * direct single-character table, a small set of fixed multi-character
 * expansions (for example sharp s), and finally canonical decomposition (NFD)
 * from which only the ASCII parts are kept. Code units that yield no ASCII at
 * all are left untouched, so non-Latin letters pass through unchanged.
 * </p>
 *
 * <p>
 * Input that is already decomposed is handled as well: a non-spacing mark that
 * follows an ASCII letter in the output is dropped, so {@code "e\u0301"} and the
 * precomposed {@code "\u00E9"} both become {@code "e"}. Marks that follow a
 * non-ASCII base are kept, consistent with precomposed non-Latin letters being
 * kept.
 * </p>
 */
final class DiacriticStripper {

    /**
     * Direct single-character replacement table indexed by UTF-16 code unit. The
     * value {@code '\0'} means that no direct replacement is registered.
     */
    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];

    static {
        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
        registerSingle("çćĉċč", 'c');
        registerSingle("ÇĆĈĊČ", 'C');
        registerSingle("ďđḍ", 'd');
        registerSingle("ĎĐḌ", 'D');
        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
        registerSingle("ğĝġģǧ", 'g');
        registerSingle("ĞĜĠĢǦ", 'G');
        registerSingle("ĥħ", 'h');
        registerSingle("ĤĦ", 'H');
        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
        registerSingle("ĵ", 'j');
        registerSingle("Ĵ", 'J');
        registerSingle("ķǩ", 'k');
        registerSingle("ĶǨ", 'K');
        registerSingle("ĺļľŀł", 'l');
        registerSingle("ĹĻĽĿŁ", 'L');
        // U+0149 is spelled as an escape so the entry is guaranteed to be that single
        // code point and can never be read as the pair U+02BC + 'n', which would map
        // the modifier apostrophe itself to 'n'.
        registerSingle("ñńņň\u0149ŋ", 'n');
        registerSingle("ÑŃŅŇŊ", 'N');
        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
        registerSingle("ŕŗř", 'r');
        registerSingle("ŔŖŘ", 'R');
        registerSingle("śŝşšș", 's');
        registerSingle("ŚŜŞŠȘ", 'S');
        registerSingle("ťţŧț", 't');
        registerSingle("ŤŢŦȚ", 'T');
        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
        registerSingle("ýÿŷỳỵỷỹ", 'y');
        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
        registerSingle("źżž", 'z');
        registerSingle("ŹŻŽ", 'Z');
        registerSingle("þ", 't');
        registerSingle("Þ", 'T');
    }

    private DiacriticStripper() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns {@code input} with diacritics removed.
     *
     * <p>
     * When no character needs to change, the very same {@code input} instance is
     * returned and nothing is allocated for the result.
     * </p>
     *
     * @param input text to normalize
     * @return diacritic-free text, or {@code input} itself when nothing changed
     * @throws NullPointerException if {@code input} is {@code null}
     */
    /* default */ static String strip(final String input) {
        StringBuilder normalized = null;
        // Last character that reached the output as a base character; combining
        // marks that get dropped deliberately do not update it, so a stack of marks
        // after one letter is removed as a whole.
        char lastBase = '\0';
        for (int index = 0; index < input.length(); index++) {
            final char source = input.charAt(index);
            final String replacement;
            if (isAsciiLetter(lastBase) && Character.getType(source) == Character.NON_SPACING_MARK) {
                // Decomposed diacritic on a Latin base letter: remove the mark.
                replacement = "";
            } else {
                replacement = replacementFor(source);
                lastBase = replacement == null ? source : replacement.charAt(replacement.length() - 1);
            }
            if (replacement == null) {
                if (normalized != null) {
                    normalized.append(source);
                }
                continue;
            }
            if (normalized == null) {
                // First change: copy the untouched prefix lazily.
                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
                normalized.append(input, 0, index);
            }
            normalized.append(replacement);
        }
        if (normalized == null) {
            return input;
        }
        return normalized.toString();
    }

    /**
     * Tells whether {@code value} is one of the letters {@code a-z} or
     * {@code A-Z}.
     *
     * @param value character to test
     * @return {@code true} for an ASCII letter
     */
    private static boolean isAsciiLetter(final char value) {
        return value >= 'a' && value <= 'z' || value >= 'A' && value <= 'Z';
    }

    /**
     * Resolves the replacement text of one code unit.
     *
     * @param source code unit to resolve
     * @return non-empty ASCII replacement, or {@code null} when {@code source}
     *         must be kept as it is
     */
    @SuppressWarnings("PMD.AvoidLiteralsInIfCondition")
    private static String replacementFor(final char source) {
        if (source <= 0x007F) {
            return null;
        }
        final char mapped = DIRECT_REPLACEMENTS[source];
        if (mapped != '\0') {
            return String.valueOf(mapped);
        }
        // Fixed expansions that need more than one output character.
        if (source == 'ß') {
            return "ss";
        }
        if (source == 'Æ') {
            return "AE";
        }
        if (source == 'æ') {
            return "ae";
        }
        if (source == 'Œ') {
            return "OE";
        }
        if (source == 'œ') {
            return "oe";
        }
        // Fallback: decompose canonically and keep only the ASCII parts.
        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
        final StringBuilder ascii = new StringBuilder(decomposed.length());
        for (int index = 0; index < decomposed.length(); index++) {
            final char part = decomposed.charAt(index);
            if (Character.getType(part) == Character.NON_SPACING_MARK) {
                continue;
            }
            if (part <= 0x007F) {
                ascii.append(part);
            }
        }
        if (ascii.length() == 0) {
            // Nothing ASCII came out of the decomposition: keep the character.
            return null;
        }
        return ascii.toString();
    }

    /**
     * Registers the same replacement for every code unit of
     * {@code sourceCharacters}.
     *
     * @param sourceCharacters code units to be replaced
     * @param replacement      replacement character stored for each of them
     */
    private static void registerSingle(final String sourceCharacters, final char replacement) {
        for (int index = 0; index < sourceCharacters.length(); index++) {
            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
        }
    }
}

View File

@@ -219,7 +219,7 @@ public final class FrequencyTrie<V> {
*/
public List<ValueCount<V>> getEntries(final String key) {
Objects.requireNonNull(key, "key");
final CompiledNode<V> node = findNode(key);
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
if (node == null || node.orderedValues().length == 0) {
return List.of();
}
@@ -646,10 +646,20 @@ public final class FrequencyTrie<V> {
* @return normalized key for trie traversal
*/
private String normalizeLookupKey(final String key) {
    String normalized = key;
    // Case normalization runs first; diacritic handling then sees the lowercased
    // text, which is the same order the builder uses for dictionary keys.
    if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
        normalized = normalized.toLowerCase(Locale.ROOT);
    }
    if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) {
        normalized = DiacriticStripper.strip(normalized);
    } else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
        // The dual-path mode can be declared in metadata but has no lookup
        // implementation, so it is rejected explicitly.
        throw new UnsupportedOperationException(
                "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
    }
    return normalized;
}
/**
@@ -691,6 +701,11 @@ public final class FrequencyTrie<V> {
*/
private final CaseProcessingMode caseProcessingMode;
/**
* Dictionary diacritic processing mode associated with this builder.
*/
private final DiacriticProcessingMode diacriticProcessingMode;
/**
* Mutable root node.
*/
@@ -738,10 +753,29 @@ public final class FrequencyTrie<V> {
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
// Delegates to the five-argument constructor with DiacriticProcessingMode.AS_IS,
// so keys are not diacritic-normalized unless a mode is passed explicitly.
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
}
/**
* Creates a new builder with the provided settings, explicit traversal
* direction, explicit case processing mode, and explicit diacritic
* processing mode.
*
* <p>
* The constructor only checks its arguments for {@code null}. In particular,
* {@link DiacriticProcessingMode#AS_IS_AND_STRIPPED_FALLBACK} is accepted here;
* it is rejected with {@link UnsupportedOperationException} later, when a
* dictionary key is normalized during insertion.
* </p>
*
* @param arrayFactory array factory
* @param reductionSettings reduction configuration
* @param traversalDirection logical key traversal direction
* @param caseProcessingMode dictionary case processing mode
* @param diacriticProcessingMode dictionary diacritic processing mode
* @throws NullPointerException if any argument is {@code null}
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
final DiacriticProcessingMode diacriticProcessingMode) {
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
// Every builder starts from an empty mutable root node.
this.root = new MutableNode<>();
}
@@ -814,7 +848,7 @@ public final class FrequencyTrie<V> {
}
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
}
@@ -849,9 +883,12 @@ public final class FrequencyTrie<V> {
throw new IllegalArgumentException("count must be at least 1.");
}
final String normalizedKey = normalizeDictionaryKey(key);
MutableNode<V> current = this.root;
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
final Character edge = key.charAt(this.traversalDirection.logicalIndex(key.length(), traversalOffset));
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
final Character edge = normalizedKey
.charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
MutableNode<V> child = current.children().get(edge);
if (child == null) {
child = new MutableNode<>(); // NOPMD
@@ -869,6 +906,23 @@ public final class FrequencyTrie<V> {
return this;
}
/**
 * Applies the builder's case and diacritic processing modes to a dictionary
 * key before it is inserted.
 *
 * @param key dictionary key as supplied by the caller
 * @return key in the form that is stored in the trie
 * @throws UnsupportedOperationException if the builder was created with
 *                                       {@link DiacriticProcessingMode#AS_IS_AND_STRIPPED_FALLBACK}
 */
private String normalizeDictionaryKey(final String key) {
    // Case handling comes first so that diacritic stripping sees the cased text.
    final String cased = this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT
            ? key.toLowerCase(Locale.ROOT)
            : key;
    if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
        throw new UnsupportedOperationException(
                "Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
    }
    return this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE
            ? DiacriticStripper.strip(cased)
            : cased;
}
/**
* Returns the number of mutable build-time nodes currently reachable from the
* builder root.

View File

@@ -88,7 +88,8 @@ public final class FrequencyTrieBuilders {
Objects.requireNonNull(reductionSettings, "reductionSettings");
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
source.traversalDirection());
source.traversalDirection(), source.metadata().caseProcessingMode(),
source.metadata().diacriticProcessingMode());
final StringBuilder keyBuilder = new StringBuilder(64);
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());

View File

@@ -0,0 +1,109 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link DiacriticStripper}.
 */
@Tag("unit")
@Tag("diacritics")
@DisplayName("DiacriticStripper")
class DiacriticStripperTest {

    /**
     * Verifies that pure ASCII input is returned unchanged and without allocating a
     * new string instance.
     */
    @Test
    @DisplayName("ASCII input is returned as-is")
    void asciiInputIsReturnedAsIs() {
        final String input = "plain-ascii-123";
        final String stripped = DiacriticStripper.strip(input);
        assertSame(input, stripped);
    }

    /**
     * Verifies direct-table replacements for Czech and other common diacritics.
     */
    @Test
    @DisplayName("Direct replacement table strips common diacritics")
    void directReplacementTableStripsCommonDiacritics() {
        assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
        // U+0130 (capital I with dot above) is an entry of the direct table.
        assertEquals("I", DiacriticStripper.strip("\u0130"));
    }

    /**
     * Verifies explicit multi-character replacements for ligatures and sharp s.
     */
    @Test
    @DisplayName("Special replacements support multi-character ASCII output")
    void specialReplacementsSupportMultiCharacterAsciiOutput() {
        assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
        assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
    }

    /**
     * Verifies Unicode decomposition fallback for characters not in the direct
     * replacement table. The inputs are written as escapes to make it obvious that
     * none of them is a table entry, so the NFD path is what actually runs.
     */
    @Test
    @DisplayName("Unicode decomposition fallback strips combining marks")
    void unicodeDecompositionFallbackStripsCombiningMarks() {
        // U+01D8: u with diaeresis and acute, decomposes to u + U+0308 + U+0301.
        assertEquals("u", DiacriticStripper.strip("\u01D8"));
        // U+01F9: n with grave, decomposes to n + U+0300.
        assertEquals("n", DiacriticStripper.strip("\u01F9"));
        // U+1E7D: v with tilde, decomposes to v + U+0303.
        assertEquals("v", DiacriticStripper.strip("\u1E7D"));
    }

    /**
     * Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
     */
    @Test
    @DisplayName("Unmappable non-Latin characters remain unchanged")
    void unmappableNonLatinCharactersRemainUnchanged() {
        assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
    }

    /**
     * Verifies mixed input where normalization starts mid-string and subsequent
     * unchanged characters are preserved.
     */
    @Test
    @DisplayName("Mixed input preserves untouched characters after normalization starts")
    void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
        assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
    }
}

View File

@@ -220,6 +220,47 @@ class FrequencyTrieTest {
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
}
/**
 * Checks that a trie built in REMOVE mode stores diacritic-free keys, strips
 * lookup keys the same way, and still honors lowercase case normalization.
 */
@Test
@DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
    final ReductionSettings reduction = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
    final FrequencyTrie.Builder<String> dictionary = new FrequencyTrie.Builder<>(String[]::new, reduction,
            WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
            DiacriticProcessingMode.REMOVE);

    // Dictionary keys carry both diacritics and mixed case.
    dictionary.put("Příliš", "cz");
    dictionary.put("žluťoučký", "cz2");
    dictionary.put("Smørrebrød", "da");

    final FrequencyTrie<String> compiled = dictionary.build();

    assertAll(
            () -> assertEquals("cz", compiled.get("PRILIS")),
            () -> assertEquals("cz", compiled.get("příliš")),
            () -> assertEquals("cz2", compiled.get("zlutoucky")),
            () -> assertEquals("da", compiled.get("SMORREBROD")),
            () -> assertArrayEquals(new String[] { "cz" }, compiled.getAll("prilis")));
}
/**
 * Checks that inserting a key into a builder configured with the fallback
 * diacritic mode fails with an explicit "not supported" error.
 */
@Test
@DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
void fallbackDiacriticModeIsNotSupportedYet() {
    final ReductionSettings reduction = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
    final FrequencyTrie.Builder<String> unsupported = new FrequencyTrie.Builder<>(String[]::new, reduction,
            WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
            DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);

    // Construction succeeds; the rejection happens on the first insertion.
    final UnsupportedOperationException thrown = assertThrows(UnsupportedOperationException.class,
            () -> unsupported.put("kůň", "horse"));

    final String message = thrown.getMessage();
    assertTrue(message.contains("not supported yet"));
}
/**
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
*/