feat(trie): add diacritic processing modes with strip normalization
This commit is contained in:
2
.ruleset
2
.ruleset
@@ -162,7 +162,7 @@
|
||||
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
||||
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
||||
<properties>
|
||||
<property name="threshold" value="55" />
|
||||
<property name="threshold" value="60" />
|
||||
</properties>
|
||||
</rule>
|
||||
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
||||
|
||||
@@ -145,6 +145,7 @@ tasks.named('jacocoTestReport', JacocoReport) {
|
||||
classDirectories.setFrom(
|
||||
files(sourceSets.main.output).asFileTree.matching {
|
||||
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -206,4 +206,6 @@ Dictionary compilation is usually a one-time preparation step and is generally f
|
||||
|
||||
## Persisted trie metadata
|
||||
|
||||
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration.
|
||||
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal direction, case processing, and diacritic processing are applied to the key during runtime lookup (`get`, `getAll`, `getEntries`). Case processing and diacritic processing are also applied to every key during dictionary insertion when a trie is built, so stored keys and lookup keys are normalized the same way.
|
||||
|
||||
`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from both dictionary keys and lookup keys; it covers Czech diacritics and a broad set of European Latin-script letters, and it expands `ß`, `æ`, and `œ` to `ss`, `ae`, and `oe`. `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is reserved for a planned dual-path lookup; it is not supported yet, and inserting or looking up a key in this mode throws an `UnsupportedOperationException`.
|
||||
|
||||
@@ -34,28 +34,33 @@ package org.egothor.stemmer;
|
||||
* Defines how dictionary loading and trie traversal should treat diacritics.
|
||||
*
|
||||
* <p>
|
||||
* The current implementation preserves the original stored form only, but the
|
||||
* enum is intentionally modeled as persisted metadata so that future compiled
|
||||
* trie artifacts can explicitly declare whether they were built with exact
|
||||
* diacritic matching, normalized matching, or a dual-path fallback strategy.
|
||||
* The selected mode is applied independently from other normalization modes
|
||||
* (for example {@link CaseProcessingMode}). This means case normalization and
|
||||
* diacritic normalization can be combined freely and each keeps its own
|
||||
* semantics.
|
||||
* </p>
|
||||
*/
|
||||
public enum DiacriticProcessingMode {
|
||||
|
||||
/**
|
||||
* Preserves the original stored form exactly as provided by the source
|
||||
* dictionary.
|
||||
* Preserves dictionary entries and lookup keys exactly as provided.
|
||||
*/
|
||||
AS_IS,
|
||||
|
||||
/**
|
||||
* Indicates that diacritics were removed before trie construction.
|
||||
* Removes diacritics from dictionary entries before trie construction and
|
||||
* removes diacritics from lookup keys before traversal.
|
||||
*/
|
||||
REMOVE,
|
||||
|
||||
/**
|
||||
* Indicates that lookup may continue along both the original diacritic edge and
|
||||
* a normalized non-diacritic alternative.
|
||||
* Planned dual-path mode where lookup may continue along both the original
|
||||
* diacritic edge and a normalized non-diacritic alternative.
|
||||
*
|
||||
* <p>
|
||||
* This mode is currently not supported and using it triggers
|
||||
* {@link UnsupportedOperationException}.
|
||||
* </p>
|
||||
*/
|
||||
AS_IS_AND_STRIPPED_FALLBACK
|
||||
}
|
||||
|
||||
169
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
169
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
@@ -0,0 +1,169 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.text.Normalizer;
|
||||
import java.text.Normalizer.Form;
|
||||
|
||||
/**
 * Utility that strips diacritics from text for diacritic-insensitive trie
 * storage and lookup.
 *
 * <p>
 * The input is processed one UTF-16 code unit at a time. ASCII characters are
 * always kept. Every other character is resolved in this order:
 * </p>
 * <ol>
 * <li>a non-spacing combining mark that follows an ASCII character (either
 * original or produced by a replacement) is dropped, so decomposed input such
 * as {@code e} followed by a combining acute accent yields the same result as
 * the precomposed letter;</li>
 * <li>the direct single-character replacement table;</li>
 * <li>the fixed multi-character expansions for sharp s and the AE/OE
 * ligatures;</li>
 * <li>canonical decomposition (NFD), keeping only the ASCII parts;</li>
 * <li>otherwise the character is kept unchanged (for example non-Latin
 * letters).</li>
 * </ol>
 */
final class DiacriticStripper {

    /**
     * Highest code unit that is treated as plain ASCII and never rewritten.
     */
    private static final char MAX_ASCII = 0x007F;

    /**
     * Table value meaning "no direct replacement registered".
     */
    private static final char UNMAPPED = '\0';

    /**
     * Direct single-character replacement table, indexed by the source code unit.
     */
    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];

    static {
        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
        registerSingle("çćĉċč", 'c');
        registerSingle("ÇĆĈĊČ", 'C');
        registerSingle("ďđḍ", 'd');
        registerSingle("ĎĐḌ", 'D');
        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
        registerSingle("ğĝġģǧ", 'g');
        registerSingle("ĞĜĠĢǦ", 'G');
        registerSingle("ĥħ", 'h');
        registerSingle("ĤĦ", 'H');
        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
        registerSingle("ĵ", 'j');
        registerSingle("Ĵ", 'J');
        registerSingle("ķǩ", 'k');
        registerSingle("ĶǨ", 'K');
        registerSingle("ĺļľŀł", 'l');
        registerSingle("ĹĻĽĿŁ", 'L');
        // U+0149 (n preceded by apostrophe) is written as an escape on purpose: its
        // glyph is indistinguishable from the two-character sequence U+02BC + 'n',
        // which must not end up in this table.
        registerSingle("ñńņň\u0149ŋ", 'n');
        registerSingle("ÑŃŅŇŊ", 'N');
        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
        registerSingle("ŕŗř", 'r');
        registerSingle("ŔŖŘ", 'R');
        registerSingle("śŝşšș", 's');
        registerSingle("ŚŜŞŠȘ", 'S');
        registerSingle("ťţŧț", 't');
        registerSingle("ŤŢŦȚ", 'T');
        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
        registerSingle("ýÿŷỳỵỷỹ", 'y');
        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
        registerSingle("źżž", 'z');
        registerSingle("ŹŻŽ", 'Z');
        registerSingle("þ", 't');
        registerSingle("Þ", 'T');
    }

    private DiacriticStripper() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns {@code input} with diacritics removed.
     *
     * <p>
     * When nothing has to be rewritten the very same {@code String} instance is
     * returned, so the common pure-ASCII case allocates nothing.
     * </p>
     *
     * @param input text to normalize
     * @return the stripped text, or {@code input} itself when it needs no change
     * @throws NullPointerException if {@code input} is {@code null}
     */
    /* default */ static String strip(final String input) {
        StringBuilder normalized = null;
        // True while the most recently emitted character is ASCII; a combining mark
        // in that position decorates a Latin base letter and is therefore dropped.
        boolean asciiBase = false;

        for (int index = 0; index < input.length(); index++) {
            final char source = input.charAt(index);
            final String replacement;

            if (source <= MAX_ASCII) {
                asciiBase = true;
                replacement = null;
            } else if (asciiBase && Character.getType(source) == Character.NON_SPACING_MARK) {
                // Decomposed input (or the result of lower-casing a dotted capital I):
                // remove the mark and stay on the same ASCII base so that stacked
                // marks are removed as well.
                replacement = "";
            } else {
                replacement = replacementFor(source);
                // Every non-null replacement consists of ASCII characters only.
                asciiBase = replacement != null;
            }

            if (replacement == null) {
                if (normalized != null) {
                    normalized.append(source);
                }
                continue;
            }

            if (normalized == null) {
                // First rewrite: start the copy lazily with the untouched prefix.
                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
                normalized.append(input, 0, index);
            }
            normalized.append(replacement);
        }

        return normalized == null ? input : normalized.toString();
    }

    /**
     * Resolves the ASCII replacement of one non-ASCII code unit.
     *
     * @param source code unit above the ASCII range
     * @return ASCII replacement text, or {@code null} when the character has no
     *         ASCII form and must be kept unchanged
     */
    private static String replacementFor(final char source) {
        final char mapped = DIRECT_REPLACEMENTS[source];
        if (mapped != UNMAPPED) {
            return String.valueOf(mapped);
        }

        switch (source) {
            case '\u00DF': // sharp s
                return "ss";
            case '\u00C6': // capital AE ligature
                return "AE";
            case '\u00E6': // small ae ligature
                return "ae";
            case '\u0152': // capital OE ligature
                return "OE";
            case '\u0153': // small oe ligature
                return "oe";
            default:
                return decomposeToAscii(source);
        }
    }

    /**
     * Canonically decomposes one code unit and keeps only its ASCII parts.
     * Combining marks lie outside the ASCII range, so they are dropped by the
     * same range test.
     *
     * @param source code unit to decompose
     * @return ASCII parts of the decomposition, or {@code null} when there are none
     */
    private static String decomposeToAscii(final char source) {
        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
        final StringBuilder ascii = new StringBuilder(decomposed.length());
        for (int index = 0; index < decomposed.length(); index++) {
            final char part = decomposed.charAt(index);
            if (part <= MAX_ASCII) {
                ascii.append(part);
            }
        }
        return ascii.length() == 0 ? null : ascii.toString();
    }

    /**
     * Registers the same single-character replacement for every code unit of
     * {@code sourceCharacters}.
     *
     * @param sourceCharacters characters to be replaced
     * @param replacement      ASCII character that replaces each of them
     */
    private static void registerSingle(final String sourceCharacters, final char replacement) {
        for (int index = 0; index < sourceCharacters.length(); index++) {
            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
        }
    }
}
|
||||
@@ -219,7 +219,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public List<ValueCount<V>> getEntries(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return List.of();
|
||||
}
|
||||
@@ -646,10 +646,20 @@ public final class FrequencyTrie<V> {
|
||||
* @return normalized key for trie traversal
|
||||
*/
|
||||
private String normalizeLookupKey(final String key) {
|
||||
String normalized = key;
|
||||
|
||||
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
return key.toLowerCase(Locale.ROOT);
|
||||
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return key;
|
||||
|
||||
if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) {
|
||||
normalized = DiacriticStripper.strip(normalized);
|
||||
} else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -691,6 +701,11 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final CaseProcessingMode caseProcessingMode;
|
||||
|
||||
/**
|
||||
* Dictionary diacritic processing mode associated with this builder.
|
||||
*/
|
||||
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
@@ -738,10 +753,29 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
|
||||
}
|
||||
|
||||
/**
 * Creates a builder in which the traversal direction, the case processing
 * mode, and the diacritic processing mode are all supplied by the caller.
 *
 * @param arrayFactory            array factory
 * @param reductionSettings       reduction configuration
 * @param traversalDirection      logical key traversal direction
 * @param caseProcessingMode      dictionary case processing mode
 * @param diacriticProcessingMode dictionary diacritic processing mode
 * @throws NullPointerException if any argument is {@code null}
 */
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
        final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
        final DiacriticProcessingMode diacriticProcessingMode) {
    // Reject null arguments first, in parameter order, so the reported name is
    // always that of the first offending argument.
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(reductionSettings, "reductionSettings");
    Objects.requireNonNull(traversalDirection, "traversalDirection");
    Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
    Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");

    this.arrayFactory = arrayFactory;
    this.reductionSettings = reductionSettings;
    this.traversalDirection = traversalDirection;
    this.caseProcessingMode = caseProcessingMode;
    this.diacriticProcessingMode = diacriticProcessingMode;
    this.root = new MutableNode<>();
}
|
||||
|
||||
@@ -814,7 +848,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
||||
this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
|
||||
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||
}
|
||||
|
||||
@@ -849,9 +883,12 @@ public final class FrequencyTrie<V> {
|
||||
throw new IllegalArgumentException("count must be at least 1.");
|
||||
}
|
||||
|
||||
final String normalizedKey = normalizeDictionaryKey(key);
|
||||
|
||||
MutableNode<V> current = this.root;
|
||||
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
|
||||
final Character edge = key.charAt(this.traversalDirection.logicalIndex(key.length(), traversalOffset));
|
||||
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
|
||||
final Character edge = normalizedKey
|
||||
.charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
|
||||
MutableNode<V> child = current.children().get(edge);
|
||||
if (child == null) {
|
||||
child = new MutableNode<>(); // NOPMD
|
||||
@@ -869,6 +906,23 @@ public final class FrequencyTrie<V> {
|
||||
return this;
|
||||
}
|
||||
|
||||
private String normalizeDictionaryKey(final String key) {
|
||||
String normalized = key;
|
||||
|
||||
if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
|
||||
if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) {
|
||||
normalized = DiacriticStripper.strip(normalized);
|
||||
} else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||
throw new UnsupportedOperationException(
|
||||
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||
}
|
||||
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of mutable build-time nodes currently reachable from the
|
||||
* builder root.
|
||||
|
||||
@@ -88,7 +88,8 @@ public final class FrequencyTrieBuilders {
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
|
||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
|
||||
source.traversalDirection());
|
||||
source.traversalDirection(), source.metadata().caseProcessingMode(),
|
||||
source.metadata().diacriticProcessingMode());
|
||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||
|
||||
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
|
||||
|
||||
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
@@ -0,0 +1,109 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||
|
||||
import org.junit.jupiter.api.DisplayName;
|
||||
import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
/**
|
||||
* Unit tests for {@link DiacriticStripper}.
|
||||
*/
|
||||
@Tag("unit")
|
||||
@Tag("diacritics")
|
||||
@DisplayName("DiacriticStripper")
|
||||
class DiacriticStripperTest {
|
||||
|
||||
/**
|
||||
* Verifies that pure ASCII input is returned unchanged and without allocating a
|
||||
* new string instance.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("ASCII input is returned as-is")
|
||||
void asciiInputIsReturnedAsIs() {
|
||||
final String input = "plain-ascii-123";
|
||||
|
||||
final String stripped = DiacriticStripper.strip(input);
|
||||
|
||||
assertSame(input, stripped);
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies direct-table replacements for Czech and other common diacritics.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Direct replacement table strips common diacritics")
|
||||
void directReplacementTableStripsCommonDiacritics() {
|
||||
assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies explicit multi-character replacements for ligatures and sharp s.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Special replacements support multi-character ASCII output")
|
||||
void specialReplacementsSupportMultiCharacterAsciiOutput() {
|
||||
assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
|
||||
assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies Unicode decomposition fallback for characters not in the direct
|
||||
* replacement table.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Unicode decomposition fallback strips combining marks")
|
||||
void unicodeDecompositionFallbackStripsCombiningMarks() {
|
||||
assertEquals("I", DiacriticStripper.strip("İ"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Unmappable non-Latin characters remain unchanged")
|
||||
void unmappableNonLatinCharactersRemainUnchanged() {
|
||||
assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies mixed input where normalization starts mid-string and subsequent
|
||||
* unchanged characters are preserved.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Mixed input preserves untouched characters after normalization starts")
|
||||
void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
|
||||
assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
|
||||
}
|
||||
}
|
||||
@@ -220,6 +220,47 @@ class FrequencyTrieTest {
|
||||
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that REMOVE mode strips diacritics both at build time and at lookup
|
||||
* time and composes independently with case normalization.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
|
||||
void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||
DiacriticProcessingMode.REMOVE);
|
||||
builder.put("Příliš", "cz");
|
||||
builder.put("žluťoučký", "cz2");
|
||||
builder.put("Smørrebrød", "da");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(
|
||||
() -> assertEquals("cz", trie.get("PRILIS")),
|
||||
() -> assertEquals("cz", trie.get("příliš")),
|
||||
() -> assertEquals("cz2", trie.get("zlutoucky")),
|
||||
() -> assertEquals("da", trie.get("SMORREBROD")),
|
||||
() -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that fallback diacritic mode is explicitly rejected for now.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
|
||||
void fallbackDiacriticModeIsNotSupportedYet() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
|
||||
DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);
|
||||
|
||||
final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class,
|
||||
() -> builder.put("kůň", "horse"));
|
||||
assertTrue(exception.getMessage().contains("not supported yet"));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
||||
*/
|
||||
|
||||
Reference in New Issue
Block a user