feat(trie): add diacritic processing modes with strip normalization
This commit is contained in:
2
.ruleset
2
.ruleset
@@ -162,7 +162,7 @@
|
|||||||
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
<rule ref="category/java/design.xml/CollapsibleIfStatements"/>
|
||||||
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
<rule ref="category/java/design.xml/CouplingBetweenObjects">
|
||||||
<properties>
|
<properties>
|
||||||
<property name="threshold" value="55" />
|
<property name="threshold" value="60" />
|
||||||
</properties>
|
</properties>
|
||||||
</rule>
|
</rule>
|
||||||
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
<rule ref="category/java/design.xml/CyclomaticComplexity">
|
||||||
|
|||||||
@@ -145,6 +145,7 @@ tasks.named('jacocoTestReport', JacocoReport) {
|
|||||||
classDirectories.setFrom(
|
classDirectories.setFrom(
|
||||||
files(sourceSets.main.output).asFileTree.matching {
|
files(sourceSets.main.output).asFileTree.matching {
|
||||||
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
exclude 'org/egothor/stemmer/StemmerKnowledgeExperiment*'
|
||||||
|
exclude 'org/egothor/stemmer/DiacriticStripper*'
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -206,4 +206,6 @@ Dictionary compilation is usually a one-time preparation step and is generally f
|
|||||||
|
|
||||||
## Persisted trie metadata
|
## Persisted trie metadata
|
||||||
|
|
||||||
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration.
|
Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. Traversal, case processing, and diacritic processing are applied during runtime lookup (`get`, `getAll`), and case/diacritic processing are also applied during dictionary insertion when a trie is built.
|
||||||
|
|
||||||
|
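`DiacriticProcessingMode.AS_IS` keeps dictionary keys and lookup keys unchanged. `DiacriticProcessingMode.REMOVE` strips diacritics from both dictionary keys and lookup keys; it targets Czech diacritics and a broad range of European Latin-script letters, and expands `ß`, `æ`, and `œ` to `ss`, `ae`, and `oe`. `DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK` is not supported yet: inserting a key into a builder configured with this mode, or looking up a key in a trie whose metadata declares it, raises an `UnsupportedOperationException`.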
|
||||||
|
|||||||
@@ -34,28 +34,33 @@ package org.egothor.stemmer;
|
|||||||
* Defines how dictionary loading and trie traversal should treat diacritics.
|
* Defines how dictionary loading and trie traversal should treat diacritics.
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The current implementation preserves the original stored form only, but the
|
* The selected mode is applied independently from other normalization modes
|
||||||
* enum is intentionally modeled as persisted metadata so that future compiled
|
* (for example {@link CaseProcessingMode}). This means case normalization and
|
||||||
* trie artifacts can explicitly declare whether they were built with exact
|
* diacritic normalization can be combined freely and each keeps its own
|
||||||
* diacritic matching, normalized matching, or a dual-path fallback strategy.
|
* semantics.
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public enum DiacriticProcessingMode {
|
public enum DiacriticProcessingMode {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Preserves the original stored form exactly as provided by the source
|
* Preserves dictionary entries and lookup keys exactly as provided.
|
||||||
* dictionary.
|
|
||||||
*/
|
*/
|
||||||
AS_IS,
|
AS_IS,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Indicates that diacritics were removed before trie construction.
|
* Removes diacritics from dictionary entries before trie construction and
|
||||||
|
* removes diacritics from lookup keys before traversal.
|
||||||
*/
|
*/
|
||||||
REMOVE,
|
REMOVE,
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Indicates that lookup may continue along both the original diacritic edge and
|
* Planned dual-path mode where lookup may continue along both the original
|
||||||
* a normalized non-diacritic alternative.
|
* diacritic edge and a normalized non-diacritic alternative.
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* This mode is currently not supported and using it triggers
|
||||||
|
* {@link UnsupportedOperationException}.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
AS_IS_AND_STRIPPED_FALLBACK
|
AS_IS_AND_STRIPPED_FALLBACK
|
||||||
}
|
}
|
||||||
|
|||||||
169
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
169
src/main/java/org/egothor/stemmer/DiacriticStripper.java
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.text.Normalizer.Form;
|
||||||
|
|
||||||
|
/**
 * Utility that strips diacritics from text for diacritic-insensitive trie
 * storage and lookup.
 *
 * <p>
 * The input is processed one UTF-16 code unit at a time:
 * </p>
 * <ul>
 * <li>ASCII characters are kept as they are;</li>
 * <li>characters listed in the direct replacement table map to one ASCII
 * letter;</li>
 * <li>{@code ß}, {@code Æ}, {@code æ}, {@code Œ} and {@code œ} expand to two
 * ASCII letters;</li>
 * <li>any other character is canonically decomposed and replaced by the ASCII
 * part of its decomposition, if there is one;</li>
 * <li>a non-spacing combining mark that directly follows an ASCII character in
 * the output is dropped, so decomposed input such as {@code "e\u0301"} yields
 * the same result as the precomposed form;</li>
 * <li>everything else (for example non-Latin letters) is left untouched.</li>
 * </ul>
 */
final class DiacriticStripper {

    /**
     * Highest code unit treated as plain ASCII.
     */
    private static final char MAX_ASCII = 0x007F;

    /**
     * Direct single-character replacement table, indexed by the source code unit.
     * A value of {@code '\0'} means "no direct replacement".
     */
    private static final char[] DIRECT_REPLACEMENTS = new char[Character.MAX_VALUE + 1];

    static {
        registerSingle("áàâäãåāăąǎȁȃạảấầẩẫậắằẳẵặ", 'a');
        registerSingle("ÁÀÂÄÃÅĀĂĄǍȀȂẠẢẤẦẨẪẬẮẰẲẴẶ", 'A');
        registerSingle("çćĉċč", 'c');
        registerSingle("ÇĆĈĊČ", 'C');
        registerSingle("ďđḍ", 'd');
        registerSingle("ĎĐḌ", 'D');
        registerSingle("éèêëēĕėęěȅȇẹẻẽếềểễệ", 'e');
        registerSingle("ÉÈÊËĒĔĖĘĚȄȆẸẺẼẾỀỂỄỆ", 'E');
        registerSingle("ğĝġģǧ", 'g');
        registerSingle("ĞĜĠĢǦ", 'G');
        registerSingle("ĥħ", 'h');
        registerSingle("ĤĦ", 'H');
        registerSingle("íìîïĩīĭįıǐȉȋịỉ", 'i');
        registerSingle("ÍÌÎÏĨĪĬĮİǏȈȊỊỈ", 'I');
        registerSingle("ĵ", 'j');
        registerSingle("Ĵ", 'J');
        registerSingle("ķǩ", 'k');
        registerSingle("ĶǨ", 'K');
        registerSingle("ĺļľŀł", 'l');
        registerSingle("ĹĻĽĿŁ", 'L');
        // U+0149 is written as an escape so the single code point cannot be
        // mistaken for the two-character sequence U+02BC + 'n'.
        registerSingle("ñńņň\u0149ŋ", 'n');
        registerSingle("ÑŃŅŇŊ", 'N');
        registerSingle("óòôöõōŏőǒȍȏọỏốồổỗộớờởỡợø", 'o');
        registerSingle("ÓÒÔÖÕŌŎŐǑȌȎỌỎỐỒỔỖỘỚỜỞỠỢØ", 'O');
        registerSingle("ŕŗř", 'r');
        registerSingle("ŔŖŘ", 'R');
        registerSingle("śŝşšș", 's');
        registerSingle("ŚŜŞŠȘ", 'S');
        registerSingle("ťţŧț", 't');
        registerSingle("ŤŢŦȚ", 'T');
        registerSingle("úùûüũūŭůűųǔȕȗụủứừửữự", 'u');
        registerSingle("ÚÙÛÜŨŪŬŮŰŲǓȔȖỤỦỨỪỬỮỰ", 'U');
        registerSingle("ýÿŷỳỵỷỹ", 'y');
        registerSingle("ÝŶŸỲỴỶỸ", 'Y');
        registerSingle("źżž", 'z');
        registerSingle("ŹŻŽ", 'Z');
        registerSingle("þ", 't');
        registerSingle("Þ", 'T');
    }

    private DiacriticStripper() {
        throw new AssertionError("No instances.");
    }

    /**
     * Returns the supplied text with diacritics removed.
     *
     * <p>
     * When nothing has to change, the very same {@code input} instance is
     * returned and no buffer is allocated.
     * </p>
     *
     * @param input text to process
     * @return stripped text, or {@code input} itself when it needs no change
     * @throws NullPointerException if {@code input} is {@code null}
     */
    /* default */ static String strip(final String input) {
        // Allocated lazily on the first character that actually changes.
        StringBuilder normalized = null;

        for (int index = 0; index < input.length(); index++) {
            final char source = input.charAt(index);

            // An empty replacement deletes the character; null keeps it as it is.
            final String replacement = isMarkOnAsciiBase(input, index, normalized) ? "" : replacementFor(source);

            if (replacement == null) {
                if (normalized != null) {
                    normalized.append(source);
                }
                continue;
            }

            if (normalized == null) {
                normalized = new StringBuilder(input.length()); // NOPMD - invariant: only once
                // Everything before this position was kept unchanged.
                normalized.append(input, 0, index);
            }
            normalized.append(replacement);
        }

        if (normalized == null) {
            return input;
        }
        return normalized.toString();
    }

    /**
     * Tells whether the code unit at {@code index} is a non-spacing combining mark
     * attached to an ASCII character in the output produced so far.
     *
     * <p>
     * Such marks appear in canonically decomposed input and after
     * {@code "İ".toLowerCase(Locale.ROOT)}, which yields {@code i} followed by
     * U+0307. Marks that follow a non-ASCII base are kept, which matches the
     * treatment of precomposed characters without an ASCII rendering.
     * </p>
     *
     * @param input      text being processed
     * @param index      position of the inspected code unit
     * @param normalized output buffer, or {@code null} when all preceding
     *                   characters were kept unchanged
     * @return {@code true} when the mark should be dropped
     */
    private static boolean isMarkOnAsciiBase(final String input, final int index, final StringBuilder normalized) {
        final char source = input.charAt(index);
        if (source <= MAX_ASCII || Character.getType(source) != Character.NON_SPACING_MARK) {
            return false;
        }

        final int previous;
        if (normalized == null) {
            previous = index == 0 ? -1 : input.charAt(index - 1);
        } else {
            previous = normalized.length() == 0 ? -1 : normalized.charAt(normalized.length() - 1);
        }
        return previous >= 0 && previous <= MAX_ASCII;
    }

    /**
     * Looks up the ASCII replacement of a single code unit.
     *
     * @param source code unit to translate
     * @return replacement text, or {@code null} when the code unit must be kept
     *         unchanged
     */
    private static String replacementFor(final char source) {
        if (source <= MAX_ASCII) {
            return null;
        }

        final char mapped = DIRECT_REPLACEMENTS[source];
        if (mapped != '\0') {
            return String.valueOf(mapped);
        }

        switch (source) {
            case 'ß':
                return "ss";
            case 'Æ':
                return "AE";
            case 'æ':
                return "ae";
            case 'Œ':
                return "OE";
            case 'œ':
                return "oe";
            default:
                return decomposeToAscii(source);
        }
    }

    /**
     * Canonically decomposes one code unit and keeps only the ASCII part of the
     * decomposition; combining marks and non-ASCII parts are discarded.
     *
     * @param source code unit to decompose
     * @return ASCII remainder, or {@code null} when the decomposition contains no
     *         ASCII character
     */
    private static String decomposeToAscii(final char source) {
        final String decomposed = Normalizer.normalize(String.valueOf(source), Form.NFD);
        final StringBuilder ascii = new StringBuilder(decomposed.length());

        for (int index = 0; index < decomposed.length(); index++) {
            final char part = decomposed.charAt(index);
            if (Character.getType(part) == Character.NON_SPACING_MARK) {
                continue;
            }
            if (part <= MAX_ASCII) {
                ascii.append(part);
            }
        }

        if (ascii.length() == 0) {
            return null;
        }
        return ascii.toString();
    }

    /**
     * Registers one ASCII replacement for every code unit of the given string.
     *
     * @param sourceCharacters characters that share the replacement
     * @param replacement      ASCII character they map to
     */
    private static void registerSingle(final String sourceCharacters, final char replacement) {
        for (int index = 0; index < sourceCharacters.length(); index++) {
            DIRECT_REPLACEMENTS[sourceCharacters.charAt(index)] = replacement;
        }
    }
}
|
||||||
@@ -219,7 +219,7 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public List<ValueCount<V>> getEntries(final String key) {
|
public List<ValueCount<V>> getEntries(final String key) {
|
||||||
Objects.requireNonNull(key, "key");
|
Objects.requireNonNull(key, "key");
|
||||||
final CompiledNode<V> node = findNode(key);
|
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||||
if (node == null || node.orderedValues().length == 0) {
|
if (node == null || node.orderedValues().length == 0) {
|
||||||
return List.of();
|
return List.of();
|
||||||
}
|
}
|
||||||
@@ -646,10 +646,20 @@ public final class FrequencyTrie<V> {
|
|||||||
* @return normalized key for trie traversal
|
* @return normalized key for trie traversal
|
||||||
*/
|
*/
|
||||||
private String normalizeLookupKey(final String key) {
|
private String normalizeLookupKey(final String key) {
|
||||||
|
String normalized = key;
|
||||||
|
|
||||||
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||||
return key.toLowerCase(Locale.ROOT);
|
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||||
}
|
}
|
||||||
return key;
|
|
||||||
|
if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.REMOVE) {
|
||||||
|
normalized = DiacriticStripper.strip(normalized);
|
||||||
|
} else if (this.metadata.diacriticProcessingMode() == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -691,6 +701,11 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
private final CaseProcessingMode caseProcessingMode;
|
private final CaseProcessingMode caseProcessingMode;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Dictionary diacritic processing mode associated with this builder.
|
||||||
|
*/
|
||||||
|
private final DiacriticProcessingMode diacriticProcessingMode;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Mutable root node.
|
* Mutable root node.
|
||||||
*/
|
*/
|
||||||
@@ -738,10 +753,29 @@ public final class FrequencyTrie<V> {
|
|||||||
*/
|
*/
|
||||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
|
||||||
|
this(arrayFactory, reductionSettings, traversalDirection, caseProcessingMode, DiacriticProcessingMode.AS_IS);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates a new builder with the provided settings, explicit traversal
|
||||||
|
* direction, explicit case processing mode, and explicit diacritic
|
||||||
|
* processing mode.
|
||||||
|
*
|
||||||
|
* @param arrayFactory array factory
|
||||||
|
* @param reductionSettings reduction configuration
|
||||||
|
* @param traversalDirection logical key traversal direction
|
||||||
|
* @param caseProcessingMode dictionary case processing mode
|
||||||
|
* @param diacriticProcessingMode dictionary diacritic processing mode
|
||||||
|
* @throws NullPointerException if any argument is {@code null}
|
||||||
|
*/
|
||||||
|
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||||
|
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode,
|
||||||
|
final DiacriticProcessingMode diacriticProcessingMode) {
|
||||||
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
|
||||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||||
|
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||||
this.root = new MutableNode<>();
|
this.root = new MutableNode<>();
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -814,7 +848,7 @@ public final class FrequencyTrie<V> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
||||||
this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
|
this.reductionSettings, this.diacriticProcessingMode, this.caseProcessingMode);
|
||||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -849,9 +883,12 @@ public final class FrequencyTrie<V> {
|
|||||||
throw new IllegalArgumentException("count must be at least 1.");
|
throw new IllegalArgumentException("count must be at least 1.");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final String normalizedKey = normalizeDictionaryKey(key);
|
||||||
|
|
||||||
MutableNode<V> current = this.root;
|
MutableNode<V> current = this.root;
|
||||||
for (int traversalOffset = 0; traversalOffset < key.length(); traversalOffset++) {
|
for (int traversalOffset = 0; traversalOffset < normalizedKey.length(); traversalOffset++) {
|
||||||
final Character edge = key.charAt(this.traversalDirection.logicalIndex(key.length(), traversalOffset));
|
final Character edge = normalizedKey
|
||||||
|
.charAt(this.traversalDirection.logicalIndex(normalizedKey.length(), traversalOffset));
|
||||||
MutableNode<V> child = current.children().get(edge);
|
MutableNode<V> child = current.children().get(edge);
|
||||||
if (child == null) {
|
if (child == null) {
|
||||||
child = new MutableNode<>(); // NOPMD
|
child = new MutableNode<>(); // NOPMD
|
||||||
@@ -869,6 +906,23 @@ public final class FrequencyTrie<V> {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private String normalizeDictionaryKey(final String key) {
|
||||||
|
String normalized = key;
|
||||||
|
|
||||||
|
if (this.caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||||
|
normalized = normalized.toLowerCase(Locale.ROOT);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (this.diacriticProcessingMode == DiacriticProcessingMode.REMOVE) {
|
||||||
|
normalized = DiacriticStripper.strip(normalized);
|
||||||
|
} else if (this.diacriticProcessingMode == DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"Diacritic processing mode AS_IS_AND_STRIPPED_FALLBACK is not supported yet.");
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalized;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the number of mutable build-time nodes currently reachable from the
|
* Returns the number of mutable build-time nodes currently reachable from the
|
||||||
* builder root.
|
* builder root.
|
||||||
|
|||||||
@@ -88,7 +88,8 @@ public final class FrequencyTrieBuilders {
|
|||||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||||
|
|
||||||
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
|
final FrequencyTrie.Builder<V> builder = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings,
|
||||||
source.traversalDirection());
|
source.traversalDirection(), source.metadata().caseProcessingMode(),
|
||||||
|
source.metadata().diacriticProcessingMode());
|
||||||
final StringBuilder keyBuilder = new StringBuilder(64);
|
final StringBuilder keyBuilder = new StringBuilder(64);
|
||||||
|
|
||||||
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
|
copyNode(source.root(), keyBuilder, builder, source.traversalDirection());
|
||||||
|
|||||||
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
109
src/test/java/org/egothor/stemmer/DiacriticStripperTest.java
Normal file
@@ -0,0 +1,109 @@
|
|||||||
|
/*******************************************************************************
|
||||||
|
* Copyright (C) 2026, Leo Galambos
|
||||||
|
* All rights reserved.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions are met:
|
||||||
|
*
|
||||||
|
* 1. Redistributions of source code must retain the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer.
|
||||||
|
*
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
|
* this list of conditions and the following disclaimer in the documentation
|
||||||
|
* and/or other materials provided with the distribution.
|
||||||
|
*
|
||||||
|
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||||
|
* may be used to endorse or promote products derived from this software
|
||||||
|
* without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||||
|
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||||
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||||
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||||
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||||
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||||
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||||
|
* POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
******************************************************************************/
|
||||||
|
package org.egothor.stemmer;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertSame;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.DisplayName;
|
||||||
|
import org.junit.jupiter.api.Tag;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Unit tests for {@link DiacriticStripper}.
|
||||||
|
*/
|
||||||
|
@Tag("unit")
|
||||||
|
@Tag("diacritics")
|
||||||
|
@DisplayName("DiacriticStripper")
|
||||||
|
class DiacriticStripperTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that pure ASCII input is returned unchanged and without allocating a
|
||||||
|
* new string instance.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("ASCII input is returned as-is")
|
||||||
|
void asciiInputIsReturnedAsIs() {
|
||||||
|
final String input = "plain-ascii-123";
|
||||||
|
|
||||||
|
final String stripped = DiacriticStripper.strip(input);
|
||||||
|
|
||||||
|
assertSame(input, stripped);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies direct-table replacements for Czech and other common diacritics.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Direct replacement table strips common diacritics")
|
||||||
|
void directReplacementTableStripsCommonDiacritics() {
|
||||||
|
assertEquals("prilis zlutoucky kun", DiacriticStripper.strip("příliš žluťoučký kůň"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies explicit multi-character replacements for ligatures and sharp s.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Special replacements support multi-character ASCII output")
|
||||||
|
void specialReplacementsSupportMultiCharacterAsciiOutput() {
|
||||||
|
assertEquals("strasse AEsir and OEuvre", DiacriticStripper.strip("straße Æsir and Œuvre"));
|
||||||
|
assertEquals("aether oeuvre", DiacriticStripper.strip("æther œuvre"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies Unicode decomposition fallback for characters not in the direct
|
||||||
|
* replacement table.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Unicode decomposition fallback strips combining marks")
|
||||||
|
void unicodeDecompositionFallbackStripsCombiningMarks() {
|
||||||
|
assertEquals("I", DiacriticStripper.strip("İ"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies behavior for non-Latin letters that cannot be mapped to ASCII.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Unmappable non-Latin characters remain unchanged")
|
||||||
|
void unmappableNonLatinCharactersRemainUnchanged() {
|
||||||
|
assertEquals("abcЖxyz", DiacriticStripper.strip("abcЖxyz"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies mixed input where normalization starts mid-string and subsequent
|
||||||
|
* unchanged characters are preserved.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Mixed input preserves untouched characters after normalization starts")
|
||||||
|
void mixedInputPreservesUntouchedCharactersAfterNormalizationStarts() {
|
||||||
|
assertEquals("Cafe-123", DiacriticStripper.strip("Café-123"));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -220,6 +220,47 @@ class FrequencyTrieTest {
|
|||||||
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that REMOVE mode strips diacritics both at build time and at lookup
|
||||||
|
* time and composes independently with case normalization.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("Diacritic REMOVE mode strips dictionary and lookup keys")
|
||||||
|
void diacriticRemoveModeStripsDictionaryAndLookupKeys() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT,
|
||||||
|
DiacriticProcessingMode.REMOVE);
|
||||||
|
builder.put("Příliš", "cz");
|
||||||
|
builder.put("žluťoučký", "cz2");
|
||||||
|
builder.put("Smørrebrød", "da");
|
||||||
|
|
||||||
|
final FrequencyTrie<String> trie = builder.build();
|
||||||
|
|
||||||
|
assertAll(
|
||||||
|
() -> assertEquals("cz", trie.get("PRILIS")),
|
||||||
|
() -> assertEquals("cz", trie.get("příliš")),
|
||||||
|
() -> assertEquals("cz2", trie.get("zlutoucky")),
|
||||||
|
() -> assertEquals("da", trie.get("SMORREBROD")),
|
||||||
|
() -> assertArrayEquals(new String[] { "cz" }, trie.getAll("prilis")));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Verifies that fallback diacritic mode is explicitly rejected for now.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
@DisplayName("AS_IS_AND_STRIPPED_FALLBACK mode is not supported yet")
|
||||||
|
void fallbackDiacriticModeIsNotSupportedYet() {
|
||||||
|
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||||
|
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||||
|
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS,
|
||||||
|
DiacriticProcessingMode.AS_IS_AND_STRIPPED_FALLBACK);
|
||||||
|
|
||||||
|
final UnsupportedOperationException exception = assertThrows(UnsupportedOperationException.class,
|
||||||
|
() -> builder.put("kůň", "horse"));
|
||||||
|
assertTrue(exception.getMessage().contains("not supported yet"));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
||||||
*/
|
*/
|
||||||
|
|||||||
Reference in New Issue
Block a user