feat: Apply metadata-driven case normalization in get/getAll

This commit is contained in:
2026-04-23 22:32:05 +02:00
parent 4d939f5b6e
commit 8785f2b7cb
14 changed files with 353 additions and 43 deletions

View File

@@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.Locale;
/**
 * Letter-casing policy applied to dictionary items.
 *
 * <p>
 * The policy is chosen when a dictionary source is parsed and may be stored in
 * trie metadata, which keeps a compiled artifact self-describing.
 *
 * <p>
 * {@code FrequencyTrie} serializes the selected constant by its
 * {@link #ordinal()}, so the declaration order below is part of the binary
 * format: append new constants at the end and do not reorder existing ones.
 */
public enum CaseProcessingMode {

    /**
     * Keeps every character exactly as the dictionary source spelled it; no case
     * conversion takes place.
     */
    AS_IS,

    /**
     * Converts all dictionary content to lower case with {@link Locale#ROOT}.
     */
    LOWERCASE_WITH_LOCALE_ROOT;
}

View File

@@ -61,6 +61,7 @@ import java.util.logging.Logger;
* --output &lt;file&gt;
* --reduction-mode &lt;mode&gt;
* [--store-original]
* [--case-processing-mode &lt;mode&gt;]
* [--dominant-winner-min-percent &lt;1..100&gt;]
* [--dominant-winner-over-second-ratio &lt;1..n&gt;]
* [--overwrite]
@@ -152,7 +153,7 @@ public final class Compile {
final WordTraversalDirection traversalDirection = arguments.rightToLeft() ? WordTraversalDirection.FORWARD
: WordTraversalDirection.BACKWARD;
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
reductionSettings, traversalDirection);
reductionSettings, traversalDirection, arguments.caseProcessingMode());
final Path outputFile = arguments.outputFile();
final Path parent = outputFile.toAbsolutePath().getParent();
@@ -168,10 +169,10 @@ public final class Compile {
if (LOGGER.isLoggable(Level.INFO)) {
LOGGER.log(Level.INFO,
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, dominantWinnerMinPercent={5}, dominantWinnerOverSecondRatio={6}.",
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, caseProcessingMode={5}, dominantWinnerMinPercent={6}, dominantWinnerOverSecondRatio={7}.",
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
arguments.storeOriginal(), arguments.rightToLeft(),
arguments.storeOriginal(), arguments.rightToLeft(), arguments.caseProcessingMode(),
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio() });
}
}
@@ -186,6 +187,7 @@ public final class Compile {
System.err.println(" --output <file> \\");
System.err.println(" --reduction-mode <mode> \\");
System.err.println(" [--store-original] \\");
System.err.println(" [--case-processing-mode <mode>] \\");
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
System.err.println(" [--overwrite]");
@@ -199,6 +201,13 @@ public final class Compile {
System.err.println(" of the stored word form and patch commands are encoded likewise.");
System.err.println(" --overwrite");
System.err.println(" Replaces the target file when it already exists.");
System.err.println(" --case-processing-mode");
System.err.println(" Controls whether dictionary input is lowercased or preserved as-is.");
System.err.println();
System.err.println("Supported case processing modes:");
for (CaseProcessingMode mode : CaseProcessingMode.values()) {
System.err.println(" " + mode.name());
}
System.err.println();
System.err.println("Supported reduction modes:");
for (ReductionMode mode : ReductionMode.values()) {
@@ -256,14 +265,15 @@ public final class Compile {
* forward traversal on stored word forms
* @param dominantWinnerMinPercent dominant winner minimum percent
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
* @param caseProcessingMode dictionary case processing mode
* @param overwrite whether an existing output may be
* replaced
* @param help whether usage help was requested
*/
@SuppressWarnings("PMD.LongVariable")
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite,
boolean help) {
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio,
CaseProcessingMode caseProcessingMode, boolean overwrite, boolean help) {
/**
* Parses raw command-line arguments.
@@ -282,6 +292,7 @@ public final class Compile {
boolean rightToLeft = false;
boolean overwrite = false;
boolean help = false;
CaseProcessingMode caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
@@ -330,6 +341,11 @@ public final class Compile {
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
"--dominant-winner-over-second-ratio");
break;
case "--case-processing-mode":
caseProcessingMode = CaseProcessingMode
.valueOf(requireValue(arguments, ++index, "--case-processing-mode")
.toUpperCase(Locale.ROOT));
break;
default:
throw new IllegalArgumentException("Unknown argument: " + argument);
@@ -338,7 +354,7 @@ public final class Compile {
if (help) {
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, true);
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, true);
}
if (inputFile == null) {
@@ -352,7 +368,7 @@ public final class Compile {
}
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, false);
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, false);
}
/**

View File

@@ -41,6 +41,7 @@ import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.function.IntFunction;
@@ -101,7 +102,7 @@ public final class FrequencyTrie<V> {
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 3;
private static final int STREAM_VERSION = 4;
/**
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
@@ -142,6 +143,10 @@ public final class FrequencyTrie<V> {
* selected deterministically by shorter {@code toString()} value first, then by
* lexicographically lower {@code toString()}, and finally by stable first-seen
* order.
*
* <p>
* The supplied key is normalized according to persisted
* {@link TrieMetadata#caseProcessingMode()} before traversal.
*
* @param key key to resolve
* @return most frequent value, or {@code null} if the key does not exist or no
@@ -150,7 +155,7 @@ public final class FrequencyTrie<V> {
*/
public V get(final String key) {
Objects.requireNonNull(key, "key");
final CompiledNode<V> node = findNode(key);
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
if (node == null || node.orderedValues().length == 0) {
return null;
}
@@ -170,6 +175,10 @@ public final class FrequencyTrie<V> {
* <p>
* The returned array is a defensive copy.
*
* <p>
* The supplied key is normalized according to persisted
* {@link TrieMetadata#caseProcessingMode()} before traversal.
*
* @param key key to resolve
* @return all values stored at the addressed node, ordered by descending
* frequency; returns an empty array if the key does not exist or no
@@ -178,7 +187,7 @@ public final class FrequencyTrie<V> {
*/
public V[] getAll(final String key) {
Objects.requireNonNull(key, "key");
final CompiledNode<V> node = findNode(key);
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
if (node == null || node.orderedValues().length == 0) {
return this.arrayFactory.apply(0);
}
@@ -336,7 +345,7 @@ public final class FrequencyTrie<V> {
}
final int version = dataInput.readInt();
if (version != 1 && version != STREAM_VERSION) {
if (version != 1 && version != 3 && version != STREAM_VERSION) {
throw new IOException("Unsupported trie stream version: " + version);
}
@@ -376,6 +385,7 @@ public final class FrequencyTrie<V> {
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent());
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio());
dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal());
dataOutput.writeInt(metadata.caseProcessingMode().ordinal());
}
/**
@@ -419,10 +429,22 @@ public final class FrequencyTrie<V> {
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
}
return new TrieMetadata(
version, traversalDirection, new ReductionSettings(reductionModes[reductionModeOrdinal],
dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
diacriticProcessingModes[diacriticProcessingModeOrdinal]);
final CaseProcessingMode caseProcessingMode;
if (version >= 4) { // NOPMD
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
final int caseProcessingModeOrdinal = dataInput.readInt();
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
}
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
} else {
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
}
return new TrieMetadata(version, traversalDirection,
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
dominantWinnerOverSecondRatio),
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
}
/**
@@ -598,7 +620,7 @@ public final class FrequencyTrie<V> {
/**
* Locates the compiled node for the supplied key.
*
* @param key key to resolve
* @param key already-normalized key to resolve
* @return compiled node, or {@code null} if the path does not exist
*/
private CompiledNode<V> findNode(final String key) {
@@ -613,6 +635,19 @@ public final class FrequencyTrie<V> {
return current;
}
/**
 * Prepares a caller-supplied key for trie traversal.
 *
 * <p>
 * When the metadata of this trie declares
 * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}, the key is lower-cased
 * with {@link Locale#ROOT}; for any other mode the key is returned untouched.
 *
 * @param key lookup key as passed by the caller
 * @return key in the form used for trie traversal
 */
private String normalizeLookupKey(final String key) {
    final boolean lowercaseKeys = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT == this.metadata
            .caseProcessingMode();
    return lowercaseKeys ? key.toLowerCase(Locale.ROOT) : key;
}
/**
* Builder of {@link FrequencyTrie}.
*
@@ -647,6 +682,11 @@ public final class FrequencyTrie<V> {
*/
private final WordTraversalDirection traversalDirection;
/**
* Dictionary case processing mode associated with this builder.
*/
private final CaseProcessingMode caseProcessingMode;
/**
* Mutable root node.
*/
@@ -679,9 +719,25 @@ public final class FrequencyTrie<V> {
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection) {
// No case processing mode supplied: delegate with Locale.ROOT lower-casing as
// the default mode.
this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
* Creates a new builder with the provided settings, explicit traversal
* direction, and explicit case processing mode.
*
* <p>
* The case processing mode is retained by the builder and recorded in the
* {@link TrieMetadata} of the trie it produces; lookups on that trie consult
* the recorded mode to normalize keys. The builder starts with an empty root
* node.
*
* @param arrayFactory array factory
* @param reductionSettings reduction configuration
* @param traversalDirection logical key traversal direction
* @param caseProcessingMode dictionary case processing mode
* @throws NullPointerException if any argument is {@code null}
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
// Arguments are validated in declaration order, so the first null argument
// determines the NullPointerException message.
this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
// NOTE(review): this constructor only stores the mode. Whether inserted keys
// are normalized to match it is not visible here; presumably callers must
// insert keys already in the form the mode implies - confirm.
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
this.root = new MutableNode<>();
}
@@ -753,8 +809,8 @@ public final class FrequencyTrie<V> {
reductionContext.canonicalNodeCount());
}
final TrieMetadata metadata = TrieMetadata.current(STREAM_VERSION, this.traversalDirection,
this.reductionSettings);
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
}

View File

@@ -53,8 +53,8 @@ import java.util.logging.Logger;
* to that stem.
*
* <p>
* Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
* and trailing whitespace around each column is ignored.
* Input line case normalization is controlled by {@link CaseProcessingMode}.
* Leading and trailing whitespace around each column is ignored.
*
* <p>
* The parser supports line remarks and trailing remarks. The remark markers
@@ -113,11 +113,27 @@ public final class StemmerDictionaryParser {
* @throws IOException if reading fails
*/
public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
// Overload without an explicit mode: lines are lower-cased with Locale.ROOT.
return parse(path, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
}
/**
* Parses a dictionary file from a filesystem path.
*
* @param path dictionary file path
* @param caseProcessingMode case processing mode
* @param entryHandler handler receiving parsed entries
* @return parsing statistics
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if reading fails
*/
public static ParseStatistics parse(final Path path, final CaseProcessingMode caseProcessingMode,
final EntryHandler entryHandler) throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
Objects.requireNonNull(entryHandler, "entryHandler");
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
return parse(reader, path.toAbsolutePath().toString(), entryHandler);
return parse(reader, path.toAbsolutePath().toString(), caseProcessingMode, entryHandler);
}
}
@@ -132,7 +148,23 @@ public final class StemmerDictionaryParser {
*/
public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
Objects.requireNonNull(fileName, "fileName");
return parse(Path.of(fileName), entryHandler);
return parse(Path.of(fileName), CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
}
/**
 * Parses the dictionary file designated by a path string, applying the given
 * case processing mode.
 *
 * <p>
 * The string is converted to a {@link Path} and the work is handed to the
 * path-based overload.
 *
 * @param fileName           dictionary file name or path string
 * @param caseProcessingMode case processing mode
 * @param entryHandler       handler receiving parsed entries
 * @return parsing statistics
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if reading fails
 */
public static ParseStatistics parse(final String fileName, final CaseProcessingMode caseProcessingMode,
        final EntryHandler entryHandler) throws IOException {
    final Path dictionaryPath = Path.of(Objects.requireNonNull(fileName, "fileName"));
    return parse(dictionaryPath, caseProcessingMode, entryHandler);
}
/**
@@ -147,8 +179,25 @@ public final class StemmerDictionaryParser {
*/
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
final EntryHandler entryHandler) throws IOException {
// Overload without an explicit mode: lines are lower-cased with Locale.ROOT.
return parse(reader, sourceDescription, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
}
/**
* Parses a dictionary from a reader.
*
* @param reader source reader
* @param sourceDescription logical source description for diagnostics
* @param caseProcessingMode case processing mode
* @param entryHandler handler receiving parsed entries
* @return parsing statistics
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if reading or handler processing fails
*/
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
final CaseProcessingMode caseProcessingMode, final EntryHandler entryHandler) throws IOException {
Objects.requireNonNull(reader, "reader");
Objects.requireNonNull(sourceDescription, "sourceDescription");
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
Objects.requireNonNull(entryHandler, "entryHandler");
final BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader
@@ -161,7 +210,7 @@ public final class StemmerDictionaryParser {
for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
lineNumber++;
final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT);
final String normalizedLine = normalizeLineCase(stripRemark(line).trim(), caseProcessingMode);
if (normalizedLine.isEmpty()) {
ignoredLineCount++;
continue;
@@ -226,6 +275,20 @@ public final class StemmerDictionaryParser {
return statistics;
}
/**
 * Converts one dictionary line to the casing required by the selected mode.
 *
 * <p>
 * Only {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} alters the line,
 * lower-casing it with {@link Locale#ROOT}; every other mode returns the line
 * unchanged.
 *
 * @param line               line to normalize
 * @param caseProcessingMode case processing mode
 * @return line in the casing dictated by the mode
 */
private static String normalizeLineCase(final String line, final CaseProcessingMode caseProcessingMode) {
    if (caseProcessingMode != CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
        return line;
    }
    return line.toLowerCase(Locale.ROOT);
}
/**
* Determines whether one dictionary item contains any Unicode whitespace
* character.

View File

@@ -283,7 +283,8 @@ public final class StemmerPatchTrieLoader {
try (InputStream inputStream = openBundledResource(resourcePath);
BufferedReader reader = new BufferedReader(
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language));
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language),
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
}
@@ -318,7 +319,8 @@ public final class StemmerPatchTrieLoader {
*/
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings) throws IOException {
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD);
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
@@ -338,14 +340,37 @@ public final class StemmerPatchTrieLoader {
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
throws IOException {
// Overload without an explicit mode: the dictionary is lower-cased with
// Locale.ROOT while it is parsed.
return load(path, storeOriginal, reductionSettings, traversalDirection,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
* Loads a dictionary from a filesystem path using explicit reduction settings,
* explicit traversal direction, and explicit case processing mode.
*
* @param path path to the dictionary file
* @param storeOriginal whether the stem itself should be inserted using the
* canonical no-op patch command
* @param reductionSettings reduction settings
* @param traversalDirection traversal direction used for both trie keys and
* patch commands
* @param caseProcessingMode case processing mode used during dictionary parsing
* @return compiled patch-command trie
* @throws NullPointerException if any argument is {@code null}
* @throws IOException if the file cannot be opened or read
*/
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
final CaseProcessingMode caseProcessingMode) throws IOException {
Objects.requireNonNull(path, "path");
Objects.requireNonNull(reductionSettings, "reductionSettings");
Objects.requireNonNull(traversalDirection, "traversalDirection");
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
try (InputStream inputStream = openDictionaryInputStream(path);
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings,
traversalDirection);
traversalDirection, caseProcessingMode);
}
}
@@ -403,7 +428,30 @@ public final class StemmerPatchTrieLoader {
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
throws IOException {
Objects.requireNonNull(fileName, "fileName");
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection);
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
 * Loads a dictionary named by a path string, with caller-chosen reduction
 * settings, traversal direction, and case processing mode.
 *
 * <p>
 * The string is converted to a {@link Path} and loading is handed to the
 * path-based overload.
 *
 * @param fileName           file name or path string
 * @param storeOriginal      whether the stem itself should be inserted using the
 *                           canonical no-op patch command
 * @param reductionSettings  reduction settings
 * @param traversalDirection traversal direction used for both trie keys and
 *                           patch commands
 * @param caseProcessingMode case processing mode used during dictionary parsing
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
        final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
        final CaseProcessingMode caseProcessingMode) throws IOException {
    final Path dictionaryPath = Path.of(Objects.requireNonNull(fileName, "fileName"));
    return load(dictionaryPath, storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
}
/**
@@ -437,14 +485,15 @@ public final class StemmerPatchTrieLoader {
*/
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
final boolean storeOriginal, final ReductionSettings reductionSettings,
final WordTraversalDirection traversalDirection) throws IOException {
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode)
throws IOException {
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings,
traversalDirection);
traversalDirection, caseProcessingMode);
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection);
final int[] insertedMappings = new int[1];
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
sourceDescription, (stem, variants, lineNumber) -> {
sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> {
if (storeOriginal) {
builder.put(stem, NOOP_PATCH_COMMAND);
insertedMappings[0]++;

View File

@@ -54,9 +54,12 @@ import java.util.Objects;
* @param reductionSettings reduction settings used during compilation
* @param diacriticProcessingMode diacritic processing strategy associated with
* the artifact
* @param caseProcessingMode case processing strategy associated with the
* artifact
*/
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode) {
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
CaseProcessingMode caseProcessingMode) {
/**
* Creates a new metadata instance.
@@ -66,9 +69,11 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
* @param traversalDirection logical key traversal direction
* @param reductionSettings reduction settings used during compilation
* @param diacriticProcessingMode diacritic processing strategy
* @param caseProcessingMode case processing strategy
*/
public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection,
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode) {
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
final CaseProcessingMode caseProcessingMode) {
if (formatVersion < 1) { // NOPMD
throw new IllegalArgumentException("formatVersion must be at least 1.");
}
@@ -76,6 +81,7 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
}
/**
@@ -89,7 +95,8 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
*/
public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection,
final ReductionSettings reductionSettings) {
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS);
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS,
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
/**
@@ -104,6 +111,6 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) {
return new TrieMetadata(formatVersion, traversalDirection,
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
DiacriticProcessingMode.AS_IS);
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
}
}

View File

@@ -60,7 +60,9 @@
* non-empty logical line starts with a canonical stem followed by known surface
* variants in subsequent tab-separated columns.
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
* which normalizes input to lower case using {@link java.util.Locale#ROOT},
* which applies configurable case processing through
* {@link org.egothor.stemmer.CaseProcessingMode} (default:
* {@link org.egothor.stemmer.CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}),
* supports whole-line as well as trailing remarks introduced by {@code #} or
* {@code //}, and currently ignores dictionary items containing Unicode
* whitespace characters while reporting them through warning-level diagnostics.

View File

@@ -201,6 +201,43 @@ class FrequencyTrieTest {
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
}
/**
 * Checks that a trie built with
 * {@link CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT} resolves upper-case and
 * mixed-case lookup keys to the entries stored under the lower-case key.
 */
@Test
@DisplayName("Lookup applies lowercase normalization when metadata requires it")
void lookupAppliesLowercaseNormalizationWhenMetadataRequiresIt() {
    final ReductionSettings settings = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
    final FrequencyTrie.Builder<String> lowercasingBuilder = new FrequencyTrie.Builder<>(String[]::new, settings,
            WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
    lowercasingBuilder.put("house", "noun");
    lowercasingBuilder.put("house", "verb");

    final FrequencyTrie<String> trie = lowercasingBuilder.build();

    assertAll(
            () -> assertEquals("noun", trie.get("HOUSE")),
            () -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
}
/**
 * Checks that a trie built with {@link CaseProcessingMode#AS_IS} matches keys
 * case-sensitively: only the exact stored spelling resolves, other casings
 * yield {@code null} or an empty array.
 */
@Test
@DisplayName("Lookup keeps case-sensitive behavior when metadata is AS_IS")
void lookupKeepsCaseSensitiveBehaviorWhenMetadataIsAsIs() {
    final ReductionSettings settings = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
    final FrequencyTrie.Builder<String> caseSensitiveBuilder = new FrequencyTrie.Builder<>(String[]::new,
            settings, WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS);
    caseSensitiveBuilder.put("House", "noun");

    final FrequencyTrie<String> trie = caseSensitiveBuilder.build();

    assertAll(
            () -> assertEquals("noun", trie.get("House")),
            () -> assertNull(trie.get("house")),
            () -> assertArrayEquals(new String[] { "noun" }, trie.getAll("House")),
            () -> assertArrayEquals(new String[0], trie.getAll("HOUSE")));
}
/**
* Verifies that a missing path below an existing prefix returns empty results.
*/

View File

@@ -64,7 +64,7 @@ import org.junit.jupiter.api.io.TempDir;
* </p>
* <ul>
* <li>parsing through all public overloads,</li>
* <li>normalization to lower case,</li>
* <li>case processing according to the selected mode,</li>
* <li>handling of empty lines and remarks,</li>
* <li>correct entry emission including line numbers,</li>
* <li>propagation of I/O failures from the handler and file system,</li>
@@ -280,6 +280,22 @@ class StemmerDictionaryParserTest {
assertEquals(expected, exception, "The original exception instance should be preserved.");
}
@Test
@DisplayName("should preserve character case when AS_IS mode is selected")
void shouldPreserveCharacterCaseWhenAsIsModeIsSelected() throws IOException {
    // One line with mixed-case stem and variants; AS_IS must hand them through untouched.
    final StringReader source = new StringReader("Root\tRunning\tRuns\tRUNNER\n");
    final List<CapturedEntry> captured = new ArrayList<>();

    final StemmerDictionaryParser.ParseStatistics stats = StemmerDictionaryParser.parse(source, "case-as-is",
            CaseProcessingMode.AS_IS, collectingHandler(captured));

    assertAll("Statistics",
            () -> assertEquals(1, stats.lineCount()),
            () -> assertEquals(1, stats.entryCount()),
            () -> assertEquals(0, stats.ignoredLineCount()));
    assertEquals(1, captured.size(), "Exactly one entry should be emitted.");
    assertAll("Entry",
            () -> assertEquals("Root", captured.get(0).stem()),
            () -> assertArrayEquals(new String[] { "Running", "Runs", "RUNNER" }, captured.get(0).variants()));
}
@Test
@DisplayName("should reject null reader")
void shouldRejectNullReader() {
@@ -298,6 +314,15 @@ class StemmerDictionaryParserTest {
}));
}
@Test
@DisplayName("should reject null case processing mode")
void shouldRejectNullCaseProcessingMode() {
// Reader, source description and handler are all non-null, so the expected
// NullPointerException can only come from the null case processing mode.
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(new StringReader("a b"),
"source", null, (stem, variants, lineNumber) -> {
// no-op
}));
}
@Test
@DisplayName("should reject null entry handler")
void shouldRejectNullEntryHandler() {