From 8785f2b7cb03754407c0f9789cf55b044ad6dda4 Mon Sep 17 00:00:00 2001 From: Leo Galambos Date: Thu, 23 Apr 2026 22:32:05 +0200 Subject: [PATCH] feat: Apply metadata-driven case normalization in get/getAll --- docs/cli-compilation.md | 2 +- docs/contributing-dictionaries.md | 2 +- docs/dictionary-format.md | 4 +- docs/programmatic-loading-and-building.md | 2 +- docs/quick-start.md | 4 +- .../egothor/stemmer/CaseProcessingMode.java | 55 +++++++++++++ .../java/org/egothor/stemmer/Compile.java | 30 +++++-- .../org/egothor/stemmer/FrequencyTrie.java | 78 ++++++++++++++++--- .../stemmer/StemmerDictionaryParser.java | 73 +++++++++++++++-- .../stemmer/StemmerPatchTrieLoader.java | 63 +++++++++++++-- .../org/egothor/stemmer/TrieMetadata.java | 15 +++- .../org/egothor/stemmer/package-info.java | 4 +- .../egothor/stemmer/FrequencyTrieTest.java | 37 +++++++++ .../stemmer/StemmerDictionaryParserTest.java | 27 ++++++- 14 files changed, 353 insertions(+), 43 deletions(-) create mode 100644 src/main/java/org/egothor/stemmer/CaseProcessingMode.java diff --git a/docs/cli-compilation.md b/docs/cli-compilation.md index 851a8b7..d41efeb 100644 --- a/docs/cli-compilation.md +++ b/docs/cli-compilation.md @@ -47,7 +47,7 @@ The CLI supports the following arguments: Path to the source dictionary file. -The file must use the standard line-oriented tab-separated values dictionary format, meaning that columns are separated by the tab character. Each non-empty logical line starts with the canonical stem column and may contain zero or more variant columns. The parser expects UTF-8 input, lowercases it using `Locale.ROOT`, ignores trailing remarks introduced by `#` or `//`, and currently ignores dictionary items containing embedded whitespace while reporting them through warning-level log entries. +The file must use the standard line-oriented tab-separated values dictionary format, meaning that columns are separated by the tab character. 
Each non-empty logical line starts with the canonical stem column and may contain zero or more variant columns. The parser expects UTF-8 input, processes case according to `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores trailing remarks introduced by `#` or `//`, and currently ignores dictionary items containing embedded whitespace while reporting them through warning-level log entries. Example: diff --git a/docs/contributing-dictionaries.md b/docs/contributing-dictionaries.md index 7963b82..ed20b6f 100644 --- a/docs/contributing-dictionaries.md +++ b/docs/contributing-dictionaries.md @@ -41,7 +41,7 @@ The parser: - reads UTF-8 text, - interprets each line as tab-separated values, -- normalizes input to lower case using `Locale.ROOT`, +- applies configurable case processing through `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), - ignores empty lines, - supports remarks introduced by `#` or `//`, - currently ignores dictionary items containing embedded whitespace and reports them through warning-level log entries. diff --git a/docs/dictionary-format.md b/docs/dictionary-format.md index 38543af..7082c13 100644 --- a/docs/dictionary-format.md +++ b/docs/dictionary-format.md @@ -111,7 +111,7 @@ This is also valid: ## Case normalization -Input lines are normalized to lower case using `Locale.ROOT` before tab-separated columns are processed into dictionary entries. +Input-line case normalization is controlled by `CaseProcessingMode`; by default the parser uses `LOWERCASE_WITH_LOCALE_ROOT` before tab-separated columns are processed into dictionary entries. That means dictionary authors should treat the format as **case-insensitive at load time**. If a file contains uppercase or mixed-case tokens, they will be normalized during parsing. @@ -193,7 +193,7 @@ Run Running Runs Ran CONNECT Connected Connecting ``` -This is accepted, but it is normalized to lower case during parsing. +This is accepted. 
Under the default `LOWERCASE_WITH_LOCALE_ROOT` mode it is normalized to lower case during parsing; under `AS_IS` it is preserved. ## Format limitations diff --git a/docs/programmatic-loading-and-building.md b/docs/programmatic-loading-and-building.md index b113aad..210da89 100644 --- a/docs/programmatic-loading-and-building.md +++ b/docs/programmatic-loading-and-building.md @@ -32,7 +32,7 @@ The `storeOriginal` flag controls whether the canonical stem is inserted as a no ## Load a textual dictionary -Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. The textual format is tab-separated values, meaning that columns are separated by the tab character. Each non-empty logical line starts with the stem column and may contain zero or more variant columns. Input is normalized to lower case using `Locale.ROOT`, trailing remarks introduced by `#` or `//` are ignored, and dictionary items containing embedded whitespace are currently ignored with warning-level diagnostics. +Loading from a dictionary file follows the same preparation model as bundled resources, but the source comes from your own file or path. The textual format is tab-separated values, meaning that columns are separated by the tab character. Each non-empty logical line starts with the stem column and may contain zero or more variant columns. Input case normalization is controlled by `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), trailing remarks introduced by `#` or `//` are ignored, and dictionary items containing embedded whitespace are currently ignored with warning-level diagnostics. 
```java import java.io.IOException; diff --git a/docs/quick-start.md b/docs/quick-start.md index 0c209be..85122c2 100644 --- a/docs/quick-start.md +++ b/docs/quick-start.md @@ -69,7 +69,7 @@ public final class LoadBinaryStemmerExample { ### Build or extend a stemmer from dictionary data -Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The parser lowercases input with `Locale.ROOT`, ignores leading and trailing whitespace, and supports line remarks introduced by `#` or `//`. +Radixor can also build a compiled trie from a custom dictionary. Dictionary lines consist of a canonical stem followed by zero or more variants. The parser applies `CaseProcessingMode` (default: `LOWERCASE_WITH_LOCALE_ROOT`), ignores leading and trailing whitespace, and supports line remarks introduced by `#` or `//`. This path is also relevant when you extend an existing compiled stemmer with additional domain-specific entries and rebuild a new compact artifact. @@ -206,4 +206,4 @@ Dictionary compilation is usually a one-time preparation step and is generally f ## Persisted trie metadata -Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, and the declared `DiacriticProcessingMode`. Even when a given release does not yet actively branch on every field at query time, persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration. +Every compiled trie artifact stores a `TrieMetadata` descriptor together with the immutable trie payload. 
That metadata currently records the binary format version, the `WordTraversalDirection`, the `ReductionSettings` used during compilation, the declared `DiacriticProcessingMode`, and the selected `CaseProcessingMode`. The traversal and case-processing settings are applied during runtime lookup (`get`, `getAll`), while persisting the full descriptor keeps artifacts self-describing and prepares the format for future matching strategies without relying on side-channel configuration. diff --git a/src/main/java/org/egothor/stemmer/CaseProcessingMode.java b/src/main/java/org/egothor/stemmer/CaseProcessingMode.java new file mode 100644 index 0000000..280e962 --- /dev/null +++ b/src/main/java/org/egothor/stemmer/CaseProcessingMode.java @@ -0,0 +1,55 @@ +/******************************************************************************* + * Copyright (C) 2026, Leo Galambos + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + ******************************************************************************/ +package org.egothor.stemmer; + +import java.util.Locale; + +/** + * Defines how dictionary items are normalized with respect to letter casing. + * + *
<p>
+ * The mode is applied while parsing dictionary sources and can be persisted in + * trie metadata so that compiled artifacts remain self-describing. + */ +public enum CaseProcessingMode { + + /** + * Preserves input character casing exactly as provided by the dictionary + * source. + */ + AS_IS, + + /** + * Normalizes all dictionary content to lower case using + * {@link Locale#ROOT}. + */ + LOWERCASE_WITH_LOCALE_ROOT +} diff --git a/src/main/java/org/egothor/stemmer/Compile.java b/src/main/java/org/egothor/stemmer/Compile.java index 2f61044..d3c44e8 100644 --- a/src/main/java/org/egothor/stemmer/Compile.java +++ b/src/main/java/org/egothor/stemmer/Compile.java @@ -61,6 +61,7 @@ import java.util.logging.Logger; * --output &lt;file&gt; * --reduction-mode &lt;mode&gt; * [--store-original] + * [--case-processing-mode &lt;mode&gt;] * [--dominant-winner-min-percent &lt;1..100&gt;] * [--dominant-winner-over-second-ratio &lt;1..n&gt;] * [--overwrite] @@ -152,7 +153,7 @@ public final class Compile { final WordTraversalDirection traversalDirection = arguments.rightToLeft() ?
WordTraversalDirection.FORWARD : WordTraversalDirection.BACKWARD; final FrequencyTrie trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(), - reductionSettings, traversalDirection); + reductionSettings, traversalDirection, arguments.caseProcessingMode()); final Path outputFile = arguments.outputFile(); final Path parent = outputFile.toAbsolutePath().getParent(); @@ -168,10 +169,10 @@ public final class Compile { if (LOGGER.isLoggable(Level.INFO)) { LOGGER.log(Level.INFO, - "Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, dominantWinnerMinPercent={5}, dominantWinnerOverSecondRatio={6}.", + "Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, caseProcessingMode={5}, dominantWinnerMinPercent={6}, dominantWinnerOverSecondRatio={7}.", new Object[] { arguments.inputFile().toAbsolutePath().toString(), arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(), - arguments.storeOriginal(), arguments.rightToLeft(), + arguments.storeOriginal(), arguments.rightToLeft(), arguments.caseProcessingMode(), arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio() }); } } @@ -186,6 +187,7 @@ public final class Compile { System.err.println(" --output \\"); System.err.println(" --reduction-mode \\"); System.err.println(" [--store-original] \\"); + System.err.println(" [--case-processing-mode ] \\"); System.err.println(" [--dominant-winner-min-percent <1..100>] \\"); System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\"); System.err.println(" [--overwrite]"); @@ -199,6 +201,13 @@ public final class Compile { System.err.println(" of the stored word form and patch commands are encoded likewise."); System.err.println(" --overwrite"); System.err.println(" Replaces the target file when it already exists."); + System.err.println(" --case-processing-mode"); + System.err.println(" Controls whether dictionary input is lowercased or 
preserved as-is."); + System.err.println(); + System.err.println("Supported case processing modes:"); + for (CaseProcessingMode mode : CaseProcessingMode.values()) { + System.err.println(" " + mode.name()); + } System.err.println(); System.err.println("Supported reduction modes:"); for (ReductionMode mode : ReductionMode.values()) { @@ -256,14 +265,15 @@ public final class Compile { * forward traversal on stored word forms * @param dominantWinnerMinPercent dominant winner minimum percent * @param dominantWinnerOverSecondRatio dominant winner over second ratio + * @param caseProcessingMode dictionary case processing mode * @param overwrite whether an existing output may be * replaced * @param help whether usage help was requested */ @SuppressWarnings("PMD.LongVariable") private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal, - boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, - boolean help) { + boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, + CaseProcessingMode caseProcessingMode, boolean overwrite, boolean help) { /** * Parses raw command-line arguments. 
@@ -282,6 +292,7 @@ public final class Compile { boolean rightToLeft = false; boolean overwrite = false; boolean help = false; + CaseProcessingMode caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT; int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT; int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO; @@ -330,6 +341,11 @@ public final class Compile { requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"), "--dominant-winner-over-second-ratio"); break; + case "--case-processing-mode": + caseProcessingMode = CaseProcessingMode + .valueOf(requireValue(arguments, ++index, "--case-processing-mode") + .toUpperCase(Locale.ROOT)); + break; default: throw new IllegalArgumentException("Unknown argument: " + argument); @@ -338,7 +354,7 @@ public final class Compile { if (help) { return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft, - dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, true); + dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, true); } if (inputFile == null) { @@ -352,7 +368,7 @@ public final class Compile { } return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft, - dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, false); + dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, false); } /** diff --git a/src/main/java/org/egothor/stemmer/FrequencyTrie.java b/src/main/java/org/egothor/stemmer/FrequencyTrie.java index 54ebdb8..b36ac88 100644 --- a/src/main/java/org/egothor/stemmer/FrequencyTrie.java +++ b/src/main/java/org/egothor/stemmer/FrequencyTrie.java @@ -41,6 +41,7 @@ import java.util.Collections; import java.util.IdentityHashMap; import java.util.LinkedHashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Objects; import 
java.util.function.IntFunction; @@ -101,7 +102,7 @@ public final class FrequencyTrie { /** * Binary format version. */ - private static final int STREAM_VERSION = 3; + private static final int STREAM_VERSION = 4; /** * Factory used to create correctly typed arrays for {@link #getAll(String)}. @@ -142,6 +143,10 @@ public final class FrequencyTrie { * selected deterministically by shorter {@code toString()} value first, then by * lexicographically lower {@code toString()}, and finally by stable first-seen * order. + * + *
<p>
+ * The supplied key is normalized according to persisted + * {@link TrieMetadata#caseProcessingMode()} before traversal. * * @param key key to resolve * @return most frequent value, or {@code null} if the key does not exist or no @@ -150,7 +155,7 @@ public final class FrequencyTrie { */ public V get(final String key) { Objects.requireNonNull(key, "key"); - final CompiledNode node = findNode(key); + final CompiledNode node = findNode(normalizeLookupKey(key)); if (node == null || node.orderedValues().length == 0) { return null; } @@ -170,6 +175,10 @@ public final class FrequencyTrie { *
<p>
* The returned array is a defensive copy. * + *
<p>
+ * The supplied key is normalized according to persisted + * {@link TrieMetadata#caseProcessingMode()} before traversal. + * * @param key key to resolve * @return all values stored at the addressed node, ordered by descending * frequency; returns an empty array if the key does not exist or no @@ -178,7 +187,7 @@ public final class FrequencyTrie { */ public V[] getAll(final String key) { Objects.requireNonNull(key, "key"); - final CompiledNode node = findNode(key); + final CompiledNode node = findNode(normalizeLookupKey(key)); if (node == null || node.orderedValues().length == 0) { return this.arrayFactory.apply(0); } @@ -336,7 +345,7 @@ public final class FrequencyTrie { } final int version = dataInput.readInt(); - if (version != 1 && version != STREAM_VERSION) { + if (version != 1 && version != 3 && version != STREAM_VERSION) { throw new IOException("Unsupported trie stream version: " + version); } @@ -376,6 +385,7 @@ public final class FrequencyTrie { dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent()); dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio()); dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal()); + dataOutput.writeInt(metadata.caseProcessingMode().ordinal()); } /** @@ -419,10 +429,22 @@ public final class FrequencyTrie { throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal); } - return new TrieMetadata( - version, traversalDirection, new ReductionSettings(reductionModes[reductionModeOrdinal], - dominantWinnerMinPercent, dominantWinnerOverSecondRatio), - diacriticProcessingModes[diacriticProcessingModeOrdinal]); + final CaseProcessingMode caseProcessingMode; + if (version >= 4) { // NOPMD + final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values(); + final int caseProcessingModeOrdinal = dataInput.readInt(); + if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) { + throw new 
IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal); + } + caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal]; + } else { + caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT; + } + + return new TrieMetadata(version, traversalDirection, + new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent, + dominantWinnerOverSecondRatio), + diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode); } /** @@ -598,7 +620,7 @@ public final class FrequencyTrie { /** * Locates the compiled node for the supplied key. * - * @param key key to resolve + * @param key already-normalized key to resolve * @return compiled node, or {@code null} if the path does not exist */ private CompiledNode findNode(final String key) { @@ -613,6 +635,19 @@ public final class FrequencyTrie { return current; } + /** + * Applies lookup-time case normalization according to persisted metadata. + * + * @param key lookup key + * @return normalized key for trie traversal + */ + private String normalizeLookupKey(final String key) { + if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { + return key.toLowerCase(Locale.ROOT); + } + return key; + } + /** * Builder of {@link FrequencyTrie}. * @@ -647,6 +682,11 @@ public final class FrequencyTrie { */ private final WordTraversalDirection traversalDirection; + /** + * Dictionary case processing mode associated with this builder. + */ + private final CaseProcessingMode caseProcessingMode; + /** * Mutable root node. 
*/ @@ -679,9 +719,25 @@ public final class FrequencyTrie { */ public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) { + this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + } + + /** + * Creates a new builder with the provided settings, explicit traversal + * direction, and explicit case processing mode. + * + * @param arrayFactory array factory + * @param reductionSettings reduction configuration + * @param traversalDirection logical key traversal direction + * @param caseProcessingMode dictionary case processing mode + * @throws NullPointerException if any argument is {@code null} + */ + public Builder(final IntFunction arrayFactory, final ReductionSettings reductionSettings, + final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) { this.arrayFactory = Objects.requireNonNull(arrayFactory, "arrayFactory"); this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings"); this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection"); + this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); this.root = new MutableNode<>(); } @@ -753,8 +809,8 @@ public final class FrequencyTrie { reductionContext.canonicalNodeCount()); } - final TrieMetadata metadata = TrieMetadata.current(STREAM_VERSION, this.traversalDirection, - this.reductionSettings); + final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection, + this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode); return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata); } diff --git a/src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java b/src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java index 7313b3e..c3e1511 100644 --- 
a/src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java +++ b/src/main/java/org/egothor/stemmer/StemmerDictionaryParser.java @@ -53,8 +53,8 @@ import java.util.logging.Logger; * to that stem. * *
<p>
- * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading - * and trailing whitespace around each column is ignored. + * Input line case normalization is controlled by {@link CaseProcessingMode}. + * Leading and trailing whitespace around each column is ignored. * *
<p>
* The parser supports line remarks and trailing remarks. The remark markers @@ -113,11 +113,27 @@ public final class StemmerDictionaryParser { * @throws IOException if reading fails */ public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException { + return parse(path, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler); + } + + /** + * Parses a dictionary file from a filesystem path. + * + * @param path dictionary file path + * @param caseProcessingMode case processing mode + * @param entryHandler handler receiving parsed entries + * @return parsing statistics + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if reading fails + */ + public static ParseStatistics parse(final Path path, final CaseProcessingMode caseProcessingMode, + final EntryHandler entryHandler) throws IOException { Objects.requireNonNull(path, "path"); + Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); Objects.requireNonNull(entryHandler, "entryHandler"); try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) { - return parse(reader, path.toAbsolutePath().toString(), entryHandler); + return parse(reader, path.toAbsolutePath().toString(), caseProcessingMode, entryHandler); } } @@ -132,7 +148,23 @@ public final class StemmerDictionaryParser { */ public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException { Objects.requireNonNull(fileName, "fileName"); - return parse(Path.of(fileName), entryHandler); + return parse(Path.of(fileName), CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler); + } + + /** + * Parses a dictionary file from a path string. 
+ * + * @param fileName dictionary file name or path string + * @param caseProcessingMode case processing mode + * @param entryHandler handler receiving parsed entries + * @return parsing statistics + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if reading fails + */ + public static ParseStatistics parse(final String fileName, final CaseProcessingMode caseProcessingMode, + final EntryHandler entryHandler) throws IOException { + Objects.requireNonNull(fileName, "fileName"); + return parse(Path.of(fileName), caseProcessingMode, entryHandler); } /** @@ -147,8 +179,25 @@ public final class StemmerDictionaryParser { */ public static ParseStatistics parse(final Reader reader, final String sourceDescription, final EntryHandler entryHandler) throws IOException { + return parse(reader, sourceDescription, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler); + } + + /** + * Parses a dictionary from a reader. + * + * @param reader source reader + * @param sourceDescription logical source description for diagnostics + * @param caseProcessingMode case processing mode + * @param entryHandler handler receiving parsed entries + * @return parsing statistics + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if reading or handler processing fails + */ + public static ParseStatistics parse(final Reader reader, final String sourceDescription, + final CaseProcessingMode caseProcessingMode, final EntryHandler entryHandler) throws IOException { Objects.requireNonNull(reader, "reader"); Objects.requireNonNull(sourceDescription, "sourceDescription"); + Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); Objects.requireNonNull(entryHandler, "entryHandler"); final BufferedReader bufferedReader = reader instanceof BufferedReader ? 
(BufferedReader) reader @@ -161,7 +210,7 @@ public final class StemmerDictionaryParser { for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) { lineNumber++; - final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT); + final String normalizedLine = normalizeLineCase(stripRemark(line).trim(), caseProcessingMode); if (normalizedLine.isEmpty()) { ignoredLineCount++; continue; @@ -226,6 +275,20 @@ public final class StemmerDictionaryParser { return statistics; } + /** + * Applies case normalization to one line according to the selected mode. + * + * @param line line to normalize + * @param caseProcessingMode case processing mode + * @return normalized line + */ + private static String normalizeLineCase(final String line, final CaseProcessingMode caseProcessingMode) { + if (caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) { + return line.toLowerCase(Locale.ROOT); + } + return line; + } + /** * Determines whether one dictionary item contains any Unicode whitespace * character. 
diff --git a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java index 62aee96..e5d820c 100644 --- a/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java +++ b/src/main/java/org/egothor/stemmer/StemmerPatchTrieLoader.java @@ -283,7 +283,8 @@ public final class StemmerPatchTrieLoader { try (InputStream inputStream = openBundledResource(resourcePath); BufferedReader reader = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { - return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language)); + return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language), + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } } @@ -318,7 +319,8 @@ public final class StemmerPatchTrieLoader { */ public static FrequencyTrie load(final Path path, final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException { - return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD); + return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } /** @@ -338,14 +340,37 @@ public final class StemmerPatchTrieLoader { public static FrequencyTrie load(final Path path, final boolean storeOriginal, final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) throws IOException { + return load(path, storeOriginal, reductionSettings, traversalDirection, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + } + + /** + * Loads a dictionary from a filesystem path using explicit reduction settings, + * explicit traversal direction, and explicit case processing mode. 
+ * + * @param path path to the dictionary file + * @param storeOriginal whether the stem itself should be inserted using the + * canonical no-op patch command + * @param reductionSettings reduction settings + * @param traversalDirection traversal direction used for both trie keys and + * patch commands + * @param caseProcessingMode case processing mode used during dictionary parsing + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final Path path, final boolean storeOriginal, + final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, + final CaseProcessingMode caseProcessingMode) throws IOException { Objects.requireNonNull(path, "path"); Objects.requireNonNull(reductionSettings, "reductionSettings"); Objects.requireNonNull(traversalDirection, "traversalDirection"); + Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); try (InputStream inputStream = openDictionaryInputStream(path); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings, - traversalDirection); + traversalDirection, caseProcessingMode); } } @@ -403,7 +428,30 @@ public final class StemmerPatchTrieLoader { final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection) throws IOException { Objects.requireNonNull(fileName, "fileName"); - return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection); + return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + } + + /** + * Loads a dictionary from a filesystem path string using explicit reduction + * settings, explicit traversal direction, and explicit case processing mode. 
+ * + * @param fileName file name or path string + * @param storeOriginal whether the stem itself should be inserted using the + * canonical no-op patch command + * @param reductionSettings reduction settings + * @param traversalDirection traversal direction used for both trie keys and + * patch commands + * @param caseProcessingMode case processing mode used during dictionary parsing + * @return compiled patch-command trie + * @throws NullPointerException if any argument is {@code null} + * @throws IOException if the file cannot be opened or read + */ + public static FrequencyTrie load(final String fileName, final boolean storeOriginal, + final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection, + final CaseProcessingMode caseProcessingMode) throws IOException { + Objects.requireNonNull(fileName, "fileName"); + return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode); } /** @@ -437,14 +485,15 @@ public final class StemmerPatchTrieLoader { */ private static FrequencyTrie load(final BufferedReader reader, final String sourceDescription, final boolean storeOriginal, final ReductionSettings reductionSettings, - final WordTraversalDirection traversalDirection) throws IOException { + final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) + throws IOException { final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings, - traversalDirection); + traversalDirection, caseProcessingMode); final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection); final int[] insertedMappings = new int[1]; final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader, - sourceDescription, (stem, variants, lineNumber) -> { + sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> { if (storeOriginal) { builder.put(stem, NOOP_PATCH_COMMAND); 
insertedMappings[0]++; diff --git a/src/main/java/org/egothor/stemmer/TrieMetadata.java b/src/main/java/org/egothor/stemmer/TrieMetadata.java index b5703bc..77d2ad1 100644 --- a/src/main/java/org/egothor/stemmer/TrieMetadata.java +++ b/src/main/java/org/egothor/stemmer/TrieMetadata.java @@ -54,9 +54,12 @@ import java.util.Objects; * @param reductionSettings reduction settings used during compilation * @param diacriticProcessingMode diacritic processing strategy associated with * the artifact + * @param caseProcessingMode case processing strategy associated with the + * artifact */ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection, - ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode) { + ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode, + CaseProcessingMode caseProcessingMode) { /** * Creates a new metadata instance. @@ -66,9 +69,11 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi * @param traversalDirection logical key traversal direction * @param reductionSettings reduction settings used during compilation * @param diacriticProcessingMode diacritic processing strategy + * @param caseProcessingMode case processing strategy */ public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection, - final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode) { + final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode, + final CaseProcessingMode caseProcessingMode) { if (formatVersion < 1) { // NOPMD throw new IllegalArgumentException("formatVersion must be at least 1."); } @@ -76,6 +81,7 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection"); this.reductionSettings = Objects.requireNonNull(reductionSettings, 
"reductionSettings"); this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode"); + this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode"); } /** @@ -89,7 +95,8 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi */ public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection, final ReductionSettings reductionSettings) { - return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS); + return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS, + CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } /** @@ -104,6 +111,6 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) { return new TrieMetadata(formatVersion, traversalDirection, ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), - DiacriticProcessingMode.AS_IS); + DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); } } diff --git a/src/main/java/org/egothor/stemmer/package-info.java b/src/main/java/org/egothor/stemmer/package-info.java index 5653438..315cc99 100644 --- a/src/main/java/org/egothor/stemmer/package-info.java +++ b/src/main/java/org/egothor/stemmer/package-info.java @@ -60,7 +60,9 @@ * non-empty logical line starts with a canonical stem followed by known surface * variants in subsequent tab-separated columns. 
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser}, - * which normalizes input to lower case using {@link java.util.Locale#ROOT}, + * which applies configurable case processing through + * {@link org.egothor.stemmer.CaseProcessingMode} (default: + * {@link org.egothor.stemmer.CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}), * supports whole-line as well as trailing remarks introduced by {@code #} or * {@code //}, and currently ignores dictionary items containing Unicode * whitespace characters while reporting them through warning-level diagnostics. diff --git a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java index 97eaffb..a1e9878 100644 --- a/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java +++ b/src/test/java/org/egothor/stemmer/FrequencyTrieTest.java @@ -201,6 +201,43 @@ class FrequencyTrieTest { () -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner"))); } + /** + * Verifies that lookup-time key normalization follows persisted case processing + * metadata. + */ + @Test + @DisplayName("Lookup applies lowercase normalization when metadata requires it") + void lookupAppliesLowercaseNormalizationWhenMetadataRequiresIt() { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, + ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT); + builder.put("house", "noun"); + builder.put("house", "verb"); + + final FrequencyTrie trie = builder.build(); + + assertAll(() -> assertEquals("noun", trie.get("HOUSE")), + () -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE"))); + } + + /** + * Verifies that lookup preserves casing when metadata uses AS_IS mode. 
+ */ + @Test + @DisplayName("Lookup keeps case-sensitive behavior when metadata is AS_IS") + void lookupKeepsCaseSensitiveBehaviorWhenMetadataIsAsIs() { + final FrequencyTrie.Builder builder = new FrequencyTrie.Builder<>(String[]::new, + ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS), + WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS); + builder.put("House", "noun"); + + final FrequencyTrie trie = builder.build(); + + assertAll(() -> assertEquals("noun", trie.get("House")), () -> assertNull(trie.get("house")), + () -> assertArrayEquals(new String[] { "noun" }, trie.getAll("House")), + () -> assertArrayEquals(new String[0], trie.getAll("HOUSE"))); + } + /** * Verifies that a missing path below an existing prefix returns empty results. */ diff --git a/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java b/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java index 580627e..353c29c 100644 --- a/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java +++ b/src/test/java/org/egothor/stemmer/StemmerDictionaryParserTest.java @@ -64,7 +64,7 @@ import org.junit.jupiter.api.io.TempDir; *

*