feat: Apply metadata-driven case normalization in get/getAll
This commit is contained in:
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
55
src/main/java/org/egothor/stemmer/CaseProcessingMode.java
Normal file
@@ -0,0 +1,55 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (C) 2026, Leo Galambos
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright notice,
|
||||
* this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
* this list of conditions and the following disclaimer in the documentation
|
||||
* and/or other materials provided with the distribution.
|
||||
*
|
||||
* 3. Neither the name of the copyright holder nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
* POSSIBILITY OF SUCH DAMAGE.
|
||||
******************************************************************************/
|
||||
package org.egothor.stemmer;
|
||||
|
||||
import java.util.Locale;
|
||||
|
||||
/**
 * Defines how dictionary items and lookup keys are normalized with respect to
 * letter casing.
 *
 * <p>
 * The mode is applied while parsing dictionary sources and is persisted in trie
 * metadata so that compiled artifacts remain self-describing. A compiled
 * {@code FrequencyTrie} applies the persisted mode to lookup keys as well, so
 * parsing and lookup must agree on the normalization rule; {@link #normalize}
 * is the single definition of that rule.
 *
 * <p>
 * The binary trie format stores the {@link #ordinal()} of the selected
 * constant. Existing constants must therefore keep their position; new
 * constants may only be appended at the end.
 */
public enum CaseProcessingMode {

    /**
     * Preserves input character casing exactly as provided by the dictionary
     * source.
     */
    AS_IS,

    /**
     * Normalizes all dictionary content to lower case using
     * {@link Locale#ROOT}.
     */
    LOWERCASE_WITH_LOCALE_ROOT;

    /**
     * Applies this case processing mode to the supplied text.
     *
     * <p>
     * {@link #AS_IS} returns the text unchanged;
     * {@link #LOWERCASE_WITH_LOCALE_ROOT} returns the text lowercased with
     * {@link Locale#ROOT}, which keeps the result independent of the default
     * locale of the running JVM.
     *
     * @param text text to normalize
     * @return normalized text
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public String normalize(final String text) {
        // Fully qualified to keep the file's import list unchanged.
        java.util.Objects.requireNonNull(text, "text");
        return this == LOWERCASE_WITH_LOCALE_ROOT ? text.toLowerCase(Locale.ROOT) : text;
    }
}
|
||||
@@ -61,6 +61,7 @@ import java.util.logging.Logger;
|
||||
* --output <file>
|
||||
* --reduction-mode <mode>
|
||||
* [--store-original]
|
||||
* [--case-processing-mode <mode>]
|
||||
* [--dominant-winner-min-percent <1..100>]
|
||||
* [--dominant-winner-over-second-ratio <1..n>]
|
||||
* [--overwrite]
|
||||
@@ -152,7 +153,7 @@ public final class Compile {
|
||||
final WordTraversalDirection traversalDirection = arguments.rightToLeft() ? WordTraversalDirection.FORWARD
|
||||
: WordTraversalDirection.BACKWARD;
|
||||
final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
|
||||
reductionSettings, traversalDirection);
|
||||
reductionSettings, traversalDirection, arguments.caseProcessingMode());
|
||||
|
||||
final Path outputFile = arguments.outputFile();
|
||||
final Path parent = outputFile.toAbsolutePath().getParent();
|
||||
@@ -168,10 +169,10 @@ public final class Compile {
|
||||
|
||||
if (LOGGER.isLoggable(Level.INFO)) {
|
||||
LOGGER.log(Level.INFO,
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, dominantWinnerMinPercent={5}, dominantWinnerOverSecondRatio={6}.",
|
||||
"Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, rightToLeft={4}, caseProcessingMode={5}, dominantWinnerMinPercent={6}, dominantWinnerOverSecondRatio={7}.",
|
||||
new Object[] { arguments.inputFile().toAbsolutePath().toString(),
|
||||
arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
|
||||
arguments.storeOriginal(), arguments.rightToLeft(),
|
||||
arguments.storeOriginal(), arguments.rightToLeft(), arguments.caseProcessingMode(),
|
||||
arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio() });
|
||||
}
|
||||
}
|
||||
@@ -186,6 +187,7 @@ public final class Compile {
|
||||
System.err.println(" --output <file> \\");
|
||||
System.err.println(" --reduction-mode <mode> \\");
|
||||
System.err.println(" [--store-original] \\");
|
||||
System.err.println(" [--case-processing-mode <mode>] \\");
|
||||
System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
|
||||
System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
|
||||
System.err.println(" [--overwrite]");
|
||||
@@ -199,6 +201,13 @@ public final class Compile {
|
||||
System.err.println(" of the stored word form and patch commands are encoded likewise.");
|
||||
System.err.println(" --overwrite");
|
||||
System.err.println(" Replaces the target file when it already exists.");
|
||||
System.err.println(" --case-processing-mode");
|
||||
System.err.println(" Controls whether dictionary input is lowercased or preserved as-is.");
|
||||
System.err.println();
|
||||
System.err.println("Supported case processing modes:");
|
||||
for (CaseProcessingMode mode : CaseProcessingMode.values()) {
|
||||
System.err.println(" " + mode.name());
|
||||
}
|
||||
System.err.println();
|
||||
System.err.println("Supported reduction modes:");
|
||||
for (ReductionMode mode : ReductionMode.values()) {
|
||||
@@ -256,14 +265,15 @@ public final class Compile {
|
||||
* forward traversal on stored word forms
|
||||
* @param dominantWinnerMinPercent dominant winner minimum percent
|
||||
* @param dominantWinnerOverSecondRatio dominant winner over second ratio
|
||||
* @param caseProcessingMode dictionary case processing mode
|
||||
* @param overwrite whether an existing output may be
|
||||
* replaced
|
||||
* @param help whether usage help was requested
|
||||
*/
|
||||
@SuppressWarnings("PMD.LongVariable")
|
||||
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
|
||||
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite,
|
||||
boolean help) {
|
||||
boolean rightToLeft, int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio,
|
||||
CaseProcessingMode caseProcessingMode, boolean overwrite, boolean help) {
|
||||
|
||||
/**
|
||||
* Parses raw command-line arguments.
|
||||
@@ -282,6 +292,7 @@ public final class Compile {
|
||||
boolean rightToLeft = false;
|
||||
boolean overwrite = false;
|
||||
boolean help = false;
|
||||
CaseProcessingMode caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
|
||||
int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
|
||||
|
||||
@@ -330,6 +341,11 @@ public final class Compile {
|
||||
requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
|
||||
"--dominant-winner-over-second-ratio");
|
||||
break;
|
||||
case "--case-processing-mode":
|
||||
caseProcessingMode = CaseProcessingMode
|
||||
.valueOf(requireValue(arguments, ++index, "--case-processing-mode")
|
||||
.toUpperCase(Locale.ROOT));
|
||||
break;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("Unknown argument: " + argument);
|
||||
@@ -338,7 +354,7 @@ public final class Compile {
|
||||
|
||||
if (help) {
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, true);
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, true);
|
||||
}
|
||||
|
||||
if (inputFile == null) {
|
||||
@@ -352,7 +368,7 @@ public final class Compile {
|
||||
}
|
||||
|
||||
return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, rightToLeft,
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, overwrite, false);
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio, caseProcessingMode, overwrite, false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@@ -41,6 +41,7 @@ import java.util.Collections;
|
||||
import java.util.IdentityHashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.function.IntFunction;
|
||||
@@ -101,7 +102,7 @@ public final class FrequencyTrie<V> {
|
||||
/**
|
||||
* Binary format version.
|
||||
*/
|
||||
private static final int STREAM_VERSION = 3;
|
||||
private static final int STREAM_VERSION = 4;
|
||||
|
||||
/**
|
||||
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
|
||||
@@ -142,6 +143,10 @@ public final class FrequencyTrie<V> {
|
||||
* selected deterministically by shorter {@code toString()} value first, then by
|
||||
* lexicographically lower {@code toString()}, and finally by stable first-seen
|
||||
* order.
|
||||
*
|
||||
* <p>
|
||||
* The supplied key is normalized according to persisted
|
||||
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return most frequent value, or {@code null} if the key does not exist or no
|
||||
@@ -150,7 +155,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public V get(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return null;
|
||||
}
|
||||
@@ -170,6 +175,10 @@ public final class FrequencyTrie<V> {
|
||||
* <p>
|
||||
* The returned array is a defensive copy.
|
||||
*
|
||||
* <p>
|
||||
* The supplied key is normalized according to persisted
|
||||
* {@link TrieMetadata#caseProcessingMode()} before traversal.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @return all values stored at the addressed node, ordered by descending
|
||||
* frequency; returns an empty array if the key does not exist or no
|
||||
@@ -178,7 +187,7 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public V[] getAll(final String key) {
|
||||
Objects.requireNonNull(key, "key");
|
||||
final CompiledNode<V> node = findNode(key);
|
||||
final CompiledNode<V> node = findNode(normalizeLookupKey(key));
|
||||
if (node == null || node.orderedValues().length == 0) {
|
||||
return this.arrayFactory.apply(0);
|
||||
}
|
||||
@@ -336,7 +345,7 @@ public final class FrequencyTrie<V> {
|
||||
}
|
||||
|
||||
final int version = dataInput.readInt();
|
||||
if (version != 1 && version != STREAM_VERSION) {
|
||||
if (version != 1 && version != 3 && version != STREAM_VERSION) {
|
||||
throw new IOException("Unsupported trie stream version: " + version);
|
||||
}
|
||||
|
||||
@@ -376,6 +385,7 @@ public final class FrequencyTrie<V> {
|
||||
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerMinPercent());
|
||||
dataOutput.writeInt(metadata.reductionSettings().dominantWinnerOverSecondRatio());
|
||||
dataOutput.writeInt(metadata.diacriticProcessingMode().ordinal());
|
||||
dataOutput.writeInt(metadata.caseProcessingMode().ordinal());
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -419,10 +429,22 @@ public final class FrequencyTrie<V> {
|
||||
throw new IOException("Invalid diacritic processing mode ordinal: " + diacriticProcessingModeOrdinal);
|
||||
}
|
||||
|
||||
return new TrieMetadata(
|
||||
version, traversalDirection, new ReductionSettings(reductionModes[reductionModeOrdinal],
|
||||
dominantWinnerMinPercent, dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingModes[diacriticProcessingModeOrdinal]);
|
||||
final CaseProcessingMode caseProcessingMode;
|
||||
if (version >= 4) { // NOPMD
|
||||
final CaseProcessingMode[] caseProcessingModes = CaseProcessingMode.values();
|
||||
final int caseProcessingModeOrdinal = dataInput.readInt();
|
||||
if (caseProcessingModeOrdinal < 0 || caseProcessingModeOrdinal >= caseProcessingModes.length) {
|
||||
throw new IOException("Invalid case processing mode ordinal: " + caseProcessingModeOrdinal);
|
||||
}
|
||||
caseProcessingMode = caseProcessingModes[caseProcessingModeOrdinal];
|
||||
} else {
|
||||
caseProcessingMode = CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT;
|
||||
}
|
||||
|
||||
return new TrieMetadata(version, traversalDirection,
|
||||
new ReductionSettings(reductionModes[reductionModeOrdinal], dominantWinnerMinPercent,
|
||||
dominantWinnerOverSecondRatio),
|
||||
diacriticProcessingModes[diacriticProcessingModeOrdinal], caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -598,7 +620,7 @@ public final class FrequencyTrie<V> {
|
||||
/**
|
||||
* Locates the compiled node for the supplied key.
|
||||
*
|
||||
* @param key key to resolve
|
||||
* @param key already-normalized key to resolve
|
||||
* @return compiled node, or {@code null} if the path does not exist
|
||||
*/
|
||||
private CompiledNode<V> findNode(final String key) {
|
||||
@@ -613,6 +635,19 @@ public final class FrequencyTrie<V> {
|
||||
return current;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies lookup-time case normalization according to persisted metadata.
|
||||
*
|
||||
* @param key lookup key
|
||||
* @return normalized key for trie traversal
|
||||
*/
|
||||
private String normalizeLookupKey(final String key) {
|
||||
if (this.metadata.caseProcessingMode() == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
return key.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* Builder of {@link FrequencyTrie}.
|
||||
*
|
||||
@@ -647,6 +682,11 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
private final WordTraversalDirection traversalDirection;
|
||||
|
||||
/**
|
||||
* Dictionary case processing mode associated with this builder.
|
||||
*/
|
||||
private final CaseProcessingMode caseProcessingMode;
|
||||
|
||||
/**
|
||||
* Mutable root node.
|
||||
*/
|
||||
@@ -679,9 +719,25 @@ public final class FrequencyTrie<V> {
|
||||
*/
|
||||
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection) {
|
||||
this(arrayFactory, reductionSettings, traversalDirection, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
 * Creates a builder from reduction settings, an explicit traversal direction,
 * and an explicit case processing mode.
 *
 * <p>
 * All arguments are validated before any field is assigned; the builder starts
 * with a fresh, empty mutable root node.
 *
 * @param arrayFactory       array factory
 * @param reductionSettings  reduction configuration
 * @param traversalDirection logical key traversal direction
 * @param caseProcessingMode dictionary case processing mode
 * @throws NullPointerException if any argument is {@code null}
 */
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings,
        final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode) {
    // Validate in declaration order so the first offending argument is reported.
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(reductionSettings, "reductionSettings");
    Objects.requireNonNull(traversalDirection, "traversalDirection");
    Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");

    this.arrayFactory = arrayFactory;
    this.reductionSettings = reductionSettings;
    this.traversalDirection = traversalDirection;
    this.caseProcessingMode = caseProcessingMode;
    this.root = new MutableNode<>();
}
|
||||
|
||||
@@ -753,8 +809,8 @@ public final class FrequencyTrie<V> {
|
||||
reductionContext.canonicalNodeCount());
|
||||
}
|
||||
|
||||
final TrieMetadata metadata = TrieMetadata.current(STREAM_VERSION, this.traversalDirection,
|
||||
this.reductionSettings);
|
||||
final TrieMetadata metadata = new TrieMetadata(STREAM_VERSION, this.traversalDirection,
|
||||
this.reductionSettings, DiacriticProcessingMode.AS_IS, this.caseProcessingMode);
|
||||
return new FrequencyTrie<>(this.arrayFactory, compiledRoot, metadata);
|
||||
}
|
||||
|
||||
|
||||
@@ -53,8 +53,8 @@ import java.util.logging.Logger;
|
||||
* to that stem.
|
||||
*
|
||||
* <p>
|
||||
* Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
|
||||
* and trailing whitespace around each column is ignored.
|
||||
* Input line case normalization is controlled by {@link CaseProcessingMode}.
|
||||
* Leading and trailing whitespace around each column is ignored.
|
||||
*
|
||||
* <p>
|
||||
* The parser supports line remarks and trailing remarks. The remark markers
|
||||
@@ -113,11 +113,27 @@ public final class StemmerDictionaryParser {
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
|
||||
return parse(path, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary file from a filesystem path.
|
||||
*
|
||||
* @param path dictionary file path
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Path path, final CaseProcessingMode caseProcessingMode,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||
|
||||
try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
|
||||
return parse(reader, path.toAbsolutePath().toString(), entryHandler);
|
||||
return parse(reader, path.toAbsolutePath().toString(), caseProcessingMode, entryHandler);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,7 +148,23 @@ public final class StemmerDictionaryParser {
|
||||
*/
|
||||
public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return parse(Path.of(fileName), entryHandler);
|
||||
return parse(Path.of(fileName), CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary file from a path string.
|
||||
*
|
||||
* @param fileName dictionary file name or path string
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading fails
|
||||
*/
|
||||
public static ParseStatistics parse(final String fileName, final CaseProcessingMode caseProcessingMode,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return parse(Path.of(fileName), caseProcessingMode, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -147,8 +179,25 @@ public final class StemmerDictionaryParser {
|
||||
*/
|
||||
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||
final EntryHandler entryHandler) throws IOException {
|
||||
return parse(reader, sourceDescription, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT, entryHandler);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parses a dictionary from a reader.
|
||||
*
|
||||
* @param reader source reader
|
||||
* @param sourceDescription logical source description for diagnostics
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @param entryHandler handler receiving parsed entries
|
||||
* @return parsing statistics
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if reading or handler processing fails
|
||||
*/
|
||||
public static ParseStatistics parse(final Reader reader, final String sourceDescription,
|
||||
final CaseProcessingMode caseProcessingMode, final EntryHandler entryHandler) throws IOException {
|
||||
Objects.requireNonNull(reader, "reader");
|
||||
Objects.requireNonNull(sourceDescription, "sourceDescription");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
Objects.requireNonNull(entryHandler, "entryHandler");
|
||||
|
||||
final BufferedReader bufferedReader = reader instanceof BufferedReader ? (BufferedReader) reader
|
||||
@@ -161,7 +210,7 @@ public final class StemmerDictionaryParser {
|
||||
for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
|
||||
lineNumber++;
|
||||
|
||||
final String normalizedLine = stripRemark(line).trim().toLowerCase(Locale.ROOT);
|
||||
final String normalizedLine = normalizeLineCase(stripRemark(line).trim(), caseProcessingMode);
|
||||
if (normalizedLine.isEmpty()) {
|
||||
ignoredLineCount++;
|
||||
continue;
|
||||
@@ -226,6 +275,20 @@ public final class StemmerDictionaryParser {
|
||||
return statistics;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies case normalization to one line according to the selected mode.
|
||||
*
|
||||
* @param line line to normalize
|
||||
* @param caseProcessingMode case processing mode
|
||||
* @return normalized line
|
||||
*/
|
||||
private static String normalizeLineCase(final String line, final CaseProcessingMode caseProcessingMode) {
|
||||
if (caseProcessingMode == CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT) {
|
||||
return line.toLowerCase(Locale.ROOT);
|
||||
}
|
||||
return line;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether one dictionary item contains any Unicode whitespace
|
||||
* character.
|
||||
|
||||
@@ -283,7 +283,8 @@ public final class StemmerPatchTrieLoader {
|
||||
try (InputStream inputStream = openBundledResource(resourcePath);
|
||||
BufferedReader reader = new BufferedReader(
|
||||
new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language));
|
||||
return load(reader, resourcePath, storeOriginal, reductionSettings, traversalDirectionOf(language),
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -318,7 +319,8 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings) throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD);
|
||||
return load(path, storeOriginal, reductionSettings, WordTraversalDirection.BACKWARD,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -338,14 +340,37 @@ public final class StemmerPatchTrieLoader {
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
return load(path, storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path using explicit reduction settings,
|
||||
* explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* @param path path to the dictionary file
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
Objects.requireNonNull(path, "path");
|
||||
Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
|
||||
try (InputStream inputStream = openDictionaryInputStream(path);
|
||||
BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) {
|
||||
return load(reader, path.toAbsolutePath().toString(), storeOriginal, reductionSettings,
|
||||
traversalDirection);
|
||||
traversalDirection, caseProcessingMode);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -403,7 +428,30 @@ public final class StemmerPatchTrieLoader {
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection)
|
||||
throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection);
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads a dictionary from a filesystem path string using explicit reduction
|
||||
* settings, explicit traversal direction, and explicit case processing mode.
|
||||
*
|
||||
* @param fileName file name or path string
|
||||
* @param storeOriginal whether the stem itself should be inserted using the
|
||||
* canonical no-op patch command
|
||||
* @param reductionSettings reduction settings
|
||||
* @param traversalDirection traversal direction used for both trie keys and
|
||||
* patch commands
|
||||
* @param caseProcessingMode case processing mode used during dictionary parsing
|
||||
* @return compiled patch-command trie
|
||||
* @throws NullPointerException if any argument is {@code null}
|
||||
* @throws IOException if the file cannot be opened or read
|
||||
*/
|
||||
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
|
||||
final ReductionSettings reductionSettings, final WordTraversalDirection traversalDirection,
|
||||
final CaseProcessingMode caseProcessingMode) throws IOException {
|
||||
Objects.requireNonNull(fileName, "fileName");
|
||||
return load(Path.of(fileName), storeOriginal, reductionSettings, traversalDirection, caseProcessingMode);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -437,14 +485,15 @@ public final class StemmerPatchTrieLoader {
|
||||
*/
|
||||
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
|
||||
final boolean storeOriginal, final ReductionSettings reductionSettings,
|
||||
final WordTraversalDirection traversalDirection) throws IOException {
|
||||
final WordTraversalDirection traversalDirection, final CaseProcessingMode caseProcessingMode)
|
||||
throws IOException {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new, reductionSettings,
|
||||
traversalDirection);
|
||||
traversalDirection, caseProcessingMode);
|
||||
final PatchCommandEncoder patchCommandEncoder = new PatchCommandEncoder(traversalDirection);
|
||||
final int[] insertedMappings = new int[1];
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(reader,
|
||||
sourceDescription, (stem, variants, lineNumber) -> {
|
||||
sourceDescription, caseProcessingMode, (stem, variants, lineNumber) -> {
|
||||
if (storeOriginal) {
|
||||
builder.put(stem, NOOP_PATCH_COMMAND);
|
||||
insertedMappings[0]++;
|
||||
|
||||
@@ -54,9 +54,12 @@ import java.util.Objects;
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy associated with
|
||||
* the artifact
|
||||
* @param caseProcessingMode case processing strategy associated with the
|
||||
* artifact
|
||||
*/
|
||||
public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDirection,
|
||||
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode) {
|
||||
ReductionSettings reductionSettings, DiacriticProcessingMode diacriticProcessingMode,
|
||||
CaseProcessingMode caseProcessingMode) {
|
||||
|
||||
/**
|
||||
* Creates a new metadata instance.
|
||||
@@ -66,9 +69,11 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
* @param traversalDirection logical key traversal direction
|
||||
* @param reductionSettings reduction settings used during compilation
|
||||
* @param diacriticProcessingMode diacritic processing strategy
|
||||
* @param caseProcessingMode case processing strategy
|
||||
*/
|
||||
public TrieMetadata(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode) {
|
||||
final ReductionSettings reductionSettings, final DiacriticProcessingMode diacriticProcessingMode,
|
||||
final CaseProcessingMode caseProcessingMode) {
|
||||
if (formatVersion < 1) { // NOPMD
|
||||
throw new IllegalArgumentException("formatVersion must be at least 1.");
|
||||
}
|
||||
@@ -76,6 +81,7 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
this.traversalDirection = Objects.requireNonNull(traversalDirection, "traversalDirection");
|
||||
this.reductionSettings = Objects.requireNonNull(reductionSettings, "reductionSettings");
|
||||
this.diacriticProcessingMode = Objects.requireNonNull(diacriticProcessingMode, "diacriticProcessingMode");
|
||||
this.caseProcessingMode = Objects.requireNonNull(caseProcessingMode, "caseProcessingMode");
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -89,7 +95,8 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
*/
|
||||
public static TrieMetadata current(final int formatVersion, final WordTraversalDirection traversalDirection,
|
||||
final ReductionSettings reductionSettings) {
|
||||
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS);
|
||||
return new TrieMetadata(formatVersion, traversalDirection, reductionSettings, DiacriticProcessingMode.AS_IS,
|
||||
CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -104,6 +111,6 @@ public record TrieMetadata(int formatVersion, WordTraversalDirection traversalDi
|
||||
public static TrieMetadata legacy(final int formatVersion, final WordTraversalDirection traversalDirection) {
|
||||
return new TrieMetadata(formatVersion, traversalDirection,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
DiacriticProcessingMode.AS_IS);
|
||||
DiacriticProcessingMode.AS_IS, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,7 +60,9 @@
|
||||
* non-empty logical line starts with a canonical stem followed by known surface
|
||||
* variants in subsequent tab-separated columns.
|
||||
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
|
||||
* which normalizes input to lower case using {@link java.util.Locale#ROOT},
|
||||
* which applies configurable case processing through
|
||||
* {@link org.egothor.stemmer.CaseProcessingMode} (default:
|
||||
* {@link org.egothor.stemmer.CaseProcessingMode#LOWERCASE_WITH_LOCALE_ROOT}),
|
||||
* supports whole-line as well as trailing remarks introduced by {@code #} or
|
||||
* {@code //}, and currently ignores dictionary items containing Unicode
|
||||
* whitespace characters while reporting them through warning-level diagnostics.
|
||||
|
||||
@@ -201,6 +201,43 @@ class FrequencyTrieTest {
|
||||
() -> assertArrayEquals(new String[] { "noun", "agent" }, trie.getAll("runner")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup-time key normalization follows persisted case processing
|
||||
* metadata.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Lookup applies lowercase normalization when metadata requires it")
|
||||
void lookupAppliesLowercaseNormalizationWhenMetadataRequiresIt() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.LOWERCASE_WITH_LOCALE_ROOT);
|
||||
builder.put("house", "noun");
|
||||
builder.put("house", "verb");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("HOUSE")),
|
||||
() -> assertArrayEquals(new String[] { "noun", "verb" }, trie.getAll("HoUsE")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that lookup preserves casing when metadata uses AS_IS mode.
|
||||
*/
|
||||
@Test
|
||||
@DisplayName("Lookup keeps case-sensitive behavior when metadata is AS_IS")
|
||||
void lookupKeepsCaseSensitiveBehaviorWhenMetadataIsAsIs() {
|
||||
final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<>(String[]::new,
|
||||
ReductionSettings.withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS),
|
||||
WordTraversalDirection.BACKWARD, CaseProcessingMode.AS_IS);
|
||||
builder.put("House", "noun");
|
||||
|
||||
final FrequencyTrie<String> trie = builder.build();
|
||||
|
||||
assertAll(() -> assertEquals("noun", trie.get("House")), () -> assertNull(trie.get("house")),
|
||||
() -> assertArrayEquals(new String[] { "noun" }, trie.getAll("House")),
|
||||
() -> assertArrayEquals(new String[0], trie.getAll("HOUSE")));
|
||||
}
|
||||
|
||||
/**
|
||||
* Verifies that a missing path below an existing prefix returns empty results.
|
||||
*/
|
||||
|
||||
@@ -64,7 +64,7 @@ import org.junit.jupiter.api.io.TempDir;
|
||||
* </p>
|
||||
* <ul>
|
||||
* <li>parsing through all public overloads,</li>
|
||||
* <li>normalization to lower case,</li>
|
||||
* <li>case processing according to the selected mode,</li>
|
||||
* <li>handling of empty lines and remarks,</li>
|
||||
* <li>correct entry emission including line numbers,</li>
|
||||
* <li>propagation of I/O failures from the handler and file system,</li>
|
||||
@@ -280,6 +280,22 @@ class StemmerDictionaryParserTest {
|
||||
assertEquals(expected, exception, "The original exception instance should be preserved.");
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should preserve character case when AS_IS mode is selected")
|
||||
void shouldPreserveCharacterCaseWhenAsIsModeIsSelected() throws IOException {
|
||||
final String input = "Root\tRunning\tRuns\tRUNNER\n";
|
||||
final List<CapturedEntry> entries = new ArrayList<CapturedEntry>();
|
||||
|
||||
final StemmerDictionaryParser.ParseStatistics statistics = StemmerDictionaryParser.parse(
|
||||
new StringReader(input), "case-as-is", CaseProcessingMode.AS_IS, collectingHandler(entries));
|
||||
|
||||
assertAll("Statistics", () -> assertEquals(1, statistics.lineCount()),
|
||||
() -> assertEquals(1, statistics.entryCount()), () -> assertEquals(0, statistics.ignoredLineCount()));
|
||||
assertEquals(1, entries.size(), "Exactly one entry should be emitted.");
|
||||
assertAll("Entry", () -> assertEquals("Root", entries.get(0).stem()),
|
||||
() -> assertArrayEquals(new String[] { "Running", "Runs", "RUNNER" }, entries.get(0).variants()));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null reader")
|
||||
void shouldRejectNullReader() {
|
||||
@@ -298,6 +314,15 @@ class StemmerDictionaryParserTest {
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null case processing mode")
|
||||
void shouldRejectNullCaseProcessingMode() {
|
||||
assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(new StringReader("a b"),
|
||||
"source", null, (stem, variants, lineNumber) -> {
|
||||
// no-op
|
||||
}));
|
||||
}
|
||||
|
||||
@Test
|
||||
@DisplayName("should reject null entry handler")
|
||||
void shouldRejectNullEntryHandler() {
|
||||
|
||||
Reference in New Issue
Block a user