Refine stemmer core, compiled trie workflow, tests, and public documentation

feat: implement Compile CLI for building binary stemmer tables from source dictionaries
feat: add loading support for persisted compiled tries, including GZip-compressed binaries
feat: add a builder path for recreating a writable trie from a compiled trie
feat: expose read-only value/count access for compiled trie entries
feat: support deterministic NOOP patch encoding for identical source and target words

fix: make value selection deterministic for equal frequencies using length and lexical tie-breakers
fix: preserve valid alternative reductions during trie optimization and reduction
fix: correct patch command edge cases discovered in round-trip and malformed-input tests
fix: address persistence and compiled-trie handling defects found during implementation review
fix: resolve test failures and behavioral regressions uncovered by PMD and JUnit runs

refactor: reorganize trie-related support types into dedicated packages and classes
refactor: simplify the core FrequencyTrie design toward a cleaner practical architecture
refactor: improve compiled/read-only trie boundaries without restoring mutability
refactor: clean up internal reduction, serialization, and helper structure

test: add professional JUnit coverage for stemmer core classes
test: split trie tests into dedicated test classes per production type
test: improve parameterized tests for readability, diagnostics, and edge-case traceability
test: cover positive, negative, malformed, persistence, and round-trip scenarios
test: verify compiled dictionaries against source inputs using getAll semantics

docs: write public README and supplementary Markdown documentation for project publishing
docs: document architecture, reduction model, built-in languages, and operational guidance
docs: clarify reverse-word storage, mutable construction, and compiled-trie runtime behavior
docs: remove placeholders, vague buzzwords, and unexplained terminology from the documentation
docs: improve examples and wording for professional reader-facing project guidance

chore: align project materials with the practical Radix scope and Egothor/Stempel lineage
chore: raise overall project quality through documentation review and test hardening
This commit is contained in:
2026-04-13 02:10:46 +02:00
parent 15248c92c9
commit 038514bad0
64 changed files with 190190 additions and 20 deletions

View File

@@ -1,25 +1,25 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
@@ -34,20 +34,339 @@
******************************************************************************/
package org.egothor.stemmer;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* Command-line compiler of stemmer dictionary files into compressed binary
* {@link FrequencyTrie} artifacts.
*
* <p>
* The CLI reads an input file in the same syntax as the project's stemmer
* resource files, compiles it into a read-only {@link FrequencyTrie} of patch
* commands, applies the selected subtree reduction strategy, and writes the
* resulting trie in the project binary format under GZip compression.
*
* <p>
* Remarks introduced by {@code #} or {@code //} are supported through
* {@link StemmerDictionaryParser}.
*
* <p>
* Supported arguments:
* </p>
*
* <pre>
* --input &lt;file&gt;
* --output &lt;file&gt;
* --reduction-mode &lt;mode&gt;
* [--store-original]
* [--dominant-winner-min-percent &lt;1..100&gt;]
* [--dominant-winner-over-second-ratio &lt;1..n&gt;]
* [--overwrite]
* [--help]
* </pre>
*/
public class Compile {
private static final Logger LOG = Logger.getLogger(Compile.class.getName());
public final class Compile {
/**
* @param args
* Logger of this class.
*/
public static void main(String[] args) {
LOG.log(Level.FINE, "execute", args);
private static final Logger LOGGER = Logger.getLogger(Compile.class.getName());
/**
* Exit status indicating success.
*/
private static final int EXIT_SUCCESS = 0;
/**
* Exit status indicating invalid command-line usage.
*/
private static final int EXIT_USAGE_ERROR = 2;
/**
* Exit status indicating processing failure.
*/
private static final int EXIT_PROCESSING_ERROR = 1;
/**
* Utility class.
*/
private Compile() {
throw new AssertionError("No instances.");
}
/**
 * Launches the compiler from the command line.
 *
 * <p>
 * Delegates to {@link #run(String...)}. The JVM is terminated through
 * {@link System#exit(int)} only when the returned status differs from the
 * success status; a successful run simply returns.
 *
 * @param arguments command-line arguments
 */
public static void main(final String[] arguments) {
    final int status = run(arguments);
    if (status == EXIT_SUCCESS) {
        return;
    }
    System.exit(status);
}
/**
 * Executes the CLI and maps the outcome to a process exit code.
 *
 * <p>
 * A help request prints the usage text and is reported as success. An
 * {@link IllegalArgumentException} raised while parsing or compiling is
 * reported on the standard error stream together with the usage text and
 * yields the usage-error status. An {@link IOException} is logged at
 * {@link Level#SEVERE} together with its cause, reported on the standard error
 * stream, and yields the processing-error status.
 *
 * @param arguments command-line arguments
 * @return process exit code
 */
/* default */ static int run(final String... arguments) {
    try {
        final Arguments parsedArguments = Arguments.parse(arguments);
        if (parsedArguments.help()) {
            printUsage();
            return EXIT_SUCCESS;
        }
        compile(parsedArguments);
        return EXIT_SUCCESS;
    } catch (IllegalArgumentException exception) {
        System.err.println(exception.getMessage());
        System.err.println();
        printUsage();
        return EXIT_USAGE_ERROR;
    } catch (IOException exception) {
        if (LOGGER.isLoggable(Level.SEVERE)) {
            // Logger offers no (level, message, parameters, throwable) overload, so a
            // LogRecord is built by hand to keep both the parameters and the cause.
            final java.util.logging.LogRecord failure = new java.util.logging.LogRecord(Level.SEVERE,
                    "CLI compilation failed for input {0} and output {1}.");
            failure.setLoggerName(LOGGER.getName());
            failure.setParameters(new Object[] { safeInput(arguments), safeOutput(arguments) });
            failure.setThrown(exception);
            LOGGER.log(failure);
        }
        System.err.println("Compilation failed: " + exception.getMessage());
        return EXIT_PROCESSING_ERROR;
    }
}
/**
 * Compiles the input dictionary and writes the compressed binary trie.
 *
 * <p>
 * The check for an already existing output file runs before the dictionary is
 * loaded, so a refused overwrite is reported without first paying for the
 * compilation. Missing parent directories of the output file are created
 * before writing.
 *
 * @param arguments parsed command-line arguments
 * @throws IOException if the output file already exists and overwriting was
 *                     not requested, or if compilation or output writing
 *                     fails
 */
private static void compile(final Arguments arguments) throws IOException {
    // Built first so that invalid thresholds are still reported before any
    // file-system state is inspected.
    final ReductionSettings reductionSettings = new ReductionSettings(arguments.reductionMode(),
            arguments.dominantWinnerMinPercent(), arguments.dominantWinnerOverSecondRatio());

    final Path outputFile = arguments.outputFile();
    // Fail fast: loading and reducing the dictionary is the expensive step.
    if (Files.exists(outputFile) && !arguments.overwrite()) {
        throw new IOException("Output file already exists: " + outputFile.toAbsolutePath());
    }

    final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(arguments.inputFile(), arguments.storeOriginal(),
            reductionSettings);

    final Path parent = outputFile.toAbsolutePath().getParent();
    if (parent != null) {
        Files.createDirectories(parent);
    }
    StemmerPatchTrieBinaryIO.write(trie, outputFile);

    if (LOGGER.isLoggable(Level.INFO)) {
        LOGGER.log(Level.INFO,
                "Compiled dictionary {0} to {1} using mode {2}, storeOriginal={3}, dominantWinnerMinPercent={4}, dominantWinnerOverSecondRatio={5}.",
                new Object[] { arguments.inputFile().toAbsolutePath().toString(),
                        arguments.outputFile().toAbsolutePath().toString(), arguments.reductionMode().name(),
                        arguments.storeOriginal(), arguments.dominantWinnerMinPercent(),
                        arguments.dominantWinnerOverSecondRatio() });
    }
}
/**
 * Prints CLI usage help to the standard error stream.
 *
 * <p>
 * The text lists every option accepted by the argument parser, including
 * {@code --help}, followed by the names of all {@link ReductionMode} constants.
 */
private static void printUsage() {
    System.err.println("Usage:");
    System.err.println(" java org.egothor.stemmer.Compile \\");
    System.err.println(" --input <file> \\");
    System.err.println(" --output <file> \\");
    System.err.println(" --reduction-mode <mode> \\");
    System.err.println(" [--store-original] \\");
    System.err.println(" [--dominant-winner-min-percent <1..100>] \\");
    System.err.println(" [--dominant-winner-over-second-ratio <1..n>] \\");
    System.err.println(" [--overwrite] \\");
    // --help (alias -h) is accepted by the parser, so it is advertised here too.
    System.err.println(" [--help]");
    System.err.println();
    System.err.println("Supported reduction modes:");
    for (ReductionMode mode : ReductionMode.values()) {
        System.err.println(" " + mode.name());
    }
}
/**
 * Extracts the {@code --input} value from the raw arguments for diagnostics.
 *
 * @param arguments raw command-line arguments
 * @return value following {@code --input}, otherwise {@code "<unknown>"}
 */
private static String safeInput(final String... arguments) {
    final String inputOption = "--input";
    return safeOptionValue(arguments, inputOption);
}
/**
 * Extracts the {@code --output} value from the raw arguments for diagnostics.
 *
 * @param arguments raw command-line arguments
 * @return value following {@code --output}, otherwise {@code "<unknown>"}
 */
private static String safeOutput(final String... arguments) {
    final String outputOption = "--output";
    return safeOptionValue(arguments, outputOption);
}
/**
 * Looks up the element that directly follows the first occurrence of an option
 * in the raw argument array.
 *
 * <p>
 * The lookup never fails: a {@code null} array, an absent option, or an option
 * placed as the very last element all produce the placeholder.
 *
 * @param arguments raw command-line arguments
 * @param option    option name
 * @return element following the option, otherwise {@code "<unknown>"}
 */
private static String safeOptionValue(final String[] arguments, final String option) {
    final String fallback = "<unknown>";
    if (arguments != null) {
        // Start at 1 so that every inspected option has a successor element.
        for (int position = 1; position < arguments.length; position++) {
            if (option.equals(arguments[position - 1])) {
                return arguments[position];
            }
        }
    }
    return fallback;
}
/**
 * Immutable parsed CLI arguments.
 *
 * @param inputFile                     input dictionary file
 * @param outputFile                    output compressed trie file
 * @param reductionMode                 subtree reduction mode
 * @param storeOriginal                 whether original stems are stored
 * @param dominantWinnerMinPercent      dominant winner minimum percent
 * @param dominantWinnerOverSecondRatio dominant winner over second ratio
 * @param overwrite                     whether an existing output may be
 *                                      replaced
 * @param help                          whether usage help was requested
 */
@SuppressWarnings("PMD.LongVariable")
private record Arguments(Path inputFile, Path outputFile, ReductionMode reductionMode, boolean storeOriginal,
        int dominantWinnerMinPercent, int dominantWinnerOverSecondRatio, boolean overwrite, boolean help) {

    /**
     * Parses raw command-line arguments.
     *
     * <p>
     * Options are processed left to right; a later occurrence of a valued
     * option replaces an earlier one. When help is requested, the required
     * options are not enforced and may be {@code null} in the result.
     *
     * @param arguments raw command-line arguments
     * @return parsed arguments
     * @throws NullPointerException     if {@code arguments} is {@code null}
     * @throws IllegalArgumentException if an option is unknown, lacks its
     *                                  value, carries an invalid value, or a
     *                                  required option is missing
     */
    @SuppressWarnings({ "PMD.AvoidReassigningLoopVariables", "PMD.CyclomaticComplexity" })
    private static Arguments parse(final String... arguments) {
        Objects.requireNonNull(arguments, "arguments");

        Path inputFile = null;
        Path outputFile = null;
        ReductionMode reductionMode = null;
        boolean storeOriginal = false;
        boolean overwrite = false;
        boolean help = false;
        int dominantWinnerMinPercent = ReductionSettings.DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
        int dominantWinnerOverSecondRatio = ReductionSettings.DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;

        for (int index = 0; index < arguments.length; index++) {
            final String argument = arguments[index];
            switch (argument) {
                case "--help":
                case "-h":
                    help = true;
                    break;
                case "--store-original":
                    storeOriginal = true;
                    break;
                case "--overwrite":
                    overwrite = true;
                    break;
                case "--input":
                    // ++index consumes the option value so the loop skips it.
                    inputFile = Path.of(requireValue(arguments, ++index, "--input"));
                    break;
                case "--output":
                    outputFile = Path.of(requireValue(arguments, ++index, "--output"));
                    break;
                case "--reduction-mode":
                    reductionMode = parseReductionMode(requireValue(arguments, ++index, "--reduction-mode"));
                    break;
                case "--dominant-winner-min-percent":
                    dominantWinnerMinPercent = parseInteger(
                            requireValue(arguments, ++index, "--dominant-winner-min-percent"),
                            "--dominant-winner-min-percent");
                    break;
                case "--dominant-winner-over-second-ratio":
                    dominantWinnerOverSecondRatio = parseInteger(
                            requireValue(arguments, ++index, "--dominant-winner-over-second-ratio"),
                            "--dominant-winner-over-second-ratio");
                    break;
                default:
                    throw new IllegalArgumentException("Unknown argument: " + argument);
            }
        }

        if (help) {
            // Help short-circuits validation of the required options.
            return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
                    dominantWinnerOverSecondRatio, overwrite, true);
        }
        if (inputFile == null) {
            throw new IllegalArgumentException("Missing required argument --input.");
        }
        if (outputFile == null) {
            throw new IllegalArgumentException("Missing required argument --output.");
        }
        if (reductionMode == null) {
            throw new IllegalArgumentException("Missing required argument --reduction-mode.");
        }
        return new Arguments(inputFile, outputFile, reductionMode, storeOriginal, dominantWinnerMinPercent,
                dominantWinnerOverSecondRatio, overwrite, false);
    }

    /**
     * Returns the required value of an option.
     *
     * @param arguments raw arguments
     * @param index     value index
     * @param option    option name
     * @return option value
     * @throws IllegalArgumentException if {@code index} lies beyond the last
     *                                  argument
     */
    private static String requireValue(final String[] arguments, final int index, final String option) {
        if (index >= arguments.length) {
            throw new IllegalArgumentException("Missing value for " + option + ".");
        }
        return arguments[index];
    }

    /**
     * Parses a reduction mode name case-insensitively.
     *
     * <p>
     * The raw {@link Enum#valueOf(Class, String)} failure message exposes the
     * fully qualified enum class name, which is of no use to a CLI user; it is
     * replaced by a message naming the option and the offending value while the
     * original exception is kept as the cause.
     *
     * @param value raw value
     * @return parsed reduction mode
     * @throws IllegalArgumentException if the value names no reduction mode
     */
    private static ReductionMode parseReductionMode(final String value) {
        try {
            return ReductionMode.valueOf(value.toUpperCase(Locale.ROOT));
        } catch (IllegalArgumentException exception) {
            throw new IllegalArgumentException("Invalid value for --reduction-mode: " + value, exception);
        }
    }

    /**
     * Parses an integer option value.
     *
     * @param value      raw value
     * @param optionName option name
     * @return parsed integer
     * @throws IllegalArgumentException if the value is not a valid integer
     */
    private static int parseInteger(final String value, final String optionName) {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException exception) {
            throw new IllegalArgumentException("Invalid integer for " + optionName + ": " + value, exception);
        }
    }
}
}

View File

@@ -0,0 +1,784 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.IntFunction;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.egothor.stemmer.trie.CompiledNode;
import org.egothor.stemmer.trie.LocalValueSummary;
import org.egothor.stemmer.trie.MutableNode;
import org.egothor.stemmer.trie.NodeData;
import org.egothor.stemmer.trie.ReducedNode;
import org.egothor.stemmer.trie.ReductionContext;
import org.egothor.stemmer.trie.ReductionSignature;
/**
* Read-only trie mapping {@link String} keys to one or more values with
* frequency tracking.
*
* <p>
* A key may be associated with multiple values. Each value keeps the number of
* times it was inserted during the build phase. The method {@link #get(String)}
* returns the locally most frequent value stored at the terminal node of the
* supplied key, while {@link #getAll(String)} returns all locally stored values
* ordered by descending frequency.
*
* <p>
* If multiple values have the same local frequency, their ordering is
* deterministic. The preferred value is selected by the following tie-breaking
* rules, in order:
* <ol>
* <li>shorter {@link String} representation wins, based on
* {@code value.toString()}</li>
* <li>if the lengths are equal, lexicographically lower {@link String}
* representation wins</li>
* <li>if the textual representations are still equal, first-seen insertion
* order remains stable</li>
* </ol>
*
* <p>
* Values may be stored at any trie node, including internal nodes and leaf
* nodes. Therefore, reduction and canonicalization always operate on both the
* node-local terminal values and the structure of all descendant edges.
*
* @param <V> value type
*/
public final class FrequencyTrie<V> {
/**
* Logger of this class.
*/
private static final Logger LOGGER = Logger.getLogger(FrequencyTrie.class.getName());
/**
* Binary format magic header.
*/
private static final int STREAM_MAGIC = 0x45475452;
/**
* Binary format version.
*/
private static final int STREAM_VERSION = 1;
/**
* Factory used to create correctly typed arrays for {@link #getAll(String)}.
*/
private final IntFunction<V[]> arrayFactory;
/**
* Root node of the compiled read-only trie.
*/
private final CompiledNode<V> root;
/**
 * Wraps an already compiled root node into a read-only trie.
 *
 * @param arrayFactory factory producing typed result arrays
 * @param root         compiled root node
 * @throws NullPointerException if any argument is {@code null}
 */
private FrequencyTrie(final IntFunction<V[]> arrayFactory, final CompiledNode<V> root) {
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(root, "root");
    this.arrayFactory = arrayFactory;
    this.root = root;
}
/**
 * Resolves the preferred value stored at the node addressed by the key.
 *
 * <p>
 * The preferred value is the first element of the node's ordered values, that
 * is the locally most frequent one. Ties between equally frequent values are
 * resolved deterministically: shorter {@code toString()} first, then
 * lexicographically lower {@code toString()}, then stable first-seen order.
 *
 * @param key key to resolve
 * @return preferred value, or {@code null} if the key does not exist or the
 *         addressed node stores no value
 * @throws NullPointerException if {@code key} is {@code null}
 */
public V get(final String key) {
    Objects.requireNonNull(key, "key");
    final CompiledNode<V> target = findNode(key);
    if (target == null) {
        return null;
    }
    final V[] candidates = target.orderedValues();
    return candidates.length == 0 ? null : candidates[0];
}
/**
 * Resolves every value stored at the node addressed by the key, most frequent
 * first.
 *
 * <p>
 * Equally frequent values are ordered deterministically: shorter
 * {@code toString()} first, then lexicographically lower {@code toString()},
 * then stable first-seen order.
 *
 * <p>
 * The caller receives a copy and may modify it freely.
 *
 * @param key key to resolve
 * @return copy of the ordered values; an empty array created by the array
 *         factory if the key does not exist or the addressed node stores no
 *         value
 * @throws NullPointerException if {@code key} is {@code null}
 */
public V[] getAll(final String key) {
    Objects.requireNonNull(key, "key");
    final CompiledNode<V> target = findNode(key);
    if (target != null) {
        final V[] stored = target.orderedValues();
        if (stored.length != 0) {
            return stored.clone();
        }
    }
    return this.arrayFactory.apply(0);
}
/**
 * Resolves every value stored at the node addressed by the key together with
 * its occurrence count.
 *
 * <p>
 * Entry {@code i} pairs element {@code i} of the node's ordered values with
 * element {@code i} of its ordered counts, so the list follows the same order
 * as {@link #getAll(String)}.
 *
 * <p>
 * The returned list cannot be modified.
 *
 * <p>
 * In reduction modes that merge semantically equivalent subtrees, the counts
 * may be aggregated across several build-time nodes that were reduced into the
 * same canonical compiled node.
 *
 * @param key key to resolve
 * @return unmodifiable ordered list of value-count entries; an empty list if
 *         the key does not exist or the addressed node stores no value
 * @throws NullPointerException if {@code key} is {@code null}
 */
public List<ValueCount<V>> getEntries(final String key) {
    Objects.requireNonNull(key, "key");
    final CompiledNode<V> target = findNode(key);
    if (target == null) {
        return List.of();
    }
    final V[] values = target.orderedValues();
    if (values.length == 0) {
        return List.of();
    }
    final int[] counts = target.orderedCounts();
    final List<ValueCount<V>> pairs = new ArrayList<>(values.length);
    for (int position = 0; position < values.length; position++) {
        pairs.add(new ValueCount<>(values[position], counts[position]));
    }
    return Collections.unmodifiableList(pairs);
}
/**
 * Exposes the compiled root node to code in the same package, mainly for
 * diagnostics and tests.
 *
 * @return compiled root node
 */
/* default */ CompiledNode<V> root() {
    final CompiledNode<V> compiledRoot = this.root;
    return compiledRoot;
}
/**
 * Serializes this compiled trie to the supplied output stream.
 *
 * <p>
 * The stream starts with the magic header, the format version, the number of
 * canonical nodes and the identifier of the root node, followed by the nodes in
 * identifier order. Nodes are identified by object identity, so a compiled node
 * shared by several parents is written only once.
 *
 * <p>
 * Individual values of type {@code V} are persisted by the supplied codec. The
 * stream is flushed but not closed.
 *
 * @param outputStream target output stream
 * @param valueCodec   codec used to write values
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if writing fails
 */
public void writeTo(final OutputStream outputStream, final ValueStreamCodec<V> valueCodec) throws IOException {
    Objects.requireNonNull(outputStream, "outputStream");
    Objects.requireNonNull(valueCodec, "valueCodec");

    // Reuse a caller-supplied DataOutputStream instead of wrapping it again.
    final DataOutputStream sink = outputStream instanceof DataOutputStream
            ? (DataOutputStream) outputStream
            : new DataOutputStream(outputStream); // NOPMD

    final Map<CompiledNode<V>, Integer> idsByNode = new IdentityHashMap<>();
    final List<CompiledNode<V>> nodesById = new ArrayList<>();
    assignNodeIds(this.root, idsByNode, nodesById);

    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.log(Level.FINE, "Writing compiled trie with {0} canonical nodes.", nodesById.size());
    }

    sink.writeInt(STREAM_MAGIC);
    sink.writeInt(STREAM_VERSION);
    sink.writeInt(nodesById.size());
    sink.writeInt(idsByNode.get(this.root));
    for (final CompiledNode<V> canonicalNode : nodesById) {
        writeNode(sink, valueCodec, canonicalNode, idsByNode);
    }
    sink.flush();
}
/**
 * Deserializes a compiled trie from the supplied input stream.
 *
 * <p>
 * The header is validated field by field in stream order: magic header, format
 * version, node count and root node identifier. Only then are the nodes
 * themselves read.
 *
 * <p>
 * The caller must supply a value codec compatible with the one used for
 * writing, and the array factory needed for typed result arrays.
 *
 * @param inputStream  source input stream
 * @param arrayFactory factory used to create typed arrays
 * @param valueCodec   codec used to read values
 * @param <V>          value type
 * @return deserialized compiled trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if reading fails or the binary format is invalid
 */
public static <V> FrequencyTrie<V> readFrom(final InputStream inputStream, final IntFunction<V[]> arrayFactory,
        final ValueStreamCodec<V> valueCodec) throws IOException {
    Objects.requireNonNull(inputStream, "inputStream");
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(valueCodec, "valueCodec");

    // Reuse a caller-supplied DataInputStream instead of wrapping it again.
    final DataInputStream source = inputStream instanceof DataInputStream
            ? (DataInputStream) inputStream
            : new DataInputStream(inputStream); // NOPMD

    final int header = source.readInt();
    if (header != STREAM_MAGIC) {
        throw new IOException("Unsupported trie stream header: " + Integer.toHexString(header));
    }

    final int formatVersion = source.readInt();
    if (formatVersion != STREAM_VERSION) {
        throw new IOException("Unsupported trie stream version: " + formatVersion);
    }

    final int nodeCount = source.readInt();
    if (nodeCount < 0) {
        throw new IOException("Negative node count: " + nodeCount);
    }

    final int rootNodeId = source.readInt();
    final boolean rootInRange = rootNodeId >= 0 && rootNodeId < nodeCount;
    if (!rootInRange) {
        throw new IOException("Invalid root node id: " + rootNodeId);
    }

    final CompiledNode<V>[] allNodes = readNodes(source, arrayFactory, valueCodec, nodeCount);

    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.log(Level.FINE, "Read compiled trie with {0} canonical nodes.", nodeCount);
    }
    return new FrequencyTrie<>(arrayFactory, allNodes[rootNodeId]);
}
/**
 * Counts the canonical compiled nodes reachable from the root.
 *
 * <p>
 * Nodes are distinguished by object identity, so a compiled subtree shared by
 * several parents contributes only once. The traversal is repeated on every
 * call.
 *
 * @return number of canonical compiled nodes in this trie
 */
public int size() {
    final List<CompiledNode<V>> reachable = new ArrayList<>();
    final Map<CompiledNode<V>, Integer> visited = new IdentityHashMap<>();
    assignNodeIds(this.root, visited, reachable);
    return reachable.size();
}
/**
 * Numbers every compiled node reachable from the supplied node in depth-first
 * pre-order.
 *
 * <p>
 * A node receives the next free identifier the first time it is visited and is
 * appended to {@code orderedNodes} at that index. A node that already has an
 * identifier is not descended into again.
 *
 * @param node         current node
 * @param nodeIds      assigned node identifiers
 * @param orderedNodes ordered nodes in identifier order
 * @param <V>          value type
 */
private static <V> void assignNodeIds(final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds,
        final List<CompiledNode<V>> orderedNodes) {
    // putIfAbsent leaves an existing identifier untouched and reports it.
    final Integer existingId = nodeIds.putIfAbsent(node, orderedNodes.size());
    if (existingId != null) {
        return;
    }
    orderedNodes.add(node);
    for (final CompiledNode<V> descendant : node.children()) {
        assignNodeIds(descendant, nodeIds, orderedNodes);
    }
}
/**
 * Serializes a single compiled node.
 *
 * <p>
 * Record layout: edge count, then one (label, child identifier) pair per edge,
 * then value count, then one (value, count) pair per stored value.
 *
 * @param dataOutput output
 * @param valueCodec value codec
 * @param node       node to write
 * @param nodeIds    node identifiers
 * @param <V>        value type
 * @throws IOException if writing fails or a child has no assigned identifier
 */
private static <V> void writeNode(final DataOutputStream dataOutput, final ValueStreamCodec<V> valueCodec,
        final CompiledNode<V> node, final Map<CompiledNode<V>, Integer> nodeIds) throws IOException {
    final int edgeTotal = node.edgeLabels().length;
    dataOutput.writeInt(edgeTotal);
    for (int edge = 0; edge < edgeTotal; edge++) {
        dataOutput.writeChar(node.edgeLabels()[edge]);
        final Integer targetId = nodeIds.get(node.children()[edge]);
        if (targetId == null) {
            throw new IOException("Missing child node identifier during serialization.");
        }
        dataOutput.writeInt(targetId);
    }

    final int valueTotal = node.orderedValues().length;
    dataOutput.writeInt(valueTotal);
    for (int slot = 0; slot < valueTotal; slot++) {
        valueCodec.write(dataOutput, node.orderedValues()[slot]);
        dataOutput.writeInt(node.orderedCounts()[slot]);
    }
}
/**
* Reads all compiled nodes and resolves child references.
*
* <p>
* The work is done in three passes. The first pass reads the raw record of
* every node from the stream, keeping child references as plain identifiers.
* The second pass creates one {@link CompiledNode} per record with a children
* array of the right length but no entries yet. The third pass validates each
* child identifier against {@code nodeCount} and stores the referenced node
* into the children array, which allows a node to reference any other node
* regardless of its position in the stream.
*
* @param dataInput input
* @param arrayFactory array factory
* @param valueCodec value codec
* @param nodeCount number of nodes
* @param <V> value type
* @return array of nodes indexed by serialized node identifier
* @throws IOException if reading fails or the stream is invalid
*/
@SuppressWarnings("PMD.AvoidInstantiatingObjectsInLoops")
private static <V> CompiledNode<V>[] readNodes(final DataInputStream dataInput, final IntFunction<V[]> arrayFactory,
final ValueStreamCodec<V> valueCodec, final int nodeCount) throws IOException {
// Pass 1: read the raw per-node records in stream order.
final List<NodeData<V>> nodeDataList = new ArrayList<>(nodeCount);
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final int edgeCount = dataInput.readInt();
if (edgeCount < 0) {
throw new IOException("Negative edge count at node " + nodeIndex + ": " + edgeCount);
}
final char[] edgeLabels = new char[edgeCount];
final int[] childNodeIds = new int[edgeCount];
// Each edge is stored as a label followed by the child identifier.
for (int edgeIndex = 0; edgeIndex < edgeCount; edgeIndex++) {
edgeLabels[edgeIndex] = dataInput.readChar();
childNodeIds[edgeIndex] = dataInput.readInt();
}
final int valueCount = dataInput.readInt();
if (valueCount < 0) {
throw new IOException("Negative value count at node " + nodeIndex + ": " + valueCount);
}
final V[] orderedValues = arrayFactory.apply(valueCount);
final int[] orderedCounts = new int[valueCount];
// Each stored value is followed by its count, which must be positive.
for (int valueIndex = 0; valueIndex < valueCount; valueIndex++) {
orderedValues[valueIndex] = valueCodec.read(dataInput);
orderedCounts[valueIndex] = dataInput.readInt();
if (orderedCounts[valueIndex] <= 0) {
throw new IOException("Non-positive stored count at node " + nodeIndex + ", value index "
+ valueIndex + ": " + orderedCounts[valueIndex]);
}
}
nodeDataList.add(new NodeData<>(edgeLabels, childNodeIds, orderedValues, orderedCounts));
}
// Pass 2: create every node with a correctly sized but still unfilled
// children array, so that pass 3 can link nodes in any order.
@SuppressWarnings("unchecked")
final CompiledNode<V>[] nodes = new CompiledNode[nodeCount];
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
@SuppressWarnings("unchecked")
final CompiledNode<V>[] children = new CompiledNode[nodeData.childNodeIds().length];
nodes[nodeIndex] = new CompiledNode<>(nodeData.edgeLabels(), children, nodeData.orderedValues(),
nodeData.orderedCounts());
}
// Pass 3: validate the child identifiers and link the nodes.
for (int nodeIndex = 0; nodeIndex < nodeCount; nodeIndex++) {
final NodeData<V> nodeData = nodeDataList.get(nodeIndex);
final CompiledNode<V> node = nodes[nodeIndex];
for (int edgeIndex = 0; edgeIndex < nodeData.childNodeIds().length; edgeIndex++) {
final int childNodeId = nodeData.childNodeIds()[edgeIndex];
if (childNodeId < 0 || childNodeId >= nodeCount) {
throw new IOException("Invalid child node id at node " + nodeIndex + ", edge index " + edgeIndex
+ ": " + childNodeId);
}
// Written through the array returned by children(); this relies on the
// accessor exposing the array handed to the constructor in pass 2.
node.children()[edgeIndex] = nodes[childNodeId];
}
}
return nodes;
}
/**
 * Walks from the root along the characters of the key.
 *
 * <p>
 * An empty key addresses the root itself.
 *
 * @param key key to resolve
 * @return node reached after consuming the whole key, or {@code null} if some
 *         character has no matching child
 */
private CompiledNode<V> findNode(final String key) {
    final int keyLength = key.length();
    CompiledNode<V> cursor = this.root;
    int depth = 0;
    // Stops as soon as a character has no matching child.
    while (cursor != null && depth < keyLength) {
        cursor = cursor.findChild(key.charAt(depth));
        depth++;
    }
    return cursor;
}
/**
* Builder of {@link FrequencyTrie}.
*
* <p>
* The builder is intentionally mutable and optimized for repeated
* {@link #put(String, Object)} calls. The final trie is created by
* {@link #build()}, which performs bottom-up subtree reduction and converts the
* structure to a compact immutable representation optimized for read
* operations.
*
* @param <V> value type
*/
public static final class Builder<V> {
/**
* Logger of this class.
*/
private static final Logger LOGGER = Logger.getLogger(Builder.class.getName());
/**
* Factory used to create typed arrays.
*/
private final IntFunction<V[]> arrayFactory;
/**
* Reduction configuration.
*/
private final ReductionSettings reductionSettings;
/**
* Mutable root node.
*/
private final MutableNode<V> root;
/**
 * Creates an empty builder that reduces with the supplied settings.
 *
 * @param arrayFactory      factory producing typed result arrays
 * @param reductionSettings reduction configuration
 * @throws NullPointerException if any argument is {@code null}
 */
public Builder(final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
    Objects.requireNonNull(arrayFactory, "arrayFactory");
    Objects.requireNonNull(reductionSettings, "reductionSettings");
    this.arrayFactory = arrayFactory;
    this.reductionSettings = reductionSettings;
    this.root = new MutableNode<>();
}
/**
* Creates a new builder using default thresholds for the supplied reduction
* mode.
*
* @param arrayFactory array factory
* @param reductionMode reduction mode
* @throws NullPointerException if any argument is {@code null}
*/
public Builder(final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
this(arrayFactory, ReductionSettings.withDefaults(reductionMode));
}
/**
* Stores a value for the supplied key and increments its local frequency.
*
* <p>
* Values are stored at the node addressed by the full key. Since trie values
* may also appear on internal nodes, an empty key is valid and stores a value
* directly at the root.
*
* @param key key
* @param value value
* @return this builder
* @throws NullPointerException if {@code key} or {@code value} is {@code null}
*/
public Builder<V> put(final String key, final V value) {
return put(key, value, 1);
}
/**
* Builds a compiled read-only trie.
*
* @return compiled trie
*/
public FrequencyTrie<V> build() {
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.log(Level.FINE, "Starting trie compilation with reduction mode {0}.",
this.reductionSettings.reductionMode());
}
final ReductionContext<V> reductionContext = new ReductionContext<>(this.reductionSettings);
final ReducedNode<V> reducedRoot = reduce(this.root, reductionContext);
final CompiledNode<V> compiledRoot = freeze(reducedRoot, new IdentityHashMap<>());
if (LOGGER.isLoggable(Level.FINE)) {
LOGGER.log(Level.FINE, "Trie compilation finished. Canonical node count: {0}.",
reductionContext.canonicalNodeCount());
}
return new FrequencyTrie<>(this.arrayFactory, compiledRoot);
}
/**
* Stores a value for the supplied key and increments its local frequency by the
* specified positive count.
*
* <p>
* Values are stored at the node addressed by the full key. Since trie values
* may also appear on internal nodes, an empty key is valid and stores a value
* directly at the root.
*
* <p>
* This method is functionally equivalent to calling
* {@link #put(String, Object)} repeatedly {@code count} times, but it avoids
* unnecessary repeated map updates and is therefore preferable for bulk
* reconstruction from compiled tries or other aggregated sources.
*
* @param key key
* @param value value
* @param count positive frequency increment
* @return this builder
* @throws NullPointerException if {@code key} or {@code value} is
* {@code null}
* @throws IllegalArgumentException if {@code count} is less than {@code 1}
*/
public Builder<V> put(final String key, final V value, final int count) {
Objects.requireNonNull(key, "key");
Objects.requireNonNull(value, "value");
if (count < 1) { // NOPMD
throw new IllegalArgumentException("count must be at least 1.");
}
MutableNode<V> current = this.root;
for (int index = 0; index < key.length(); index++) {
final Character edge = key.charAt(index);
MutableNode<V> child = current.children().get(edge);
if (child == null) {
child = new MutableNode<>(); // NOPMD
current.children().put(edge, child);
}
current = child;
}
final Integer previous = current.valueCounts().get(value);
if (previous == null) {
current.valueCounts().put(value, count);
} else {
current.valueCounts().put(value, previous + count);
}
return this;
}
/**
* Returns the number of mutable build-time nodes currently reachable from the
* builder root.
*
* <p>
* This metric is intended mainly for diagnostics and tests that compare the
* unreduced build-time structure with the final reduced compiled trie.
*
* @return number of mutable build-time nodes
*/
/* default */ int buildTimeSize() {
return countMutableNodes(this.root);
}
/**
* Counts mutable nodes recursively.
*
* @param node current node
* @return reachable mutable node count
*/
private int countMutableNodes(final MutableNode<V> node) {
int count = 1;
for (MutableNode<V> child : node.children().values()) {
count += countMutableNodes(child);
}
return count;
}
/**
* Reduces a mutable node to a canonical reduced node.
*
* @param source source mutable node
* @param context reduction context
* @return canonical reduced node
*/
private ReducedNode<V> reduce(final MutableNode<V> source, final ReductionContext<V> context) {
final Map<Character, ReducedNode<V>> reducedChildren = new LinkedHashMap<>();
for (Map.Entry<Character, MutableNode<V>> childEntry : source.children().entrySet()) {
final ReducedNode<V> reducedChild = reduce(childEntry.getValue(), context);
reducedChildren.put(childEntry.getKey(), reducedChild);
}
final Map<V, Integer> localCounts = copyCounts(source.valueCounts());
final LocalValueSummary<V> localSummary = LocalValueSummary.of(localCounts, this.arrayFactory);
final ReductionSignature<V> signature = ReductionSignature.create(localSummary, reducedChildren,
context.settings());
ReducedNode<V> canonical = context.lookup(signature);
if (canonical == null) {
canonical = new ReducedNode<>(signature, localCounts, reducedChildren);
context.register(signature, canonical);
return canonical;
}
canonical.mergeLocalCounts(localCounts);
canonical.mergeChildren(reducedChildren);
return canonical;
}
/**
* Freezes a reduced node into an immutable compiled node.
*
* @param reducedNode reduced node
* @param cache already frozen nodes
* @return immutable compiled node
*/
private CompiledNode<V> freeze(final ReducedNode<V> reducedNode,
final Map<ReducedNode<V>, CompiledNode<V>> cache) {
final CompiledNode<V> existing = cache.get(reducedNode);
if (existing != null) {
return existing;
}
final LocalValueSummary<V> localSummary = LocalValueSummary.of(reducedNode.localCounts(),
this.arrayFactory);
final List<Map.Entry<Character, ReducedNode<V>>> childEntries = new ArrayList<>(
reducedNode.children().entrySet());
childEntries.sort(Map.Entry.comparingByKey());
final char[] edges = new char[childEntries.size()];
@SuppressWarnings("unchecked")
final CompiledNode<V>[] childNodes = new CompiledNode[childEntries.size()];
for (int index = 0; index < childEntries.size(); index++) {
final Map.Entry<Character, ReducedNode<V>> entry = childEntries.get(index);
edges[index] = entry.getKey();
childNodes[index] = freeze(entry.getValue(), cache);
}
final CompiledNode<V> frozen = new CompiledNode<>(edges, childNodes, localSummary.orderedValues(),
localSummary.orderedCounts());
cache.put(reducedNode, frozen);
return frozen;
}
/**
* Creates a shallow frequency copy preserving deterministic insertion order of
* first occurrence.
*
* @param source source counts
* @return copied counts
*/
private Map<V, Integer> copyCounts(final Map<V, Integer> source) {
return new LinkedHashMap<>(source);
}
}
/**
* Codec that converts trie values to and from a binary data stream so that
* they can be persisted.
*
* <p>
* NOTE(review): {@link #read(DataInputStream)} is presumably expected to
* restore exactly what {@link #write(DataOutputStream, Object)} produced;
* confirm against the trie serialization code that calls this codec.
*
* @param <V> value type
*/
public interface ValueStreamCodec<V> {
/**
* Writes one value to the supplied data output.
*
* @param dataOutput target data output
* @param value value to write
* @throws IOException if writing fails
*/
void write(DataOutputStream dataOutput, V value) throws IOException;
/**
* Reads one value from the supplied data input.
*
* @param dataInput source data input
* @return value read from the input
* @throws IOException if reading fails
*/
V read(DataInputStream dataInput) throws IOException;
}
}

View File

@@ -0,0 +1,141 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.Objects;
import java.util.function.IntFunction;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.egothor.stemmer.trie.CompiledNode;
/**
 * Static helpers that turn a compiled {@link FrequencyTrie} back into a
 * writable {@link FrequencyTrie.Builder}.
 *
 * <p>
 * The compiled trie is traversed from its root and every stored value is
 * re-inserted into a fresh builder together with its stored count, under the
 * key formed by the edge labels on the path to its node. The builder can then
 * be modified further and compiled again.
 *
 * <p>
 * Because the traversal reads the compiled form, the recreated builder mirrors
 * whatever the compiled trie currently stores. If reduction merged subtrees
 * when that trie was compiled, the builder reflects the reduced state rather
 * than the original insertion history.
 */
public final class FrequencyTrieBuilders {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(FrequencyTrieBuilders.class.getName());

    /**
     * Prevents instantiation of this utility class.
     */
    private FrequencyTrieBuilders() {
        throw new AssertionError("No instances.");
    }

    /**
     * Creates a writable builder holding the values and counts of a compiled
     * trie.
     *
     * @param source            source compiled trie
     * @param arrayFactory      array factory for the reconstructed builder
     * @param reductionSettings reduction settings to associate with the new
     *                          builder
     * @param <V>               value type
     * @return reconstructed writable builder
     * @throws NullPointerException if any argument is {@code null}
     */
    public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
            final IntFunction<V[]> arrayFactory, final ReductionSettings reductionSettings) {
        Objects.requireNonNull(source, "source");
        Objects.requireNonNull(arrayFactory, "arrayFactory");
        Objects.requireNonNull(reductionSettings, "reductionSettings");

        final FrequencyTrie.Builder<V> target = new FrequencyTrie.Builder<>(arrayFactory, reductionSettings);
        copyNode(source.root(), new StringBuilder(64), target);
        LOGGER.log(Level.FINE, "Reconstructed writable builder from compiled trie.");
        return target;
    }

    /**
     * Creates a writable builder from a compiled trie, using the default
     * settings of the supplied reduction mode.
     *
     * @param source        source compiled trie
     * @param arrayFactory  array factory for the reconstructed builder
     * @param reductionMode reduction mode to associate with the new builder
     * @param <V>           value type
     * @return reconstructed writable builder
     * @throws NullPointerException if any argument is {@code null}
     */
    public static <V> FrequencyTrie.Builder<V> copyOf(final FrequencyTrie<V> source,
            final IntFunction<V[]> arrayFactory, final ReductionMode reductionMode) {
        Objects.requireNonNull(reductionMode, "reductionMode");
        final ReductionSettings defaults = ReductionSettings.withDefaults(reductionMode);
        return copyOf(source, arrayFactory, defaults);
    }

    /**
     * Re-inserts the values of one compiled node and then recurses into its
     * children.
     *
     * <p>
     * On entry {@code keyBuilder} holds the key of {@code node}. Each child is
     * visited with its edge label appended, and the builder is cut back to the
     * entry length afterwards, so the caller sees it unchanged.
     *
     * @param node       current compiled node
     * @param keyBuilder key of the current node
     * @param builder    target mutable builder
     * @param <V>        value type
     */
    private static <V> void copyNode(final CompiledNode<V> node, final StringBuilder keyBuilder,
            final FrequencyTrie.Builder<V> builder) {
        // All values of this node share the same key, so build it once.
        final String key = keyBuilder.toString();
        final V[] values = node.orderedValues();
        for (int valueIndex = 0; valueIndex < values.length; valueIndex++) {
            builder.put(key, values[valueIndex], node.orderedCounts()[valueIndex]);
        }

        final int depth = keyBuilder.length();
        final char[] labels = node.edgeLabels();
        for (int childIndex = 0; childIndex < labels.length; childIndex++) {
            keyBuilder.append(labels[childIndex]);
            copyNode(node.children()[childIndex], keyBuilder, builder);
            keyBuilder.setLength(depth);
        }
    }
}

View File

@@ -0,0 +1,583 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Encodes a compact patch command that transforms one word form into another
 * and applies such commands back to source words.
 *
 * <p>
 * The generated patch command follows the historical Egothor convention:
 * instructions are serialized so that they are applied from the end of the
 * source word toward its beginning. Every instruction is a pair of characters,
 * an opcode followed by one argument.
 * </p>
 *
 * <p>
 * The encoder computes a minimum-cost edit script using weighted insert,
 * delete, replace, and match transitions. A match transition is only
 * considered for equal characters. The resulting trace is then serialized into
 * the compact patch language.
 * </p>
 *
 * <p>
 * Instances are stateful: the dynamic-programming matrices are kept between
 * invocations and only reallocated when a larger input arrives.
 * {@link #encode(String, String)} therefore guards its matrix work with an
 * internal {@link ReentrantLock}, so a shared instance can be called from
 * several threads. {@link #apply(String, String)} is static and touches no
 * instance state.
 * </p>
 */
public final class PatchCommandEncoder {

    /**
     * Serialized opcode for deleting one or more characters.
     */
    private static final char DELETE_OPCODE = 'D';

    /**
     * Serialized opcode for inserting one character.
     */
    private static final char INSERT_OPCODE = 'I';

    /**
     * Serialized opcode for replacing one character.
     */
    private static final char REPLACE_OPCODE = 'R';

    /**
     * Serialized opcode for skipping one or more unchanged characters.
     */
    private static final char SKIP_OPCODE = '-';

    /**
     * Sentinel placed immediately before {@code 'a'} and used to accumulate
     * compact counts in the patch format: one increment yields {@code 'a'}
     * (count 1), two yield {@code 'b'} (count 2), and so on.
     */
    private static final char COUNT_SENTINEL = (char) ('a' - 1);

    /**
     * Serialized opcode for a canonical no-operation patch.
     *
     * <p>
     * This opcode represents an identity transform of the whole source word. It
     * is used to ensure that equal source and target words always produce the
     * same serialized patch command.
     * </p>
     */
    private static final char NOOP_OPCODE = 'N';

    /**
     * Canonical argument used by the serialized no-operation patch.
     */
    private static final char NOOP_ARGUMENT = 'a';

    /**
     * Canonical serialized no-operation patch.
     *
     * <p>
     * This constant is returned by {@link #encode(String, String)} whenever
     * source and target are equal.
     * </p>
     */
    /* default */ static final String NOOP_PATCH = String.valueOf(new char[] { NOOP_OPCODE, NOOP_ARGUMENT });

    /**
     * Extra headroom added when internal matrices need to grow.
     */
    private static final int CAPACITY_MARGIN = 8;

    /**
     * Cost of inserting one character.
     */
    private final int insertCost;

    /**
     * Cost of deleting one character.
     */
    private final int deleteCost;

    /**
     * Cost of replacing one character.
     */
    private final int replaceCost;

    /**
     * Cost of keeping one matching character unchanged.
     */
    private final int matchCost;

    /**
     * Currently allocated source dimension of reusable matrices.
     */
    private int sourceCapacity;

    /**
     * Currently allocated target dimension of reusable matrices.
     */
    private int targetCapacity;

    /**
     * Dynamic-programming matrix containing cumulative minimum costs.
     */
    private int[][] costMatrix;

    /**
     * Matrix storing the chosen transition for each dynamic-programming cell.
     */
    private Trace[][] traceMatrix;

    /**
     * Lock that makes the matrix work of {@link #encode(String, String)}
     * mutually exclusive.
     */
    private final ReentrantLock lock = new ReentrantLock();

    /**
     * Internal dynamic-programming transition selected for one matrix cell.
     */
    private enum Trace {
        /**
         * Deletes one character from the source sequence.
         */
        DELETE,
        /**
         * Inserts one character from the target sequence.
         */
        INSERT,
        /**
         * Replaces one source character with one target character.
         */
        REPLACE,
        /**
         * Keeps one matching character unchanged.
         */
        MATCH
    }

    /**
     * Creates an encoder with the traditional Egothor cost model: insert = 1,
     * delete = 1, replace = 1, match = 0.
     */
    public PatchCommandEncoder() {
        this(1, 1, 1, 0);
    }

    /**
     * Creates an encoder with explicit operation costs.
     *
     * @param insertCost  cost of inserting one character
     * @param deleteCost  cost of deleting one character
     * @param replaceCost cost of replacing one character
     * @param matchCost   cost of keeping one equal character unchanged
     * @throws IllegalArgumentException if any cost is negative
     */
    public PatchCommandEncoder(int insertCost, int deleteCost, int replaceCost, int matchCost) {
        if (insertCost < 0) {
            throw new IllegalArgumentException("insertCost must be non-negative.");
        }
        if (deleteCost < 0) {
            throw new IllegalArgumentException("deleteCost must be non-negative.");
        }
        if (replaceCost < 0) {
            throw new IllegalArgumentException("replaceCost must be non-negative.");
        }
        if (matchCost < 0) {
            throw new IllegalArgumentException("matchCost must be non-negative.");
        }
        this.insertCost = insertCost;
        this.deleteCost = deleteCost;
        this.replaceCost = replaceCost;
        this.matchCost = matchCost;
        this.sourceCapacity = 0;
        this.targetCapacity = 0;
        this.costMatrix = new int[0][0];
        this.traceMatrix = new Trace[0][0];
    }

    /**
     * Produces a compact patch command that transforms {@code source} into
     * {@code target}.
     *
     * <p>
     * Equal inputs short-circuit to {@link #NOOP_PATCH} without touching the
     * matrices. Otherwise the matrices are filled and traced back while the
     * internal lock is held.
     * </p>
     *
     * @param source source word form
     * @param target target word form
     * @return compact patch command, or {@code null} when any argument is
     *         {@code null}
     */
    public String encode(String source, String target) {
        if (source == null || target == null) {
            return null;
        }
        if (source.equals(target)) {
            return NOOP_PATCH;
        }
        int sourceLength = source.length();
        int targetLength = target.length();
        lock.lock();
        try {
            ensureCapacity(sourceLength + 1, targetLength + 1);
            initializeBoundaryConditions(sourceLength, targetLength);
            char[] sourceCharacters = source.toCharArray();
            char[] targetCharacters = target.toCharArray();
            fillMatrices(sourceCharacters, targetCharacters, sourceLength, targetLength);
            return buildPatchCommand(targetCharacters, sourceLength, targetLength);
        } finally {
            lock.unlock();
        }
    }

    /**
     * Applies a compact patch command to the supplied source word.
     *
     * <p>
     * The command is read as pairs of opcode and argument. A cursor starts at
     * the last character of the source and moves toward the beginning as
     * instructions are consumed. For skip and delete instructions the argument
     * encodes a count ({@code 'a'} = 1, {@code 'b'} = 2, ...); for replace and
     * insert instructions it is the literal character to write.
     * </p>
     *
     * <p>
     * For compatibility with the historical behavior, malformed patch input
     * that causes index failures results in the original source word being
     * returned unchanged. A {@code null} or empty patch also returns the source
     * unchanged.
     * </p>
     *
     * @param source       original source word
     * @param patchCommand compact patch command
     * @return transformed word, or {@code null} when {@code source} is
     *         {@code null}
     * @throws IllegalArgumentException if the patch contains an unknown opcode
     *                                  or a no-operation instruction with an
     *                                  unexpected argument
     */
    public static String apply(String source, String patchCommand) {
        if (source == null) {
            return null;
        }
        if (patchCommand == null || patchCommand.isEmpty()) {
            return source;
        }
        if (NOOP_PATCH.equals(patchCommand)) {
            return source;
        }
        StringBuilder result = new StringBuilder(source);
        if (source.isEmpty()) {
            return applyToEmptySource(result, patchCommand);
        }
        int position = result.length() - 1;
        try {
            for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
                char opcode = patchCommand.charAt(patchIndex);
                // An odd-length patch fails here and falls back to the source.
                char argument = patchCommand.charAt(patchIndex + 1);
                int encodedCount = argument - 'a' + 1;
                switch (opcode) {
                case SKIP_OPCODE:
                    // Together with the shared decrement below this moves the
                    // cursor left by encodedCount characters.
                    position = position - encodedCount + 1;
                    break;
                case REPLACE_OPCODE:
                    result.setCharAt(position, argument);
                    break;
                case DELETE_OPCODE:
                    // Removes encodedCount characters ending at the cursor.
                    int deleteEndExclusive = position + 1;
                    position -= encodedCount - 1;
                    result.delete(position, deleteEndExclusive);
                    break;
                case INSERT_OPCODE:
                    // The increment cancels the shared decrement, so the cursor
                    // stays on the same source character after an insertion.
                    result.insert(position + 1, argument);
                    position++;
                    break;
                case NOOP_OPCODE:
                    if (argument != NOOP_ARGUMENT) {
                        throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
                    }
                    return source;
                default:
                    throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
                }
                position--;
            }
        } catch (IndexOutOfBoundsException exception) {
            // Historical contract: index-invalid patches leave the word as is.
            return source;
        }
        return result.toString();
    }

    /**
     * Applies a patch command to an empty source word.
     *
     * <p>
     * Only insertion instructions are meaningful for an empty source; each one
     * prepends its character. Skip, replace, and delete instructions are
     * treated as malformed and cause the empty source to be returned, consistent
     * with the historical fallback behavior for index-invalid commands.
     * </p>
     *
     * @param result       empty result builder
     * @param patchCommand compact patch command
     * @return transformed word, or the original empty word when the patch is
     *         malformed
     * @throws IllegalArgumentException if the patch contains an unknown opcode
     *                                  or a no-operation instruction with an
     *                                  unexpected argument
     */
    private static String applyToEmptySource(StringBuilder result, String patchCommand) {
        try {
            for (int patchIndex = 0, patchLength = patchCommand.length(); patchIndex < patchLength; patchIndex += 2) { // NOPMD
                char opcode = patchCommand.charAt(patchIndex);
                char argument = patchCommand.charAt(patchIndex + 1);
                switch (opcode) {
                case INSERT_OPCODE:
                    result.insert(0, argument);
                    break;
                case SKIP_OPCODE:
                case REPLACE_OPCODE:
                case DELETE_OPCODE:
                    return "";
                case NOOP_OPCODE:
                    if (argument != NOOP_ARGUMENT) {
                        throw new IllegalArgumentException("Unsupported NOOP patch argument: " + argument);
                    }
                    return "";
                default:
                    throw new IllegalArgumentException("Unsupported patch opcode: " + opcode);
                }
            }
        } catch (IndexOutOfBoundsException exception) {
            return "";
        }
        return result.toString();
    }

    /**
     * Ensures that internal matrices are large enough for the requested input
     * dimensions.
     *
     * <p>
     * When either dimension is too small, both matrices are reallocated with
     * {@link #CAPACITY_MARGIN} extra cells per dimension. Previous content is
     * discarded, which is fine because every encode run rewrites the cells it
     * reads.
     * </p>
     *
     * @param requiredSourceCapacity required source dimension
     * @param requiredTargetCapacity required target dimension
     */
    private void ensureCapacity(int requiredSourceCapacity, int requiredTargetCapacity) {
        if (requiredSourceCapacity <= sourceCapacity && requiredTargetCapacity <= targetCapacity) {
            return;
        }
        sourceCapacity = Math.max(sourceCapacity, requiredSourceCapacity) + CAPACITY_MARGIN;
        targetCapacity = Math.max(targetCapacity, requiredTargetCapacity) + CAPACITY_MARGIN;
        costMatrix = new int[sourceCapacity][targetCapacity];
        traceMatrix = new Trace[sourceCapacity][targetCapacity];
    }

    /**
     * Initializes the first row and first column of the dynamic-programming
     * matrices: reaching an empty target requires only deletions, and leaving an
     * empty source requires only insertions.
     *
     * @param sourceLength length of the source word
     * @param targetLength length of the target word
     */
    private void initializeBoundaryConditions(int sourceLength, int targetLength) {
        costMatrix[0][0] = 0;
        traceMatrix[0][0] = Trace.MATCH;
        for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
            costMatrix[sourceIndex][0] = sourceIndex * deleteCost;
            traceMatrix[sourceIndex][0] = Trace.DELETE;
        }
        for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
            costMatrix[0][targetIndex] = targetIndex * insertCost;
            traceMatrix[0][targetIndex] = Trace.INSERT;
        }
    }

    /**
     * Fills dynamic-programming matrices for the supplied source and target
     * character sequences.
     *
     * <p>
     * Candidates are evaluated in a fixed order that determines tie-breaking: a
     * match (equal characters only) is the starting choice, a deletion wins when
     * it is not more expensive, and an insertion or a replacement wins only when
     * strictly cheaper than the best choice so far. For differing characters the
     * deletion is the starting choice, so a mismatch can never be recorded as a
     * match regardless of how large the configured costs are.
     * </p>
     *
     * @param sourceCharacters source characters
     * @param targetCharacters target characters
     * @param sourceLength     source length
     * @param targetLength     target length
     */
    private void fillMatrices(char[] sourceCharacters, char[] targetCharacters, int sourceLength, int targetLength) {
        for (int sourceIndex = 1; sourceIndex <= sourceLength; sourceIndex++) {
            char sourceCharacter = sourceCharacters[sourceIndex - 1];
            for (int targetIndex = 1; targetIndex <= targetLength; targetIndex++) {
                char targetCharacter = targetCharacters[targetIndex - 1];
                int diagonalCost = costMatrix[sourceIndex - 1][targetIndex - 1];
                int deleteCandidate = costMatrix[sourceIndex - 1][targetIndex] + deleteCost;
                int insertCandidate = costMatrix[sourceIndex][targetIndex - 1] + insertCost;
                int replaceCandidate = diagonalCost + replaceCost;

                int bestCost;
                Trace bestTrace;
                if (sourceCharacter == targetCharacter) {
                    bestCost = diagonalCost + matchCost;
                    bestTrace = Trace.MATCH;
                    if (deleteCandidate <= bestCost) {
                        bestCost = deleteCandidate;
                        bestTrace = Trace.DELETE;
                    }
                } else {
                    // A match is not a legal transition for differing
                    // characters, so start from the deletion instead.
                    bestCost = deleteCandidate;
                    bestTrace = Trace.DELETE;
                }
                if (insertCandidate < bestCost) {
                    bestCost = insertCandidate;
                    bestTrace = Trace.INSERT;
                }
                if (replaceCandidate < bestCost) {
                    bestCost = replaceCandidate;
                    bestTrace = Trace.REPLACE;
                }
                costMatrix[sourceIndex][targetIndex] = bestCost;
                traceMatrix[sourceIndex][targetIndex] = bestTrace;
            }
        }
    }

    /**
     * Reconstructs the compact patch command by traversing the trace matrix
     * from the final cell back to the origin.
     *
     * <p>
     * Consecutive deletions and consecutive matches are accumulated into a
     * single counted instruction and written out when a different transition
     * follows. Deletions still pending at the origin are written; pending skips
     * are dropped because nothing remains to be changed in front of them.
     * </p>
     *
     * @param targetCharacters target characters
     * @param sourceLength     source length
     * @param targetLength     target length
     * @return compact patch command
     */
    private String buildPatchCommand(char[] targetCharacters, int sourceLength, int targetLength) {
        StringBuilder patchBuilder = new StringBuilder(sourceLength + targetLength);
        char pendingDeletes = COUNT_SENTINEL;
        char pendingSkips = COUNT_SENTINEL;
        int sourceIndex = sourceLength;
        int targetIndex = targetLength;
        while (sourceIndex != 0 || targetIndex != 0) {
            Trace trace = traceMatrix[sourceIndex][targetIndex];
            switch (trace) {
            case DELETE:
                if (pendingSkips != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
                    pendingSkips = COUNT_SENTINEL;
                }
                pendingDeletes++;
                sourceIndex--;
                break;
            case INSERT:
                if (pendingDeletes != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
                    pendingDeletes = COUNT_SENTINEL;
                }
                if (pendingSkips != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
                    pendingSkips = COUNT_SENTINEL;
                }
                targetIndex--;
                appendInstruction(patchBuilder, INSERT_OPCODE, targetCharacters[targetIndex]);
                break;
            case REPLACE:
                if (pendingDeletes != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
                    pendingDeletes = COUNT_SENTINEL;
                }
                if (pendingSkips != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, SKIP_OPCODE, pendingSkips);
                    pendingSkips = COUNT_SENTINEL;
                }
                targetIndex--;
                sourceIndex--;
                appendInstruction(patchBuilder, REPLACE_OPCODE, targetCharacters[targetIndex]);
                break;
            case MATCH:
                if (pendingDeletes != COUNT_SENTINEL) {
                    appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
                    pendingDeletes = COUNT_SENTINEL;
                }
                pendingSkips++;
                sourceIndex--;
                targetIndex--;
                break;
            }
        }
        if (pendingDeletes != COUNT_SENTINEL) {
            appendInstruction(patchBuilder, DELETE_OPCODE, pendingDeletes);
        }
        return patchBuilder.toString();
    }

    /**
     * Appends one serialized instruction to the patch command builder.
     *
     * @param patchBuilder patch command builder
     * @param opcode       single-character instruction opcode
     * @param argument     encoded instruction argument
     */
    private static void appendInstruction(StringBuilder patchBuilder, char opcode, char argument) {
        patchBuilder.append(opcode).append(argument);
    }
}

View File

@@ -0,0 +1,79 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
/**
* Defines the subtree reduction strategy applied during trie compilation.
*
* <p>
* All reduction modes operate on the full subtree semantics, not only on the
* local content of a single node. This is important because trie values may be
* stored on both internal nodes and leaf nodes.
*
* <p>
* The three modes differ in how much information must agree before two
* subtrees are merged: ranked {@code getAll()} results, unordered
* {@code getAll()} results, or only the preferred {@code get()} result.
*/
@SuppressWarnings("PMD.LongVariable") // presumably needed because the constant names below are long
public enum ReductionMode {
/**
* Merges subtrees whose {@code getAll()} results are equivalent for every
* reachable key suffix and whose local result ordering is the same.
*
* <p>
* This mode ignores absolute frequencies when comparing subtree signatures, but
* preserves the value order returned by {@code getAll()}.
*/
MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS,
/**
* Merges subtrees whose {@code getAll()} results are equivalent for every
* reachable key suffix, regardless of the local ordering of values.
*
* <p>
* This mode ignores both absolute frequencies and local result ordering when
* comparing subtree signatures.
*/
MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS,
/**
* Merges subtrees whose preferred {@code get()} results are equivalent for
* every reachable key suffix, provided that the locally dominant winner
* satisfies the configured dominance constraints.
*
* <p>
* If a node does not satisfy the dominance constraints, the implementation
* falls back to ranked {@code getAll()} semantics for that node in order to
* avoid unsafe over-reduction.
*/
MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS
}

View File

@@ -0,0 +1,100 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.Objects;
/**
 * Immutable bundle of reduction parameters consumed by
 * {@link FrequencyTrie.Builder}.
 *
 * <p>
 * These settings steer how mutable trie nodes are merged into canonical
 * read-only nodes when a trie is compiled.
 *
 * @param reductionMode                 reduction mode
 * @param dominantWinnerMinPercent      minimum dominant winner percentage
 * @param dominantWinnerOverSecondRatio minimum winner-over-second ratio
 */
@SuppressWarnings("PMD.LongVariable")
public record ReductionSettings(ReductionMode reductionMode, int dominantWinnerMinPercent,
        int dominantWinnerOverSecondRatio) {

    /**
     * Default minimum dominant winner percentage.
     */
    public static final int DEFAULT_DOMINANT_WINNER_MIN_PERCENT = 75;

    /**
     * Default minimum winner-over-second ratio.
     */
    public static final int DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO = 3;

    /**
     * Validates the components of a new instance.
     *
     * <p>
     * The reduction mode is checked first, then the percentage, then the ratio.
     *
     * @param reductionMode                 reduction mode
     * @param dominantWinnerMinPercent      minimum dominant winner percentage in
     *                                      the inclusive range {@code 1..100}
     * @param dominantWinnerOverSecondRatio minimum winner-over-second ratio, must
     *                                      be at least {@code 1}
     * @throws NullPointerException     if {@code reductionMode} is {@code null}
     * @throws IllegalArgumentException if any numeric value is outside the valid
     *                                  range
     */
    public ReductionSettings {
        Objects.requireNonNull(reductionMode, "reductionMode");
        final boolean percentInRange = dominantWinnerMinPercent >= 1 && dominantWinnerMinPercent <= 100;
        if (!percentInRange) {
            throw new IllegalArgumentException("dominantWinnerMinPercent must be in range 1..100.");
        }
        if (dominantWinnerOverSecondRatio < 1) { // NOPMD
            throw new IllegalArgumentException("dominantWinnerOverSecondRatio must be at least 1.");
        }
    }

    /**
     * Creates settings for the given mode combined with
     * {@link #DEFAULT_DOMINANT_WINNER_MIN_PERCENT} and
     * {@link #DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO}.
     *
     * @param reductionMode reduction mode
     * @return new settings instance
     * @throws NullPointerException if {@code reductionMode} is {@code null}
     */
    public static ReductionSettings withDefaults(final ReductionMode reductionMode) {
        final int minPercent = DEFAULT_DOMINANT_WINNER_MIN_PERCENT;
        final int overSecondRatio = DEFAULT_DOMINANT_WINNER_OVER_SECOND_RATIO;
        return new ReductionSettings(reductionMode, minPercent, overSecondRatio);
    }
}

View File

@@ -0,0 +1,257 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Locale;
import java.util.Objects;
import java.util.StringTokenizer;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Parser of line-oriented stemmer dictionary files.
 *
 * <p>
 * Each non-empty logical line consists of a stem followed by zero or more known
 * word variants separated by whitespace. The first token is interpreted as the
 * canonical stem, and every following token on the same line is interpreted as
 * a variant belonging to that stem.
 *
 * <p>
 * Input lines are normalized to lower case using {@link Locale#ROOT}. Leading
 * and trailing whitespace is ignored.
 *
 * <p>
 * The parser supports line remarks and trailing remarks. The remark markers
 * {@code #} and {@code //} terminate the logical content of the line, and the
 * remainder of that line is ignored.
 *
 * <p>
 * A Unicode byte order mark (U+FEFF) at the very beginning of the first line
 * read is discarded. The JDK UTF-8 decoder passes that character through as
 * ordinary text, so without this step it would become part of the first stem,
 * or turn a remark-only first line into a bogus entry.
 *
 * <p>
 * This class is intentionally stateless and allocation-light so it can be used
 * both by runtime loading and by offline compilation tooling.
 */
public final class StemmerDictionaryParser {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerDictionaryParser.class.getName());

    /**
     * Unicode byte order mark (U+FEFF) that some editors write at the start of
     * UTF-8 files.
     */
    private static final char BYTE_ORDER_MARK = '\uFEFF';

    /**
     * Utility class; any attempt to instantiate it fails.
     */
    private StemmerDictionaryParser() {
        throw new AssertionError("No instances.");
    }

    /**
     * Callback receiving one parsed dictionary line.
     */
    @FunctionalInterface
    public interface EntryHandler {

        /**
         * Accepts one parsed dictionary entry.
         *
         * @param stem       canonical stem, never {@code null}
         * @param variants   variants in encounter order, never {@code null}
         * @param lineNumber original physical line number in the parsed source
         * @throws IOException if processing fails
         */
        void onEntry(String stem, String[] variants, int lineNumber) throws IOException;
    }

    /**
     * Parses a dictionary file from a filesystem path.
     *
     * <p>
     * The file is decoded as UTF-8 and the absolute path is used as the source
     * description of the returned statistics.
     *
     * @param path         dictionary file path
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final Path path, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(path, "path");
        Objects.requireNonNull(entryHandler, "entryHandler");
        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            return parse(reader, path.toAbsolutePath().toString(), entryHandler);
        }
    }

    /**
     * Parses a dictionary file from a path string.
     *
     * @param fileName     dictionary file name or path string
     * @param entryHandler handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading fails
     */
    public static ParseStatistics parse(final String fileName, final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(fileName, "fileName");
        return parse(Path.of(fileName), entryHandler);
    }

    /**
     * Parses a dictionary from a reader.
     *
     * <p>
     * The reader is wrapped in a {@link BufferedReader} unless it already is one.
     * It is read until end of input and is not closed by this method.
     *
     * @param reader            source reader
     * @param sourceDescription logical source description for diagnostics
     * @param entryHandler      handler receiving parsed entries
     * @return parsing statistics
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if reading or handler processing fails
     */
    public static ParseStatistics parse(final Reader reader, final String sourceDescription,
            final EntryHandler entryHandler) throws IOException {
        Objects.requireNonNull(reader, "reader");
        Objects.requireNonNull(sourceDescription, "sourceDescription");
        Objects.requireNonNull(entryHandler, "entryHandler");

        final BufferedReader bufferedReader = reader instanceof BufferedReader alreadyBuffered ? alreadyBuffered
                : new BufferedReader(reader);

        int lineNumber = 0;
        int logicalEntryCount = 0;
        int ignoredLineCount = 0;

        for (String line = bufferedReader.readLine(); line != null; line = bufferedReader.readLine()) {
            lineNumber++;

            // Only the first line can carry the byte order mark of the source.
            final String physicalLine = lineNumber == 1 ? stripByteOrderMark(line) : line;
            final String normalizedLine = stripRemark(physicalLine).trim().toLowerCase(Locale.ROOT);
            if (normalizedLine.isEmpty()) {
                ignoredLineCount++;
                continue;
            }

            // A trimmed, non-empty line starts with a character above U+0020, which
            // is never a default tokenizer delimiter, so a first token always exists.
            final StringTokenizer tokenizer = new StringTokenizer(normalizedLine); // NOPMD
            final String stem = tokenizer.nextToken();
            final String[] variants = new String[tokenizer.countTokens()]; // NOPMD
            for (int index = 0; index < variants.length; index++) {
                variants[index] = tokenizer.nextToken();
            }

            entryHandler.onEntry(stem, variants, lineNumber);
            logicalEntryCount++;
        }

        final ParseStatistics statistics = new ParseStatistics(sourceDescription, lineNumber, logicalEntryCount,
                ignoredLineCount);
        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.log(Level.FINE, "Parsed dictionary source {0}: lines={1}, entries={2}, ignoredLines={3}.",
                    new Object[] { statistics.sourceDescription(), statistics.lineCount(), statistics.entryCount(),
                            statistics.ignoredLineCount() });
        }
        return statistics;
    }

    /**
     * Removes one leading byte order mark from a line, if present.
     *
     * @param line physical line
     * @return the line without a leading U+FEFF character
     */
    private static String stripByteOrderMark(final String line) {
        if (!line.isEmpty() && line.charAt(0) == BYTE_ORDER_MARK) {
            return line.substring(1);
        }
        return line;
    }

    /**
     * Removes a trailing remark from one physical line.
     *
     * <p>
     * The earliest occurrence of either supported remark marker terminates the
     * logical line content.
     *
     * @param line physical line
     * @return line content without a trailing remark
     */
    private static String stripRemark(final String line) {
        final int hashIndex = line.indexOf('#');
        final int slashIndex = line.indexOf("//");

        final int remarkIndex;
        if (hashIndex < 0) {
            remarkIndex = slashIndex;
        } else if (slashIndex < 0) {
            remarkIndex = hashIndex;
        } else {
            remarkIndex = Math.min(hashIndex, slashIndex);
        }

        if (remarkIndex < 0) {
            return line;
        }
        return line.substring(0, remarkIndex);
    }

    /**
     * Immutable parsing statistics.
     *
     * @param sourceDescription logical source description
     * @param lineCount         number of physical lines read
     * @param entryCount        number of logical dictionary entries emitted
     * @param ignoredLineCount  number of ignored empty or remark-only lines
     */
    public record ParseStatistics(String sourceDescription, int lineCount, int entryCount, int ignoredLineCount) {

        /**
         * Creates parsing statistics.
         *
         * @param sourceDescription logical source description
         * @param lineCount         number of physical lines read
         * @param entryCount        number of logical dictionary entries emitted
         * @param ignoredLineCount  number of ignored empty or remark-only lines
         * @throws NullPointerException     if {@code sourceDescription} is
         *                                  {@code null}
         * @throws IllegalArgumentException if any numeric value is negative
         */
        public ParseStatistics {
            Objects.requireNonNull(sourceDescription, "sourceDescription");
            if (lineCount < 0) {
                throw new IllegalArgumentException("lineCount must not be negative.");
            }
            if (entryCount < 0) {
                throw new IllegalArgumentException("entryCount must not be negative.");
            }
            if (ignoredLineCount < 0) {
                throw new IllegalArgumentException("ignoredLineCount must not be negative.");
            }
        }
    }
}

View File

@@ -0,0 +1,216 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
 * Reads and writes patch-command stemmer tries in compressed binary form.
 *
 * <p>
 * The tries handled here are {@link FrequencyTrie} instances whose values are
 * compact patch commands represented as {@link String}. The payload is the
 * native binary format of {@link FrequencyTrie}, wrapped in GZip compression.
 *
 * <p>
 * Codec and compression details are kept in this one place so that higher-level
 * loader APIs only have to choose a source and need not deal with stream
 * mechanics.
 */
public final class StemmerPatchTrieBinaryIO {

    /**
     * Logger of this class.
     */
    private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieBinaryIO.class.getName());

    /**
     * Value codec for persisted patch-command strings.
     */
    private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new StringValueStreamCodec();

    /**
     * Utility class; any attempt to instantiate it fails.
     */
    private StemmerPatchTrieBinaryIO() {
        throw new AssertionError("No instances.");
    }

    /**
     * Reads a GZip-compressed binary patch-command trie from a filesystem path.
     *
     * @param path source file
     * @return deserialized trie
     * @throws NullPointerException if {@code path} is {@code null}
     * @throws IOException          if reading or decompression fails
     */
    public static FrequencyTrie<String> read(final Path path) throws IOException {
        Objects.requireNonNull(path, "path");
        try (InputStream source = Files.newInputStream(path)) {
            return read(source);
        }
    }

    /**
     * Reads a GZip-compressed binary patch-command trie from a filesystem path
     * string.
     *
     * @param fileName source file name or path string
     * @return deserialized trie
     * @throws NullPointerException if {@code fileName} is {@code null}
     * @throws IOException          if reading or decompression fails
     */
    public static FrequencyTrie<String> read(final String fileName) throws IOException {
        final Path source = Path.of(Objects.requireNonNull(fileName, "fileName"));
        return read(source);
    }

    /**
     * Reads a GZip-compressed binary patch-command trie from an input stream.
     *
     * <p>
     * The bytes of the supplied stream are not treated as plain trie data; they
     * are decompressed through a {@link GZIPInputStream} first. The wrapping
     * streams are closed when this method returns.
     *
     * @param inputStream source stream
     * @return deserialized trie
     * @throws NullPointerException if {@code inputStream} is {@code null}
     * @throws IOException          if reading or decompression fails
     */
    public static FrequencyTrie<String> read(final InputStream inputStream) throws IOException {
        Objects.requireNonNull(inputStream, "inputStream");
        try (GZIPInputStream inflated = new GZIPInputStream(new BufferedInputStream(inputStream));
                DataInputStream payload = new DataInputStream(inflated)) {
            final FrequencyTrie<String> result = FrequencyTrie.readFrom(payload, String[]::new, STRING_CODEC);
            LOGGER.log(Level.FINE, "Read compressed binary stemmer trie.");
            return result;
        }
    }

    /**
     * Writes a GZip-compressed binary patch-command trie to a filesystem path.
     *
     * <p>
     * Missing parent directories of the target are created before the file is
     * opened.
     *
     * @param trie trie to persist
     * @param path target file
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if writing fails
     */
    public static void write(final FrequencyTrie<String> trie, final Path path) throws IOException {
        Objects.requireNonNull(trie, "trie");
        Objects.requireNonNull(path, "path");
        createParentDirectories(path);
        try (OutputStream target = Files.newOutputStream(path)) {
            write(trie, target);
        }
    }

    /**
     * Writes a GZip-compressed binary patch-command trie to a filesystem path
     * string.
     *
     * @param trie     trie to persist
     * @param fileName target file name or path string
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if writing fails
     */
    public static void write(final FrequencyTrie<String> trie, final String fileName) throws IOException {
        final Path target = Path.of(Objects.requireNonNull(fileName, "fileName"));
        write(trie, target);
    }

    /**
     * Writes a GZip-compressed binary patch-command trie to an output stream.
     *
     * <p>
     * The wrapping streams are closed when this method returns.
     *
     * @param trie         trie to persist
     * @param outputStream target stream
     * @throws NullPointerException if any argument is {@code null}
     * @throws IOException          if writing fails
     */
    public static void write(final FrequencyTrie<String> trie, final OutputStream outputStream) throws IOException {
        Objects.requireNonNull(trie, "trie");
        Objects.requireNonNull(outputStream, "outputStream");
        try (GZIPOutputStream deflated = new GZIPOutputStream(new BufferedOutputStream(outputStream));
                DataOutputStream payload = new DataOutputStream(deflated)) {
            trie.writeTo(payload, STRING_CODEC);
        }
        LOGGER.log(Level.FINE, "Wrote compressed binary stemmer trie.");
    }

    /**
     * Creates the parent directory chain of a target file when the absolute form
     * of the path has a parent.
     *
     * @param path target file
     * @throws IOException if a directory cannot be created
     */
    private static void createParentDirectories(final Path path) throws IOException {
        final Path directory = path.toAbsolutePath().getParent();
        if (directory == null) {
            return;
        }
        Files.createDirectories(directory);
    }

    /**
     * Binary stream codec for persisted patch-command strings, based on
     * {@link DataOutputStream#writeUTF(String)} and
     * {@link DataInputStream#readUTF()}.
     */
    private static final class StringValueStreamCodec implements FrequencyTrie.ValueStreamCodec<String> {

        /**
         * Creates a codec instance.
         */
        private StringValueStreamCodec() {
        }

        @Override
        public void write(final DataOutputStream dataOutput, final String value) throws IOException {
            dataOutput.writeUTF(value);
        }

        @Override
        public String read(final DataInputStream dataInput) throws IOException {
            return dataInput.readUTF();
        }
    }
}

View File

@@ -0,0 +1,431 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* Loader of patch-command tries from bundled stemmer dictionaries.
*
* <p>
* Each dictionary is line-oriented. The first token on a line is interpreted as
* the stem, and all following tokens are treated as known variants of that
* stem.
*
* <p>
* For each line, the loader inserts:
* <ul>
* <li>the stem itself mapped to the canonical no-op patch command
* {@link PatchCommandEncoder#NOOP_PATCH}, when requested by the caller</li>
* <li>every distinct variant mapped to the patch command transforming that
* variant to the stem</li>
* </ul>
*
* <p>
* Parsing is delegated to {@link StemmerDictionaryParser}, which also supports
* line remarks introduced by {@code #} or {@code //}.
*/
public final class StemmerPatchTrieLoader {
/**
 * Logger of this class, named after the fully qualified class name.
 */
private static final Logger LOGGER = Logger.getLogger(StemmerPatchTrieLoader.class.getName());
/**
 * Canonical no-op patch command used when the source and target are equal.
 * This is a local alias of {@link PatchCommandEncoder#NOOP_PATCH}.
 */
private static final String NOOP_PATCH_COMMAND = PatchCommandEncoder.NOOP_PATCH;
/**
 * Utility class with static members only. The constructor is private and
 * always throws, so any attempt to invoke it fails with an
 * {@link AssertionError}.
 */
private StemmerPatchTrieLoader() {
throw new AssertionError("No instances.");
}
/**
 * Stemmer dictionaries that are bundled with the library, one constant per
 * dictionary.
 */
public enum Language {

    /** Danish. */
    DA_DK("da_dk"),

    /** German. */
    DE_DE("de_de"),

    /** Spanish. */
    ES_ES("es_es"),

    /** French. */
    FR_FR("fr_fr"),

    /** Italian. */
    IT_IT("it_it"),

    /** Dutch. */
    NL_NL("nl_nl"),

    /** Norwegian. */
    NO_NO("no_no"),

    /** Portuguese. */
    PT_PT("pt_pt"),

    /** Russian. */
    RU_RU("ru_ru"),

    /** Swedish. */
    SV_SE("sv_se"),

    /** English. */
    US_UK("us_uk"),

    /** English professional dictionary. */
    US_UK_PROFI("us_uk.profi");

    /**
     * Name of the resource directory holding the dictionary.
     */
    private final String resourceDirectory;

    /**
     * Creates a language constant bound to its resource directory.
     *
     * @param resourceDirectory resource directory name
     */
    Language(final String resourceDirectory) {
        this.resourceDirectory = resourceDirectory;
    }

    /**
     * Returns the classpath resource path of the stemmer dictionary, which is
     * the resource directory followed by {@code /stemmer}.
     *
     * @return classpath resource path
     */
    public String resourcePath() {
        return String.join("/", this.resourceDirectory, "stemmer");
    }

    /**
     * Returns the resource directory name.
     *
     * @return resource directory name
     */
    public String resourceDirectory() {
        return this.resourceDirectory;
    }
}
/**
 * Compiles the bundled dictionary of a language with explicit reduction
 * settings.
 *
 * <p>
 * The dictionary resource is decoded as UTF-8, and its resource path doubles as
 * the source description used for diagnostics.
 *
 * @param language          bundled language dictionary
 * @param storeOriginal     whether the stem itself should be inserted using the
 *                          canonical no-op patch command
 * @param reductionSettings reduction settings
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the dictionary cannot be found or read
 */
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
        final ReductionSettings reductionSettings) throws IOException {
    Objects.requireNonNull(language, "language");
    Objects.requireNonNull(reductionSettings, "reductionSettings");

    final String bundledPath = language.resourcePath();
    try (InputStream resourceStream = openBundledResource(bundledPath);
            BufferedReader dictionaryReader = new BufferedReader(
                    new InputStreamReader(resourceStream, StandardCharsets.UTF_8))) {
        final FrequencyTrie<String> compiled = load(dictionaryReader, bundledPath, storeOriginal,
                reductionSettings);
        return compiled;
    }
}
/**
 * Compiles the bundled dictionary of a language with the default dominance
 * thresholds for the given reduction mode.
 *
 * @param language      bundled language dictionary
 * @param storeOriginal whether the stem itself should be inserted using the
 *                      canonical no-op patch command
 * @param reductionMode reduction mode
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the dictionary cannot be found or read
 */
public static FrequencyTrie<String> load(final Language language, final boolean storeOriginal,
        final ReductionMode reductionMode) throws IOException {
    // The mode is validated first, before the delegate checks the language.
    final ReductionSettings defaultSettings = ReductionSettings
            .withDefaults(Objects.requireNonNull(reductionMode, "reductionMode"));
    return load(language, storeOriginal, defaultSettings);
}
/**
 * Compiles a dictionary file with explicit reduction settings.
 *
 * <p>
 * The file is decoded as UTF-8, and its absolute path is used as the source
 * description for diagnostics.
 *
 * @param path              path to the dictionary file
 * @param storeOriginal     whether the stem itself should be inserted using the
 *                          canonical no-op patch command
 * @param reductionSettings reduction settings
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
        final ReductionSettings reductionSettings) throws IOException {
    Objects.requireNonNull(path, "path");
    Objects.requireNonNull(reductionSettings, "reductionSettings");

    try (BufferedReader dictionaryReader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
        final String description = path.toAbsolutePath().toString();
        return load(dictionaryReader, description, storeOriginal, reductionSettings);
    }
}
/**
 * Compiles a dictionary file with the default dominance thresholds for the
 * given reduction mode.
 *
 * @param path          path to the dictionary file
 * @param storeOriginal whether the stem itself should be inserted using the
 *                      canonical no-op patch command
 * @param reductionMode reduction mode
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final Path path, final boolean storeOriginal,
        final ReductionMode reductionMode) throws IOException {
    // The mode is validated first, before the delegate checks the path.
    final ReductionSettings defaultSettings = ReductionSettings
            .withDefaults(Objects.requireNonNull(reductionMode, "reductionMode"));
    return load(path, storeOriginal, defaultSettings);
}
/**
 * Compiles a dictionary file, named by a path string, with explicit reduction
 * settings.
 *
 * @param fileName          file name or path string
 * @param storeOriginal     whether the stem itself should be inserted using the
 *                          canonical no-op patch command
 * @param reductionSettings reduction settings
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
        final ReductionSettings reductionSettings) throws IOException {
    final Path dictionaryPath = Path.of(Objects.requireNonNull(fileName, "fileName"));
    return load(dictionaryPath, storeOriginal, reductionSettings);
}
/**
 * Compiles a dictionary file, named by a path string, with the default
 * dominance thresholds for the given reduction mode.
 *
 * @param fileName      file name or path string
 * @param storeOriginal whether the stem itself should be inserted using the
 *                      canonical no-op patch command
 * @param reductionMode reduction mode
 * @return compiled patch-command trie
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if the file cannot be opened or read
 */
public static FrequencyTrie<String> load(final String fileName, final boolean storeOriginal,
        final ReductionMode reductionMode) throws IOException {
    final Path dictionaryPath = Path.of(Objects.requireNonNull(fileName, "fileName"));
    return load(dictionaryPath, storeOriginal, reductionMode);
}
/**
 * Feeds every parsed dictionary entry into a trie builder and compiles the
 * result.
 *
 * <p>
 * For each entry, the stem is stored with the no-op patch command when
 * {@code storeOriginal} is set. Every variant that differs from the stem is
 * stored with the patch command that transforms it into the stem.
 *
 * @param reader            dictionary reader
 * @param sourceDescription logical source description used for diagnostics
 * @param storeOriginal     whether the stem itself should be inserted using the
 *                          canonical no-op patch command
 * @param reductionSettings reduction settings
 * @return compiled patch-command trie
 * @throws IOException if parsing fails
 */
private static FrequencyTrie<String> load(final BufferedReader reader, final String sourceDescription,
        final boolean storeOriginal, final ReductionSettings reductionSettings) throws IOException {
    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(String[]::new,
            reductionSettings);
    final PatchCommandEncoder encoder = new PatchCommandEncoder();

    // Single-element array so the lambda can update the counter.
    final int[] mappingCounter = { 0 };

    final StemmerDictionaryParser.EntryHandler entryHandler = (stem, variants, lineNumber) -> {
        if (storeOriginal) {
            trieBuilder.put(stem, NOOP_PATCH_COMMAND);
            mappingCounter[0]++;
        }
        for (final String variant : variants) {
            if (variant.equals(stem)) {
                continue;
            }
            trieBuilder.put(variant, encoder.encode(variant, stem));
            mappingCounter[0]++;
        }
    };
    final StemmerDictionaryParser.ParseStatistics parseStatistics = StemmerDictionaryParser.parse(reader,
            sourceDescription, entryHandler);

    if (LOGGER.isLoggable(Level.FINE)) {
        final Object[] logArguments = { sourceDescription, mappingCounter[0], parseStatistics.lineCount(),
                parseStatistics.entryCount(), parseStatistics.ignoredLineCount() };
        LOGGER.log(Level.FINE,
                "Loaded stemmer dictionary from {0}; insertedMappings={1}, lines={2}, entries={3}, ignoredLines={4}.",
                logArguments);
    }

    return trieBuilder.build();
}
/**
 * Reads a GZip-compressed binary patch-command trie from a filesystem path.
 *
 * <p>
 * After the null check the call is delegated to
 * {@code StemmerPatchTrieBinaryIO.read}.
 * </p>
 *
 * @param path path to the compressed binary trie file
 * @return compiled patch-command trie
 * @throws NullPointerException if {@code path} is {@code null}
 * @throws IOException          if the file cannot be opened, decompressed, or
 *                              read
 */
public static FrequencyTrie<String> loadBinary(final Path path) throws IOException {
    final Path source = Objects.requireNonNull(path, "path");
    return StemmerPatchTrieBinaryIO.read(source);
}
/**
 * Reads a GZip-compressed binary patch-command trie from a filesystem path
 * given as a string.
 *
 * <p>
 * After the null check the call is delegated to
 * {@code StemmerPatchTrieBinaryIO.read}.
 * </p>
 *
 * @param fileName file name or path string
 * @return compiled patch-command trie
 * @throws NullPointerException if {@code fileName} is {@code null}
 * @throws IOException          if the file cannot be opened, decompressed, or
 *                              read
 */
public static FrequencyTrie<String> loadBinary(final String fileName) throws IOException {
    final String source = Objects.requireNonNull(fileName, "fileName");
    return StemmerPatchTrieBinaryIO.read(source);
}
/**
 * Reads a GZip-compressed binary patch-command trie from an input stream.
 *
 * <p>
 * After the null check the call is delegated to
 * {@code StemmerPatchTrieBinaryIO.read}.
 * </p>
 *
 * @param inputStream source input stream
 * @return compiled patch-command trie
 * @throws NullPointerException if {@code inputStream} is {@code null}
 * @throws IOException          if the stream cannot be decompressed or read
 */
public static FrequencyTrie<String> loadBinary(final InputStream inputStream) throws IOException {
    final InputStream source = Objects.requireNonNull(inputStream, "inputStream");
    return StemmerPatchTrieBinaryIO.read(source);
}
/**
 * Writes a compiled patch-command trie to a GZip-compressed binary file.
 *
 * <p>
 * The trie is checked before the path, so a {@code null} trie is reported
 * first. Writing is delegated to {@code StemmerPatchTrieBinaryIO.write}.
 * </p>
 *
 * @param trie compiled trie
 * @param path target file
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if writing fails
 */
public static void saveBinary(final FrequencyTrie<String> trie, final Path path) throws IOException {
    final FrequencyTrie<String> checkedTrie = Objects.requireNonNull(trie, "trie");
    final Path target = Objects.requireNonNull(path, "path");
    StemmerPatchTrieBinaryIO.write(checkedTrie, target);
}
/**
 * Writes a compiled patch-command trie to a GZip-compressed binary file
 * identified by a path string.
 *
 * <p>
 * The trie is checked before the file name, so a {@code null} trie is reported
 * first. Writing is delegated to {@code StemmerPatchTrieBinaryIO.write}.
 * </p>
 *
 * @param trie     compiled trie
 * @param fileName target file name or path string
 * @throws NullPointerException if any argument is {@code null}
 * @throws IOException          if writing fails
 */
public static void saveBinary(final FrequencyTrie<String> trie, final String fileName) throws IOException {
    final FrequencyTrie<String> checkedTrie = Objects.requireNonNull(trie, "trie");
    final String target = Objects.requireNonNull(fileName, "fileName");
    StemmerPatchTrieBinaryIO.write(checkedTrie, target);
}
/**
 * Opens a bundled resource from the classpath.
 *
 * <p>
 * The resource is looked up through the current thread's context class loader.
 * A thread may have no context class loader, in which case
 * {@link Thread#getContextClassLoader()} returns {@code null}; the lookup then
 * goes through the system class loader instead of failing with a
 * {@link NullPointerException}.
 * </p>
 *
 * @param resourcePath classpath resource path
 * @return opened input stream
 * @throws IOException if the resource cannot be found
 */
private static InputStream openBundledResource(final String resourcePath) throws IOException {
    final ClassLoader contextLoader = Thread.currentThread().getContextClassLoader();
    // A null context class loader stands for the system class loader.
    final InputStream inputStream = contextLoader == null
            ? ClassLoader.getSystemResourceAsStream(resourcePath)
            : contextLoader.getResourceAsStream(resourcePath);
    if (inputStream == null) {
        throw new IOException("Stemmer resource not found: " + resourcePath);
    }
    return inputStream;
}
}

View File

@@ -0,0 +1,62 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer;
import java.util.Objects;
/**
 * Read-only pairing of a stored value with its occurrence count, as returned
 * by read-only trie queries.
 *
 * @param <V>   value type
 * @param value stored value, never {@code null}
 * @param count occurrence count associated with the value, never negative
 */
public record ValueCount<V>(V value, int count) {

    /**
     * Validates the components of a new value-count pair. The value is checked
     * before the count.
     *
     * @param value stored value
     * @param count occurrence count
     * @throws NullPointerException     if {@code value} is {@code null}
     * @throws IllegalArgumentException if {@code count} is negative
     */
    public ValueCount {
        Objects.requireNonNull(value, "value");
        requireNonNegative(count);
    }

    /**
     * Rejects negative occurrence counts.
     *
     * @param candidate count to check
     * @throws IllegalArgumentException if {@code candidate} is negative
     */
    private static void requireNonNegative(final int candidate) {
        if (candidate < 0) {
            throw new IllegalArgumentException("count must not be negative.");
        }
    }
}

View File

@@ -0,0 +1,75 @@
/**
* Provides the core Egothor-style stemming infrastructure based on compact
* patch-command tries.
*
* <p>
* The package centers on a read-only {@link org.egothor.stemmer.FrequencyTrie}
* that maps word forms to one or more values together with their recorded local
* frequencies. In the stemming use case, these values are compact patch
* commands that reconstruct a canonical stem from an observed surface form. The
* trie is built through {@link org.egothor.stemmer.FrequencyTrie.Builder},
* reduced into a canonical immutable structure, and then queried through
* deterministic {@code get(String)}, {@code getAll(String)}, and
* {@code getEntries(String)} operations.
* </p>
*
* <p>
* Patch commands are produced and interpreted by
* {@link org.egothor.stemmer.PatchCommandEncoder}. The encoder follows the
* historical Egothor convention in which edit instructions are serialized for
* application from the end of the source word toward its beginning. The
* implementation supports canonical no-operation patches for identity
* transformations and compact commands for insertion, deletion, replacement,
* and suffix-preserving transitions.
* </p>
*
* <p>
* Dictionary loading is provided by
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}, which reads the
* traditional line-oriented stemmer resource format in which each non-empty
* logical line starts with a canonical stem followed by known surface variants.
* Parsing is delegated to {@link org.egothor.stemmer.StemmerDictionaryParser},
* which normalizes input to lower case using {@link java.util.Locale#ROOT} and
* supports whole-line as well as trailing remarks introduced by {@code #} or
* {@code //}. During loading, each variant is converted into a patch command
* targeting the canonical stem, and the stem itself may optionally be stored
* under the canonical no-operation patch.
* </p>
*
* <p>
* Trie compilation behavior is controlled by
* {@link org.egothor.stemmer.ReductionMode} and
* {@link org.egothor.stemmer.ReductionSettings}. These types define how
* semantically equivalent subtrees may be merged during compilation in order to
* reduce the size of the final immutable trie while preserving the intended
* lookup semantics. Depending on the selected mode, reduction may preserve full
* ranked {@code getAll()} semantics, unordered value equivalence, or dominant
* {@code get()} semantics subject to configurable dominance thresholds.
* </p>
*
* <p>
* Persisted compiled tries are supported through
* {@link org.egothor.stemmer.StemmerPatchTrieBinaryIO} and the corresponding
* binary loading and saving methods on
* {@link org.egothor.stemmer.StemmerPatchTrieLoader}. The persisted form wraps
* the native {@link org.egothor.stemmer.FrequencyTrie} binary format in GZip
* compression and is intended for efficient deployment and runtime loading.
* Reconstructing a writable builder from an already compiled trie is supported
* by {@link org.egothor.stemmer.FrequencyTrieBuilders}.
* </p>
*
* <p>
* For offline preparation of deployment artifacts, the package also provides
* the {@link org.egothor.stemmer.Compile} command-line utility, which reads a
* dictionary source, applies the configured reduction strategy, and writes the
* resulting compressed binary trie.
* </p>
*
* <p>
* The package is designed for deterministic behavior, compact persisted
* representation, and efficient runtime lookup. Public APIs are intentionally
* focused on immutable compiled structures for read paths, with separate
* explicit builder-oriented entry points for mutation and reconstruction.
* </p>
*/
package org.egothor.stemmer;

View File

@@ -0,0 +1,83 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Objects;
/**
 * Descriptor of one outgoing edge: the edge character paired with the
 * signature of the child subtree. Two descriptors are equal when both the edge
 * and the child signature are equal.
 *
 * @param <V> value type
 */
/* default */ final class ChildDescriptor<V> {

    /**
     * Edge character.
     */
    private final char edge;

    /**
     * Child subtree signature.
     */
    private final ReductionSignature<V> childSignature;

    /**
     * Creates a child descriptor.
     *
     * @param edge           edge character
     * @param childSignature child signature
     */
    /* default */ ChildDescriptor(final char edge, final ReductionSignature<V> childSignature) {
        this.edge = edge;
        this.childSignature = childSignature;
    }

    @Override
    public int hashCode() {
        // Same value as Objects.hash(edge, childSignature), written out without boxing.
        return 31 * (31 + Character.hashCode(this.edge)) + Objects.hashCode(this.childSignature);
    }

    @Override
    public boolean equals(final Object other) {
        if (this == other) {
            return true;
        }
        return other instanceof ChildDescriptor<?> that
                && this.edge == that.edge
                && Objects.equals(this.childSignature, that.childSignature);
    }
}

View File

@@ -0,0 +1,68 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Arrays;
/**
 * Immutable compiled trie node optimized for read access.
 *
 * <p>
 * The component arrays are the node's own backing storage; the accessors hand
 * them out without copying so that closely related trie infrastructure can
 * read them efficiently. Callers must never modify them.
 *
 * <p>
 * {@link #findChild(char)} binary-searches {@code edgeLabels} and uses the hit
 * position as an index into {@code children}, so {@code edgeLabels} has to be
 * sorted in ascending order and aligned with {@code children}.
 *
 * @param <V>           value type
 * @param edgeLabels    internal edge label array
 * @param children      internal child array
 * @param orderedValues internal ordered values array
 * @param orderedCounts internal ordered counts array
 */
public record CompiledNode<V>(char[] edgeLabels, CompiledNode<V>[] children, V[] orderedValues, int... orderedCounts) {

    /**
     * Looks up the child reached through the supplied edge character.
     *
     * @param edge edge character
     * @return child node, or {@code null} if absent
     */
    public CompiledNode<V> findChild(final char edge) {
        final int position = Arrays.binarySearch(this.edgeLabels, edge);
        return position >= 0 ? this.children[position] : null;
    }
}

View File

@@ -0,0 +1,76 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Objects;
/**
* Local descriptor preserving dominant {@code get()} semantics.
*
* @param <V> value type
*/
/* default */ final class DominantLocalDescriptor<V> {
/**
* Dominant value.
*/
private final V dominantValue;
/**
* Creates a descriptor.
*
* @param dominantValue dominant value
*/
/* default */ DominantLocalDescriptor(final V dominantValue) {
this.dominantValue = dominantValue;
}
@Override
public int hashCode() {
return Objects.hashCode(this.dominantValue);
}
@Override
public boolean equals(final Object other) {
if (this == other) {
return true;
}
if (!(other instanceof DominantLocalDescriptor<?>)) {
return false;
}
final DominantLocalDescriptor<?> that = (DominantLocalDescriptor<?>) other;
return Objects.equals(this.dominantValue, that.dominantValue);
}
}

View File

@@ -0,0 +1,201 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.IntFunction;
import org.egothor.stemmer.ReductionSettings;
/**
 * Summary of the terminal values stored locally at one node: the values ranked
 * by frequency, their counts, and the figures needed to decide whether the
 * top-ranked value dominates.
 *
 * @param <V> value type
 */
public final class LocalValueSummary<V> {

    /**
     * Locally stored values ordered by descending frequency.
     */
    private final V[] orderedValues;

    /**
     * Frequencies aligned with {@link #orderedValues}.
     */
    private final int[] orderedCounts;

    /**
     * Total local frequency.
     */
    private final int totalCount;

    /**
     * Winning value, or {@code null} if the node has no local value.
     */
    /* default */ final V dominantValue;

    /**
     * Winning value frequency.
     */
    private final int dominantCount;

    /**
     * Second best value frequency.
     */
    private final int secondCount;

    /**
     * Creates a summary from precomputed parts. The arrays are stored as given,
     * without copying.
     *
     * @param orderedValues ordered values
     * @param orderedCounts ordered counts
     * @param totalCount    total count
     * @param dominantValue dominant value
     * @param dominantCount dominant count
     * @param secondCount   second count
     */
    public LocalValueSummary(final V[] orderedValues, final int[] orderedCounts, final int totalCount,
            final V dominantValue, final int dominantCount, final int secondCount) {
        this.orderedValues = orderedValues;
        this.orderedCounts = orderedCounts;
        this.totalCount = totalCount;
        this.dominantValue = dominantValue;
        this.dominantCount = dominantCount;
        this.secondCount = secondCount;
    }

    /**
     * Builds a summary from local counts.
     *
     * <p>
     * Values are ranked by descending count. Ties are broken by the shorter
     * textual form, then by lexical order of the textual form, and finally by
     * the iteration order of {@code counts}.
     * </p>
     *
     * @param counts       local counts
     * @param arrayFactory array factory
     * @param <V>          value type
     * @return summary
     */
    public static <V> LocalValueSummary<V> of(final Map<V, Integer> counts, final IntFunction<V[]> arrayFactory) {
        final List<SortableValue<V>> ranked = new ArrayList<>(counts.size());
        for (Map.Entry<V, Integer> entry : counts.entrySet()) {
            final V value = entry.getKey();
            // The list size before the add is the entry's position in iteration order.
            ranked.add(new SortableValue<>(value, entry.getValue(), String.valueOf(value), ranked.size()));
        }
        ranked.sort(LocalValueSummary::compareByRank);

        final V[] values = arrayFactory.apply(ranked.size());
        final int[] frequencies = new int[ranked.size()];
        int sum = 0;
        int slot = 0;
        for (SortableValue<V> candidate : ranked) {
            values[slot] = candidate.value();
            frequencies[slot] = candidate.count();
            sum += frequencies[slot];
            slot++;
        }

        final V winner = values.length == 0 ? null : values[0];
        final int winnerCount = frequencies.length == 0 ? 0 : frequencies[0];
        final int runnerUpCount = frequencies.length < 2 ? 0 : frequencies[1];
        return new LocalValueSummary<>(values, frequencies, sum, winner, winnerCount, runnerUpCount);
    }

    /**
     * Ranking order used by {@link #of(Map, IntFunction)}: higher count first,
     * then shorter text, then lexical text order, then earlier insertion.
     *
     * @param left  first candidate
     * @param right second candidate
     * @param <V>   value type
     * @return negative, zero, or positive as {@code left} ranks before, equal
     *         to, or after {@code right}
     */
    private static <V> int compareByRank(final SortableValue<V> left, final SortableValue<V> right) {
        int outcome = Integer.compare(right.count(), left.count());
        if (outcome == 0) {
            outcome = Integer.compare(left.textLength(), right.textLength());
        }
        if (outcome == 0) {
            outcome = left.text().compareTo(right.text());
        }
        if (outcome == 0) {
            outcome = Integer.compare(left.insertionOrder(), right.insertionOrder());
        }
        return outcome;
    }

    /**
     * Returns the ranked values. The internal array is returned, not a copy.
     *
     * @return ordered values
     */
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public V[] orderedValues() {
        return this.orderedValues;
    }

    /**
     * Returns the counts aligned with the ranked values. The internal array is
     * returned, not a copy.
     *
     * @return ordered counts
     */
    @SuppressWarnings("PMD.MethodReturnsInternalArray")
    public int[] orderedCounts() {
        return this.orderedCounts;
    }

    /**
     * Indicates whether the dominant value satisfies the configured dominance
     * constraints: its share of the total count must reach the minimum
     * percentage, and its count must be at least the configured multiple of the
     * second-best count. The multiple check passes when the second-best count
     * is zero.
     *
     * @param settings reduction settings
     * @return {@code true} if dominant, otherwise {@code false}
     */
    /* default */ boolean hasQualifiedDominantWinner(final ReductionSettings settings) {
        if (this.dominantValue == null) {
            return false;
        }
        final int minPercent = settings.dominantWinnerMinPercent();
        final int overSecondRatio = settings.dominantWinnerOverSecondRatio();
        // Arithmetic is done in long so the products cannot overflow int.
        final boolean meetsShare = this.dominantCount * 100L >= (long) this.totalCount * minPercent;
        final boolean beatsRunnerUp = this.secondCount == 0
                || this.dominantCount >= (long) this.secondCount * overSecondRatio;
        return meetsShare && beatsRunnerUp;
    }
}

View File

@@ -0,0 +1,95 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Mutable build-time node.
 *
 * <p>
 * Both accessors return the node's own maps rather than copies, so changes
 * made through them alter the node. They are intended only for closely related
 * trie-building infrastructure.
 *
 * @param <V> value type
 */
public final class MutableNode<V> {

    /**
     * Child nodes indexed by transition character, kept in insertion order.
     */
    private final Map<Character, MutableNode<V>> children = new LinkedHashMap<>();

    /**
     * Local terminal value counts stored exactly at this node, kept in
     * insertion order.
     */
    private final Map<V, Integer> valueCounts = new LinkedHashMap<>();

    /**
     * Creates an empty node.
     */
    public MutableNode() {
        // Both maps are created by the field initializers.
    }

    /**
     * Returns the internal child-node map indexed by transition character.
     *
     * <p>
     * The map is the node's live backing state, not a snapshot.
     *
     * @return internal child-node map
     */
    public Map<Character, MutableNode<V>> children() {
        return this.children;
    }

    /**
     * Returns the internal local terminal value-count map.
     *
     * <p>
     * The map is the node's live backing state, not a snapshot.
     *
     * @return internal local value-count map
     */
    public Map<V, Integer> valueCounts() {
        return this.valueCounts;
    }
}

View File

@@ -0,0 +1,54 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
/**
 * Intermediate node data used during deserialization before child references
 * are resolved.
 *
 * <p>
 * At this stage children are referred to by integer identifiers
 * ({@code childNodeIds}) rather than by node references.
 * NOTE(review): {@code childNodeIds} is presumably aligned index-by-index with
 * {@code edgeLabels}, and {@code orderedCounts} with {@code orderedValues};
 * confirm against the deserializer.
 *
 * <p>
 * The arrays exposed by the accessors are the internal backing storage of this
 * holder. They are returned directly for efficiency and therefore must be
 * treated as read-only by callers. Because the components are arrays, the
 * record-generated {@code equals} and {@code hashCode} compare them by
 * reference, not by content.
 *
 * @param <V>           value type
 * @param edgeLabels    edge labels
 * @param childNodeIds  child node identifiers
 * @param orderedValues ordered values
 * @param orderedCounts ordered counts
 */
public record NodeData<V>(char[] edgeLabels, int[] childNodeIds, V[] orderedValues, int... orderedCounts) {
}

View File

@@ -0,0 +1,88 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * Local descriptor that keeps node values in ranked order, so that two
 * descriptors are equal only when their {@code getAll()} results match element
 * by element.
 */
/* default */ final class RankedLocalDescriptor {

    /**
     * Unmodifiable snapshot of the ranked values.
     */
    private final List<Object> orderedValues;

    /**
     * Wraps an already prepared value list.
     *
     * @param orderedValues ranked values backing this descriptor
     */
    private RankedLocalDescriptor(final List<Object> orderedValues) {
        this.orderedValues = orderedValues;
    }

    /**
     * Builds a descriptor from a ranked value array.
     *
     * <p>
     * The array is copied before it is wrapped, so later writes to the caller's
     * array do not change the descriptor.
     *
     * @param orderedValues ranked values
     * @return descriptor holding a private copy of the values
     */
    @SuppressWarnings("PMD.UseVarargs")
    /* default */ static RankedLocalDescriptor of(final Object[] orderedValues) {
        final Object[] snapshot = orderedValues.clone();
        final List<Object> rankedView = Collections.unmodifiableList(Arrays.asList(snapshot));
        return new RankedLocalDescriptor(rankedView);
    }

    @Override
    public boolean equals(final Object other) {
        // Order matters: equality is delegated to List.equals on the snapshots.
        return this == other
                || (other instanceof RankedLocalDescriptor that && this.orderedValues.equals(that.orderedValues));
    }

    @Override
    public int hashCode() {
        return this.orderedValues.hashCode();
    }
}

View File

@@ -0,0 +1,154 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.LinkedHashMap;
import java.util.Map;
/**
 * Canonical node produced when equivalent subtrees are merged during
 * reduction.
 *
 * <p>
 * The map accessors hand out the live internal maps rather than copies. This
 * is deliberate and meant only for the closely related trie-reduction
 * infrastructure.
 *
 * @param <V> value type
 */
public final class ReducedNode<V> {

    /**
     * Signature identifying this canonical node.
     */
    private final ReductionSignature<V> signature;

    /**
     * Local value counts accumulated from every merged node.
     */
    private final Map<V, Integer> localCounts;

    /**
     * Canonical child nodes keyed by edge character.
     */
    private final Map<Character, ReducedNode<V>> children;

    /**
     * Creates a reduced node. Both maps are copied into insertion-ordered maps
     * owned by the new instance.
     *
     * @param signature   reduction signature
     * @param localCounts initial local counts
     * @param children    initial canonical children
     */
    public ReducedNode(final ReductionSignature<V> signature, final Map<V, Integer> localCounts,
            final Map<Character, ReducedNode<V>> children) {
        this.signature = signature;
        this.localCounts = new LinkedHashMap<>(localCounts);
        this.children = new LinkedHashMap<>(children);
    }

    /**
     * Returns the reduction signature of this canonical node.
     *
     * @return reduction signature
     */
    public ReductionSignature<V> signature() {
        return this.signature;
    }

    /**
     * Returns the live map of aggregated local value counts.
     *
     * <p>
     * The result is the internal backing map, not a copy; changes made through
     * it are visible to this node.
     *
     * @return internal aggregated local value-count map
     */
    public Map<V, Integer> localCounts() {
        return this.localCounts;
    }

    /**
     * Returns the live map of canonical children keyed by transition character.
     *
     * <p>
     * The result is the internal backing map, not a copy; changes made through
     * it are visible to this node.
     *
     * @return internal canonical child map
     */
    public Map<Character, ReducedNode<V>> children() {
        return this.children;
    }

    /**
     * Adds the given counts to the counts already held by this node. Values not
     * seen before are inserted with the supplied count.
     *
     * @param additionalCounts counts to add
     */
    public void mergeLocalCounts(final Map<V, Integer> additionalCounts) {
        for (Map.Entry<V, Integer> addition : additionalCounts.entrySet()) {
            final V value = addition.getKey();
            final Integer known = this.localCounts.get(value);
            // Both branches are Integer-typed so a first-seen count is stored as supplied.
            final Integer merged = known == null ? addition.getValue()
                    : Integer.valueOf(known + addition.getValue());
            this.localCounts.put(value, merged);
        }
    }

    /**
     * Merges child references into this node.
     *
     * <p>
     * Nodes sharing a reduction signature are expected to have compatible child
     * edges, so an edge that is already present must point at the very same
     * canonical instance. Edges not present yet are simply recorded.
     *
     * @param additionalChildren children to merge
     * @throws IllegalStateException if an edge is already bound to a different
     *                               canonical instance
     */
    public void mergeChildren(final Map<Character, ReducedNode<V>> additionalChildren) {
        for (Map.Entry<Character, ReducedNode<V>> edge : additionalChildren.entrySet()) {
            final ReducedNode<V> candidate = edge.getValue();
            final ReducedNode<V> bound = this.children.putIfAbsent(edge.getKey(), candidate);
            if (bound != null && bound != candidate) { // NOPMD - we have canonical instances
                throw new IllegalStateException("Incompatible canonical child encountered during reduction.");
            }
        }
    }
}

View File

@@ -0,0 +1,106 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.LinkedHashMap;
import java.util.Map;
import org.egothor.stemmer.ReductionSettings;
/**
 * Shared state for one reduction pass: the active settings plus the registry
 * of canonical nodes keyed by their reduction signature.
 *
 * @param <V> value type
 */
public final class ReductionContext<V> {

    /**
     * Canonical nodes registered so far, keyed by signature in registration
     * order.
     */
    private final Map<ReductionSignature<V>, ReducedNode<V>> canonicalNodes = new LinkedHashMap<>();

    /**
     * Settings governing this reduction pass.
     */
    private final ReductionSettings settings;

    /**
     * Creates a context with an empty canonical-node registry.
     *
     * @param settings settings
     */
    public ReductionContext(final ReductionSettings settings) {
        this.settings = settings;
    }

    /**
     * Returns the settings supplied at construction.
     *
     * @return settings
     */
    public ReductionSettings settings() {
        return this.settings;
    }

    /**
     * Finds the canonical node registered for a signature.
     *
     * @param signature signature
     * @return canonical node, or {@code null} if absent
     */
    public ReducedNode<V> lookup(final ReductionSignature<V> signature) {
        return this.canonicalNodes.get(signature);
    }

    /**
     * Registers a canonical node under a signature, replacing any node
     * previously registered under an equal signature.
     *
     * @param signature signature
     * @param node      node
     */
    public void register(final ReductionSignature<V> signature, final ReducedNode<V> node) {
        this.canonicalNodes.put(signature, node);
    }

    /**
     * Returns how many canonical nodes are currently registered.
     *
     * @return canonical node count
     */
    public int canonicalNodeCount() {
        return this.canonicalNodes.size();
    }
}

View File

@@ -0,0 +1,127 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.egothor.stemmer.ReductionSettings;
/**
 * Immutable signature of a whole subtree: a descriptor of the node's local
 * values combined with the descriptors of all child edges.
 *
 * @param <V> value type
 */
public final class ReductionSignature<V> {

    /**
     * Descriptor of the local values; its concrete type depends on the
     * reduction mode.
     */
    private final Object localDescriptor;

    /**
     * Child edge descriptors, sorted by edge character.
     */
    private final List<ChildDescriptor<V>> childDescriptors;

    /**
     * Creates a signature from prepared parts.
     *
     * @param localDescriptor  local descriptor
     * @param childDescriptors child descriptors
     */
    private ReductionSignature(final Object localDescriptor, final List<ChildDescriptor<V>> childDescriptors) {
        this.localDescriptor = localDescriptor;
        this.childDescriptors = childDescriptors;
    }

    /**
     * Creates a subtree signature according to the selected reduction mode.
     *
     * @param localSummary local value summary
     * @param children     reduced children
     * @param settings     reduction settings
     * @param <V>          value type
     * @return subtree signature
     */
    public static <V> ReductionSignature<V> create(final LocalValueSummary<V> localSummary,
            final Map<Character, ReducedNode<V>> children, final ReductionSettings settings) {
        final Object local = describeLocalValues(localSummary, settings);
        final List<ChildDescriptor<V>> edges = describeChildren(children);
        return new ReductionSignature<>(local, edges);
    }

    /**
     * Chooses the local descriptor matching the configured reduction mode.
     *
     * @param localSummary local value summary
     * @param settings     reduction settings
     * @param <V>          value type
     * @return ranked, unordered, or dominant local descriptor
     */
    private static <V> Object describeLocalValues(final LocalValueSummary<V> localSummary,
            final ReductionSettings settings) {
        return switch (settings.reductionMode()) {
            case MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS ->
                UnorderedLocalDescriptor.of(localSummary.orderedValues());
            case MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS ->
                RankedLocalDescriptor.of(localSummary.orderedValues());
            case MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS -> {
                // Without a qualified dominant winner the ranked descriptor is used instead.
                if (!localSummary.hasQualifiedDominantWinner(settings)) {
                    yield RankedLocalDescriptor.of(localSummary.orderedValues());
                }
                // NOTE(review): dominantValue is read as a field while orderedValues() is an
                // accessor call - confirm this asymmetry is intended.
                yield new DominantLocalDescriptor<>(localSummary.dominantValue);
            }
        };
    }

    /**
     * Builds the child descriptors in ascending edge-character order.
     *
     * @param children reduced children
     * @param <V>      value type
     * @return unmodifiable, edge-sorted child descriptor list
     */
    private static <V> List<ChildDescriptor<V>> describeChildren(final Map<Character, ReducedNode<V>> children) {
        final List<Map.Entry<Character, ReducedNode<V>>> sortedEdges = new ArrayList<>(children.entrySet());
        sortedEdges.sort(Map.Entry.comparingByKey());
        final List<ChildDescriptor<V>> descriptors = new ArrayList<>(sortedEdges.size());
        for (Map.Entry<Character, ReducedNode<V>> edge : sortedEdges) {
            final ReducedNode<V> child = edge.getValue();
            descriptors.add(new ChildDescriptor<>(edge.getKey(), child.signature()));
        }
        return Collections.unmodifiableList(descriptors);
    }

    @Override
    public boolean equals(final Object other) {
        return this == other || (other instanceof ReductionSignature<?> that
                && Objects.equals(this.localDescriptor, that.localDescriptor)
                && Objects.equals(this.childDescriptors, that.childDescriptors));
    }

    @Override
    public int hashCode() {
        return Objects.hash(this.localDescriptor, this.childDescriptors);
    }
}

View File

@@ -0,0 +1,55 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
/**
 * Local value entry bundling everything needed to order values
 * deterministically.
 *
 * @param <V>            value type
 * @param value          stored value
 * @param count          local frequency
 * @param text           textual representation
 * @param insertionOrder first-seen insertion order
 */
record SortableValue<V>(V value, int count, String text, int insertionOrder) {

    /**
     * Reports how many characters the textual representation has.
     *
     * @return textual representation length
     */
    /* default */ int textLength() {
        final String representation = text();
        return representation.length();
    }
}

View File

@@ -0,0 +1,90 @@
/*******************************************************************************
* Copyright (C) 2026, Leo Galambos
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. All advertising materials mentioning features or use of this software must
* display the following acknowledgement:
* This product includes software developed by the Egothor project.
*
* 4. Neither the name of the copyright holder nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
******************************************************************************/
package org.egothor.stemmer.trie;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
 * Local descriptor that records only which values are present, ignoring both
 * order and duplicates, so that it reflects unordered {@code getAll()}
 * membership.
 */
/* default */ final class UnorderedLocalDescriptor {

    /**
     * Unmodifiable set of the distinct values.
     */
    private final Set<Object> distinctValues;

    /**
     * Wraps an already prepared value set.
     *
     * @param distinctValues distinct values backing this descriptor
     */
    private UnorderedLocalDescriptor(final Set<Object> distinctValues) {
        this.distinctValues = distinctValues;
    }

    /**
     * Builds a descriptor from an ordered value array; ordering and repeated
     * elements are discarded.
     *
     * @param orderedValues ordered values
     * @return descriptor holding the distinct values
     */
    @SuppressWarnings("PMD.UseVarargs")
    /* default */ static UnorderedLocalDescriptor of(final Object[] orderedValues) {
        final Set<Object> members = new HashSet<>(Arrays.asList(orderedValues));
        return new UnorderedLocalDescriptor(Collections.unmodifiableSet(members));
    }

    @Override
    public boolean equals(final Object other) {
        // Membership only: equality is delegated to Set.equals.
        return this == other
                || (other instanceof UnorderedLocalDescriptor that && this.distinctValues.equals(that.distinctValues));
    }

    @Override
    public int hashCode() {
        return this.distinctValues.hashCode();
    }
}

View File

@@ -0,0 +1,74 @@
/**
* Provides internal trie infrastructure used by
* {@link org.egothor.stemmer.FrequencyTrie} compilation, reduction,
* canonicalization, and binary reconstruction.
*
* <p>
* This subpackage contains the implementation-level data structures that
* support transformation of mutable build-time trie content into a compact
* immutable compiled representation. The types in this package are primarily
* intended for cooperation within the stemming implementation and are not
* designed as a general-purpose public extension surface.
* </p>
*
* <p>
* Trie construction begins with mutable nodes represented by
* {@link org.egothor.stemmer.trie.MutableNode}, which store child transitions
* and local terminal value frequencies in insertion-preserving maps. Local node
* value distributions are analyzed through
* {@link org.egothor.stemmer.trie.LocalValueSummary}, which derives the
* deterministically ordered local values, aligned counts, total local
* frequency, and dominant-value metadata required by reduction logic.
* Deterministic local ordering is supported by
* {@link org.egothor.stemmer.trie.SortableValue}.
* </p>
*
* <p>
* Subtree reduction is driven by
* {@link org.egothor.stemmer.trie.ReductionSignature}, which captures the
* semantic identity of a full subtree under the active reduction strategy.
* Depending on the selected reduction settings, local subtree semantics are
* represented by ranked, unordered, or dominant-value descriptors via
* {@link org.egothor.stemmer.trie.RankedLocalDescriptor},
* {@link org.egothor.stemmer.trie.UnorderedLocalDescriptor}, and
* {@link org.egothor.stemmer.trie.DominantLocalDescriptor}. Child structure is
* incorporated into the signature through
* {@link org.egothor.stemmer.trie.ChildDescriptor}, ensuring that canonical
* equivalence covers both local node content and all reachable descendants.
* </p>
*
* <p>
* Canonicalization of semantically equivalent subtrees is coordinated by
* {@link org.egothor.stemmer.trie.ReductionContext}, which maintains the
* signature-to-node mapping for canonical reduced nodes. Canonical merged
* subtrees are represented by {@link org.egothor.stemmer.trie.ReducedNode},
* whose aggregated local counts and canonical child references serve as the
* intermediate form between mutable construction and immutable freezing.
* </p>
*
* <p>
* The final read-optimized structure is represented by
* {@link org.egothor.stemmer.trie.CompiledNode}. Compiled nodes expose compact
* aligned arrays of sorted edge labels, child references, ordered values, and
* ordered counts for efficient lookup and serialization. During binary
* deserialization, unresolved intermediate payload is carried in
* {@link org.egothor.stemmer.trie.NodeData} until canonical node references are
* re-linked into the final compiled form.
* </p>
*
* <p>
* Several accessors in this subpackage intentionally expose internal mutable or
* array-backed state directly in order to avoid unnecessary copying on
* performance-sensitive internal paths. Such APIs are intended strictly for
* tightly related trie infrastructure within the implementation and must be
* treated as internal-use contracts.
* </p>
*
* <p>
* In summary, this subpackage contains the internal semantic model and storage
* forms that allow the stemming implementation to move efficiently between
* build-time mutation, reduction-time canonical equivalence, and runtime
* immutable lookup.
* </p>
*/
package org.egothor.stemmer.trie;

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,353 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
* Unit tests for {@link Compile}.
*
* <p>
* The suite verifies command-line orchestration, argument validation, overwrite
* semantics, help output, processing failures, and successful compilation into
* a compressed binary trie artifact.
* </p>
*
* <p>
* The tests target the package-visible {@link Compile#run(String...)} method so
* that the CLI logic can be exercised without triggering
* {@link System#exit(int)}.
* </p>
*/
@Tag("unit")
@Tag("cli")
@DisplayName("Compile")
class CompileTest {
/**
 * Temporary directory supplied by JUnit through {@link TempDir}. The tests in
 * this class resolve their compiled output paths against it.
 */
@TempDir
Path temporaryDirectory;
/**
 * Invokes the no-argument {@link Compile} constructor reflectively and checks
 * that it fails with an {@link AssertionError} carrying the expected message.
 */
@Test
@DisplayName("should reject utility class instantiation")
void shouldRejectUtilityClassInstantiation() throws Exception {
    final Constructor<Compile> hiddenConstructor = Compile.class.getDeclaredConstructor();
    hiddenConstructor.setAccessible(true);

    // Reflection wraps whatever the constructor throws in InvocationTargetException.
    final InvocationTargetException wrapped = assertThrows(InvocationTargetException.class,
            () -> hiddenConstructor.newInstance());

    assertAll(
            () -> assertNotNull(wrapped.getCause(), "The root cause must be present."),
            () -> assertEquals(AssertionError.class, wrapped.getCause().getClass(),
                    "The utility constructor must fail with AssertionError."),
            () -> assertEquals("No instances.", wrapped.getCause().getMessage(),
                    "The utility constructor must expose the expected diagnostic message."));
}
/**
 * Runs the CLI with {@code --help} and checks for exit code 0 and a usage text
 * on standard error that lists the input, output, and reduction-mode options.
 */
@Test
@DisplayName("should return success and print usage when help is requested")
void shouldReturnSuccessAndPrintUsageWhenHelpIsRequested() {
    final CommandResult helpRun = runWithCapturedStandardError("--help");

    assertAll(
            () -> assertEquals(0, helpRun.exitCode(), "Help must terminate successfully."),
            () -> assertTrue(helpRun.standardError().contains("Usage:"),
                    "Help output must contain the usage header."),
            () -> assertTrue(helpRun.standardError().contains("--input <file>"),
                    "Help output must describe the input option."),
            () -> assertTrue(helpRun.standardError().contains("--output <file>"),
                    "Help output must describe the output option."),
            () -> assertTrue(helpRun.standardError().contains("--reduction-mode <mode>"),
                    "Help output must describe the reduction mode option."));
}
/**
 * Compiles a minimal dictionary and checks for exit code 0 and an existing,
 * non-empty output file.
 */
@Test
@DisplayName("should compile minimal dictionary into non-empty output file")
void shouldCompileMinimalDictionaryIntoNonEmptyOutputFile() throws Exception {
    final Path dictionary = createMinimalDictionaryFile("minimal-dictionary.txt");
    final Path compiled = temporaryDirectory.resolve("compiled-trie.dat.gz");
    final String[] arguments = { "--input", dictionary.toString(), "--output", compiled.toString(),
            "--reduction-mode", validReductionModeName() };

    final int status = Compile.run(arguments);

    assertAll(
            () -> assertEquals(0, status, "Valid compilation must succeed."),
            () -> assertTrue(Files.exists(compiled), "Compilation must create the output file."),
            () -> assertTrue(Files.size(compiled) > 0L, "The written output file must not be empty."));
}
/**
 * Compiles a minimal dictionary with {@code --store-original} and checks for
 * exit code 0 and an existing, non-empty output file.
 */
@Test
@DisplayName("should compile successfully when store-original is enabled")
void shouldCompileSuccessfullyWhenStoreOriginalIsEnabled() throws Exception {
    final Path dictionary = createMinimalDictionaryFile("store-original-dictionary.txt");
    final Path compiled = temporaryDirectory.resolve("compiled-store-original.dat.gz");
    final String[] arguments = { "--input", dictionary.toString(), "--output", compiled.toString(),
            "--reduction-mode", validReductionModeName(), "--store-original" };

    final int status = Compile.run(arguments);

    assertAll(
            () -> assertEquals(0, status, "Compilation with store-original must succeed."),
            () -> assertTrue(Files.exists(compiled), "Compilation must create the output file."),
            () -> assertTrue(Files.size(compiled) > 0L, "The written output file must not be empty."));
}
/**
 * Creates the output file in advance, runs the CLI without {@code --overwrite},
 * and checks for exit code 1 with an overwrite-protection message on standard
 * error.
 */
@Test
@DisplayName("should fail with processing error when output exists and overwrite is not enabled")
void shouldFailWithProcessingErrorWhenOutputExistsAndOverwriteIsNotEnabled() throws Exception {
    final Path dictionary = createMinimalDictionaryFile("overwrite-protection-dictionary.txt");
    final Path occupiedTarget = temporaryDirectory.resolve("already-present.dat.gz");
    Files.writeString(occupiedTarget, "existing-content", StandardCharsets.UTF_8);

    final CommandResult failedRun = runWithCapturedStandardError("--input", dictionary.toString(), "--output",
            occupiedTarget.toString(), "--reduction-mode", validReductionModeName());

    assertAll(
            () -> assertEquals(1, failedRun.exitCode(),
                    "Existing output without overwrite must be reported as processing failure."),
            () -> assertTrue(failedRun.standardError().contains("Compilation failed:"),
                    "Processing failures must be reported to standard error."),
            () -> assertTrue(failedRun.standardError().contains("Output file already exists"),
                    "The failure reason must mention overwrite protection."));
}
/**
 * Creates a placeholder output file, runs the CLI with {@code --overwrite}, and
 * checks for exit code 0 and an output file that no longer contains the
 * placeholder text.
 */
@Test
@DisplayName("should overwrite existing output when overwrite is enabled")
void shouldOverwriteExistingOutputWhenOverwriteIsEnabled() throws Exception {
    final Path dictionary = createMinimalDictionaryFile("overwrite-enabled-dictionary.txt");
    final Path target = temporaryDirectory.resolve("overwrite-enabled.dat.gz");
    Files.writeString(target, "obsolete-content", StandardCharsets.UTF_8);
    final String[] arguments = { "--input", dictionary.toString(), "--output", target.toString(),
            "--reduction-mode", validReductionModeName(), "--overwrite" };

    final int status = Compile.run(arguments);

    assertAll(
            () -> assertEquals(0, status, "Overwrite-enabled compilation must succeed."),
            () -> assertTrue(Files.exists(target), "The output file must exist after overwrite."),
            () -> assertTrue(Files.size(target) > 0L, "The overwritten output file must not be empty."),
            // ISO-8859-1 maps every byte to a character, so the output can be searched as text.
            () -> assertFalse(Files.readString(target, StandardCharsets.ISO_8859_1).contains("obsolete-content"),
                    "The original placeholder content must be replaced by compiled binary output."));
}
@Nested
@DisplayName("argument validation")
class ArgumentValidationTest {
/**
 * Omits {@code --input} and checks for exit code 2, a diagnostic naming the
 * missing option, and the usage text.
 */
@Test
@DisplayName("should fail with usage error when input is missing")
void shouldFailWithUsageErrorWhenInputIsMissing() {
    final String outputPath = temporaryDirectory.resolve("out.dat.gz").toString();

    final CommandResult rejectedRun = runWithCapturedStandardError("--output", outputPath, "--reduction-mode",
            validReductionModeName());

    assertAll(
            () -> assertEquals(2, rejectedRun.exitCode(), "Missing input must be treated as usage error."),
            () -> assertTrue(rejectedRun.standardError().contains("--input"),
                    "The diagnostic message must identify the missing input argument."),
            () -> assertTrue(rejectedRun.standardError().contains("Usage:"),
                    "Usage help must be printed for invalid invocation."));
}
/**
 * Omits {@code --output} and checks for exit code 2, a diagnostic naming the
 * missing option, and the usage text.
 */
@Test
@DisplayName("should fail with usage error when output is missing")
void shouldFailWithUsageErrorWhenOutputIsMissing() throws Exception {
    final Path dictionary = createMinimalDictionaryFile("missing-output.txt");

    final CommandResult rejectedRun = runWithCapturedStandardError("--input", dictionary.toString(),
            "--reduction-mode", validReductionModeName());

    assertAll(
            () -> assertEquals(2, rejectedRun.exitCode(), "Missing output must be treated as usage error."),
            () -> assertTrue(rejectedRun.standardError().contains("--output"),
                    "The diagnostic message must identify the missing output argument."),
            () -> assertTrue(rejectedRun.standardError().contains("Usage:"),
                    "Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error when reduction mode is missing")
void shouldFailWithUsageErrorWhenReductionModeIsMissing() throws Exception {
final Path inputFile = createMinimalDictionaryFile("missing-mode.txt");
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
temporaryDirectory.resolve("out.dat.gz").toString());
assertAll(
() -> assertEquals(2, result.exitCode(), "Missing reduction mode must be treated as usage error."),
() -> assertTrue(result.standardError().contains("--reduction-mode"),
"The diagnostic message must identify the missing reduction mode."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error for unknown argument")
void shouldFailWithUsageErrorForUnknownArgument() {
final CommandResult result = runWithCapturedStandardError("--unknown-option");
assertAll(() -> assertEquals(2, result.exitCode(), "Unknown options must be treated as usage errors."),
() -> assertTrue(result.standardError().contains("Unknown argument: --unknown-option"),
"The diagnostic message must identify the unknown option."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error for invalid reduction mode")
void shouldFailWithUsageErrorForInvalidReductionMode() throws Exception {
final Path inputFile = createMinimalDictionaryFile("invalid-mode.txt");
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", "NOT_A_MODE");
assertAll(
() -> assertEquals(2, result.exitCode(),
"An unsupported reduction mode must be treated as usage error."),
() -> assertTrue(
result.standardError().contains("NOT_A_MODE")
|| result.standardError().contains("No enum constant"),
"The diagnostic message must expose the invalid reduction mode."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error for invalid dominant winner min percent")
void shouldFailWithUsageErrorForInvalidDominantWinnerMinPercent() throws Exception {
final Path inputFile = createMinimalDictionaryFile("invalid-min-percent.txt");
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", validReductionModeName(),
"--dominant-winner-min-percent", "invalid");
assertAll(
() -> assertEquals(2, result.exitCode(),
"A non-integer dominant winner min percent must be treated as usage error."),
() -> assertTrue(result.standardError().contains("--dominant-winner-min-percent"),
"The diagnostic message must identify the invalid numeric option."),
() -> assertTrue(result.standardError().contains("invalid"),
"The diagnostic message should include the invalid value."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error for invalid dominant winner over second ratio")
void shouldFailWithUsageErrorForInvalidDominantWinnerOverSecondRatio() throws Exception {
final Path inputFile = createMinimalDictionaryFile("invalid-ratio.txt");
final CommandResult result = runWithCapturedStandardError("--input", inputFile.toString(), "--output",
temporaryDirectory.resolve("out.dat.gz").toString(), "--reduction-mode", validReductionModeName(),
"--dominant-winner-over-second-ratio", "invalid");
assertAll(
() -> assertEquals(2, result.exitCode(),
"A non-integer dominant winner ratio must be treated as usage error."),
() -> assertTrue(result.standardError().contains("--dominant-winner-over-second-ratio"),
"The diagnostic message must identify the invalid numeric option."),
() -> assertTrue(result.standardError().contains("invalid"),
"The diagnostic message should include the invalid value."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
@Test
@DisplayName("should fail with usage error when option value is missing")
void shouldFailWithUsageErrorWhenOptionValueIsMissing() {
final CommandResult result = runWithCapturedStandardError("--input");
assertAll(
() -> assertEquals(2, result.exitCode(), "Missing option values must be treated as usage errors."),
() -> assertTrue(result.standardError().contains("Missing value for --input."),
"The diagnostic message must identify the missing option value."),
() -> assertTrue(result.standardError().contains("Usage:"),
"Usage help must be printed for invalid invocation."));
}
}
/**
 * Points {@code --input} at a path that was never created and expects exit code
 * 1, a {@code Compilation failed:} diagnostic on standard error, and no output
 * file.
 */
@Test
@DisplayName("should fail with processing error when input file does not exist")
void shouldFailWithProcessingErrorWhenInputFileDoesNotExist() {
    final Path absentDictionary = temporaryDirectory.resolve("missing-dictionary.txt");
    final Path target = temporaryDirectory.resolve("out.dat.gz");

    final CommandResult outcome = runWithCapturedStandardError("--input", absentDictionary.toString(), "--output",
            target.toString(), "--reduction-mode", validReductionModeName());
    final String stderr = outcome.standardError();

    assertAll(
            () -> assertEquals(1, outcome.exitCode(), "Missing input file must be reported as processing failure."),
            () -> assertTrue(stderr.contains("Compilation failed:"),
                    "Processing failures must be reported to standard error."),
            () -> assertFalse(Files.exists(target),
                    "The output file must not be created when the input file cannot be read."));
}
/**
 * Supplies a reduction mode name that the CLI is expected to accept, taken from
 * the first constant returned by {@link ReductionMode#values()}.
 *
 * @return name of the first declared reduction mode
 */
private static String validReductionModeName() {
    final ReductionMode[] declaredModes = ReductionMode.values();
    final ReductionMode firstDeclaredMode = declaredModes[0];
    return firstDeclaredMode.name();
}
/**
 * Writes a small dictionary fixture into the temporary directory: one comment
 * line followed by two lines of whitespace-separated words, encoded as UTF-8.
 *
 * @param fileName name of the file to create inside the temporary directory
 * @return path to the written file
 * @throws Exception if the file cannot be written
 */
private Path createMinimalDictionaryFile(final String fileName) throws Exception {
    // Every line, including the last, ends with a single '\n'.
    final String dictionaryText = """
            # minimal dictionary for CLI tests
            run running runs runner
            walk walking walks walked
            """;
    final Path dictionaryFile = temporaryDirectory.resolve(fileName);
    Files.writeString(dictionaryFile, dictionaryText, StandardCharsets.UTF_8);
    return dictionaryFile;
}
/**
 * Runs {@link Compile#run(String...)} with {@code System.err} temporarily
 * redirected into an in-memory buffer, then restores the previous stream.
 *
 * <p>
 * The redirection replaces the process-wide {@code System.err}; the previous
 * stream is reinstated in a {@code finally} block so it is restored even when
 * the command throws.
 *
 * @param arguments CLI arguments passed through unchanged
 * @return exit code together with the captured standard error decoded as UTF-8
 */
private static CommandResult runWithCapturedStandardError(final String... arguments) {
    final PrintStream previousErr = System.err;
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final int exitCode;
    try (PrintStream capturingErr = new PrintStream(buffer, true, StandardCharsets.UTF_8)) {
        System.setErr(capturingErr);
        exitCode = Compile.run(arguments);
        capturingErr.flush();
    } finally {
        System.setErr(previousErr);
    }
    return new CommandResult(exitCode, buffer.toString(StandardCharsets.UTF_8));
}
/**
 * Immutable result of one captured CLI execution, as produced by
 * {@code runWithCapturedStandardError}.
 *
 * <p>
 * The tests in this class expect exit code {@code 0} for a successful
 * compilation, {@code 1} for a processing failure, and {@code 2} for a usage
 * error.
 *
 * @param exitCode      value returned by {@code Compile.run}
 * @param standardError text written to {@code System.err} during the run,
 *                      decoded as UTF-8
 */
private record CommandResult(int exitCode, String standardError) {
// No additional members.
}
}

View File

@@ -0,0 +1,314 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertIterableEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
import java.util.function.IntFunction;

import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link FrequencyTrieBuilders}.
 *
 * <p>
 * The tested helper reconstructs a writable {@link FrequencyTrie.Builder} from
 * a compiled read-only {@link FrequencyTrie}. These tests verify that the
 * reconstructed builder preserves the observable compiled semantics of the
 * source trie, including local value counts, deterministic ordering, root-local
 * values, traversal across sibling branches, and the ability to continue
 * mutating the reconstructed builder before recompilation.
 */
@DisplayName("FrequencyTrieBuilders")
@Tag("unit")
@Tag("builder")
@Tag("frequency-trie")
class FrequencyTrieBuildersTest {

    /**
     * Shared array factory used by all tries in this test class.
     */
    private static final IntFunction<String[]> ARRAY_FACTORY = String[]::new;

    /**
     * Ranked reduction settings preserving deterministic {@code getAll()}
     * semantics.
     */
    private static final ReductionSettings RANKED_SETTINGS = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);

    /**
     * Verifies that the utility class constructor is intentionally inaccessible and
     * rejects instantiation attempts.
     *
     * @throws Exception if reflection unexpectedly fails
     */
    @Test
    @DisplayName("should reject instantiation of utility class")
    void shouldRejectInstantiationOfUtilityClass() throws Exception {
        final Constructor<FrequencyTrieBuilders> constructor = FrequencyTrieBuilders.class.getDeclaredConstructor();
        constructor.setAccessible(true);

        // Reflection wraps whatever the constructor throws in InvocationTargetException.
        final InvocationTargetException exception = assertThrows(InvocationTargetException.class,
                constructor::newInstance);

        assertAll(() -> assertEquals(AssertionError.class, exception.getCause().getClass()),
                () -> assertEquals("No instances.", exception.getCause().getMessage()));
    }

    /**
     * Verifies that reconstruction of an empty compiled trie yields an empty
     * writable builder whose compiled form remains observably empty.
     */
    @Test
    @DisplayName("should reconstruct empty trie")
    void shouldReconstructEmptyTrie() {
        final FrequencyTrie<String> original = rankedBuilder().build();

        final FrequencyTrie<String> reconstructed = reconstruct(original);

        assertTrieStateEquals(original, reconstructed, "");
        assertTrieStateEquals(original, reconstructed, "a");
        assertTrieStateEquals(original, reconstructed, "missing");
    }

    /**
     * Verifies that reconstruction preserves the observable compiled semantics for
     * a representative trie containing root-local values, multiple values on the
     * same node, and several independent branches.
     */
    @Test
    @DisplayName("should preserve get, getAll and getEntries after reconstruction")
    void shouldPreserveCompiledSemanticsAfterReconstruction() {
        final FrequencyTrie<String> original = createRepresentativeTrie();

        final FrequencyTrie<String> reconstructed = reconstruct(original);

        assertTrieStateEquals(original, reconstructed, "");
        assertTrieStateEquals(original, reconstructed, "a");
        assertTrieStateEquals(original, reconstructed, "ab");
        assertTrieStateEquals(original, reconstructed, "abc");
        assertTrieStateEquals(original, reconstructed, "abd");
        assertTrieStateEquals(original, reconstructed, "x");
        assertTrieStateEquals(original, reconstructed, "xy");
        assertTrieStateEquals(original, reconstructed, "missing");
    }

    /**
     * Verifies that values stored directly on the root node are reconstructed
     * exactly, including their counts and ranking order.
     */
    @Test
    @DisplayName("should preserve root-local values")
    void shouldPreserveRootLocalValues() {
        final FrequencyTrie.Builder<String> builder = rankedBuilder();
        builder.put("", "root-dominant", 4);
        builder.put("", "root-secondary", 2);
        builder.put("a", "child", 1);
        final FrequencyTrie<String> compiled = builder.build();

        final FrequencyTrie<String> reconstructed = reconstruct(compiled);

        assertAll(() -> assertEquals("root-dominant", reconstructed.get("")),
                () -> assertArrayEquals(new String[] { "root-dominant", "root-secondary" }, reconstructed.getAll("")),
                () -> assertIterableEquals(List.of(new ValueCount<String>("root-dominant", 4),
                        new ValueCount<String>("root-secondary", 2)), reconstructed.getEntries("")));
    }

    /**
     * Verifies that local counts are reconstructed exactly and that deterministic
     * ordering remains preserved after reconstruction.
     *
     * <p>
     * This scenario is important because the helper copies raw ordered values and
     * ordered counts from compiled nodes.
     */
    @Test
    @DisplayName("should preserve local counts and deterministic local ordering")
    void shouldPreserveLocalCountsAndOrdering() {
        final FrequencyTrie.Builder<String> builder = rankedBuilder();
        builder.put("node", "bbb", 2);
        builder.put("node", "aa", 2);
        builder.put("node", "c", 2);
        builder.put("node", "winner", 5);
        final FrequencyTrie<String> compiled = builder.build();

        final FrequencyTrie<String> reconstructed = reconstruct(compiled);

        // The three values tied at count 2 are expected shortest first: "c", "aa", "bbb".
        assertAll(() -> assertEquals("winner", reconstructed.get("node")),
                () -> assertArrayEquals(new String[] { "winner", "c", "aa", "bbb" }, reconstructed.getAll("node")),
                () -> assertIterableEquals(
                        List.of(new ValueCount<String>("winner", 5), new ValueCount<String>("c", 2),
                                new ValueCount<String>("aa", 2), new ValueCount<String>("bbb", 2)),
                        reconstructed.getEntries("node")));
    }

    /**
     * Verifies that recursive traversal correctly restores sibling branches sharing
     * a common prefix, which indirectly exercises the internal key-builder
     * backtracking logic used during node copying.
     */
    @Test
    @DisplayName("should preserve sibling branches under a shared prefix")
    void shouldPreserveSiblingBranchesUnderSharedPrefix() {
        final FrequencyTrie.Builder<String> builder = rankedBuilder();
        builder.put("car", "car", 4);
        builder.put("card", "card", 3);
        builder.put("care", "care", 2);
        builder.put("cat", "cat", 5);
        builder.put("dog", "dog", 1);
        final FrequencyTrie<String> compiled = builder.build();

        final FrequencyTrie<String> reconstructed = reconstruct(compiled);

        assertTrieStateEquals(compiled, reconstructed, "car");
        assertTrieStateEquals(compiled, reconstructed, "card");
        assertTrieStateEquals(compiled, reconstructed, "care");
        assertTrieStateEquals(compiled, reconstructed, "cat");
        assertTrieStateEquals(compiled, reconstructed, "dog");
        // "cab" was never inserted; both tries must agree on the missing key as well.
        assertTrieStateEquals(compiled, reconstructed, "cab");
    }

    /**
     * Verifies that the reconstructed builder can be further modified and that such
     * modifications do not affect the already compiled source trie.
     */
    @Test
    @DisplayName("should allow further modifications without affecting source trie")
    void shouldAllowFurtherModificationsWithoutAffectingSourceTrie() {
        final FrequencyTrie.Builder<String> originalBuilder = rankedBuilder();
        originalBuilder.put("walk", "Ra", 2);
        originalBuilder.put("walked", "Rb", 1);
        final FrequencyTrie<String> source = originalBuilder.build();

        final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(source, ARRAY_FACTORY,
                RANKED_SETTINGS);
        reconstructedBuilder.put("walk", "Rc", 4);
        reconstructedBuilder.put("walker", "Rd", 3);
        final FrequencyTrie<String> modified = reconstructedBuilder.build();

        assertAll(
                () -> assertIterableEquals(List.of(new ValueCount<String>("Ra", 2)), source.getEntries("walk"),
                        "Source trie must remain unchanged."),
                () -> assertNull(source.get("walker"), "Source trie must not gain newly inserted keys."),
                () -> assertEquals("Rc", modified.get("walk")),
                () -> assertIterableEquals(List.of(new ValueCount<String>("Rc", 4), new ValueCount<String>("Ra", 2)),
                        modified.getEntries("walk")),
                () -> assertEquals("Rd", modified.get("walker")),
                () -> assertIterableEquals(List.of(new ValueCount<String>("Rd", 3)), modified.getEntries("walker")),
                () -> assertIterableEquals(List.of(new ValueCount<String>("Rb", 1)), modified.getEntries("walked")));
    }

    /**
     * Verifies that reconstruction also works when only the reduction mode is
     * supplied and the helper internally derives default reduction settings.
     */
    @Test
    @DisplayName("should reconstruct builder when only reduction mode is supplied")
    void shouldReconstructUsingReductionModeShortcut() {
        final FrequencyTrie<String> original = createRepresentativeTrie();

        final FrequencyTrie.Builder<String> reconstructedBuilder = FrequencyTrieBuilders.copyOf(original, ARRAY_FACTORY,
                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        final FrequencyTrie<String> reconstructed = reconstructedBuilder.build();

        assertTrieStateEquals(original, reconstructed, "");
        assertTrieStateEquals(original, reconstructed, "ab");
        assertTrieStateEquals(original, reconstructed, "xy");
    }

    /**
     * Verifies the documented null-argument contract for both public reconstruction
     * entry points.
     */
    @Test
    @DisplayName("should reject null arguments")
    void shouldRejectNullArguments() {
        final FrequencyTrie<String> trie = createRepresentativeTrie();

        assertAll(
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(null, ARRAY_FACTORY, RANKED_SETTINGS)),
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(trie, null, RANKED_SETTINGS)),
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(trie, ARRAY_FACTORY, (ReductionSettings) null)),
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(null, ARRAY_FACTORY,
                                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS)),
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(trie, null,
                                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS)),
                () -> assertThrows(NullPointerException.class,
                        () -> FrequencyTrieBuilders.copyOf(trie, ARRAY_FACTORY, (ReductionMode) null)));
    }

    /**
     * Creates an empty writable builder configured with {@link #ARRAY_FACTORY} and
     * {@link #RANKED_SETTINGS}.
     *
     * @return new builder
     */
    private static FrequencyTrie.Builder<String> rankedBuilder() {
        return new FrequencyTrie.Builder<String>(ARRAY_FACTORY, RANKED_SETTINGS);
    }

    /**
     * Copies the supplied compiled trie into a writable builder through
     * {@link FrequencyTrieBuilders} using the ranked settings and compiles that
     * builder again without further modification.
     *
     * @param source compiled trie to copy
     * @return trie compiled from the reconstructed builder
     */
    private static FrequencyTrie<String> reconstruct(final FrequencyTrie<String> source) {
        return FrequencyTrieBuilders.copyOf(source, ARRAY_FACTORY, RANKED_SETTINGS).build();
    }

    /**
     * Creates a representative compiled trie used across multiple tests.
     *
     * @return compiled trie with several branches and ranked values
     */
    private static FrequencyTrie<String> createRepresentativeTrie() {
        final FrequencyTrie.Builder<String> builder = rankedBuilder();
        builder.put("", "root-main", 3);
        builder.put("", "root-alt", 1);
        builder.put("a", "A1", 2);
        builder.put("a", "A2", 1);
        builder.put("ab", "AB1", 5);
        builder.put("ab", "AB2", 2);
        builder.put("abc", "ABC", 4);
        builder.put("abd", "ABD", 3);
        builder.put("x", "X", 1);
        builder.put("xy", "XY1", 2);
        builder.put("xy", "XY2", 2);
        final FrequencyTrie<String> trie = builder.build();
        assertNotNull(trie);
        return trie;
    }

    /**
     * Asserts equality of the observable trie state for one key by comparing the
     * results of {@code get}, {@code getAll} and {@code getEntries}.
     *
     * @param expected expected trie
     * @param actual   actual trie
     * @param key      key to verify
     */
    private static void assertTrieStateEquals(final FrequencyTrie<String> expected, final FrequencyTrie<String> actual,
            final String key) {
        assertAll(
                () -> assertEquals(expected.get(key), actual.get(key),
                        "Unexpected get() result for key '" + key + "'."),
                () -> assertArrayEquals(expected.getAll(key), actual.getAll(key),
                        "Unexpected getAll() result for key '" + key + "'."),
                () -> assertIterableEquals(expected.getEntries(key), actual.getEntries(key),
                        "Unexpected getEntries() result for key '" + key + "'."));
    }
}

View File

@@ -0,0 +1,772 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.List;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
* Unit tests for {@link FrequencyTrie}.
*
* <p>
* The suite validates lookup semantics, deterministic value ordering, reduction
* behavior, counted insertion, and binary persistence. Tests intentionally
* verify both leaf and internal-node storage because the trie permits values at
* any node in the path.
*/
@Tag("unit")
@Tag("trie")
@Tag("frequency-trie")
@DisplayName("FrequencyTrie")
class FrequencyTrieTest {
/**
 * {@link String} value codec for the persistence tests. Values are written with
 * {@link DataOutputStream#writeUTF(String)} and read back with
 * {@link DataInputStream#readUTF()}.
 */
private static final FrequencyTrie.ValueStreamCodec<String> STRING_CODEC = new FrequencyTrie.ValueStreamCodec<String>() {

    @Override
    public void write(final DataOutputStream out, final String text) throws IOException {
        out.writeUTF(text);
    }

    @Override
    public String read(final DataInputStream in) throws IOException {
        return in.readUTF();
    }
};
/**
 * Creates an empty {@link String} builder configured with the ranked get-all
 * reduction mode.
 *
 * @return new builder
 */
private static FrequencyTrie.Builder<String> rankedBuilder() {
    final ReductionMode rankedMode = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
    return new FrequencyTrie.Builder<String>(String[]::new, rankedMode);
}
/**
 * Verifies that the builder constructors throw {@link NullPointerException} for
 * a {@code null} array factory, a {@code null} reduction mode, and {@code null}
 * reduction settings.
 */
@Test
@DisplayName("Builder rejects null constructor arguments")
void builderRejectsNullConstructorArguments() {
    final ReductionMode rankedMode = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;

    assertAll(
            () -> assertThrows(NullPointerException.class,
                    () -> new FrequencyTrie.Builder<String>(null, rankedMode)),
            () -> assertThrows(NullPointerException.class,
                    () -> new FrequencyTrie.Builder<String>(String[]::new, (ReductionMode) null)),
            () -> assertThrows(NullPointerException.class,
                    () -> new FrequencyTrie.Builder<String>(String[]::new, (ReductionSettings) null)));
}
/**
 * Verifies that both {@code put} variants throw {@link NullPointerException}
 * when either the key or the value is {@code null}.
 */
@Test
@DisplayName("Builder rejects null put arguments")
void builderRejectsNullPutArguments() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();

    assertAll(
            // put(key, value)
            () -> assertThrows(NullPointerException.class, () -> trieBuilder.put(null, "x")),
            () -> assertThrows(NullPointerException.class, () -> trieBuilder.put("x", null)),
            // put(key, value, count)
            () -> assertThrows(NullPointerException.class, () -> trieBuilder.put(null, "x", 1)),
            () -> assertThrows(NullPointerException.class, () -> trieBuilder.put("x", null, 1)));
}
/**
 * Verifies that counted insertion throws {@link IllegalArgumentException} for a
 * count of zero and for a negative count.
 */
@Test
@DisplayName("Builder rejects non-positive counted insertion")
void builderRejectsNonPositiveCountedInsertion() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();

    assertAll(
            () -> assertThrows(IllegalArgumentException.class, () -> trieBuilder.put("x", "v", 0)),
            () -> assertThrows(IllegalArgumentException.class, () -> trieBuilder.put("x", "v", -1)));
}
/**
 * Verifies that {@code get}, {@code getAll} and {@code getEntries} throw
 * {@link NullPointerException} for a {@code null} key.
 */
@Test
@DisplayName("Trie rejects null lookup keys")
void trieRejectsNullLookupKeys() {
    final FrequencyTrie<String> emptyTrie = rankedBuilder().build();

    assertAll(
            () -> assertThrows(NullPointerException.class, () -> emptyTrie.get(null)),
            () -> assertThrows(NullPointerException.class, () -> emptyTrie.getAll(null)),
            () -> assertThrows(NullPointerException.class, () -> emptyTrie.getEntries(null)));
}
/**
 * Verifies that a trie built without any insertion answers a lookup with
 * {@code null}, an empty array, and an empty entry list.
 */
@Test
@DisplayName("Empty trie returns null, empty array, and empty entries")
void emptyTrieReturnsNullEmptyArrayAndEmptyEntries() {
    final FrequencyTrie<String> emptyTrie = rankedBuilder().build();
    final String[] noValues = new String[0];

    assertAll(
            () -> assertNull(emptyTrie.get("missing")),
            () -> assertArrayEquals(noValues, emptyTrie.getAll("missing")),
            () -> assertEquals(List.of(), emptyTrie.getEntries("missing")));
}
/**
 * Verifies that values inserted under the empty key can be looked up under the
 * empty key, ranked by how often each value was inserted.
 */
@Test
@DisplayName("Empty key stores values at the root node")
void emptyKeyStoresValuesAtRootNode() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("", "root");
    trieBuilder.put("", "root");
    trieBuilder.put("", "alternate");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedValues = { "root", "alternate" };
    final List<ValueCount<String>> expectedEntries = List.of(new ValueCount<String>("root", 2),
            new ValueCount<String>("alternate", 1));

    assertAll(
            () -> assertEquals("root", compiled.get("")),
            () -> assertArrayEquals(expectedValues, compiled.getAll("")),
            () -> assertEquals(expectedEntries, compiled.getEntries("")));
}
/**
 * Verifies that a key which is a prefix of another key keeps its own values:
 * lookups for {@code "run"} and {@code "runner"} each return only what was
 * inserted under that exact key.
 */
@Test
@DisplayName("Internal-node values remain local to that node")
void internalNodeValuesRemainLocalToThatNode() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("run", "verb");
    trieBuilder.put("run", "verb");
    trieBuilder.put("run", "noun");
    trieBuilder.put("runner", "noun");
    trieBuilder.put("runner", "agent");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedForRun = { "verb", "noun" };
    final String[] expectedForRunner = { "noun", "agent" };

    assertAll(
            () -> assertEquals("verb", compiled.get("run")),
            () -> assertArrayEquals(expectedForRun, compiled.getAll("run")),
            () -> assertEquals("noun", compiled.get("runner")),
            () -> assertArrayEquals(expectedForRunner, compiled.getAll("runner")));
}
/**
 * Verifies that a key sharing a prefix with stored keys, but never inserted
 * itself, yields {@code null}, an empty array, and an empty entry list.
 */
@Test
@DisplayName("Missing path below existing prefix returns empty results")
void missingPathBelowExistingPrefixReturnsEmptyResults() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("run", "verb");
    trieBuilder.put("runner", "noun");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    // "rune" shares the prefix "run" with both stored keys but was never inserted.
    final String absentKey = "rune";

    assertAll(
            () -> assertNull(compiled.get(absentKey)),
            () -> assertArrayEquals(new String[0], compiled.getAll(absentKey)),
            () -> assertEquals(List.of(), compiled.getEntries(absentKey)));
}
/**
 * Verifies that values come back in descending frequency order: {@code "noun"}
 * is inserted three times, {@code "verb"} twice and {@code "adjective"} once,
 * in interleaved order.
 */
@Test
@DisplayName("getAll returns values ordered by descending local frequency")
void getAllReturnsValuesOrderedByDescendingLocalFrequency() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("house", "noun");
    trieBuilder.put("house", "noun");
    trieBuilder.put("house", "noun");
    trieBuilder.put("house", "verb");
    trieBuilder.put("house", "adjective");
    trieBuilder.put("house", "verb");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedValues = { "noun", "verb", "adjective" };
    final List<ValueCount<String>> expectedEntries = List.of(new ValueCount<String>("noun", 3),
            new ValueCount<String>("verb", 2), new ValueCount<String>("adjective", 1));

    assertAll(
            () -> assertEquals("noun", compiled.get("house")),
            () -> assertArrayEquals(expectedValues, compiled.getAll("house")),
            () -> assertEquals(expectedEntries, compiled.getEntries("house")));
}
/**
 * Verifies that counts supplied to repeated counted insertions of the same
 * value are summed: {@code "noun"} inserted with 3 and then 4 is reported with
 * a count of 7.
 */
@Test
@DisplayName("Counted insertion aggregates frequencies correctly")
void countedInsertionAggregatesFrequenciesCorrectly() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("stem", "noun", 3);
    trieBuilder.put("stem", "verb", 2);
    trieBuilder.put("stem", "noun", 4);
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedValues = { "noun", "verb" };
    final List<ValueCount<String>> expectedEntries = List.of(new ValueCount<String>("noun", 7),
            new ValueCount<String>("verb", 2));

    assertAll(
            () -> assertEquals("noun", compiled.get("stem")),
            () -> assertArrayEquals(expectedValues, compiled.getAll("stem")),
            () -> assertEquals(expectedEntries, compiled.getEntries("stem")));
}
/**
 * Verifies that mutating an array returned by
 * {@link FrequencyTrie#getAll(String)} does not influence the result of a later
 * call for the same key.
 */
@Test
@DisplayName("getAll returns a defensive copy")
void getAllReturnsDefensiveCopy() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("alpha", "x");
    trieBuilder.put("alpha", "y");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    // Tamper with the first returned array; a later lookup must not see the change.
    final String[] tampered = compiled.getAll("alpha");
    tampered[0] = "mutated";

    final String[] untouched = compiled.getAll("alpha");
    assertArrayEquals(new String[] { "x", "y" }, untouched);
}
/**
 * Verifies that the list returned by {@link FrequencyTrie#getEntries(String)}
 * rejects {@code add} with {@link UnsupportedOperationException}.
 */
@Test
@DisplayName("getEntries returns immutable list")
void getEntriesReturnsImmutableList() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("alpha", "x");
    trieBuilder.put("alpha", "x");
    trieBuilder.put("alpha", "y");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final List<ValueCount<String>> returnedEntries = compiled.getEntries("alpha");
    final ValueCount<String> extraEntry = new ValueCount<String>("z", 1);

    assertThrows(UnsupportedOperationException.class, () -> returnedEntries.add(extraEntry));
}
/**
 * Verifies the first tie-breaker: when two values were each inserted once, the
 * one with the shorter text ranks first even though it was inserted second.
 */
@Test
@DisplayName("Equal frequencies prefer shorter string representation")
void equalFrequenciesPreferShorterStringRepresentation() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("k", "longer");
    trieBuilder.put("k", "x");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedValues = { "x", "longer" };
    final List<ValueCount<String>> expectedEntries = List.of(new ValueCount<String>("x", 1),
            new ValueCount<String>("longer", 1));

    assertAll(
            () -> assertEquals("x", compiled.get("k")),
            () -> assertArrayEquals(expectedValues, compiled.getAll("k")),
            () -> assertEquals(expectedEntries, compiled.getEntries("k")));
}
/**
 * Verifies the second tie-breaker: with equal frequency and equal text length,
 * the lexicographically lower value ranks first even though it was inserted
 * second.
 */
@Test
@DisplayName("Equal frequencies and lengths prefer lexicographically lower string")
void equalFrequenciesAndLengthsPreferLexicographicallyLowerString() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("k", "bb");
    trieBuilder.put("k", "aa");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedValues = { "aa", "bb" };
    final List<ValueCount<String>> expectedEntries = List.of(new ValueCount<String>("aa", 1),
            new ValueCount<String>("bb", 1));

    assertAll(
            () -> assertEquals("aa", compiled.get("k")),
            () -> assertArrayEquals(expectedValues, compiled.getAll("k")),
            () -> assertEquals(expectedEntries, compiled.getEntries("k")));
}
/**
 * Verifies that two distinct values with the same frequency and the same
 * {@code toString()} text are returned in the order in which they were first
 * inserted.
 */
@Test
@DisplayName("Equal textual representations preserve first-seen order")
void equalTextualRepresentationsPreserveFirstSeenOrder() {
    final ReductionMode rankedMode = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
    final FrequencyTrie.Builder<Object> trieBuilder = new FrequencyTrie.Builder<Object>(Object[]::new, rankedMode);

    // Two separate instances that differ only by identity; both render as "same".
    final Object insertedFirst = new Object() {
        @Override
        public String toString() {
            return "same";
        }
    };
    final Object insertedSecond = new Object() {
        @Override
        public String toString() {
            return "same";
        }
    };

    trieBuilder.put("k", insertedFirst);
    trieBuilder.put("k", insertedSecond);
    final FrequencyTrie<Object> compiled = trieBuilder.build();

    final Object[] expectedOrder = { insertedFirst, insertedSecond };

    assertAll(
            () -> assertSame(insertedFirst, compiled.get("k")),
            () -> assertArrayEquals(expectedOrder, compiled.getAll("k")));
}
/**
 * Verifies ranked reduction for two keys whose values rank identically although
 * their absolute frequencies differ ({@code X:2, Y:1} versus {@code X:1, Y:1}).
 *
 * <p>
 * The assertions observe lookup results only; whether the two subtrees end up
 * physically shared is not inspected here.
 */
@Test
@Tag("reduction")
@DisplayName("Ranked reduction merges subtrees with equivalent ranked getAll semantics")
void rankedReductionMergesEquivalentRankedGetAllSubtrees() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "Y");
    trieBuilder.put("cb", "X");
    trieBuilder.put("cb", "Y");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedRanking = { "X", "Y" };

    assertAll(
            () -> assertEquals("X", compiled.get("ab")),
            () -> assertArrayEquals(expectedRanking, compiled.getAll("ab")),
            () -> assertEquals("X", compiled.get("cb")),
            () -> assertArrayEquals(expectedRanking, compiled.getAll("cb")));
}
/**
 * Verifies that under ranked reduction two keys holding the same value set in
 * opposite ranking order keep their own {@code getAll} ordering.
 */
@Test
@Tag("reduction")
@DisplayName("Ranked reduction keeps nodes separate when getAll ordering differs")
void rankedReductionKeepsNodesSeparateWhenOrderingDiffers() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    // "ab": X dominates Y.
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "Y");
    // "cb": Y dominates X.
    trieBuilder.put("cb", "Y");
    trieBuilder.put("cb", "Y");
    trieBuilder.put("cb", "X");
    final FrequencyTrie<String> compiled = trieBuilder.build();

    final String[] expectedForAb = { "X", "Y" };
    final String[] expectedForCb = { "Y", "X" };

    assertAll(
            () -> assertArrayEquals(expectedForAb, compiled.getAll("ab")),
            () -> assertArrayEquals(expectedForCb, compiled.getAll("cb")));
}
/**
 * Exercises unordered reduction with two keys that hold the same value set
 * ("X" and "Y") but with opposite frequency ranking. After the build both keys
 * must answer {@code get} and {@code getAll} identically, which is the
 * observable consequence of the two nodes having been merged.
 */
@Test
@Tag("reduction")
@DisplayName("Unordered reduction merges nodes with the same getAll value set")
void unorderedReductionMergesNodesWithSameGetAllValueSet() {
    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(String[]::new,
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS);
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "Y");
    trieBuilder.put("cb", "Y");
    trieBuilder.put("cb", "Y");
    trieBuilder.put("cb", "X");
    final FrequencyTrie<String> compiled = trieBuilder.build();
    final String[] valuesForAb = compiled.getAll("ab");
    final String[] valuesForCb = compiled.getAll("cb");
    assertAll(
            () -> assertNotNull(valuesForAb),
            () -> assertNotNull(valuesForCb),
            () -> assertArrayEquals(valuesForAb, valuesForCb),
            () -> assertEquals(compiled.get("ab"), compiled.get("cb")));
}
/**
 * Exercises dominant reduction with two keys that share the same clear winner
 * ("X", stored three times) but differ in their single runner-up ("Y" versus
 * "Z"). Both keys must resolve to "X" and expose the same three-element
 * {@code getAll} result after the build.
 *
 * <p>
 * NOTE(review): the settings arguments {@code 75} and {@code 3} are presumably
 * the dominance percentage and winner-to-runner-up ratio thresholds; confirm
 * against {@code ReductionSettings}.
 * </p>
 */
@Test
@Tag("reduction")
@DisplayName("Dominant reduction merges nodes with a qualified dominant winner")
void dominantReductionMergesQualifiedDominantWinnerNodes() {
    final ReductionSettings dominantSettings = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 75, 3);
    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(String[]::new,
            dominantSettings);
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "Y");
    trieBuilder.put("cb", "X");
    trieBuilder.put("cb", "X");
    trieBuilder.put("cb", "X");
    trieBuilder.put("cb", "Z");
    final FrequencyTrie<String> compiled = trieBuilder.build();
    final String[] valuesForAb = compiled.getAll("ab");
    final String[] valuesForCb = compiled.getAll("cb");
    assertAll(
            () -> assertEquals("X", compiled.get("ab")),
            () -> assertEquals("X", compiled.get("cb")),
            () -> assertArrayEquals(valuesForAb, valuesForCb),
            () -> assertEquals(3, valuesForAb.length));
}
/**
 * Exercises dominant reduction with winners that lead only 2:1, and that
 * differ between the two keys ("X" for "ab", "Z" for "cb"). Each key must keep
 * its own winner and its own ranked {@code getAll} result after the build.
 */
@Test
@Tag("reduction")
@DisplayName("Dominant reduction falls back when winner is not dominant enough")
void dominantReductionFallsBackWhenWinnerIsNotDominantEnough() {
    final ReductionSettings dominantSettings = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 75, 3);
    final FrequencyTrie.Builder<String> trieBuilder = new FrequencyTrie.Builder<>(String[]::new,
            dominantSettings);
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "X");
    trieBuilder.put("ab", "Y");
    trieBuilder.put("cb", "X");
    trieBuilder.put("cb", "Z");
    trieBuilder.put("cb", "Z");
    final FrequencyTrie<String> compiled = trieBuilder.build();
    assertAll(
            () -> assertEquals("X", compiled.get("ab")),
            () -> assertArrayEquals(new String[] { "X", "Y" }, compiled.getAll("ab")),
            () -> assertEquals("Z", compiled.get("cb")),
            () -> assertArrayEquals(new String[] { "Z", "X" }, compiled.getAll("cb")));
}
/**
 * Builds two branches whose leaf values are identical ("leaf") while the
 * values stored on the intermediate keys "a" and "c" differ, and checks that
 * each intermediate key still returns its own value after reduction.
 */
@Test
@Tag("reduction")
@DisplayName("Reduction takes internal-node local values into account")
void reductionTakesInternalNodeLocalValuesIntoAccount() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("a", "prefix-a");
    trieBuilder.put("a", "prefix-a");
    trieBuilder.put("ab", "leaf");
    trieBuilder.put("c", "prefix-c");
    trieBuilder.put("c", "prefix-c");
    trieBuilder.put("cb", "leaf");
    final FrequencyTrie<String> compiled = trieBuilder.build();
    assertAll(
            () -> assertEquals("prefix-a", compiled.get("a")),
            () -> assertEquals("prefix-c", compiled.get("c")),
            () -> assertArrayEquals(new String[] { "leaf" }, compiled.getAll("ab")),
            () -> assertArrayEquals(new String[] { "leaf" }, compiled.getAll("cb")));
}
/**
 * Builds two branches with identical descendants ("child") under keys "a" and
 * "c" that themselves store different values ("left" and "right"), and checks
 * that the differing values on "a" and "c" survive reduction.
 */
@Test
@Tag("reduction")
@DisplayName("Equivalent descendants do not override differing internal-node semantics")
void equivalentDescendantsDoNotOverrideDifferingInternalNodeSemantics() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    trieBuilder.put("a", "left");
    trieBuilder.put("ab", "child");
    trieBuilder.put("c", "right");
    trieBuilder.put("cb", "child");
    final FrequencyTrie<String> compiled = trieBuilder.build();
    assertAll(
            () -> assertEquals("left", compiled.get("a")),
            () -> assertEquals("right", compiled.get("c")),
            () -> assertArrayEquals(new String[] { "child" }, compiled.getAll("ab")),
            () -> assertArrayEquals(new String[] { "child" }, compiled.getAll("cb")));
}
/**
 * Stores the same four-entry structure under twenty distinct prefixes
 * ("p0" .. "p19") and checks that the compiled trie is much smaller than the
 * build-time trie while lookups for the first and last prefix still return
 * the stored values.
 *
 * <p>
 * The exact sizes (82 nodes before the build, 7 after) are pinned on purpose
 * so that a change in the reduction result is noticed.
 * </p>
 */
@Test
@Tag("reduction")
@DisplayName("Reduction materially decreases compiled trie size for repeated equivalent suffixes")
void reductionMateriallyDecreasesCompiledTrieSizeForRepeatedEquivalentSuffixes() {
    final FrequencyTrie.Builder<String> trieBuilder = rankedBuilder();
    for (int i = 0; i < 20; i++) {
        final String stem = "p" + i;
        trieBuilder.put(stem, "prefix");
        trieBuilder.put(stem + "x", "mid");
        trieBuilder.put(stem + "xy", "leaf");
        trieBuilder.put(stem + "xz", "leaf-alt");
    }
    // The build-time size must be sampled before build() produces the compiled form.
    final int buildTimeSize = trieBuilder.buildTimeSize();
    final FrequencyTrie<String> compiled = trieBuilder.build();
    final int compiledSize = compiled.size();
    final double reductionRatio = 1.0d - ((double) compiledSize / (double) buildTimeSize);
    assertAll(
            () -> assertEquals("prefix", compiled.get("p0")),
            () -> assertEquals("mid", compiled.get("p0x")),
            () -> assertArrayEquals(new String[] { "leaf" }, compiled.getAll("p0xy")),
            () -> assertArrayEquals(new String[] { "leaf-alt" }, compiled.getAll("p0xz")),
            () -> assertEquals("prefix", compiled.get("p19")),
            () -> assertEquals("mid", compiled.get("p19x")),
            () -> assertArrayEquals(new String[] { "leaf" }, compiled.getAll("p19xy")),
            () -> assertArrayEquals(new String[] { "leaf-alt" }, compiled.getAll("p19xz")),
            () -> assertEquals(82, buildTimeSize),
            () -> assertEquals(7, compiledSize),
            () -> assertEquals(1.0d - (7.0d / 82.0d), reductionRatio, 0.0000001d),
            () -> assertTrue(reductionRatio >= 0.50d,
                    () -> "Expected at least 50% reduction, but build-time size was " + buildTimeSize
                            + " and compiled size was " + compiledSize + ", giving ratio " + reductionRatio
                            + '.'));
}
/**
 * Verifies that a trie written with {@code writeTo} and read back with
 * {@code readFrom} answers {@code size}, {@code get}, {@code getAll} and
 * {@code getEntries} exactly like the original.
 *
 * <p>
 * The fixture covers the empty key, a key that is a prefix of another key
 * ("run" / "runner"), and two keys with the same value ranking but different
 * counts ("cab" / "dab"). Every stored key, including the empty one, is
 * compared through all three lookup methods so that stored counts are covered
 * as well as values.
 * </p>
 *
 * @throws IOException if test I/O fails unexpectedly
 */
@Test
@Tag("persistence")
@DisplayName("writeTo and readFrom round-trip trie content")
void writeToAndReadFromRoundTripTrieContent() throws IOException {
    final FrequencyTrie.Builder<String> builder = rankedBuilder();
    builder.put("", "root", 2);
    builder.put("run", "verb", 3);
    builder.put("run", "noun", 1);
    builder.put("runner", "noun", 2);
    builder.put("cab", "X", 2);
    builder.put("cab", "Y", 1);
    builder.put("dab", "X", 1);
    builder.put("dab", "Y", 1);
    final FrequencyTrie<String> original = builder.build();
    final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    original.writeTo(outputStream, STRING_CODEC);
    final FrequencyTrie<String> restored = FrequencyTrie
            .readFrom(new ByteArrayInputStream(outputStream.toByteArray()), String[]::new, STRING_CODEC);
    assertAll(
            () -> assertEquals(original.size(), restored.size()),
            () -> assertEquals(original.get(""), restored.get("")),
            () -> assertArrayEquals(original.getAll(""), restored.getAll("")),
            // Previously missing: the empty key's counts were never compared.
            () -> assertEquals(original.getEntries(""), restored.getEntries("")),
            () -> assertEquals(original.get("run"), restored.get("run")),
            () -> assertArrayEquals(original.getAll("run"), restored.getAll("run")),
            () -> assertEquals(original.getEntries("run"), restored.getEntries("run")),
            () -> assertEquals(original.get("runner"), restored.get("runner")),
            () -> assertArrayEquals(original.getAll("runner"), restored.getAll("runner")),
            () -> assertEquals(original.getEntries("runner"), restored.getEntries("runner")),
            () -> assertEquals(original.get("cab"), restored.get("cab")),
            () -> assertArrayEquals(original.getAll("cab"), restored.getAll("cab")),
            () -> assertEquals(original.getEntries("cab"), restored.getEntries("cab")),
            () -> assertEquals(original.get("dab"), restored.get("dab")),
            () -> assertArrayEquals(original.getAll("dab"), restored.getAll("dab")),
            () -> assertEquals(original.getEntries("dab"), restored.getEntries("dab")),
            () -> assertNull(restored.get("missing")),
            () -> assertArrayEquals(new String[0], restored.getAll("missing")),
            () -> assertEquals(List.of(), restored.getEntries("missing")));
}
/**
 * Verifies that {@code writeTo} and {@code readFrom} throw
 * {@link NullPointerException} for each argument that is passed as
 * {@code null}. A serialized empty trie supplies otherwise valid input for the
 * {@code readFrom} cases.
 *
 * @throws IOException if test I/O fails unexpectedly
 */
@Test
@Tag("persistence")
@DisplayName("writeTo and readFrom reject null arguments")
void writeToAndReadFromRejectNullArguments() throws IOException {
    final FrequencyTrie<String> emptyTrie = rankedBuilder().build();
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    emptyTrie.writeTo(sink, STRING_CODEC);
    final byte[] validBytes = sink.toByteArray();
    assertAll(
            () -> assertThrows(NullPointerException.class, () -> emptyTrie.writeTo(null, STRING_CODEC)),
            () -> assertThrows(NullPointerException.class,
                    () -> emptyTrie.writeTo(new ByteArrayOutputStream(), null)),
            () -> assertThrows(NullPointerException.class,
                    () -> FrequencyTrie.readFrom(null, String[]::new, STRING_CODEC)),
            () -> assertThrows(NullPointerException.class,
                    () -> FrequencyTrie.readFrom(new ByteArrayInputStream(validBytes), null, STRING_CODEC)),
            () -> assertThrows(NullPointerException.class,
                    () -> FrequencyTrie.readFrom(new ByteArrayInputStream(validBytes), String[]::new, null)));
}
/**
 * Verifies that deserialization rejects a stream whose first integer is not
 * the expected magic value, and that the resulting {@link IOException} names
 * the unsupported header.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid stream magic header")
void readFromRejectsInvalidStreamMagicHeader() {
    final byte[] bytes = createSerializedStream(0x12345678, 1, 1, 0, new NodeWriter[0]);
    final IOException exception = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
    // An IOException may carry no message (e.g. a bare EOFException); check
    // null-safely and report the actual text instead of failing with an NPE.
    final String message = exception.getMessage();
    assertTrue(message != null && message.contains("Unsupported trie stream header"),
            () -> "Unexpected exception message: " + message);
}
/**
 * Verifies that deserialization rejects a stream that declares version
 * {@code 999}, and that the resulting {@link IOException} names the
 * unsupported version. The magic value {@code 0x45475452} is the ASCII bytes
 * "EGTR".
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects unsupported stream version")
void readFromRejectsUnsupportedStreamVersion() {
    final byte[] bytes = createSerializedStream(0x45475452, 999, 1, 0, new NodeWriter[0]);
    final IOException exception = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
    // An IOException may carry no message (e.g. a bare EOFException); check
    // null-safely and report the actual text instead of failing with an NPE.
    final String message = exception.getMessage();
    assertTrue(message != null && message.contains("Unsupported trie stream version"),
            () -> "Unexpected exception message: " + message);
}
/**
 * Verifies that deserialization rejects a stream that declares a node count of
 * {@code -1}, and that the resulting {@link IOException} names the negative
 * node count.
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects negative node count")
void readFromRejectsNegativeNodeCount() {
    final byte[] bytes = createSerializedStream(0x45475452, 1, -1, 0, new NodeWriter[0]);
    final IOException exception = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
    // An IOException may carry no message (e.g. a bare EOFException); check
    // null-safely and report the actual text instead of failing with an NPE.
    final String message = exception.getMessage();
    assertTrue(message != null && message.contains("Negative node count"),
            () -> "Unexpected exception message: " + message);
}
/**
 * Verifies that deserialization rejects a stream that declares one node but a
 * root node identifier of {@code 1}, and that the resulting
 * {@link IOException} names the invalid root node id.
 *
 * <p>
 * NOTE(review): the two zero integers written as the node body are presumably
 * an empty edge list and an empty value list; confirm against the stream
 * format.
 * </p>
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects invalid root node identifier")
void readFromRejectsInvalidRootNodeIdentifier() {
    final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 1, new NodeWriter[] { dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(0);
    } });
    final IOException exception = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
    // An IOException may carry no message (e.g. a bare EOFException); check
    // null-safely and report the actual text instead of failing with an NPE.
    final String message = exception.getMessage();
    assertTrue(message != null && message.contains("Invalid root node id"),
            () -> "Unexpected exception message: " + message);
}
/**
 * Verifies that deserialization rejects a node whose single stored value
 * carries a count of {@code 0}, and that the resulting {@link IOException}
 * names the non-positive count.
 *
 * <p>
 * NOTE(review): the node body is presumably laid out as edge count, value
 * count, encoded value, stored count; confirm against the stream format.
 * </p>
 */
@Test
@Tag("persistence")
@DisplayName("readFrom rejects non-positive stored counts")
void readFromRejectsNonPositiveStoredCounts() {
    final byte[] bytes = createSerializedStream(0x45475452, 1, 1, 0, new NodeWriter[] { dataOutput -> {
        dataOutput.writeInt(0);
        dataOutput.writeInt(1);
        dataOutput.writeUTF("value");
        dataOutput.writeInt(0);
    } });
    final IOException exception = assertThrows(IOException.class,
            () -> FrequencyTrie.readFrom(new ByteArrayInputStream(bytes), String[]::new, STRING_CODEC));
    // An IOException may carry no message (e.g. a bare EOFException); check
    // null-safely and report the actual text instead of failing with an NPE.
    final String message = exception.getMessage();
    assertTrue(message != null && message.contains("Non-positive stored count"),
            () -> "Unexpected exception message: " + message);
}
/**
* Callback that writes the body of one node into a synthetic serialized trie
* stream.
*
* <p>
* The method may throw {@link IOException}, so node bodies can be written as
* lambdas that call {@link DataOutputStream} methods directly.
* </p>
*/
@FunctionalInterface
private interface NodeWriter {
/**
* Writes one serialized node body to the supplied stream.
*
* @param dataOutput stream that receives the node body
* @throws IOException if writing fails
*/
void write(DataOutputStream dataOutput) throws IOException;
}
/**
 * Creates a synthetic serialized trie stream: four header integers (magic,
 * version, node count, root node identifier) followed by the node bodies in
 * array order.
 *
 * <p>
 * The declared values are written verbatim and are not checked against the
 * number of supplied writers, which lets tests produce deliberately
 * inconsistent streams.
 * </p>
 *
 * @param magic      stream magic
 * @param version    stream version
 * @param nodeCount  declared node count
 * @param rootNodeId declared root node identifier
 * @param nodes      node body writers
 * @return serialized bytes
 * @throws IllegalStateException if writing to the in-memory buffer fails
 */
private static byte[] createSerializedStream(final int magic, final int version, final int nodeCount,
        final int rootNodeId, final NodeWriter[] nodes) {
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    // try-with-resources closes the data stream even when a node writer throws.
    try (DataOutputStream dataOutputStream = new DataOutputStream(buffer)) {
        dataOutputStream.writeInt(magic);
        dataOutputStream.writeInt(version);
        dataOutputStream.writeInt(nodeCount);
        dataOutputStream.writeInt(rootNodeId);
        for (final NodeWriter node : nodes) {
            node.write(dataOutputStream);
        }
        dataOutputStream.flush();
    } catch (IOException exception) {
        throw new IllegalStateException("Unexpected I/O while building synthetic trie stream.", exception);
    }
    return buffer.toByteArray();
}
}

View File

@@ -0,0 +1,668 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.util.stream.Stream;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
/**
* Unit tests for {@link PatchCommandEncoder}.
*
* <p>
* The suite verifies both major public responsibilities of the encoder:
* generation of compact patch commands and application of those commands back
* to source terms.
* </p>
*
* <p>
* The implementation intentionally exposes some historical compatibility
* behavior, especially when malformed patch commands cause index-related
* failures during patch application. Those cases are covered explicitly so that
* future refactoring does not silently alter externally observable semantics.
* </p>
*/
@DisplayName("PatchCommandEncoder")
@Tag("unit")
@Tag("stemmer")
@Tag("patch")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
class PatchCommandEncoderTest {
/**
* Supplies source-target word pairs for the encode-then-apply round-trip test.
*
* <p>
* Each entry is {@code (caseId, source, target)}. The pairs include identical
* words, empty words, pure insertions and deletions at the end of the word,
* and mixed edits. The numbered line comments repeat the case identifier;
* they presumably also keep a formatter from joining the lines, so confirm
* before removing them.
* </p>
*
* @return test arguments
*/
private static Stream<Arguments> provideRoundTripPairs() {
return Stream.of(
// 1
Arguments.of(1, "", ""),
// 2
Arguments.of(2, "a", "a"),
// 3
Arguments.of(3, "a", "b"),
// 4
Arguments.of(4, "ab", "ab"),
// 5
Arguments.of(5, "ab", "abc"),
// 6
Arguments.of(6, "abc", "ab"),
// 7
Arguments.of(7, "teacher", "teach"),
// 8
Arguments.of(8, "running", "run"),
// 9
Arguments.of(9, "cities", "city"),
// 10
Arguments.of(10, "walked", "walk"),
// 11
Arguments.of(11, "redo", "undo"),
// 12
Arguments.of(12, "stemming", "stem"),
// 13
Arguments.of(13, "abcdef", "azced"),
// 14
Arguments.of(14, "x", ""),
// 15
Arguments.of(15, "mississippi", "missouri"),
// 16
Arguments.of(16, "transformation", "transform"),
// 17
Arguments.of(17, "preprocessing", "process"),
// 18
Arguments.of(18, "internationalization", "i18n"),
// 19
Arguments.of(19, "bookkeeper", "bookkeeping"));
}
/**
* Supplies hand-written patch commands together with the result expected from
* applying them.
*
* <p>
* Each entry is {@code (caseId, source, patch, expected)}. The patches use the
* opcodes {@code D}, {@code I}, {@code R} and {@code -}, plus the canonical
* {@link PatchCommandEncoder#NOOP_PATCH}. Judging by the expected results,
* commands act from the end of the source towards its start (for example
* {@code "Db"} turns "teacher" into "teach").
* </p>
*
* @return test arguments
*/
private static Stream<Arguments> provideApplyCases() {
return Stream.of(
// 1
Arguments.of(1, "teacher", "Db", "teach"),
// 2
Arguments.of(2, "abc", "Ic", "abcc"),
// 3
Arguments.of(3, "abc", "Rx", "abx"),
// 4
Arguments.of(4, "abc", "-bRx", "xbc"),
// 5
Arguments.of(5, "abcd", "Dc", "a"),
// 6
Arguments.of(6, "abcd", "-c", "abcd"),
// 7
Arguments.of(7, "kitten", "DbIg", "kittg"),
// 8
Arguments.of(8, "", "Ix", "x"),
// 9
Arguments.of(9, "", "IbIa", "ab"),
// 10
Arguments.of(10, "teacher", PatchCommandEncoder.NOOP_PATCH, "teacher"));
}
/**
* Supplies patch commands that are truncated or that reach outside the source
* word. The consuming test expects {@code apply} to return the unchanged
* source for every one of them.
*
* <p>
* Each entry is {@code (caseId, source, malformedPatch)}. The cases cover
* opcodes without an argument ({@code "R"}, {@code "I"}, {@code "D"},
* {@code "-"}), arguments larger than the source allows ({@code "Dz"},
* {@code "-z"}), a valid command followed by an invalid one ({@code "IuDz"}),
* and delete, skip and replace commands applied to an empty source.
* </p>
*
* @return test arguments
*/
private static Stream<Arguments> provideMalformedPatchCases() {
return Stream.of(
// 1
Arguments.of(1, "abc", "Dz"),
// 2
Arguments.of(2, "abc", "-z"),
// 3
Arguments.of(3, "abc", "R"),
// 4
Arguments.of(4, "abc", "I"),
// 5
Arguments.of(5, "abc", "D"),
// 6
Arguments.of(6, "abc", "-"),
// 7
Arguments.of(7, "abc", "IuDz"),
// 8
Arguments.of(8, "", "Da"),
// 9
Arguments.of(9, "", "-a"),
// 10
Arguments.of(10, "", "Ra"));
}
/**
* Supplies source-target word pairs for the mirrored-orientation tests.
*
* <p>
* Each entry is {@code (caseId, source, target)} in normal reading order; the
* consuming tests reverse both words themselves before encoding. The last
* three entries use an empty source, so the target must be built from
* insertions alone.
* </p>
*
* @return test arguments
*/
private static Stream<Arguments> provideReversedRoundTripPairs() {
return Stream.of(
// 1
Arguments.of(1, "", ""),
// 2
Arguments.of(2, "a", "a"),
// 3
Arguments.of(3, "a", "b"),
// 4
Arguments.of(4, "teacher", "teach"),
// 5
Arguments.of(5, "running", "run"),
// 6
Arguments.of(6, "cities", "city"),
// 7
Arguments.of(7, "walked", "walk"),
// 8
Arguments.of(8, "redo", "undo"),
// 9
Arguments.of(9, "stemming", "stem"),
// 10
Arguments.of(10, "abcdef", "azced"),
// 11
Arguments.of(11, "mississippi", "missouri"),
// 12
Arguments.of(12, "transformation", "transform"),
// 13
Arguments.of(13, "preprocessing", "process"),
// 14
Arguments.of(14, "bookkeeper", "bookkeeping"),
// 15
Arguments.of(15, "", "x"),
// 16
Arguments.of(16, "", "ab"),
// 17
Arguments.of(17, "", "stem"));
}
/**
 * Produces the mirror image of the supplied text using
 * {@link StringBuilder#reverse()}, which keeps surrogate pairs intact.
 *
 * @param text input text; must not be {@code null}
 * @return the characters of {@code text} in reverse order
 */
private static String reverse(String text) {
    final StringBuilder mirrored = new StringBuilder(text);
    mirrored.reverse();
    return mirrored.toString();
}
/**
 * Covers construction of {@link PatchCommandEncoder}: the default constructor
 * and rejection of a negative value in each of the four cost arguments
 * (insert, delete, replace, match, in that order).
 */
@Nested
@DisplayName("construction")
@Tag("constructor")
class ConstructionTests {
    /**
     * Checks that an encoder built with the default constructor can encode a
     * patch that applies correctly.
     */
    @Test
    @DisplayName("creates encoder with default cost model")
    void shouldCreateEncoderWithDefaultCostModel() {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        assertNotNull(encoder);
        final String patch = encoder.encode("teacher", "teach");
        assertEquals("teach", PatchCommandEncoder.apply("teacher", patch));
    }

    /**
     * Checks that a negative first (insert) cost is rejected with the documented
     * message.
     */
    @Test
    @DisplayName("rejects negative insert cost")
    void shouldRejectNegativeInsertCost() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new PatchCommandEncoder(-1, 1, 1, 0));
        assertEquals("insertCost must be non-negative.", thrown.getMessage());
    }

    /**
     * Checks that a negative second (delete) cost is rejected with the documented
     * message.
     */
    @Test
    @DisplayName("rejects negative delete cost")
    void shouldRejectNegativeDeleteCost() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new PatchCommandEncoder(1, -1, 1, 0));
        assertEquals("deleteCost must be non-negative.", thrown.getMessage());
    }

    /**
     * Checks that a negative third (replace) cost is rejected with the documented
     * message.
     */
    @Test
    @DisplayName("rejects negative replace cost")
    void shouldRejectNegativeReplaceCost() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new PatchCommandEncoder(1, 1, -1, 0));
        assertEquals("replaceCost must be non-negative.", thrown.getMessage());
    }

    /**
     * Checks that a negative fourth (match) cost is rejected with the documented
     * message.
     */
    @Test
    @DisplayName("rejects negative match cost")
    void shouldRejectNegativeMatchCost() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> new PatchCommandEncoder(1, 1, 1, -1));
        assertEquals("matchCost must be non-negative.", thrown.getMessage());
    }
}
/**
 * Covers {@link PatchCommandEncoder#encode(String, String)}: null handling,
 * the canonical identity patch, round-trip reconstruction, reuse of a single
 * encoder instance, and custom operation costs.
 */
@Nested
@DisplayName("encode(String, String)")
@Tag("encode")
class EncodeTests {
    /**
     * Encodes a patch and immediately applies it to the same source.
     *
     * @param encoder encoder under test
     * @param source  source word
     * @param target  target word
     * @return the word produced by applying the encoded patch to {@code source}
     */
    private String roundTrip(PatchCommandEncoder encoder, String source, String target) {
        return PatchCommandEncoder.apply(source, encoder.encode(source, target));
    }

    /**
     * Checks that encoding with a null source produces no patch.
     */
    @Test
    @DisplayName("returns null when source is null")
    void shouldReturnNullWhenSourceIsNull() {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        assertNull(encoder.encode(null, "target"));
    }

    /**
     * Checks that encoding with a null target produces no patch.
     */
    @Test
    @DisplayName("returns null when target is null")
    void shouldReturnNullWhenTargetIsNull() {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        assertNull(encoder.encode("source", null));
    }

    /**
     * Checks that identical non-empty words encode to
     * {@link PatchCommandEncoder#NOOP_PATCH} and that applying it changes nothing.
     */
    @Test
    @DisplayName("returns canonical NOOP patch for equal words")
    void shouldReturnCanonicalNoopPatchForEqualWords() {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        final String identityPatch = encoder.encode("teacher", "teacher");
        assertAll(
                () -> assertNotNull(identityPatch),
                () -> assertEquals(PatchCommandEncoder.NOOP_PATCH, identityPatch),
                () -> assertEquals("teacher", PatchCommandEncoder.apply("teacher", identityPatch)));
    }

    /**
     * Checks that two empty words also encode to
     * {@link PatchCommandEncoder#NOOP_PATCH}.
     */
    @Test
    @DisplayName("returns canonical NOOP patch for equal empty words")
    void shouldReturnCanonicalNoopPatchForEqualEmptyWords() {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        final String identityPatch = encoder.encode("", "");
        assertAll(
                () -> assertEquals(PatchCommandEncoder.NOOP_PATCH, identityPatch),
                () -> assertEquals("", PatchCommandEncoder.apply("", identityPatch)));
    }

    /**
     * Checks, for each supplied pair, that the encoded patch is non-null and
     * rebuilds the target when applied to the source.
     *
     * @param caseId numeric case identifier
     * @param source source word
     * @param target target word
     */
    @ParameterizedTest(name = "[{index}] case {0}: {1} -> {2}")
    @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideRoundTripPairs")
    @DisplayName("produces patches that reconstruct the target")
    void shouldReconstructTargetForRoundTripPairs(int caseId, String source, String target) {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        final String encodedPatch = encoder.encode(source, target);
        final String rebuilt = PatchCommandEncoder.apply(source, encodedPatch);
        assertAll(
                () -> assertNotNull(encodedPatch,
                        () -> "Case " + caseId + " unexpectedly produced a null patch for source='" + source
                                + "', target='" + target + "'."),
                () -> assertEquals(target, rebuilt,
                        () -> "Case " + caseId + " failed for source='" + source + "', target='" + target
                                + "', patch='" + encodedPatch + "'."));
    }

    /**
     * Runs one encoder instance over inputs of growing and shrinking length to
     * check that earlier calls do not affect later results.
     */
    @Test
    @DisplayName("remains correct when reused across different input sizes")
    void shouldRemainCorrectWhenReusedAcrossDifferentInputSizes() {
        final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
        assertAll(
                () -> assertEquals("transformation", roundTrip(sharedEncoder, "transform", "transformation")),
                () -> assertEquals("cat", roundTrip(sharedEncoder, "cats", "cat")),
                () -> assertEquals("book", roundTrip(sharedEncoder, "back", "book")),
                () -> assertEquals("", roundTrip(sharedEncoder, "x", "")));
    }

    /**
     * Checks that an encoder built with non-default costs still produces a patch
     * that rebuilds the target.
     */
    @Test
    @DisplayName("supports custom operation costs")
    void shouldSupportCustomOperationCosts() {
        final PatchCommandEncoder customEncoder = new PatchCommandEncoder(1, 1, 2, 0);
        final String encodedPatch = customEncoder.encode("teacher", "teach");
        final String rebuilt = PatchCommandEncoder.apply("teacher", encodedPatch);
        assertAll(
                () -> assertNotNull(encodedPatch),
                () -> assertEquals("teach", rebuilt));
    }
}
/**
 * Covers {@link PatchCommandEncoder#apply(String, String)}: null and empty
 * inputs, the canonical NOOP patch, explicit patch commands, unsupported
 * opcodes, and malformed patches that must leave the source unchanged.
 */
@Nested
@DisplayName("apply(String, String)")
@Tag("apply")
class ApplyTests {
    /**
     * Checks that applying any patch to a null source yields null.
     */
    @Test
    @DisplayName("returns null when source is null")
    void shouldReturnNullWhenSourceIsNull() {
        final String result = PatchCommandEncoder.apply(null, "Da");
        assertNull(result);
    }

    /**
     * Checks that a null patch hands back the very same source instance.
     */
    @Test
    @DisplayName("returns original source when patch is null")
    void shouldReturnSourceWhenPatchIsNull() {
        final String word = "teacher";
        final String result = PatchCommandEncoder.apply(word, null);
        assertSame(word, result);
    }

    /**
     * Checks that an empty patch hands back the very same source instance.
     */
    @Test
    @DisplayName("returns original source when patch is empty")
    void shouldReturnSourceWhenPatchIsEmpty() {
        final String word = "teacher";
        final String result = PatchCommandEncoder.apply(word, "");
        assertSame(word, result);
    }

    /**
     * Checks that {@link PatchCommandEncoder#NOOP_PATCH} hands back the very same
     * source instance.
     */
    @Test
    @DisplayName("returns original source when patch is canonical NOOP")
    void shouldReturnSourceWhenPatchIsCanonicalNoop() {
        final String word = "teacher";
        final String result = PatchCommandEncoder.apply(word, PatchCommandEncoder.NOOP_PATCH);
        assertSame(word, result);
    }

    /**
     * Applies each hand-written patch and compares the outcome with the expected
     * word.
     *
     * @param caseId   numeric case identifier
     * @param source   source word
     * @param patch    patch command
     * @param expected expected transformed word
     */
    @ParameterizedTest(name = "[{index}] case {0}: apply({1}, {2}) -> {3}")
    @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideApplyCases")
    @DisplayName("applies explicit patch commands correctly")
    void shouldApplyExplicitPatchCommandsCorrectly(int caseId, String source, String patch, String expected) {
        final String actual = PatchCommandEncoder.apply(source, patch);
        assertEquals(expected, actual,
                () -> "Case " + caseId + " failed for source='" + source + "', patch='" + patch + "'.");
    }

    /**
     * Checks that an unknown opcode raises {@link IllegalArgumentException} naming
     * the opcode.
     */
    @Test
    @DisplayName("throws IllegalArgumentException for unsupported opcode")
    void shouldThrowForUnsupportedOpcode() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> PatchCommandEncoder.apply("abc", "Xa"));
        assertEquals("Unsupported patch opcode: X", thrown.getMessage());
    }

    /**
     * Checks that the NOOP opcode with an unexpected argument raises
     * {@link IllegalArgumentException} naming the argument.
     */
    @Test
    @DisplayName("throws IllegalArgumentException for unsupported NOOP argument")
    void shouldThrowForUnsupportedNoopArgument() {
        final IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class,
                () -> PatchCommandEncoder.apply("abc", "Nb"));
        assertEquals("Unsupported NOOP patch argument: b", thrown.getMessage());
    }

    /**
     * Checks the compatibility behavior for malformed or out-of-range patches:
     * the result must equal the original source.
     *
     * @param caseId         numeric case identifier
     * @param source         original source
     * @param malformedPatch malformed patch
     */
    @ParameterizedTest(name = "[{index}] case {0}: malformed patch {2} preserves {1}")
    @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideMalformedPatchCases")
    @DisplayName("returns the original source for malformed or index-invalid patch commands")
    void shouldReturnOriginalSourceForMalformedOrIndexInvalidPatchCommands(int caseId, String source,
            String malformedPatch) {
        final String actual = PatchCommandEncoder.apply(source, malformedPatch);
        assertEquals(source, actual, () -> "Case " + caseId + " failed for source='" + source
                + "', malformedPatch='" + malformedPatch + "'.");
    }
}
/**
 * Regression scenarios modelled on typical stemming transformations. Each test
 * encodes a patch with a fresh default encoder and checks that applying it to
 * the source yields the stem.
 */
@Nested
@DisplayName("stemming-oriented scenarios")
@Tag("regression")
class StemmingScenarioTests {
    /**
     * Encodes {@code word -> stem} with a new default encoder and asserts that
     * applying the patch to {@code word} produces {@code stem}.
     *
     * @param word source word
     * @param stem expected result
     */
    private void assertStemsTo(String word, String stem) {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        final String patch = encoder.encode(word, stem);
        assertEquals(stem, PatchCommandEncoder.apply(word, patch));
    }

    /**
     * Suffix removal that needs deletions only ("teacher" to "teach").
     */
    @Test
    @DisplayName("handles deletion-heavy suffix stripping")
    void shouldHandleDeletionHeavySuffixStripping() {
        assertStemsTo("teacher", "teach");
    }

    /**
     * Plural to singular with a changed ending ("cities" to "city").
     */
    @Test
    @DisplayName("handles plural to singular transformation")
    void shouldHandlePluralToSingularTransformation() {
        assertStemsTo("cities", "city");
    }

    /**
     * Reduction to a much shorter stem ("stemming" to "stem").
     */
    @Test
    @DisplayName("handles derivational reduction to a shorter stem")
    void shouldHandleDerivationalReductionToShorterStem() {
        assertStemsTo("stemming", "stem");
    }

    /**
     * Replacement of the only character ("a" to "z").
     */
    @Test
    @DisplayName("handles single-character replacement")
    void shouldHandleSingleCharacterReplacement() {
        assertStemsTo("a", "z");
    }
}
/**
 * Covers encoding and applying patches on reversed words: every input is
 * mirrored with {@code reverse} before it reaches the encoder.
 */
@Nested
@DisplayName("reversed-word processing")
@Tag("reverse")
class ReversedWordProcessingTests {
    /**
     * Reverses both words, encodes a patch between the reversed forms, and
     * applies it to the reversed source.
     *
     * @param encoder encoder under test
     * @param source  source word in normal orientation
     * @param target  target word in normal orientation
     * @return the word produced from the reversed source
     */
    private String reversedRoundTrip(PatchCommandEncoder encoder, String source, String target) {
        return PatchCommandEncoder.apply(reverse(source), encoder.encode(reverse(source), reverse(target)));
    }

    /**
     * Checks, for each supplied pair, that the patch encoded between the reversed
     * words is non-null and rebuilds the reversed target.
     *
     * @param caseId numeric case identifier
     * @param source source word
     * @param target target word
     */
    @ParameterizedTest(name = "[{index}] case {0}: reverse({1}) -> reverse({2})")
    @MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
    @DisplayName("reconstructs reversed targets from reversed sources")
    void shouldReconstructReversedTargetsFromReversedSources(int caseId, String source, String target) {
        final PatchCommandEncoder encoder = new PatchCommandEncoder();
        final String mirroredSource = reverse(source);
        final String mirroredTarget = reverse(target);
        final String encodedPatch = encoder.encode(mirroredSource, mirroredTarget);
        final String rebuilt = PatchCommandEncoder.apply(mirroredSource, encodedPatch);
        assertAll(
                () -> assertNotNull(encodedPatch,
                        () -> "Case " + caseId + " unexpectedly produced a null patch for reversedSource='"
                                + mirroredSource + "', reversedTarget='" + mirroredTarget + "'."),
                () -> assertEquals(mirroredTarget, rebuilt,
                        () -> "Case " + caseId + " failed for reversedSource='" + mirroredSource
                                + "', reversedTarget='" + mirroredTarget + "', patch='" + encodedPatch
                                + "'."));
    }

    /**
     * Checks four common stemming transformations in mirrored orientation with a
     * single shared encoder.
     */
    @Test
    @DisplayName("handles mirrored stemming transformations")
    void shouldHandleMirroredStemmingTransformations() {
        final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
        assertAll(
                () -> assertEquals(reverse("teach"), reversedRoundTrip(sharedEncoder, "teacher", "teach")),
                () -> assertEquals(reverse("run"), reversedRoundTrip(sharedEncoder, "running", "run")),
                () -> assertEquals(reverse("city"), reversedRoundTrip(sharedEncoder, "cities", "city")),
                () -> assertEquals(reverse("walk"), reversedRoundTrip(sharedEncoder, "walked", "walk")));
    }

    /**
     * Runs one encoder instance over reversed inputs of differing lengths to
     * check that earlier calls do not affect later results.
     */
    @Test
    @DisplayName("remains correct when reused on reversed words of different sizes")
    void shouldRemainCorrectWhenReusedOnReversedWordsOfDifferentSizes() {
        final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();
        assertAll(
                () -> assertEquals(reverse("transformation"),
                        reversedRoundTrip(sharedEncoder, "transform", "transformation")),
                () -> assertEquals(reverse("cat"), reversedRoundTrip(sharedEncoder, "cats", "cat")),
                () -> assertEquals(reverse("book"), reversedRoundTrip(sharedEncoder, "back", "book")),
                () -> assertEquals("", reversedRoundTrip(sharedEncoder, "x", "")));
    }
}
/**
 * Confirms that one word pair round-trips both as supplied and after both
 * words have been mirrored, using the same encoder instance for both runs.
 *
 * @param caseId numeric case identifier used in failure messages
 * @param source source word
 * @param target target word
 */
@ParameterizedTest(name = "[{index}] case {0}: mirrored consistency for {1} -> {2}")
@MethodSource("org.egothor.stemmer.PatchCommandEncoderTest#provideReversedRoundTripPairs")
@DisplayName("preserves correctness under mirrored input orientation")
void shouldPreserveCorrectnessUnderMirroredInputOrientation(int caseId, String source, String target) {
    final PatchCommandEncoder sharedEncoder = new PatchCommandEncoder();

    // Orientation as supplied by the provider.
    final String forwardPatch = sharedEncoder.encode(source, target);
    final String forwardOutcome = PatchCommandEncoder.apply(source, forwardPatch);

    // Mirrored orientation.
    final String mirroredSource = reverse(source);
    final String mirroredTarget = reverse(target);
    final String mirroredPatch = sharedEncoder.encode(mirroredSource, mirroredTarget);
    final String mirroredOutcome = PatchCommandEncoder.apply(mirroredSource, mirroredPatch);

    assertAll(
            () -> assertEquals(target, forwardOutcome,
                    () -> "Case " + caseId + " failed in normal orientation for source='" + source
                            + "', target='" + target + "', patch='" + forwardPatch + "'."),
            () -> assertEquals(mirroredTarget, mirroredOutcome,
                    () -> "Case " + caseId + " failed in mirrored orientation for reversedSource='"
                            + mirroredSource + "', reversedTarget='" + mirroredTarget + "', patch='"
                            + mirroredPatch + "'."));
}
}

View File

@@ -0,0 +1,326 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
/**
* Unit tests for {@link StemmerDictionaryParser}.
*
* <p>
* The suite verifies:
* </p>
* <ul>
* <li>parsing through all public overloads,</li>
* <li>normalization to lower case,</li>
* <li>handling of empty lines and remarks,</li>
* <li>correct entry emission including line numbers,</li>
* <li>propagation of I/O failures from the handler and file system,</li>
* <li>argument validation,</li>
* <li>validation rules of {@link StemmerDictionaryParser.ParseStatistics}.</li>
* </ul>
*/
@DisplayName("StemmerDictionaryParser")
@Tag("unit")
@Tag("parser")
class StemmerDictionaryParserTest {
/**
* Temporary directory used by file-based parser tests.
*/
@TempDir
Path tempDir;
/**
 * Snapshot of a single {@link StemmerDictionaryParser.EntryHandler} callback,
 * kept so that tests can assert on the callback arguments after parsing has
 * finished.
 *
 * <p>
 * The record stores the {@code variants} array reference exactly as given and
 * makes no defensive copy of its own; the collecting handler in this test class
 * passes in a clone. Arrays compare by identity in the generated
 * {@code equals}, so the tests inspect the components individually rather than
 * comparing whole records.
 * </p>
 *
 * @param stem       canonical stem
 * @param variants   parsed variants in encounter order
 * @param lineNumber physical source line number
 */
private record CapturedEntry(String stem, String[] variants, int lineNumber) {
// Intentionally empty: the record is only a compact assertion carrier.
}
/**
 * Builds an entry handler that appends one {@link CapturedEntry} per parser
 * callback to the given list.
 *
 * <p>
 * The variants array is cloned before it is stored.
 * </p>
 *
 * @param entries list receiving the captured callbacks
 * @return handler that records every callback it receives
 */
private static StemmerDictionaryParser.EntryHandler collectingHandler(final List<CapturedEntry> entries) {
    return (stem, variants, lineNumber) -> {
        final String[] variantSnapshot = variants.clone();
        entries.add(new CapturedEntry(stem, variantSnapshot, lineNumber));
    };
}
/**
 * Writes {@code content} as UTF-8 into a file inside the JUnit temporary
 * directory.
 *
 * @param fileName name of the file, resolved against the temporary directory
 * @param content  text to store
 * @return path of the written file
 * @throws IOException if the file cannot be written
 */
private Path createFile(final String fileName, final String content) throws IOException {
    // Files.writeString returns the path it was given, so it can be returned directly.
    return Files.writeString(this.tempDir.resolve(fileName), content, StandardCharsets.UTF_8);
}
/**
 * Tests for the reader-based overload
 * {@code StemmerDictionaryParser.parse(Reader, String, EntryHandler)}.
 */
@Nested
@DisplayName("parse(Reader, String, EntryHandler)")
class ReaderParsingTests {

    /**
     * Parses a small dictionary that mixes remarks, a whitespace-only line and
     * mixed-case entries, then checks both the statistics and every callback.
     *
     * @throws IOException if parsing fails unexpectedly
     */
    @Test
    @DisplayName("should parse normalized entries and collect accurate statistics")
    void shouldParseNormalizedEntriesAndCollectAccurateStatistics() throws IOException {
        final String dictionaryText = String.join("\n",
                "# full line remark",
                " ",
                "Root Running Runs RUNNER # trailing hash remark",
                "House HOUSEHOLD houseS // trailing slash remark",
                "SingleStem",
                "// full line slash remark") + "\n";
        final List<CapturedEntry> captured = new ArrayList<>();

        final StemmerDictionaryParser.ParseStatistics stats = StemmerDictionaryParser
                .parse(new StringReader(dictionaryText), "reader-source", collectingHandler(captured));

        assertNotNull(stats);
        assertEquals(6, stats.lineCount(), "All physical lines must be counted.");
        assertEquals(3, stats.entryCount(), "Three logical entries must be emitted.");
        assertEquals(3, stats.ignoredLineCount(), "Remark-only and blank lines must be ignored.");
        assertEquals("reader-source", stats.sourceDescription(), "Source description must be preserved.");
        assertEquals(3, captured.size(), "Exactly three parsed entries are expected.");

        final CapturedEntry rootEntry = captured.get(0);
        assertAll("First entry",
                () -> assertEquals("root", rootEntry.stem(), "Stem must be normalized to lower case."),
                () -> assertArrayEquals(new String[] { "running", "runs", "runner" }, rootEntry.variants(),
                        "Variants must be normalized and kept in encounter order."),
                () -> assertEquals(3, rootEntry.lineNumber(),
                        "Line number must refer to the physical source line."));

        final CapturedEntry houseEntry = captured.get(1);
        assertAll("Second entry",
                () -> assertEquals("house", houseEntry.stem()),
                () -> assertArrayEquals(new String[] { "household", "houses" }, houseEntry.variants()),
                () -> assertEquals(4, houseEntry.lineNumber()));

        final CapturedEntry stemOnlyEntry = captured.get(2);
        assertAll("Third entry",
                () -> assertEquals("singlestem", stemOnlyEntry.stem()),
                () -> assertArrayEquals(new String[0], stemOnlyEntry.variants(),
                        "A line containing only the stem must produce zero variants."),
                () -> assertEquals(5, stemOnlyEntry.lineNumber()));
    }

    /**
     * Feeds lines that contain both remark markers and checks that everything
     * from the first marker onwards is dropped, whichever marker comes first.
     *
     * @throws IOException if parsing fails unexpectedly
     */
    @Test
    @DisplayName("should prefer earliest remark marker regardless of marker type")
    void shouldPreferEarliestRemarkMarkerRegardlessOfMarkerType() throws IOException {
        final String dictionaryText = String.join("\n",
                "alpha beta // slash remark before # hash remark # ignored",
                "gamma delta # hash remark before // slash remark // ignored") + "\n";
        final List<CapturedEntry> captured = new ArrayList<>();

        final StemmerDictionaryParser.ParseStatistics stats = StemmerDictionaryParser
                .parse(new StringReader(dictionaryText), "mixed-remarks", collectingHandler(captured));

        assertAll("Statistics",
                () -> assertEquals(2, stats.lineCount()),
                () -> assertEquals(2, stats.entryCount()),
                () -> assertEquals(0, stats.ignoredLineCount()));
        assertEquals(2, captured.size(), "Both logical entries must be parsed.");

        final CapturedEntry slashFirst = captured.get(0);
        assertAll("First parsed line",
                () -> assertEquals("alpha", slashFirst.stem()),
                () -> assertArrayEquals(new String[] { "beta" }, slashFirst.variants()));

        final CapturedEntry hashFirst = captured.get(1);
        assertAll("Second parsed line",
                () -> assertEquals("gamma", hashFirst.stem()),
                () -> assertArrayEquals(new String[] { "delta" }, hashFirst.variants()));
    }

    /**
     * Checks that an {@link IOException} thrown by the handler reaches the
     * caller as the very same exception object.
     */
    @Test
    @DisplayName("should propagate handler IOException without swallowing it")
    void shouldPropagateHandlerIOExceptionWithoutSwallowingIt() {
        final IOException simulated = new IOException("Simulated handler failure.");
        final StemmerDictionaryParser.EntryHandler failingHandler = (stem, variants, lineNumber) -> {
            throw simulated;
        };
        final Reader singleEntry = new StringReader("stem variant\n");

        final IOException thrown = assertThrows(IOException.class,
                () -> StemmerDictionaryParser.parse(singleEntry, "failing-handler", failingHandler),
                "Handler I/O failure must be propagated.");

        assertEquals(simulated, thrown, "The original exception instance should be preserved.");
    }

    /**
     * A {@code null} reader must be rejected with {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null reader")
    void shouldRejectNullReader() {
        final StemmerDictionaryParser.EntryHandler ignoring = (stem, variants, lineNumber) -> {
            // no-op
        };

        assertThrows(NullPointerException.class,
                () -> StemmerDictionaryParser.parse((Reader) null, "source", ignoring));
    }

    /**
     * A {@code null} source description must be rejected with
     * {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null source description")
    void shouldRejectNullSourceDescription() {
        final StemmerDictionaryParser.EntryHandler ignoring = (stem, variants, lineNumber) -> {
            // no-op
        };

        assertThrows(NullPointerException.class,
                () -> StemmerDictionaryParser.parse(new StringReader("a b"), null, ignoring));
    }

    /**
     * A {@code null} handler must be rejected with {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null entry handler")
    void shouldRejectNullEntryHandler() {
        assertThrows(NullPointerException.class,
                () -> StemmerDictionaryParser.parse(new StringReader("a b"), "source", null));
    }
}
/**
 * Tests for the file-based overloads
 * {@code StemmerDictionaryParser.parse(Path, EntryHandler)} and
 * {@code StemmerDictionaryParser.parse(String, EntryHandler)}.
 */
@Nested
@DisplayName("parse(Path, EntryHandler) and parse(String, EntryHandler)")
class FileParsingTests {

    /**
     * Parses one file through both overloads and checks that statistics and
     * emitted entries agree.
     *
     * @throws IOException if the fixture cannot be written or parsing fails
     */
    @Test
    @DisplayName("should parse same content through path and string overloads")
    void shouldParseSameContentThroughPathAndStringOverloads() throws IOException {
        final String dictionaryText = String.join("\n",
                "walk walking walked",
                "run running",
                "",
                "# ignored") + "\n";
        final Path dictionaryFile = createFile("dictionary.txt", dictionaryText);

        final List<CapturedEntry> viaPath = new ArrayList<>();
        final StemmerDictionaryParser.ParseStatistics pathStats = StemmerDictionaryParser.parse(dictionaryFile,
                collectingHandler(viaPath));

        final List<CapturedEntry> viaString = new ArrayList<>();
        final StemmerDictionaryParser.ParseStatistics stringStats = StemmerDictionaryParser
                .parse(dictionaryFile.toString(), collectingHandler(viaString));

        // Both overloads are expected to describe the source by its absolute path.
        final String absoluteName = dictionaryFile.toAbsolutePath().toString();

        assertAll("Path statistics",
                () -> assertEquals(absoluteName, pathStats.sourceDescription()),
                () -> assertEquals(4, pathStats.lineCount()),
                () -> assertEquals(2, pathStats.entryCount()),
                () -> assertEquals(2, pathStats.ignoredLineCount()));
        assertAll("String statistics",
                () -> assertEquals(absoluteName, stringStats.sourceDescription()),
                () -> assertEquals(pathStats.lineCount(), stringStats.lineCount()),
                () -> assertEquals(pathStats.entryCount(), stringStats.entryCount()),
                () -> assertEquals(pathStats.ignoredLineCount(), stringStats.ignoredLineCount()));

        assertEquals(viaPath.size(), viaString.size(), "Both overloads must emit the same number of entries.");
        for (int position = 0; position < viaPath.size(); position++) {
            final CapturedEntry fromPath = viaPath.get(position);
            final CapturedEntry fromString = viaString.get(position);
            assertAll("Entry " + position,
                    () -> assertEquals(fromPath.stem(), fromString.stem()),
                    () -> assertArrayEquals(fromPath.variants(), fromString.variants()),
                    () -> assertEquals(fromPath.lineNumber(), fromString.lineNumber()));
        }
    }

    /**
     * A {@code null} path must be rejected with {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null path")
    void shouldRejectNullPath() {
        final StemmerDictionaryParser.EntryHandler ignoring = (stem, variants, lineNumber) -> {
            // no-op
        };

        assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse((Path) null, ignoring));
    }

    /**
     * A {@code null} file name must be rejected with
     * {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null file name")
    void shouldRejectNullFileName() {
        final StemmerDictionaryParser.EntryHandler ignoring = (stem, variants, lineNumber) -> {
            // no-op
        };

        assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse((String) null, ignoring));
    }

    /**
     * The path overload must reject a {@code null} handler.
     *
     * @throws IOException if the fixture file cannot be written
     */
    @Test
    @DisplayName("should reject null handler for path overload")
    void shouldRejectNullHandlerForPathOverload() throws IOException {
        final Path dictionaryFile = createFile("path-null-handler.txt", "root roots\n");

        assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(dictionaryFile, null));
    }

    /**
     * The file-name overload must reject a {@code null} handler.
     *
     * @throws IOException if the fixture file cannot be written
     */
    @Test
    @DisplayName("should reject null handler for string overload")
    void shouldRejectNullHandlerForStringOverload() throws IOException {
        final Path dictionaryFile = createFile("string-null-handler.txt", "root roots\n");
        final String dictionaryName = dictionaryFile.toString();

        assertThrows(NullPointerException.class, () -> StemmerDictionaryParser.parse(dictionaryName, null));
    }

    /**
     * Parsing a path that does not exist must fail with an {@link IOException}.
     */
    @Test
    @DisplayName("should propagate file access failure for missing path")
    void shouldPropagateFileAccessFailureForMissingPath() {
        final Path absentFile = StemmerDictionaryParserTest.this.tempDir.resolve("missing-dictionary.txt");
        final StemmerDictionaryParser.EntryHandler ignoring = (stem, variants, lineNumber) -> {
            // no-op
        };

        assertThrows(IOException.class, () -> StemmerDictionaryParser.parse(absentFile, ignoring),
                "Missing file must surface as an I/O failure.");
    }
}
/**
 * Tests for the validation performed when constructing
 * {@link StemmerDictionaryParser.ParseStatistics}.
 */
@Nested
@DisplayName("ParseStatistics")
class ParseStatisticsTests {

    /**
     * Valid arguments must be stored and exposed unchanged by the accessors.
     */
    @Test
    @DisplayName("should create record when all values are valid")
    void shouldCreateRecordWhenAllValuesAreValid() {
        final StemmerDictionaryParser.ParseStatistics valid =
                new StemmerDictionaryParser.ParseStatistics("source", 7, 4, 3);

        assertAll("Record state",
                () -> assertEquals("source", valid.sourceDescription()),
                () -> assertEquals(7, valid.lineCount()),
                () -> assertEquals(4, valid.entryCount()),
                () -> assertEquals(3, valid.ignoredLineCount()));
    }

    /**
     * A {@code null} source description must raise
     * {@link NullPointerException}.
     */
    @Test
    @DisplayName("should reject null source description")
    void shouldRejectNullSourceDescription() {
        assertThrows(NullPointerException.class,
                () -> new StemmerDictionaryParser.ParseStatistics(null, 0, 0, 0));
    }

    /**
     * A negative line count must raise {@link IllegalArgumentException}.
     */
    @Test
    @DisplayName("should reject negative line count")
    void shouldRejectNegativeLineCount() {
        assertThrows(IllegalArgumentException.class,
                () -> new StemmerDictionaryParser.ParseStatistics("source", -1, 0, 0));
    }

    /**
     * A negative entry count must raise {@link IllegalArgumentException}.
     */
    @Test
    @DisplayName("should reject negative entry count")
    void shouldRejectNegativeEntryCount() {
        assertThrows(IllegalArgumentException.class,
                () -> new StemmerDictionaryParser.ParseStatistics("source", 0, -1, 0));
    }

    /**
     * A negative ignored-line count must raise
     * {@link IllegalArgumentException}.
     */
    @Test
    @DisplayName("should reject negative ignored line count")
    void shouldRejectNegativeIgnoredLineCount() {
        assertThrows(IllegalArgumentException.class,
                () -> new StemmerDictionaryParser.ParseStatistics("source", 0, 0, -1));
    }
}
}

View File

@@ -0,0 +1,512 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.mockStatic;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoMoreInteractions;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.zip.GZIPInputStream;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.MockedStatic;
/**
* Unit tests for {@link StemmerPatchTrieBinaryIO}.
*
* <p>
* The test suite verifies the externally observable contract of the binary I/O
* helper:
* </p>
* <ul>
* <li>null-argument validation for all public overloads,</li>
* <li>utility-class constructor behavior,</li>
* <li>delegation to
* {@link FrequencyTrie#writeTo(DataOutputStream, FrequencyTrie.ValueStreamCodec)},</li>
* <li>delegation to
* {@link FrequencyTrie#readFrom(DataInputStream, java.util.function.IntFunction, FrequencyTrie.ValueStreamCodec)},</li>
* <li>GZip wrapping of persisted data,</li>
* <li>filesystem convenience behavior such as parent directory creation,
* and</li>
* <li>propagation of malformed-input failures.</li>
* </ul>
*
* <p>
* These tests intentionally validate the helper in isolation and therefore rely
* on Mockito static mocking for {@code FrequencyTrie.readFrom(...)}.
* </p>
*/
@Tag("unit")
@Tag("io")
@Tag("persistence")
@DisplayName("StemmerPatchTrieBinaryIO")
class StemmerPatchTrieBinaryIOTest {
/**
* Temporary directory provided by JUnit.
*/
@TempDir
Path temporaryDirectory;
/**
 * Invokes the declared no-argument constructor reflectively and checks that it
 * refuses to create an instance by throwing an {@link AssertionError} with the
 * message {@code "No instances."}.
 *
 * @throws Exception if the constructor cannot be looked up or made accessible
 */
@Test
@DisplayName("Constructor should reject instantiation")
void shouldRejectInstantiation() throws Exception {
    final Constructor<StemmerPatchTrieBinaryIO> hiddenConstructor = StemmerPatchTrieBinaryIO.class
            .getDeclaredConstructor();
    hiddenConstructor.setAccessible(true);

    // Reflection wraps whatever the constructor throws in an InvocationTargetException.
    final InvocationTargetException reflectiveFailure = assertThrows(InvocationTargetException.class,
            () -> hiddenConstructor.newInstance(), "Utility-class constructor must not allow instantiation.");
    final Throwable rootCause = reflectiveFailure.getCause();

    assertAll(
            () -> assertNotNull(rootCause, "Constructor failure must expose the root cause."),
            () -> assertInstanceOf(AssertionError.class, rootCause,
                    "Constructor must fail with AssertionError."),
            () -> assertEquals("No instances.", rootCause.getMessage(),
                    "Constructor must communicate the non-instantiability contract."));
}
/**
 * Tests covering the {@code StemmerPatchTrieBinaryIO.write(...)} overloads.
 */
@Nested
@DisplayName("write(...)")
class WriteTests {

    /**
     * Every write overload must throw {@link NullPointerException} when either
     * the trie or the destination is {@code null}.
     */
    @Test
    @DisplayName("Should reject null arguments across all overloads")
    void shouldRejectNullArgumentsAcrossAllWriteOverloads() {
        @SuppressWarnings("unchecked")
        final FrequencyTrie<String> mockedTrie = mock(FrequencyTrie.class);
        final OutputStream sink = new ByteArrayOutputStream();
        final Path destination = temporaryDirectory.resolve("stemmer.bin.gz");

        assertAll(
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(null, destination),
                        "write(FrequencyTrie, Path) must reject null trie."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(mockedTrie, (Path) null),
                        "write(FrequencyTrie, Path) must reject null path."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(null, "file.bin.gz"),
                        "write(FrequencyTrie, String) must reject null trie."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(mockedTrie, (String) null),
                        "write(FrequencyTrie, String) must reject null file name."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(null, sink),
                        "write(FrequencyTrie, OutputStream) must reject null trie."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.write(mockedTrie, (OutputStream) null),
                        "write(FrequencyTrie, OutputStream) must reject null output stream."));
    }

    /**
     * The stream overload must call the trie's {@code writeTo} exactly once,
     * touch the trie in no other way, and emit bytes that begin with the GZip
     * magic number.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should compress output and delegate trie serialization")
    void shouldCompressOutputAndDelegateTrieSerialization() throws IOException {
        final FrequencyTrie<String> mockedTrie = mock(FrequencyTrie.class);
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();

        StemmerPatchTrieBinaryIO.write(mockedTrie, sink);

        verify(mockedTrie).writeTo(any(DataOutputStream.class), any(FrequencyTrie.ValueStreamCodec.class));
        verifyNoMoreInteractions(mockedTrie);

        final byte[] written = sink.toByteArray();
        assertAll(
                () -> assertTrue(written.length > 2, "Compressed output must contain at least the GZip header."),
                () -> assertEquals(0x1f, written[0] & 0xff, "First byte must match the GZip magic header."),
                () -> assertEquals(0x8b, written[1] & 0xff, "Second byte must match the GZip magic header."));
    }

    /**
     * The path overload must create missing parent directories and produce a
     * file that begins with the GZip magic number.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @Test
    @DisplayName("Should create parent directories and write gzip file")
    void shouldCreateParentDirectoriesAndWriteGzipFile() throws IOException {
        @SuppressWarnings("unchecked")
        final FrequencyTrie<String> mockedTrie = mock(FrequencyTrie.class);
        // Neither "nested" nor "deeper" exists before the call.
        final Path parentDirectory = temporaryDirectory.resolve("nested").resolve("deeper");
        final Path destination = parentDirectory.resolve("stemmer.bin.gz");

        StemmerPatchTrieBinaryIO.write(mockedTrie, destination);

        assertAll(
                () -> assertTrue(Files.exists(destination), "Target file must be created."),
                () -> assertTrue(Files.isDirectory(parentDirectory),
                        "Missing parent directories must be created."));

        final byte[] persisted = Files.readAllBytes(destination);
        assertAll(
                () -> assertTrue(persisted.length > 2, "Persisted file must not be empty."),
                () -> assertEquals(0x1f, persisted[0] & 0xff,
                        "Persisted file must start with the GZip magic header."),
                () -> assertEquals(0x8b, persisted[1] & 0xff,
                        "Persisted file must start with the GZip magic header."));
    }

    /**
     * The file-name overload must create a non-empty file at the named
     * location.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @Test
    @DisplayName("Should write to filesystem when file name string is used")
    void shouldWriteToFilesystemWhenFileNameStringIsUsed() throws IOException {
        @SuppressWarnings("unchecked")
        final FrequencyTrie<String> mockedTrie = mock(FrequencyTrie.class);
        final Path destination = temporaryDirectory.resolve("string-path-stemmer.bin.gz");
        final String destinationName = destination.toString();

        StemmerPatchTrieBinaryIO.write(mockedTrie, destinationName);

        assertAll(
                () -> assertTrue(Files.exists(destination), "String-based overload must create the target file."),
                () -> assertTrue(Files.size(destination) > 0L,
                        "String-based overload must write non-empty output."));
    }

    /**
     * The stream overload must close the stream handed to it once writing has
     * finished.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @Test
    @DisplayName("Should close supplied output stream")
    void shouldCloseSuppliedOutputStream() throws IOException {
        @SuppressWarnings("unchecked")
        final FrequencyTrie<String> mockedTrie = mock(FrequencyTrie.class);
        final TrackingOutputStream observedSink = new TrackingOutputStream();

        StemmerPatchTrieBinaryIO.write(mockedTrie, observedSink);

        assertTrue(observedSink.isClosed(), "Output stream must be closed when write completes.");
    }

    /**
     * An {@link IOException} thrown by the trie's {@code writeTo} must reach the
     * caller as the same exception object.
     *
     * @throws IOException if the mock setup unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should propagate write failure from trie serialization")
    void shouldPropagateWriteFailureFromTrieSerialization() throws IOException {
        final FrequencyTrie<String> failingTrie = mock(FrequencyTrie.class);
        final IOException simulated = new IOException("write failure");
        org.mockito.Mockito.doThrow(simulated).when(failingTrie).writeTo(any(DataOutputStream.class),
                any(FrequencyTrie.ValueStreamCodec.class));

        final IOException thrown = assertThrows(IOException.class,
                () -> StemmerPatchTrieBinaryIO.write(failingTrie, new ByteArrayOutputStream()),
                "Write-side serialization failures must be propagated unchanged.");

        assertSame(simulated, thrown, "The helper must propagate the original write exception instance.");
    }
}
/**
 * Tests covering the {@code StemmerPatchTrieBinaryIO.read(...)} overloads.
 *
 * <p>
 * Most tests replace the static {@code FrequencyTrie.readFrom(...)} with a
 * Mockito static mock, so the GZip payloads used here are never interpreted as
 * trie data.
 * </p>
 */
@Nested
@DisplayName("read(...)")
class ReadTests {

    /**
     * Every read overload must throw {@link NullPointerException} for a
     * {@code null} source.
     */
    @Test
    @DisplayName("Should reject null arguments across all overloads")
    void shouldRejectNullArgumentsAcrossAllReadOverloads() {
        assertAll(
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.read((Path) null),
                        "read(Path) must reject null path."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.read((String) null),
                        "read(String) must reject null file name."),
                () -> assertThrows(NullPointerException.class,
                        () -> StemmerPatchTrieBinaryIO.read((ByteArrayInputStream) null),
                        "read(InputStream) must reject null input stream."));
    }

    /**
     * The stream overload must hand deserialization over to
     * {@code FrequencyTrie.readFrom(...)} and return that result untouched.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should decompress input and delegate trie deserialization")
    void shouldDecompressInputAndDelegateTrieDeserialization() throws IOException {
        final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
        final byte[] compressed = gzip("binary-content-not-interpreted-directly");

        try (@SuppressWarnings("rawtypes")
        MockedStatic<FrequencyTrie> trieStatics = mockStatic(FrequencyTrie.class)) {
            trieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(stubbedTrie);

            final FrequencyTrie<String> loadedTrie = StemmerPatchTrieBinaryIO
                    .read(new ByteArrayInputStream(compressed));

            assertSame(stubbedTrie, loadedTrie,
                    "read(InputStream) must return exactly the trie produced by FrequencyTrie.readFrom(...).");
            trieStatics.verify(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class)));
        }
    }

    /**
     * The path overload must read the file and return whatever
     * {@code FrequencyTrie.readFrom(...)} produces.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should read gzip payload from path")
    void shouldReadGzipPayloadFromPath() throws IOException {
        final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
        final Path compressedFile = temporaryDirectory.resolve("input-stemmer.bin.gz");
        Files.write(compressedFile, gzip("path-based-payload"));

        try (@SuppressWarnings("rawtypes")
        MockedStatic<FrequencyTrie> trieStatics = mockStatic(FrequencyTrie.class)) {
            trieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(stubbedTrie);

            final FrequencyTrie<String> loadedTrie = StemmerPatchTrieBinaryIO.read(compressedFile);

            assertSame(stubbedTrie, loadedTrie,
                    "read(Path) must return the trie created by FrequencyTrie.readFrom(...).");
        }
    }

    /**
     * The file-name overload must read the file and return whatever
     * {@code FrequencyTrie.readFrom(...)} produces.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should read gzip payload from file name string")
    void shouldReadGzipPayloadFromFileNameString() throws IOException {
        final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
        final Path compressedFile = temporaryDirectory.resolve("input-string-stemmer.bin.gz");
        Files.write(compressedFile, gzip("string-based-payload"));

        try (@SuppressWarnings("rawtypes")
        MockedStatic<FrequencyTrie> trieStatics = mockStatic(FrequencyTrie.class)) {
            trieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(stubbedTrie);

            final FrequencyTrie<String> loadedTrie = StemmerPatchTrieBinaryIO.read(compressedFile.toString());

            assertSame(stubbedTrie, loadedTrie,
                    "read(String) must return the trie created by FrequencyTrie.readFrom(...).");
        }
    }

    /**
     * Bytes that are not a GZip stream must cause an {@link IOException}.
     */
    @Test
    @DisplayName("Should fail for malformed non-gzip input")
    void shouldFailForMalformedNonGzipInput() {
        final byte[] plainBytes = "not-a-gzip-stream".getBytes(StandardCharsets.UTF_8);
        final ByteArrayInputStream plainInput = new ByteArrayInputStream(plainBytes);

        assertThrows(IOException.class, () -> StemmerPatchTrieBinaryIO.read(plainInput),
                "Malformed non-GZip input must be reported as an I/O failure.");
    }

    /**
     * The stream overload must close the stream handed to it once reading has
     * finished, and still return the deserialized trie.
     *
     * @throws IOException if the helper unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should close supplied input stream")
    void shouldCloseSuppliedInputStream() throws IOException {
        final FrequencyTrie<String> stubbedTrie = mock(FrequencyTrie.class);
        final TrackingInputStream observedInput = new TrackingInputStream(gzip("close-check"));

        try (@SuppressWarnings("rawtypes")
        MockedStatic<FrequencyTrie> trieStatics = mockStatic(FrequencyTrie.class)) {
            trieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class))).thenReturn(stubbedTrie);

            final FrequencyTrie<String> loadedTrie = StemmerPatchTrieBinaryIO.read(observedInput);

            assertAll(
                    () -> assertSame(stubbedTrie, loadedTrie,
                            "Read operation must still return the deserialized trie."),
                    () -> assertTrue(observedInput.isClosed(),
                            "Input stream must be closed when read completes."));
        }
    }

    /**
     * An {@link IOException} thrown by {@code FrequencyTrie.readFrom(...)} must
     * reach the caller as the same exception object.
     *
     * @throws IOException if the mock setup unexpectedly fails
     */
    @SuppressWarnings("unchecked")
    @Test
    @DisplayName("Should propagate read failure from trie deserialization")
    void shouldPropagateReadFailureFromTrieDeserialization() throws IOException {
        final IOException simulated = new IOException("read failure");
        final byte[] compressed = gzip("deserialization-input");

        try (@SuppressWarnings("rawtypes")
        MockedStatic<FrequencyTrie> trieStatics = mockStatic(FrequencyTrie.class)) {
            trieStatics.when(() -> FrequencyTrie.readFrom(any(DataInputStream.class), any(),
                    any(FrequencyTrie.ValueStreamCodec.class))).thenThrow(simulated);

            final IOException thrown = assertThrows(IOException.class,
                    () -> StemmerPatchTrieBinaryIO.read(new ByteArrayInputStream(compressed)),
                    "Read-side deserialization failures must be propagated unchanged.");

            assertSame(simulated, thrown, "The helper must propagate the original read exception instance.");
        }
    }
}
/**
 * Compresses the UTF-8 bytes of {@code payload} with GZip and returns the
 * result.
 *
 * <p>
 * Before returning, the bytes are opened once with a {@link GZIPInputStream}
 * as a sanity check on the fixture itself.
 * </p>
 *
 * @param payload text to compress
 * @return GZip-compressed bytes
 * @throws IOException if compression or the sanity check fails
 */
private static byte[] gzip(final String payload) throws IOException {
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    // Closing the compressor is what flushes the GZip trailer into the sink.
    try (OutputStream compressor = new java.util.zip.GZIPOutputStream(sink)) {
        compressor.write(payload.getBytes(StandardCharsets.UTF_8));
    }

    final byte[] compressed = sink.toByteArray();
    // Constructing the stream reads and validates the GZip header.
    try (GZIPInputStream ignored = new GZIPInputStream(new ByteArrayInputStream(compressed))) {
        assertTrue(compressed.length > 0, "Test fixture must create a valid non-empty GZip payload.");
    }
    return compressed;
}
/**
 * {@link ByteArrayOutputStream} variant that remembers whether
 * {@link #close()} has been called, so tests can observe whether the code under
 * test closed the stream.
 */
private static final class TrackingOutputStream extends ByteArrayOutputStream {

    /**
     * Becomes {@code true} on the first call to {@link #close()} and is never
     * reset.
     */
    private boolean closeRequested;

    /**
     * Records the close request and then delegates to the superclass.
     *
     * @throws IOException if the superclass reports a failure
     */
    @Override
    public void close() throws IOException {
        closeRequested = true;
        super.close();
    }

    /**
     * Reports whether {@link #close()} has been called at least once.
     *
     * @return {@code true} after the stream has been closed; {@code false}
     *         before
     */
    boolean isClosed() {
        return closeRequested;
    }
}
/**
 * {@link ByteArrayInputStream} variant that remembers whether {@link #close()}
 * has been called, so tests can observe whether the code under test closed the
 * stream.
 */
private static final class TrackingInputStream extends ByteArrayInputStream {

    /**
     * Becomes {@code true} on the first call to {@link #close()} and is never
     * reset.
     */
    private boolean closeRequested;

    /**
     * Wraps the given bytes in a close-tracking stream.
     *
     * @param buffer bytes served by the stream
     */
    TrackingInputStream(final byte[] buffer) {
        super(buffer);
    }

    /**
     * Records the close request and then delegates to the superclass.
     *
     * @throws IOException if the superclass reports a failure
     */
    @Override
    public void close() throws IOException {
        closeRequested = true;
        super.close();
    }

    /**
     * Reports whether {@link #close()} has been called at least once.
     *
     * @return {@code true} after the stream has been closed; {@code false}
     *         before
     */
    boolean isClosed() {
        return closeRequested;
    }
}
}

View File

@@ -0,0 +1,732 @@
package org.egothor.stemmer;
import static org.junit.jupiter.api.Assertions.assertAll;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertThrows;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Stream;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestInstance;
import org.junit.jupiter.api.io.TempDir;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
/**
* Test suite for {@link StemmerPatchTrieLoader}.
*
* <p>
* The suite combines focused API-level verification with integration validation
* against bundled dictionaries. It verifies:
* </p>
* <ul>
* <li>all public loading overloads</li>
* <li>binary persistence round-trips</li>
* <li>null-argument contracts</li>
* <li>comment-aware parsing delegated to {@link StemmerDictionaryParser}</li>
* <li>preservation of all valid stem candidates returned by
* {@link FrequencyTrie#getAll(String)}</li>
* </ul>
*/
@Tag("unit")
@Tag("integration")
@Tag("stemmer")
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
final class StemmerPatchTrieLoaderTest {
/**
* Temporary directory for filesystem-based tests.
*/
@TempDir
private Path tempDir;
/**
* Reduction mode used for deterministic getAll-preserving checks in focused
* tests.
*/
private static final ReductionMode DEFAULT_REDUCTION_MODE = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
/**
 * Supplies the bundled-dictionary scenarios: every listed language is paired
 * first with the ranked and then with the unordered getAll-preserving
 * reduction mode.
 *
 * <p>
 * Each argument triple is a numbered scenario label, the bundled language and
 * the reduction mode.
 * </p>
 *
 * @return parameter stream
 */
static Stream<Arguments> bundledDictionaryCases() {
    final ReductionMode ranked = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS;
    final ReductionMode unordered = ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS;
    return Stream.of(
            // Scenarios 01-12: ranked getAll equivalence.
            Arguments.of("01-da_dk-ranked", StemmerPatchTrieLoader.Language.DA_DK, ranked),
            Arguments.of("02-de_de-ranked", StemmerPatchTrieLoader.Language.DE_DE, ranked),
            Arguments.of("03-es_es-ranked", StemmerPatchTrieLoader.Language.ES_ES, ranked),
            Arguments.of("04-fr_fr-ranked", StemmerPatchTrieLoader.Language.FR_FR, ranked),
            Arguments.of("05-it_it-ranked", StemmerPatchTrieLoader.Language.IT_IT, ranked),
            Arguments.of("06-nl_nl-ranked", StemmerPatchTrieLoader.Language.NL_NL, ranked),
            Arguments.of("07-no_no-ranked", StemmerPatchTrieLoader.Language.NO_NO, ranked),
            Arguments.of("08-pt_pt-ranked", StemmerPatchTrieLoader.Language.PT_PT, ranked),
            Arguments.of("09-ru_ru-ranked", StemmerPatchTrieLoader.Language.RU_RU, ranked),
            Arguments.of("10-sv_se-ranked", StemmerPatchTrieLoader.Language.SV_SE, ranked),
            Arguments.of("11-us_uk-ranked", StemmerPatchTrieLoader.Language.US_UK, ranked),
            Arguments.of("12-us_uk_profi-ranked", StemmerPatchTrieLoader.Language.US_UK_PROFI, ranked),
            // Scenarios 13-24: unordered getAll equivalence.
            Arguments.of("13-da_dk-unordered", StemmerPatchTrieLoader.Language.DA_DK, unordered),
            Arguments.of("14-de_de-unordered", StemmerPatchTrieLoader.Language.DE_DE, unordered),
            Arguments.of("15-es_es-unordered", StemmerPatchTrieLoader.Language.ES_ES, unordered),
            Arguments.of("16-fr_fr-unordered", StemmerPatchTrieLoader.Language.FR_FR, unordered),
            Arguments.of("17-it_it-unordered", StemmerPatchTrieLoader.Language.IT_IT, unordered),
            Arguments.of("18-nl_nl-unordered", StemmerPatchTrieLoader.Language.NL_NL, unordered),
            Arguments.of("19-no_no-unordered", StemmerPatchTrieLoader.Language.NO_NO, unordered),
            Arguments.of("20-pt_pt-unordered", StemmerPatchTrieLoader.Language.PT_PT, unordered),
            Arguments.of("21-ru_ru-unordered", StemmerPatchTrieLoader.Language.RU_RU, unordered),
            Arguments.of("22-sv_se-unordered", StemmerPatchTrieLoader.Language.SV_SE, unordered),
            Arguments.of("23-us_uk-unordered", StemmerPatchTrieLoader.Language.US_UK, unordered),
            Arguments.of("24-us_uk_profi-unordered", StemmerPatchTrieLoader.Language.US_UK_PROFI, unordered));
}
/**
 * Supplies three bundled languages used by the overload consistency checks.
 *
 * <p>
 * Each argument pair is a numbered scenario label and the bundled language.
 * </p>
 *
 * @return parameter stream
 */
static Stream<Arguments> bundledLanguageSamples() {
    final Arguments english = Arguments.of("01-us_uk", StemmerPatchTrieLoader.Language.US_UK);
    final Arguments german = Arguments.of("02-de_de", StemmerPatchTrieLoader.Language.DE_DE);
    final Arguments french = Arguments.of("03-fr_fr", StemmerPatchTrieLoader.Language.FR_FR);
    return Stream.of(english, german, french);
}
/**
 * Supplies the null-argument scenarios for the public loader methods.
 *
 * <p>
 * Each scenario carries a numbered label, a deferred operation that passes
 * {@code null} for exactly one argument, and the exception message the test
 * expects for that argument.
 * </p>
 *
 * @return parameter stream
 */
static Stream<Arguments> nullContractCases() {
    final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
    final FrequencyTrie.Builder<String> builder = new FrequencyTrie.Builder<String>(String[]::new, settings);
    final String runningPatch = new PatchCommandEncoder().encode("running", "run");
    // A small valid trie for the saveBinary scenarios where only the target is null.
    final FrequencyTrie<String> trie = builder.put("running", runningPatch).build();
    return Stream.of(
            // load(Language, ...)
            nullCase("01-load-language-settings",
                    () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null, true, settings),
                    "language"),
            nullCase("02-load-language-mode",
                    () -> StemmerPatchTrieLoader.load((StemmerPatchTrieLoader.Language) null, true,
                            DEFAULT_REDUCTION_MODE),
                    "language"),
            nullCase("03-load-language-null-settings",
                    () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK, true,
                            (ReductionSettings) null),
                    "reductionSettings"),
            nullCase("04-load-language-null-mode",
                    () -> StemmerPatchTrieLoader.load(StemmerPatchTrieLoader.Language.US_UK, true,
                            (ReductionMode) null),
                    "reductionMode"),
            // load(Path, ...)
            nullCase("05-load-path-settings",
                    () -> StemmerPatchTrieLoader.load((Path) null, true, settings),
                    "path"),
            nullCase("06-load-path-mode",
                    () -> StemmerPatchTrieLoader.load((Path) null, true, DEFAULT_REDUCTION_MODE),
                    "path"),
            nullCase("07-load-path-null-settings",
                    () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionSettings) null),
                    "reductionSettings"),
            nullCase("08-load-path-null-mode",
                    () -> StemmerPatchTrieLoader.load(tempPath(), true, (ReductionMode) null),
                    "reductionMode"),
            // load(String, ...)
            nullCase("09-load-string-settings",
                    () -> StemmerPatchTrieLoader.load((String) null, true, settings),
                    "fileName"),
            nullCase("10-load-string-mode",
                    () -> StemmerPatchTrieLoader.load((String) null, true, DEFAULT_REDUCTION_MODE),
                    "fileName"),
            nullCase("11-load-string-null-settings",
                    () -> StemmerPatchTrieLoader.load(tempPath().toString(), true, (ReductionSettings) null),
                    "reductionSettings"),
            nullCase("12-load-string-null-mode",
                    () -> StemmerPatchTrieLoader.load(tempPath().toString(), true, (ReductionMode) null),
                    "reductionMode"),
            // loadBinary(...)
            nullCase("13-load-binary-path",
                    () -> StemmerPatchTrieLoader.loadBinary((Path) null),
                    "path"),
            nullCase("14-load-binary-string",
                    () -> StemmerPatchTrieLoader.loadBinary((String) null),
                    "fileName"),
            nullCase("15-load-binary-stream",
                    () -> StemmerPatchTrieLoader.loadBinary((InputStream) null),
                    "inputStream"),
            // saveBinary(...)
            nullCase("16-save-binary-null-trie-path",
                    () -> StemmerPatchTrieLoader.saveBinary(null, tempPath()),
                    "trie"),
            nullCase("17-save-binary-null-path",
                    () -> StemmerPatchTrieLoader.saveBinary(trie, (Path) null),
                    "path"),
            nullCase("18-save-binary-null-trie-string",
                    () -> StemmerPatchTrieLoader.saveBinary(null, tempPath().toString()),
                    "trie"),
            nullCase("19-save-binary-null-string",
                    () -> StemmerPatchTrieLoader.saveBinary(trie, (String) null),
                    "fileName"));
}

/**
 * Packs one null-contract scenario into JUnit arguments; the typed parameter
 * lets callers pass the operation as a plain lambda.
 *
 * @param scenario        numbered scenario label
 * @param operation       deferred call expected to fail
 * @param expectedMessage exception message expected by the test
 * @return arguments in the order scenario, operation, expected message
 */
private static Arguments nullCase(final String scenario, final ExecutableOperation operation,
        final String expectedMessage) {
    return Arguments.of(scenario, operation, expectedMessage);
}
/**
 * Builds the fixed relative path that the null-contract scenarios pass as
 * their non-null path or file-name argument.
 *
 * @return representative path
 */
private static Path tempPath() {
    final String directory = "target";
    final String fileName = "test-loader-null-contracts.dict";
    return Path.of(directory, fileName);
}
/**
* Focused API contract tests.
*/
@Nested
@DisplayName("API contracts")
final class ApiContractTests {
/**
 * Runs one null-contract scenario and checks that it fails with a
 * {@link NullPointerException} whose message equals the expected text.
 *
 * @param scenario                expected scenario identifier
 * @param operation               operation that must fail
 * @param expectedMessageFragment expected exception message
 */
@ParameterizedTest(name = "[{index}] {0}")
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#nullContractCases")
@DisplayName("Public methods must reject null arguments with precise diagnostics")
void shouldRejectNullArguments(final String scenario, final ExecutableOperation operation,
        final String expectedMessageFragment) {
    final String scenarioLabel = "Scenario " + scenario;
    final NullPointerException thrown = assertThrows(NullPointerException.class, operation::execute,
            scenarioLabel + " must reject null input.");
    final String actualMessage = thrown.getMessage();
    assertNotNull(actualMessage, "NullPointerException message must be present.");
    assertEquals(expectedMessageFragment, actualMessage,
            scenarioLabel + " must identify the offending argument.");
}
/**
 * Checks that loading a textual dictionary from a path that was never created
 * fails with an {@link IOException}.
 */
@Test
@DisplayName("Loading from a missing dictionary file must fail with IOException")
void shouldFailWhenDictionaryFileDoesNotExist() {
    final Path absentDictionary = tempDir.resolve("missing-dictionary.dict");
    assertThrows(IOException.class,
            () -> StemmerPatchTrieLoader.load(absentDictionary, true, DEFAULT_REDUCTION_MODE));
}
/**
 * Checks that loading a binary trie from a path that was never created fails
 * with an {@link IOException}.
 */
@Test
@DisplayName("Loading a missing binary trie file must fail with IOException")
void shouldFailWhenBinaryFileDoesNotExist() {
    final Path absentBinary = tempDir.resolve("missing-trie.bin.gz");
    assertThrows(IOException.class, () -> StemmerPatchTrieLoader.loadBinary(absentBinary));
}
}
/**
* Focused filesystem and parser behavior tests.
*/
@Nested
@DisplayName("Filesystem and parser behavior")
final class FilesystemAndParserTests {
/**
 * Loads the same dictionary through the four textual overloads (path or file
 * name, combined with settings or mode) and compares each result against the
 * path-with-settings trie for a fixed set of probe words.
 *
 * @throws IOException if the test file cannot be written or read
 */
@Test
@DisplayName("Path and String overloads must load equivalent tries")
void shouldLoadEquivalentTrieFromPathAndStringOverloads() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            run running runs runner
            play playing played plays
            city cities
            """);
    final String dictionaryFileName = dictionaryFile.toString();
    final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);

    final FrequencyTrie<String> reference = StemmerPatchTrieLoader.load(dictionaryFile, true, settings);
    final FrequencyTrie<String> pathWithMode = StemmerPatchTrieLoader.load(dictionaryFile, true,
            DEFAULT_REDUCTION_MODE);
    final FrequencyTrie<String> nameWithSettings = StemmerPatchTrieLoader.load(dictionaryFileName, true,
            settings);
    final FrequencyTrie<String> nameWithMode = StemmerPatchTrieLoader.load(dictionaryFileName, true,
            DEFAULT_REDUCTION_MODE);

    final String[] probeWords = { "running", "played", "cities", "run" };
    assertTriePatchSemanticsEqual(reference, pathWithMode, probeWords);
    assertTriePatchSemanticsEqual(reference, nameWithSettings, probeWords);
    assertTriePatchSemanticsEqual(reference, nameWithMode, probeWords);
}
/**
 * Checks that a dictionary loaded with {@code storeOriginal=true} resolves the
 * stem {@code run} and that applying the returned patches to it yields exactly
 * {@code run}.
 *
 * @throws IOException if the test file cannot be written or read
 */
@Test
@DisplayName("storeOriginal=true must make the stem itself resolvable through the no-op patch")
void shouldStoreOriginalStemWhenRequested() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            run running runs
            """);
    final FrequencyTrie<String> loadedTrie = StemmerPatchTrieLoader.load(dictionaryFile, true,
            DEFAULT_REDUCTION_MODE);

    final String[] stemPatches = loadedTrie.getAll("run");
    final Set<String> stemCandidates = reconstructAllStemCandidates(loadedTrie, "run");

    assertAll(
            () -> assertNotNull(stemPatches, "Patch array must be returned for stored stem."),
            () -> assertFalse(stemCandidates.isEmpty(),
                    "Stored stem must yield at least one reconstructed candidate."),
            () -> assertEquals(Set.of("run"), stemCandidates,
                    "Stored stem must reconstruct exactly itself."));
}
/**
 * Checks that a dictionary loaded with {@code storeOriginal=false} returns
 * {@code null} for the bare stem {@code run} while the variant {@code running}
 * still reconstructs {@code run}.
 *
 * @throws IOException if the test file cannot be written or read
 */
@Test
@DisplayName("storeOriginal=false must not insert the stem itself unless present as a variant elsewhere")
void shouldNotStoreOriginalStemWhenDisabled() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            run running runs
            play playing played plays
            """);
    final FrequencyTrie<String> loadedTrie = StemmerPatchTrieLoader.load(dictionaryFile, false,
            DEFAULT_REDUCTION_MODE);

    final String stemPatch = loadedTrie.get("run");
    assertNull(stemPatch,
            "Stem itself must not be resolvable when storeOriginal is disabled and the stem is not a variant.");

    final Set<String> variantCandidates = reconstructAllStemCandidates(loadedTrie, "running");
    assertEquals(Set.of("run"), variantCandidates, "Variants must still reconstruct the proper stem.");
}
/**
 * Loads a dictionary that mixes full-line and inline {@code #} and {@code //}
 * remarks with real entries, then checks that the entries still resolve and
 * that the comment markers themselves are not stored as terms.
 *
 * @throws IOException if the test file cannot be written or read
 */
@Test
@DisplayName("Parser must ignore hash and slash-slash remarks")
void shouldIgnoreHashAndDoubleSlashRemarks() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            # full-line hash comment
            // full-line slash comment
            run running runs // inline slash comment
            play playing played # inline hash comment
            city cities
            """);
    final FrequencyTrie<String> loadedTrie = StemmerPatchTrieLoader.load(dictionaryFile, true,
            DEFAULT_REDUCTION_MODE);

    final String markerMessage = "Comment markers must not become dictionary terms.";
    assertAll(
            () -> assertEquals(Set.of("run"), reconstructAllStemCandidates(loadedTrie, "running")),
            () -> assertEquals(Set.of("play"), reconstructAllStemCandidates(loadedTrie, "played")),
            () -> assertEquals(Set.of("city"), reconstructAllStemCandidates(loadedTrie, "cities")),
            () -> assertNull(loadedTrie.get("#"), markerMessage),
            () -> assertNull(loadedTrie.get("//"), markerMessage));
}
/**
 * Saves a freshly loaded trie to a binary file, reloads it through the path,
 * file-name and stream overloads, and compares every reloaded trie against the
 * original for a fixed set of probe words.
 *
 * @throws IOException if writing or reading fails
 */
@Test
@DisplayName("Binary save and load overloads must preserve trie semantics")
void shouldRoundTripBinaryTrieAcrossAllBinaryOverloads() throws IOException {
    final Path dictionaryFile = writeDictionary("""
            run running runs runner
            city cities
            study studies studying
            """);
    final Path binaryFile = tempDir.resolve("stemmer-trie.bin.gz");
    final FrequencyTrie<String> source = StemmerPatchTrieLoader.load(dictionaryFile, true,
            DEFAULT_REDUCTION_MODE);
    StemmerPatchTrieLoader.saveBinary(source, binaryFile);

    final FrequencyTrie<String> reloadedFromPath = StemmerPatchTrieLoader.loadBinary(binaryFile);
    final FrequencyTrie<String> reloadedFromName = StemmerPatchTrieLoader.loadBinary(binaryFile.toString());
    final byte[] persistedBytes = Files.readAllBytes(binaryFile);
    final String[] probeWords = { "run", "running", "runner", "cities", "studying" };

    try (InputStream persistedStream = new ByteArrayInputStream(persistedBytes)) {
        final FrequencyTrie<String> reloadedFromStream = StemmerPatchTrieLoader.loadBinary(persistedStream);
        assertTriePatchSemanticsEqual(source, reloadedFromPath, probeWords);
        assertTriePatchSemanticsEqual(source, reloadedFromName, probeWords);
        assertTriePatchSemanticsEqual(source, reloadedFromStream, probeWords);
    }
}
/**
 * Stores the supplied text as a UTF-8 dictionary file in the temporary
 * directory, using a {@link System#nanoTime()} value in the file name.
 *
 * @param content dictionary content
 * @return written file path
 * @throws IOException if writing fails
 */
private Path writeDictionary(final String content) throws IOException {
    final String fileName = "dictionary-" + System.nanoTime() + ".dict";
    final Path target = tempDir.resolve(fileName);
    Files.writeString(target, content, StandardCharsets.UTF_8);
    return target;
}
}
/**
* Bundled dictionary integration tests.
*/
@Nested
@DisplayName("Bundled dictionaries")
final class BundledDictionaryTests {
/**
 * Compiles one bundled dictionary with the given reduction mode and checks,
 * for every word read from the source dictionary, that applying all patches
 * returned by {@link FrequencyTrie#getAll(String)} reconstructs exactly the
 * stems the source associates with that word.
 *
 * @param scenario      human-readable numbered scenario identifier
 * @param language      tested bundled language
 * @param reductionMode reduction mode
 * @throws IOException if a bundled dictionary cannot be read
 */
@ParameterizedTest(name = "[{index}] {0}")
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#bundledDictionaryCases")
@DisplayName("Bundled dictionaries must preserve all valid stem candidates in getAll()")
void shouldPreserveAllStemCandidatesForBundledDictionaries(final String scenario,
        final StemmerPatchTrieLoader.Language language, final ReductionMode reductionMode) throws IOException {
    Objects.requireNonNull(scenario, "scenario");
    Objects.requireNonNull(language, "language");
    Objects.requireNonNull(reductionMode, "reductionMode");

    final FrequencyTrie<String> trie = StemmerPatchTrieLoader.load(language, true, reductionMode);
    final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);

    assertNotNull(trie, "Compiled trie must be created.");
    assertFalse(expectedStemsByWord.isEmpty(), "Bundled dictionary must not be empty.");

    for (Map.Entry<String, Set<String>> entry : expectedStemsByWord.entrySet()) {
        final String word = entry.getKey();
        final Set<String> expectedStems = entry.getValue();
        final Set<String> actualStems = reconstructAllStemCandidates(trie, word);

        // Checked separately so a word with no patches gets its own diagnostic.
        assertFalse(actualStems.isEmpty(),
                () -> "No patch candidates returned for word '" + word + "' in scenario " + scenario + ".");
        // The scenario label is printed unquoted; the earlier trailing apostrophe
        // after it was unbalanced and has been removed.
        assertEquals(expectedStems, actualStems,
                () -> "Reconstructed stem candidates differ for word '" + word + "' in scenario " + scenario
                        + ". Expected: " + expectedStems + ", actual: " + actualStems);
    }
}
/**
 * Loads one bundled dictionary through the settings overload and the mode
 * overload and compares both tries for at most the first 25 words of the
 * source dictionary, then checks that the dictionary was not empty.
 *
 * @param scenario scenario identifier
 * @param language tested language
 * @throws IOException if reading fails
 */
@ParameterizedTest(name = "[{index}] {0}")
@MethodSource("org.egothor.stemmer.StemmerPatchTrieLoaderTest#bundledLanguageSamples")
@DisplayName("Bundled dictionary overloads must produce equivalent trie semantics")
void shouldLoadBundledDictionariesEquivalentlyAcrossOverloads(final String scenario,
        final StemmerPatchTrieLoader.Language language) throws IOException {
    final ReductionSettings settings = ReductionSettings.withDefaults(DEFAULT_REDUCTION_MODE);
    final FrequencyTrie<String> viaSettings = StemmerPatchTrieLoader.load(language, true, settings);
    final FrequencyTrie<String> viaMode = StemmerPatchTrieLoader.load(language, true, DEFAULT_REDUCTION_MODE);
    final Map<String, Set<String>> expectedStemsByWord = readExpectedStems(language);

    final int verifiedWords = 25;
    // The map keeps dictionary order, so this visits the first entries only.
    expectedStemsByWord.keySet().stream()
            .limit(verifiedWords)
            .forEachOrdered(word -> assertTriePatchSemanticsEqual(viaSettings, viaMode, word));

    assertFalse(expectedStemsByWord.isEmpty(),
            "Scenario " + scenario + " must provide at least one bundled dictionary entry.");
}
}
/**
 * Parses the bundled dictionary of the given language and collects, for every
 * surface word, the stems the source data lists for it.
 *
 * <p>
 * Each stem is registered as a word mapping to itself, and every variant is
 * registered as mapping to its stem. Parsing is delegated to
 * {@link StemmerDictionaryParser} so the expected values follow the same
 * comment and normalization rules as the production loader.
 * </p>
 *
 * @param language bundled language
 * @return expected stems by surface word, in first-seen order
 * @throws IOException if the bundled resource cannot be read
 */
private static Map<String, Set<String>> readExpectedStems(final StemmerPatchTrieLoader.Language language)
        throws IOException {
    final String resourcePath = language.resourcePath();
    final Map<String, Set<String>> stemsByWord = new LinkedHashMap<>();
    try (InputStream resourceStream = openBundledResource(resourcePath);
            BufferedReader dictionaryReader = new BufferedReader(
                    new InputStreamReader(resourceStream, StandardCharsets.UTF_8))) {
        StemmerDictionaryParser.parse(dictionaryReader, resourcePath, (stem, variants, lineNumber) -> {
            // The stem itself comes first, then each of its variants.
            registerExpectedStem(stemsByWord, stem, stem);
            for (final String variant : variants) {
                registerExpectedStem(stemsByWord, variant, stem);
            }
        });
    }
    return stemsByWord;
}
/**
 * Adds one stem to the set recorded for a surface word, creating an
 * insertion-ordered set on the word's first occurrence.
 *
 * @param expectedStemsByWord expected stem mapping
 * @param word                surface word
 * @param stem                expected stem
 */
private static void registerExpectedStem(final Map<String, Set<String>> expectedStemsByWord, final String word,
        final String stem) {
    expectedStemsByWord.computeIfAbsent(word, unseenWord -> new LinkedHashSet<>()).add(stem);
}
/**
 * Applies every patch command returned by {@link FrequencyTrie#getAll(String)}
 * to the supplied word and collects the resulting stems.
 *
 * @param trie compiled patch trie
 * @param word surface word
 * @return reconstructed stem candidates in patch order; empty when the trie
 *         returns {@code null} for the word
 */
private static Set<String> reconstructAllStemCandidates(final FrequencyTrie<String> trie, final String word) {
    final Set<String> candidates = new LinkedHashSet<>();
    final String[] patches = trie.getAll(word);
    if (patches != null) {
        for (final String patch : patches) {
            candidates.add(PatchCommandEncoder.apply(word, patch));
        }
    }
    return candidates;
}
/**
 * Compares two tries word by word: the raw patch arrays from
 * {@link FrequencyTrie#getAll(String)} must be equal and the stem sets
 * reconstructed from them must be equal. Both checks are grouped per word.
 *
 * @param expected reference trie
 * @param actual   compared trie
 * @param words    words to verify
 */
private static void assertTriePatchSemanticsEqual(final FrequencyTrie<String> expected,
        final FrequencyTrie<String> actual, final String... words) {
    for (final String word : words) {
        final String patchMessage = "Patch arrays must match for word '" + word + "'.";
        final String stemMessage = "Reconstructed stems must match for word '" + word + "'.";
        // Lookups stay inside the lambdas so they run under assertAll.
        assertAll(
                () -> assertArrayEquals(expected.getAll(word), actual.getAll(word), patchMessage),
                () -> assertEquals(reconstructAllStemCandidates(expected, word),
                        reconstructAllStemCandidates(actual, word), stemMessage));
    }
}
/**
 * Opens one bundled dictionary through the class loader of this test class.
 *
 * @param resourcePath classpath resource path
 * @return opened input stream
 * @throws IOException if the resource cannot be found
 */
private static InputStream openBundledResource(final String resourcePath) throws IOException {
    final ClassLoader loader = StemmerPatchTrieLoaderTest.class.getClassLoader();
    final InputStream resourceStream = loader.getResourceAsStream(resourcePath);
    if (resourceStream != null) {
        return resourceStream;
    }
    // The class loader returned null; report it as an I/O failure instead.
    throw new IOException("Bundled stemmer resource not found: " + resourcePath);
}
/**
 * Deferred operation that may throw any checked exception; the null-contract
 * scenarios use it to hand loader calls to the parameterized test.
 */
@FunctionalInterface
private interface ExecutableOperation {
    /**
     * Runs the deferred operation.
     *
     * @throws Exception if the operation fails
     */
    void execute() throws Exception;
}
}

View File

@@ -0,0 +1,48 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import java.util.Map;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for the equality contract of {@link ChildDescriptor}.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("ChildDescriptor")
class ChildDescriptorTest {

    /**
     * Checks that two descriptors with the same edge and signature are equal and
     * share a hash code, and that a different edge, a different signature,
     * {@code null} or an unrelated type all compare unequal.
     */
    @Test
    @DisplayName("must implement equality and hash code from edge and child signature")
    void shouldImplementEqualityAndHashCodeFromEdgeAndChildSignature() {
        final ReductionSignature<String> alphaSignature = createLeafSignature("alpha");
        final ReductionSignature<String> betaSignature = createLeafSignature("beta");

        final ChildDescriptor<String> reference = new ChildDescriptor<>('a', alphaSignature);
        final ChildDescriptor<String> sameEdgeAndSignature = new ChildDescriptor<>('a', alphaSignature);
        final ChildDescriptor<String> otherEdge = new ChildDescriptor<>('b', alphaSignature);
        final ChildDescriptor<String> otherSignature = new ChildDescriptor<>('a', betaSignature);

        assertEquals(reference, sameEdgeAndSignature);
        assertEquals(reference.hashCode(), sameEdgeAndSignature.hashCode());
        assertNotEquals(reference, otherEdge);
        assertNotEquals(reference, otherSignature);
        assertNotEquals(reference, null);
        assertNotEquals(reference, "x");
    }

    /**
     * Builds a signature from a single-value summary with count 1 and no
     * children, using default settings for the ranked getAll reduction mode.
     *
     * @param value the only value of the summary
     * @return signature created for that summary
     */
    private static ReductionSignature<String> createLeafSignature(final String value) {
        final String[] values = { value };
        final int[] counts = { 1 };
        final LocalValueSummary<String> leafSummary = new LocalValueSummary<>(values, counts, 1, value, 1, 0);
        final ReductionSettings settings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        return ReductionSignature.create(leafSummary, Map.of(), settings);
    }
}

View File

@@ -0,0 +1,41 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for the equality contract of {@link DominantLocalDescriptor}.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("DominantLocalDescriptor")
class DominantLocalDescriptorTest {

    /**
     * Checks that descriptors with the same dominant value are equal and share a
     * hash code, and that a different value, {@code null} or an unrelated type
     * all compare unequal.
     */
    @Test
    @DisplayName("must implement equality and hash code from dominant value")
    void shouldImplementEqualityAndHashCodeFromDominantValue() {
        final DominantLocalDescriptor<String> reference = new DominantLocalDescriptor<>("stem");
        final DominantLocalDescriptor<String> sameValue = new DominantLocalDescriptor<>("stem");
        final DominantLocalDescriptor<String> otherValue = new DominantLocalDescriptor<>("other");

        assertEquals(reference, sameValue);
        assertEquals(reference.hashCode(), sameValue.hashCode());
        assertNotEquals(reference, otherValue);
        assertNotEquals(reference, null);
        assertNotEquals(reference, "x");
    }

    /**
     * Checks that two descriptors built with a {@code null} dominant value are
     * equal and share a hash code.
     */
    @Test
    @DisplayName("must support null dominant value in equality semantics")
    void shouldSupportNullDominantValueInEqualitySemantics() {
        final DominantLocalDescriptor<String> firstNull = new DominantLocalDescriptor<>(null);
        final DominantLocalDescriptor<String> secondNull = new DominantLocalDescriptor<>(null);

        assertEquals(firstNull, secondNull);
        assertEquals(firstNull.hashCode(), secondNull.hashCode());
    }
}

View File

@@ -0,0 +1,184 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.LinkedHashMap;
import java.util.Map;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
* Unit tests for {@link LocalValueSummary}.
*/
@Tag("unit")
@Tag("fast")
@DisplayName("LocalValueSummary")
class LocalValueSummaryTest {
/**
 * Checks that an empty count map yields empty value and count arrays and a
 * {@code null} dominant value.
 */
@Test
@DisplayName("of must create empty summary for empty counts")
void shouldCreateEmptySummaryForEmptyCounts() {
    final Map<String, Integer> noCounts = Map.of();
    final LocalValueSummary<String> emptySummary = LocalValueSummary.of(noCounts, String[]::new);

    assertArrayEquals(new String[0], emptySummary.orderedValues());
    assertArrayEquals(new int[0], emptySummary.orderedCounts());
    assertNull(emptySummary.dominantValue);
}
/**
 * Checks the ordering of four values tied at count 4 plus one value at
 * count 2: the expected result puts higher counts first, then shorter
 * values, then lexicographically smaller ones.
 */
@Test
@DisplayName("of must order by descending frequency then shorter textual form then lexicographically")
void shouldOrderByFrequencyLengthAndLexicographicalValue() {
    final Map<String, Integer> frequencies = new LinkedHashMap<>();
    // Inserted out of the expected order on purpose.
    frequencies.put("bbb", 4);
    frequencies.put("a", 4);
    frequencies.put("aa", 4);
    frequencies.put("ab", 4);
    frequencies.put("z", 2);

    final LocalValueSummary<String> ordered = LocalValueSummary.of(frequencies, String[]::new);

    final String[] expectedValues = { "a", "aa", "ab", "bbb", "z" };
    final int[] expectedCounts = { 4, 4, 4, 4, 2 };
    assertArrayEquals(expectedValues, ordered.orderedValues());
    assertArrayEquals(expectedCounts, ordered.orderedCounts());
    assertEquals("a", ordered.dominantValue);
}
/**
 * Checks that two distinct keys built from the same text and carrying the
 * same count come back in the order they were inserted.
 */
@Test
@DisplayName("of must use insertion order as the final tie breaker")
void shouldUseInsertionOrderAsFinalTieBreaker() {
    final TextTwin insertedFirst = new TextTwin("xy");
    final TextTwin insertedSecond = new TextTwin("xy");
    final Map<Object, Integer> frequencies = new LinkedHashMap<>();
    frequencies.put(insertedFirst, 5);
    frequencies.put(insertedSecond, 5);

    final LocalValueSummary<Object> ordered = LocalValueSummary.of(frequencies, Object[]::new);
    final Object[] orderedValues = ordered.orderedValues();

    assertSame(insertedFirst, orderedValues[0]);
    assertSame(insertedSecond, orderedValues[1]);
    assertArrayEquals(new int[] { 5, 5 }, ordered.orderedCounts());
}
/**
 * Checks that a write into the array returned by {@code orderedValues()} is
 * visible through the next call to {@code orderedValues()}.
 */
@Test
@DisplayName("orderedValues must expose the documented backing array")
void shouldExposeDocumentedOrderedValuesBackingArray() {
    final Map<String, Integer> frequencies = new LinkedHashMap<>();
    frequencies.put("alpha", 2);
    frequencies.put("beta", 1);
    final LocalValueSummary<String> summary = LocalValueSummary.of(frequencies, String[]::new);

    summary.orderedValues()[0] = "mutated";

    assertEquals("mutated", summary.orderedValues()[0]);
}
/**
 * Checks that a write into the array returned by {@code orderedCounts()} is
 * visible through the next call to {@code orderedCounts()}.
 */
@Test
@DisplayName("orderedCounts must expose the documented backing array")
void shouldExposeDocumentedOrderedCountsBackingArray() {
    final Map<String, Integer> frequencies = new LinkedHashMap<>();
    frequencies.put("alpha", 2);
    frequencies.put("beta", 1);
    final LocalValueSummary<String> summary = LocalValueSummary.of(frequencies, String[]::new);

    summary.orderedCounts()[0] = 99;

    assertEquals(99, summary.orderedCounts()[0]);
}
@Test
@DisplayName("hasQualifiedDominantWinner must return true when percentage and ratio thresholds are satisfied")
void shouldAcceptQualifiedDominantWinner() {
    // Winner "a" holds 8 of 10 occurrences; the runner-up holds 2.
    final String[] values = { "a", "b" };
    final int[] counts = { 8, 2 };
    final LocalValueSummary<String> winnerSummary = new LocalValueSummary<>(values, counts, 10, "a", 8, 2);
    final ReductionSettings thresholds = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 3);

    final boolean qualified = winnerSummary.hasQualifiedDominantWinner(thresholds);

    assertTrue(qualified);
}
@Test
@DisplayName("hasQualifiedDominantWinner must reject winner when percentage threshold is not satisfied")
void shouldRejectWinnerWhenPercentageThresholdIsNotSatisfied() {
    // Winner "a" holds 6 of 10 occurrences, which the settings below must not accept.
    final String[] values = { "a", "b" };
    final int[] counts = { 6, 4 };
    final LocalValueSummary<String> winnerSummary = new LocalValueSummary<>(values, counts, 10, "a", 6, 4);
    final ReductionSettings thresholds = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 1);

    final boolean qualified = winnerSummary.hasQualifiedDominantWinner(thresholds);

    assertFalse(qualified);
}
@Test
@DisplayName("hasQualifiedDominantWinner must reject winner when over-second ratio is not satisfied")
void shouldRejectWinnerWhenOverSecondRatioIsNotSatisfied() {
    // Winner count 6 against runner-up count 4, which the settings below must not accept.
    final String[] values = { "a", "b" };
    final int[] counts = { 6, 4 };
    final LocalValueSummary<String> winnerSummary = new LocalValueSummary<>(values, counts, 10, "a", 6, 4);
    final ReductionSettings thresholds = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 50, 2);

    final boolean qualified = winnerSummary.hasQualifiedDominantWinner(thresholds);

    assertFalse(qualified);
}
@Test
@DisplayName("hasQualifiedDominantWinner must accept single winner when second count is absent")
void shouldAcceptSingleWinnerWhenSecondCountIsAbsent() {
    // A single value carries all 3 occurrences; the second count is given as 0.
    final String[] values = { "a" };
    final int[] counts = { 3 };
    final LocalValueSummary<String> soleValueSummary = new LocalValueSummary<>(values, counts, 3, "a", 3, 0);
    final ReductionSettings thresholds = new ReductionSettings(
            ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 100, 10);

    final boolean qualified = soleValueSummary.hasQualifiedDominantWinner(thresholds);

    assertTrue(qualified);
}
@Test
@DisplayName("hasQualifiedDominantWinner must return false when no dominant value exists")
void shouldReturnFalseWhenNoDominantValueExists() {
    // Empty summary: no values, no counts, and a null dominant value.
    final String[] noValues = new String[0];
    final int[] noCounts = new int[0];
    final LocalValueSummary<String> emptySummary = new LocalValueSummary<>(noValues, noCounts, 0, null, 0, 0);
    final ReductionSettings defaults = ReductionSettings
            .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS);

    final boolean qualified = emptySummary.hasQualifiedDominantWinner(defaults);

    assertFalse(qualified);
}
/**
 * Test value whose instances render the same text through {@link #toString()}
 * while remaining distinct objects, because {@code equals} and {@code hashCode}
 * are not overridden.
 */
private static final class TextTwin {

    /**
     * Text returned by {@link #toString()}.
     */
    private final String rendering;

    /**
     * Wraps the supplied text.
     *
     * @param text text to render
     */
    private TextTwin(final String text) {
        rendering = text;
    }

    @Override
    public String toString() {
        return rendering;
    }
}
}

View File

@@ -0,0 +1,54 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Map;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link MutableNode}.
 *
 * <p>
 * The tests check that a new node starts with empty maps and that both
 * accessors return the same map instance on every call, so writes made through
 * the returned map are visible on the node.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("MutableNode")
class MutableNodeTest {

    @Test
    @DisplayName("must create empty maps on construction")
    void shouldCreateEmptyMapsOnConstruction() {
        final MutableNode<String> fresh = new MutableNode<>();

        assertTrue(fresh.children().isEmpty());
        assertTrue(fresh.valueCounts().isEmpty());
    }

    @Test
    @DisplayName("children must expose mutable backing map")
    void shouldExposeMutableBackingChildrenMap() {
        final MutableNode<String> parent = new MutableNode<>();
        final MutableNode<String> attached = new MutableNode<>();

        final Map<Character, MutableNode<String>> edges = parent.children();
        edges.put('x', attached);

        // Same map instance on every call, and the write is visible through it.
        assertSame(edges, parent.children());
        assertSame(attached, parent.children().get('x'));
    }

    @Test
    @DisplayName("valueCounts must expose mutable backing map")
    void shouldExposeMutableBackingValueCountsMap() {
        final MutableNode<String> holder = new MutableNode<>();

        final Map<String, Integer> frequencies = holder.valueCounts();
        frequencies.put("stem", 3);

        // Same map instance on every call, and the write is visible through it.
        assertSame(frequencies, holder.valueCounts());
        assertEquals(Integer.valueOf(3), holder.valueCounts().get("stem"));
    }
}

View File

@@ -0,0 +1,42 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link RankedLocalDescriptor}.
 *
 * <p>
 * The tests cover order-sensitive equality, hash code agreement for equal
 * descriptors, and isolation from later changes to the source array.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("RankedLocalDescriptor")
class RankedLocalDescriptorTest {

    @Test
    @DisplayName("of must preserve order in equality semantics")
    void shouldPreserveOrderInEqualitySemantics() {
        final RankedLocalDescriptor reference = RankedLocalDescriptor.of(new Object[] { "a", "b", "c" });
        final RankedLocalDescriptor sameSequence = RankedLocalDescriptor.of(new Object[] { "a", "b", "c" });
        final RankedLocalDescriptor permuted = RankedLocalDescriptor.of(new Object[] { "b", "a", "c" });

        // Same elements in the same order: equal, with matching hash codes.
        assertEquals(reference, sameSequence);
        assertEquals(reference.hashCode(), sameSequence.hashCode());

        // Different order, null, and a foreign type must all compare unequal.
        assertNotEquals(reference, permuted);
        assertNotEquals(reference, null);
        assertNotEquals(reference, "x");
    }

    @Test
    @DisplayName("of must defensively copy source array")
    void shouldDefensivelyCopySourceArray() {
        final Object[] source = { "a", "b" };
        final RankedLocalDescriptor snapshot = RankedLocalDescriptor.of(source);

        // Changing the caller's array afterwards must not affect the descriptor.
        source[0] = "mutated";

        assertEquals(snapshot, RankedLocalDescriptor.of(new Object[] { "a", "b" }));
    }
}

View File

@@ -0,0 +1,133 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.LinkedHashMap;
import java.util.Map;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link ReducedNode}.
 *
 * <p>
 * The tests cover copying of constructor arguments, identity of the maps
 * returned by the accessors, and the merge operations for local counts and
 * children.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("ReducedNode")
class ReducedNodeTest {

    @Test
    @DisplayName("constructor must defensively copy input maps")
    void shouldDefensivelyCopyInputMaps() {
        final ReductionSignature<String> rootSignature = createLeafSignature("root");
        final Map<String, Integer> suppliedCounts = new LinkedHashMap<>();
        suppliedCounts.put("a", 1);
        final Map<Character, ReducedNode<String>> suppliedChildren = new LinkedHashMap<>();
        final ReducedNode<String> node = new ReducedNode<>(rootSignature, suppliedCounts, suppliedChildren);

        // Changes to the caller's maps after construction must not reach the node.
        suppliedCounts.put("b", 2);
        suppliedChildren.put('x', createReducedLeaf("child"));

        assertEquals(Map.of("a", 1), node.localCounts());
        assertTrue(node.children().isEmpty());
    }

    @Test
    @DisplayName("localCounts must expose internal backing map")
    void shouldExposeInternalBackingLocalCountsMap() {
        final ReducedNode<String> root = createReducedLeaf("root");

        final Map<String, Integer> exposedCounts = root.localCounts();
        exposedCounts.put("other", 7);

        assertSame(exposedCounts, root.localCounts());
        assertEquals(Integer.valueOf(7), root.localCounts().get("other"));
    }

    @Test
    @DisplayName("children must expose internal backing map")
    void shouldExposeInternalBackingChildrenMap() {
        final ReducedNode<String> root = createReducedLeaf("root");
        final ReducedNode<String> attached = createReducedLeaf("child");

        final Map<Character, ReducedNode<String>> exposedChildren = root.children();
        exposedChildren.put('c', attached);

        assertSame(exposedChildren, root.children());
        assertSame(attached, root.children().get('c'));
    }

    @Test
    @DisplayName("mergeLocalCounts must sum existing counts and append missing values")
    void shouldMergeLocalCountsBySummingAndAppending() {
        final ReducedNode<String> root = new ReducedNode<>(createLeafSignature("root"),
                new LinkedHashMap<>(Map.of("a", 2)), Map.of());
        final Map<String, Integer> increments = new LinkedHashMap<>();
        increments.put("a", 5);
        increments.put("b", 3);

        root.mergeLocalCounts(increments);

        // "a" existed with 2 and gains 5; "b" is new and is added with 3.
        final Map<String, Integer> merged = root.localCounts();
        assertEquals(Integer.valueOf(7), merged.get("a"));
        assertEquals(Integer.valueOf(3), merged.get("b"));
        assertEquals(2, merged.size());
    }

    @Test
    @DisplayName("mergeChildren must append child when edge is absent")
    void shouldAppendChildWhenEdgeIsAbsent() {
        final ReducedNode<String> root = createReducedLeaf("root");
        final ReducedNode<String> incoming = createReducedLeaf("child");

        root.mergeChildren(Map.of('a', incoming));

        assertSame(incoming, root.children().get('a'));
    }

    @Test
    @DisplayName("mergeChildren must allow the same canonical child instance for the same edge")
    void shouldAllowSameCanonicalChildInstanceForSameEdge() {
        final ReducedNode<String> shared = createReducedLeaf("child");
        final ReducedNode<String> root = new ReducedNode<>(createLeafSignature("root"), Map.of(),
                Map.of('a', shared));

        // Merging the instance already present on edge 'a' must leave it in place.
        root.mergeChildren(Map.of('a', shared));

        assertSame(shared, root.children().get('a'));
    }

    @Test
    @DisplayName("mergeChildren must reject incompatible canonical child instance for the same edge")
    void shouldRejectIncompatibleCanonicalChildInstanceForSameEdge() {
        final ReducedNode<String> existing = createReducedLeaf("child-a");
        final ReducedNode<String> conflicting = createReducedLeaf("child-b");
        final ReducedNode<String> root = new ReducedNode<>(createLeafSignature("root"), Map.of(),
                Map.of('a', existing));

        final IllegalStateException thrown = assertThrows(IllegalStateException.class,
                () -> root.mergeChildren(Map.of('a', conflicting)));

        final String message = thrown.getMessage();
        assertTrue(message.contains("Incompatible canonical child"));
    }

    /**
     * Builds a signature for a childless node that holds a single value with
     * count 1, using default settings for the ranked getAll reduction mode.
     *
     * @param value the only local value
     * @return signature of the described leaf
     */
    private static ReductionSignature<String> createLeafSignature(final String value) {
        final String[] values = { value };
        final int[] counts = { 1 };
        final LocalValueSummary<String> leafSummary = new LocalValueSummary<>(values, counts, 1, value, 1, 0);
        final ReductionSettings defaults = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        return ReductionSignature.create(leafSummary, Map.of(), defaults);
    }

    /**
     * Builds a childless reduced node holding a single value with count 1,
     * created from mutable {@link LinkedHashMap} inputs.
     *
     * @param value the only local value
     * @return reduced leaf node
     */
    private static ReducedNode<String> createReducedLeaf(final String value) {
        final ReductionSignature<String> leafSignature = createLeafSignature(value);
        return new ReducedNode<>(leafSignature, new LinkedHashMap<>(Map.of(value, 1)), new LinkedHashMap<>());
    }
}

View File

@@ -0,0 +1,71 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertSame;
import java.util.LinkedHashMap;
import java.util.Map;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link ReductionContext}.
 *
 * <p>
 * The tests cover access to the settings instance passed at construction and
 * the lookup, registration and replacement behavior of the canonical node
 * registry.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("ReductionContext")
class ReductionContextTest {

    @Test
    @DisplayName("must expose settings and manage canonical node registry")
    void shouldExposeSettingsAndManageCanonicalNodeRegistry() {
        final ReductionSettings suppliedSettings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        final ReductionContext<String> registry = new ReductionContext<>(suppliedSettings);
        final ReductionSignature<String> key = createLeafSignature("stem");
        final ReducedNode<String> canonical = new ReducedNode<>(key, new LinkedHashMap<>(Map.of("stem", 1)),
                new LinkedHashMap<>());

        // Before registration: settings are returned as given and the registry is empty.
        assertSame(suppliedSettings, registry.settings());
        assertEquals(0, registry.canonicalNodeCount());
        assertNull(registry.lookup(key));

        registry.register(key, canonical);

        // After registration: one entry, resolved to the registered instance.
        assertEquals(1, registry.canonicalNodeCount());
        assertSame(canonical, registry.lookup(key));
    }

    @Test
    @DisplayName("register must replace previous canonical node for the same signature")
    void shouldReplacePreviousCanonicalNodeForTheSameSignature() {
        final ReductionSettings defaults = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        final ReductionContext<String> registry = new ReductionContext<>(defaults);
        final ReductionSignature<String> key = createLeafSignature("stem");
        final ReducedNode<String> original = new ReducedNode<>(key, new LinkedHashMap<>(Map.of("first", 1)),
                new LinkedHashMap<>());
        final ReducedNode<String> replacement = new ReducedNode<>(key, new LinkedHashMap<>(Map.of("second", 1)),
                new LinkedHashMap<>());

        registry.register(key, original);
        registry.register(key, replacement);

        // Both registrations used the same signature, so only the later node remains.
        assertEquals(1, registry.canonicalNodeCount());
        assertSame(replacement, registry.lookup(key));
    }

    /**
     * Builds a signature for a childless node that holds a single value with
     * count 1, using default settings for the ranked getAll reduction mode.
     *
     * @param value the only local value
     * @return signature of the described leaf
     */
    private static ReductionSignature<String> createLeafSignature(final String value) {
        final String[] values = { value };
        final int[] counts = { 1 };
        final LocalValueSummary<String> leafSummary = new LocalValueSummary<>(values, counts, 1, value, 1, 0);
        final ReductionSettings defaults = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        return ReductionSignature.create(leafSummary, Map.of(), defaults);
    }
}

View File

@@ -0,0 +1,155 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import java.util.LinkedHashMap;
import java.util.Map;
import org.egothor.stemmer.ReductionMode;
import org.egothor.stemmer.ReductionSettings;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link ReductionSignature}.
 *
 * <p>
 * Each test builds signatures through {@code ReductionSignature.create} and
 * compares them with {@code equals}. Together they cover the ranked, unordered
 * and dominant reduction modes and the contribution of child nodes.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("ReductionSignature")
class ReductionSignatureTest {

    @Test
    @DisplayName("create must preserve ranked getAll semantics in ranked mode")
    void shouldPreserveRankedGetAllSemanticsInRankedMode() {
        final ReductionSettings rankedSettings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);

        final ReductionSignature<String> baseline = ReductionSignature
                .create(createTwoValueSummary("a", 5, "b", 2), Map.of(), rankedSettings);
        final ReductionSignature<String> otherCounts = ReductionSignature
                .create(createTwoValueSummary("a", 9, "b", 1), Map.of(), rankedSettings);
        final ReductionSignature<String> otherOrder = ReductionSignature.create(
                new LocalValueSummary<>(new String[] { "b", "a" }, new int[] { 5, 2 }, 7, "b", 5, 2), Map.of(),
                rankedSettings);

        // Same ranking with different counts matches; a different ranking does not.
        assertEquals(baseline, otherCounts);
        assertNotEquals(baseline, otherOrder);
    }

    @Test
    @DisplayName("create must ignore local ordering in unordered mode")
    void shouldIgnoreLocalOrderingInUnorderedMode() {
        final ReductionSettings unorderedSettings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_UNORDERED_GET_ALL_RESULTS);
        final LocalValueSummary<String> aThenB = new LocalValueSummary<>(new String[] { "a", "b" },
                new int[] { 5, 2 }, 7, "a", 5, 2);
        final LocalValueSummary<String> bThenA = new LocalValueSummary<>(new String[] { "b", "a" },
                new int[] { 5, 2 }, 7, "b", 5, 2);

        final ReductionSignature<String> forward = ReductionSignature.create(aThenB, Map.of(), unorderedSettings);
        final ReductionSignature<String> backward = ReductionSignature.create(bThenA, Map.of(), unorderedSettings);

        assertEquals(forward, backward);
    }

    @Test
    @DisplayName("create must use dominant descriptor in dominant mode when dominant winner qualifies")
    void shouldUseDominantDescriptorWhenDominantWinnerQualifies() {
        final ReductionSettings dominantSettings = new ReductionSettings(
                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 70, 3);
        // Both summaries share winner "a" with 8 of 10 occurrences; the remaining values differ.
        final LocalValueSummary<String> oneRunnerUp = new LocalValueSummary<>(new String[] { "a", "b" },
                new int[] { 8, 2 }, 10, "a", 8, 2);
        final LocalValueSummary<String> twoRunnersUp = new LocalValueSummary<>(new String[] { "a", "x", "y" },
                new int[] { 8, 1, 1 }, 10, "a", 8, 1);

        final ReductionSignature<String> first = ReductionSignature.create(oneRunnerUp, Map.of(), dominantSettings);
        final ReductionSignature<String> second = ReductionSignature.create(twoRunnersUp, Map.of(),
                dominantSettings);

        assertEquals(first, second);
    }

    @Test
    @DisplayName("create must fall back to ranked descriptor in dominant mode when dominant winner does not qualify")
    void shouldFallBackToRankedDescriptorWhenDominantWinnerDoesNotQualify() {
        final ReductionSettings dominantSettings = new ReductionSettings(
                ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_DOMINANT_GET_RESULTS, 80, 2);
        // Same winner and counts in both summaries; only the second-ranked value differs.
        final LocalValueSummary<String> secondIsB = new LocalValueSummary<>(new String[] { "a", "b" },
                new int[] { 6, 4 }, 10, "a", 6, 4);
        final LocalValueSummary<String> secondIsC = new LocalValueSummary<>(new String[] { "a", "c" },
                new int[] { 6, 4 }, 10, "a", 6, 4);

        final ReductionSignature<String> first = ReductionSignature.create(secondIsB, Map.of(), dominantSettings);
        final ReductionSignature<String> second = ReductionSignature.create(secondIsC, Map.of(), dominantSettings);

        assertNotEquals(first, second);
    }

    @Test
    @DisplayName("create must sort child descriptors by edge regardless of map insertion order")
    void shouldSortChildDescriptorsByEdgeRegardlessOfMapInsertionOrder() {
        final ReductionSettings rankedSettings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        final ReducedNode<String> underA = createReducedLeaf("child-a");
        final ReducedNode<String> underB = createReducedLeaf("child-b");

        // Same edge-to-child mappings, inserted in opposite orders.
        final Map<Character, ReducedNode<String>> insertedBThenA = new LinkedHashMap<>();
        insertedBThenA.put('b', underB);
        insertedBThenA.put('a', underA);
        final Map<Character, ReducedNode<String>> insertedAThenB = new LinkedHashMap<>();
        insertedAThenB.put('a', underA);
        insertedAThenB.put('b', underB);

        final LocalValueSummary<String> rootSummary = new LocalValueSummary<>(new String[] { "root" },
                new int[] { 1 }, 1, "root", 1, 0);
        final ReductionSignature<String> first = ReductionSignature.create(rootSummary, insertedBThenA,
                rankedSettings);
        final ReductionSignature<String> second = ReductionSignature.create(rootSummary, insertedAThenB,
                rankedSettings);

        assertEquals(first, second);
    }

    @Test
    @DisplayName("create must include child signatures in equality")
    void shouldIncludeChildSignaturesInEquality() {
        final ReductionSettings rankedSettings = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        final LocalValueSummary<String> rootSummary = new LocalValueSummary<>(new String[] { "root" },
                new int[] { 1 }, 1, "root", 1, 0);

        // Same local summary and edge; only the child below edge 'a' differs.
        final ReductionSignature<String> withChildX = ReductionSignature.create(rootSummary,
                Map.of('a', createReducedLeaf("x")), rankedSettings);
        final ReductionSignature<String> withChildY = ReductionSignature.create(rootSummary,
                Map.of('a', createReducedLeaf("y")), rankedSettings);

        assertNotEquals(withChildX, withChildY);
    }

    /**
     * Builds a summary of two values, the first one being dominant. The total
     * count is the sum of both counts.
     *
     * @param dominantValue value placed first and reported as dominant
     * @param dominantCount count of the dominant value
     * @param secondValue   value placed second
     * @param secondCount   count of the second value
     * @return summary describing the two values
     */
    private static LocalValueSummary<String> createTwoValueSummary(final String dominantValue, final int dominantCount,
            final String secondValue, final int secondCount) {
        final String[] values = { dominantValue, secondValue };
        final int[] counts = { dominantCount, secondCount };
        final int total = dominantCount + secondCount;
        return new LocalValueSummary<>(values, counts, total, dominantValue, dominantCount, secondCount);
    }

    /**
     * Builds a signature for a childless node that holds a single value with
     * count 1, using default settings for the ranked getAll reduction mode.
     *
     * @param value the only local value
     * @return signature of the described leaf
     */
    private static ReductionSignature<String> createLeafSignature(final String value) {
        final String[] values = { value };
        final int[] counts = { 1 };
        final LocalValueSummary<String> leafSummary = new LocalValueSummary<>(values, counts, 1, value, 1, 0);
        final ReductionSettings defaults = ReductionSettings
                .withDefaults(ReductionMode.MERGE_SUBTREES_WITH_EQUIVALENT_RANKED_GET_ALL_RESULTS);
        return ReductionSignature.create(leafSummary, Map.of(), defaults);
    }

    /**
     * Builds a childless reduced node holding a single value with count 1.
     *
     * @param value the only local value
     * @return reduced leaf node
     */
    private static ReducedNode<String> createReducedLeaf(final String value) {
        final ReductionSignature<String> leafSignature = createLeafSignature(value);
        return new ReducedNode<>(leafSignature, Map.of(value, 1), Map.of());
    }
}

View File

@@ -0,0 +1,42 @@
package org.egothor.stemmer.trie;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for {@link UnorderedLocalDescriptor}.
 *
 * <p>
 * The tests cover equality that ignores element order and duplicates, hash
 * code agreement for equal descriptors, and isolation from later changes to
 * the source array.
 */
@Tag("unit")
@Tag("fast")
@DisplayName("UnorderedLocalDescriptor")
class UnorderedLocalDescriptorTest {

    @Test
    @DisplayName("of must ignore ordering and duplicates in equality semantics")
    void shouldIgnoreOrderingAndDuplicatesInEqualitySemantics() {
        final UnorderedLocalDescriptor reference = UnorderedLocalDescriptor.of(new Object[] { "a", "b", "a" });
        final UnorderedLocalDescriptor sameMembers = UnorderedLocalDescriptor.of(new Object[] { "b", "a" });
        final UnorderedLocalDescriptor otherMembers = UnorderedLocalDescriptor.of(new Object[] { "a", "c" });

        // Same distinct elements, regardless of order or repetition: equal with matching hash codes.
        assertEquals(reference, sameMembers);
        assertEquals(reference.hashCode(), sameMembers.hashCode());

        // A different element set, null, and a foreign type must all compare unequal.
        assertNotEquals(reference, otherMembers);
        assertNotEquals(reference, null);
        assertNotEquals(reference, "x");
    }

    @Test
    @DisplayName("of must defensively isolate descriptor from source array mutation")
    void shouldDefensivelyIsolateDescriptorFromSourceArrayMutation() {
        final Object[] source = { "a", "b" };
        final UnorderedLocalDescriptor snapshot = UnorderedLocalDescriptor.of(source);

        // Changing the caller's array afterwards must not affect the descriptor.
        source[0] = "mutated";

        assertEquals(snapshot, UnorderedLocalDescriptor.of(new Object[] { "a", "b" }));
    }
}